diff --git a/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md b/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md
index 34ba4cf..d562ced 100644
--- a/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md
+++ b/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md
@@ -18,10 +18,11 @@
 - CUDA/cuDNN version:
 - GPU model and memory:
 
-
-You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh)
-You can also obtain the TensorFlow version with
-python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
+You can collect some of this information using our environment capture
+[script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh).
+You can also obtain the TensorFlow version with:
+1. TF 1.0: `python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"`
+2. TF 2.0: `python -c "import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"`
 
 **Describe the current behavior**
 
diff --git a/.github/ISSUE_TEMPLATE/20-documentation-issue.md b/.github/ISSUE_TEMPLATE/20-documentation-issue.md
index 7123ca6..7f4a1f1 100644
--- a/.github/ISSUE_TEMPLATE/20-documentation-issue.md
+++ b/.github/ISSUE_TEMPLATE/20-documentation-issue.md
@@ -1,17 +1,55 @@
 ---
 name: Documentation Issue
-about: Use this template for documentation related issues
+about: Use this template for documentation related issues
+labels: 'type:docs'
 
 ---
 
-<em>Please make sure that this is a documentation issue. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:doc_template</em>
+Thank you for submitting a TensorFlow documentation issue. Per our GitHub
+policy, we only address code/doc bugs, performance issues, feature requests, and
+build/installation issues on GitHub.
 
+The TensorFlow docs are open source! To get involved, read the documentation
+contributor guide: https://www.tensorflow.org/community/contribute/docs
 
-**System information**
-- TensorFlow version:
-- Doc Link:
+## URL(s) with the issue:
 
+Please provide a link to the documentation entry, for example:
+https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/MyMethod
 
-**Describe the documentation issue**
+## Description of issue (what needs changing):
 
-**We welcome contributions by users. Will you be able to update submit a PR (use the [doc style guide](https://www.tensorflow.org/community/documentation)) to fix the doc Issue?**
+### Clear description
+
+For example, why should someone use this method? How is it useful?
+
+### Correct links
+
+Is the link to the source code correct?
+
+### Parameters defined
+
+Are all parameters defined and formatted correctly?
+
+### Returns defined
+
+Are return values defined?
+
+### Raises listed and defined
+
+Are the errors defined? For example,
+https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_file#raises
+
+### Usage example
+
+Is there a usage example?
+
+### Request visuals, if applicable
+
+Are there currently visuals? If not, would adding them clarify the content?
+
+### Submit a pull request?
+
+Are you planning to also submit a pull request to fix the issue? See the docs
+contributor guide: https://www.tensorflow.org/community/contribute/docs and the
+docs style guide: https://www.tensorflow.org/community/contribute/docs_style
diff --git a/.gitignore b/.gitignore
index edf3b59..99ba931 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,18 +20,8 @@
 [Bb]uild/
 /tensorflow/core/util/version_info.cc
 /tensorflow/python/framework/fast_tensor_util.cpp
-Pods
-Podfile.lock
-*.pbxproj
-*.xcworkspacedata
-/*.podspec
-/tensorflow/lite/experimental/objc/BUILD
-/tensorflow/lite/experimental/swift/BUILD
-/tensorflow/lite/examples/ios/simple/data/*.txt
-/tensorflow/lite/examples/ios/simple/data/*.tflite
 /tensorflow/lite/gen/**
 /tensorflow/lite/tools/make/downloads/**
-xcuserdata/**
 /api_init_files_list.txt
 /estimator_api_init_files_list.txt
 *.whl
@@ -42,3 +32,16 @@
 *.iml
 local.properties
 gradleBuild
+
+# iOS
+*.pbxproj
+*.xcworkspace
+/*.podspec
+/tensorflow/lite/**/ios/BUILD
+/tensorflow/lite/**/objc/BUILD
+/tensorflow/lite/**/swift/BUILD
+/tensorflow/lite/examples/ios/simple/data/*.tflite
+/tensorflow/lite/examples/ios/simple/data/*.txt
+Podfile.lock
+Pods
+xcuserdata
diff --git a/README.md b/README.md
index 083eb2e..ec5e9af 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,7 @@
 uphold this code.**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs, so please see
+tracking requests and bugs; please see
 [TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss)
 for general questions and discussion, and please direct specific questions to
 [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
@@ -114,15 +114,16 @@
 
 ### Community Supported Builds
 
-Build Type                                                                       | Status                                                                                                                                                                                   | Artifacts
--------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
-**IBM s390x**                                                                    | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                        | TBA
-**Linux ppc64le CPU** Nightly                                                    | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
-**Linux ppc64le CPU** Stable Release                                             | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)
-**Linux ppc64le GPU** Nightly                                                    | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
-**Linux ppc64le GPU** Stable Release                                             | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
-**Linux CPU with Intel® MKL-DNN** Nightly                                        | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
-**Linux CPU with Intel® MKL-DNN** <br> **Supports Python 2.7, 3.4, 3.5 and 3.6** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.13.1 pypi](https://pypi.org/project/intel-tensorflow/)
+Build Type                                                                        | Status                                                                                                                                                                                        | Artifacts
+--------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
+**IBM s390x**                                                                     | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                             | TBA
+**Linux ppc64le CPU** Nightly                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                       | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
+**Linux ppc64le CPU** Stable Release                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)                       | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)
+**Linux ppc64le GPU** Nightly                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/)                                       | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
+**Linux ppc64le GPU** Stable Release                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                       | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
+**Linux CPU with Intel® MKL-DNN** Nightly                                         | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                     | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
+**Linux CPU with Intel® MKL-DNN** <br> **Supports Python 2.7, 3.4, 3.5, and 3.6** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild)      | [1.13.1 pypi](https://pypi.org/project/intel-tensorflow/)
+**Red Hat® Enterprise Linux® 7.6 CPU & GPU** <br> Python 2.7, 3.6                 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 pypi](https://tensorflow.pypi.thoth-station.ninja/index/)
 
 ## For more information
 
diff --git a/RELEASE.md b/RELEASE.md
index 02a1c4a..c2c50c5 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,10 @@
+# Release 1.12.2
+
+## Bug Fixes and Other Changes
+
+*   Fixes a potential security vulnerability where carefully crafted GIF images
+    can produce a null pointer dereference during decoding.
+
 # Release 1.13.0
 
 ## Major Features and Improvements
@@ -14,98 +21,185 @@
 
 ## Bug Fixes and Other Changes
 
-* Documentation
-  * Update the doc with the details about the rounding mode used in quantize_and_dequantize_v2.
-  * Clarify that tensorflow::port::InitMain() _should_ be called before using the TensorFlow library.  Programs failing to do this are not portable to all platforms.
-* Deprecations and Symbol renames.
-   * Removing deprecations for the following endpoints: `tf.acos`, `tf.acosh`, `tf.add`, `tf.as_string`, `tf.asin`, `tf.asinh`, `tf.atan`, `tf.atan2`, `tf.atanh`, `tf.cos`, `tf.cosh`, `tf.equal`, `tf.exp`, `tf.floor`, `tf.greater`, `tf.greater_equal`, `tf.less`, `tf.less_equal`, `tf.log`, `tf.logp1`, `tf.logical_and`, `tf.logical_not`, `tf.logical_or`, `tf.maximum`, `tf.minimum`, `tf.not_equal`, `tf.sin`, `tf.sinh`, `tf.tan`
-  * Deprecate `tf.data.Dataset.shard`.
-  * Deprecate `saved_model.loader.load` which is replaced by `saved_model.load` and `saved_model.main_op`, which will be replaced by `saved_model.main_op` in V2.
-  * Deprecate tf.QUANTIZED_DTYPES. The official new symbol is tf.dtypes.QUANTIZED_DTYPES.
-  * Update sklearn imports for deprecated packages.
-  * Deprecate `Variable.count_up_to` and `tf.count_up_to` in favor of `Dataset.range`.
-  * Export `confusion_matrix` op as `tf.math.confusion_matrix` instead of `tf.train.confusion_matrix`.
-  * Add `tf.dtypes.` endpoint for every constant in dtypes.py; moving endpoints in versions.py to corresponding endpoints in `tf.sysconfig.` and `tf.version.`; moving all constants under `tf.saved_model` submodules to `tf.saved_model` module. New endpoints are added in V1 and V2 but existing endpoint removals are only applied in V2.
-  * Deprecates behavior where device assignment overrides collocation constraints inside a collocation context manager.
-* Keras & Python API
-  * Add to Keras functionality analogous to `tf.register_tensor_conversion_function`.
-  * Subclassed Keras models can now be saved through `tf.contrib.saved_model.save_keras_model`.
-  * `LinearOperator.matmul` now returns a new `LinearOperator`.
-* New ops and improved op functionality
-  * Add a Nearest Neighbor Resize op.
-  * Add an `ignore_unknown` argument to `parse_values` which suppresses ValueError for unknown hyperparameter types. Such * Add `tf.linalg.matvec` convenience function.
-  * `tf.einsum()`raises `ValueError` for unsupported equations like `"ii->"`.
-  * Add DCT-I and IDCT-I in `tf.signal.dct` and `tf.signal.idct`.
-  * Add LU decomposition op.
-  * Add quantile loss to gradient boosted trees in estimator.
-  * Add `round_mode` to `QuantizeAndDequantizeV2` op to select rounding algorithm.
-  * Add `unicode_encode`, `unicode_decode`, `unicode_decode_with_offsets`, `unicode_split`, `unicode_split_with_offset`, and `unicode_transcode` ops. Amongst other things, this Op adds the ability to encode, decode, and transcode a variety of input text encoding formats into the main Unicode encodings (UTF-8, UTF-16-BE, UTF-32-BE)
-  * Add "unit" attribute to the substr op, which allows obtaining the substring of a string containing unicode characters.
-  * Broadcasting support for Ragged Tensors.
-  * `SpaceToDepth` supports uint8 data type.
-  * Support multi-label quantile regression in estimator.
-  * We now use "div" as the default partition_strategy in `tf.nn.safe_embedding_lookup_sparse`, `tf.nn.sampled_softmax` and `tf.nn.nce_loss`.
-  hyperparameter are ignored.
-* Performance
-  * Improve performance of GPU cumsum/cumprod by up to 300x.
-  * Added support for weight decay in most TPU embedding optimizers, including AdamW and MomentumW.
-* TensorFlow 2.0 Development
-  * Add a command line tool to convert to TF2.0, tf_upgrade_v2
-  * Merge `tf.spectral` into `tf.signal` for TensorFlow 2.0.
-  * Change the default recurrent activation function for LSTM from 'hard_sigmoid' to 'sigmoid' in 2.0. Historically recurrent activation is 'hard_sigmoid' since it is fast than 'sigmoid'. With new unified backend between CPU and GPU mode, since the CuDNN kernel is using sigmoid, we change the default for CPU mode to sigmoid as well. With that, the default LSTM will be compatible with both CPU and GPU kernel. This will enable user with GPU to use CuDNN kernel by default and get a 10x performance boost in training. Note that this is checkpoint breaking change. If user want to use their 1.x pre-trained checkpoint, please construct the layer with LSTM(recurrent_activation='hard_sigmoid') to fallback to 1.x behavior.
-* TensorFlow Lite
-  * Move from `tensorflow/contrib/lite` to `tensorflow/lite`.
-  * Add experimental Java API for injecting TensorFlow Lite delegates
-  * Add support for strings in TensorFlow Lite Java API.
-* `tf.contrib`:
-  * Add Apache Ignite Filesystem plugin to support accessing Apache IGFS.
-  * Dropout now takes `rate` argument, `keep_prob` is deprecated.
-  * Estimator occurrences references `tf.contrib.estimator` were changed to `tf.estimator`:
-    * `tf.contrib.estimator.BaselineEstimator` with `tf.estimator.BaselineEstimator`
-    * `tf.contrib.estimator.DNNLinearCombinedEstimator` with `tf.estimator.DNNLinearCombinedEstimator`
-    * `tf.contrib.estimator.DNNEstimator` with `tf.estimator.DNNEstimator`
-    * `tf.contrib.estimator.LinearEstimator` with `tf.estimator.LinearEstimator`
-    * `tf.contrib.estimator.InMemoryEvaluatorHook` and tf.estimator.experimental.InMemoryEvaluatorHook`.
-    * `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`.
-  * Expose `tf.distribute.Strategy as the new name for tf.contrib.distribute.DistributionStrategy.
-  * Migrate linear optimizer from contrib to core.
-  * Move `tf.contrib.signal` to `tf.signal` (preserving aliases in tf.contrib.signal).
-  * Users of `tf.contrib.estimator.export_all_saved_models` and related should switch to `tf.estimator.Estimator.experimental_export_all_saved_models`.
-* tf.data:
-  * Add `tf.data.experimental.StatsOptions()`, to configure options to collect statistics from `tf.data.Dataset` pipeline using `StatsAggregator`. Add nested option, `experimental_stats` (which takes a `tf.data.experimen tal.StatsOptions` object), to `tf.data.Options`. Deprecates `tf.data.experimental.set_stats_agregator`.
-  * Performance optimizations:
-    * Add `tf.data.experimental.OptimizationOptions()`, to configure options to enable `tf.data` performance optimizations. Add nested option, `experimental_optimization` (which takes a `tf.data.experimental.OptimizationOptions` object), to `tf.data.Options`. Remove performance optimization options from `tf.data.Options`, and add them under `tf.data.experimental.OptimizationOptions` instead.
-    * Enable `map_and_batch_fusion` and `noop_elimination` optimizations by default. They can be disabled by configuring `tf.data.experimental.OptimizationOptions` to set `map_and_batch = False` or `noop_elimination = False` respectively. To disable all default optimizations, set `apply_default_optimizations = False`.
-    * Support parallel map in `map_and_filter_fusion`.
-    * Disable static optimizations for input pipelines that use non-resource `tf.Variable`s.
-  * Add NUMA-aware MapAndBatch dataset.
-  * Deprecate `tf.data.Dataset.make_one_shot_iterator()` in V1, removed it from V2, and added tf.compat.v1.data.make_one_shot_iterator()`.
-  * Deprecate `tf.data.Dataset.make_initializable_iterator()` in V1, removed it from V2, and added `tf.compat.v1.data.make_initializable_iterator()`.
-  * Enable nested dataset support in core `tf.data` transformations.
-  * For `tf.data.Dataset` implementers: Added `tf.data.Dataset._element_structured property` to replace `Dataset.output_{types,shapes,classes}`.
-  * Make `num_parallel_calls` of `tf.data.Dataset.interleave` and `tf.data.Dataset.map` work in Eager mode.
-* Toolchains
-  * Fixed OpenSSL compatibility by avoiding `EVP_MD_CTX_destroy`.
-  * Added bounds checking to printing deprecation warnings.
-  * Upgraded CUDA dependency to 10.0
-  * To build with Android NDK r14b, add "#include <linux/compiler.h>" to android-ndk-r14b/platforms/android-14/arch-*/usr/include/linux/futex.h
-  * Removed `:android_tensorflow_lib_selective_registration*` targets, use `:android_tensorflow_lib_lite*` targets instead.
-* XLA
-  * Move `RoundToEven` function to xla/client/lib/math.h.
-  * A new environment variable `TF_XLA_DEBUG_OPTIONS_PASSTHROUGH` set to "1" or "true" allows the debug options passed within an XRTCompile op to be passed directly to the XLA compilation backend. If such variable is not set (service side), only a restricted set will be passed through.
-  * Allow the XRTCompile op to return the ProgramShape resulted form the XLA compilation as a second return argument.
-  * XLA HLO graphs can now be rendered as SVG/HTML.
-* Estimator
-  * Replace all occurences of `tf.contrib.estimator.BaselineEstimator` with `tf.estimator.BaselineEstimator`
-  * Replace all occurences of `tf.contrib.estimator.DNNLinearCombinedEstimator` with `tf.estimator.DNNLinearCombinedEstimator`
-  * Replace all occurrences of `tf.contrib.estimator.DNNEstimator` with `tf.estimator.DNNEstimator`
-  * Replace all occurrences of `tf.contrib.estimator.LinearEstimator` with `tf.estimator.LinearEstimator`
-  * Users of `tf.contrib.estimator.export_all_saved_models` and related should switch to `tf.estimator.Estimator.experimental_export_all_saved_models`.
-  * Update `regression_head` to the new Head API for Canned Estimator V2.
-  * Switch `multi_class_head` to Head API for Canned Estimator V2.
-  * Replace all occurences of `tf.contrib.estimator.InMemoryEvaluatorHook` and `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with `tf.estimator.experimental.InMemoryEvaluatorHook` and `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`
-  * Migrate linear optimizer from contrib to core.
-
+*   Documentation
+    *   Update the doc with the details about the rounding mode used in
+        quantize_and_dequantize_v2.
+    *   Clarify that tensorflow::port::InitMain() _should_ be called before
+        using the TensorFlow library. Programs failing to do this are not
+        portable to all platforms.
+*   Deprecations and Symbol renames.
+    *   Removing deprecations for the following endpoints: `tf.acos`,
+        `tf.acosh`, `tf.add`, `tf.as_string`, `tf.asin`, `tf.asinh`, `tf.atan`,
+        `tf.atan2`, `tf.atanh`, `tf.cos`, `tf.cosh`, `tf.equal`, `tf.exp`,
+        `tf.floor`, `tf.greater`, `tf.greater_equal`, `tf.less`,
+        `tf.less_equal`, `tf.log`, `tf.log1p`, `tf.logical_and`,
+        `tf.logical_not`, `tf.logical_or`, `tf.maximum`, `tf.minimum`,
+        `tf.not_equal`, `tf.sin`, `tf.sinh`, `tf.tan`
+    *   Deprecate `tf.data.Dataset.shard`.
+    *   Deprecate `saved_model.loader.load` which is replaced by
+        `saved_model.load` and `saved_model.main_op`, which will be replaced by
+        `saved_model.main_op` in V2.
+    *   Deprecate tf.QUANTIZED_DTYPES. The official new symbol is
+        tf.dtypes.QUANTIZED_DTYPES.
+    *   Update sklearn imports for deprecated packages.
+    *   Deprecate `Variable.count_up_to` and `tf.count_up_to` in favor of
+        `Dataset.range`.
+    *   Export `confusion_matrix` op as `tf.math.confusion_matrix` instead of
+        `tf.train.confusion_matrix`.
+    *   Add `tf.dtypes.` endpoint for every constant in dtypes.py. Moving
+        endpoints in versions.py to corresponding endpoints in `tf.sysconfig.`
+        and `tf.version.`. Moving all constants under `tf.saved_model`
+        submodules to `tf.saved_model` module. New endpoints are added in V1 and
+        V2 but existing endpoint removals are only applied in V2.
+    *   Deprecates behavior where device assignment overrides collocation
+        constraints inside a collocation context manager.
+*   Keras & Python API
+    *   Add to Keras functionality analogous to
+        `tf.register_tensor_conversion_function`.
+    *   Subclassed Keras models can now be saved through
+        `tf.contrib.saved_model.save_keras_model`.
+    *   `LinearOperator.matmul` now returns a new `LinearOperator`.
+*   New ops and improved op functionality
+    *   Add a Nearest Neighbor Resize op.
+    *   Add an `ignore_unknown` argument to `parse_values` which suppresses
+        ValueError for unknown hyperparameter types. Such hyperparameters are ignored.
+    *   Add `tf.linalg.matvec` convenience function.
+    *   `tf.einsum()` raises `ValueError` for unsupported equations like
+        `"ii->"`.
+    *   Add DCT-I and IDCT-I in `tf.signal.dct` and `tf.signal.idct`.
+    *   Add LU decomposition op.
+    *   Add quantile loss to gradient boosted trees in estimator.
+    *   Add `round_mode` to `QuantizeAndDequantizeV2` op to select rounding
+        algorithm.
+    *   Add `unicode_encode`, `unicode_decode`, `unicode_decode_with_offsets`,
+        `unicode_split`, `unicode_split_with_offset`, and `unicode_transcode`
+        ops. Amongst other things, this Op adds the ability to encode, decode,
+        and transcode a variety of input text encoding formats into the main
+        Unicode encodings (UTF-8, UTF-16-BE, UTF-32-BE).
+    *   Add "unit" attribute to the substr op, which allows obtaining the
+        substring of a string containing unicode characters.
+    *   Broadcasting support for Ragged Tensors.
+    *   `SpaceToDepth` supports uint8 data type.
+    *   Support multi-label quantile regression in estimator.
+    *   We now use "div" as the default partition_strategy in
+        `tf.nn.safe_embedding_lookup_sparse`, `tf.nn.sampled_softmax` and
+        `tf.nn.nce_loss`.
+*   Performance
+    *   Improve performance of GPU cumsum/cumprod by up to 300x.
+    *   Added support for weight decay in most TPU embedding optimizers,
+        including AdamW and MomentumW.
+*   TensorFlow 2.0 Development
+    *   Add a command line tool to convert to TF2.0, tf_upgrade_v2
+    *   Merge `tf.spectral` into `tf.signal` for TensorFlow 2.0.
+    *   Change the default recurrent activation function for LSTM from
+        'hard_sigmoid' to 'sigmoid' in 2.0. Historically recurrent activation is
+        'hard_sigmoid' since it is faster than 'sigmoid'. With new unified
+        backend between CPU and GPU mode, since the CuDNN kernel is using
+        sigmoid, we change the default for CPU mode to sigmoid as well. With
+        that, the default LSTM will be compatible with both CPU and GPU kernel.
+        This will enable users with GPUs to use the CuDNN kernel by default and
+        get a 10x performance boost in training. Note that this is a checkpoint
+        breaking change. If users want to use their 1.x pre-trained checkpoint,
+        please construct the layer with
+        LSTM(recurrent_activation='hard_sigmoid') to fall back to 1.x behavior.
+*   TensorFlow Lite
+    *   Move from `tensorflow/contrib/lite` to `tensorflow/lite`.
+    *   Add experimental Java API for injecting TensorFlow Lite delegates
+    *   Add support for strings in TensorFlow Lite Java API.
+*   `tf.contrib`:
+    *   Add Apache Ignite Filesystem plugin to support accessing Apache IGFS.
+    *   Dropout now takes `rate` argument, `keep_prob` is deprecated.
+    *   Estimator references to `tf.contrib.estimator` were changed to
+        `tf.estimator`:
+        *   `tf.contrib.estimator.BaselineEstimator` with
+            `tf.estimator.BaselineEstimator`
+        *   `tf.contrib.estimator.DNNLinearCombinedEstimator` with
+            `tf.estimator.DNNLinearCombinedEstimator`
+        *   `tf.contrib.estimator.DNNEstimator` with `tf.estimator.DNNEstimator`
+        *   `tf.contrib.estimator.LinearEstimator` with
+            `tf.estimator.LinearEstimator`
+        *   `tf.contrib.estimator.InMemoryEvaluatorHook` with
+            `tf.estimator.experimental.InMemoryEvaluatorHook`.
+        *   `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with
+            `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`.
+    *   Expose `tf.distribute.Strategy` as the new name for
+        `tf.contrib.distribute.DistributionStrategy`.
+    *   Migrate linear optimizer from contrib to core.
+    *   Move `tf.contrib.signal` to `tf.signal` (preserving aliases in
+        tf.contrib.signal).
+    *   Users of `tf.contrib.estimator.export_all_saved_models` and related
+        should switch to
+        `tf.estimator.Estimator.experimental_export_all_saved_models`.
+*   tf.data:
+    *   Add `tf.data.experimental.StatsOptions()`, to configure options to
+        collect statistics from `tf.data.Dataset` pipeline using
+        `StatsAggregator`. Add nested option, `experimental_stats` (which takes
+        a `tf.data.experimental.StatsOptions` object), to `tf.data.Options`.
+        Deprecates `tf.data.experimental.set_stats_aggregator`.
+    *   Performance optimizations:
+        *   Add `tf.data.experimental.OptimizationOptions()`, to configure options
+            to enable `tf.data` performance optimizations. Add nested option,
+            `experimental_optimization` (which takes a
+            `tf.data.experimental.OptimizationOptions` object), to
+            `tf.data.Options`. Remove performance optimization options from
+            `tf.data.Options`, and add them under
+            `tf.data.experimental.OptimizationOptions` instead.
+        *   Enable `map_and_batch_fusion` and `noop_elimination` optimizations by
+            default. They can be disabled by configuring
+            `tf.data.experimental.OptimizationOptions` to set `map_and_batch =
+            False` or `noop_elimination = False` respectively. To disable all
+            default optimizations, set `apply_default_optimizations = False`.
+        *   Support parallel map in `map_and_filter_fusion`.
+        *   Disable static optimizations for input pipelines that use non-resource
+            `tf.Variable`s.
+    *   Add NUMA-aware MapAndBatch dataset.
+    *   Deprecate `tf.data.Dataset.make_one_shot_iterator()` in V1, removed it
+        from V2, and added `tf.compat.v1.data.make_one_shot_iterator()`.
+    *   Deprecate `tf.data.Dataset.make_initializable_iterator()` in V1, removed
+        it from V2, and added `tf.compat.v1.data.make_initializable_iterator()`.
+    *   Enable nested dataset support in core `tf.data` transformations.
+    *   For `tf.data.Dataset` implementers: Added
+        `tf.data.Dataset._element_structure` property to replace
+        `Dataset.output_{types,shapes,classes}`.
+    *   Make `num_parallel_calls` of `tf.data.Dataset.interleave` and
+        `tf.data.Dataset.map` work in Eager mode.
+*   Toolchains
+    *   Fixed OpenSSL compatibility by avoiding `EVP_MD_CTX_destroy`.
+    *   Added bounds checking to printing deprecation warnings.
+    *   Upgraded CUDA dependency to 10.0
+    *   To build with Android NDK r14b, add "#include <linux/compiler.h>" to
+        android-ndk-r14b/platforms/android-14/arch-*/usr/include/linux/futex.h
+    *   Removed `:android_tensorflow_lib_selective_registration*` targets, use
+        `:android_tensorflow_lib_lite*` targets instead.
+*   XLA
+    *   Move `RoundToEven` function to xla/client/lib/math.h.
+    *   A new environment variable `TF_XLA_DEBUG_OPTIONS_PASSTHROUGH` set to "1"
+        or "true" allows the debug options passed within an XRTCompile op to be
+        passed directly to the XLA compilation backend. If such variable is not
+        set (service side), only a restricted set will be passed through.
+    *   Allow the XRTCompile op to return the ProgramShape resulting from the
+        XLA compilation as a second return argument.
+    *   XLA HLO graphs can now be rendered as SVG/HTML.
+*   Estimator
+    *   Replace all occurrences of `tf.contrib.estimator.BaselineEstimator` with
+        `tf.estimator.BaselineEstimator`
+    *   Replace all occurrences of
+        `tf.contrib.estimator.DNNLinearCombinedEstimator` with
+        `tf.estimator.DNNLinearCombinedEstimator`
+    *   Replace all occurrences of `tf.contrib.estimator.DNNEstimator` with
+        `tf.estimator.DNNEstimator`
+    *   Replace all occurrences of `tf.contrib.estimator.LinearEstimator` with
+        `tf.estimator.LinearEstimator`
+    *   Users of `tf.contrib.estimator.export_all_saved_models` and related
+        should switch to
+        `tf.estimator.Estimator.experimental_export_all_saved_models`.
+    *   Update `regression_head` to the new Head API for Canned Estimator V2.
+    *   Switch `multi_class_head` to Head API for Canned Estimator V2.
+    *   Replace all occurrences of `tf.contrib.estimator.InMemoryEvaluatorHook`
+        and `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with
+        `tf.estimator.experimental.InMemoryEvaluatorHook` and
+        `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`
+    *   Migrate linear optimizer from contrib to core.
 
 ## Thanks to our Contributors
 
diff --git a/WORKSPACE b/WORKSPACE
index aae8031..d135cc9 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -43,8 +43,8 @@
 # Apple and Swift rules.
 http_archive(
     name = "build_bazel_rules_apple",
-    sha256 = "8f32e2839fba28d549e1670dbed83606dd339a9f7489118e481814d61738270f",
-    urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.14.0/rules_apple.0.14.0.tar.gz"],
+    sha256 = "ec5d2ba70595e52b9678674ea3dd3eed85b5d2280e3f74b889c0f248ec1b835a",
+    urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.16.1/rules_apple.0.16.1.tar.gz"],
 )  # https://github.com/bazelbuild/rules_apple/releases
 http_archive(
     name = "build_bazel_apple_support",
@@ -58,14 +58,14 @@
 )  # https://github.com/bazelbuild/bazel-skylib/releases
 http_archive(
     name = "build_bazel_rules_swift",
-    sha256 = "31aad005a9c4e56b256125844ad05eb27c88303502d74138186f9083479f93a6",
-    urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.8.0/rules_swift.0.8.0.tar.gz"],
+    sha256 = "fef40f8afc331ae4b63da5603538134444f005e5ca7762112b0629334602c845",
+    urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.10.1/rules_swift.0.10.1.tar.gz"],
 )  # https://github.com/bazelbuild/rules_swift/releases
 http_archive(
     name = "com_github_apple_swift_swift_protobuf",
     type = "zip",
-    strip_prefix = "swift-protobuf-1.4.0/",
-    urls = ["https://github.com/apple/swift-protobuf/archive/1.4.0.zip"],
+    strip_prefix = "swift-protobuf-1.5.0/",
+    urls = ["https://github.com/apple/swift-protobuf/archive/1.5.0.zip"],
 )  # https://github.com/apple/swift-protobuf/releases
 http_file(
     name = "xctestrunner",
diff --git a/configure.py b/configure.py
index 5c09b21..b564fda 100644
--- a/configure.py
+++ b/configure.py
@@ -293,9 +293,9 @@
 
   Args:
     environ_cp: copy of the os.environ.
-    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
-    query_item: string for feature related to the variable, e.g. "Hadoop File
-      System".
+    var_name: string for name of environment variable, e.g. "TF_NEED_CUDA".
+    query_item: string for feature related to the variable, e.g. "CUDA for
+      Nvidia GPUs".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
     yes_reply: optional string for reply when feature is enabled.
@@ -376,9 +376,9 @@
 
   Args:
     environ_cp: copy of the os.environ.
-    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
-    query_item: string for feature related to the variable, e.g. "Hadoop File
-      System".
+    var_name: string for name of environment variable, e.g. "TF_NEED_CUDA".
+    query_item: string for feature related to the variable, e.g. "CUDA for
+      Nvidia GPUs".
     option_name: string for option to define in .bazelrc.
     enabled_by_default: boolean for default behavior.
     bazel_config_name: Name for Bazel --config argument to enable build feature.
@@ -411,9 +411,9 @@
 
   Args:
     environ_cp: copy of the os.environ.
-    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
-    query_item: string for feature related to the variable, e.g. "Hadoop File
-      System".
+    var_name: string for name of environment variable, e.g. "TF_NEED_CUDA".
+    query_item: string for feature related to the variable, e.g. "CUDA for
+      Nvidia GPUs".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
     yes_reply: optional string for reply when feature is enabled.
@@ -456,8 +456,8 @@
   """Check installed bazel version is between min_version and max_version.
 
   Args:
-    min_version: string for minimum bazel version.
-    max_version: string for maximum bazel version.
+    min_version: string for minimum bazel version (must exist!).
+    max_version: string for maximum bazel version (must exist!).
 
   Returns:
     The bazel version detected.
@@ -570,7 +570,7 @@
 
   Args:
     environ_cp: copy of the os.environ.
-    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    var_name: string for name of environment variable, e.g. "TF_NEED_CUDA".
     ask_for_var: string for how to ask for user input.
     var_default: default value string.
 
@@ -1261,7 +1261,8 @@
   write_to_bazelrc('build --copt=-w --host_copt=-w')
   # Fix winsock2.h conflicts
   write_to_bazelrc(
-      'build --copt=-DWIN32_LEAN_AND_MEAN --host_copt=-DWIN32_LEAN_AND_MEAN')
+      'build --copt=-DWIN32_LEAN_AND_MEAN --host_copt=-DWIN32_LEAN_AND_MEAN '
+      '--copt=-DNOGDI --host_copt=-DNOGDI')
   # Output more verbose information when something goes wrong
   write_to_bazelrc('build --verbose_failures')
   # The host and target platforms are the same in Windows build. So we don't
@@ -1324,9 +1325,9 @@
 
   cuda_libraries = ['cuda', 'cudnn']
   if is_linux():
-    if 'TF_TENSORRT_VERSION' in environ_cp:  # if env variable exists
+    if int(environ_cp.get('TF_NEED_TENSORRT', False)):
       cuda_libraries.append('tensorrt')
-    if environ_cp.get('TF_NCCL_VERSION', None):  # if env variable not empty
+    if environ_cp.get('TF_NCCL_VERSION', None):
       cuda_libraries.append('nccl')
 
   proc = subprocess.Popen(
@@ -1387,7 +1388,7 @@
   # environment variables.
   environ_cp = dict(os.environ)
 
-  current_bazel_version = check_bazel_version('0.24.1', '0.25.0')
+  current_bazel_version = check_bazel_version('0.24.1', '0.24.1')
   _TF_CURRENT_BAZEL_VERSION = convert_version_to_int(current_bazel_version)
 
   reset_tf_configure_bazelrc()
@@ -1453,8 +1454,12 @@
         cuda_env_names = [
             'TF_CUDA_VERSION', 'TF_CUBLAS_VERSION', 'TF_CUDNN_VERSION',
             'TF_TENSORRT_VERSION', 'TF_NCCL_VERSION', 'TF_CUDA_PATHS',
-            'CUDA_TOOLKIT_PATH'
+            # Items below are for backwards compatibility when not using
+            # TF_CUDA_PATHS.
+            'CUDA_TOOLKIT_PATH', 'CUDNN_INSTALL_PATH', 'NCCL_INSTALL_PATH',
+            'NCCL_HDR_PATH', 'TENSORRT_INSTALL_PATH'
         ]
+        # Note: set_action_env_var above already writes to bazelrc.
         for name in cuda_env_names:
           if name in environ_cp:
             write_action_env_to_bazelrc(name, environ_cp[name])
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 7f20d80..a04ddf9 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -185,6 +185,12 @@
 )
 
 config_setting(
+    name = "linux_aarch64",
+    values = {"cpu": "aarch64"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
     name = "linux_x86_64",
     values = {"cpu": "k8"},
     visibility = ["//visibility:public"],
@@ -420,6 +426,9 @@
     values = {"cpu": "x64_windows"},
 )
 
+# DO NOT ADD ANY NEW EXCEPTIONS TO THIS LIST!
+# Instead, please use public APIs or public build rules TF provides.
+# If you need functionality that is not exposed, we will work with you to expand our public APIs.
 package_group(
     name = "internal",
     packages = [
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index ca213f6..feaf805 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -32,10 +32,13 @@
 
 import distutils as _distutils
 import inspect as _inspect
+import logging as _logging
 import os as _os
 import site as _site
 import sys as _sys
 
+from tensorflow.python.tools import module_util as _module_util
+
 # API IMPORTS PLACEHOLDER
 
 # Make sure directory containing top level submodules is in
@@ -49,25 +52,29 @@
 elif _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
-# pylint: disable=g-bad-import-order
-from tensorflow.python.tools import component_api_helper as _component_api_helper
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorboard.summary._tf.summary'),
-    error_msg="Limited tf.summary API due to missing TensorBoard installation")
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=(
-        'tensorflow_estimator.python.estimator.api._v2.estimator'))
+# Hook external TensorFlow modules.
+try:
+  from tensorboard.summary._tf import summary
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(summary)] + _current_module.__path__)
+except ImportError:
+  _logging.warning(
+      "Limited tf.summary API due to missing TensorBoard installation.")
 
-if not hasattr(_current_module, 'estimator'):
-  _component_api_helper.package_hook(
-      parent_package_str=__name__,
-      child_package_str=(
-          'tensorflow_estimator.python.estimator.api.estimator'))
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorflow.python.keras.api._v2.keras'))
+try:
+  from tensorflow_estimator.python.estimator.api._v2 import estimator
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(estimator)] + _current_module.__path__)
+except ImportError:
+  pass
+
+try:
+  from tensorflow.python.keras.api._v2 import keras
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(keras)] + _current_module.__path__)
+except ImportError:
+  pass
+
 
 # Enable TF2 behaviors
 from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index c5ba4e2..8a14abc 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -26,24 +26,37 @@
 
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
+from tensorflow.python.tools import module_util as _module_util
 
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.tools import component_api_helper as _component_api_helper
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=(
-        'tensorflow_estimator.python.estimator.api._v1.estimator'))
-
+# Make sure directory containing top level submodules is in
+# the __path__ so that "from tensorflow.foo import bar" works.
+# We're using bitwise, but there's nothing special about that.
+_API_MODULE = bitwise  # pylint: disable=undefined-variable
 _current_module = _sys.modules[__name__]
-if not hasattr(_current_module, 'estimator'):
-  _component_api_helper.package_hook(
-      parent_package_str=__name__,
-      child_package_str=(
-          'tensorflow_estimator.python.estimator.api.estimator'))
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorflow.python.keras.api._v1.keras'))
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
+if not hasattr(_current_module, '__path__'):
+  __path__ = [_tf_api_dir]
+elif _tf_api_dir not in __path__:
+  __path__.append(_tf_api_dir)
+
+# Hook external TensorFlow modules.
+try:
+  from tensorflow_estimator.python.estimator.api._v1 import estimator
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(estimator)] + _current_module.__path__)
+except ImportError:
+  pass
+
+try:
+  from tensorflow.python.keras.api._v1 import keras
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(keras)] + _current_module.__path__)
+except ImportError:
+  pass
+
+
 from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
 _CONTRIB_WARNING = """
 WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
@@ -66,17 +79,6 @@
 # The 'app' module will be imported as part of the placeholder section above.
 app.flags = flags  # pylint: disable=undefined-variable
 
-# Also use 'app' module (choice is arbitrary) to derive the API directory below.
-_API_MODULE = app  # pylint: disable=undefined-variable
-
-# Make sure directory containing top level submodules is in
-# the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
-if not hasattr(_current_module, '__path__'):
-  __path__ = [_tf_api_dir]
-elif _tf_api_dir not in __path__:
-  __path__.append(_tf_api_dir)
-
 # Load all plugin libraries from site-packages/tensorflow-plugins if we are
 # running under pip.
 # TODO(gunan): Enable setting an environment variable to define arbitrary plugin
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 19a2504..71963c1 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -146,6 +146,7 @@
         "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:attr_builder",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -292,7 +293,6 @@
         "//conditions:default": [],
     }),
     tags = [
-        "no_oss",  # http://b/119522529
         "noasan",
     ],
     # We must ensure that the dependencies can be dynamically linked since
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 23913a6..21d72ac 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -30,8 +30,8 @@
 #include "tensorflow/cc/ops/while_loop.h"
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/framework/logging.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/kernels/logging_ops.h"
 #endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 6da2a02..726ce27 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -22,6 +22,7 @@
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -66,6 +67,24 @@
   }
 }
 
+unsigned char TF_SetXlaEnableLazyCompilation(unsigned char enable) {
+  tensorflow::BuildXlaOpsPassFlags* flags =
+      tensorflow::GetBuildXlaOpsPassFlags();
+  bool original = flags->tf_xla_enable_lazy_compilation;
+  flags->tf_xla_enable_lazy_compilation = enable;
+  return original;
+}
+
+void TF_SetXLaAutoJitMode(const char* mode) {
+  tensorflow::SetXlaAutoJitFlagFromFlagString(mode);
+}
+
+void TF_SetXlaMinClusterSize(int size) {
+  tensorflow::MarkForCompilationPassFlags* flags =
+      tensorflow::GetMarkForCompilationPassFlags();
+  flags->tf_xla_min_cluster_size = size;
+}
+
 TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
                            unsigned char gpu_memory_allow_growth,
                            unsigned int num_cpu_devices) {
@@ -676,7 +695,7 @@
 
   LOG_AND_RETURN_IF_ERROR(grpc_server->Start());
 
-  LOG_AND_RETURN_IF_ERROR(ctx->context.StoreCollectiveOpsServer(
+  LOG_AND_RETURN_IF_ERROR(ctx->context->StoreCollectiveOpsServer(
       std::move(server), grpc_server->worker_env()->device_mgr,
       grpc_server->worker_env()->collective_executor_mgr));
 
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 11b4c93..795768a 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -62,6 +62,20 @@
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
+// Set XLA's internal BuildXlaOpsPassFlags.tf_xla_enable_lazy_compilation to the
+// value of 'enable'. Also returns the original value of that flag.
+//
+// Use in tests to allow XLA to fall back to TF classic. This has global effect.
+TF_CAPI_EXPORT unsigned char TF_SetXlaEnableLazyCompilation(
+    unsigned char enable);
+
+// Sets XLA's auto jit mode according to the specified string, which is parsed
+// as if passed in XLA_FLAGS. This has global effect.
+TF_CAPI_EXPORT void TF_SetXLaAutoJitMode(const char* mode);
+
+// Sets XLA's minimum cluster size. This has global effect.
+TF_CAPI_EXPORT void TF_SetXlaMinClusterSize(int size);
+
 // Create a serialized tensorflow.ConfigProto proto, where:
 //
 // a) ConfigProto.optimizer_options.global_jit_level is set to to ON_1 if
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 486dc94..5a82cb0 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -295,7 +295,8 @@
 }
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in tensorflow/python/framework/function.py.
+// function graph_to_function_def(), which is located in
+// tensorflow/python/framework/graph_to_function_def.py.
 Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           bool append_hash_to_fn_name,
                           const std::vector<const Node*>& body_nodes,
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 9a69c58..f021600 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -24,8 +24,10 @@
 #include <unordered_map>
 #include <vector>
 
+// clang-format off
 // Required for IS_MOBILE_PLATFORM
-#include "tensorflow/core/platform/platform.h"  // NO_LINT
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
 
 #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/core/framework/op_gen_lib.h"
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 56a3699..c453ec0 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -258,3 +258,22 @@
     srcs = ["c_api.h"],
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# TODO(karllessard): only used by //tensorflow/core:mobile_srcs_only_runtime
+# right now, remove this public rule when no longer needed (it should be
+# replaced by TF Lite)
+filegroup(
+    name = "srcs",
+    srcs = glob(
+        [
+            "*.cc",
+            "*.h",
+        ],
+        exclude = [
+            "c_api_experimental.cc",
+            "c_api_experimental.h",
+            "*test*",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
old mode 100755
new mode 100644
index f375a7e..6a5ca6b
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -21,6 +21,11 @@
 #include <string>
 #include <vector>
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 #include "absl/memory/memory.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
@@ -38,11 +43,15 @@
 #include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#if !defined(IS_MOBILE_PLATFORM)
+#include "tensorflow/core/distributed_runtime/remote_device.h"
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
+#endif  // !IS_MOBILE_PLATFORM
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -88,6 +97,7 @@
   return (d == nullptr) ? "cpu:0" : d->name();
 }
 
+#if !defined(IS_MOBILE_PLATFORM)
 tensorflow::Status GetAllRemoteDevices(
     const std::vector<string>& remote_workers,
     tensorflow::WorkerCacheInterface* worker_cache,
@@ -220,7 +230,7 @@
   tensorflow::gtl::FlatMap<string, tensorflow::uint64> remote_contexts;
   LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts(
       remote_workers, rendezvous_id, keep_alive_secs, server_def,
-      remote_eager_workers.get(), ctx->context.Async(), &remote_contexts));
+      remote_eager_workers.get(), ctx->context->Async(), &remote_contexts));
 
   tensorflow::RemoteRendezvous* r =
       grpc_server->worker_env()->rendezvous_mgr->Find(rendezvous_id);
@@ -239,12 +249,13 @@
 
   auto* device_mgr = grpc_server->worker_env()->device_mgr;
 
-  return ctx->context.InitializeRemote(
+  return ctx->context->InitializeRemote(
       std::move(server), std::move(remote_eager_workers),
       std::move(remote_device_mgr), remote_contexts, r, device_mgr,
       keep_alive_secs);
 #undef LOG_AND_RETURN_IF_ERROR
 }
+#endif  // !IS_MOBILE_PLATFORM
 
 tensorflow::Status OpInferSingleInputAttrs(TFE_Op* op,
                                            TFE_TensorHandle* input) {
@@ -341,7 +352,7 @@
 TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
                                                         unsigned char enable,
                                                         TF_Status* status) {
-  status->status = ctx->context.SetAsyncForThread(enable);
+  status->status = ctx->context->SetAsyncForThread(enable);
 }
 
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
@@ -381,16 +392,14 @@
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   TF_DeviceList* list = new TF_DeviceList;
-  ctx->context.local_device_mgr()->ListDeviceAttributes(&list->response);
-  if (ctx->context.remote_device_mgr()) {
-    ctx->context.remote_device_mgr()->ListDeviceAttributes(&list->response);
+  ctx->context->local_device_mgr()->ListDeviceAttributes(&list->response);
+  if (ctx->context->remote_device_mgr()) {
+    ctx->context->remote_device_mgr()->ListDeviceAttributes(&list->response);
   }
   return list;
 }
 
-void TFE_ContextClearCaches(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->context.ClearCaches();
-}
+void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context->ClearCaches(); }
 
 // Set server_def on the context, possibly updating it.
 TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
@@ -398,6 +407,10 @@
                                                    const void* proto,
                                                    size_t proto_len,
                                                    TF_Status* status) {
+#if defined(IS_MOBILE_PLATFORM)
+  status->status = tensorflow::errors::Unimplemented(
+      "TFE_ContextSetServerDef not supported on mobile");
+#else   // !defined(IS_MOBILE_PLATFORM)
   tensorflow::ServerDef server_def;
   if (!server_def.ParseFromArray(proto, proto_len)) {
     status->status = tensorflow::errors::InvalidArgument(
@@ -406,11 +419,12 @@
   }
   status->status =
       UpdateTFE_ContextWithServerDef(keep_alive_secs, server_def, ctx);
+#endif  // !IS_MOBILE_PLATFORM
 }
 
 void TFE_ContextSetThreadLocalDevicePlacementPolicy(
     TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) {
-  ctx->context.SetThreadLocalDevicePlacementPolicy(
+  ctx->context->SetThreadLocalDevicePlacementPolicy(
       static_cast<tensorflow::ContextDevicePlacementPolicy>(policy));
 }
 
@@ -420,19 +434,19 @@
 extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(
     TFE_Context* ctx) {
   return static_cast<TFE_ContextDevicePlacementPolicy>(
-      ctx->context.GetDevicePlacementPolicy());
+      ctx->context->GetDevicePlacementPolicy());
 }
 
 void TFE_ContextAsyncWait(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->context.AsyncWait();
+  status->status = ctx->context->AsyncWait();
 }
 
 void TFE_ContextGetStatus(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->context.GetStatus();
+  status->status = ctx->context->GetStatus();
 }
 
 void TFE_ContextAsyncClearError(TFE_Context* ctx) {
-  ctx->context.ClearAsyncError();
+  ctx->context->ClearAsyncError();
 }
 
 TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
@@ -592,7 +606,7 @@
     return new TFE_Op(ctx, name, false, types,
                       new TFE_OpInferenceContext(op_def));
   }
-  if (!ctx->context.FindFunctionByName(name)) {
+  if (!ctx->context->FindFunctionByName(name)) {
     status->status = tensorflow::errors::NotFound(
         "'", name,
         "' is neither a type of a primitive operation nor a name "
@@ -890,7 +904,7 @@
                                                const char* device_name,
                                                TF_Status* status) {
   tensorflow::TensorHandle* handle;
-  status->status = tensorflow::EagerCopyToDevice(h->handle, &ctx->context,
+  status->status = tensorflow::EagerCopyToDevice(h->handle, ctx->context,
                                                  device_name, &handle);
   if (status->status.ok()) {
     return new TFE_TensorHandle(handle);
@@ -907,26 +921,26 @@
         tensorflow::errors::InvalidArgument("Invalid FunctionDef proto");
     return;
   }
-  status->status = ctx->context.AddFunctionDef(function_def);
+  status->status = ctx->context->AddFunctionDef(function_def);
 }
 
 void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
                             TF_Status* status) {
-  status->status = ctx->context.AddFunctionDef(function->fdef);
+  status->status = ctx->context->AddFunctionDef(function->fdef);
 }
 
 unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) {
-  return ctx->context.FindFunctionDef(name) != nullptr;
+  return ctx->context->FindFunctionDef(name) != nullptr;
 }
 
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreGraphs(true);
-  ctx->context.SetShouldStoreStepStats(true);
+  ctx->context->SetShouldStoreGraphs(true);
+  ctx->context->SetShouldStoreStepStats(true);
 }
 
 void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreGraphs(false);
-  ctx->context.SetShouldStoreStepStats(false);
+  ctx->context->SetShouldStoreGraphs(false);
+  ctx->context->SetShouldStoreStepStats(false);
 }
 
 }  // extern "C"
@@ -955,9 +969,9 @@
                                   TF_Status* status) {
   TFE_ContextAsyncWait(ctx, status);
   if (!status->status.ok()) return;
-  tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
-  status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
-  ctx->context.ClearRunMetadata();
+  tensorflow::mutex_lock ml(*ctx->context->MetadataMu());
+  status->status = MessageToBuffer(*ctx->context->RunMetadataProto(), buf);
+  ctx->context->ClearRunMetadata();
 }
 
 namespace {
@@ -973,9 +987,9 @@
 }
 }  // namespace
 
-void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context.StartStep(); }
+void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context->StartStep(); }
 
-void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context.EndStep(); }
+void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context->EndStep(); }
 
 namespace tensorflow {
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 0361c48..c814b8a 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -98,8 +98,7 @@
 
 // Clears the internal caches in the TFE context. Useful when reseeding random
 // ops.
-TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx,
-                                                  TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx);
 
 // Sets a thread-local device placement policy. After this call, other calls to
 // TFE_Execute in the same thread will use the device policy specified here
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index 6cbd73f..0c170ea 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -63,7 +63,7 @@
 
 void TFE_ProfilerContextSetEagerContext(TFE_ProfilerContext* profiler_context,
                                         TFE_Context* eager_context) {
-  profiler_context->profiler_context.eager_context = &eager_context->context;
+  profiler_context->profiler_context.eager_context = eager_context->context;
 }
 
 void TFE_DeleteProfilerContext(TFE_ProfilerContext* profiler_context) {
@@ -77,11 +77,11 @@
 }
 
 void TFE_ContextEnableGraphCollection(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreGraphs(true);
+  ctx->context->SetShouldStoreGraphs(true);
 }
 
 void TFE_ContextDisableGraphCollection(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreGraphs(false);
+  ctx->context->SetShouldStoreGraphs(false);
 }
 
 bool TFE_ProfilerClientStartTracing(const char* service_addr,
@@ -99,59 +99,6 @@
   return s.ok();
 }
 
-static tensorflow::mutex gauges_map_lock(tensorflow::LINKER_INITIALIZED);
-
-static std::unordered_map<string,
-                          tensorflow::monitoring::Gauge<tensorflow::int64, 1>*>*
-get_gauges_map() EXCLUSIVE_LOCKS_REQUIRED(gauges_map_lock) {
-  static std::unordered_map<
-      string, tensorflow::monitoring::Gauge<tensorflow::int64, 1>*>*
-      gauges_map = new std::unordered_map<
-          string, tensorflow::monitoring::Gauge<tensorflow::int64, 1>*>;
-  return gauges_map;
-}
-
-static tensorflow::mutex samplers_map_lock(tensorflow::LINKER_INITIALIZED);
-
-static std::unordered_map<string, tensorflow::monitoring::Sampler<1>*>*
-get_samplers_map() EXCLUSIVE_LOCKS_REQUIRED(samplers_map_lock) {
-  static std::unordered_map<string, tensorflow::monitoring::Sampler<1>*>*
-      samplers_map =
-          new std::unordered_map<string, tensorflow::monitoring::Sampler<1>*>;
-  return samplers_map;
-}
-
-void TFE_MonitoringSetGauge(const char* name, const char* label,
-                            int64_t value) {
-  tensorflow::mutex_lock l(gauges_map_lock);
-  auto gauges_map = get_gauges_map();
-  if (gauges_map->find(name) == gauges_map->end()) {
-    gauges_map->emplace(
-        name, tensorflow::monitoring::Gauge<tensorflow::int64, 1>::New(
-                  name,
-                  tensorflow::strings::StrCat(
-                      name, " :Gauge metric collected from Python API."),
-                  "metric_descriptor"));
-  }
-  gauges_map->at(name)->GetCell(label)->Set(value);
-}
-
-void TFE_MonitoringAddSampler(const char* name, const char* label,
-                              double value) {
-  tensorflow::mutex_lock l(samplers_map_lock);
-  auto samplers_map = get_samplers_map();
-  if (samplers_map->find(name) == samplers_map->end()) {
-    samplers_map->emplace(
-        name, tensorflow::monitoring::Sampler<1>::New(
-                  {name,
-                   tensorflow::strings::StrCat(
-                       name, " :Counter metric collected from Python API."),
-                   "metric_descriptor"},
-                  {tensorflow::monitoring::Buckets::Exponential(1, 2, 30)}));
-  }
-  samplers_map->at(name)->GetCell(label)->Add(value);
-}
-
 void TFE_MonitoringCounterCellIncrementBy(TFE_MonitoringCounterCell* cell,
                                           int64_t value) {
   cell->cell.IncrementBy(value);
@@ -166,6 +113,10 @@
                                                   const char* description) {
   auto* result = new TFE_MonitoringCounter0({name, description});
   Set_TF_Status_from_Status(status, result->counter->GetStatus());
+  if (!result->counter->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
   return result;
 }
 
@@ -185,6 +136,10 @@
                                                   const char* label1) {
   auto* result = new TFE_MonitoringCounter1({name, description, label1});
   Set_TF_Status_from_Status(status, result->counter->GetStatus());
+  if (!result->counter->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
   return result;
 }
 
@@ -206,6 +161,10 @@
   auto* result =
       new TFE_MonitoringCounter2({name, description, label1, label2});
   Set_TF_Status_from_Status(status, result->counter->GetStatus());
+  if (!result->counter->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
   return result;
 }
 
@@ -218,3 +177,344 @@
   return static_cast<TFE_MonitoringCounterCell*>(
       static_cast<void*>(counter->counter->GetCell(label1, label2)));
 }
+
+void TFE_MonitoringIntGaugeCellSet(TFE_MonitoringIntGaugeCell* cell,
+                                   int64_t value) {
+  cell->cell.Set(value);
+}
+
+int64_t TFE_MonitoringIntGaugeCellValue(TFE_MonitoringIntGaugeCell* cell) {
+  return cell->cell.value();
+}
+
+TFE_MonitoringIntGauge0* TFE_MonitoringNewIntGauge0(const char* name,
+                                                    TF_Status* status,
+                                                    const char* description) {
+  auto* result = new TFE_MonitoringIntGauge0({name, description});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteIntGauge0(TFE_MonitoringIntGauge0* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringIntGaugeCell* TFE_MonitoringGetCellIntGauge0(
+    TFE_MonitoringIntGauge0* gauge) {
+  return static_cast<TFE_MonitoringIntGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell()));
+}
+
+TFE_MonitoringIntGauge1* TFE_MonitoringNewIntGauge1(const char* name,
+                                                    TF_Status* status,
+                                                    const char* description,
+                                                    const char* label1) {
+  auto* result = new TFE_MonitoringIntGauge1({name, description, label1});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteIntGauge1(TFE_MonitoringIntGauge1* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringIntGaugeCell* TFE_MonitoringGetCellIntGauge1(
+    TFE_MonitoringIntGauge1* gauge, const char* label1) {
+  return static_cast<TFE_MonitoringIntGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1)));
+}
+
+TFE_MonitoringIntGauge2* TFE_MonitoringNewIntGauge2(const char* name,
+                                                    TF_Status* status,
+                                                    const char* description,
+                                                    const char* label1,
+                                                    const char* label2) {
+  auto* result =
+      new TFE_MonitoringIntGauge2({name, description, label1, label2});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteIntGauge2(TFE_MonitoringIntGauge2* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringIntGaugeCell* TFE_MonitoringGetCellIntGauge2(
+    TFE_MonitoringIntGauge2* gauge, const char* label1, const char* label2) {
+  return static_cast<TFE_MonitoringIntGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1, label2)));
+}
+
+void TFE_MonitoringStringGaugeCellSet(TFE_MonitoringStringGaugeCell* cell,
+                                      const char* value) {
+  cell->cell.Set({value});
+}
+
+const void TFE_MonitoringStringGaugeCellValue(
+    TFE_MonitoringStringGaugeCell* cell, TF_Buffer* buf) {
+  tensorflow::string value = cell->cell.value();
+  void* data = tensorflow::port::Malloc(value.length());
+  value.copy(static_cast<char*>(data), value.length(), 0);
+  buf->data = data;
+  buf->length = value.length();
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
+}
+
+TFE_MonitoringStringGauge0* TFE_MonitoringNewStringGauge0(
+    const char* name, TF_Status* status, const char* description) {
+  auto* result = new TFE_MonitoringStringGauge0({name, description});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteStringGauge0(TFE_MonitoringStringGauge0* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringStringGaugeCell* TFE_MonitoringGetCellStringGauge0(
+    TFE_MonitoringStringGauge0* gauge) {
+  return static_cast<TFE_MonitoringStringGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell()));
+}
+
+TFE_MonitoringStringGauge1* TFE_MonitoringNewStringGauge1(
+    const char* name, TF_Status* status, const char* description,
+    const char* label1) {
+  auto* result = new TFE_MonitoringStringGauge1({name, description, label1});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteStringGauge1(TFE_MonitoringStringGauge1* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringStringGaugeCell* TFE_MonitoringGetCellStringGauge1(
+    TFE_MonitoringStringGauge1* gauge, const char* label1) {
+  return static_cast<TFE_MonitoringStringGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1)));
+}
+
+TFE_MonitoringStringGauge2* TFE_MonitoringNewStringGauge2(
+    const char* name, TF_Status* status, const char* description,
+    const char* label1, const char* label2) {
+  auto* result =
+      new TFE_MonitoringStringGauge2({name, description, label1, label2});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteStringGauge2(TFE_MonitoringStringGauge2* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringStringGaugeCell* TFE_MonitoringGetCellStringGauge2(
+    TFE_MonitoringStringGauge2* gauge, const char* label1, const char* label2) {
+  return static_cast<TFE_MonitoringStringGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1, label2)));
+}
+
+void TFE_MonitoringBoolGaugeCellSet(TFE_MonitoringBoolGaugeCell* cell,
+                                    bool value) {
+  cell->cell.Set(value);
+}
+
+bool TFE_MonitoringBoolGaugeCellValue(TFE_MonitoringBoolGaugeCell* cell) {
+  return cell->cell.value();
+}
+
+TFE_MonitoringBoolGauge0* TFE_MonitoringNewBoolGauge0(const char* name,
+                                                      TF_Status* status,
+                                                      const char* description) {
+  auto* result = new TFE_MonitoringBoolGauge0({name, description});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteBoolGauge0(TFE_MonitoringBoolGauge0* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringBoolGaugeCell* TFE_MonitoringGetCellBoolGauge0(
+    TFE_MonitoringBoolGauge0* gauge) {
+  return static_cast<TFE_MonitoringBoolGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell()));
+}
+
+TFE_MonitoringBoolGauge1* TFE_MonitoringNewBoolGauge1(const char* name,
+                                                      TF_Status* status,
+                                                      const char* description,
+                                                      const char* label1) {
+  auto* result = new TFE_MonitoringBoolGauge1({name, description, label1});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteBoolGauge1(TFE_MonitoringBoolGauge1* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringBoolGaugeCell* TFE_MonitoringGetCellBoolGauge1(
+    TFE_MonitoringBoolGauge1* gauge, const char* label1) {
+  return static_cast<TFE_MonitoringBoolGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1)));
+}
+
+TFE_MonitoringBoolGauge2* TFE_MonitoringNewBoolGauge2(const char* name,
+                                                      TF_Status* status,
+                                                      const char* description,
+                                                      const char* label1,
+                                                      const char* label2) {
+  auto* result =
+      new TFE_MonitoringBoolGauge2({name, description, label1, label2});
+  Set_TF_Status_from_Status(status, result->gauge->GetStatus());
+  if (!result->gauge->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteBoolGauge2(TFE_MonitoringBoolGauge2* gauge) {
+  delete gauge;
+}
+
+TFE_MonitoringBoolGaugeCell* TFE_MonitoringGetCellBoolGauge2(
+    TFE_MonitoringBoolGauge2* gauge, const char* label1, const char* label2) {
+  return static_cast<TFE_MonitoringBoolGaugeCell*>(
+      static_cast<void*>(gauge->gauge->GetCell(label1, label2)));
+}
+
+void TFE_MonitoringSamplerCellAdd(TFE_MonitoringSamplerCell* cell,
+                                  double value) {
+  cell->cell.Add(value);
+}
+
+void TFE_MonitoringSamplerCellValue(TFE_MonitoringSamplerCell* cell,
+                                    TF_Buffer* buf) {
+  string content;
+  cell->cell.value().SerializeToString(&content);
+  void* data = tensorflow::port::Malloc(content.length());
+  content.copy(static_cast<char*>(data), content.length(), 0);
+  buf->data = data;
+  buf->length = content.length();
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
+}
+
+TFE_MonitoringBuckets* TFE_MonitoringNewExponentialBuckets(double scale,
+                                                           double growth_factor,
+                                                           int bucket_count) {
+  return new TFE_MonitoringBuckets([scale, growth_factor, bucket_count]() {
+    return tensorflow::monitoring::Buckets::Exponential(scale, growth_factor,
+                                                        bucket_count);
+  });
+}
+
+void TFE_MonitoringDeleteBuckets(TFE_MonitoringBuckets* buckets) {
+  delete buckets;
+}
+
+TFE_MonitoringSampler0* TFE_MonitoringNewSampler0(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* status,
+    const char* description) {
+  auto* result = new TFE_MonitoringSampler0(
+      {name, buckets->create_buckets(), description});
+  Set_TF_Status_from_Status(status, result->sampler->GetStatus());
+  if (!result->sampler->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteSampler0(TFE_MonitoringSampler0* sampler) {
+  delete sampler;
+}
+
+TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler0(
+    TFE_MonitoringSampler0* sampler) {
+  return static_cast<TFE_MonitoringSamplerCell*>(
+      static_cast<void*>(sampler->sampler->GetCell()));
+}
+
+TFE_MonitoringSampler1* TFE_MonitoringNewSampler1(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* status,
+    const char* description, const char* label1) {
+  auto* result = new TFE_MonitoringSampler1(
+      {name, buckets->create_buckets(), description, label1});
+  Set_TF_Status_from_Status(status, result->sampler->GetStatus());
+  if (!result->sampler->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteSampler1(TFE_MonitoringSampler1* sampler) {
+  delete sampler;
+}
+
+TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler1(
+    TFE_MonitoringSampler1* sampler, const char* label1) {
+  return static_cast<TFE_MonitoringSamplerCell*>(
+      static_cast<void*>(sampler->sampler->GetCell(label1)));
+}
+
+TFE_MonitoringSampler2* TFE_MonitoringNewSampler2(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* status,
+    const char* description, const char* label1, const char* label2) {
+  auto* result = new TFE_MonitoringSampler2(
+      {name, buckets->create_buckets(), description, label1, label2});
+  Set_TF_Status_from_Status(status, result->sampler->GetStatus());
+  if (!result->sampler->GetStatus().ok()) {
+    delete result;
+    return nullptr;
+  }
+  return result;
+}
+
+void TFE_MonitoringDeleteSampler2(TFE_MonitoringSampler2* sampler) {
+  delete sampler;
+}
+
+TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2(
+    TFE_MonitoringSampler2* sampler, const char* label1, const char* label2) {
+  return static_cast<TFE_MonitoringSamplerCell*>(
+      static_cast<void*>(sampler->sampler->GetCell(label1, label2)));
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 11fed82..4dc57e1e 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -87,19 +87,7 @@
     const char* service_addr, const char* logdir, const char* worker_list,
     bool include_dataset_ops, int duration_ms, int num_tracing_attempts);
 
-// Set the value of a Gauge metric. If the metric with given name does not
-// exist, it will create a new Gauge metric. Right now it only supports type
-// int64, consider to add more type supports if needed.
-TF_CAPI_EXPORT extern void TFE_MonitoringSetGauge(const char* name,
-                                                  const char* label,
-                                                  int64_t value);
-
-// Add the given value to a Sampler metric. If the metric with given name
-// does not exist, it will create a new Sampler metric.
-TF_CAPI_EXPORT extern void TFE_MonitoringAddSampler(const char* name,
-                                                    const char* label,
-                                                    double value);
-
+// TODO(fishx): Move these monitoring APIs into a separate file.
 // -----------------------------------------------------------------------------
 // Monitoring Counter APIs.
 // These APIs de-templated monitoring Counter for swig.
@@ -149,6 +137,179 @@
 TF_CAPI_EXPORT extern TFE_MonitoringCounterCell* TFE_MonitoringGetCellCounter2(
     TFE_MonitoringCounter2* counter, const char* label1, const char* label2);
 
+// -----------------------------------------------------------------------------
+// Monitoring Gauge APIs.
+// These APIs de-templated monitoring Gauge for swig.
+
+typedef struct TFE_MonitoringIntGaugeCell TFE_MonitoringIntGaugeCell;
+
+// Atomically set the value of the cell.
+TF_CAPI_EXPORT extern void TFE_MonitoringIntGaugeCellSet(
+    TFE_MonitoringIntGaugeCell* cell, int64_t value);
+
+// Retrieves the current value of the cell.
+TF_CAPI_EXPORT extern int64_t TFE_MonitoringIntGaugeCellValue(
+    TFE_MonitoringIntGaugeCell* cell);
+
+// APIs for Int Gauge without label.
+typedef struct TFE_MonitoringIntGauge0 TFE_MonitoringIntGauge0;
+TF_CAPI_EXPORT extern TFE_MonitoringIntGauge0* TFE_MonitoringNewIntGauge0(
+    const char* name, TF_Status* out_status, const char* description);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge0(
+    TFE_MonitoringIntGauge0* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell*
+TFE_MonitoringGetCellIntGauge0(TFE_MonitoringIntGauge0* gauge);
+
+// APIs for Int Gauge with 1 label.
+typedef struct TFE_MonitoringIntGauge1 TFE_MonitoringIntGauge1;
+TF_CAPI_EXPORT extern TFE_MonitoringIntGauge1* TFE_MonitoringNewIntGauge1(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge1(
+    TFE_MonitoringIntGauge1* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell*
+TFE_MonitoringGetCellIntGauge1(TFE_MonitoringIntGauge1* gauge,
+                               const char* label1);
+
+// APIs for Int Gauge with 2 labels.
+typedef struct TFE_MonitoringIntGauge2 TFE_MonitoringIntGauge2;
+TF_CAPI_EXPORT extern TFE_MonitoringIntGauge2* TFE_MonitoringNewIntGauge2(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1, const char* label2);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge2(
+    TFE_MonitoringIntGauge2* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell*
+TFE_MonitoringGetCellIntGauge2(TFE_MonitoringIntGauge2* gauge,
+                               const char* label1, const char* label2);
+
+typedef struct TFE_MonitoringStringGaugeCell TFE_MonitoringStringGaugeCell;
+TF_CAPI_EXPORT extern void TFE_MonitoringStringGaugeCellSet(
+    TFE_MonitoringStringGaugeCell* cell, const char* value);
+// Retrieves the string value and saves a copy of it in the buffer.
+TF_CAPI_EXPORT extern const void TFE_MonitoringStringGaugeCellValue(
+    TFE_MonitoringStringGaugeCell* cell, TF_Buffer* buf);
+
+// APIs for String Gauge without label.
+typedef struct TFE_MonitoringStringGauge0 TFE_MonitoringStringGauge0;
+TF_CAPI_EXPORT extern TFE_MonitoringStringGauge0* TFE_MonitoringNewStringGauge0(
+    const char* name, TF_Status* out_status, const char* description);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge0(
+    TFE_MonitoringStringGauge0* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell*
+TFE_MonitoringGetCellStringGauge0(TFE_MonitoringStringGauge0* gauge);
+
+// APIs for String Gauge with 1 label.
+typedef struct TFE_MonitoringStringGauge1 TFE_MonitoringStringGauge1;
+TF_CAPI_EXPORT extern TFE_MonitoringStringGauge1* TFE_MonitoringNewStringGauge1(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge1(
+    TFE_MonitoringStringGauge1* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell*
+TFE_MonitoringGetCellStringGauge1(TFE_MonitoringStringGauge1* gauge,
+                                  const char* label1);
+
+// APIs for String Gauge with 2 labels.
+typedef struct TFE_MonitoringStringGauge2 TFE_MonitoringStringGauge2;
+TF_CAPI_EXPORT extern TFE_MonitoringStringGauge2* TFE_MonitoringNewStringGauge2(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1, const char* label2);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge2(
+    TFE_MonitoringStringGauge2* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell*
+TFE_MonitoringGetCellStringGauge2(TFE_MonitoringStringGauge2* gauge,
+                                  const char* label1, const char* label2);
+
+typedef struct TFE_MonitoringBoolGaugeCell TFE_MonitoringBoolGaugeCell;
+TF_CAPI_EXPORT extern void TFE_MonitoringBoolGaugeCellSet(
+    TFE_MonitoringBoolGaugeCell* cell, bool value);
+TF_CAPI_EXPORT extern bool TFE_MonitoringBoolGaugeCellValue(
+    TFE_MonitoringBoolGaugeCell* cell);
+
+// APIs for Bool Gauge without label.
+typedef struct TFE_MonitoringBoolGauge0 TFE_MonitoringBoolGauge0;
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge0* TFE_MonitoringNewBoolGauge0(
+    const char* name, TF_Status* out_status, const char* description);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge0(
+    TFE_MonitoringBoolGauge0* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell*
+TFE_MonitoringGetCellBoolGauge0(TFE_MonitoringBoolGauge0* gauge);
+
+// APIs for Bool Gauge with 1 label.
+typedef struct TFE_MonitoringBoolGauge1 TFE_MonitoringBoolGauge1;
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge1* TFE_MonitoringNewBoolGauge1(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge1(
+    TFE_MonitoringBoolGauge1* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell*
+TFE_MonitoringGetCellBoolGauge1(TFE_MonitoringBoolGauge1* gauge,
+                                const char* label1);
+
+// APIs for Bool Gauge with 2 labels.
+typedef struct TFE_MonitoringBoolGauge2 TFE_MonitoringBoolGauge2;
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge2* TFE_MonitoringNewBoolGauge2(
+    const char* name, TF_Status* out_status, const char* description,
+    const char* label1, const char* label2);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge2(
+    TFE_MonitoringBoolGauge2* gauge);
+TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell*
+TFE_MonitoringGetCellBoolGauge2(TFE_MonitoringBoolGauge2* gauge,
+                                const char* label1, const char* label2);
+
+// -----------------------------------------------------------------------------
+// Monitoring Sampler APIs.
+// These APIs de-templated monitoring Sampler for swig.
+
+typedef struct TFE_MonitoringSamplerCell TFE_MonitoringSamplerCell;
+
+// Atomically adds the given value to the cell.
+TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellAdd(
+    TFE_MonitoringSamplerCell* cell, double value);
+
+// Retrieves the current value of the cell. The result is a serialized
+// HistogramProto saved in the buffer.
+TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellValue(
+    TFE_MonitoringSamplerCell* cell, TF_Buffer* buf);
+
+// APIs for sampler buckets
+typedef struct TFE_MonitoringBuckets TFE_MonitoringBuckets;
+TF_CAPI_EXPORT extern TFE_MonitoringBuckets*
+TFE_MonitoringNewExponentialBuckets(double scale, double growth_factor,
+                                    int bucket_count);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBuckets(
+    TFE_MonitoringBuckets* buckets);
+
+// APIs for Sampler without label.
+typedef struct TFE_MonitoringSampler0 TFE_MonitoringSampler0;
+TF_CAPI_EXPORT extern TFE_MonitoringSampler0* TFE_MonitoringNewSampler0(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status,
+    const char* description);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler0(
+    TFE_MonitoringSampler0* sampler);
+TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler0(
+    TFE_MonitoringSampler0* sampler);
+
+// APIs for Sampler with 1 label.
+typedef struct TFE_MonitoringSampler1 TFE_MonitoringSampler1;
+TF_CAPI_EXPORT extern TFE_MonitoringSampler1* TFE_MonitoringNewSampler1(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status,
+    const char* description, const char* label1);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler1(
+    TFE_MonitoringSampler1* sampler);
+TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler1(
+    TFE_MonitoringSampler1* sampler, const char* label1);
+
+// APIs for Sampler with 2 labels.
+typedef struct TFE_MonitoringSampler2 TFE_MonitoringSampler2;
+TF_CAPI_EXPORT extern TFE_MonitoringSampler2* TFE_MonitoringNewSampler2(
+    const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status,
+    const char* description, const char* label1, const char* label2);
+TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler2(
+    TFE_MonitoringSampler2* sampler);
+TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2(
+    TFE_MonitoringSampler2* sampler, const char* label1, const char* label2);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
index 3f45dd7..4e48a75 100644
--- a/tensorflow/c/eager/c_api_experimental_test.cc
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -131,23 +131,6 @@
   TFE_DeleteProfilerContext(profiler_context);
 }
 
-TEST(CAPI, MonitoringSetGauge) {
-  TFE_MonitoringSetGauge("test/gauge", "label", 1);
-  auto* collection_registry = monitoring::CollectionRegistry::Default();
-  monitoring::CollectionRegistry::CollectMetricsOptions options;
-  std::unique_ptr<monitoring::CollectedMetrics> metrics =
-      collection_registry->CollectMetrics(options);
-
-  EXPECT_EQ("test/gauge", metrics->point_set_map.at("test/gauge")->metric_name);
-  EXPECT_EQ(1,
-            metrics->point_set_map.at("test/gauge")->points.at(0)->int64_value);
-
-  TFE_MonitoringSetGauge("test/gauge", "label", 5);
-  metrics = collection_registry->CollectMetrics(options);
-  EXPECT_EQ(5,
-            metrics->point_set_map.at("test/gauge")->points.at(0)->int64_value);
-}
-
 TEST(CAPI, MonitoringCounter0) {
   TF_Status* status = TF_NewStatus();
   auto* counter =
@@ -200,8 +183,59 @@
   TFE_MonitoringDeleteCounter2(counter2);
 }
 
-TEST(CAPI, MonitoringAddSampler) {
-  TFE_MonitoringAddSampler("test/sampler", "label", 1.0);
+TEST(CAPI, MonitoringGauge0) {
+  TF_Status* status = TF_NewStatus();
+  auto* gauge = TFE_MonitoringNewIntGauge0("test/gauge", status, "test");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell = TFE_MonitoringGetCellIntGauge0(gauge);
+  TFE_MonitoringIntGaugeCellSet(cell, 1);
+  EXPECT_EQ(TFE_MonitoringIntGaugeCellValue(cell), 1);
+  auto* collection_registry = monitoring::CollectionRegistry::Default();
+  monitoring::CollectionRegistry::CollectMetricsOptions options;
+  std::unique_ptr<monitoring::CollectedMetrics> metrics =
+      collection_registry->CollectMetrics(options);
+
+  EXPECT_EQ("test/gauge", metrics->point_set_map.at("test/gauge")->metric_name);
+  EXPECT_EQ(1,
+            metrics->point_set_map.at("test/gauge")->points.at(0)->int64_value);
+
+  TFE_MonitoringIntGaugeCellSet(cell, 5);
+  metrics = collection_registry->CollectMetrics(options);
+  EXPECT_EQ(5,
+            metrics->point_set_map.at("test/gauge")->points.at(0)->int64_value);
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, MonitoringMultipleGauge) {
+  TF_Status* status = TF_NewStatus();
+  auto* gauge1 =
+      TFE_MonitoringNewBoolGauge1("test/gauge1", status, "test", "label1");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell1 = TFE_MonitoringGetCellBoolGauge1(gauge1, "foo");
+  TFE_MonitoringBoolGaugeCellSet(cell1, true);
+  EXPECT_TRUE(TFE_MonitoringBoolGaugeCellValue(cell1));
+  // A string gauge's value is copied out through a caller-owned TF_Buffer.
+  auto* gauge2 = TFE_MonitoringNewStringGauge2("test/gauge2", status, "test",
+                                               "label1", "label2");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell2 = TFE_MonitoringGetCellStringGauge2(gauge2, "foo", "bar");
+  TFE_MonitoringStringGaugeCellSet(cell2, "str");
+  TF_Buffer* buf = TF_NewBuffer();
+  TFE_MonitoringStringGaugeCellValue(cell2, buf);
+  string data(static_cast<const char*>(buf->data), buf->length);
+  TF_DeleteBuffer(buf);  // Runs data_deallocator; plain delete leaks the data.
+  EXPECT_EQ(data, "str");
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, MonitoringSampler0) {
+  TF_Status* status = TF_NewStatus();
+  auto* buckets = TFE_MonitoringNewExponentialBuckets(1.0, 2.0, 2);
+  auto* sampler =
+      TFE_MonitoringNewSampler0("test/sampler", buckets, status, "test");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell = TFE_MonitoringGetCellSampler0(sampler);
+  TFE_MonitoringSamplerCellAdd(cell, 1.0);
   auto* collection_registry = monitoring::CollectionRegistry::Default();
   monitoring::CollectionRegistry::CollectMetricsOptions options;
   std::unique_ptr<monitoring::CollectedMetrics> metrics =
@@ -213,11 +247,48 @@
                      ->points.at(0)
                      ->histogram_value.sum());
 
-  TFE_MonitoringAddSampler("test/sampler", "label", 5.0);
+  TFE_MonitoringSamplerCellAdd(cell, 5.0);
   metrics = collection_registry->CollectMetrics(options);
   EXPECT_EQ(6.0, metrics->point_set_map.at("test/sampler")
                      ->points.at(0)
                      ->histogram_value.sum());
+  TFE_MonitoringDeleteBuckets(buckets);
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, MonitoringMultipleSampler) {
+  TF_Status* status = TF_NewStatus();
+  auto* buckets = TFE_MonitoringNewExponentialBuckets(1.0, 2.0, 2);
+  auto* sampler1 = TFE_MonitoringNewSampler1("test/sampler1", buckets, status,
+                                             "test", "label1");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell1 = TFE_MonitoringGetCellSampler1(sampler1, "foo");
+  TFE_MonitoringSamplerCellAdd(cell1, 1.0);
+  TFE_MonitoringSamplerCellAdd(cell1, 2.0);
+  TF_Buffer* result1 = TF_NewBuffer();
+  TFE_MonitoringSamplerCellValue(cell1, result1);
+  tensorflow::HistogramProto histogram1;
+  EXPECT_TRUE(histogram1.ParseFromString(
+      {reinterpret_cast<const char*>(result1->data), result1->length}));
+  EXPECT_EQ(histogram1.sum(), 3.0);
+  TF_DeleteBuffer(result1);  // Also releases the serialized proto bytes.
+
+  auto* sampler2 = TFE_MonitoringNewSampler2("test/sampler2", buckets, status,
+                                             "test", "label1", "label2");
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* cell2 = TFE_MonitoringGetCellSampler2(sampler2, "foo", "bar");
+  TFE_MonitoringSamplerCellAdd(cell2, 2.0);
+  TFE_MonitoringSamplerCellAdd(cell2, 3.0);
+  TF_Buffer* result2 = TF_NewBuffer();
+  TFE_MonitoringSamplerCellValue(cell2, result2);
+  tensorflow::HistogramProto histogram2;
+  EXPECT_TRUE(histogram2.ParseFromString(
+      {reinterpret_cast<const char*>(result2->data), result2->length}));
+  EXPECT_EQ(histogram2.sum(), 5.0);
+  TF_DeleteBuffer(result2);  // Also releases the serialized proto bytes.
+
+  TFE_MonitoringDeleteBuckets(buckets);
+  TF_DeleteStatus(status);
 }
 
 }  // namespace
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 3071108..061b0e5 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -36,20 +36,14 @@
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
-#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
-#include "tensorflow/core/distributed_runtime/remote_device.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
-#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
-#include "tensorflow/core/distributed_runtime/server_lib.h"
-#include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/monitoring/gauge.h"
+#include "tensorflow/core/lib/monitoring/sampler.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/profiler/lib/profiler_session.h"
@@ -68,13 +62,16 @@
               const tensorflow::DeviceMgr* device_mgr, bool device_mgr_owned,
               tensorflow::Rendezvous* rendezvous,
               const tensorflow::CustomKernelCreator* custom_kernel_creator)
-      : context(opts,
-                static_cast<tensorflow::ContextDevicePlacementPolicy>(
-                    default_policy),
-                async, device_mgr, device_mgr_owned, rendezvous,
-                custom_kernel_creator) {}
+      : context(new tensorflow::EagerContext(
+            opts,
+            static_cast<tensorflow::ContextDevicePlacementPolicy>(
+                default_policy),
+            async, device_mgr, device_mgr_owned, rendezvous,
+            custom_kernel_creator)) {}
 
-  tensorflow::EagerContext context;
+  ~TFE_Context() { context->Unref(); }
+
+  tensorflow::EagerContext* context;
 };
 
 struct TFE_TensorHandle {
@@ -114,7 +111,7 @@
   TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
          const tensorflow::AttrTypeMap* t,
          TFE_OpInferenceContext* inference_ctx)
-      : operation(&ctx->context, op, is_function, t),
+      : operation(ctx->context, op, is_function, t),
         inference_ctx(inference_ctx) {}
 
   tensorflow::EagerOperation operation;
@@ -159,6 +156,98 @@
   using TFE_MonitoringCounter::TFE_MonitoringCounter;
 };
 
+struct TFE_MonitoringIntGaugeCell {  // Holds a GaugeCell<int64> by value.
+  tensorflow::monitoring::GaugeCell<tensorflow::int64> cell;
+};
+struct TFE_MonitoringStringGaugeCell {  // Holds a GaugeCell<string> by value.
+  tensorflow::monitoring::GaugeCell<tensorflow::string> cell;
+};
+struct TFE_MonitoringBoolGaugeCell {  // Holds a GaugeCell<bool> by value.
+  tensorflow::monitoring::GaugeCell<bool> cell;
+};
+
+// Owning wrapper around a tensorflow::monitoring::Gauge with value type
+// `ValueType` and `NumLabels` labels.
+template <typename ValueType, int NumLabels>
+struct TFE_MonitoringGauge {
+  // Creates the gauge through Gauge::New(name, description, label...).
+  // The label descriptors are forwarding references, so they are passed on
+  // with std::forward to preserve their value category.
+  template <typename... LabelDesc>
+  TFE_MonitoringGauge(const char* name, const char* description,
+                      LabelDesc&&... label)
+      : gauge(tensorflow::monitoring::Gauge<ValueType, NumLabels>::New(
+            name, description, std::forward<LabelDesc>(label)...)) {}
+
+  // Takes ownership of the pointer returned by Gauge::New().
+  std::unique_ptr<tensorflow::monitoring::Gauge<ValueType, NumLabels>> gauge;
+};
+
+struct TFE_MonitoringIntGauge0 : TFE_MonitoringGauge<tensorflow::int64, 0> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // int64 value, 0 labels.
+};
+struct TFE_MonitoringIntGauge1 : TFE_MonitoringGauge<tensorflow::int64, 1> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // int64 value, 1 label.
+};
+struct TFE_MonitoringIntGauge2 : TFE_MonitoringGauge<tensorflow::int64, 2> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // int64 value, 2 labels.
+};
+
+struct TFE_MonitoringStringGauge0 : TFE_MonitoringGauge<tensorflow::string, 0> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // string value, 0 labels.
+};
+struct TFE_MonitoringStringGauge1 : TFE_MonitoringGauge<tensorflow::string, 1> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // string value, 1 label.
+};
+struct TFE_MonitoringStringGauge2 : TFE_MonitoringGauge<tensorflow::string, 2> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // string value, 2 labels.
+};
+
+struct TFE_MonitoringBoolGauge0 : TFE_MonitoringGauge<bool, 0> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // bool value, 0 labels.
+};
+struct TFE_MonitoringBoolGauge1 : TFE_MonitoringGauge<bool, 1> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // bool value, 1 label.
+};
+struct TFE_MonitoringBoolGauge2 : TFE_MonitoringGauge<bool, 2> {
+  using TFE_MonitoringGauge::TFE_MonitoringGauge;  // bool value, 2 labels.
+};
+
+// Holds a callback that returns a std::unique_ptr to a
+// tensorflow::monitoring::Buckets object.
+struct TFE_MonitoringBuckets {
+  // `fn` is taken by value and moved into `create_buckets`, so the
+  // std::function is initialized once instead of being default-constructed
+  // and then copy-assigned.
+  TFE_MonitoringBuckets(
+      std::function<std::unique_ptr<tensorflow::monitoring::Buckets>(void)>
+          fn)
+      : create_buckets(std::move(fn)) {}
+
+  std::function<std::unique_ptr<tensorflow::monitoring::Buckets>(void)>
+      create_buckets;
+};
+
+struct TFE_MonitoringSamplerCell {  // Holds a SamplerCell by value.
+  tensorflow::monitoring::SamplerCell cell;
+};
+
+// Owning wrapper around a tensorflow::monitoring::Sampler with `NumLabels`
+// labels.
+template <int NumLabels>
+struct TFE_MonitoringSampler {
+  // Creates the sampler through
+  // Sampler::New({name, description, label...}, buckets). `buckets` is moved
+  // into that call, and the label descriptors (forwarding references) are
+  // passed on with std::forward to preserve their value category.
+  template <typename... LabelDesc>
+  TFE_MonitoringSampler(
+      const char* name,
+      std::unique_ptr<tensorflow::monitoring::Buckets> buckets,
+      const char* description, LabelDesc&&... label)
+      : sampler(tensorflow::monitoring::Sampler<NumLabels>::New(
+            {name, description, std::forward<LabelDesc>(label)...},
+            std::move(buckets))) {}
+
+  // Takes ownership of the pointer returned by Sampler::New().
+  std::unique_ptr<tensorflow::monitoring::Sampler<NumLabels>> sampler;
+};
+
+struct TFE_MonitoringSampler0 : TFE_MonitoringSampler<0> {
+  using TFE_MonitoringSampler::TFE_MonitoringSampler;  // Sampler, 0 labels.
+};
+struct TFE_MonitoringSampler1 : TFE_MonitoringSampler<1> {
+  using TFE_MonitoringSampler::TFE_MonitoringSampler;  // Sampler, 1 label.
+};
+struct TFE_MonitoringSampler2 : TFE_MonitoringSampler<2> {
+  using TFE_MonitoringSampler::TFE_MonitoringSampler;  // Sampler, 2 labels.
+};
+
 namespace tensorflow {
 // Set an AttrValue on the op. Doesn't handle the list types.
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index abc733b..1d57937 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -14,10 +14,11 @@
 ==============================================================================*/
 
 #include "tensorflow/c/eager/c_api.h"
-#include "tensorflow/c/eager/c_api_internal.h"
 
 #include <string.h>
+
 #include "absl/strings/match.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -297,6 +298,61 @@
   TestRemoteExecuteSilentCopies(true);
 }
 
+// Checks that a tensor handle living on a remote device can still be deleted
+// after the TFE_Context that produced it has been destroyed. `async` selects
+// whether the eager context runs in async mode.
+void TestRemoteExecuteDeleteTensorAfterContext(bool async) {
+  tensorflow::ServerDef cluster_def = GetServerDef(2);
+
+  // Snapshot taken while the task index is still 0; this is the definition
+  // handed to the eager context further down.
+  string client_def = cluster_def.SerializeAsString();
+
+  // Task 1 is served by an in-process gRPC worker.
+  cluster_def.set_task_index(1);
+
+  std::unique_ptr<tensorflow::GrpcServer> worker;
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(
+                  cluster_def, tensorflow::Env::Default(), &worker)
+                  .ok());
+  ASSERT_TRUE(worker->Start().ok());
+
+  TF_Status* tf_status = TF_NewStatus();
+
+  // Build the eager context with explicit device placement.
+  TFE_ContextOptions* ctx_options = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(ctx_options, static_cast<unsigned char>(async));
+  TFE_ContextOptionsSetDevicePlacementPolicy(ctx_options,
+                                             TFE_DEVICE_PLACEMENT_EXPLICIT);
+  TFE_Context* eager_ctx = TFE_NewContext(ctx_options, tf_status);
+  EXPECT_EQ(TF_OK, TF_GetCode(tf_status)) << TF_Message(tf_status);
+  TFE_DeleteContextOptions(ctx_options);
+
+  TFE_ContextSetServerDef(eager_ctx, 0, client_def.data(), client_def.size(),
+                          tf_status);
+  EXPECT_EQ(TF_OK, TF_GetCode(tf_status)) << TF_Message(tf_status);
+
+  // Copy a local matrix handle over to the CPU of task 1.
+  const char* const kRemoteDevice =
+      "/job:localhost/replica:0/task:1/device:CPU:0";
+  TFE_TensorHandle* local_handle = TestMatrixTensorHandle();
+  auto* remote_handle = TFE_TensorHandleCopyToDevice(local_handle, eager_ctx,
+                                                     kRemoteDevice, tf_status);
+  ASSERT_EQ(TF_OK, TF_GetCode(tf_status)) << TF_Message(tf_status);
+
+  TFE_DeleteTensorHandle(local_handle);
+
+  // Let pending async work finish, then tear the context down first.
+  TFE_ContextAsyncWait(eager_ctx, tf_status);
+  EXPECT_EQ(TF_OK, TF_GetCode(tf_status)) << TF_Message(tf_status);
+  TFE_DeleteContext(eager_ctx);
+
+  // The remote handle is intentionally released only after the context.
+  TFE_DeleteTensorHandle(remote_handle);
+
+  TF_DeleteStatus(tf_status);
+
+  // TODO(nareshmodi): Figure out how to correctly shut the server down.
+  worker.release();
+}
+
+TEST(CAPI, RemoteExecuteDeleteTensorAfterContext) {  // async = false
+  TestRemoteExecuteDeleteTensorAfterContext(false);
+}
+TEST(CAPI, RemoteExecuteDeleteTensorAfterContextAsync) {  // async = true
+  TestRemoteExecuteDeleteTensorAfterContext(true);
+}
+
 void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle,
                                     const std::vector<float>& expected_values) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
diff --git a/tensorflow/c/experimental/BUILD b/tensorflow/c/experimental/BUILD
new file mode 100644
index 0000000..b66969e
--- /dev/null
+++ b/tensorflow/c/experimental/BUILD
@@ -0,0 +1,122 @@
+# Description:
+# Experimental C APIs for TensorFlow.
+
+licenses(["notice"])  # Apache 2.0
+
+# All build macros used in this package come from tensorflow.bzl.
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_copts",
+    "tf_cuda_cc_test",
+    "tf_cuda_library",
+)
+
+# Implementation of the experimental rendezvous C API: rendezvous.cc together
+# with its public and internal headers. Visible only under //tensorflow/c.
+tf_cuda_library(
+    name = "rendezvous_internal",
+    srcs = ["rendezvous.cc"],
+    hdrs = [
+        "rendezvous.h",
+        "rendezvous_internal.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//tensorflow/c:__subpackages__"],
+    deps = [
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+    ],
+)
+
+# Public target for the experimental rendezvous C API: exposes rendezvous.h
+# and depends on :rendezvous_internal for the implementation.
+tf_cuda_library(
+    name = "rendezvous",
+    hdrs = ["rendezvous.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rendezvous_internal",
+        "//tensorflow/c:c_api",
+    ],
+)
+
+# Implementation of the experimental networking C API: network.cc together
+# with its public and internal headers. Visible only under //tensorflow/c.
+tf_cuda_library(
+    name = "network_internal",
+    srcs = ["network.cc"],
+    hdrs = [
+        "network.h",
+        "network_internal.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//tensorflow/c:__subpackages__"],
+    deps = [
+        ":rendezvous_internal",
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+    ],
+)
+
+# Public target for the experimental networking C API: exposes network.h and
+# depends on :network_internal for the implementation.
+tf_cuda_library(
+    name = "network",
+    hdrs = ["network.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":network_internal",
+        ":rendezvous",
+        "//tensorflow/c:c_api",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Tests
+
+tf_cuda_cc_test(
+    name = "network_test",
+    size = "medium",
+    srcs = ["network_test.cc"],
+    tags = ["noasan"],
+    # The dependencies must be dynamically linkable because the shared library
+    # has to be able to use core:framework, so the following stays disabled:
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":network",
+        ":network_internal",
+        ":rendezvous",
+        ":rendezvous_internal",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:env",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:session_mgr",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime:worker_session",
+        "//tensorflow/core/distributed_runtime/rpc:async_service_interface",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+    ],
+)
diff --git a/tensorflow/c/experimental/network.cc b/tensorflow/c/experimental/network.cc
new file mode 100644
index 0000000..9dfce1b
--- /dev/null
+++ b/tensorflow/c/experimental/network.cc
@@ -0,0 +1,166 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/c/experimental/network.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/experimental/network_internal.h"
+#include "tensorflow/c/experimental/rendezvous_internal.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+using tensorflow::ServerFactory;
+
+namespace tensorflow {
+
+// Builds a CGrpcServer for `server_def`, configures it to create a
+// CRendezvousMgr from `rendezvous_builder`, and calls `init_function` to
+// obtain the opaque context that is later passed to the other callbacks.
+//
+// On success `*out_server` takes ownership of the new server. On failure the
+// error is returned, `*out_server` is left untouched, and neither the server
+// nor the temporary TF_Status is leaked.
+/* static */ Status CGrpcServer::Create(
+    const ServerDef& server_def,
+    void* (*init_function)(const TF_GrpcServer*, TF_Status*),
+    void (*start_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*delete_function)(void*),
+    TF_RemoteRendezvousBuilder* rendezvous_builder,
+    std::unique_ptr<ServerInterface>* out_server) {
+  // Own the server from the start so the early returns below destroy it.
+  // Its destructor passes the current context to delete_function; that
+  // context is still nullptr if init_function has not run yet.
+  std::unique_ptr<CGrpcServer> grpc_server(new CGrpcServer(
+      server_def, start_function, stop_function, join_function,
+      delete_function));
+
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = [rendezvous_builder](const WorkerEnv* env) {
+    return new CRendezvousMgr(env, rendezvous_builder);
+  };
+  TF_RETURN_IF_ERROR(grpc_server->Init(options));
+
+  TF_Status* tf_status = TF_NewStatus();
+  grpc_server->SetContext(init_function(
+      reinterpret_cast<const TF_GrpcServer*>(grpc_server.get()), tf_status));
+  // Copy the result out before freeing tf_status, so the TF_Status is
+  // released on the error path too.
+  const Status init_status = tf_status->status;
+  TF_DeleteStatus(tf_status);
+  TF_RETURN_IF_ERROR(init_status);
+
+  out_server->reset(grpc_server.release());
+  return Status::OK();
+}
+
+// Starts the base GrpcServer, then invokes the start callback with the stored
+// context. The callback runs whether or not GrpcServer::Start() succeeded;
+// its status is folded into the result with Status::Update().
+Status CGrpcServer::Start() {
+  Status result = GrpcServer::Start();
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> callback_status(
+      TF_NewStatus(), TF_DeleteStatus);
+  start_function_(reinterpret_cast<const TF_GrpcServer*>(this), context_,
+                  callback_status.get());
+  result.Update(callback_status->status);
+  return result;
+}
+
+// Stops the base GrpcServer, then invokes the stop callback with the stored
+// context. The callback runs whether or not GrpcServer::Stop() succeeded;
+// its status is folded into the result with Status::Update().
+Status CGrpcServer::Stop() {
+  Status result = GrpcServer::Stop();
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> callback_status(
+      TF_NewStatus(), TF_DeleteStatus);
+  stop_function_(reinterpret_cast<const TF_GrpcServer*>(this), context_,
+                 callback_status.get());
+  result.Update(callback_status->status);
+  return result;
+}
+
+// Joins the base GrpcServer, then invokes the join callback with the stored
+// context. The callback runs whether or not GrpcServer::Join() succeeded;
+// its status is folded into the result with Status::Update().
+Status CGrpcServer::Join() {
+  Status result = GrpcServer::Join();
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> callback_status(
+      TF_NewStatus(), TF_DeleteStatus);
+  join_function_(reinterpret_cast<const TF_GrpcServer*>(this), context_,
+                 callback_status.get());
+  result.Update(callback_status->status);
+  return result;
+}
+
+namespace {
+// ServerFactory implementation that produces CGrpcServer instances wired to
+// the C callbacks supplied at construction time.
+class CServerFactory : public ServerFactory {
+ public:
+  // Records the callbacks and the rendezvous builder pointer. Nothing is
+  // invoked here; the pointers are simply stored for later use.
+  CServerFactory(bool (*accept_function)(const char*),
+                 void* (*init_function)(const TF_GrpcServer*, TF_Status*),
+                 void (*start_function)(const TF_GrpcServer*, void*,
+                                        TF_Status*),
+                 void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+                 void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+                 void (*delete_function)(void*),
+                 TF_RemoteRendezvousBuilder* rendezvous_builder)
+      : accept_function_(accept_function),
+        init_function_(init_function),
+        start_function_(start_function),
+        stop_function_(stop_function),
+        join_function_(join_function),
+        delete_function_(delete_function),
+        rendezvous_builder_(rendezvous_builder) {}
+
+  // Delegates to CGrpcServer::Create() with the stored callbacks and returns
+  // its status unchanged.
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return CGrpcServer::Create(server_def, init_function_, start_function_,
+                               stop_function_, join_function_,
+                               delete_function_, rendezvous_builder_,
+                               out_server);
+  }
+
+  // Asks the accept callback whether this factory handles the protocol named
+  // in `server_def`.
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    const char* protocol = server_def.protocol().c_str();
+    return accept_function_(protocol);
+  }
+
+ private:
+  bool (*accept_function_)(const char* protocol);
+  void* (*init_function_)(const TF_GrpcServer*, TF_Status*);
+  void (*start_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*stop_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*join_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*delete_function_)(void*);
+  TF_RemoteRendezvousBuilder* rendezvous_builder_;
+};
+}  // namespace
+}  // namespace tensorflow
+
+// Server factory handle used by the C API.
+// Holds a raw CServerFactory pointer; this struct has no destructor for it.
+struct TF_GrpcServerFactory {
+  ::tensorflow::CServerFactory* factory;
+};
+
+// Allocates a CServerFactory from the given callbacks and rendezvous builder
+// and returns a newly allocated TF_GrpcServerFactory wrapper pointing at it.
+TF_GrpcServerFactory* TF_NewGrpcServerFactory(
+    bool (*accept_function)(const char*),
+    void* (*init_function)(const TF_GrpcServer*, TF_Status*),
+    void (*start_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*delete_function)(void*),
+    TF_RemoteRendezvousBuilder* rendezvous_builder) {
+  auto* cpp_factory = new ::tensorflow::CServerFactory(
+      accept_function, init_function, start_function, stop_function,
+      join_function, delete_function, rendezvous_builder);
+  return new TF_GrpcServerFactory{cpp_factory};
+}
+
+void TF_DeleteGrpcServerFactory(TF_GrpcServerFactory* server_factory) {
+  DCHECK_NE(server_factory, nullptr);
+  delete server_factory;  // Frees the wrapper only, not the wrapped factory.
+}
+
+void TF_RegisterGrpcServerFactory(const char* server_type,
+                                  TF_GrpcServerFactory* server_factory) {
+  ServerFactory::Register(server_type, server_factory->factory);
+}  // Passes the wrapped CServerFactory pointer to ServerFactory::Register().
diff --git a/tensorflow/c/experimental/network.h b/tensorflow/c/experimental/network.h
new file mode 100644
index 0000000..bd74ec8
--- /dev/null
+++ b/tensorflow/c/experimental/network.h
@@ -0,0 +1,97 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_
+#define TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/experimental/rendezvous.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --------------------------------------------------------------------------
+// C API for TensorFlow Networking.
+// NOTE: This API is unstable and almost certainly will change in the near
+// future.
+//
+// Users wishing to register a custom GrpcServer should call
+// TF_NewGrpcServerFactory and then TF_RegisterGrpcServerFactory.
+//
+// Example:
+// ```c++
+// auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder(
+//     rendezvous_init_function,
+//     receive_from_remote_async_function,
+//     rendezvous_delete_function);
+//
+// TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory(
+//     accept_function,
+//     init_function,
+//     start_function,
+//     stop_function,
+//     join_function,
+//     delete_function,
+//     rendezvous_builder);
+// TF_RegisterGrpcServerFactory("customfactory", factory);
+// ...
+// TF_DeleteGrpcServerFactory(factory);
+// ```
+
+typedef struct TF_GrpcServerFactory TF_GrpcServerFactory;
+typedef struct TF_GrpcServerOptions TF_GrpcServerOptions;
+typedef struct TF_GrpcServer TF_GrpcServer;
+typedef struct TF_ServerContext {
+  TF_GrpcServer* const server;  // Const pointer: cannot be reseated.
+  void* context;  // Opaque pointer. NOTE(review): confirm who sets/owns it.
+} TF_ServerContext;
+
+// Creates a new TF_GrpcServerFactory instance. The caller takes ownership
+// of the returned instance and should deallocate it by calling
+// TF_DeleteGrpcServerFactory.
+// accept_function should return true if this ServerFactory can create
+// server instances for the given protocol name (e.g. grpc+verbs).
+// GRPC servers created by this factory will call the provided
+// init_function, start_function, stop_function, join_function and
+// delete_function.
+//
+// Note that clean shutdown is currently not implemented for GrpcServer.
+// So, stop_function will never be called now but may be in the future
+// when a stop mechanism is supported.
+TF_CAPI_EXPORT extern TF_GrpcServerFactory* TF_NewGrpcServerFactory(
+    bool (*accept_function)(const char*),
+    void* (*init_function)(const TF_GrpcServer*, TF_Status*),
+    void (*start_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+    void (*delete_function)(void*),
+    TF_RemoteRendezvousBuilder* rendezvous_builder);
+
+// Deletes a TF_GrpcServerFactory instance.
+// Note that this function only deletes the TF_GrpcServerFactory wrapper.
+// The underlying server factory is not deleted and, if it was registered,
+// remains registered.
+TF_CAPI_EXPORT extern void TF_DeleteGrpcServerFactory(
+    TF_GrpcServerFactory* server_factory);
+
+// Registers provided server_factory for the given server_type.
+// server_type must be unique to the server factory.
+TF_CAPI_EXPORT extern void TF_RegisterGrpcServerFactory(
+    const char* server_type, TF_GrpcServerFactory* server_factory);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+#endif  // TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_
diff --git a/tensorflow/c/experimental/network_internal.h b/tensorflow/c/experimental/network_internal.h
new file mode 100644
index 0000000..c257529
--- /dev/null
+++ b/tensorflow/c/experimental/network_internal.h
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_
+
+#include <memory>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/experimental/network.h"
+#include "tensorflow/c/experimental/rendezvous.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace tensorflow {
+
+// GrpcServer that additionally works with a set of user-supplied C callbacks.
+// An opaque context pointer, set through SetContext(), is stored next to the
+// callbacks and is passed to delete_function when the server is destroyed.
+class CGrpcServer : public GrpcServer {
+ public:
+  // Creates a CGrpcServer for `server_def` and stores it in `*out_server`.
+  static Status Create(
+      const ServerDef& server_def,
+      void* (*init_function)(const TF_GrpcServer*, TF_Status*),
+      void (*start_function)(const TF_GrpcServer*, void*, TF_Status*),
+      void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+      void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+      void (*delete_function)(void*),
+      TF_RemoteRendezvousBuilder* rendezvous_builder,
+      std::unique_ptr<ServerInterface>* out_server);
+
+  // Hands the stored context (possibly nullptr) to the delete callback.
+  ~CGrpcServer() override { delete_function_(context_); }
+
+  Status Start() override;
+  Status Stop() override;
+  Status Join() override;
+
+ protected:
+  // Only records the callbacks; the context starts out as nullptr. The base
+  // class is constructed with the default Env.
+  CGrpcServer(const ServerDef& server_def,
+              void (*start_function)(const TF_GrpcServer*, void*, TF_Status*),
+              void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*),
+              void (*join_function)(const TF_GrpcServer*, void*, TF_Status*),
+              void (*delete_function)(void*))
+      : GrpcServer(server_def, ::tensorflow::Env::Default()),
+        start_function_(start_function),
+        stop_function_(stop_function),
+        join_function_(join_function),
+        delete_function_(delete_function) {}
+
+  // Replaces the opaque pointer passed to the callbacks.
+  void SetContext(void* context) { context_ = context; }
+
+ private:
+  void (*start_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*stop_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*join_function_)(const TF_GrpcServer*, void*, TF_Status*);
+  void (*delete_function_)(void*);
+  void* context_ = nullptr;
+
+  // Lets the test fixture read context_.
+  friend class NetworksTest;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_
diff --git a/tensorflow/c/experimental/network_test.cc b/tensorflow/c/experimental/network_test.cc
new file mode 100644
index 0000000..39f7e64
--- /dev/null
+++ b/tensorflow/c/experimental/network_test.cc
@@ -0,0 +1,256 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/c/experimental/network.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "absl/synchronization/notification.h"
+#include "absl/time/time.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/experimental/network_internal.h"
+#include "tensorflow/c/experimental/rendezvous.h"
+#include "tensorflow/c/experimental/rendezvous_internal.h"
+#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/distributed_runtime/worker_session.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace tensorflow {
+
+// Accept callback for test factory A: true only for protocol "grpc+A".
+bool accept_functionA(const char* protocol_name) {
+  static constexpr char kAcceptedProtocol[] = "grpc+A";
+  return strcmp(protocol_name, kAcceptedProtocol) == 0;
+}
+
+// Accept callback for test factory B: true only for protocol "grpc+B".
+bool accept_functionB(const char* protocol_name) {
+  static constexpr char kAcceptedProtocol[] = "grpc+B";
+  return strcmp(protocol_name, kAcceptedProtocol) == 0;
+}
+
+struct SomeServerData {  // Test server context; created by init_function().
+  bool server_started = false;  // Set to true by start_function().
+};
+
+struct SomeRendezvousData {  // Created by rendezvous_init_function().
+  int test = 0;
+};
+
+// Init callback: reports TF_OK through `status` and returns a heap-allocated
+// SomeServerData. `server` is unused.
+void* init_function(const TF_GrpcServer* server, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  return new SomeServerData();
+}
+
+// Start callback: treats `context` as a SomeServerData, flags it as started
+// and reports TF_OK through `status`. `server` is unused.
+void start_function(const TF_GrpcServer* server, void* context,
+                    TF_Status* status) {
+  static_cast<SomeServerData*>(context)->server_started = true;
+  TF_SetStatus(status, TF_OK, "");
+}
+
+void stop_function(const TF_GrpcServer* server, void* context,
+                   TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");  // Stop callback: only reports success.
+}
+
+void join_function(const TF_GrpcServer* server, void* context,
+                   TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");  // Join callback: only reports success.
+}
+
+// Delete callback: frees `context` as a SomeServerData (null is a no-op).
+void delete_function(void* context) {
+  delete static_cast<SomeServerData*>(context);
+}
+
+void* rendezvous_init_function(void* server_context) {
+  return new SomeRendezvousData();  // `server_context` is ignored.
+}
+
+// Tensor deallocator: returns `data` to the CPU allocator and then sets the
+// bool that `arg` points to, so `arg` must be a valid, non-null bool*.
+void Deallocator(void* data, size_t, void* arg) {
+  bool* was_called = reinterpret_cast<bool*>(arg);
+  tensorflow::cpu_allocator()->DeallocateRaw(data);
+  *was_called = true;
+}
+
+// Remote-receive callback for the test rendezvous: builds a 2x3 float tensor,
+// reports it with TF_OK through `callback`, then releases its own TF_Status
+// and TF_Tensor handles. `key`, `args` and `context` are unused.
+void receive_from_remote_async_function(TF_ParsedKey* key,
+                                        TF_RendezvousArgs* args,
+                                        TF_RendezvousDoneCallback* callback,
+                                        void* context) {
+  // Backing storage for six floats; the contents are never written.
+  int64_t shape[] = {2, 3};
+  const int byte_size = 6 * sizeof(float);
+  void* raw_buffer = tensorflow::cpu_allocator()->AllocateRaw(
+      EIGEN_MAX_ALIGN_BYTES, byte_size);
+  float* buffer = reinterpret_cast<float*>(raw_buffer);
+
+  // NOTE(review): Deallocator() writes through this pointer, so the flag has
+  // to outlive the tensor buffer. Confirm nothing keeps the buffer alive past
+  // the end of this function.
+  bool deallocator_called = false;
+  TF_Tensor* result = TF_NewTensor(TF_FLOAT, shape, 2, buffer, byte_size,
+                                   &Deallocator, &deallocator_called);
+
+  TF_Status* ok_status = TF_NewStatus();
+  TF_SetStatus(ok_status, TF_OK, "");
+
+  callback->tensor = result;
+  callback->status = ok_status;
+  TF_RendezvousDone(callback);
+
+  TF_DeleteStatus(ok_status);
+  TF_DeleteTensor(result);
+}
+
+// Rendezvous delete callback: frees `context` as a SomeRendezvousData.
+void rendezvous_delete_function(void* context) {
+  delete static_cast<SomeRendezvousData*>(context);
+}
+
+// Builds a ServerDef for task 0 of a single job named `job_name` that speaks
+// `protocol`. The job gets `num_tasks` tasks, each on "localhost:<port>" with
+// a port from PickUnusedPortOrDie().
+tensorflow::ServerDef GetServerDef(const string& protocol,
+                                   const string& job_name, int num_tasks) {
+  tensorflow::ServerDef result;
+  result.set_protocol(protocol);
+  result.set_job_name(job_name);
+  result.set_task_index(0);
+
+  tensorflow::JobDef* job = result.mutable_cluster()->add_job();
+  job->set_name(job_name);
+  auto* tasks = job->mutable_tasks();
+  for (int task = 0; task < num_tasks; ++task) {
+    const int port = tensorflow::testing::PickUnusedPortOrDie();
+    tasks->insert({task, tensorflow::strings::StrCat("localhost:", port)});
+  }
+  return result;
+}
+
+// Test fixture. It is a friend of CGrpcServer, which lets it read the
+// server's private context pointer.
+class NetworksTest : public ::testing::Test {
+ public:
+  ~NetworksTest() override = default;
+
+  // Returns the server's context as SomeServerData, recording a non-fatal
+  // failure if the context is null.
+  SomeServerData* GetServerData(CGrpcServer* server) {
+    void* raw_context = server->context_;
+    EXPECT_NE(raw_context, nullptr);
+    return static_cast<SomeServerData*>(raw_context);
+  }
+};
+
+// Builds a rendezvous key from the given parts (frame/iteration fixed at
+// 0/0) and returns its parsed form. CHECK-fails if parsing is not OK.
+Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
+                          const string& receiver, const string& name) {
+  const string full_key = Rendezvous::CreateKey(sender, incarnation, receiver,
+                                                name, FrameAndIter(0, 0));
+  Rendezvous::ParsedKey parsed;
+  CHECK(Rendezvous::ParseKey(full_key, &parsed).ok());
+  return parsed;
+}
+
+// Creates the worker session "test_0" on `grpc_server`, looks it up again,
+// and initializes `remote_rendezvous` with it. Each step is checked with a
+// non-fatal TF_EXPECT_OK.
+void InitializeRendezvous(GrpcServer* grpc_server, ServerDef* server_def,
+                          RemoteRendezvous* remote_rendezvous) {
+  const int rendezvous_id = 0;
+  const auto session_name =
+      tensorflow::strings::StrCat("test_", rendezvous_id);
+  auto* session_mgr = grpc_server->worker_env()->session_mgr;
+
+  TF_EXPECT_OK(session_mgr->CreateSession(session_name, *server_def, true));
+
+  std::shared_ptr<tensorflow::WorkerSession> worker_session;
+  TF_EXPECT_OK(
+      session_mgr->WorkerSessionForSession(session_name, &worker_session));
+
+  TF_EXPECT_OK(remote_rendezvous->Initialize(worker_session.get()));
+}
+
+// Registers a callback-based server factory for protocol "grpc+A", creates a
+// server through NewServer() and checks that start_function runs when the
+// server is started.
+TEST_F(NetworksTest, TestStartServer) {
+  auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder(
+      rendezvous_init_function, receive_from_remote_async_function,
+      rendezvous_delete_function);
+
+  TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory(
+      accept_functionA, init_function, start_function, stop_function,
+      join_function, delete_function, rendezvous_builder);
+  TF_RegisterGrpcServerFactory("testfactoryA", factory);
+
+  ServerDef server_def = GetServerDef("grpc+A", "localhost", 1);
+  std::unique_ptr<ServerInterface> server;
+  // Fatal on failure: everything below dereferences `server`.
+  TF_ASSERT_OK(NewServer(server_def, &server));
+  auto* grpc_server = static_cast<CGrpcServer*>(server.get());
+  auto* server_data = GetServerData(grpc_server);
+  // Fatal on failure: `server_data` is dereferenced right below.
+  ASSERT_NE(server_data, nullptr);
+  EXPECT_FALSE(server_data->server_started);
+
+  // Non-fatal checks from here on, so the cleanup below (including
+  // server.release()) still runs if one of them fails.
+  TF_EXPECT_OK(server->Start());
+  EXPECT_TRUE(server_data->server_started);
+
+  TF_DeleteGrpcServerFactory(factory);
+  TF_DeleteRemoteRendezvousBuilder(rendezvous_builder);
+  // TODO(annarev): find a clean way to shutdown server.
+  server.release();
+}
+
+// Registers a callback-based server factory for protocol "grpc+B", starts a
+// server created through NewServer(), and checks that a RecvAsync() on its
+// remote rendezvous invokes the done callback.
+TEST_F(NetworksTest, TestReceiveData) {
+  auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder(
+      rendezvous_init_function, receive_from_remote_async_function,
+      rendezvous_delete_function);
+
+  TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory(
+      accept_functionB, init_function, start_function, stop_function,
+      join_function, delete_function, rendezvous_builder);
+  TF_RegisterGrpcServerFactory("testfactoryB", factory);
+
+  ServerDef server_def = GetServerDef("grpc+B", "localhost", 1);
+  std::unique_ptr<ServerInterface> server;
+  // Fatal on failure: everything below dereferences `server`.
+  TF_ASSERT_OK(NewServer(server_def, &server));
+  auto* grpc_server = static_cast<CGrpcServer*>(server.get());
+
+  TF_EXPECT_OK(server->Start());
+  auto* rendezvous_mgr = grpc_server->worker_env()->rendezvous_mgr;
+  auto* remote_rendezvous = rendezvous_mgr->Find(0);
+
+  auto key = Key("/job:localhost/replica:1/task:2/device:CPU:0", 1,
+                 "/job:localhost/replica:0/task:0/device:CPU:0", "test");
+  Rendezvous::Args args;
+  bool done_callback_called = false;
+  absl::Notification notification;
+
+  InitializeRendezvous(grpc_server, &server_def, remote_rendezvous);
+  remote_rendezvous->RecvAsync(
+      key, args,
+      [&done_callback_called, &notification](
+          const Status&, const Rendezvous::Args&, const Rendezvous::Args&,
+          const Tensor&, const bool) {
+        done_callback_called = true;
+        notification.Notify();
+      });
+
+  // Use the wait result instead of ignoring it. The flag is read only once
+  // the notification has fired, which orders the read after the callback's
+  // write. The checks are non-fatal so the cleanup below (including
+  // server.release()) still runs when they fail.
+  // NOTE(review): on timeout the callback could still fire later and touch
+  // these stack variables. Confirm whether that can happen in practice.
+  const bool notified =
+      notification.WaitForNotificationWithTimeout(absl::Seconds(10));
+  EXPECT_TRUE(notified) << "RecvAsync done callback did not run in time";
+  if (notified) {
+    EXPECT_TRUE(done_callback_called);
+  }
+
+  TF_DeleteGrpcServerFactory(factory);
+  TF_DeleteRemoteRendezvousBuilder(rendezvous_builder);
+  // Server doesn't have a clean shutdown.
+  server.release();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/c/experimental/rendezvous.cc b/tensorflow/c/experimental/rendezvous.cc
new file mode 100644
index 0000000..0ee4907
--- /dev/null
+++ b/tensorflow/c/experimental/rendezvous.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/c/experimental/rendezvous.h"
+
+#include <functional>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/experimental/rendezvous_internal.h"
+#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace tensorflow {
+
+CRemoteRendezvous::CRemoteRendezvous(const WorkerEnv* env, int64 step_id,
+                                     void (*receive_from_remote_async_function)(
+                                         TF_ParsedKey*, TF_RendezvousArgs*,
+                                         TF_RendezvousDoneCallback*,
+                                         void* context),
+                                     void (*delete_function)(void* context),
+                                     void* server_context)  // Unused here.
+    : BaseRemoteRendezvous(env, step_id),
+      receive_from_remote_async_function_(receive_from_remote_async_function),
+      delete_function_(delete_function),
+      context_(nullptr) {}  // Replaced later through SetContext().
+
+void CRemoteRendezvous::RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
+                                            const Rendezvous::Args& args,
+                                            DoneCallback done) {
+  // Heap-allocated C mirrors of `args`; TF_RendezvousDone deletes them.
+  auto* c_alloc_attrs = new TF_AllocatorAttributes();
+  c_alloc_attrs->on_host = args.alloc_attrs.on_host();
+  c_alloc_attrs->nic_compatible = args.alloc_attrs.nic_compatible();
+  c_alloc_attrs->value = args.alloc_attrs.value;
+  c_alloc_attrs->scope_id = args.alloc_attrs.scope_id;
+
+  auto* c_device_context = new TF_DeviceContext();
+  c_device_context->context = args.device_context;
+  auto* c_args = new TF_RendezvousArgs();
+  c_args->alloc_attrs = c_alloc_attrs;
+  c_args->device_context = c_device_context;
+
+  auto* c_done = new TF_RendezvousDoneCallback();
+  c_done->recv_args = c_args;
+  c_done->done_callback = done;
+
+  // `c_key` lives on the stack: it is only valid for the duration of the call.
+  TF_ParsedKey c_key;
+  c_key.full_key = parsed.FullKey().data();
+  c_key.full_key_len = parsed.FullKey().size();
+  c_key.src_device = parsed.src_device.data();
+  c_key.src_device_len = parsed.src_device.size();
+  c_key.dst_device = parsed.dst_device.data();
+  c_key.dst_device_len = parsed.dst_device.size();
+  receive_from_remote_async_function_(&c_key, c_args, c_done, context_);
+}
+// Hands the stored context pointer to the user-supplied delete hook.
+CRemoteRendezvous::~CRemoteRendezvous() { delete_function_(context_); }
+}  // namespace tensorflow
+
+TF_RemoteRendezvousBuilder* TF_NewRemoteRendezvousBuilder(
+    void* (*init_function)(void* server_context),
+    void (*receive_from_remote_async_function)(TF_ParsedKey*,
+                                               TF_RendezvousArgs*,
+                                               TF_RendezvousDoneCallback*,
+                                               void* context),
+    void (*delete_function)(void* context)) {
+  auto* result = new TF_RemoteRendezvousBuilder();  // server_context = null
+  result->receive_from_remote_async_function =
+      receive_from_remote_async_function;
+  result->init_function = init_function;
+  result->delete_function = delete_function;
+  return result;
+}
+
+void TF_DeleteRemoteRendezvousBuilder(
+    TF_RemoteRendezvousBuilder* rendezvous_builder) {
+  DCHECK_NE(rendezvous_builder, nullptr);
+  delete rendezvous_builder;  // Frees only the builder struct itself.
+}
+
+TF_CAPI_EXPORT extern void TF_RendezvousDone(
+    TF_RendezvousDoneCallback* callback) {
+  DCHECK_NE(callback, nullptr);
+  ::tensorflow::Tensor tensor;
+  TF_CHECK_OK(TF_TensorToTensor(callback->tensor, &tensor));
+  // Rebuild the C++ receive args, checking for null *before* dereferencing.
+  const TF_RendezvousArgs* cargs = callback->recv_args;
+  ::tensorflow::Rendezvous::Args recv_args;
+  if (cargs != nullptr && cargs->alloc_attrs != nullptr) {
+    recv_args.alloc_attrs.value = cargs->alloc_attrs->value;
+    recv_args.alloc_attrs.scope_id = cargs->alloc_attrs->scope_id;
+  }
+  if (cargs != nullptr && cargs->device_context != nullptr) {
+    recv_args.device_context = cargs->device_context->context;
+  }
+  ::tensorflow::Rendezvous::Args sent_args;  // Left default-constructed.
+  callback->done_callback(callback->status->status, sent_args, recv_args,
+                          tensor, callback->dead);
+  if (cargs != nullptr) {  // `tensor` and `status` stay with the caller.
+    delete cargs->alloc_attrs;  // Deleting a null pointer is a no-op.
+    delete cargs->device_context;
+    delete cargs;
+  }
+  delete callback;
+}
diff --git a/tensorflow/c/experimental/rendezvous.h b/tensorflow/c/experimental/rendezvous.h
new file mode 100644
index 0000000..5b007d5
--- /dev/null
+++ b/tensorflow/c/experimental/rendezvous.h
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_
+#define TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_
+
+#include "tensorflow/c/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --------------------------------------------------------------------------
+// C API for Rendezvous.
+// NOTE: This API is unstable and almost certainly will change in the near
+// future.
+//
+// Custom rendezvous allows for custom implementations of Recv call.
+//
+// Users wishing to create custom rendezvous objects should call
+// TF_NewRemoteRendezvousBuilder and pass the returned
+// TF_RemoteRendezvousBuilder to TF_NewGrpcServerFactory.
+
+typedef struct TF_RemoteRendezvousBuilder TF_RemoteRendezvousBuilder;
+typedef struct TF_ParsedKey TF_ParsedKey;
+typedef struct TF_RendezvousArgs TF_RendezvousArgs;
+typedef struct TF_RendezvousDoneCallback TF_RendezvousDoneCallback;
+
+// Creates a new TF_RemoteRendezvousBuilder instance.
+// Rendezvous instances will forward calls to init_function,
+// receive_from_remote_async_function and delete_function passed here.
+//
+// Note that the receive_from_remote_async_function implementation must call
+// TF_RendezvousDone with the TF_RendezvousDoneCallback passed as an argument.
+TF_CAPI_EXPORT extern TF_RemoteRendezvousBuilder* TF_NewRemoteRendezvousBuilder(
+    void* (*init_function)(void* server_context),
+    void (*receive_from_remote_async_function)(TF_ParsedKey*,
+                                               TF_RendezvousArgs*,
+                                               TF_RendezvousDoneCallback*,
+                                               void* context),
+    void (*delete_function)(void* context));
+
+// Deletes TF_RemoteRendezvousBuilder instances.
+TF_CAPI_EXPORT extern void TF_DeleteRemoteRendezvousBuilder(
+    TF_RemoteRendezvousBuilder* rendezvous_builder);
+
+// Invokes the done callback stored in `callback`, then destroys the callback
+// instance and its members except `tensor` and `status`. The caller is
+// responsible for deleting both after TF_RendezvousDone returns.
+TF_CAPI_EXPORT extern void TF_RendezvousDone(
+    TF_RendezvousDoneCallback* callback);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+#endif  // TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_
diff --git a/tensorflow/c/experimental/rendezvous_internal.h b/tensorflow/c/experimental/rendezvous_internal.h
new file mode 100644
index 0000000..f066860
--- /dev/null
+++ b/tensorflow/c/experimental/rendezvous_internal.h
@@ -0,0 +1,135 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_
+
+#include <stddef.h>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/experimental/rendezvous.h"
+#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/platform/macros.h"
+
+struct TF_ParsedKey {
+  // Pointer/length pairs; the char* members might not be null-terminated.
+  const char* src_device;
+  size_t src_device_len;
+  const char* dst_device;
+  size_t dst_device_len;
+  const char* full_key;
+  size_t full_key_len;
+};
+
+struct TF_AllocatorAttributes {
+  bool on_host = false;
+  bool nic_compatible = false;
+  // NOTE: The upper 8 bits of the value are reserved for
+  // device-specific uses.  Implementors of a device can interpret these
+  // upper 8 bits in device-specific ways, and ops implemented for those
+  // devices are responsible for setting those 8 bits appropriately.
+  tensorflow::uint32 value = 0;
+  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to
+  // a named special-purpose allocator on the same device.
+  tensorflow::int32 scope_id = 0;
+};
+
+struct TF_DeviceContext {
+  ::tensorflow::DeviceContext* context;  // Copied from Rendezvous::Args.
+};
+
+struct TF_RendezvousArgs {
+  const TF_DeviceContext* device_context;  // Deleted by TF_RendezvousDone.
+  const TF_AllocatorAttributes* alloc_attrs;  // Deleted by TF_RendezvousDone.
+};
+
+struct TF_RendezvousDoneCallback {
+  ::tensorflow::Rendezvous::DoneCallback done_callback;
+
+  // TODO(annarev): figure out if we should also support sent_args.
+  const TF_RendezvousArgs* recv_args = nullptr;
+  TF_Tensor* tensor = nullptr;
+  TF_Status* status = nullptr;
+  bool dead = false;
+};
+
+struct TF_RemoteRendezvousBuilder {
+  void* (*init_function)(void* server_context);
+  void (*receive_from_remote_async_function)(TF_ParsedKey*, TF_RendezvousArgs*,
+                                             TF_RendezvousDoneCallback*,
+                                             void* context);
+  void (*delete_function)(void* context);
+  void* server_context;  // Passed to init_function by CRendezvousMgr::Create.
+};
+
+namespace tensorflow {
+
+class CRemoteRendezvous : public BaseRemoteRendezvous {
+ public:
+  CRemoteRendezvous(const WorkerEnv* env, int64 step_id,
+                    void (*receive_from_remote_async_function)(
+                        TF_ParsedKey*, TF_RendezvousArgs*,
+                        TF_RendezvousDoneCallback*, void* context),
+                    void (*delete_function)(void* context),
+                    void* server_context);
+  // Stores the opaque pointer forwarded to both callbacks as `context`.
+  void SetContext(void* context) { context_ = context; }
+
+ protected:
+  void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
+                           const Rendezvous::Args& args,
+                           DoneCallback done) override;
+
+ private:
+  ~CRemoteRendezvous() override;  // Calls delete_function_(context_).
+  // User-supplied hooks and the context pointer they are invoked with.
+  void (*receive_from_remote_async_function_)(TF_ParsedKey*, TF_RendezvousArgs*,
+                                              TF_RendezvousDoneCallback*,
+                                              void* context);
+  void (*delete_function_)(void* context);
+  void* context_;
+  TF_DISALLOW_COPY_AND_ASSIGN(CRemoteRendezvous);
+};
+
+// Rendezvous manager that builds CRemoteRendezvous objects from a C builder.
+class CRendezvousMgr : public BaseRendezvousMgr {
+ public:
+  CRendezvousMgr(const WorkerEnv* env,
+                 const TF_RemoteRendezvousBuilder* rendezvous_builder)
+      : BaseRendezvousMgr(env), rendezvous_builder_(rendezvous_builder) {}
+
+ protected:
+  // Creates the rendezvous first, then asks init_function for its context.
+  BaseRemoteRendezvous* Create(int64 step_id,
+                               const WorkerEnv* worker_env) override {
+    const TF_RemoteRendezvousBuilder& builder = *rendezvous_builder_;
+    CRemoteRendezvous* created = new CRemoteRendezvous(
+        worker_env, step_id, builder.receive_from_remote_async_function,
+        builder.delete_function, builder.server_context);
+    void* context = builder.init_function(builder.server_context);
+    created->SetContext(context);
+    return created;
+  }
+
+ private:
+  const TF_RemoteRendezvousBuilder* rendezvous_builder_;
+  TF_DISALLOW_COPY_AND_ASSIGN(CRendezvousMgr);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 43a33cb..0605a62 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -42,14 +42,19 @@
 const int kRightMargin = 79;
 
 // Converts:
-//   bazel-out/.../genfiles/(external/YYY/)?XX
+//   bazel-out/.../(bin|genfiles)/(external/YYY/)?XX
 // to: XX.
 string GetPath(const string& dot_h_fname) {
-  auto pos = dot_h_fname.find("/genfiles/");
+  auto pos = dot_h_fname.find("/bin/");
   string result = dot_h_fname;
   if (pos != string::npos) {
     // - 1 account for the terminating null character (\0) in "/genfiles/".
-    result = dot_h_fname.substr(pos + sizeof("/genfiles/") - 1);
+    result = dot_h_fname.substr(pos + sizeof("/bin/") - 1);
+  } else {
+    pos = dot_h_fname.find("/genfiles/");
+    if (pos != string::npos) {
+      result = dot_h_fname.substr(pos + sizeof("/genfiles/") - 1);
+    }
   }
   if (result.size() > sizeof("external/") &&
       result.compare(0, sizeof("external/") - 1, "external/") == 0) {
diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
index 49cb74f..ad2443a 100644
--- a/tensorflow/compat_template.__init__.py
+++ b/tensorflow/compat_template.__init__.py
@@ -18,27 +18,41 @@
 from __future__ import division as _division
 from __future__ import print_function as _print_function
 
+import logging as _logging
 import os as _os
 import sys as _sys
 
+from tensorflow.python.tools import module_util as _module_util
+
 # pylint: disable=g-bad-import-order
 
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.tools import component_api_helper as _component_api_helper
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorboard.summary._tf.summary'),
-    error_msg=(
-        "Limited tf.compat.v2.summary API due to missing TensorBoard "
-        "installation"))
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=(
-        'tensorflow_estimator.python.estimator.api._v2.estimator'))
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorflow.python.keras.api._v2.keras'))
+# Hook external TensorFlow modules.
+_current_module = _sys.modules[__name__]
+try:
+  from tensorboard.summary._tf import summary
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(summary)] + _current_module.__path__)
+except ImportError:
+  _logging.warning(
+      "Limited tf.compat.v2.summary API due to missing TensorBoard "
+      "installation.")
+
+try:
+  from tensorflow_estimator.python.estimator.api._v2 import estimator
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(estimator)] + _current_module.__path__)
+except ImportError:
+  pass
+
+try:
+  from tensorflow.python.keras.api._v2 import keras
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(keras)] + _current_module.__path__)
+except ImportError:
+  pass
+
 
 # We would like the following to work for fully enabling 2.0 in a 1.0 install:
 #
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index 64dd293..23c722e 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -19,18 +19,30 @@
 from __future__ import print_function as _print_function
 
 import os as _os
+import sys as _sys
+
+from tensorflow.python.tools import module_util as _module_util
 
 # pylint: disable=g-bad-import-order
 
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.tools import component_api_helper as _component_api_helper
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=(
-        'tensorflow_estimator.python.estimator.api._v1.estimator'))
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorflow.python.keras.api._v1.keras'))
+# Hook external TensorFlow modules.
+_current_module = _sys.modules[__name__]
+try:
+  from tensorflow_estimator.python.estimator.api._v1 import estimator
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(estimator)] + _current_module.__path__)
+except ImportError:
+  pass
+
+try:
+  from tensorflow.python.keras.api._v1 import keras
+  _current_module.__path__ = (
+      [_module_util.get_parent_dir(keras)] + _current_module.__path__)
+except ImportError:
+  pass
+
+
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
 app.flags = flags  # pylint: disable=undefined-variable
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ebb570d..2f063d7 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -263,38 +263,23 @@
   void set_var_{{NAME}}_data({{MAYBE_CONST}}{{TYPE}}* data) {
     set_arg_data({{I}}, data);
   }
-)";
-    const tf2xla::Variable& var = config.variable(i - config.feed_size());
-    rewrites.emplace_back("{{MAYBE_CONST}}", var.readonly() ? "const " : "");
-    *methods += RewriteWithName(
-        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  {{MAYBE_CONST}}{{TYPE}}* var_{{NAME}}_data() {
+    return static_cast<{{MAYBE_CONST}}{{TYPE}}*>(arg_data({{I}}));
   }
-  size_t num_results = ps.result().tuple_shapes_size();
-  int variable_num = -1;
-  for (int i = config.fetch_size(); i < num_results; ++i) {
-    std::vector<std::pair<string, string>> rewrites;
-    TF_RETURN_IF_ERROR(AddRewritesForShape(
-        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
-    string code = R"(
-  {{TYPE}}* var_{{NAME}}_data() {
-    return static_cast<{{TYPE}}*>(result_data({{I}}));
-  }
-  {{TYPE}}& var_{{NAME}}({{DIM_VARS}}) {
-    return (*static_cast<{{TYPE}}(*){{DIM_SIZES}}>(
-        result_data({{I}}))){{INDICES}};
+  {{MAYBE_CONST}}{{TYPE}}& var_{{NAME}}({{DIM_VARS}}) {
+    return (*static_cast<{{MAYBE_CONST}}{{TYPE}}(*){{DIM_SIZES}}>(
+        arg_data({{I}}))){{INDICES}};
   }
   const {{TYPE}}* var_{{NAME}}_data() const {
-    return static_cast<const {{TYPE}}*>(result_data({{I}}));
+    return static_cast<const {{TYPE}}*>(arg_data({{I}}));
   }
   const {{TYPE}}& var_{{NAME}}({{DIM_VARS}}) const {
     return (*static_cast<const {{TYPE}}(*){{DIM_SIZES}}>(
-        result_data({{I}}))){{INDICES}};
+        arg_data({{I}}))){{INDICES}};
   }
 )";
-    do {
-      ++variable_num;
-    } while (config.variable(variable_num).readonly());
-    const tf2xla::Variable& var = config.variable(variable_num);
+    const tf2xla::Variable& var = config.variable(i - config.feed_size());
+    rewrites.emplace_back("{{MAYBE_CONST}}", var.readonly() ? "const " : "");
     *methods += RewriteWithName(
         var.name().empty() ? var.node_name() : var.name(), code, rewrites);
   }
@@ -549,7 +534,8 @@
     return *kStaticData;
   }
 
-  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
+  {{CLASS}}(AllocMode alloc_mode =
+            AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   {{CLASS}}(const {{CLASS}}&) = delete;
@@ -590,19 +576,29 @@
   // buffers are managed internally, and may change after each call to Run.
 {{METHODS_RESULT}}
 
-  // Methods for managing variable buffers. Buffers are in row-major order. The
-  // input and output buffers may or may not be identical.
+  // Methods for managing variable buffers. Buffers are in row-major order.
+  //
+  // For read-write variables we generate the following methods:
   //
   // void set_var_X_data(T* data)
-  //   Sets the buffer for variable X.
+  //   Sets the buffer for variable X.  Must be called before Run if the
+  //   allocation mode is RESULTS_PROFILES_AND_TEMPS_ONLY.
   //
   // T* var_X_data()
-  //   Returns the buffer of type T for variable X.
+  //   Returns the buffer of type T for variable X.  If the allocation mode is
+  //   RESULTS_PROFILES_AND_TEMPS_ONLY then this buffer is the same as the
+  //   buffer passed to set_var_X_data.
   //
   // T& var_X(...dim indices...)
   //   Returns a reference to the value of type T for variable X,
   //   with dim indices specifying which value. No bounds checking is performed
   //   on dim indices.
+  //
+  // For readonly variables we generate the same set of methods, except that we
+  // use `const T` instead of `T`.  We use `const T` to avoid erasing the
+  // constness of the buffer passed to `set_var_X_data` but the underlying
+  // buffer is not const (and thus the const can be safely const-cast'ed away)
+  // unless `set_var_X_data` is called with a pointer to constant storage.
 {{METHODS_VARIABLE}}
 
  private:
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index fa4f4c7..702582b 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -91,7 +91,8 @@
     return *kStaticData;
   }
 
-  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
+  MyClass(AllocMode alloc_mode =
+            AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   MyClass(const MyClass&) = delete;
@@ -214,60 +215,82 @@
         result_data(0)))[dim0][dim1];
   }
 
-  // Methods for managing variable buffers. Buffers are in row-major order. The
-  // input and output buffers may or may not be identical.
+  // Methods for managing variable buffers. Buffers are in row-major order.
+  //
+  // For read-write variables we generate the following methods:
   //
   // void set_var_X_data(T* data)
-  //   Sets the buffer for variable X.
+  //   Sets the buffer for variable X.  Must be called before Run if the
+  //   allocation mode is RESULTS_PROFILES_AND_TEMPS_ONLY.
   //
   // T* var_X_data()
-  //   Returns the buffer of type T for variable X.
+  //   Returns the buffer of type T for variable X.  If the allocation mode is
+  //   RESULTS_PROFILES_AND_TEMPS_ONLY then this buffer is the same as the
+  //   buffer passed to set_var_X_data.
   //
   // T& var_X(...dim indices...)
   //   Returns a reference to the value of type T for variable X,
   //   with dim indices specifying which value. No bounds checking is performed
   //   on dim indices.
+  //
+  // For readonly variables we generate the same set of methods, except that we
+  // use `const T` instead of `T`.  We use `const T` to avoid erasing the
+  // constness of the buffer passed to `set_var_X_data` but the underlying
+  // buffer is not const (and thus the const can be safely const-cast'ed away)
+  // unless `set_var_X_data` is called with a pointer to constant storage.
 
   void set_var_myvar_readonly_data(const float* data) {
     set_arg_data(2, data);
   }
+  const float* var_myvar_readonly_data() {
+    return static_cast<const float*>(arg_data(2));
+  }
+  const float& var_myvar_readonly() {
+    return (*static_cast<const float(*)[1]>(
+        arg_data(2)))[0];
+  }
+  const float* var_myvar_readonly_data() const {
+    return static_cast<const float*>(arg_data(2));
+  }
+  const float& var_myvar_readonly() const {
+    return (*static_cast<const float(*)[1]>(
+        arg_data(2)))[0];
+  }
 
   void set_var_myvar_data(float* data) {
     set_arg_data(3, data);
   }
+  float* var_myvar_data() {
+    return static_cast<float*>(arg_data(3));
+  }
+  float& var_myvar() {
+    return (*static_cast<float(*)[1]>(
+        arg_data(3)))[0];
+  }
+  const float* var_myvar_data() const {
+    return static_cast<const float*>(arg_data(3));
+  }
+  const float& var_myvar() const {
+    return (*static_cast<const float(*)[1]>(
+        arg_data(3)))[0];
+  }
 
   void set_var_myvar2_data(tensorflow::int32* data) {
     set_arg_data(4, data);
   }
-
-  float* var_myvar_data() {
-    return static_cast<float*>(result_data(1));
-  }
-  float& var_myvar() {
-    return (*static_cast<float(*)[1]>(
-        result_data(1)))[0];
-  }
-  const float* var_myvar_data() const {
-    return static_cast<const float*>(result_data(1));
-  }
-  const float& var_myvar() const {
-    return (*static_cast<const float(*)[1]>(
-        result_data(1)))[0];
-  }
-
   tensorflow::int32* var_myvar2_data() {
-    return static_cast<tensorflow::int32*>(result_data(2));
+    return static_cast<tensorflow::int32*>(arg_data(4));
   }
   tensorflow::int32& var_myvar2(size_t dim0) {
     return (*static_cast<tensorflow::int32(*)[5]>(
-        result_data(2)))[dim0];
+        arg_data(4)))[dim0];
   }
   const tensorflow::int32* var_myvar2_data() const {
-    return static_cast<const tensorflow::int32*>(result_data(2));
+    return static_cast<const tensorflow::int32*>(arg_data(4));
   }
   const tensorflow::int32& var_myvar2(size_t dim0) const {
     return (*static_cast<const tensorflow::int32(*)[5]>(
-        result_data(2)))[dim0];
+        arg_data(4)))[dim0];
   }
 
  private:
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 2d642c4..c55f3f9 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -83,7 +83,8 @@
 // Run tests that use set_argN_data separately, to avoid accidentally re-using
 // non-existent buffers.
 TEST(TFCompileTest, Add_SetArg) {
-  AddComp add(AddComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
+  AddComp add(
+      XlaCompiledCpuFunction::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
 
   int32 arg_x = 10;
   int32 arg_y = 32;
@@ -296,7 +297,7 @@
   Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
 
   foo::bar::MatMulComp matmul(
-      foo::bar::MatMulComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
+      XlaCompiledCpuFunction::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
   matmul.set_thread_pool(&device);
 
   // Test using the set_argN_data() methods.
@@ -503,8 +504,36 @@
 
   // This implements the recursion:
   // x[0] = 2.0
-  // x[n+1] = x[n] - 0.1*(x[n-1] + 1.0)
+  // x[n+1] = x[n] - 0.1*(x[n-1] + y)
   VariableSequentialUpdatesComp fn;
+  fn.var_x() = 2;
+  *const_cast<float*>(fn.var_y_data()) = 1;
+
+  fn.set_thread_pool(&device);
+  // First calculate x[3]
+  fn.Run();
+  EXPECT_NEAR(fn.var_x(), 1.187f, 1e-6);
+
+  const float y = 1;
+  fn.set_var_y_data(&y);
+
+  // Now const_cast<float*>(fn.var_y_data()) is no longer legal since we've set
+  // the buffer to point to a constant location.
+
+  // Then calculate x[6]
+  fn.Run();
+  EXPECT_NEAR(fn.var_x(), 0.594322f, 1e-6);
+}
+
+TEST(TFCompileTest, VariableSequentialUpdatesNoAlloc) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  // This implements the recursion:
+  // x[0] = 2.0
+  // x[n+1] = x[n] - 0.1*(x[n-1] + 1.0)
+  VariableSequentialUpdatesComp fn(
+      XlaCompiledCpuFunction::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
   float x = 2;
   float y = 1;
   fn.set_var_x_data(&x);
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 2c08cb6..e7f3c0a 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -174,6 +174,20 @@
             "'" + arg.replace("'", "'\\''") + "'"
             for arg in (tfcompile_flags or [])
         ])
+
+    # Do this before we append the `select` into `flags`, because doing so
+    # transforms `flags` into a variable of type `select`, and we can't call
+    # `find` on such an object.
+    need_xla_data_proto = flags and flags.find("--gen_program_shape") != -1
+
+    # Pass --target_cpu=haswell to tfcompile if compiling for Haswell (bazel
+    # build --cpu=haswell).  We put it at the beginning of the flags list so
+    # that tfcompile_flags can override it if desired.
+    flags = select({
+        "//tools/target_cpu:haswell": "--target_cpu=haswell ",
+        "//conditions:default": "",
+    }) + flags
+
     if enable_xla_hlo_profiling:
         profiling_flag = "--xla_hlo_profile"
     else:
@@ -251,7 +265,6 @@
 
     # The cc_library rule packaging up the header and object file, and needed
     # kernel implementations.
-    need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
     native.cc_library(
         name = name,
         srcs = [function_object_file, metadata_object_file],
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index e86499c..cd3ce75 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -20,12 +20,8 @@
     ],
 )
 
-# NB! Removing the cc_header_only_library import breaks the OSS build since
-# copybara injects some build rules that use it.
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 # Target that bundles up the XLA CPU and GPU JIT devices.
@@ -212,6 +208,7 @@
         "//tensorflow/core/kernels/data:iterator_ops",
         "//tensorflow/core/kernels/data:optional_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
+        "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
@@ -265,7 +262,6 @@
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -273,6 +269,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
@@ -321,6 +318,7 @@
     deps = [
         ":compilation_passes",
         "//tensorflow/compiler/tf2xla:functionalize_control_flow_pass_registration",
+        "//tensorflow/compiler/tf2xla:rearrange_function_argument_pass_registration",
         "//tensorflow/core:core_cpu_internal",
     ],
     alwayslink = 1,
@@ -518,8 +516,9 @@
         "partially_decluster_pass.h",
     ],
     deps = [
+        "compilability_check_util",
         ":common",
-        ":device_info_cache",
+        ":device_util",
         ":encapsulate_util",
         ":flags",
         ":resource_operation_safety_analysis",
@@ -581,21 +580,35 @@
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
 cc_library(
-    name = "device_info_cache",
-    srcs = ["device_info_cache.cc"],
-    hdrs = ["device_info_cache.h"],
+    name = "device_util",
+    srcs = ["device_util.cc"],
+    hdrs = ["device_util.h"],
     deps = [
-        ":xla_cluster_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:framework",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "device_util_test",
+    srcs = ["device_util_test.cc"],
+    deps = [
+        ":device_util",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -661,6 +674,7 @@
         "introduce_floating_point_jitter_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
         "partially_decluster_pass_test.cc",
+        "rearrange_function_argument_pass_test.cc",
     ],
     deps = [
         ":common",
@@ -681,6 +695,7 @@
         "//tensorflow/cc:scope",
         "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/compiler/jit/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla:rearrange_function_argument_pass",
         "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:test_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
@@ -764,6 +779,34 @@
     ],
 )
 
+cc_library(
+    name = "compilability_check_util",
+    srcs = ["compilability_check_util.cc"],
+    hdrs = ["compilability_check_util.h"],
+    deps = [
+        ":common",
+        ":device_util",
+        ":flags",
+        ":resource_operation_safety_analysis",
+        ":union_find",
+        ":xla_cluster_util",
+        "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/compiler/tf2xla:resource_operation_table",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_proto_cc",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_custom_op_py_library(
     name = "xla_ops_py",
     kernels = ["//tensorflow/compiler/jit/ops:xla_ops"],
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index c144563..47b3c66 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
+
 #include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -25,6 +26,7 @@
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/logging_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/device_util.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
@@ -231,14 +233,10 @@
 }
 
 // Returns true (into `result`) if a node placed on `device` must be compiled.
-Status DeviceRequiresCompilation(const string& device, bool* result) {
-  DeviceType device_type("");
-  TF_RETURN_IF_ERROR(DeviceToDeviceType(device, &device_type));
-  const XlaOpRegistry::DeviceRegistration* registration = nullptr;
-  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
-    return errors::Internal("Could not find compilation device ",
-                            device_type.type());
-  }
+Status DeviceRequiresCompilation(const jit::DeviceInfoCache& device_info_cache,
+                                 jit::DeviceId device, bool* result) {
+  const XlaOpRegistry::DeviceRegistration* registration =
+      device_info_cache.GetCompilationDevice(device);
   *result = registration->autoclustering_policy ==
             XlaOpRegistry::AutoclusteringPolicy::kAlways;
   return Status::OK();
@@ -291,17 +289,20 @@
   return Status::OK();
 }
 
-Status InferDeviceForCluster(Node* n, const string& function_name,
-                             const FunctionLibraryDefinition& flib_def,
-                             string* result) {
+xla::StatusOr<jit::DeviceId> InferDeviceForCluster(
+    jit::DeviceInfoCache* device_info_cache, Node* n,
+    const string& function_name, const FunctionLibraryDefinition& flib_def) {
   const FunctionDef* func_def = flib_def.Find(function_name);
   TF_RET_CHECK(func_def) << "Could not find " << function_name;
 
-  std::set<string> device_names;
+  jit::DeviceSet device_set;
+
   for (const NodeDef& ndef : func_def->node_def()) {
     VLOG(3) << ndef.DebugString();
     if (!ndef.device().empty()) {
-      device_names.insert(ndef.device());
+      TF_ASSIGN_OR_RETURN(jit::DeviceId device_id,
+                          device_info_cache->GetIdFor(ndef.device()));
+      device_set.Insert(device_id);
     }
   }
 
@@ -309,41 +310,47 @@
     // TODO(sanjoy): We need this because EncapsulateSubgraphsPass drops device
     // assignment when constant folding.  We should fix EncapsulateSubgraphsPass
     // instead.
-    device_names.insert(n->assigned_device_name());
+    TF_ASSIGN_OR_RETURN(jit::DeviceId device_id,
+                        device_info_cache->GetIdFor(n->assigned_device_name()));
+    device_set.Insert(device_id);
   }
 
-  std::vector<string> device_names_vector;
-  absl::c_copy(device_names, std::back_inserter(device_names_vector));
-
-  Status s = PickDeviceForXla(device_names_vector, true, result);
-  if (s.ok()) {
-    VLOG(2) << "For " << function_name << " PickDeviceForXla("
-            << absl::StrJoin(device_names_vector, ", ") << ") -> " << *result;
-  }
-  return s;
+  TF_ASSIGN_OR_RETURN(jit::DeviceId result,
+                      PickDeviceForXla(*device_info_cache, device_set,
+                                       /*allow_mixing_unknown_and_cpu=*/true));
+  VLOG(2) << "For " << function_name << " PickDeviceForXla("
+          << device_info_cache->DebugString(device_set) << ") -> "
+          << device_info_cache->GetNameFor(result);
+  return result;
 }
 
 Status ReplaceNodeWithXlaCompileAndXlaRun(
+    jit::DeviceInfoCache* device_info_cache,
     const GraphOptimizationPassOptions& options,
     const FunctionLibraryDefinition& flib_def, bool lazy_compilation_enabled,
     bool insert_print_nodes, Graph* g, Node* n) {
   XlaClusterInfo cluster_info;
   TF_RETURN_IF_ERROR(GetXlaClusterInfo(n, &cluster_info));
 
-  string device;
-  TF_RETURN_IF_ERROR(InferDeviceForCluster(n, cluster_info.function.name(),
-                                           flib_def, &device));
+  TF_ASSIGN_OR_RETURN(
+      jit::DeviceId device,
+      InferDeviceForCluster(device_info_cache, n, cluster_info.function.name(),
+                            flib_def));
+
   bool requires_compilation;
-  TF_RETURN_IF_ERROR(DeviceRequiresCompilation(device, &requires_compilation));
+  TF_RETURN_IF_ERROR(DeviceRequiresCompilation(*device_info_cache, device,
+                                               &requires_compilation));
   if (!lazy_compilation_enabled) {
     requires_compilation = true;
   }
 
+  string device_name_str = string(device_info_cache->GetNameFor(device));
+
   Status status;
   Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr)
                    .NewSubScope(n->name())
                    .WithDevice(n->requested_device())
-                   .WithAssignedDevice(device);
+                   .WithAssignedDevice(device_name_str);
 
   ops::_XlaCompile xla_compile(root.WithOpName("xla_compile"),
                                /*constants=*/cluster_info.constant_inputs,
@@ -435,14 +442,16 @@
   bool lazy_compilation_enabled =
       enable_lazy_compilation_
           ? *enable_lazy_compilation_
-          : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation;
+          : GetBuildXlaOpsPassFlags()->tf_xla_enable_lazy_compilation;
   bool insert_print_nodes =
-      GetBuildXlaOpsPassFlags().tf_xla_print_cluster_outputs;
+      GetBuildXlaOpsPassFlags()->tf_xla_print_cluster_outputs;
+
+  jit::DeviceInfoCache device_info_cache;
 
   for (Node* n : xla_compiled_kernels) {
     TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(
-        options, *options.flib_def, lazy_compilation_enabled,
-        insert_print_nodes, graph, n));
+        &device_info_cache, options, *options.flib_def,
+        lazy_compilation_enabled, insert_print_nodes, graph, n));
   }
 
   if (VLOG_IS_ON(1)) {
diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
new file mode 100644
index 0000000..8621c43
--- /dev/null
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -0,0 +1,273 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/compilability_check_util.h"
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/device_util.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+#include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/const_analysis.h"
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
+
+namespace tensorflow {
+
+namespace {
+bool HasResourceInput(const Node& node) {
+  return absl::c_count(node.input_types(), DT_RESOURCE) != 0;
+}
+}  // anonymous namespace
+
+bool RecursiveCompilabilityChecker::HasXLAKernel(const Node& node) {
+  // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
+  // is really a kind of function call and will be handled by
+  // IsCompilableCall().
+  if (node.type_string() == "SymbolicGradient") return false;
+  if (node.type_string() == "Const") {
+    // Skip Const op with type DT_STRING, since XLA doesn't support it, but the
+    // registered Const KernelDef says that it does, to support no-op Assert for
+    // tfcompile.
+    const AttrValue* attr = node.attrs().Find("dtype");
+    if (attr != nullptr && attr->type() == DT_STRING) {
+      return false;
+    }
+  }
+
+  // XLA does not offer guaranteed aliasing between the input and output of the
+  // XLA cluster so it can't implement the forward-tensor-ref semantic.  Leave
+  // such nodes out of XLA clusters.
+  if (HasForwardedRefInput(node)) {
+    VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast.";
+    return false;
+  }
+
+  return FindKernelDef(jit_device_type_, node.def(), nullptr, nullptr).ok();
+}
+
+// Tests whether 'while_node' is a completely compilable loop.
+// Every operator in the condition and body functions must be compilable for a
+// while loop to be compilable.
+bool RecursiveCompilabilityChecker::IsCompilableWhile(
+    const Node& while_node, int depth, FunctionLibraryRuntime* lib_runtime) {
+  const NameAttrList* name_attr;
+  NodeDef call;
+  Status status;
+  status = GetNodeAttr(while_node.attrs(), "cond", &name_attr);
+  if (!status.ok()) {
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": missing 'cond' attribute on While node.";
+    return false;
+  }
+  const string cond_func = name_attr->name();
+  call.set_name("while_cond");
+  call.set_op(cond_func);
+  *call.mutable_attr() = name_attr->attr();
+  if (!IsCompilableCall(call, depth + 1, lib_runtime)) {
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": can't compile loop condition: " << cond_func;
+    return false;
+  }
+  status = GetNodeAttr(while_node.attrs(), "body", &name_attr);
+  if (!status.ok()) {
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": missing 'body' attribute on While node.";
+    return false;
+  }
+  const string body_func = name_attr->name();
+  call.set_name("while_body");
+  call.set_op(body_func);
+  *call.mutable_attr() = name_attr->attr();
+  if (!IsCompilableCall(call, depth + 1, lib_runtime)) {
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": can't compile loop body: " << body_func;
+    return false;
+  }
+  return true;
+}
+
+// Tests whether 'call_def' is a call to a completely compilable function.
+// Every operator in the function must be compilable for a function to be
+// compilable.
+bool RecursiveCompilabilityChecker::IsCompilableCall(
+    const NodeDef& call_def, int depth, FunctionLibraryRuntime* lib_runtime) {
+  if (depth > kMaxRecursionDepth) {
+    VLOG(2) << "Rejecting " << call_def.op()
+            << ": function depth limit exceeded.";
+    return false;
+  }
+
+  FunctionLibraryRuntime::Handle handle;
+  Status status = InstantiateFunctionCall(call_def, lib_runtime, &handle);
+  if (!status.ok()) {
+    VLOG(2) << "Rejecting " << call_def.DebugString()
+            << ": could not instantiate: " << status;
+    return false;
+  }
+
+  auto release_handle_on_return = gtl::MakeCleanup(
+      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+
+  const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
+  for (Node* node : fbody->graph->op_nodes()) {
+    if (!IsCompilableNode(*node, depth + 1, lib_runtime)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool LogNotCompilableAndReturn(const Node& node,
+                               absl::string_view reason = "") {
+  VLOG(3) << "Not clustering " << node.name() << " (op " << node.type_string()
+          << ")" << (reason.empty() ? "" : ": ") << reason;
+  return false;
+}
+
+bool RecursiveCompilabilityChecker::OpIsInaccurate(const Node& node) {
+  // b/127344411: SelfAdjointEigV2 and Svd precision issues.
+  return node.type_string() == "SelfAdjointEigV2" ||
+         node.type_string() == "Svd";
+}
+
+bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) {
+  // b/128001705: SelfAdjointEigV2 and Svd performance issues.
+  return node.type_string() == "SelfAdjointEigV2" ||
+         node.type_string() == "Svd" || node.type_string() == "Qr";
+}
+
+bool RecursiveCompilabilityChecker::IsCompilableNode(
+    const Node& node, int depth, FunctionLibraryRuntime* lib_runtime) {
+  // _Arg nodes in a top-level function represent feeds and _Retval nodes in a
+  // top-level function represent fetches.
+  if (depth == 0 &&
+      (node.type_string() == "_Arg" || node.type_string() == "_Retval")) {
+    return LogNotCompilableAndReturn(node, "depth is 0");
+  }
+
+  if (node.attrs().Find("_scoped_allocator") ||
+      node.attrs().Find("_forward_from")) {
+    // TODO(b/128858118): XLA does not support _scoped_allocator and
+    // _forward_from.
+    return LogNotCompilableAndReturn(
+        node, "_scoped_allocator or _forward_from attribute");
+  }
+
+  if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) {
+    if (!IsCompilableCall(node.def(), depth + 1, lib_runtime)) {
+      return LogNotCompilableAndReturn(node, "unsupported function");
+    }
+  } else if (!HasXLAKernel(node)) {
+    return LogNotCompilableAndReturn(node, "unsupported op");
+  }
+
+  if (node.type_string() == "While" &&
+      !IsCompilableWhile(node, depth + 1, lib_runtime)) {
+    return LogNotCompilableAndReturn(node, "unsupported while");
+  }
+
+  if (!op_filter_.allow_stateful_rng_ops &&
+      IsStatefulRandomOp(node.type_string())) {
+    return LogNotCompilableAndReturn(node, "stateful random op");
+  }
+
+  if (!op_filter_.allow_control_trigger && node.IsControlTrigger()) {
+    return LogNotCompilableAndReturn(node);
+  }
+
+  if (!op_filter_.allow_eliding_assert_and_checknumerics_ops &&
+      IsAssertOrCheckNumerics(node.type_string())) {
+    return LogNotCompilableAndReturn(node, "Assert or CheckNumerics");
+  }
+
+  if (!op_filter_.allow_ops_producing_or_consuming_variant &&
+      OpProducesOrConsumesVariant(node)) {
+    return LogNotCompilableAndReturn(node, "DT_VARIANT producer/consumer");
+  }
+
+  if (!op_filter_.allow_stack_ops && IsStackOp(node)) {
+    return LogNotCompilableAndReturn(node, "Stack op");
+  }
+
+  if (!op_filter_.allow_tensor_array_ops && IsTensorArrayOp(node)) {
+    return LogNotCompilableAndReturn(node, "TensorArray op");
+  }
+
+  if (!op_filter_.allow_resource_ops_in_called_functions && depth > 0 &&
+      HasResourceInput(node)) {
+    return LogNotCompilableAndReturn(node,
+                                     "resource variable op in called function");
+  }
+
+  if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsInaccurate(node)) {
+    return LogNotCompilableAndReturn(node, "operation with correctness issues");
+  }
+
+  if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsSlow(node)) {
+    return LogNotCompilableAndReturn(node, "slow operation");
+  }
+
+  return true;
+}
+
+RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter(
+    const XlaOpRegistry::DeviceRegistration& registration) {
+  RecursiveCompilabilityChecker::OperationFilter op_filter;
+  op_filter.allow_resource_ops_in_called_functions =
+      registration.cluster_resource_variable_ops_unsafely;
+  op_filter.allow_stack_ops = registration.cluster_stack_ops;
+  op_filter.allow_tensor_array_ops = registration.cluster_tensor_array_ops;
+  op_filter.allow_stateful_rng_ops = registration.cluster_stateful_rng_ops;
+  op_filter.allow_control_trigger = registration.cluster_control_trigger;
+  op_filter.allow_eliding_assert_and_checknumerics_ops =
+      registration.elide_assert_and_checknumerics;
+  op_filter.allow_ops_producing_or_consuming_variant =
+      registration.cluster_variant_ops;
+  op_filter.allow_slow_and_inaccurate_ops =
+      registration.cluster_slow_and_inaccurate_ops;
+  return op_filter;
+}
+
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h
new file mode 100644
index 0000000..0ef42d6
--- /dev/null
+++ b/tensorflow/compiler/jit/compilability_check_util.h
@@ -0,0 +1,175 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_
+#define TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/device_util.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+#include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/tf2xla/const_analysis.h"
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
+
+namespace tensorflow {
+// Checks whether a TF node can be compiled or not.  "Recursive" as in for call
+// and functional while nodes it recursively checks whether the callee functions
+// can be compiled.
+class RecursiveCompilabilityChecker {
+ public:
+  // Aggregates information about what kinds of ops are allowed.
+  struct OperationFilter {
+    // Whether resource variable ops are allowed in callees.  We do
+    // not allow resource variable ops in called functions (either as direct TF
+    // calls or as higher order control flow ops) because we do not yet model
+    // their memory effects in jit/resource_operation_safety_analysis.
+    bool allow_resource_ops_in_called_functions;
+
+    // Whether Stack operations are allowed.  We avoid auto-clustering Stack
+    // operations in general because we do not support snapshotting them.
+    //
+    // TODO(b/112837194): This restriction can be lifted with some work.
+    bool allow_stack_ops;
+
+    // Whether TensorArray operations are allowed.  We avoid auto-clustering
+    // TensorArray operations in general because we do not support snapshotting
+    // them.
+    //
+    // TODO(b/112837194): This restriction can be lifted with some work.
+    bool allow_tensor_array_ops;
+
+    // Whether stateful RNG ops are allowed.  XLA's RNG does not have the same
+    // seeding behavior as TensorFlow's RNG (b/34749654).  So we avoid
+    // auto-clustering stateful RNG ops.
+    bool allow_stateful_rng_ops;
+
+    // TODO(b/118970344): Whether ControlTrigger ops are allowed.  It is unsound
+    // to cluster ControlTrigger because of how we use deadness analysis.
+    bool allow_control_trigger;
+
+    // Whether it is okay to "cluster" Assert and CheckNumerics by simply
+    // removing them (they're not removed during clustering, but their
+    // XlaOpKernel is a no-op kernel).  We avoid auto-clustering these ops so
+    // that the user is not surprised when XLA is implicitly enabled. If the
+    // user explicitly specifies to use XLA, it is fine to resort to a dummy
+    // implementation. Currently Assert and CheckNumerics ops have dummy XLA
+    // implementations.
+    bool allow_eliding_assert_and_checknumerics_ops;
+
+    // Whether ops that produce or consume DT_VARIANT values are allowed.  We
+    // don't auto-cluster these ops because we don't yet support live-in or
+    // live-out DT_VARIANT values.
+    bool allow_ops_producing_or_consuming_variant;
+
+    // Whether ops known to be slow or to have correctness issues should be
+    // auto-clustered.
+    bool allow_slow_and_inaccurate_ops;
+  };
+
+  RecursiveCompilabilityChecker(const OperationFilter* op_filter,
+                                const DeviceType* jit_device_type)
+      : op_filter_(*op_filter), jit_device_type_(*jit_device_type) {}
+
+  // Returns true if `node` can be compiled by XLA.
+  bool IsCompilableNode(const Node& node, FunctionLibraryRuntime* lib_runtime) {
+    return IsCompilableNode(node, /*depth=*/0, lib_runtime);
+  }
+
+  // Returns true if `call_def` can be compiled by XLA.  It is assumed that
+  // `call_def` is a call operation.
+  bool IsCompilableCall(const NodeDef& call_def,
+                        FunctionLibraryRuntime* lib_runtime) {
+    return IsCompilableCall(call_def, /*depth=*/0, lib_runtime);
+  }
+
+  // Returns true if XLA supports this Op, but we don't want to cluster it (i.e.
+  // due to performance or correctness concerns).
+  bool OpIsInaccurate(const Node& node);
+  bool OpIsSlow(const Node& node);
+
+ private:
+  bool IsCompilableNode(const Node& node, int depth,
+                        FunctionLibraryRuntime* lib_runtime);
+  bool IsCompilableCall(const NodeDef& call_def, int depth,
+                        FunctionLibraryRuntime* lib_runtime);
+  bool IsCompilableWhile(const Node& while_node, int depth,
+                         FunctionLibraryRuntime* lib_runtime);
+
+  bool IsStackOp(const Node& node) {
+    const XlaResourceOpInfo* op_info =
+        GetResourceOpInfoForOp(node.type_string());
+    return op_info && op_info->resource_kind() == XlaResourceKind::kStack;
+  }
+
+  bool IsTensorArrayOp(const Node& node) {
+    const XlaResourceOpInfo* op_info =
+        GetResourceOpInfoForOp(node.type_string());
+    return op_info && op_info->resource_kind() == XlaResourceKind::kTensorArray;
+  }
+
+  bool IsAssertOrCheckNumerics(absl::string_view op_name) {
+    return op_name == "Assert" || op_name == "CheckNumerics";
+  }
+
+  bool IsStatefulRandomOp(absl::string_view op_name) {
+    return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
+           op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
+           op_name == "TruncatedNormal" || op_name == "Multinomial";
+  }
+
+  bool OpProducesOrConsumesVariant(const Node& node) {
+    auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; };
+    return absl::c_any_of(node.input_types(), is_variant) ||
+           absl::c_any_of(node.output_types(), is_variant);
+  }
+
+  bool HasXLAKernel(const Node& node);
+
+  // Make sure we don't recurse infinitely on recursive functions.
+  const int kMaxRecursionDepth = 10;
+
+  const OperationFilter& op_filter_;
+  const DeviceType& jit_device_type_;
+};
+
+RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter(
+    const XlaOpRegistry::DeviceRegistration& registration);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 53098cf..0a92c06 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -371,7 +371,8 @@
                              Predicate** predicate) {
     TensorId tensor_id(node->name(), output_idx);
 
-    bool is_boolean_tensor = node->output_type(tensor_id.index()) == DT_BOOL;
+    bool is_boolean_tensor =
+        BaseType(node->output_type(tensor_id.index())) == DT_BOOL;
     TF_RET_CHECK(!must_be_true || is_boolean_tensor);
 
     if (node->type_string() == "Const" && must_be_true) {
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index b879b86..3a44eb7 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -1067,5 +1067,25 @@
   EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#false");
 }
 
+TEST(DeadnessAnalysisTest, RefBoolSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output condition_ref_var =
+      ops::Variable(root.WithOpName("cond_ref"), TensorShape({}), DT_BOOL);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, condition_ref_var);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "~*cond_ref:0");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "*cond_ref:0");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/device_info_cache.cc b/tensorflow/compiler/jit/device_info_cache.cc
deleted file mode 100644
index e813f18..0000000
--- a/tensorflow/compiler/jit/device_info_cache.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/device_info_cache.h"
-
-#include "tensorflow/compiler/jit/xla_cluster_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-
-namespace tensorflow {
-using xla::StatusOr;
-
-StatusOr<const XlaOpRegistry::DeviceRegistration*>
-DeviceInfoCache::GetCompilationDevice(absl::string_view device_name) {
-  auto it = device_to_device_registration_.find(device_name);
-  if (it != device_to_device_registration_.end()) {
-    return it->second;
-  }
-
-  string device_name_str = string(device_name);
-  TF_ASSIGN_OR_RETURN(const DeviceType& device_type,
-                      GetDeviceTypeFor(device_name_str));
-  const XlaOpRegistry::DeviceRegistration* registration;
-  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
-    registration = nullptr;
-  }
-
-  device_to_device_registration_.insert(
-      {std::move(device_name_str), registration});
-
-  return registration;
-}
-
-StatusOr<std::reference_wrapper<const DeviceType>>
-DeviceInfoCache::GetDeviceTypeFor(absl::string_view device_name) {
-  auto it = device_to_device_type_.find(device_name);
-  if (it != device_to_device_type_.end()) {
-    return std::cref(*it->second);
-  }
-
-  string device_name_str = string(device_name);
-  auto device_type = absl::make_unique<DeviceType>("");
-  TF_RETURN_IF_ERROR(DeviceToDeviceType(device_name_str, device_type.get()));
-
-  it = device_to_device_type_
-           .insert({std::move(device_name_str), std::move(device_type)})
-           .first;
-  return std::cref(*it->second);
-}
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/device_info_cache.h b/tensorflow/compiler/jit/device_info_cache.h
deleted file mode 100644
index 1e9179e..0000000
--- a/tensorflow/compiler/jit/device_info_cache.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
-#define TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
-
-#include <functional>
-#include <memory>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/strings/string_view.h"
-#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/framework/types.h"
-
-namespace tensorflow {
-// Caches some miscellaneous information about TF devices.  Thread compatible.
-class DeviceInfoCache {
- public:
-  xla::StatusOr<const XlaOpRegistry::DeviceRegistration*> GetCompilationDevice(
-      absl::string_view device_name);
-  xla::StatusOr<std::reference_wrapper<const DeviceType>> GetDeviceTypeFor(
-      absl::string_view device_name);
-
- private:
-  absl::flat_hash_map<string, const XlaOpRegistry::DeviceRegistration*>
-      device_to_device_registration_;
-  absl::flat_hash_map<string, std::unique_ptr<DeviceType>>
-      device_to_device_type_;
-};
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
diff --git a/tensorflow/compiler/jit/device_util.cc b/tensorflow/compiler/jit/device_util.cc
new file mode 100644
index 0000000..200e795
--- /dev/null
+++ b/tensorflow/compiler/jit/device_util.cc
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/device_util.h"
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace tensorflow {
+namespace jit {
+using xla::StatusOr;
+
+void DeviceSet::Insert(DeviceId device_id) {
+  // Split the id into the word that holds its bit and the bit's offset.
+  const int id = device_id.id();
+  const int word = id / kWordSize;
+  // Grow the bitmap with zeroed words until `word` is addressable.
+  if (storage_.size() <= word) {
+    storage_.resize(word + 1, 0);
+  }
+  storage_[word] |= 1ull << (id % kWordSize);
+}
+
+void DeviceSet::UnionWith(const DeviceSet& other) {
+  // Widen this bitmap first so every word of `other` has a destination.
+  const size_t other_size = other.storage_.size();
+  if (storage_.size() < other_size) storage_.resize(other_size, 0);
+
+  for (size_t word = 0; word != other_size; ++word) {
+    storage_[word] |= other.storage_[word];
+  }
+}
+
+bool DeviceSet::IsEmpty() const {
+  return absl::c_none_of(storage_, [](uint64 word) { return word != 0; });
+}
+
+xla::StatusOr<DeviceId> DeviceInfoCache::GetIdFor(absl::string_view name) {
+  TF_RET_CHECK(!name.empty());
+
+  auto it = name_to_id_.find(name);
+  if (it != name_to_id_.end()) {
+    return it->second;
+  }
+
+  // Parse before touching any table: bailing out after a partial update would
+  // leave the per-id vectors with different sizes and corrupt later lookups.
+  string name_str = string(name);
+  auto device_type = absl::make_unique<DeviceType>("");
+  TF_RETURN_IF_ERROR(DeviceNameToDeviceType(name_str, device_type.get()));
+  const XlaOpRegistry::DeviceRegistration* compilation_device;
+  if (!XlaOpRegistry::GetCompilationDevice(device_type->type(),
+                                           &compilation_device)) {
+    compilation_device = nullptr;
+  }
+
+  const int new_id = names_.size();
+  is_cpu_.push_back(device_type->type_string() == DEVICE_CPU);
+  is_gpu_.push_back(device_type->type_string() == DEVICE_GPU);
+  id_to_compilation_device_.push_back(compilation_device);
+  id_to_device_type_.push_back(std::move(device_type));
+  names_.push_back(name_str);
+  name_to_id_.emplace(std::move(name_str), DeviceId(new_id));
+  return DeviceId(new_id);
+}
+
+string DeviceInfoCache::DebugString(const DeviceSet& device_set) const {
+  std::vector<string> names;
+  device_set.ForEach([&](DeviceId device_id) {
+    names.push_back(string(GetNameFor(device_id)));
+    return true;  // Keep iterating; returning false stops after one device.
+  });
+
+  return absl::StrCat("[", absl::StrJoin(names, ","), "]");
+}
+}  // namespace jit
+
+Status DeviceNameToDeviceType(const string& device, DeviceType* device_type) {
+  DeviceNameUtils::ParsedName parsed;
+  if (!DeviceNameUtils::ParseFullName(device, &parsed)) {
+    return errors::Internal("Malformed assigned device '", device, "'");
+  }
+  *device_type = DeviceType(parsed.type);
+  return Status::OK();
+}
+
+xla::StatusOr<absl::optional<jit::DeviceId>> PickDeviceForXlaImpl(
+    const jit::DeviceInfoCache& device_info_cache,
+    const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu,
+    bool failure_to_pick_is_error) {
+#define FAILED_TO_PICK_DEVICE(failing_status) \
+  do {                                        \
+    if (failure_to_pick_is_error) {           \
+      return failing_status;                  \
+    } else {                                  \
+      return {absl::nullopt};                 \
+    }                                         \
+  } while (false)
+
+  absl::optional<jit::DeviceId> maybe_gpu_device;
+  absl::optional<jit::DeviceId> maybe_cpu_device;
+  absl::optional<jit::DeviceId> maybe_unknown_device;
+
+  bool multiple_cpu_devices = false;
+  bool multiple_gpu_devices = false;
+  bool multiple_unknown_devices = false;
+
+  devices.ForEach([&](jit::DeviceId device) {
+    if (device_info_cache.IsGpu(device)) {
+      if (maybe_gpu_device) {
+        multiple_gpu_devices = true;
+        return false;
+      }
+      maybe_gpu_device = device;
+    } else if (device_info_cache.IsCpu(device)) {
+      if (maybe_cpu_device) {
+        multiple_cpu_devices = true;
+        return false;
+      }
+      maybe_cpu_device = device;
+    } else {
+      if (maybe_unknown_device) {
+        multiple_unknown_devices = true;
+        return false;
+      }
+      maybe_unknown_device = device;
+    }
+
+    return true;
+  });
+
+  if (multiple_cpu_devices) {
+    FAILED_TO_PICK_DEVICE(errors::Internal(
+        "Multiple CPU devices ", device_info_cache.DebugString(devices)));
+  }
+
+  if (multiple_gpu_devices) {
+    FAILED_TO_PICK_DEVICE(errors::Internal(
+        "Multiple GPU devices ", device_info_cache.DebugString(devices)));
+  }
+
+  if (multiple_unknown_devices) {
+    FAILED_TO_PICK_DEVICE(errors::Internal(
+        "Multiple unknown devices ", device_info_cache.DebugString(devices)));
+  }
+
+  if (maybe_unknown_device && maybe_gpu_device) {
+    FAILED_TO_PICK_DEVICE(errors::Internal(
+        "Found both unknown and GPU devices: ",
+        device_info_cache.GetNameFor(*maybe_unknown_device), ", ",
+        device_info_cache.GetNameFor(*maybe_gpu_device)));
+  }
+
+  if (!allow_mixing_unknown_and_cpu) {
+    if (maybe_unknown_device && maybe_cpu_device) {
+      FAILED_TO_PICK_DEVICE(errors::Internal(
+          "Found both unknown and CPU devices: ",
+          device_info_cache.GetNameFor(*maybe_unknown_device), ", ",
+          device_info_cache.GetNameFor(*maybe_cpu_device)));
+    }
+  }
+
+  if (maybe_gpu_device) {
+    return {*maybe_gpu_device};
+  } else if (maybe_unknown_device) {
+    return {*maybe_unknown_device};
+  } else if (maybe_cpu_device) {
+    return {*maybe_cpu_device};
+  }
+
+  FAILED_TO_PICK_DEVICE(errors::Internal("Empty device set!"));
+
+#undef FAILED_TO_PICK_DEVICE
+}
+
+xla::StatusOr<jit::DeviceId> PickDeviceForXla(
+    const jit::DeviceInfoCache& device_info_cache,
+    const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu) {
+  TF_ASSIGN_OR_RETURN(absl::optional<jit::DeviceId> device_id,
+                      PickDeviceForXlaImpl(device_info_cache, devices,
+                                           allow_mixing_unknown_and_cpu,
+                                           /*failure_to_pick_is_error=*/true));
+  return *device_id;
+}
+
+xla::StatusOr<absl::optional<jit::DeviceId>> MaybePickDeviceForXla(
+    const jit::DeviceInfoCache& device_info_cache,
+    const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu) {
+  return PickDeviceForXlaImpl(device_info_cache, devices,
+                              allow_mixing_unknown_and_cpu,
+                              /*failure_to_pick_is_error=*/false);
+}
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h
new file mode 100644
index 0000000..f26a565
--- /dev/null
+++ b/tensorflow/compiler/jit/device_util.h
@@ -0,0 +1,211 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
+#define TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
+
+#include <functional>
+#include <memory>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+namespace jit {
+// Instances of DeviceId represent TensorFlow devices as integers.
+//
+// This helps avoid having to manipulate device names as strings when
+// auto-clustering.
+class DeviceId {
+ public:
+  DeviceId(DeviceId&&) = default;
+  DeviceId(const DeviceId&) = default;
+  DeviceId& operator=(const DeviceId&) = default;
+
+  bool operator==(const DeviceId& other) const { return id() == other.id(); }
+  bool operator!=(const DeviceId& other) const { return !(*this == other); }
+
+ private:
+  int id_;
+
+  explicit DeviceId(int id) : id_(id) {}
+
+  int id() const { return id_; }
+
+  friend class DeviceInfoCache;
+  friend class DeviceSet;
+};
+
+// A set of DeviceIds, represented as a bitmap.
+class DeviceSet {
+ public:
+  void Insert(DeviceId device_id);
+  void UnionWith(const DeviceSet& other);
+  bool IsEmpty() const;
+
+  // Calls `func` on each DeviceId in the set, in increasing id order.  Stops
+  // iterating early if `func` returns false.
+  //
+  // TODO(sanjoy): Change this to take a typed std::function if that's
+  // performance neutral.
+  template <typename FnTy>
+  void ForEach(FnTy func) const {
+    // This is really a poor man's iterator, we should consider writing a proper
+    // iterator if this ends up being used widely.
+    for (int word_index = 0; word_index < storage_.size(); word_index++) {
+      uint64 word = storage_[word_index];
+      while (word != 0) {
+        uint64 only_lowest_bit_set = word & -word;
+        // Trailing-zero count of a non-zero word == index of its lowest 1 bit.
+        int bit_index = ctz_uint64(word);
+        if (!func(DeviceId(word_index * kWordSize + bit_index))) {
+          return;
+        }
+        word ^= only_lowest_bit_set;
+      }
+    }
+  }
+
+ private:
+  // Returns the number of trailing zero bits in `x`, which must be non-zero.
+  static int ctz_uint64(uint64 x) {
+    DCHECK_NE(x, 0);
+#ifdef __GNUC__
+    return __builtin_ctzll(x);  // Not ctzl: `long` may be only 32 bits.
+#else
+    int result = 0;
+    while ((x & 1u) == 0u) {
+      x >>= 1;
+      ++result;
+    }
+    return result;
+#endif
+  }
+
+  absl::InlinedVector<uint64, 1> storage_;
+  // Bits per word of storage_.  Static so that DeviceSet stays assignable.
+  static constexpr int kWordSize = 64;
+};
+
+// Caches some miscellaneous information about TF devices.  Thread compatible.
+class DeviceInfoCache {
+ public:
+  bool IsGpu(DeviceId device) const { return is_gpu_[device.id()]; }
+  bool IsCpu(DeviceId device) const { return is_cpu_[device.id()]; }
+
+  absl::string_view GetNameFor(DeviceId device) const {
+    return names_[device.id()];
+  }
+
+  xla::StatusOr<DeviceId> GetIdFor(absl::string_view name);
+
+  using DeviceRegistration = const XlaOpRegistry::DeviceRegistration;
+
+  DeviceRegistration* GetCompilationDevice(DeviceId device) const {
+    return id_to_compilation_device_[device.id()];
+  }
+
+  xla::StatusOr<DeviceRegistration*> GetCompilationDevice(
+      absl::string_view name) {
+    TF_ASSIGN_OR_RETURN(DeviceId device_id, GetIdFor(name));
+    return GetCompilationDevice(device_id);
+  }
+
+  const DeviceType& GetDeviceTypeFor(DeviceId device) const {
+    return *id_to_device_type_[device.id()];
+  }
+
+  using DeviceTypeConstRef = std::reference_wrapper<const DeviceType>;
+
+  xla::StatusOr<DeviceTypeConstRef> GetDeviceTypeFor(
+      absl::string_view device_name) {
+    TF_ASSIGN_OR_RETURN(DeviceId device_id, GetIdFor(device_name));
+    return std::cref(*id_to_device_type_[device_id.id()]);
+  }
+
+  string DebugString(const DeviceSet& device_set) const;
+
+ private:
+  absl::flat_hash_map<string, DeviceId> name_to_id_;
+
+  // These fields are populated for a device in GetIdFor, *before* we give out a
+  // DeviceId.
+  std::vector<const XlaOpRegistry::DeviceRegistration*>
+      id_to_compilation_device_;
+  std::vector<std::unique_ptr<DeviceType>> id_to_device_type_;
+  std::vector<string> names_;
+  std::vector<bool> is_cpu_;
+  std::vector<bool> is_gpu_;
+};
+
+}  // namespace jit
+
+// Returns the DeviceType corresponding to 'device'.
+Status DeviceNameToDeviceType(const string& device, DeviceType* device_type);
+
+// Picks the device for which XLA should compile a cluster that contains
+// operations placed in devices in `devices`.  For instance a cluster that
+// contains operations solely placed on the CPU will be compiled into a CPU
+// executable by XLA, whereas a cluster that contains operations placed on the
+// CPU and also operations placed on the GPU will be compiled into a GPU
+// executable.
+//
+// Returns a non-OK Status if no unambiguous choice of device exists.
+//
+// We choose the device using the following rules:
+//
+//  - It is an error for `devices` to contain more than one device of the same
+//    type.
+//  - GPU is preferred over CPU.
+//  - If `allow_mixing_unknown_and_cpu` is true then unknown devices are
+//    preferred over CPU.
+//  - XLA devices count as "unrecognized devices".
+//
+// This set of rules above implicitly assume that XLA:GPU can compile all
+// operations in the cluster that XLA:CPU can compile, and if
+// `allow_mixing_unknown_and_cpu` then the unrecognized device can also compile
+// all operations in the cluster that XLA:CPU can compile.
+//
+// We provide the `allow_mixing_unknown_and_cpu` knob so that we can do both of
+// the following things:
+//
+// - Let MarkForCompilationPass not inject CPU-placed operations into clusters
+//   that will run on unknown devices (because the unknown XLA backend may not
+//   support every operation supported by CPU).
+// - Let BuildXlaOpsPass successfully infer a compilation device for a cluster
+//   that contains nodes placed on both the CPU and on unknown devices.  In this
+//   case it is the responsibility of the optimization pass that injected the
+//   CPU nodes into the cluster to ensure that these nodes can be compiled by
+//   the unknown XLA backend.
+xla::StatusOr<jit::DeviceId> PickDeviceForXla(
+    const jit::DeviceInfoCache& device_info_cache,
+    const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu);
+
+// This is like `PickDeviceForXla` except that it returns nullopt (instead of a
+// non-OK Status) if no unambiguous choice of device exists.
+//
+// We return a failing Status for errors unrelated to the device choice
+// algorithm itself.
+xla::StatusOr<absl::optional<jit::DeviceId>> MaybePickDeviceForXla(
+    const jit::DeviceInfoCache& device_info_cache,
+    const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_DEVICE_INFO_CACHE_H_
diff --git a/tensorflow/compiler/jit/device_util_test.cc b/tensorflow/compiler/jit/device_util_test.cc
new file mode 100644
index 0000000..9396c49
--- /dev/null
+++ b/tensorflow/compiler/jit/device_util_test.cc
@@ -0,0 +1,132 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/device_util.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+Status PickDeviceHelper(bool allow_mixing_unknown_and_cpu,
+                        absl::Span<const absl::string_view> device_names,
+                        string* result) {
+  jit::DeviceInfoCache cache;
+  jit::DeviceSet device_set;
+  for (absl::string_view name : device_names) {
+    TF_ASSIGN_OR_RETURN(jit::DeviceId device_id, cache.GetIdFor(name));
+    device_set.Insert(device_id);
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      jit::DeviceId result_id,
+      PickDeviceForXla(cache, device_set, allow_mixing_unknown_and_cpu));
+  *result = string(cache.GetNameFor(result_id));
+  return Status::OK();
+}
+
+void CheckPickDeviceResult(absl::string_view expected_result,
+                           bool allow_mixing_unknown_and_cpu,
+                           absl::Span<const absl::string_view> inputs) {
+  string result;
+  TF_ASSERT_OK(PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result))
+      << "inputs = [" << absl::StrJoin(inputs, ", ")
+      << "], allow_mixing_unknown_and_cpu=" << allow_mixing_unknown_and_cpu
+      << ", expected_result=" << expected_result;
+  EXPECT_EQ(result, expected_result);
+}
+
+void CheckPickDeviceHasError(bool allow_mixing_unknown_and_cpu,
+                             absl::Span<const absl::string_view> inputs) {
+  string result;
+  EXPECT_FALSE(
+      PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result).ok());
+}
+
+const char* const kCPU0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+const char* const kGPU0 = "/job:localhost/replica:0/task:0/device:GPU:0";
+const char* const kXPU0 = "/job:localhost/replica:0/task:0/device:XPU:0";
+const char* const kYPU0 = "/job:localhost/replica:0/task:0/device:YPU:0";
+// Second devices of some of the types above, for same-type conflict tests.
+const char* const kCPU1 = "/job:localhost/replica:0/task:0/device:CPU:1";
+const char* const kGPU1 = "/job:localhost/replica:0/task:0/device:GPU:1";
+const char* const kXPU1 = "/job:localhost/replica:0/task:0/device:XPU:1";
+
+TEST(PickDeviceForXla, UniqueDevice) {
+  CheckPickDeviceResult(kGPU0, false, {kGPU0, kGPU0});
+}
+
+TEST(PickDeviceForXla, DeviceOrder) {
+  CheckPickDeviceResult(kGPU0, false, {kGPU0, kCPU0});
+  CheckPickDeviceResult(kGPU0, false, {kCPU0, kGPU0});
+  CheckPickDeviceResult(kXPU0, true, {kXPU0, kCPU0});
+}
+
+TEST(PickDeviceForXla, MultipleUnknownDevices) {
+  CheckPickDeviceHasError(false, {kXPU0, kYPU0});
+}
+
+TEST(PickDeviceForXla, GpuAndUnknown) {
+  CheckPickDeviceHasError(false, {kGPU0, kXPU1});
+}
+
+TEST(PickDeviceForXla, UnknownAndCpu) {
+  CheckPickDeviceHasError(false, {kXPU0, kCPU1});
+}
+
+TEST(PickDeviceForXla, MultipleDevicesOfSameType) {
+  CheckPickDeviceHasError(true, {kCPU0, kCPU1});
+  CheckPickDeviceHasError(false, {kCPU0, kCPU1});
+  CheckPickDeviceHasError(false, {kGPU0, kGPU1});
+  CheckPickDeviceHasError(false, {kXPU0, kXPU1});
+  CheckPickDeviceHasError(false, {kCPU0, kCPU1, kGPU0});
+}
+
+void SimpleRoundTripTestForDeviceSet(int num_devices) {
+  jit::DeviceSet device_set;
+  jit::DeviceInfoCache device_info_cache;
+
+  std::vector<string> expected_devices, actual_devices;
+
+  for (int i = 0; i < num_devices; i++) {
+    string device_name =
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XPU:", i);
+    TF_ASSERT_OK_AND_ASSIGN(jit::DeviceId device_id,
+                            device_info_cache.GetIdFor(device_name));
+    device_set.Insert(device_id);
+    expected_devices.push_back(device_name);
+  }
+
+  device_set.ForEach([&](jit::DeviceId device_id) {
+    actual_devices.push_back(string(device_info_cache.GetNameFor(device_id)));
+    return true;
+  });
+
+  EXPECT_EQ(expected_devices, actual_devices);
+}
+
+TEST(DeviceSetTest, SimpleRoundTrip_One) { SimpleRoundTripTestForDeviceSet(1); }
+
+TEST(DeviceSetTest, SimpleRoundTrip_Small) {
+  SimpleRoundTripTestForDeviceSet(8);
+}
+
+TEST(DeviceSetTest, SimpleRoundTrip_Large) {
+  SimpleRoundTripTestForDeviceSet(800);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index d05df8f..b6d9743 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -2497,8 +2497,6 @@
     const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn,
     bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
     FunctionLibraryDefinition* library) {
-  Status s;
-
   Encapsulator encapsulator(std::move(group_attribute),
                             std::move(outside_compilation_attribute),
                             &graph_in);
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 0c310c1..3ee3c5e 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -36,6 +36,10 @@
 
 bool SetterForXlaAutoJitFlag(const string& value) {
   int32 opt_level;
+  // We need to use the mark_for_compilation_flags directly here instead of
+  // going via GetMarkForCompilationPassFlags() to avoid infinite recursion. The
+  // latter will try to set up and parse flags, which would bring us back to
+  // this setter.
   if (absl::SimpleAtoi(value, &opt_level)) {
     mark_for_compilation_flags->xla_auto_jit_flag
         .optimization_level_single_gpu = opt_level;
@@ -155,9 +159,14 @@
 
 }  // namespace
 
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
+bool SetXlaAutoJitFlagFromFlagString(const string& value) {
   std::call_once(flags_init, &AllocateAndParseFlags);
-  return *build_ops_flags;
+  return SetterForXlaAutoJitFlag(value);
+}
+
+BuildXlaOpsPassFlags* GetBuildXlaOpsPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return build_ops_flags;
 }
 
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index a11d9d4..42608d1 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -38,6 +38,12 @@
   int32 optimization_level_general;
 };
 
+// Sets the xla_auto_jit_flag based on the given flag string. Supported syntax
+// is:
+// <number>: sets general and single_gpu setting to the provided number.
+// single-gpu(<number>): sets the single_gpu setting to the provided number.
+bool SetXlaAutoJitFlagFromFlagString(const string& value);
+
 // Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
   XlaAutoJitFlag xla_auto_jit_flag;
@@ -111,7 +117,7 @@
 // parses TF_XLA_FLAGS for all of them.  Those functions which return a pointer
 // always return the same pointer.
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
+BuildXlaOpsPassFlags* GetBuildXlaOpsPassFlags();
 XlaDeviceFlags* GetXlaDeviceFlags();
 const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
 
diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD
index 8212956..f9be7c4 100644
--- a/tensorflow/compiler/jit/graphcycles/BUILD
+++ b/tensorflow/compiler/jit/graphcycles/BUILD
@@ -13,8 +13,23 @@
     srcs = ["graphcycles.cc"],
     hdrs = ["graphcycles.h"],
     deps = [
+        ":ordered_set",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "ordered_set",
+    hdrs = ["ordered_set.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -28,3 +43,14 @@
         "//tensorflow/core:test_main",
     ],
 )
+
+tf_cc_test(
+    name = "ordered_set_test",
+    srcs = ["ordered_set_test.cc"],
+    deps = [
+        ":ordered_set",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc
index 71abee2..f5655ff 100644
--- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc
+++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc
@@ -34,14 +34,20 @@
 #include <algorithm>
 #include <unordered_set>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/jit/graphcycles/ordered_set.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 
 namespace {
 
-typedef std::unordered_set<int32> NodeSet;
+using NodeSet = absl::flat_hash_set<int32>;
+using OrderedNodeSet = OrderedSet<int32>;
+
 template <typename T>
 struct VecStruct {
   typedef absl::InlinedVector<T, 4> type;
@@ -50,13 +56,11 @@
 using Vec = typename VecStruct<T>::type;
 
 struct Node {
-  Node() : in(4), out(4) {}  // Small hashtables for in/out edges
-
   int32 rank;    // rank number assigned by Pearce-Kelly algorithm
   bool visited;  // Temporary marker used by depth-first-search
   void* data;    // User-supplied data
-  NodeSet in;    // List of immediate predecessor nodes in graph
-  NodeSet out;   // List of immediate successor nodes in graph
+  OrderedNodeSet in;   // List of immediate predecessor nodes in graph
+  OrderedNodeSet out;  // List of immediate successor nodes in graph
 };
 
 }  // namespace
@@ -93,7 +97,7 @@
     if (!ranks.insert(nx->rank).second) {
       LOG(FATAL) << "Duplicate occurrence of rank " << nx->rank;
     }
-    for (auto y : nx->out) {
+    for (int32 y : nx->out.GetSequence()) {
       Node* ny = r->nodes_[y];
       if (nx->rank >= ny->rank) {
         LOG(FATAL) << "Edge " << x << "->" << y << " has bad rank assignment "
@@ -124,14 +128,14 @@
 
 void GraphCycles::RemoveNode(int32 node) {
   Node* x = rep_->nodes_[node];
-  for (auto y : x->out) {
-    rep_->nodes_[y]->in.erase(node);
+  for (int32 y : x->out.GetSequence()) {
+    rep_->nodes_[y]->in.Erase(node);
   }
-  for (auto y : x->in) {
-    rep_->nodes_[y]->out.erase(node);
+  for (int32 y : x->in.GetSequence()) {
+    rep_->nodes_[y]->out.Erase(node);
   }
-  x->in.clear();
-  x->out.clear();
+  x->in.Clear();
+  x->out.Clear();
   rep_->free_nodes_.push_back(node);
 }
 
@@ -144,12 +148,12 @@
 }
 
 bool GraphCycles::HasEdge(int32 x, int32 y) const {
-  return rep_->nodes_[x]->out.find(y) != rep_->nodes_[x]->out.end();
+  return rep_->nodes_[x]->out.Contains(y);
 }
 
 void GraphCycles::RemoveEdge(int32 x, int32 y) {
-  rep_->nodes_[x]->out.erase(y);
-  rep_->nodes_[y]->in.erase(x);
+  rep_->nodes_[x]->out.Erase(y);
+  rep_->nodes_[y]->in.Erase(x);
   // No need to update the rank assignment since a previous valid
   // rank assignment remains valid after an edge deletion.
 }
@@ -165,13 +169,13 @@
   if (x == y) return false;
   Rep* r = rep_;
   Node* nx = r->nodes_[x];
-  if (!nx->out.insert(y).second) {
+  if (!nx->out.Insert(y)) {
     // Edge already exists.
     return true;
   }
 
   Node* ny = r->nodes_[y];
-  ny->in.insert(x);
+  ny->in.Insert(x);
 
   if (nx->rank <= ny->rank) {
     // New edge is consistent with existing rank assignment.
@@ -182,8 +186,8 @@
   // We only need to consider nodes that fall in the range [ny->rank,nx->rank].
   if (!ForwardDFS(r, y, nx->rank)) {
     // Found a cycle.  Undo the insertion and tell caller.
-    nx->out.erase(y);
-    ny->in.erase(x);
+    nx->out.Erase(y);
+    ny->in.Erase(x);
     // Since we do not call Reorder() on this path, clear any visited
     // markers left by ForwardDFS.
     ClearVisitedBits(r, r->deltaf_);
@@ -209,7 +213,7 @@
     nn->visited = true;
     r->deltaf_.push_back(n);
 
-    for (auto w : nn->out) {
+    for (auto w : nn->out.GetSequence()) {
       Node* nw = r->nodes_[w];
       if (nw->rank == upper_bound) {
         return false;  // Cycle
@@ -235,7 +239,7 @@
     nn->visited = true;
     r->deltab_.push_back(n);
 
-    for (auto w : nn->in) {
+    for (auto w : nn->in.GetSequence()) {
       Node* nw = r->nodes_[w];
       if (!nw->visited && lower_bound < nw->rank) {
         r->stack_.push_back(w);
@@ -321,7 +325,7 @@
       return path_len;
     }
 
-    for (auto w : r->nodes_[n]->out) {
+    for (auto w : r->nodes_[n]->out.GetSequence()) {
       if (seen.insert(w).second) {
         r->stack_.push_back(w);
       }
@@ -375,31 +379,84 @@
   }
 
   Node* nb = rep_->nodes_[b];
-  std::unordered_set<int32> out = std::move(nb->out);
-  std::unordered_set<int32> in = std::move(nb->in);
-  for (auto y : out) {
-    rep_->nodes_[y]->in.erase(b);
+  OrderedNodeSet out = std::move(nb->out);
+  OrderedNodeSet in = std::move(nb->in);
+  for (int32 y : out.GetSequence()) {
+    rep_->nodes_[y]->in.Erase(b);
   }
-  for (auto y : in) {
-    rep_->nodes_[y]->out.erase(b);
+  for (int32 y : in.GetSequence()) {
+    rep_->nodes_[y]->out.Erase(b);
   }
   rep_->free_nodes_.push_back(b);
 
-  for (auto y : out) {
+  rep_->nodes_[a]->out.Reserve(rep_->nodes_[a]->out.Size() + out.Size());
+  for (int32 y : out.GetSequence()) {
     InsertEdge(a, y);
   }
-  for (auto y : in) {
+
+  rep_->nodes_[a]->in.Reserve(rep_->nodes_[a]->in.Size() + in.Size());
+  for (int32 y : in.GetSequence()) {
     InsertEdge(y, a);
   }
+
   return true;
 }
 
-std::unordered_set<int32> GraphCycles::Successors(int32 node) const {
-  return rep_->nodes_[node]->out;
+absl::Span<const int32> GraphCycles::Successors(int32 node) const {
+  return rep_->nodes_[node]->out.GetSequence();
 }
 
-std::unordered_set<int32> GraphCycles::Predecessors(int32 node) const {
-  return rep_->nodes_[node]->in;
+absl::Span<const int32> GraphCycles::Predecessors(int32 node) const {
+  return rep_->nodes_[node]->in.GetSequence();
+}
+
+namespace {
+// Sorts `to_sort` by descending node rank.
+void SortInPostOrder(absl::Span<Node* const> nodes,
+                     std::vector<int32>* to_sort) {
+  absl::c_sort(*to_sort, [&nodes](int32 lhs, int32 rhs) {
+    DCHECK(lhs == rhs || nodes[lhs]->rank != nodes[rhs]->rank);
+    return nodes[rhs]->rank < nodes[lhs]->rank;
+  });
+}
+}  // namespace
+
+std::vector<int32> GraphCycles::AllNodesInPostOrder() const {
+  absl::flat_hash_set<int32> freed(rep_->free_nodes_.begin(),
+                                   rep_->free_nodes_.end());
+
+  std::vector<int32> live_nodes;
+  live_nodes.reserve(rep_->nodes_.size() - freed.size());
+  const int64 node_count = rep_->nodes_.size();
+  for (int64 id = 0; id < node_count; ++id) {
+    if (freed.contains(id)) continue;
+    live_nodes.push_back(id);
+  }
+
+  SortInPostOrder(rep_->nodes_, &live_nodes);
+  return live_nodes;
+}
+
+// Renders the nodes not listed in `free_nodes_` as graphviz text: one edge
+// statement per (node, successor) pair, wrapped in a `digraph { ... }` block.
+string GraphCycles::DebugString() const {
+  // Ids listed in `free_nodes_` are skipped when emitting edges.
+  absl::flat_hash_set<int32> freed(rep_->free_nodes_.begin(),
+                                   rep_->free_nodes_.end());
+
+  string dot = "digraph {\n";
+  for (int src = 0; src < rep_->nodes_.size(); src++) {
+    if (!freed.contains(src)) {
+      // One `"src" -> "dst"` line per outgoing edge, in sequence order.
+      for (int32 dst : rep_->nodes_[src]->out.GetSequence()) {
+        absl::StrAppend(&dot, "  \"", src, "\" -> \"", dst, "\"\n");
+      }
+    }
+  }
+
+  // Close the digraph block opened above.
+  absl::StrAppend(&dot, "}\n");
+  return dot;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.h b/tensorflow/compiler/jit/graphcycles/graphcycles.h
index 8e7801d..28f7fbb 100644
--- a/tensorflow/compiler/jit/graphcycles/graphcycles.h
+++ b/tensorflow/compiler/jit/graphcycles/graphcycles.h
@@ -16,6 +16,8 @@
 #ifndef TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_GRAPHCYCLES_H_
 #define TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_GRAPHCYCLES_H_
 
+#include <vector>
+
 // GraphCycles detects the introduction of a cycle into a directed
 // graph that is being built up incrementally.
 //
@@ -38,8 +40,7 @@
 //   FindPath() is linear in the size of the graph.
 // The current implementation uses O(|V|+|E|) space.
 
-#include <unordered_set>
-
+#include "absl/types/span.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -117,8 +118,17 @@
   // Expensive: should only be called from graphcycles_test.cc.
   bool CheckInvariants() const;
 
-  std::unordered_set<int32> Successors(int32 node) const;
-  std::unordered_set<int32> Predecessors(int32 node) const;
+  absl::Span<const int32> Successors(int32 node) const;
+  absl::Span<const int32> Predecessors(int32 node) const;
+
+  // Returns all nodes in post order.
+  //
+  // If there is a path from X to Y then X appears after Y in the
+  // returned vector.
+  std::vector<int32> AllNodesInPostOrder() const;
+
+  // Returns the graph in graphviz format.
+  string DebugString() const;
 
   // ----------------------------------------------------
   struct Rep;
diff --git a/tensorflow/compiler/jit/graphcycles/ordered_set.h b/tensorflow/compiler/jit/graphcycles/ordered_set.h
new file mode 100644
index 0000000..0417782
--- /dev/null
+++ b/tensorflow/compiler/jit/graphcycles/ordered_set.h
@@ -0,0 +1,85 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_
+#define TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_
+
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+// This is a set data structure that provides a deterministic iteration order.
+// The iteration order of elements only depends on the sequence of
+// inserts/deletes, so as long as the inserts/deletes happen in the same
+// sequence, the set will have the same iteration order.
+//
+// Assumes that T can be cheaply copied for simplicity.
+template <typename T>
+class OrderedSet {
+ public:
+  // Inserts `value` into the ordered set.  Returns true if the value was not
+  // present in the set before the insertion.
+  bool Insert(T value) {
+    bool new_insertion =
+        value_to_index_.insert({value, value_sequence_.size()}).second;
+    if (new_insertion) {
+      value_sequence_.push_back(value);
+    }
+    return new_insertion;
+  }
+
+  // Removes `value` from the set.  Assumes `value` is already present in the
+  // set.
+  void Erase(T value) {
+    auto it = value_to_index_.find(value);
+    DCHECK(it != value_to_index_.end());
+
+    // Since we don't want to move values around in `value_sequence_` we swap
+    // the value in the last position with the value to be deleted and then
+    // pop_back.
+    value_to_index_[value_sequence_.back()] = it->second;
+    std::swap(value_sequence_[it->second], value_sequence_.back());
+    value_sequence_.pop_back();
+    value_to_index_.erase(it);
+  }
+
+  void Reserve(size_t new_size) {
+    value_to_index_.reserve(new_size);
+    value_sequence_.reserve(new_size);
+  }
+
+  void Clear() {
+    value_to_index_.clear();
+    value_sequence_.clear();
+  }
+
+  bool Contains(T value) const { return value_to_index_.contains(value); }
+  size_t Size() const { return value_sequence_.size(); }
+
+  absl::Span<T const> GetSequence() const { return value_sequence_; }
+
+ private:
+  // The deterministic order we maintain; Erase may move the last value.
+  std::vector<T> value_sequence_;
+
+  // Maps values to their indices in `value_sequence_`.
+  absl::flat_hash_map<T, int> value_to_index_;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_
diff --git a/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc b/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc
new file mode 100644
index 0000000..38ac1cf
--- /dev/null
+++ b/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/graphcycles/ordered_set.h"
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+TEST(OrderedSetTest, Insert) {
+  OrderedSet<int> ordered_set;
+  EXPECT_TRUE(ordered_set.Insert(90));
+  EXPECT_TRUE(ordered_set.Insert(100));
+  EXPECT_TRUE(ordered_set.Insert(80));
+
+  EXPECT_FALSE(ordered_set.Insert(100));
+
+  EXPECT_EQ(ordered_set.Size(), 3);
+
+  EXPECT_TRUE(ordered_set.Contains(90));
+  EXPECT_TRUE(ordered_set.Contains(100));
+  EXPECT_TRUE(ordered_set.Contains(80));
+
+  EXPECT_FALSE(ordered_set.Contains(40));
+
+  std::array<int, 3> expected_sequence = {90, 100, 80};
+  EXPECT_EQ(ordered_set.GetSequence(), expected_sequence);
+}
+
+TEST(OrderedSetTest, Erase) {
+  OrderedSet<int> ordered_set;
+  EXPECT_TRUE(ordered_set.Insert(90));
+  EXPECT_TRUE(ordered_set.Insert(100));
+  EXPECT_TRUE(ordered_set.Insert(80));
+
+  ordered_set.Erase(100);
+
+  EXPECT_EQ(ordered_set.Size(), 2);
+
+  EXPECT_TRUE(ordered_set.Contains(90));
+  EXPECT_FALSE(ordered_set.Contains(100));
+  EXPECT_TRUE(ordered_set.Contains(80));
+
+  std::array<int, 2> expected_sequence_0 = {90, 80};
+  EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_0);
+
+  ordered_set.Erase(80);
+
+  EXPECT_EQ(ordered_set.Size(), 1);
+
+  EXPECT_TRUE(ordered_set.Contains(90));
+  EXPECT_FALSE(ordered_set.Contains(100));
+  EXPECT_FALSE(ordered_set.Contains(80));
+
+  std::array<int, 1> expected_sequence_1 = {90};
+  EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_1);
+
+  ordered_set.Erase(90);
+
+  EXPECT_EQ(ordered_set.Size(), 0);
+
+  EXPECT_FALSE(ordered_set.Contains(90));
+  EXPECT_FALSE(ordered_set.Contains(100));
+  EXPECT_FALSE(ordered_set.Contains(80));
+
+  std::array<int, 0> expected_sequence_2 = {};
+  EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_2);
+}
+
+TEST(OrderedSetTest, Clear) {
+  OrderedSet<int> ordered_set;
+  EXPECT_TRUE(ordered_set.Insert(90));
+  EXPECT_TRUE(ordered_set.Insert(100));
+  EXPECT_TRUE(ordered_set.Insert(80));
+
+  ordered_set.Clear();
+
+  EXPECT_EQ(ordered_set.Size(), 0);
+
+  EXPECT_FALSE(ordered_set.Contains(90));
+  EXPECT_FALSE(ordered_set.Contains(100));
+  EXPECT_FALSE(ordered_set.Contains(80));
+
+  std::array<int, 0> expected_sequence = {};
+  EXPECT_EQ(ordered_set.GetSequence(), expected_sequence);
+}
+
+TEST(OrderedSetTest, LargeInsertions) {
+  const int kSize = 50 * 9000;
+
+  OrderedSet<int> ordered_set;
+
+  for (int i = 0; i < kSize; i++) {
+    EXPECT_TRUE(ordered_set.Insert(i + 500));
+  }
+
+  for (int i = 0; i < kSize; i++) {
+    EXPECT_EQ(ordered_set.GetSequence()[i], i + 500);
+  }
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index 69186da..6c470fa 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -39,6 +39,10 @@
 // third_party/tensorflow/compiler/tf2xla/functionalize_control_flow_pass_registration.cc
 // FunctionalizeControlFlowPass: 27
 //
+// from
+// third_party/tensorflow/compiler/tf2xla/rearrange_function_argument_pass_registration.cc
+// RearrangeFunctionArgumentPass: 28
+//
 // This pass looks at the graph and all associated FunctionDefs, and turns
 // traditional control flow structure (Switch/Merge/etc.) into functional
 // control flow structure (XlaIf/XlaWhile). Following passes must
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index 88d00f7..6df0991 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -62,7 +62,7 @@
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
   std::unique_ptr<XlaAllocator> xla_allocator;
-  xla::DeviceMemoryAllocator* device_allocator = nullptr;
+  se::DeviceMemoryAllocator* device_allocator = nullptr;
 
   if (ctx->device_type() == DeviceType(DEVICE_CPU)) {
     platform_id = se::host::kHostPlatformId;
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h
index 7b4d4b5..eaa6867 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.h
+++ b/tensorflow/compiler/jit/kernels/xla_ops.h
@@ -40,7 +40,7 @@
                            se::Platform::Id platform_id,
                            const XlaDevice::Metadata* xla_device_metadata,
                            std::unique_ptr<XlaAllocator> xla_allocator,
-                           xla::DeviceMemoryAllocator* device_allocator)
+                           se::DeviceMemoryAllocator* device_allocator)
       : device_type_(device_type),
         platform_id_(platform_id),
         xla_device_metadata_(xla_device_metadata),
@@ -55,7 +55,7 @@
     return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams();
   }
 
-  xla::DeviceMemoryAllocator* allocator() const {
+  se::DeviceMemoryAllocator* allocator() const {
     return device_allocator_ ? device_allocator_ : xla_allocator_.get();
   }
   DeviceType device_type() const { return device_type_; }
@@ -86,7 +86,7 @@
   // then device_allocator_ is null and xla_allocator_ points to an appropriate
   // XlaAllocator instance.
   std::unique_ptr<XlaAllocator> xla_allocator_;
-  xla::DeviceMemoryAllocator* device_allocator_;
+  se::DeviceMemoryAllocator* device_allocator_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo);
 };
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 6ccfccd..1952a4e 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -24,9 +24,10 @@
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/jit/compilability_check_util.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
-#include "tensorflow/compiler/jit/device_info_cache.h"
+#include "tensorflow/compiler/jit/device_util.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
@@ -56,16 +57,10 @@
 
 namespace {
 using DeadnessPredicate = DeadnessAnalysis::DeadnessPredicate;
+using jit::DeviceId;
+using jit::DeviceSet;
 using xla::StatusOr;
 
-bool HasResourceOutput(const Node& node) {
-  return absl::c_count(node.output_types(), DT_RESOURCE) != 0;
-}
-
-bool HasResourceInput(const Node& node) {
-  return absl::c_count(node.input_types(), DT_RESOURCE) != 0;
-}
-
 // The clusters we create here are eventually lowered into an
 // _XlaCompile/_XlaRun pair with a TF executor "fallback" that uses the
 // PartitionedCall op to execute the cluster in the regular graph executor if
@@ -82,306 +77,6 @@
 // cluster.
 const char* kXlaAlreadyClustered = "_XlaAlreadyClustered";
 
-// Checks whether a TF node can be compiled or not.  "Recursive" as in for call
-// and functional while nodes it recursively checks whether the callee functions
-// can be compiled.
-class RecursiveCompilabilityChecker {
- public:
-  // Aggregates information about what kinds of ops are allowed.
-  struct OperationFilter {
-    // Whether resource variable ops are allowed are allowed in callees.  We do
-    // not allow resource variable ops in called functions (either as direct TF
-    // calls or as higher order control flow ops) because we do not yet model
-    // their memory effects in jit/resource_variable_safety_analysis.
-    bool allow_resource_ops_in_called_functions;
-
-    // Whether Stack operations are allowed.  We avoid auto-clustering Stack
-    // operations in general because we do not support snapshotting them.
-    //
-    // TODO(b/112837194): This restriction can be lifted with some work.
-    bool allow_stack_ops;
-
-    // Whether TensorArray operations are allowed.  We avoid auto-clustering
-    // TensorArray operations in general because we do not support snapshotting
-    // them.
-    //
-    // TODO(b/112837194): This restriction can be lifted with some work.
-    bool allow_tensor_array_ops;
-
-    // Whether stateful RNG ops are allowed.  XLA's RNG does not have the same
-    // seeding behavior as TensorFlow's RNG (b/34749654).  So we avoid
-    // auto-clustering stateful RNG ops.
-    bool allow_stateful_rng_ops;
-
-    // TODO(b/118970344): Whether ControlTrigger ops are allowed.  It is unsound
-    // to cluster ControlTrigger because of how we use deadness analysis.
-    bool allow_control_trigger;
-
-    // Whether it is okay to "cluster" Assert and CheckNumerics by simply
-    // removing them (they're not removed during clustering, but their
-    // XlaOpKernel is a no-op kernel).  We avoid auto-clustering these ops so
-    // that the user is not surprised when XLA is implicitly enabled. If the
-    // user explicitly specifies to use XLA, it is fine to resort to a dummy
-    // implementation. Currently Assert and CheckNumerics ops have dummy XLA
-    // implementations.
-    bool allow_eliding_assert_and_checknumerics_ops;
-
-    // Whether ops that produce or consume DT_VARIANT values are allowed.  We
-    // don't auto-cluster these ops because we don't yet support live-in or
-    // live-out DT_VARIANT values.
-    bool allow_ops_producing_or_consuming_variant;
-  };
-
-  RecursiveCompilabilityChecker(const OperationFilter* op_filter,
-                                const DeviceType* jit_device_type)
-      : op_filter_(*op_filter), jit_device_type_(*jit_device_type) {}
-
-  // Returns true if `node` can be compiled by XLA.
-  bool IsCompilableNode(const Node& node, FunctionLibraryRuntime* lib_runtime) {
-    return IsCompilableNode(node, /*depth=*/0, lib_runtime);
-  }
-
-  // Returns true if `call_def` can be compiled by XLA.  It is assumed that
-  // `call_def` is a call operation.
-  bool IsCompilableCall(const NodeDef& call_def,
-                        FunctionLibraryRuntime* lib_runtime) {
-    return IsCompilableCall(call_def, /*depth=*/0, lib_runtime);
-  }
-
- private:
-  bool IsCompilableNode(const Node& node, int depth,
-                        FunctionLibraryRuntime* lib_runtime);
-  bool IsCompilableCall(const NodeDef& call_def, int depth,
-                        FunctionLibraryRuntime* lib_runtime);
-  bool IsCompilableWhile(const Node& while_node, int depth,
-                         FunctionLibraryRuntime* lib_runtime);
-
-  bool IsStackOp(const Node& node) {
-    const XlaResourceOpInfo* op_info =
-        GetResourceOpInfoForOp(node.type_string());
-    return op_info && op_info->resource_kind() == XlaResourceKind::kStack;
-  }
-
-  bool IsTensorArrayOp(const Node& node) {
-    const XlaResourceOpInfo* op_info =
-        GetResourceOpInfoForOp(node.type_string());
-    return op_info && op_info->resource_kind() == XlaResourceKind::kTensorArray;
-  }
-
-  bool IsAssertOrCheckNumerics(absl::string_view op_name) {
-    return op_name == "Assert" || op_name == "CheckNumerics";
-  }
-
-  bool IsStatefulRandomOp(absl::string_view op_name) {
-    return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
-           op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
-           op_name == "TruncatedNormal" || op_name == "Multinomial";
-  }
-
-  bool OpProducesOrConsumesVariant(const Node& node) {
-    auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; };
-    return absl::c_any_of(node.input_types(), is_variant) ||
-           absl::c_any_of(node.output_types(), is_variant);
-  }
-
-  bool HasXLAKernel(const Node& node);
-
-  // Make sure we don't recurse infinitely on recursive functions.
-  const int kMaxRecursionDepth = 10;
-
-  const OperationFilter& op_filter_;
-  const DeviceType& jit_device_type_;
-};
-
-bool RecursiveCompilabilityChecker::HasXLAKernel(const Node& node) {
-  // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
-  // is really a kind of function call and will be handled by
-  // IsCompilableCall().
-  if (node.type_string() == "SymbolicGradient") return false;
-  if (node.type_string() == "Const") {
-    // Skip Const op with type DT_STRING, since XLA doesn't support it, but the
-    // registered Const KernelDef says that it does, to support no-op Assert for
-    // tfcompile.
-    const AttrValue* attr = node.attrs().Find("dtype");
-    if (attr != nullptr && attr->type() == DT_STRING) {
-      return false;
-    }
-  }
-
-  // XLA does not offer guaranteed aliasing between the input and output of the
-  // XLA cluster so it can't implement the forward-tensor-ref semantic.  Leave
-  // such nodes out of XLA clusters.
-  if (HasForwardedRefInput(node)) {
-    VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast.";
-    return false;
-  }
-
-  return FindKernelDef(jit_device_type_, node.def(), nullptr, nullptr).ok();
-}
-
-// Tests whether 'while_node' is a completely compilable loop.
-// Every operator in the condition and body functions must be compilable for a
-// while loop to be compilable.
-bool RecursiveCompilabilityChecker::IsCompilableWhile(
-    const Node& while_node, int depth, FunctionLibraryRuntime* lib_runtime) {
-  const NameAttrList* name_attr;
-  NodeDef call;
-  Status status;
-  status = GetNodeAttr(while_node.attrs(), "cond", &name_attr);
-  if (!status.ok()) {
-    VLOG(2) << "Rejecting While " << while_node.name()
-            << ": missing 'cond' attribute on While node.";
-    return false;
-  }
-  const string cond_func = name_attr->name();
-  call.set_name("while_cond");
-  call.set_op(cond_func);
-  *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, depth + 1, lib_runtime)) {
-    VLOG(2) << "Rejecting While " << while_node.name()
-            << ": can't compile loop condition: " << cond_func;
-    return false;
-  }
-  status = GetNodeAttr(while_node.attrs(), "body", &name_attr);
-  if (!status.ok()) {
-    VLOG(2) << "Rejecting While " << while_node.name()
-            << ": missing 'body' attribute on While node.";
-    return false;
-  }
-  const string body_func = name_attr->name();
-  call.set_name("while_body");
-  call.set_op(body_func);
-  *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, depth + 1, lib_runtime)) {
-    VLOG(2) << "Rejecting While " << while_node.name()
-            << ": can't compile loop body: " << body_func;
-    return false;
-  }
-  return true;
-}
-
-// Tests whether 'call_def' is a call to a completely compilable function.
-// Every operator in the function must be compilable for a function to be
-// compilable.
-bool RecursiveCompilabilityChecker::IsCompilableCall(
-    const NodeDef& call_def, int depth, FunctionLibraryRuntime* lib_runtime) {
-  if (depth > kMaxRecursionDepth) {
-    VLOG(2) << "Rejecting " << call_def.op()
-            << ": function depth limit exceeded.";
-    return false;
-  }
-
-  FunctionLibraryRuntime::Handle handle;
-  Status status = InstantiateFunctionCall(call_def, lib_runtime, &handle);
-  if (!status.ok()) {
-    VLOG(2) << "Rejecting " << call_def.DebugString()
-            << ": could not instantiate: " << status;
-    return false;
-  }
-
-  auto release_handle_on_return = gtl::MakeCleanup(
-      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
-
-  const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
-  CHECK(fbody);
-  for (Node* node : fbody->graph->op_nodes()) {
-    if (!IsCompilableNode(*node, depth + 1, lib_runtime)) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool LogNotCompilableAndReturn(const Node& node,
-                               absl::string_view reason = "") {
-  VLOG(3) << "Not clustering " << node.name() << " (op " << node.type_string()
-          << ")" << (reason.empty() ? "" : ": ") << reason;
-  return false;
-}
-
-bool RecursiveCompilabilityChecker::IsCompilableNode(
-    const Node& node, int depth, FunctionLibraryRuntime* lib_runtime) {
-  // _Arg nodes in a top-level function represent feeds and _Retval nodes in a
-  // top-level function represent fetches.
-  if (depth == 0 &&
-      (node.type_string() == "_Arg" || node.type_string() == "_Retval")) {
-    return LogNotCompilableAndReturn(node, "depth is 0");
-  }
-
-  if (node.attrs().Find("_scoped_allocator") ||
-      node.attrs().Find("_forward_from")) {
-    // TODO(b/128858118): XLA does not support _scoped_allocator and
-    // _forward_from.
-    return LogNotCompilableAndReturn(
-        node, "_scoped_allocator or _forward_from attribute");
-  }
-
-  if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) {
-    if (!IsCompilableCall(node.def(), depth + 1, lib_runtime)) {
-      return LogNotCompilableAndReturn(node, "unsupported function");
-    }
-  } else if (!HasXLAKernel(node)) {
-    return LogNotCompilableAndReturn(node, "unsupported op");
-  }
-
-  if (node.type_string() == "While" &&
-      !IsCompilableWhile(node, depth + 1, lib_runtime)) {
-    return LogNotCompilableAndReturn(node, "unsupported while");
-  }
-
-  if (!op_filter_.allow_stateful_rng_ops &&
-      IsStatefulRandomOp(node.type_string())) {
-    return LogNotCompilableAndReturn(node, "stateful random op");
-  }
-
-  if (!op_filter_.allow_control_trigger && node.IsControlTrigger()) {
-    return LogNotCompilableAndReturn(node);
-  }
-
-  if (!op_filter_.allow_eliding_assert_and_checknumerics_ops &&
-      IsAssertOrCheckNumerics(node.type_string())) {
-    return LogNotCompilableAndReturn(node, "Assert or CheckNumerics");
-  }
-
-  if (!op_filter_.allow_ops_producing_or_consuming_variant &&
-      OpProducesOrConsumesVariant(node)) {
-    return LogNotCompilableAndReturn(node, "DT_VARIANT producer/consumer");
-  }
-
-  if (!op_filter_.allow_stack_ops && IsStackOp(node)) {
-    return LogNotCompilableAndReturn(node, "Stack op");
-  }
-
-  if (!op_filter_.allow_tensor_array_ops && IsTensorArrayOp(node)) {
-    return LogNotCompilableAndReturn(node, "TensorArray op");
-  }
-
-  if (!op_filter_.allow_resource_ops_in_called_functions && depth > 0 &&
-      HasResourceInput(node)) {
-    return LogNotCompilableAndReturn(node,
-                                     "resource variable op in called function");
-  }
-
-  return true;
-}
-
-RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter(
-    const XlaOpRegistry::DeviceRegistration& registration) {
-  RecursiveCompilabilityChecker::OperationFilter op_filter;
-  op_filter.allow_resource_ops_in_called_functions =
-      registration.cluster_resource_variable_ops_unsafely;
-  op_filter.allow_stack_ops = registration.cluster_stack_ops;
-  op_filter.allow_tensor_array_ops = registration.cluster_tensor_array_ops;
-  op_filter.allow_stateful_rng_ops = registration.cluster_stateful_rng_ops;
-  op_filter.allow_control_trigger = registration.cluster_control_trigger;
-  op_filter.allow_eliding_assert_and_checknumerics_ops =
-      registration.elide_assert_and_checknumerics;
-  op_filter.allow_ops_producing_or_consuming_variant =
-      registration.cluster_variant_ops;
-  return op_filter;
-}
-
 class MarkForCompilationPassImpl {
  public:
   struct DebugOptions {
@@ -424,8 +119,8 @@
    public:
     // Constructs a trivial cluster representing a single TF node.
     Cluster(int tf_graph_node_id, int effective_cluster_size,
-            bool has_functional_control_flow,
-            absl::flat_hash_set<string> devices, string resource_op_device,
+            bool has_functional_control_flow, DeviceSet devices,
+            absl::optional<DeviceId> resource_op_device,
             absl::optional<int> resource_var_operation_node_id,
             absl::optional<DeadnessPredicate> deadness_predicate,
             bool is_xla_compile_attr_true, absl::optional<string> xla_scope)
@@ -433,7 +128,7 @@
           effective_cluster_size_(effective_cluster_size),
           has_functional_control_flow_(has_functional_control_flow),
           devices_(std::move(devices)),
-          resource_op_device_(std::move(resource_op_device)),
+          resource_op_device_(resource_op_device),
           deadness_predicate_(deadness_predicate),
           is_xla_compile_attr_true_(is_xla_compile_attr_true),
           xla_scope_(std::move(xla_scope)) {
@@ -469,12 +164,14 @@
     }
 
     // The set of devices nodes in the cluster are placed on.
-    const absl::flat_hash_set<string>& devices() const { return devices_; }
+    const DeviceSet& devices() const { return devices_; }
 
     // If the cluster has a resource operation then the device the resource
     // operation is placed on.  A cluster may have resource ops placed only on a
     // single device.
-    const string& resource_op_device() const { return resource_op_device_; }
+    const absl::optional<DeviceId>& resource_op_device() const {
+      return resource_op_device_;
+    }
 
     // If not nullopt the a predicate that is true iff the cluster is alive.
     // Otherwise the user has (unsafely) disabled deadness analysis.  If this is
@@ -498,13 +195,25 @@
       return resource_var_operation_node_ids_;
     }
 
+    // Describes the cluster as "<node-name + N others #id>" for debug logs.
+    string DebugString(const Graph& graph) const {
+      Node* node = graph.FindNodeId(cycles_graph_node_id());
+      if (!node) {
+        // Should never happen, but stay resilient: this is a debugging aid.
+        return absl::StrCat("NULL NODE IN #", cycles_graph_node_id());
+      }
+      // cluster_size() includes `node` itself, so report one fewer "others".
+      return absl::StrCat("<", node->name(), " + ", cluster_size() - 1,
+                          " others #", cycles_graph_node_id(), ">");
+    }
+
    private:
     int cluster_size_ = 1;
     int cycles_graph_node_id_;
     int effective_cluster_size_;
     bool has_functional_control_flow_;
-    absl::flat_hash_set<string> devices_;
-    string resource_op_device_;
+    DeviceSet devices_;
+    absl::optional<DeviceId> resource_op_device_;
     absl::optional<DeadnessPredicate> deadness_predicate_;
     bool is_xla_compile_attr_true_;
     absl::optional<string> xla_scope_;
@@ -520,6 +229,10 @@
   // Initialize some internal data structures.
   Status Initialize();
 
+  // Runs through all the nodes in `cycles_graph_` and tries to create clusters.
+  // Returns true if any new clusters were created.
+  StatusOr<bool> RunEdgeContractionLoopInPostOrderOnce();
+
   // Contracts as many edges as possible to create XLA clusters.  After this
   // finishes the clustering decisions made are implicitly stored in
   // `clusters_`.
@@ -540,18 +253,16 @@
   // true if successful.
   StatusOr<bool> TryToContractEdge(Cluster* from, Cluster* to);
 
-  // Tries to contract each edge from `cluster_from`.  Returns true as soon as a
-  // single edge contraction is successful.  Returns true if no edges were
-  // contracted.
-  StatusOr<bool> TryToContractEdgeFrom(Cluster* cluster_from);
+  // Tries to contract each edge from `cluster_from`.  Returns true if any edges
+  // were contracted, false otherwise.
+  StatusOr<bool> TryToContractEdgesFrom(Cluster* cluster_from);
 
   // Nodes that XLA can compile are put in `compilation_candidates_`.
   Status FindCompilationCandidates();
 
-  bool CompilationDisallowedByXlaCompileAttr(Node* node,
-                                             const DeviceType& jit_device_type);
+  bool CompilationDisallowedByXlaCompileAttr(Node* node);
 
-  // Populates `clusters_` and `worklist_`.
+  // Populates `clusters_`.
   Status BuildInitialClusterSet();
 
   StatusOr<bool> ShouldCompileClusterImpl(const Cluster& cluster);
@@ -559,7 +270,7 @@
   StatusOr<bool> ShouldCompileCluster(const Cluster& cluster);
 
   StatusOr<bool> ClusteringWillIntroduceInterDeviceDependency(
-      const Cluster& to);
+      const Cluster& from, const Cluster& to);
 
   // Returns true if the devices in `cluster_a` and `cluster_b` are compatible
   // and therefore not a hindrance for combining the two clusters into a larger
@@ -572,17 +283,17 @@
 
   Cluster* MakeNewCluster(int cycles_graph_node_id, int effective_cluster_size,
                           bool has_functional_control_flow,
-                          absl::flat_hash_set<string> devices,
-                          string resource_op_device,
+                          const DeviceSet& device_set,
+                          absl::optional<DeviceId> resource_op_device,
                           absl::optional<int> resource_var_operation_node_id,
                           absl::optional<DeadnessPredicate> deadness_predicate,
                           bool is_xla_compile_attr_true,
                           absl::optional<string> xla_scope) {
     cluster_storage_.push_back(absl::make_unique<Cluster>(
         cycles_graph_node_id, effective_cluster_size,
-        has_functional_control_flow, std::move(devices),
-        std::move(resource_op_device), resource_var_operation_node_id,
-        deadness_predicate, is_xla_compile_attr_true, xla_scope));
+        has_functional_control_flow, device_set, resource_op_device,
+        resource_var_operation_node_id, deadness_predicate,
+        is_xla_compile_attr_true, xla_scope));
     return cluster_storage_.back().get();
   }
 
@@ -610,6 +321,32 @@
     return cluster;
   }
 
+  bool LogNotContractableAndReturnFalse(Cluster* from, Cluster* to,
+                                        absl::string_view reason);
+
+  // Finds a path in `cycles_graph_` from `from` to `to` that is not a direct
+  // edge from `from` to `to`.
+  //
+  // Tries to find a path that contains at least one unclusterable node.
+  std::vector<int> FindAlternatePathForDebugging(int from, int to);
+
+  // Returns a string representing `cycles_graph_node_id`.  If the node is
+  // unclusterable (either it is a phantom "frame" node or is not a compilation
+  // candidate) then sets `*found_unclustered` to true.
+  string DebugStringForCyclesGraphNode(int node_id, bool* found_unclustered);
+
+  // We could not contract the edge from `from` to `to`.  Return a string
+  // describing an alternate path from `from` to `to` (besides the direct edge
+  // from `from` to `to`) which would have created a cycle had we contracted the
+  // edge.
+  //
+  // Tries (if possible) to find a path that contains at least one unclusterable
+  // node as it is surprising to the user if we print "A->B could not be
+  // contracted because of the path [P,Q,R]" where P, Q and R are all clusters
+  // since in that case a natural question is why we could not form a {A, P, Q,
+  // R, B} cluster.
+  string DescribePotentialCycle(int from, int to);
+
   // Merge the clusters `cluster_from` and `cluster_to`.  After this step the
   // larger combined cluster is represented by `cluster_from`'s ID in
   // `cycles_graph_`.
@@ -617,7 +354,11 @@
     int from = cluster_from->cycles_graph_node_id();
     int to = cluster_to->cycles_graph_node_id();
 
-    if (!graph_cycles_.ContractEdge(from, to)) {
+    if (!cycles_graph_.ContractEdge(from, to)) {
+      VLOG(3) << "Could not contract " << cluster_from->DebugString(*graph_)
+              << " -> " << cluster_to->DebugString(*graph_)
+              << " because contracting the edge would create a cycle via "
+              << DescribePotentialCycle(from, to) << ".";
       return false;
     }
 
@@ -636,7 +377,7 @@
   Env* env_;
   OptimizerOptions::GlobalJitLevel global_jit_level_;
   absl::flat_hash_map<const Cluster*, bool> should_compile_cluster_cache_;
-  DeviceInfoCache device_info_cache_;
+  jit::DeviceInfoCache device_info_cache_;
 
   bool initialized_ = false;
   bool edges_contracted_ = false;
@@ -644,14 +385,101 @@
 
   std::vector<std::unique_ptr<Cluster>> cluster_storage_;
   std::vector<UnionFind<Cluster*>> cluster_for_node_;
-  std::deque<Cluster*> worklist_;
-  GraphCycles graph_cycles_;
+  GraphCycles cycles_graph_;
   OrderedNodeSet compilation_candidates_;
   std::unique_ptr<DeadnessAnalysis> deadness_analysis_;
   int64 iteration_count_ = 0;
   absl::flat_hash_set<std::pair<int, int>> unsafe_resource_deps_;
 };
 
+std::vector<int> MarkForCompilationPassImpl::FindAlternatePathForDebugging(
+    int from, int to) {
+  std::vector<int> rpo = cycles_graph_.AllNodesInPostOrder();
+  absl::c_reverse(rpo);
+
+  // best_pred_for_node[n] contains a predecessor of `n` that has an
+  // unclusterable node in some path from `from` to itself.
+  // best_pred_for_node[n] is unpopulated for nodes that are not reachable from
+  // `from`.  We build this table up inductively by traversing the cycles graph
+  // in RPO.
+  absl::flat_hash_map<int, int> best_pred_for_node;
+  best_pred_for_node[from] = -1;
+
+  int rpo_index = 0, current_rpo_node;
+  do {
+    current_rpo_node = rpo[rpo_index++];
+    absl::optional<int> some_pred, preferred_pred;
+    for (int pred : cycles_graph_.Predecessors(current_rpo_node)) {
+      if (!best_pred_for_node.contains(pred)) {
+        continue;
+      }
+
+      // Ignore the from->to edge since we're trying to find an alternate path.
+      if (current_rpo_node == to && pred == from) {
+        continue;
+      }
+
+      some_pred = pred;
+      if (GetClusterForCyclesGraphNode(pred) == nullptr) {
+        preferred_pred = pred;
+      }
+    }
+
+    if (some_pred || preferred_pred) {
+      best_pred_for_node[current_rpo_node] =
+          preferred_pred.has_value() ? *preferred_pred : *some_pred;
+    }
+  } while (current_rpo_node != to);
+
+  auto get_best_pred = [&](int n) {
+    auto it = best_pred_for_node.find(n);
+    CHECK(it != best_pred_for_node.end());
+    return it->second;
+  };
+
+  std::vector<int> path;
+  int current_path_node = get_best_pred(to);
+  while (current_path_node != from) {
+    path.push_back(current_path_node);
+    current_path_node = get_best_pred(current_path_node);
+  }
+
+  absl::c_reverse(path);
+  return path;
+}
+
+string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode(
+    int cycles_graph_node_id, bool* found_unclustered) {
+  Cluster* cluster = GetClusterForCyclesGraphNode(cycles_graph_node_id);
+  if (cluster) {
+    return cluster->DebugString(*graph_);
+  }
+
+  *found_unclustered = true;
+  if (cycles_graph_node_id >= graph_->num_node_ids()) {
+    return absl::StrCat("<oob #", cycles_graph_node_id, ">");
+  }
+
+  Node* node = graph_->FindNodeId(cycles_graph_node_id);
+  if (!node) {
+    return absl::StrCat("<bad #", cycles_graph_node_id, ">");
+  }
+
+  return node->name();
+}
+
+string MarkForCompilationPassImpl::DescribePotentialCycle(int from, int to) {
+  // Render each hop; the helper sets the flag for any non-cluster hop.
+  bool found_unclustered = false;
+  std::vector<string> path_str;
+  for (int node_id : FindAlternatePathForDebugging(from, to)) {
+    path_str.push_back(
+        DebugStringForCyclesGraphNode(node_id, &found_unclustered));
+  }
+  absl::string_view prefix = found_unclustered ? "" : "(all clusters) ";
+  return absl::StrCat(prefix, "[", absl::StrJoin(path_str, ","), "]");
+}
+
 void MarkForCompilationPassImpl::Cluster::Merge(Cluster* other) {
   // We keep our own cycles_graph_node_id_ to mirror what GraphCycles does.
 
@@ -662,13 +490,15 @@
   effective_cluster_size_ += other->effective_cluster_size_;
   has_functional_control_flow_ |= other->has_functional_control_flow_;
 
-  for (string other_device : other->devices_) {
-    devices_.insert(other_device);
-  }
-  other->devices_.clear();
+  devices_.UnionWith(other->devices_);
 
-  if (resource_op_device_.empty()) {
-    resource_op_device_ = std::move(other->resource_op_device_);
+  DCHECK(!(resource_op_device_.has_value() &&
+           other->resource_op_device_.has_value()) ||
+         *resource_op_device_ == *other->resource_op_device_)
+      << "AreDevicesCompatible should have returned false otherwise!";
+
+  if (!resource_op_device_.has_value()) {
+    resource_op_device_ = other->resource_op_device_;
   }
 
   is_xla_compile_attr_true_ |= other->is_xla_compile_attr_true_;
@@ -685,8 +515,8 @@
   other->resource_var_operation_node_ids_.clear();
 }
 
-Status IgnoreResourceOpForSafetyAnalysis(DeviceInfoCache* device_info_cache,
-                                         const Node& n, bool* ignore) {
+Status IgnoreResourceOpForSafetyAnalysis(
+    jit::DeviceInfoCache* device_info_cache, const Node& n, bool* ignore) {
   // If a resource operation is assigned to XLA_CPU or XLA_GPU explicitly then
   // ignore it during resource operation safety analysis.  We need this hack
   // because of two reasons:
@@ -735,7 +565,7 @@
   }
 
   TF_ASSIGN_OR_RETURN(bool cycle_detection_graph_ok,
-                      CreateCycleDetectionGraph(graph_, &graph_cycles_));
+                      CreateCycleDetectionGraph(graph_, &cycles_graph_));
   if (!cycle_detection_graph_ok) {
     return Status::OK();
   }
@@ -751,28 +581,55 @@
   return BuildInitialClusterSet();
 }
 
+StatusOr<bool>
+MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
+  bool changed = false;
+  // Iterating over the graph once in post-order is sufficient to produce a
+  // maximal clustering:
+  //
+  // A. We visit a cluster only after maximally clustering all its children.
+  // B. By the time we're done with `node` (in `TryToContractEdgesFrom`) all of
+  //    its children that could have been absorbed into `node` have been
+  //    absorbed.
+  // C. We have an invariant that making a cluster larger does not make edges
+  //    leaving it more contractable. That is, if we have
+  //    digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible
+  //    to contract Y->Z if Y->Z was not contractible originally.
+  for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
+    // We have to check `graph_->FindNodeId(node) == nullptr` because we add all
+    // nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
+    // TF graph may be missing some node ids.
+    if (node >= graph_->num_node_ids() || graph_->FindNodeId(node) == nullptr) {
+      continue;
+    }
+
+    Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
+    if (cluster_from == nullptr) {
+      continue;
+    }
+
+    TF_ASSIGN_OR_RETURN(bool contracted_one_edge,
+                        TryToContractEdgesFrom(cluster_from));
+    changed |= contracted_one_edge;
+  }
+
+  return changed;
+}
+
 Status MarkForCompilationPassImpl::RunEdgeContractionLoop() {
   TF_RET_CHECK(initialized_ && !edges_contracted_ && !clusters_created_);
   edges_contracted_ = true;
 
   // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
   // example, from the Grappler fusion pass).
-  while (!worklist_.empty()) {
-    Cluster* cluster_from = worklist_.front();
-    worklist_.pop_front();
 
-    TF_ASSIGN_OR_RETURN(bool contracted_one_edge,
-                        TryToContractEdgeFrom(cluster_from));
+  TF_ASSIGN_OR_RETURN(bool changed, RunEdgeContractionLoopInPostOrderOnce());
 
-    if (contracted_one_edge) {
-      worklist_.push_back(cluster_from);
-    }
-  }
-
-  VLOG(1) << iteration_count_ << " iterations in inner loop for graph with "
-          << compilation_candidates_.size()
-          << " compilation candidates.  Iterations per compilation candidate: "
-          << ((1.0 * iteration_count_) / compilation_candidates_.size());
+  // Check that RunEdgeContractionLoopInPostOrderOnce is idempotent.  Once the
+  // linear time post-order scheme has been battle tested we can move this to
+  // happen only in debug builds.
+  TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce());
+  TF_RET_CHECK(!changed);
 
   return Status::OK();
 }
@@ -841,7 +698,7 @@
 
 StatusOr<bool>
 MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency(
-    const Cluster& cluster_to) {
+    const Cluster& cluster_from, const Cluster& cluster_to) {
   // If any of the consumer's producers are on a different device, do not
   // cluster these nodes. This prevents other work on this device from being
   // delayed by work on other devices. We consider predecessors of the entire
@@ -853,7 +710,7 @@
   // TODO(b/117085735): We probably want to handle the reciprocal of this case
   // where a cluster is producing data for multiple devices.
   for (const auto& in_id :
-       graph_cycles_.Predecessors(cluster_to.cycles_graph_node_id())) {
+       cycles_graph_.Predecessors(cluster_to.cycles_graph_node_id())) {
     if (in_id >= graph_->num_node_ids()) {
       continue;
     }
@@ -865,6 +722,11 @@
       if (!devices_compatible) {
         return true;
       }
+      TF_ASSIGN_OR_RETURN(devices_compatible,
+                          AreDevicesCompatible(cluster_from, *cluster_in));
+      if (!devices_compatible) {
+        return true;
+      }
     }
   }
 
@@ -928,12 +790,14 @@
           deadness_analysis_->GetPredicateFor(node, Graph::kControlSlot));
     }
 
-    const string& device = !node->assigned_device_name().empty()
-                               ? node->assigned_device_name()
-                               : node->requested_device();
+    const string& device_name_str = !node->assigned_device_name().empty()
+                                        ? node->assigned_device_name()
+                                        : node->requested_device();
+    TF_ASSIGN_OR_RETURN(DeviceId device,
+                        device_info_cache_.GetIdFor(device_name_str));
 
-    bool is_resource_op = HasResourceInput(*node) || HasResourceOutput(*node);
-    string resource_op_device;
+    bool is_resource_op = HasResourceInputOrOutput(*node);
+    absl::optional<DeviceId> resource_op_device;
     if (is_resource_op) {
       resource_op_device = device;
     }
@@ -954,19 +818,17 @@
       is_xla_compile_attr_true |= xla_compile_attr;
     }
 
-    absl::flat_hash_set<string> devices;
-    devices.insert(device);
+    DeviceSet devices;
+    devices.Insert(device);
 
     Cluster* new_cluster = MakeNewCluster(
         /*cycles_graph_node_id=*/node->id(),
         /*effective_cluster_size=*/effective_cluster_size,
-        /*has_functional_control_flow=*/has_functional_control_flow,
-        std::move(devices), std::move(resource_op_device),
-        resource_var_operation_node_id, deadness_predicate,
+        /*has_functional_control_flow=*/has_functional_control_flow, devices,
+        resource_op_device, resource_var_operation_node_id, deadness_predicate,
         /*is_xla_compile_attr_true=*/is_xla_compile_attr_true,
         GetXlaScope(node));
 
-    worklist_.push_back(new_cluster);
     cluster_for_node_[node->id()].Get() = new_cluster;
   }
 
@@ -1018,7 +880,7 @@
     VLOG(4) << "Device type for " << node->name() << ": "
             << device_type.type_string();
 
-    if (CompilationDisallowedByXlaCompileAttr(node, device_type)) {
+    if (CompilationDisallowedByXlaCompileAttr(node)) {
       VLOG(2) << "Not clustering " << node->name()
               << ": disallowed by _XlaCompile attribute";
       continue;
@@ -1104,17 +966,11 @@
 }
 
 bool MarkForCompilationPassImpl::CompilationDisallowedByXlaCompileAttr(
-    Node* node, const DeviceType& device_type) {
+    Node* node) {
   if (debug_options_.ignore_xla_compile_attr) {
     return false;
   }
 
-  const XlaOpRegistry::DeviceRegistration* registration;
-  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
-    VLOG(2) << "Rejecting " << node->name() << ": could not find JIT device.";
-    return false;
-  }
-
   // If there is a _XlaCompile annotation, use its value.
   bool compile = false;
   Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
@@ -1138,11 +994,11 @@
   return false;
 }
 
-// Is 'node' an operator that consumes only the shape of its input, not the
-// data itself?
-bool IsShapeConsumerOp(const Node& node) {
-  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
-         node.type_string() == "Size";
+bool MarkForCompilationPassImpl::LogNotContractableAndReturnFalse(
+    Cluster* from, Cluster* to, absl::string_view reason) {
+  VLOG(3) << "Could not contract " << from->DebugString(*graph_) << " -> "
+          << to->DebugString(*graph_) << " because " << reason << ".";
+  return false;
 }
 
 StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdge(Cluster* from,
@@ -1150,38 +1006,36 @@
   DCHECK(from->deadness_predicate().has_value() ==
          to->deadness_predicate().has_value());
   if (from->deadness_predicate() != to->deadness_predicate()) {
-    return false;
+    return LogNotContractableAndReturnFalse(
+        from, to, "the two nodes have mismatching deadness");
   }
 
   TF_ASSIGN_OR_RETURN(bool devices_compatible,
                       AreDevicesCompatible(*from, *to));
   if (!devices_compatible) {
-    return false;
+    return LogNotContractableAndReturnFalse(
+        from, to, "the two nodes have incompatible devices");
   }
 
   if (from->xla_scope().has_value() && to->xla_scope().has_value() &&
       *from->xla_scope() != *to->xla_scope()) {
-    return false;
-  }
-
-  // Ops that consume shapes cannot be the root of a cluster. This is an
-  // optimization.
-  if (from->cluster_size() == 1 &&
-      IsShapeConsumerOp(*graph_->FindNodeId(from->GetIdOfOnlyNode()))) {
-    return false;
+    return LogNotContractableAndReturnFalse(
+        from, to, "the two nodes have mismatching XLA scopes");
   }
 
   // Don't exceed the maximum cluster size.
   if (from->cluster_size() + to->cluster_size() >
       debug_options_.max_cluster_size) {
-    return false;
+    return LogNotContractableAndReturnFalse(
+        from, to, "the new cluster will be larger than the max cluster size");
   }
 
   TF_ASSIGN_OR_RETURN(bool will_introduce_cross_device_dependency,
-                      ClusteringWillIntroduceInterDeviceDependency(*to));
+                      ClusteringWillIntroduceInterDeviceDependency(*from, *to));
 
   if (will_introduce_cross_device_dependency) {
-    return false;
+    return LogNotContractableAndReturnFalse(
+        from, to, "the new cluster will introduce a cross device dependency");
   }
 
   // Check if contracting this edge will break the resource variable concurrency
@@ -1200,7 +1054,9 @@
       // n^2 pairs of resource variable operations are forbidden.
       if (unsafe_resource_deps_.contains(
               {resource_var_from, resource_var_to})) {
-        return false;
+        return LogNotContractableAndReturnFalse(
+            from, to,
+            "the new cluster would break resource variable semantics");
       }
     }
   }
@@ -1208,10 +1064,19 @@
   return MergeClusters(from, to);
 }
 
-StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgeFrom(
+StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom(
     Cluster* cluster_from) {
-  for (int to :
-       graph_cycles_.Successors(cluster_from->cycles_graph_node_id())) {
+  bool changed = false;
+
+  // Make a copy of the set of successors because we may modify the graph in
+  // TryToContractEdge.
+  std::vector<int32> successors_copy = [&] {
+    absl::Span<const int32> successors =
+        cycles_graph_.Successors(cluster_from->cycles_graph_node_id());
+    return std::vector<int32>(successors.begin(), successors.end());
+  }();
+
+  for (int to : successors_copy) {
     iteration_count_++;
     if (to >= graph_->num_node_ids()) {
       // Node is a fictitious node that is present only in the cycle detection
@@ -1227,12 +1092,10 @@
     TF_ASSIGN_OR_RETURN(bool contracted_edge,
                         TryToContractEdge(cluster_from, cluster_to));
 
-    if (contracted_edge) {
-      return true;
-    }
+    changed |= contracted_edge;
   }
 
-  return false;
+  return changed;
 }
 
 Status MarkForCompilationPassImpl::Run() {
@@ -1412,27 +1275,18 @@
 
 StatusOr<bool> MarkForCompilationPassImpl::AreDevicesCompatible(
     const Cluster& cluster_a, const Cluster& cluster_b) {
-  std::vector<string> devices;
-  absl::c_remove_copy(cluster_a.devices(), std::back_inserter(devices), "");
-  absl::c_remove_copy(cluster_b.devices(), std::back_inserter(devices), "");
-  absl::c_sort(devices);
+  DeviceSet devices = cluster_a.devices();
+  devices.UnionWith(cluster_b.devices());
 
-  if (devices.empty()) {
+  TF_ASSIGN_OR_RETURN(
+      absl::optional<jit::DeviceId> maybe_chosen_device,
+      MaybePickDeviceForXla(device_info_cache_, devices,
+                            /*allow_mixing_unknown_and_cpu=*/false));
+  if (!maybe_chosen_device.has_value()) {
     return false;
   }
 
-  // First check if we will even be able to pick a device for the larger
-  // combined cluster.
-  bool can_pick_device;
-  TF_RETURN_IF_ERROR(CanPickDeviceForXla(
-      devices, /*allow_mixing_unknown_and_cpu=*/false, &can_pick_device));
-  if (!can_pick_device) {
-    return false;
-  }
-
-  string chosen_device;
-  TF_RETURN_IF_ERROR(PickDeviceForXla(
-      devices, /*allow_mixing_unknown_and_cpu=*/false, &chosen_device));
+  jit::DeviceId chosen_device = *maybe_chosen_device;
 
   // If we are able to pick a device `chosen_device` for the larger cluster, the
   // resource operations in `cluster_a` and `cluster_b` must be placed on the
@@ -1440,9 +1294,11 @@
   // _XlaRun kernels are going to run on and therefore try to access the
   // resource variables from `chosen_device`, which will be an error if the
   // resource variables are placed on some other device.
-  auto resource_op_device_ok = [&](const string& resource_op_device) {
-    return resource_op_device.empty() || resource_op_device == chosen_device;
-  };
+  auto resource_op_device_ok =
+      [&](absl::optional<DeviceId> resource_op_device) {
+        return !resource_op_device.has_value() ||
+               *resource_op_device == chosen_device;
+      };
 
   return resource_op_device_ok(cluster_a.resource_op_device()) &&
          resource_op_device_ok(cluster_b.resource_op_device());
@@ -1451,22 +1307,18 @@
 // Returns `true` iff we should compile `cluster`.
 StatusOr<bool> MarkForCompilationPassImpl::ShouldCompileClusterImpl(
     const Cluster& cluster) {
-  std::vector<string> devices;
-  absl::c_remove_copy(cluster.devices(), std::back_inserter(devices), "");
-  absl::c_sort(devices);
+  TF_ASSIGN_OR_RETURN(DeviceId chosen_device,
+                      PickDeviceForXla(device_info_cache_, cluster.devices(),
+                                       /*allow_mixing_unknown_and_cpu=*/false));
 
-  string chosen_device;
-  TF_RETURN_IF_ERROR(PickDeviceForXla(
-      devices, /*allow_mixing_unknown_and_cpu=*/false, &chosen_device));
-
-  TF_ASSIGN_OR_RETURN(const DeviceType& device_type,
-                      device_info_cache_.GetDeviceTypeFor(chosen_device));
-  TF_ASSIGN_OR_RETURN(const XlaOpRegistry::DeviceRegistration* registration,
-                      device_info_cache_.GetCompilationDevice(chosen_device));
+  const DeviceType& device_type =
+      device_info_cache_.GetDeviceTypeFor(chosen_device);
+  const XlaOpRegistry::DeviceRegistration* registration =
+      device_info_cache_.GetCompilationDevice(chosen_device);
   TF_RET_CHECK(registration)
-      << "chosen device = " << chosen_device
+      << "chosen device = " << device_info_cache_.GetNameFor(chosen_device)
       << "; device type = " << device_type.type() << "; devices ("
-      << devices.size() << ") = " << absl::StrJoin(devices, ", ");
+      << device_info_cache_.DebugString(cluster.devices());
 
   bool should_compile =
       cluster.is_xla_compile_attr_true() ||
@@ -1500,7 +1352,8 @@
   }
 
   VLOG(3) << (should_compile ? "Compiling" : "Not compiling")
-          << " cluster with device " << chosen_device;
+          << " cluster with device "
+          << device_info_cache_.GetNameFor(chosen_device);
 
   return should_compile;
 }
@@ -1571,6 +1424,7 @@
   op_filter.allow_control_trigger = true;
   op_filter.allow_eliding_assert_and_checknumerics_ops = true;
   op_filter.allow_ops_producing_or_consuming_variant = true;
+  op_filter.allow_slow_and_inaccurate_ops = true;
 
   return RecursiveCompilabilityChecker{&op_filter, &jit_device_type}
       .IsCompilableCall(ndef, flr);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index fb6fcce..8b14dad 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -270,11 +270,11 @@
   auto clusters = GetClusters(*graph);
 
   EXPECT_EQ(2, clusters.size());
-  EXPECT_FALSE(clusters["B"].empty());
-  EXPECT_EQ(clusters["B"], clusters["C"]);
+  EXPECT_FALSE(clusters["C"].empty());
+  EXPECT_EQ(clusters["C"], clusters["E"]);
   EXPECT_TRUE(clusters.find("A") == clusters.cend());
+  EXPECT_TRUE(clusters.find("B") == clusters.cend());
   EXPECT_TRUE(clusters.find("D") == clusters.cend());
-  EXPECT_TRUE(clusters.find("E") == clusters.cend());
 }
 
 TEST(XlaCompilationTest, CallXlaDeviceFuncWithResourceOp) {
@@ -332,31 +332,6 @@
   EXPECT_NE(clusters["A"], "");
 }
 
-// Metadata-only operators such as Shape/Rank/Size may not be the root of a
-// cluster. This is partially to work around b/26800664, and partially because
-// we should probably prefer to compile metadata operators with their producers
-// wherever possible, rather than their consumers.
-TEST(XlaCompilationTest, MetadataOpsDontStartClusters) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  GraphDef graphdef;
-  {
-    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-    Node* a =
-        ops::SourceOp("UncompilableNullary", builder.opts().WithName("A"));
-    // While all of the following ops are notionally compilable, none is
-    // permitted
-    // to start a cluster. So nothing should be compiled.
-    Node* b = ops::UnaryOp("Shape", a, builder.opts().WithName("B"));
-    Node* c = ops::UnaryOp("Rank", b, builder.opts().WithName("C"));
-    Node* d = ops::UnaryOp("Size", c, builder.opts().WithName("D"));
-    ops::UnaryOp("Shape", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
-  }
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  auto clusters = GetClusters(*graph);
-  EXPECT_EQ(0, clusters.size());  // Nothing should be compiled.
-}
-
 static Status GradForUnaryCwise(FunctionDef* g,
                                 std::vector<FunctionDefHelper::Node> nodes) {
   for (auto& n : nodes) {
@@ -1137,6 +1112,45 @@
   EXPECT_EQ(clusters["B_dev1"], clusters["MatMul1_dev1"]);
 }
 
+TEST(XlaCompilationTest, DontClusterMergingNodesOnCPU) {
+  // This is similar to the 'DontClusterMergingNodes' above, except
+  // MatMulCombined is placed on the CPU.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  absl::string_view xla_gpu_dev0 = "/job:worker/replica:0/task:0/device:GPU:0";
+  absl::string_view xla_gpu_dev1 = "/job:worker/replica:0/task:0/device:GPU:1";
+  absl::string_view xla_cpu_dev0 = "/job:worker/replica:0/task:0/device:CPU:0";
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Output a = ops::Tanh(root.WithOpName("tanh_A_dev0"),
+                       ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2}));
+  Output b = ops::Tanh(root.WithOpName("tanh_B_dev1"),
+                       ops::Const(root.WithOpName("B_dev1"), 1.0f, {2, 2}));
+  Output matmul0 = ops::MatMul(root.WithOpName("MatMul0_dev0"), a, a);
+  Output matmul1 = ops::MatMul(root.WithOpName("MatMul1_dev1"), b, b);
+
+  Output combined =
+      ops::MatMul(root.WithOpName("MatMulCombined_cpu"), matmul0, matmul1);
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  for (Node* n : graph->nodes()) {
+    if (absl::EndsWith(n->name(), /*suffix=*/"cpu")) {
+      n->set_assigned_device_name(string(xla_cpu_dev0));
+    } else if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) {
+      n->set_assigned_device_name(string(xla_gpu_dev0));
+    } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) {
+      n->set_assigned_device_name(string(xla_gpu_dev1));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  // Each of the MatMuls should be in a separate cluster.
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]);
+  EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul0_dev0"]);
+  EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul1_dev1"]);
+  EXPECT_EQ(clusters["A_dev0"], clusters["MatMul0_dev0"]);
+  EXPECT_EQ(clusters["B_dev1"], clusters["MatMul1_dev1"]);
+}
+
 // TODO(b/117085735): This form of clustering should be prevented.
 TEST(XlaCompilationTest, NOT_DontClusterSpreadingNodes) {
   // MatMulSource below creates data for nodes on GPU0 and GPU1 and is placed
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index ffc5d0e..f4873b2 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -14,9 +14,11 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/partially_decluster_pass.h"
+
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/jit/device_util.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -57,7 +59,7 @@
 
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
-        DeviceToDeviceType(n->assigned_device_name(), &device_type));
+        DeviceNameToDeviceType(n->assigned_device_name(), &device_type));
     TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type,
                                           n->def(), &input_mtypes,
                                           &output_mtypes));
@@ -77,8 +79,8 @@
       } else {
         MemoryTypeVector dst_input_mtypes, dst_output_mtypes;
         DeviceType dst_device_type("");
-        TF_RETURN_IF_ERROR(
-            DeviceToDeviceType(dst->assigned_device_name(), &dst_device_type));
+        TF_RETURN_IF_ERROR(DeviceNameToDeviceType(dst->assigned_device_name(),
+                                                  &dst_device_type));
         TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type,
                                               dst->def(), &dst_input_mtypes,
                                               &dst_output_mtypes));
@@ -237,7 +239,7 @@
 Status MustCompileNode(const Node* n, bool* must_compile) {
   DeviceType device_type("");
   TF_RETURN_IF_ERROR(
-      DeviceToDeviceType(n->assigned_device_name(), &device_type));
+      DeviceNameToDeviceType(n->assigned_device_name(), &device_type));
 
   if (IsMustCompileDevice(device_type)) {
     *must_compile = true;
@@ -340,6 +342,46 @@
   return Status::OK();
 }
 }  // namespace reduce_recompilation
+
+namespace decluster_root_shape_consumers {
+// Returns true if `node` is an operator that consumes only the shape of its
+// input, not the data itself.
+bool IsShapeConsumerOp(const Node& node) {
+  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
+         node.type_string() == "Size";
+}
+
+// Declusters every shape consumer whose in-edges all originate outside its
+// own XLA cluster.  Always returns Status::OK().
+Status PartiallyDeclusterGraph(Graph* graph) {
+  std::vector<Node*> reverse_post_order;
+  GetReversePostOrder(*graph, &reverse_post_order,
+                      /*stable_comparator=*/NodeComparatorName(),
+                      /*edge_filter=*/NotBackedge);
+
+  for (Node* n : reverse_post_order) {
+    if (!IsShapeConsumerOp(*n)) {
+      continue;
+    }
+    absl::optional<absl::string_view> cluster = GetXlaClusterForNode(*n);
+    if (!cluster.has_value()) {
+      continue;
+    }
+    auto input_belongs_to_same_cluster = [&](const Edge* e) {
+      return cluster == GetXlaClusterForNode(*e->src());
+    };
+
+    if (absl::c_any_of(n->in_edges(), input_belongs_to_same_cluster)) {
+      continue;
+    }
+
+    VLOG(2) << "Declustering " << n->name()
+            << " because it is a root shape consumer";
+    RemoveFromXlaCluster(n);
+  }
+  return Status::OK();
+}
+}  // namespace decluster_root_shape_consumers
 }  // namespace
 
 Status PartiallyDeclusterPass::Run(
@@ -367,6 +409,9 @@
   TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(
       graph, options.flib_def, options.session_options->env));
 
+  TF_RETURN_IF_ERROR(
+      decluster_root_shape_consumers::PartiallyDeclusterGraph(graph));
+
   return Status::OK();
 }
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 3494d0e..ac1c608 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -467,5 +467,37 @@
   EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer1Name), nullptr);
 }
 
+TEST(PartiallyDeclusterPassTest, MetadataOpsDontStartClusters) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  tensorflow::Scope in_cluster_and = root.WithXlaCluster("cluster_0");
+  // Chain a -> b -> c -> d -> e; all but placeholder `a` are in cluster_0.
+  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
+  Output b = ops::Shape(in_cluster_and.WithOpName("b"), a);
+  Output c = ops::Rank(in_cluster_and.WithOpName("c"), b);
+  Output d = ops::Size(in_cluster_and.WithOpName("d"), c);
+  (void)ops::Shape(in_cluster_and.WithOpName("e"), d);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  // Expect b, c, d and e to no longer carry an XLA cluster.
+  Node* n_b = FindNodeByName(*graph, "b");
+  ASSERT_NE(n_b, nullptr);
+  EXPECT_EQ(GetXlaClusterForNode(*n_b), absl::nullopt);
+
+  Node* n_c = FindNodeByName(*graph, "c");
+  ASSERT_NE(n_c, nullptr);
+  EXPECT_EQ(GetXlaClusterForNode(*n_c), absl::nullopt);
+
+  Node* n_d = FindNodeByName(*graph, "d");
+  ASSERT_NE(n_d, nullptr);
+  EXPECT_EQ(GetXlaClusterForNode(*n_d), absl::nullopt);
+
+  Node* n_e = FindNodeByName(*graph, "e");
+  ASSERT_NE(n_e, nullptr);
+  EXPECT_EQ(GetXlaClusterForNode(*n_e), absl::nullopt);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc
new file mode 100644
index 0000000..09f1099
--- /dev/null
+++ b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc
@@ -0,0 +1,278 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h"
+
+#include "absl/strings/match.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+class RearrangeFunctionArgumentForFunctionTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions session_options;
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        session_options, "/job:localhost/replica:0/task:0", &devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+  }
+  // Calls RearrangeFunctionArgumentForFunction on `func_name` with a CPU FLR.
+  Status RearrangeFunctionArgumentTest(
+      const string &func_name, const string &new_func_name,
+      const protobuf::Map<string, tensorflow::AttrValue> &attrs,
+      FunctionLibraryDefinition *fld, bool *modified) {
+    OptimizerOptions opts;
+    pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, fld, opts,
+        /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+    std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
+    auto flr = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+    return RearrangeFunctionArgumentForFunction(
+        func_name, new_func_name, attrs, fld, flr,
+        &canonicalized_name_to_new_name, modified);
+  }
+
+ private:
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+};
+
+TEST_F(RearrangeFunctionArgumentForFunctionTest, Basic) {
+  FunctionDefLibrary fdl;
+  {
+    // Function for StatefulPartitionedCall's "f", If's
+    // "then_branch"/"else_branch".
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_BOOL)
+    // "ret0" = "arg1"
+    // "ret1" = "arg0"
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1);
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0);
+    auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
+  }
+  {
+    // Function for While's "body".
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_BOOL)
+    // "ret0" = "arg0"
+    // "ret1" = "arg1"
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1);
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg0, 0);
+    auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
+  }
+  {
+    // Function for While's "cond".
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_BOOL)
+    // "ret0" = "arg1"
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1);
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "f3", xla_fdef));
+  }
+  {
+    // Build the XLA computation func.
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_BOOL)
+    // "arg0", "arg1" -> "call" (StatefulPartitionedCall) -> "ret0", "ret1"
+    // "arg0", "arg1" -> "if" (If) -> "ret2", "ret3"
+    // "arg0", "arg1" -> "while" (While) -> "ret4", "ret5"
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1);
+    NameAttrList f;
+    f.set_name("f1");
+    auto call = ops::StatefulPartitionedCall(
+        s.WithOpName("call"), {arg0, arg1},
+        std::vector<DataType>{DT_BOOL, DT_RESOURCE}, f);
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), call.output[0], 0);
+    auto ret1 = ops::_Retval(s.WithOpName("ret1"), call.output[1], 1);
+    auto if_op = ops::If(s.WithOpName("if"), arg1,
+                         std::initializer_list<Input>{arg0, arg1},
+                         {DT_BOOL, DT_RESOURCE}, f, f);
+    auto ret2 = ops::_Retval(s.WithOpName("ret2"), if_op.output[0], 2);
+    auto ret3 = ops::_Retval(s.WithOpName("ret3"), if_op.output[1], 3);
+    NameAttrList cond_fn, body_fn;
+    cond_fn.set_name("f3");
+    body_fn.set_name("f2");
+    auto while_op =
+        ops::While(s.WithOpName("while"),
+                   std::initializer_list<Input>{arg0, arg1}, cond_fn, body_fn);
+    auto ret4 = ops::_Retval(s.WithOpName("ret4"), while_op.output[0], 4);
+    auto ret5 = ops::_Retval(s.WithOpName("ret5"), while_op.output[1], 5);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  bool modified;
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  TF_CHECK_OK(RearrangeFunctionArgumentTest("cluster", "cluster_rewritten",
+                                            attrs, &fld, &modified));
+
+  // Check function f1_rearrange_0, input types should be {DT_BOOL, DT_RESOURCE}
+  // and output types should be {DT_BOOL}.
+  const FunctionDef *f1_rewritten = fld.Find("f1_rearrange_0");
+  ASSERT_NE(f1_rewritten, nullptr);
+  ASSERT_EQ(f1_rewritten->signature().input_arg_size(), 2);
+  EXPECT_EQ(f1_rewritten->signature().input_arg(0).type(), DT_BOOL);
+  EXPECT_EQ(f1_rewritten->signature().input_arg(1).type(), DT_RESOURCE);
+  ASSERT_EQ(f1_rewritten->signature().output_arg_size(), 1);
+  EXPECT_EQ(f1_rewritten->signature().output_arg(0).type(), DT_BOOL);
+
+  // Check node "call" input and output edges.
+  std::unique_ptr<FunctionBody> xla_fbody;
+  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
+                                      AttrSlice(), &fld, &xla_fbody));
+  auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
+  const Node *call_node = node_name_index.at("call");
+  ASSERT_NE(call_node, nullptr);
+  const Node *input_node;
+  TF_CHECK_OK(call_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "arg1");
+  TF_CHECK_OK(call_node->input_node(1, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+  const Node *ret0_node = xla_fbody->ret_nodes[0];
+  TF_CHECK_OK(ret0_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "call");
+  const Node *ret1_node = xla_fbody->ret_nodes[1];
+  TF_CHECK_OK(ret1_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+
+  // Check node "if" input and output edges.
+  const Node *if_node = node_name_index.at("if");
+  ASSERT_NE(if_node, nullptr);
+  TF_CHECK_OK(if_node->input_node(1, &input_node));
+  EXPECT_EQ(input_node->name(), "arg1");
+  TF_CHECK_OK(if_node->input_node(2, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+  const Node *ret2_node = xla_fbody->ret_nodes[2];
+  TF_CHECK_OK(ret2_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "if");
+  const Node *ret3_node = xla_fbody->ret_nodes[3];
+  TF_CHECK_OK(ret3_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+
+  // Check node "while" input and output edges.
+  const Node *while_node = node_name_index.at("while");
+  ASSERT_NE(while_node, nullptr);
+  TF_CHECK_OK(while_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "arg1");
+  TF_CHECK_OK(while_node->input_node(1, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+  const Node *ret4_node = xla_fbody->ret_nodes[4];
+  TF_CHECK_OK(ret4_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "arg0");
+  const Node *ret5_node = xla_fbody->ret_nodes[5];
+  TF_CHECK_OK(ret5_node->input_node(0, &input_node));
+  EXPECT_EQ(input_node->name(), "while");
+}
+
+TEST_F(RearrangeFunctionArgumentForFunctionTest,
+       WhileResourceRetvalFromDifferentArgUnimplemented) {
+  FunctionDefLibrary fdl;
+  {
+    // Function for While's "body".
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_RESOURCE), "arg2" (T=DT_INT32)
+    // "ret0" = "arg1"
+    // "ret1" = "arg0"
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_RESOURCE, 1);
+    Output arg2 = ops::_Arg(s.WithOpName("arg2"), DT_INT32, 2);
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0);
+    auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1);
+    auto ret2 = ops::_Retval(s.WithOpName("ret2"), arg2, 2);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
+  }
+  {
+    // Function for While's "cond".
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_RESOURCE), "arg2" (T=DT_INT32)
+    // "ret0" = true
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_RESOURCE, 1);
+    Output arg2 = ops::_Arg(s.WithOpName("arg2"), DT_INT32, 2);
+    Output cond = ops::Const(s.WithOpName("const"), true, TensorShape({}));
+    auto ret0 = ops::_Retval(s.WithOpName("ret0"), cond, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
+  }
+  {
+    // Build the XLA computation func.
+    // "arg0" (T=DT_RESOURCE), "arg1" (T=DT_RESOURCE), "arg2" (T=DT_INT32)
+    // "arg0", "arg1", "arg2" -> "while" (While)
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_RESOURCE, 0);
+    Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_RESOURCE, 1);
+    Output arg2 = ops::_Arg(s.WithOpName("arg2"), DT_INT32, 2);
+    NameAttrList cond_fn, body_fn;
+    cond_fn.set_name("f1");
+    body_fn.set_name("f2");
+    auto while_op = ops::While(s.WithOpName("while"),
+                               std::initializer_list<Input>{arg0, arg1, arg2},
+                               cond_fn, body_fn);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+  // The While body returns its two resource args swapped; expect rejection.
+  bool modified;
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  Status s = RearrangeFunctionArgumentTest("cluster", "cluster_rewritten",
+                                           attrs, &fld, &modified);
+  EXPECT_EQ(s.code(), error::UNIMPLEMENTED);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index 2dfa34a..b010830 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -84,15 +84,6 @@
 
 }  // namespace
 
-Status DeviceToDeviceType(const string& device, DeviceType* device_type) {
-  DeviceNameUtils::ParsedName parsed;
-  if (!DeviceNameUtils::ParseFullName(device, &parsed)) {
-    return errors::Internal("Malformed assigned device '", device, "'");
-  }
-  *device_type = DeviceType(parsed.type);
-  return Status::OK();
-}
-
 bool HasForwardedRefInput(const Node& node) {
   if (AlwaysForwardsRefInput(node)) {
     for (const Edge* incoming_edge : node.in_edges()) {
@@ -226,108 +217,6 @@
 
 void RemoveFromXlaCluster(Node* node) { node->ClearAttr(kXlaClusterAttr); }
 
-Status PickDeviceForXlaImpl(absl::Span<const string> device_names,
-                            bool allow_mixing_unknown_and_cpu,
-                            bool* out_can_pick_device,
-                            string* out_device_picked) {
-  if (out_can_pick_device) {
-    *out_can_pick_device = true;
-  }
-
-#define FAILED_TO_PICK_DEVICE(failing_status) \
-  do {                                        \
-    if (out_can_pick_device) {                \
-      *out_can_pick_device = false;           \
-      return Status::OK();                    \
-    } else {                                  \
-      return failing_status;                  \
-    }                                         \
-  } while (false)
-
-  TF_RET_CHECK(!device_names.empty()) << "No devices to choose from";
-  DCHECK_NE(out_can_pick_device == nullptr, out_device_picked == nullptr);
-
-  absl::flat_hash_set<absl::string_view> device_names_set;
-  for (absl::string_view device_name : device_names) {
-    if (!device_name.empty()) {
-      device_names_set.insert(device_name);
-    }
-  }
-
-  absl::optional<absl::string_view> maybe_gpu_device;
-  absl::optional<absl::string_view> maybe_cpu_device;
-  absl::optional<absl::string_view> maybe_unknown_device;
-
-  for (absl::string_view device_name : device_names_set) {
-    DeviceNameUtils::ParsedName parsed_name;
-    TF_RET_CHECK(DeviceNameUtils::ParseFullName(device_name, &parsed_name))
-        << device_name;
-    if (parsed_name.type == "GPU") {
-      if (maybe_gpu_device) {
-        FAILED_TO_PICK_DEVICE(errors::Internal(
-            "Multiple GPU devices ", absl::StrJoin(device_names, ", ")));
-      }
-      maybe_gpu_device = device_name;
-    } else if (parsed_name.type == "CPU") {
-      if (maybe_cpu_device) {
-        FAILED_TO_PICK_DEVICE(errors::Internal(
-            "Multiple CPU devices ", absl::StrJoin(device_names, ", ")));
-      }
-      maybe_cpu_device = device_name;
-    } else {
-      if (maybe_unknown_device) {
-        FAILED_TO_PICK_DEVICE(errors::Internal(
-            "Multiple unknown devices ", absl::StrJoin(device_names, ", ")));
-      }
-      maybe_unknown_device = device_name;
-    }
-  }
-
-  if (maybe_unknown_device && maybe_gpu_device) {
-    FAILED_TO_PICK_DEVICE(errors::Internal(
-        "Found both unknown and GPU devices: ", *maybe_unknown_device, ", ",
-        *maybe_gpu_device));
-  }
-
-  if (!allow_mixing_unknown_and_cpu) {
-    if (maybe_unknown_device && maybe_cpu_device) {
-      FAILED_TO_PICK_DEVICE(errors::Internal(
-          "Found both unknown and CPU devices: ", *maybe_unknown_device, ", ",
-          *maybe_cpu_device));
-    }
-  }
-
-  if (out_device_picked) {
-    if (maybe_gpu_device) {
-      *out_device_picked = string(*maybe_gpu_device);
-    } else if (maybe_unknown_device) {
-      *out_device_picked = string(*maybe_unknown_device);
-    } else {
-      *out_device_picked = string(*maybe_cpu_device);
-    }
-  }
-
-  return Status::OK();
-
-#undef FAILED_TO_PICK_DEVICE
-}
-
-Status PickDeviceForXla(absl::Span<const string> device_names,
-                        bool allow_mixing_unknown_and_cpu,
-                        string* out_device_picked) {
-  return PickDeviceForXlaImpl(device_names, allow_mixing_unknown_and_cpu,
-                              /*out_can_pick_device=*/nullptr,
-                              out_device_picked);
-}
-
-Status CanPickDeviceForXla(absl::Span<const string> device_names,
-                           bool allow_mixing_unknown_and_cpu,
-                           bool* out_can_pick_device) {
-  return PickDeviceForXlaImpl(device_names, allow_mixing_unknown_and_cpu,
-                              out_can_pick_device,
-                              /*out_device_picked=*/nullptr);
-}
-
 namespace {
 struct XlaGlobalJitLevel {
   OptimizerOptions::GlobalJitLevel single_gpu;
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index 79cfcd9..50e1e13 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -46,9 +46,6 @@
 
 using OrderedNodeSet = std::set<Node*, NodeComparatorID>;
 
-// Returns the DeviceType corresponding to 'device'.
-Status DeviceToDeviceType(const string& device, DeviceType* device_type);
-
 // Returns true if `node` has a ref tensor input that it forwards to its output.
 bool HasForwardedRefInput(const Node& node);
 
@@ -74,51 +71,6 @@
 // Returns true if `node` has a DT_RESOURCE typed input or output.
 bool HasResourceInputOrOutput(const Node& node);
 
-// Picks the device for which XLA should compile a cluster that contains
-// operations placed in devices in `device_names`.  For instance a cluster that
-// contains operations solely placed on the CPU will be compiled into a CPU
-// executable by XLA, whereas a cluster that contains operations placed on the
-// CPU and also operations placed on the GPU will be compiled into a GPU
-// executable.
-//
-// Returns a non-OK Status if no unambiguous choice of device exists.
-//
-// We choose the device using the following rules:
-//
-//  - It is an error for `device_names` to contain more than one device of the
-//    same type.
-//  - GPU is preferred over CPU.
-//  - If `allow_mixing_unknown_and_cpu` is true then unknown devices are
-//    preferred over CPU.
-//  - XLA devices count as "unrecognized devices".
-//
-// This set of rules above implicitly assume that XLA:GPU can compile all
-// operations in the cluster that XLA:CPU can compile, and if
-// `allow_mixing_unknown_and_cpu` then the unrecognized device can also compile
-// all operations in the cluster that XLA:CPU can compile.
-//
-// We provide the `allow_mixing_unknown_and_cpu` knob so that we can do both of
-// the following things:
-//
-// - Let MarkForCompilationPass not inject CPU-placed operations into clusters
-//   that will run on unknown devices (because the unknown XLA backend may not
-//   support every operation supported by CPU).
-// - Let BuildXlaOpsPass successfully infer a compilation device for a cluster
-//   that contains nodes placed on both the CPU and on unknown devices.  In this
-//   case it is the responsibility of the optimization pass that injected the
-//   CPU nodes into the cluster to ensure that these nodes can be compiled by
-//   the unknown XLA backend.
-Status PickDeviceForXla(absl::Span<const string> device_names,
-                        bool allow_mixing_unknown_and_cpu,
-                        string* out_device_picked);
-
-// This is like `PickDeviceForXla` except that it returns false (instead of a
-// non-OK Status) in `out_can_pick_device` if no unambiguous choice of device
-// exists.
-Status CanPickDeviceForXla(absl::Span<const string> device_names,
-                           bool allow_mixing_unknown_and_cpu,
-                           bool* out_can_pick_device);
-
 // Determines the global jit level based on GraphOptimizationPassOptions,
 // --tf_xla_auto_jit and whether the graph is a single GPU graph.
 OptimizerOptions::GlobalJitLevel GetGlobalJitLevelForGraph(
diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc
index 1013970..571d247 100644
--- a/tensorflow/compiler/jit/xla_cluster_util_test.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc
@@ -91,67 +91,9 @@
   EXPECT_FALSE(ok);
 }
 
-void CheckPickDeviceResult(absl::string_view expected_result,
-                           bool allow_mixing_unknown_and_cpu,
-                           absl::Span<const absl::string_view> inputs) {
-  std::vector<string> inputs_string;
-  absl::c_transform(inputs, std::back_inserter(inputs_string),
-                    [](absl::string_view sv) { return string(sv); });
-  string result;
-  TF_ASSERT_OK(
-      PickDeviceForXla(inputs_string, allow_mixing_unknown_and_cpu, &result))
-      << "inputs = [" << absl::StrJoin(inputs, ", ")
-      << "], allow_mixing_unknown_and_cpu=" << allow_mixing_unknown_and_cpu
-      << ", expected_result=" << expected_result;
-  EXPECT_EQ(result, expected_result);
-}
-
-void CheckPickDeviceHasError(bool allow_mixing_unknown_and_cpu,
-                             absl::Span<const absl::string_view> inputs) {
-  std::vector<string> inputs_string;
-  absl::c_transform(inputs, std::back_inserter(inputs_string),
-                    [](absl::string_view sv) { return string(sv); });
-  string result;
-  EXPECT_FALSE(
-      PickDeviceForXla(inputs_string, allow_mixing_unknown_and_cpu, &result)
-          .ok());
-}
-
 const char* kCPU0 = "/job:localhost/replica:0/task:0/device:CPU:0";
 const char* kGPU0 = "/job:localhost/replica:0/task:0/device:GPU:0";
-const char* kXPU0 = "/job:localhost/replica:0/task:0/device:XPU:0";
-
-const char* kCPU1 = "/job:localhost/replica:0/task:0/device:CPU:1";
 const char* kGPU1 = "/job:localhost/replica:0/task:0/device:GPU:1";
-const char* kXPU1 = "/job:localhost/replica:0/task:0/device:XPU:1";
-
-TEST(PickDeviceForXla, UniqueDevice) {
-  CheckPickDeviceResult(kGPU0, false, {kGPU0, kGPU0});
-}
-
-TEST(PickDeviceForXla, DeviceOrder) {
-  CheckPickDeviceResult(kGPU0, false, {kGPU0, kCPU0});
-  CheckPickDeviceResult(kXPU0, true, {kXPU0, kCPU0});
-}
-
-TEST(PickDeviceForXla, MultipleUnknownDevices) {
-  CheckPickDeviceHasError(false, {kXPU0, kXPU1});
-}
-
-TEST(PickDeviceForXla, GpuAndUnknown) {
-  CheckPickDeviceHasError(false, {kGPU0, kXPU1});
-}
-
-TEST(PickDeviceForXla, UnknownAndCpu) {
-  CheckPickDeviceHasError(false, {kXPU0, kCPU1});
-}
-
-TEST(PickDeviceForXla, MultipleDevicesOfSameType) {
-  CheckPickDeviceHasError(false, {kCPU0, kCPU1});
-  CheckPickDeviceHasError(false, {kGPU0, kGPU1});
-  CheckPickDeviceHasError(false, {kXPU0, kXPU1});
-  CheckPickDeviceHasError(false, {kCPU0, kCPU1, kGPU0});
-}
 
 TEST(IsSingleGpuGraph, ReturnsTrue) {
   Scope root = Scope::NewRootScope().WithAssignedDevice(kGPU0).ExitOnError();
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index 241b75c..19e3793 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -60,6 +60,7 @@
   registration.cluster_control_trigger = true;
   registration.elide_assert_and_checknumerics = true;
   registration.cluster_variant_ops = true;
+  registration.cluster_slow_and_inaccurate_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 5894cbf..a697246 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/jit/xla_device.h"
 
 #include <stdlib.h>
+
 #include <unordered_set>
 
 #include "absl/memory/memory.h"
@@ -47,6 +48,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -380,14 +382,17 @@
                              AsyncOpKernel::DoneCallback done) {
   VLOG(2) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":"
           << op_kernel->type_string();
-  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
-                                   op_kernel->IsExpensive());
+  profiler::TraceMe activity(
+      [&] {
+        return absl::StrCat(op_kernel->name(), ":", op_kernel->type_string());
+      },
+      profiler::GetTFTraceMeLevel(op_kernel->IsExpensive()));
   op_kernel->ComputeAsync(context, done);
 }
 
 Status XlaDevice::Sync() {
   VLOG(1) << "XlaDevice::Sync";
-  tracing::ScopedActivity activity("XlaDevice::Sync", /*is_expensive=*/true);
+  profiler::TraceMe activity("XlaDevice::Sync", profiler::TraceMeLevel::kInfo);
   std::shared_ptr<se::Stream> stream;
   {
     mutex_lock lock(mu_);
@@ -428,13 +433,12 @@
   // that everything enqueued onto the stream (i.e., the device) at this very
   // moment--when ThenEnqueueOnBackgroundThread is called--will have finished.
   // This achieves a device-wide sync.
-  stream->ThenEnqueueOnBackgroundThread(
-      [stream, done](se::StreamExecutor*) {
-        tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
-                                         /*is_expensive=*/true);
-        done(stream->ok() ? Status::OK()
-                          : errors::Internal("XlaDevice::Sync() failed."));
-      });
+  stream->ThenEnqueueOnBackgroundThread([stream, done](se::StreamExecutor*) {
+    profiler::TraceMe activity("XlaDevice::Sync::Callback",
+                               profiler::TraceMeLevel::kInfo);
+    done(stream->ok() ? Status::OK()
+                      : errors::Internal("XlaDevice::Sync() failed."));
+  });
 }
 
 Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
@@ -458,11 +462,13 @@
     Allocator* allocator = GetAllocatorLocked(alloc_attrs);
     Tensor copy(allocator, parsed.dtype(), parsed.shape());
     Notification n;
-    device_context->CopyCPUTensorToDevice(&parsed, this, &copy,
-                                          [&n, &status](const Status& s) {
-                                            status = s;
-                                            n.Notify();
-                                          });
+    device_context->CopyCPUTensorToDevice(
+        &parsed, this, &copy,
+        [&n, &status](const Status& s) {
+          status = s;
+          n.Notify();
+        },
+        true /*sync_dst_compute*/);
     n.WaitForNotification();
     *tensor = copy;
   }
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index b273cbb..ea784e7 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -65,6 +65,9 @@
   tf_stats.peak_bytes_in_use = se_stats->peak_bytes_in_use;
   tf_stats.largest_alloc_size = se_stats->largest_alloc_size;
   tf_stats.bytes_limit = se_stats->bytes_limit;
+  tf_stats.bytes_reserved = se_stats->bytes_reserved;
+  tf_stats.peak_bytes_reserved = se_stats->peak_bytes_reserved;
+  tf_stats.bytes_reservable_limit = se_stats->bytes_reservable_limit;
   return tf_stats;
 }
 
@@ -106,7 +109,8 @@
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
                                              Tensor* device_tensor,
-                                             StatusCallback done) const {
+                                             StatusCallback done,
+                                             bool sync_dst_compute) const {
   if (cpu_tensor->NumElements() == 0) {
     VLOG(2) << "CopyCPUTensorToDevice empty tensor";
     done(Status::OK());
@@ -242,16 +246,25 @@
       cpu_tensor, &literal));
 
   TensorReference ref(*device_tensor);
+  const bool device_allows_sync_on_completion =
+      device->AllowsSyncOnCompletion();
   // Explicitly capture device_to_host_stream to make sure the stream is alive
   // before the transfer finishes.
   transfer_manager_->TransferLiteralFromDevice(
       device_to_host_stream.get(), xla_tensor->shaped_buffer(), literal,
-      [ref, xla_tensor, done, device_to_host_stream](xla::Status status) {
-        done([&]() -> Status {
-          VLOG(2) << "Transfer from device as literal: "
-                  << xla_tensor->shaped_buffer().ToString();
-          return status;
-        }());
+      [ref, xla_tensor, done, device_to_host_stream,
+       device_allows_sync_on_completion](xla::Status status) {
+        Status done_status = status;
+        VLOG(2) << "Transfer from device as literal: "
+                << xla_tensor->shaped_buffer().ToString();
+        // For devices that don't allow sync on completion, the device execution
+        // is deferred. We check the execution stream status here to avoid wrong
+        // results from a failed stream being propagated to following
+        // host-side ops.
+        if (!device_allows_sync_on_completion) {
+          done_status.Update(xla_tensor->RefreshStatusOfStreams());
+        }
+        done(done_status);
         ref.Unref();
       });
 }
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index efbc4bc..3b9c416 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -61,8 +61,8 @@
       thread::ThreadPool* thread_pool);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
-                             Tensor* device_tensor,
-                             StatusCallback done) const override;
+                             Tensor* device_tensor, StatusCallback done,
+                             bool sync_dst_compute) const override;
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              absl::string_view tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 0c66a89..913612f 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -95,6 +95,7 @@
   registration.cluster_control_trigger = true;
   registration.elide_assert_and_checknumerics = true;
   registration.cluster_variant_ops = true;
+  registration.cluster_slow_and_inaccurate_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 6e32096..4252e2e 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -63,6 +63,7 @@
   registration.cluster_control_trigger = true;
   registration.elide_assert_and_checknumerics = true;
   registration.cluster_variant_ops = true;
+  registration.cluster_slow_and_inaccurate_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER,
                                            registration);
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 8ff9cd5..3bb698b 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -34,6 +34,7 @@
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
@@ -132,7 +133,8 @@
       // cluster because we would not handle variable updates correctly.  Any
       // locks we have already acquired will be released when the VariableInfo
       // objects are destroyed.
-      return errors::Internal("Duplicate variable passed to XLA cluster");
+      // TODO(b/128495870) Add support for passing aliased resource variables.
+      return errors::Unimplemented("Duplicate variable passed to XLA cluster");
     }
     VLOG(4) << "Acquiring lock for variable "
             << reinterpret_cast<void*>(variable);
@@ -166,11 +168,11 @@
 }
 
 XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
-    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
+    : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
+xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   AllocationAttributes attrs;
   attrs.no_retry_on_failure = !retry_on_failure;
@@ -182,8 +184,8 @@
           "Out of memory while trying to allocate ", size, " bytes.");
     }
   }
-  return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
-                                 device_ordinal, this);
+  return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
+                                device_ordinal, this);
 }
 
 Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
@@ -192,7 +194,7 @@
 }
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
-    xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
+    xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
     : client_(client),
       xla_allocator_(xla_allocator),
@@ -242,7 +244,8 @@
       CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
       arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
     } else {
-      CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
+      CHECK(xla::Shape::Equal().MinorToMajorOnlyInLayout()(shape,
+                                                           on_device_shape))
           << "On-device shape "
           << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
           << " not the same as on-host shape "
@@ -371,7 +374,7 @@
         } else {
           Tensor output_tensor = XlaTensorBuffer::MakeTensor(
               ctx->expected_output_dtype(i), shape, buffer, allocator);
-          output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+          output.set_buffer(se::OwningDeviceMemory(), {output_num});
           ctx->set_output(i, output_tensor);
         }
         ++output_num;
@@ -432,7 +435,7 @@
       *variable_infos[i].var()->tensor() = output_tensor;
     } else {
       se::DeviceMemoryBase buffer = output.buffer({output_num});
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
       *variable_infos[i].var()->tensor() = output_tensor;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index c915b71..c6a9b93 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -23,14 +23,14 @@
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace tensorflow {
 class XlaAllocator;
@@ -108,11 +108,11 @@
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
 // Assumes that the Tensorflow allocator permits asynchronous deallocation:
 // see comment on `AllowsAsynchronousDeallocation()`.
-class XlaAllocator : public xla::DeviceMemoryAllocator {
+class XlaAllocator : public se::DeviceMemoryAllocator {
  public:
   XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
+  xla::StatusOr<se::OwningDeviceMemory> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure) override;
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
@@ -142,7 +142,7 @@
   // because we track inter-stream dependencies through events inside XlaTensor
   // objects.
   XlaComputationLaunchContext(xla::LocalClient* client,
-                              xla::DeviceMemoryAllocator* xla_allocator,
+                              se::DeviceMemoryAllocator* xla_allocator,
                               bool allocate_xla_tensors,
                               bool use_multiple_streams);
 
@@ -186,7 +186,7 @@
 
  private:
   xla::LocalClient* client_;
-  xla::DeviceMemoryAllocator* xla_allocator_;
+  se::DeviceMemoryAllocator* xla_allocator_;
   bool allocate_xla_tensors_;
   bool use_multiple_streams_;
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index d1f7f75..1c1080f 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -59,7 +59,7 @@
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
@@ -97,6 +97,15 @@
   streams_defined_on_ = {stream};
 }
 
+Status XlaTensor::RefreshStatusOfStreams() {
+  mutex_lock lock(mu_);
+  Status status;
+  for (se::Stream* stream : streams_defined_on_) {
+    status.Update(stream->RefreshStatus());
+  }
+  return status;
+}
+
 // The pointer tag, OR-ed into the XlaTensor's address to distinguish it from
 // device-side tensors, which are either CPU or GPU memory pointers. This works
 // because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits.
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 77e80aa..8a4eb749 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -102,6 +102,10 @@
   void ResetDefinitionEvent(std::shared_ptr<se::Event> event,
                             se::Stream* stream);
 
+  // Refresh the status of streams_defined_on_. Return the first not-OK stream's
+  // status or OK.
+  Status RefreshStatusOfStreams();
+
   // Convert from a raw pointer to an XlaTensor, removing the pointer tag.
   static XlaTensor* FromOpaquePointer(void* ptr);
   // Convert to a raw pointer from an XlaTensor, adding the pointer tag.
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index d7e9870..25756de 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -458,10 +458,6 @@
     name = "extract_image_patches_op_test",
     size = "small",
     srcs = ["extract_image_patches_op_test.py"],
-    tags = [
-        "manual",
-        "notap",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 1c6053a..37a1c9b 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -1044,7 +1044,7 @@
 
   def testBatchMatMulBroadcast(self):
     """Tests broadcasting behavior of BatchMatMul."""
-    with compat.forward_compatibility_horizon(2019, 4, 19):
+    with compat.forward_compatibility_horizon(2019, 4, 26):
       # [2, 3] @ [1, 3, 4] -> [1, 2, 4]
       self._testBinary(
           math_ops.matmul,
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index b7d08df..7a901e1 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -113,12 +113,6 @@
 
   def testDenseLayerJitScopeUndefinedShape(self):
     """Tests that the dense layer node is properly compiled in jit scope.
-
-    Dense layer uses shape op to get shape of input tensor if its shape is not
-    fully defined. XLA does not cluster shape op with other operators. But in
-    experimental_jit_scope, XLA is forced to compile shape op into its own
-    cluster, causing dense layer to be split into TWO XlaCompile/XlaRun op
-    pairs.
     """
 
     with self.cached_session() as sess:
@@ -136,7 +130,7 @@
               trace_level=config_pb2.RunOptions.FULL_TRACE))
 
     labels = GetRunMetadataLabels(run_metadata)
-    self.assertEqual(2, self.countXlaOps(labels))
+    self.assertEqual(1, self.countXlaOps(labels))
     self.assertFalse(InLabels(labels, "MatMult"))
 
 
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 0611d67..0c840d5 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -72,6 +72,30 @@
     for dtype in self._random_types() & self.float_types:
       self._testRngIsNotConstant(rng, dtype)
 
+  def testRandomNormalMean(self):
+    for dtype in self._random_types() & self.float_types:
+      with self.cached_session():
+        with self.test_scope():
+          normal = random_ops.random_normal([1024],
+                                            dtype=dtype,
+                                            mean=1.4,
+                                            stddev=1.2)
+          mean = math_ops.reduce_mean(normal)
+          x = self.evaluate(mean)
+          self.assertAllClose(x, 1.4, rtol=1e-1, atol=1e-1)
+
+  def testRandomNormalVariance(self):
+    for dtype in self._random_types() & self.float_types:
+      with self.cached_session():
+        with self.test_scope():
+          normal = random_ops.random_normal([1024],
+                                            dtype=dtype,
+                                            mean=2.3,
+                                            stddev=2.0)
+          variance = math_ops.reduce_variance(normal)
+          x = self.evaluate(variance)
+          self.assertAllClose(x, 4.0, rtol=1e-1, atol=1e-1)
+
   def testRandomUniformIsInRange(self):
     for dtype in self._random_types():
       # TODO (b/112272078): enable bfloat16 for CPU and GPU when the bug is
diff --git a/tensorflow/compiler/tests/stateful_random_ops_test.py b/tensorflow/compiler/tests/stateful_random_ops_test.py
index 1992a6e..b395f6d 100644
--- a/tensorflow/compiler/tests/stateful_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateful_random_ops_test.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -50,28 +51,33 @@
   return str(name)
 
 
-class StatefulRandomOpsTest(xla_test.XLATestCase):
+ALGS = [random.RNG_ALG_PHILOX, random.RNG_ALG_THREEFRY]
+INTS = [dtypes.int32, dtypes.uint32, dtypes.int64, dtypes.uint64]
+
+
+# TODO(wangpeng): use parametrized tests to test both ThreeFry and Philox
+class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase):
   """Test cases for stateful random-number generator operators."""
 
-  _ints = [dtypes.int32, dtypes.uint32, dtypes.int64, dtypes.uint64]
+  _ints = INTS
   _floats = [dtypes.bfloat16, dtypes.float32]
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testSimple(self):
-    """A simple test.
-    """
+  def testSimple(self, alg):
+    """A simple test."""
     with ops.device(xla_device_name()):
-      gen = random.Generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
+      gen = random.Generator(seed=0, algorithm=alg)
       gen.normal(shape=(3,))
       gen.uniform(shape=(3,), minval=0, maxval=10, dtype=dtypes.uint32)
       gen.uniform_full_int(shape=(3,))
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testDefun(self):
-    """Test for defun.
-    """
+  def testDefun(self, alg):
+    """Test for defun."""
     with ops.device(xla_device_name()):
-      gen = random.Generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
+      gen = random.Generator(seed=0, algorithm=alg)
       @def_function.function
       def f():
         x = gen.normal(shape=(3,))
@@ -80,6 +86,28 @@
         return (x, y, z)
       f()
 
+  def _compareToKnownOutputs(self, counter, key, expect):
+    """Compares against known outputs for specific counter and key inputs."""
+    def uint32s_to_uint64(a, b):
+      return b << 32 | a
+
+    def uint32s_to_uint64s(ls):
+      return [uint32s_to_uint64(ls[2 * i], ls[2 * i + 1])
+              for i in range(len(ls) // 2)]
+
+    ctr_len = len(counter)
+    counter = uint32s_to_uint64s(counter)
+    key = uint32s_to_uint64s(key)
+    state = counter + key
+    random.get_global_generator().reset(state)
+    got = random.get_global_generator().uniform_full_int(
+        shape=(ctr_len,), dtype=dtypes.uint32)
+    self.assertAllEqual(expect, got)
+    random.get_global_generator().reset(state)
+    got = random.get_global_generator().uniform_full_int(
+        shape=(ctr_len // 2,), dtype=dtypes.uint64)
+    self.assertAllEqual(uint32s_to_uint64s(expect), got)
+
   @test_util.run_v2_only
   def testThreefry2x32(self):
     """Tests ThreeFry2x32 conforms to known results.
@@ -89,34 +117,43 @@
     # which is in turn based on
     # https://github.com/DEShawResearch/Random123-Boost/blob/65e3d874b67aa7b3e02d5ad8306462f52d2079c0/libs/random/test/test_threefry.cpp#L30-L32
 
-    def uint32s_to_uint64(a, b):
-      return b << 32 | a
-
-    def verify(counter1, counter2, key1, key2, expect1, expect2):
-      counter = uint32s_to_uint64(counter1, counter2)
-      key = uint32s_to_uint64(key1, key2)
-      random.get_global_generator().reset([counter, key])
-      got = random.get_global_generator().uniform_full_int(
-          shape=(2,), dtype=dtypes.uint32)
-      expect = [expect1, expect2]
-      self.assertAllEqual(expect, got)
-      random.get_global_generator().reset([counter, key])
-      got = random.get_global_generator().uniform_full_int(
-          shape=(), dtype=dtypes.uint64)
-      self.assertAllEqual(uint32s_to_uint64(*expect), got)
-
     with ops.device(xla_device_name()):
       random.reset_global_generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
-      verify(0x00000000, 0x00000000, 0x00000000, 0x00000000,
-             0x6b200159, 0x99ba4efe)
-      verify(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
-             0x1cb996fc, 0xbb002be7)
-      verify(0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
-             0xc4923a9c, 0x483df7a0)
+      self._compareToKnownOutputs(
+          [0x00000000, 0x00000000], [0x00000000, 0x00000000],
+          [0x6b200159, 0x99ba4efe])
+      self._compareToKnownOutputs(
+          [0xffffffff, 0xffffffff], [0xffffffff, 0xffffffff],
+          [0x1cb996fc, 0xbb002be7])
+      self._compareToKnownOutputs(
+          [0x243f6a88, 0x85a308d3], [0x13198a2e, 0x03707344],
+          [0xc4923a9c, 0x483df7a0])
 
   @test_util.run_v2_only
-  def testNewState(self):
-    """Tests that the new state is correct.
+  def testPhilox4x32(self):
+    """Tests Philox4x32 conforms to known results.
+    """
+    # Based on
+    # https://github.com/DEShawResearch/Random123-Boost/blob/65e3d874b67aa7b3e02d5ad8306462f52d2079c0/libs/random/test/test_philox.cpp#L50-L52
+
+    with ops.device(xla_device_name()):
+      random.reset_global_generator(seed=0, algorithm=random.RNG_ALG_PHILOX)
+      self._compareToKnownOutputs(
+          [0x00000000, 0x00000000, 0x00000000, 0x00000000],
+          [0x00000000, 0x00000000],
+          [0x6627e8d5, 0xe169c58d, 0xbc57ac4c, 0x9b00dbd8])
+      self._compareToKnownOutputs(
+          [0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff],
+          [0xffffffff, 0xffffffff],
+          [0x408f276d, 0x41c83b0e, 0xa20bc7c6, 0x6d5451fd])
+      self._compareToKnownOutputs(
+          [0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344],
+          [0xa4093822, 0x299f31d0],
+          [0xd16cfe09, 0x94fdcceb, 0x5001e420, 0x24126ea1])
+
+  @test_util.run_v2_only
+  def testNewStateThreeFry(self):
+    """Tests that the new state is correct (for ThreeFry).
     """
     with ops.device(xla_device_name()):
       counter = 57
@@ -131,6 +168,54 @@
       gen.uniform_full_int(shape=(size,), dtype=dtypes.uint64)
       self.assertAllEqual([counter+size, key], gen.state.read_value())
 
+  @test_util.run_v2_only
+  def testNewStatePhilox(self):
+    """Tests that the new state is correct (for Philox).
+    """
+    with ops.device(xla_device_name()):
+      counter_low = 57
+      counter_high = 283
+      key = 0x1234
+      size = 47
+      seed = [counter_low, counter_high, key]
+      gen = random.Generator(
+          seed=seed, algorithm=random.RNG_ALG_PHILOX)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint32)
+      self.assertAllEqual([counter_low+(size+3)//4, counter_high, key],
+                          gen.state.read_value())
+      gen.reset(seed=seed)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint64)
+      self.assertAllEqual([counter_low+(size+1)//2, counter_high, key],
+                          gen.state.read_value())
+      # Tests that a large counter_low correctly overflows to counter_high
+      counter_low = -1  # same as 0xffffffffffffffff
+      counter_high = 283
+      size = 47
+      seed = [counter_low, counter_high, key]
+      gen = random.Generator(
+          seed=seed, algorithm=random.RNG_ALG_PHILOX)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint32)
+      self.assertAllEqual([(size+3)//4-1, counter_high+1, key],
+                          gen.state.read_value())
+      gen.reset(seed=seed)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint64)
+      self.assertAllEqual([(size+1)//2-1, counter_high+1, key],
+                          gen.state.read_value())
+
+  @parameterized.parameters(INTS)
+  @test_util.run_v2_only
+  def testXLAEqualsCPU(self, dtype):
+    """Tests that XLA and CPU kernels generate the same integers."""
+    seed = 1234
+    shape = [315, 49]
+    with ops.device("/device:CPU:0"):
+      cpu = (random.Generator(seed=seed, algorithm=random.RNG_ALG_PHILOX)
+             .uniform_full_int(shape=shape, dtype=dtype))
+    with ops.device(xla_device_name()):
+      xla = (random.Generator(seed=seed, algorithm=random.RNG_ALG_PHILOX)
+             .uniform_full_int(shape=shape, dtype=dtype))
+    self.assertAllEqual(cpu, xla)
+
   def _testRngIsNotConstant(self, rng, dtype):
     # Tests that 'rng' does not always return the same value.
     # The random-number generator, if working correctly, should produce the
@@ -139,10 +224,11 @@
     y = rng(dtype).numpy()
     self.assertFalse(np.array_equal(x, y))
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testUniformIsNotConstant(self):
+  def testUniformIsNotConstant(self, alg):
     with ops.device(xla_device_name()):
-      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      gen = random.Generator(seed=1234, algorithm=alg)
       def rng(dtype):
         maxval = dtype.max
         # Workaround for b/125364959
@@ -153,45 +239,49 @@
       for dtype in self._ints + self._floats:
         self._testRngIsNotConstant(rng, dtype)
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testNormalIsNotConstant(self):
+  def testNormalIsNotConstant(self, alg):
     with ops.device(xla_device_name()):
-      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      gen = random.Generator(seed=1234, algorithm=alg)
       def rng(dtype):
         return gen.normal(shape=[2], dtype=dtype)
 
       for dtype in self._floats:
         self._testRngIsNotConstant(rng, dtype)
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testUniformIsInRange(self):
+  def testUniformIsInRange(self, alg):
     minval = 2
     maxval = 33
     size = 1000
     with ops.device(xla_device_name()):
       for dtype in self._ints + self._floats:
-        gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+        gen = random.Generator(seed=1234, algorithm=alg)
         x = gen.uniform(
             shape=[size], dtype=dtype, minval=minval, maxval=maxval).numpy()
         self.assertTrue(np.all(x >= minval))
         self.assertTrue(np.all(x <= maxval))
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testNormalIsFinite(self):
+  def testNormalIsFinite(self, alg):
     with ops.device(xla_device_name()):
-      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      gen = random.Generator(seed=1234, algorithm=alg)
       for dtype in self._floats:
         x = gen.normal(shape=[10000], dtype=dtype).numpy()
         self.assertTrue(np.all(np.isfinite(x)))
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testDistributionOfUniform(self):
+  def testDistributionOfUniform(self, alg):
     """Use Pearson's Chi-squared test to test for uniformity."""
     with ops.device(xla_device_name()):
       n = 1000
       seed = 12
       for dtype in self._ints + self._floats:
-        gen = random.Generator(seed=seed, algorithm=random.RNG_ALG_THREEFRY)
+        gen = random.Generator(seed=seed, algorithm=alg)
         maxval = 1
         if dtype.is_integer:
           maxval = 100
@@ -206,13 +296,14 @@
         val = random_test_util.chi_squared(x, 10)
         self.assertLess(val, 16.92)
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testDistributionOfNormal(self):
+  def testDistributionOfNormal(self, alg):
     """Use Anderson-Darling test to test distribution appears normal."""
     with ops.device(xla_device_name()):
       n = 1000
       for dtype in self._floats:
-        gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+        gen = random.Generator(seed=1234, algorithm=alg)
         x = gen.normal(shape=[n], dtype=dtype).numpy()
         # The constant 2.492 is the 5% critical value for the Anderson-Darling
         # test where the mean and variance are known. This test is probabilistic
@@ -220,14 +311,16 @@
         self.assertLess(
             random_test_util.anderson_darling(x.astype(float)), 2.492)
 
+  @parameterized.parameters(ALGS)
   @test_util.run_v2_only
-  def testTruncatedNormal(self):
-    for dtype in self._floats:
-      gen = random.Generator(seed=123)
-      n = 10000000
-      y = gen.truncated_normal(shape=[n], dtype=dtype).numpy()
-      random_test_util.test_truncated_normal(
-          self.assertEqual, self.assertAllClose, dtype, n, y)
+  def testTruncatedNormal(self, alg):
+    with ops.device(xla_device_name()):
+      for dtype in self._floats:
+        gen = random.Generator(seed=123, algorithm=alg)
+        n = 10000000
+        y = gen.truncated_normal(shape=[n], dtype=dtype).numpy()
+        random_test_util.test_truncated_normal(
+            self.assertEqual, self.assertAllClose, dtype, n, y)
 
   @test_util.run_v2_only
   def testErrors(self):
@@ -265,9 +358,15 @@
       var = variables.Variable([0], dtype=dtypes.int64)
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
-          "For the ThreeFry algorithm, the size of state must be at least"):
+          "The size of the state must be at least"):
         gen_stateful_random_ops.stateful_standard_normal_v2(
             var.handle, random.RNG_ALG_THREEFRY, shape)
+      var = variables.Variable([0, 0], dtype=dtypes.int64)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          "The size of the state must be at least"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            var.handle, random.RNG_ALG_PHILOX, shape)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 2b5e8f8..7884c21 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -1969,6 +1969,10 @@
   } else {
     padding = {{0, 0}, {0, 0}};
   }
+
+// TensorRT 5.1 added support for asymmetric padding. Due to a bug in 5.1.2, we
+// can only use asymmetric padding in convolutions with 5.1.3+.
+#if !IS_TRT_VERSION_GE(5, 1, 3, 0)
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // Handle asymmetric padding.
@@ -1981,6 +1985,7 @@
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
   }
+#endif
 
   // Add convolution.
   nvinfer1::ILayer* conv_layer = nullptr;
@@ -1991,7 +1996,23 @@
             biases.GetTrtWeights());
     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
     layer->setStride(stride);
-    layer->setPadding({padding[0].first, padding[1].first});
+// TensorRT 5.1.3 added support for padding modes.
+#if IS_TRT_VERSION_GE(5, 1, 3, 0)
+    if (attrs.get<string>("padding") == "SAME") {
+      VLOG(2) << "Using SAME padding";
+      // SAME_UPPER means that post padding is preferred.
+      layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+    }
+    // For VALID padding, we need to manually set the padding.
+    layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+    layer->setPostPadding(
+        nvinfer1::DimsHW{padding[0].second, padding[1].second});
+    VLOG(2) << "Set pre-padding to: " << DebugString(layer->getPrePadding())
+            << " and post-padding to: " << DebugString(layer->getPostPadding());
+#else
+    layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+    VLOG(2) << "Set padding to: " << DebugString(layer->getPadding());
+#endif
     layer->setName(node_def.name().c_str());
     layer->setNbGroups(num_groups);
     conv_layer = layer;
@@ -2002,7 +2023,20 @@
             biases.GetTrtWeights());
     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
     layer->setStride(stride);
-    layer->setPadding({padding[0].first, padding[1].first});
+#if IS_TRT_VERSION_GE(5, 1, 3, 0)
+    if (attrs.get<string>("padding") == "SAME") {
+      VLOG(2) << "Using SAME padding";
+      layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+    }
+    layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+    layer->setPostPadding(
+        nvinfer1::DimsHW{padding[0].second, padding[1].second});
+    VLOG(2) << "Set pre-padding to: " << DebugString(layer->getPrePadding())
+            << " and post-padding to: " << DebugString(layer->getPostPadding());
+#else
+    layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+    VLOG(2) << "Set padding to: " << DebugString(layer->getPadding());
+#endif
     layer->setName(node_def.name().c_str());
     layer->setNbGroups(num_groups);
     layer->setDilation(dilation);
@@ -2748,6 +2782,8 @@
     padding = {{0, 0}, {0, 0}};
   }
 
+// TensorRT 5.1 added support for asymmetric padding.
+#if !IS_TRT_VERSION_GE(5, 1, 0, 0)
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
@@ -2761,6 +2797,7 @@
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
   }
+#endif
 
   nvinfer1::IPoolingLayer* layer =
       params->converter->network()->addPooling(*tensor, type, ksize);
@@ -2772,7 +2809,21 @@
                                                         layer->getOutput(0));
 
   layer->setStride(stride);
-  layer->setPadding({padding[0].first, padding[1].first});
+// TensorRT 5.1.3 added support for padding modes.
+#if IS_TRT_VERSION_GE(5, 1, 3, 0)
+  if (attrs.get<string>("padding") == "SAME") {
+    // SAME_UPPER means that post padding is preferred.
+    layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+  }
+#endif
+// TensorRT 5.1 has support for asymmetric padding.
+#if IS_TRT_VERSION_GE(5, 1, 0, 0)
+  // If padding mode is not SAME, then these values will be used instead.
+  layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+  layer->setPostPadding(nvinfer1::DimsHW{padding[0].second, padding[1].second});
+#else
+  layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first});
+#endif
   layer->setName(node_def.name().c_str());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
@@ -2784,17 +2835,28 @@
   return Status::OK();
 }
 
-// TODO(tmorris): Use ActivationType::kLEAKY_RELU in TRT 5.1+ once perf
-// improves.
 Status ConvertLeakyRelu(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
   TF_RETURN_IF_ERROR(
       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
-
   TFAttrs attrs(node_def);
   const float alpha = attrs.get<float>("alpha");
+
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+  // Use IActivationLayer when available.
+  if (params->validation_only) return Status::OK();
+
+  nvinfer1::IActivationLayer* layer =
+      params->converter->network()->addActivation(
+          *inputs.at(0).tensor(), nvinfer1::ActivationType::kLEAKY_RELU);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  layer->setAlpha(alpha);
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return Status::OK();
+#else
+  // Use elementwise ops when IActivationLayer is not available.
   if (alpha < 0.0f || alpha > 1.0f) {
     return errors::Unimplemented(
         "Alpha value for LeakyRelu must be between 0 and 1, at ",
@@ -2802,7 +2864,6 @@
   }
   if (params->validation_only) return Status::OK();
 
-  // Input Tensor
   nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   // Create const for alpha.
   nvinfer1::ITensor* const_alpha_tensor = nullptr;
@@ -2825,6 +2886,67 @@
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return Status::OK();
+#endif
+}
+
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+Status ConvertClipByValue(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  // TODO(tmorris): We can also allow the case where min and max are tensors by
+  // using elementwise min and max layers.
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params,
+      {{"t", false}, {"clip_value_min", true}, {"clip_value_max", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+  if (params->validation_only) return Status::OK();
+  // Conversion: ClipByValue maps onto a single kCLIP activation layer.
+  TFAttrs attrs(node_def);
+  const DataType dtype = attrs.get<DataType>("T");
+  float clip_value_min = 0.0f;
+  float clip_value_max = 0.0f;
+  // TODO(tmorris): Add a templated helper function to get scalar weights of
+  // InType casted to OutType.
+  if (dtype == DataType::DT_FLOAT) {
+    clip_value_min = inputs.at(1).weights().GetSpan<float>()[0];
+    clip_value_max = inputs.at(2).weights().GetSpan<float>()[0];
+  } else if (dtype == DataType::DT_HALF) {
+    clip_value_min = Eigen::half_impl::half_to_float(
+        inputs.at(1).weights().GetSpan<Eigen::half>()[0]);
+    clip_value_max = Eigen::half_impl::half_to_float(
+        inputs.at(2).weights().GetSpan<Eigen::half>()[0]);
+  }
+  // For kCLIP, alpha carries the lower bound and beta the upper bound.
+  nvinfer1::IActivationLayer* layer =
+      params->converter->network()->addActivation(
+          *inputs.at(0).tensor(), nvinfer1::ActivationType::kCLIP);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());  // before first use
+  layer->setAlpha(clip_value_min);
+  layer->setBeta(clip_value_max);
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  params->converter->ProvideQuantizationRange(output_tensor, clip_value_min,
+                                              clip_value_max);
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
+#endif
+
+const std::unordered_map<string, nvinfer1::ActivationType>*
+ActivationTypeMap() {
+  static auto* const activation_types =
+      new std::unordered_map<string, nvinfer1::ActivationType>({
+          {"Relu", nvinfer1::ActivationType::kRELU},
+          {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
+          {"Tanh", nvinfer1::ActivationType::kTANH},
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+          {"Elu", nvinfer1::ActivationType::kELU},
+          {"Selu", nvinfer1::ActivationType::kSELU},
+          {"Softsign", nvinfer1::ActivationType::kSOFTSIGN},
+          {"Softplus", nvinfer1::ActivationType::kSOFTPLUS},
+#endif  // IS_TRT_VERSION_GE(5, 1, 2, 0)
+      });
+  return activation_types;
 }
 
 Status ConvertActivation(OpConverterParams* params) {
@@ -2833,29 +2955,39 @@
   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
   TF_RETURN_IF_ERROR(
       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
-  static const std::unordered_map<string, nvinfer1::ActivationType> ops{
-      {"Relu", nvinfer1::ActivationType::kRELU},
-      {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
-      {"Tanh", nvinfer1::ActivationType::kTANH},
-  };
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end()) {
+  auto op_pair = ActivationTypeMap()->find(node_def.op());
+  if (op_pair == ActivationTypeMap()->end()) {
     return errors::Unimplemented("Activation op: ", node_def.op(),
                                  " not supported at: ", node_def.name());
   }
   if (params->validation_only) return Status::OK();
 
   // Start conversion.
-  nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   nvinfer1::IActivationLayer* layer =
-      params->converter->network()->addActivation(*tensor, op_pair->second);
+      params->converter->network()->addActivation(*inputs.at(0).tensor(),
+                                                  op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  // Set parameters.
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+  if (node_def.op() == "Elu") {
+    layer->setAlpha(1.0f);
+  } else if (node_def.op() == "Selu") {
+    // From tensorflow/core/kernels/relu_op_functor.h
+    layer->setAlpha(1.7580993408473768599402175208123f);
+    layer->setBeta(1.0507009873554804934193349852946f);
+  } else if (node_def.op() == "Softplus") {
+    layer->setAlpha(1.0f);
+    layer->setBeta(1.0f);
+  }
+#endif
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // Set quantization range for output of Sigmoid, Tanh.
+  // Set quantization range for output when known.
   if (node_def.op() == "Sigmoid") {
     params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
   } else if (node_def.op() == "Tanh") {
     params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  } else if (node_def.op() == "Softsign") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
   }
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return Status::OK();
@@ -2922,7 +3054,6 @@
   return Status::OK();
 }
 
-// TODO(tmorris): Use ActivationType::kCLIP in TRT 5.1+ once perf improves.
 Status ConvertRelu6(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
@@ -2930,11 +3061,21 @@
   TF_RETURN_IF_ERROR(
       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   if (params->validation_only) return Status::OK();
-  // ***************************************************************************
-  // TensorRT does not implement Relu6 natively. This function converts Relu6 op
-  // to available TensorRT ops: Relu6(x) = min(Relu(x), 6)
-  // ***************************************************************************
 
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+  // Use IActivationLayer for TRT >= 5.1.2; alpha/beta are the clip bounds.
+  nvinfer1::IActivationLayer* layer =
+      params->converter->network()->addActivation(
+          *inputs.at(0).tensor(), nvinfer1::ActivationType::kCLIP);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());  // before first use
+  layer->setAlpha(0.0f);
+  layer->setBeta(6.0f);
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+#else
+  // Convert using min(Relu(x), 6) before TRT 5.1
   // Input Tensor
   nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
@@ -2969,6 +3110,7 @@
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return Status::OK();
+#endif
 }
 
 Status ConvertBiasAdd(OpConverterParams* params) {
@@ -4313,6 +4455,40 @@
   return Status::OK();
 }
 
+Status ConvertSquaredDifference(OpConverterParams* params) {
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+  const auto& x = params->inputs.at(0);
+  const auto& y = params->inputs.at(1);
+  // Bring both operands to a common broadcast shape before combining them.
+  nvinfer1::Dims x_dims, y_dims;
+  TF_RETURN_IF_ERROR(
+      params->converter->GetTrtBroadcastShape(x, y, &x_dims, &y_dims));
+  nvinfer1::ITensor* x_tensor = nullptr;
+  nvinfer1::ITensor* y_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      x, x_dims, params->validation_only, &x_tensor));
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      y, y_dims, params->validation_only, &y_tensor));
+  if (params->validation_only) return Status::OK();
+
+  const auto& node_def = params->node_def;
+  auto* network = params->converter->network();
+  // diff = x - y.
+  nvinfer1::IElementWiseLayer* diff = network->addElementWise(
+      *x_tensor, *y_tensor, nvinfer1::ElementWiseOperation::kSUB);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(diff, node_def.name());
+  // squared = diff * diff, feeding the same tensor to both operands.
+  nvinfer1::ITensor* diff_out = diff->getOutput(0);
+  nvinfer1::IElementWiseLayer* squared = network->addElementWise(
+      *diff_out, *diff_out, nvinfer1::ElementWiseOperation::kPROD);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(squared, node_def.name());
+
+  params->outputs->push_back(TRT_TensorOrWeights(squared->getOutput(0)));
+  return Status::OK();
+}
+
 #if IS_TRT_VERSION_GE(5, 1, 0, 0)
 Status ConvertCombinedNMS(OpConverterParams* params) {
   TF_RETURN_IF_ERROR(
@@ -4485,6 +4661,9 @@
     std::unordered_map<string, OpConverter>* registration) {
   (*registration)["BatchMatMul"] = ConvertBatchMatMul;
   (*registration)["BiasAdd"] = ConvertBiasAdd;
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+  (*registration)["ClipByValue"] = ConvertClipByValue;
+#endif
 #if IS_TRT_VERSION_GE(5, 1, 0, 0)
   (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS;
 #endif
@@ -4496,7 +4675,6 @@
   (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   (*registration)["ExpandDims"] = ConvertExpandDims;
   (*registration)["GatherV2"] = ConvertGather;
-  (*registration)["Identity"] = ConvertIdentity;  // Identity should be removed
   (*registration)["LeakyRelu"] = ConvertLeakyRelu;
   (*registration)["MatMul"] = ConvertMatMul;
   (*registration)["Pack"] = ConvertPack;
@@ -4505,11 +4683,11 @@
   (*registration)["Reshape"] = ConvertReshape;
   (*registration)["Rsqrt"] = ConvertRsqrt;
   (*registration)["Slice"] = ConvertSlice;
-  (*registration)["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
   (*registration)["Softmax"] = ConvertSoftmax;
   (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle;
   (*registration)["Split"] = ConvertSplit;
   (*registration)["Square"] = ConvertSquare;
+  (*registration)["SquaredDifference"] = ConvertSquaredDifference;
   (*registration)["Squeeze"] = ConvertSqueeze;
   (*registration)["StridedSlice"] = ConvertStridedSlice;
   (*registration)["TopKV2"] = ConvertTopK;
@@ -4525,8 +4703,8 @@
        {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum", "Pow"}) {
     (*registration)[binary_op_type] = ConvertBinary;
   }
-  for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
-    (*registration)[activation_op_type] = ConvertActivation;
+  for (auto activation_op_pair : *ActivationTypeMap()) {
+    (*registration)[activation_op_pair.first] = ConvertActivation;
   }
   for (auto pool_op_type : {"AvgPool", "MaxPool"}) {
     (*registration)[pool_op_type] = ConvertPool;
@@ -4543,6 +4721,11 @@
   for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) {
     (*registration)[arg_minmax_type] = ConvertArgMinMax;
   }
+  // The following are no-ops during inference and will not be mapped to any TRT
+  // layer.
+  for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) {
+    (*registration)[identity_op_type] = ConvertIdentity;
+  }
 }
 
 void TrtNodeValidator::RegisterOpValidators() {
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index dc32834..0356c6d 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -594,6 +594,8 @@
 
 // Map of all supported UnaryOperations
 const std::unordered_map<string, nvinfer1::UnaryOperation>* UnaryOperationMap();
+// Map of all supported ActivationTypes
+const std::unordered_map<string, nvinfer1::ActivationType>* ActivationTypeMap();
 
 }  // namespace convert
 }  // namespace tensorrt
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index ec2e9b9..4185e41 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -2481,16 +2481,18 @@
         "The input \"input\" for Relu must be a tensor, at my_act");
   }
 
-  constexpr float kAlpha = 0.2f;
+  constexpr float kLeakyReluAlpha = 0.2f;
+  constexpr float kSeluAlpha = 1.7580993408473768599402175208123f;
+  constexpr float kSeluScale = 1.0507009873554804934193349852946f;
 
   // Get nodedef for activation layer.
   auto get_act_nodedef = [](string op_name) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     if (op_name == "LeakyRelu") {
-      auto act =
-          ops::internal::LeakyRelu(s.WithOpName("my_act"), input,
-                                   ops::internal::LeakyRelu::Alpha(kAlpha));
+      auto act = ops::internal::LeakyRelu(
+          s.WithOpName("my_act"), input,
+          ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha));
       return act.operation.node()->def();
     } else if (op_name == "Relu") {
       auto act = ops::Relu(s.WithOpName("my_act"), input);
@@ -2504,6 +2506,18 @@
     } else if (op_name == "Tanh") {
       auto act = ops::Tanh(s.WithOpName("my_act"), input);
       return act.operation.node()->def();
+    } else if (op_name == "Elu") {
+      auto act = ops::Elu(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Selu") {
+      auto act = ops::Selu(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Softsign") {
+      auto act = ops::Softsign(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Softplus") {
+      auto act = ops::Softplus(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
     }
     EXPECT_TRUE(false);
     return NodeDef();
@@ -2511,7 +2525,7 @@
   // Get expected output for activation layer.
   auto get_act_output = [](string op_name, float input) -> float {
     if (op_name == "LeakyRelu") {
-      return (input > 0.0f) ? input : input * kAlpha;
+      return (input > 0.0f) ? input : input * kLeakyReluAlpha;
     } else if (op_name == "Relu") {
       return (input > 0.0f) ? input : 0.0f;
     } else if (op_name == "Relu6") {
@@ -2520,14 +2534,33 @@
       return 1.0f / (1.0f + std::exp(-input));
     } else if (op_name == "Tanh") {
       return std::tanh(input);
+    } else if (op_name == "Elu") {
+      return (input > 0.0f) ? input : std::exp(input) - 1;
+    } else if (op_name == "Selu") {
+      return (input > 0.0f) ? kSeluScale * input
+                            : kSeluScale * kSeluAlpha * (std::exp(input) - 1);
+    } else if (op_name == "Softsign") {
+      return input / (std::abs(input) + 1);
+    } else if (op_name == "Softplus") {
+      return std::log(std::exp(input) + 1);
     }
     EXPECT_TRUE(false);
     return 0;
   };
 
+  // Get list of ops to test.
+  std::vector<string> ops_to_test;
+  // Add all ops supported by ConvertActivation.
+  auto* map = ActivationTypeMap();
+  ops_to_test.reserve(map->size());
+  for (auto& pair : *map) {
+    ops_to_test.push_back(pair.first);
+  }
+  // Add other activation ops to test.
+  ops_to_test.push_back("Relu6");
+  ops_to_test.push_back("LeakyRelu");
   // Ok.
-  for (const string& op_name :
-       {"LeakyRelu", "Relu", "Relu6", "Sigmoid", "Tanh"}) {
+  for (const string& op_name : ops_to_test) {
     Reset();
     NodeDef node_def = get_act_nodedef(op_name);
     AddTestTensor("input", {1, 2, 3});
@@ -2536,13 +2569,18 @@
     TF_EXPECT_OK(GetTensorOrWeights("my_act", &output));
     ASSERT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    // Certain activations should set quantization range automatically.
+    auto ranges = quantization_ranges();
     if (op_name == "Relu6") {
-      // Relu6 should set quantization range automatically.
-      auto ranges = quantization_ranges();
       EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    } else if (op_name == "Sigmoid" || op_name == "Tanh" ||
+               op_name == "Softsign") {
+      EXPECT_EQ(ranges[output.tensor()], 1.0f);
     }
 
-    const std::vector<float> input = {-100, -2, -1, 0, 1, 100};
+    // std::exp in Softplus will overflow for input > 88
+    const std::vector<float> input = {-100, -2, -1, 0, 1, 88};
     const DataVec input_data{{"input", test::AsTensor<float>(input)}};
     DataVec output_data{{"my_act", ConstructTensor<float>(6)}};
     BuildAndRun(input_data, &output_data);
@@ -4054,7 +4092,7 @@
   // Add other unary ops to test.
   ops_to_test.push_back("Rsqrt");
   // Ok.
-  for (string op_name : ops_to_test) {
+  for (const string& op_name : ops_to_test) {
     Reset();
     NodeDef node_def = get_unary_nodedef(op_name);
     AddTestTensor("input", {1, 2, 3});
@@ -5198,6 +5236,225 @@
   TestConvertSpaceToDepth<DT_INT32>(this);
 }
 
+#if IS_TRT_VERSION_GE(5, 1, 2, 0)
+// Returns the NodeDef of a ClipByValue op "my_clip" fed by three placeholders.
+NodeDef GetClipByValueNodeDef(DataType dtype) {
+  Scope root = Scope::NewRootScope();
+  auto input = ops::Placeholder(root.WithOpName("t"), dtype);
+  auto lower = ops::Placeholder(root.WithOpName("clip_value_min"), dtype);
+  auto upper = ops::Placeholder(root.WithOpName("clip_value_max"), dtype);
+  auto clip_op =
+      ops::ClipByValue(root.WithOpName("my_clip"), input, lower, upper);
+  return clip_op.operation.node()->def();
+}
+
+template <DataType dtype>
+void TestConvertClipByValue(OpConverterTest* test) {
+  using CType = typename EnumToDataType<dtype>::Type;
+
+  struct TestParams {
+    std::vector<int> dims;
+    std::vector<CType> input_value;
+    CType clip_value_min;
+    CType clip_value_max;
+    std::vector<CType> expected_output;
+  };
+
+  const std::vector<CType> common_input = InitTestVector<CType>(6);
+  std::vector<TestParams> params = {
+      {
+          /*dims=*/{1, 2, 3},
+          /*input_value=*/common_input,
+          /*clip_value_min=*/CType(2),
+          /*clip_value_max=*/CType(5),
+          /*expected_output=*/
+          {CType(2), CType(2), CType(2), CType(3), CType(4), CType(5)},
+      },
+      {
+          /*dims=*/{2, 1, 3},
+          /*input_value=*/common_input,
+          /*clip_value_min=*/CType(-1),
+          /*clip_value_max=*/CType(8),
+          /*expected_output=*/common_input,
+      },
+  };
+
+  for (const TestParams& p : params) {
+    test->Reset();
+    // The clip bounds are weights (constants); only "t" is a tensor.
+    NodeDef node_def = GetClipByValueNodeDef(dtype);
+    test->AddTestTensor("t", p.dims, 1, TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("clip_value_min", {1}, {p.clip_value_min});
+    test->AddTestWeights<CType>("clip_value_max", {1}, {p.clip_value_max});
+    test->RunValidationAndConversion(node_def);
+
+    // The converted output must be a tensor with the input's dimensions.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_clip", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(p.dims, output.tensor()->getDimensions());
+
+    // Run the engine in the precision matching dtype and compare values.
+    DataVec input_data{{"t", test::AsTensor<CType>(p.input_value)}};
+    DataVec output_data{
+        {"my_clip", ConstructTensor<CType>(p.expected_output.size())}};
+    test->BuildAndRun(
+        input_data, &output_data,
+        dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32);
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                ElementsAreArray(p.expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertClipByValue) {
+  {
+    // No inputs at all: rejected with INVALID_ARGUMENT (3 are expected).
+    NodeDef node_def = MakeNodeDef("my_clip", "ClipByValue", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "ClipByValue got 0 inputs but expected 3, at my_clip");
+  }
+  {
+    // "t" is registered as a weight rather than a tensor: UNIMPLEMENTED.
+    Reset();
+    NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT);
+    AddTestWeights<float>("t", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<float>("clip_value_min", {1}, {1});
+    AddTestWeights<float>("clip_value_max", {1}, {5});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"t\" for ClipByValue must be a "
+                               "tensor, at my_clip");
+  }
+  {
+    // clip_value_min is a tensor rather than a constant: UNIMPLEMENTED.
+    Reset();
+    NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT);
+    AddTestTensor("t", {1, 2, 3});
+    AddTestTensor("clip_value_min", {1});
+    AddTestWeights<float>("clip_value_max", {1}, {1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"clip_value_min\" for ClipByValue "
+                               "must be a constant, at my_clip");
+  }
+  {
+    // clip_value_max is a tensor rather than a constant: UNIMPLEMENTED.
+    Reset();
+    NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT);
+    AddTestTensor("t", {1, 2, 3});
+    AddTestWeights<float>("clip_value_min", {1}, {1});
+    AddTestTensor("clip_value_max", {1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"clip_value_max\" for ClipByValue "
+                               "must be a constant, at my_clip");
+  }
+
+  TestConvertClipByValue<DT_FLOAT>(this);
+  TestConvertClipByValue<DT_HALF>(this);
+}
+#endif  // IS_TRT_VERSION_GE(5, 1, 2, 0)
+
+// Returns the NodeDef of a SquaredDifference op named "my_squared_diff".
+NodeDef GetSquaredDifferenceNodeDef(DataType dtype) {
+  Scope root = Scope::NewRootScope();
+  auto lhs = ops::Placeholder(root.WithOpName("x"), dtype);
+  auto rhs = ops::Placeholder(root.WithOpName("y"), dtype);
+  auto op =
+      ops::SquaredDifference(root.WithOpName("my_squared_diff"), lhs, rhs);
+  return op.operation.node()->def();
+}
+
+template <DataType dtype>
+void TestConvertSquaredDifference(OpConverterTest* test) {
+  using CType = typename EnumToDataType<dtype>::Type;
+
+  struct TestParams {
+    std::vector<int> dims_x;
+    std::vector<int> dims_y;
+    std::vector<CType> value_x;
+    std::vector<CType> value_y;
+    std::vector<int> expected_output_dims;
+    std::vector<CType> expected_output;
+  };
+
+  const std::vector<CType> common_input = InitTestVector<CType>(6);
+  std::vector<TestParams> params = {
+      {
+          /*dims_x=*/{1, 2, 3},
+          /*dims_y=*/{1, 2, 3},
+          /*value_x=*/common_input,
+          /*value_y=*/CastTestVector<int, CType>({0, -1, 3, 0, 10, -7}),
+          /*expected_output_dims=*/{1, 2, 3},
+          /*expected_output=*/CastTestVector<int, CType>({0, 4, 1, 9, 36, 144}),
+      },
+      {
+          /*dims_x=*/{1, 2, 3},
+          /*dims_y=*/{1, 1, 3},
+          /*value_x=*/common_input,
+          /*value_y=*/CastTestVector<int, CType>({0, 1, 2}),
+          /*expected_output_dims=*/{1, 2, 3},
+          /*expected_output=*/CastTestVector<int, CType>({0, 0, 0, 9, 9, 9}),
+      },
+  };
+
+  for (const TestParams& p : params) {
+    test->Reset();
+    // Both operands are tensors; their shapes may differ between cases.
+    NodeDef node_def = GetSquaredDifferenceNodeDef(dtype);
+    test->AddTestTensor("x", p.dims_x, 1, TfDataTypeToTrt(dtype));
+    test->AddTestTensor("y", p.dims_y, 1, TfDataTypeToTrt(dtype));
+    test->RunValidationAndConversion(node_def);
+
+    // The converted output must be a tensor with the expected dimensions.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(p.expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    DataVec input_data{{"x", test::AsTensor<CType>(p.value_x)},
+                       {"y", test::AsTensor<CType>(p.value_y)}};
+    DataVec output_data{
+        {"my_squared_diff", ConstructTensor<CType>(p.expected_output.size())}};
+    test->BuildAndRun(
+        input_data, &output_data,
+        dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32);
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                ElementsAreArray(p.expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSquaredDifference) {
+  {
+    // No inputs at all: rejected with INVALID_ARGUMENT (2 are expected).
+    NodeDef node_def = MakeNodeDef("my_squared_diff", "SquaredDifference", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "SquaredDifference got 0 inputs but expected 2, at my_squared_diff");
+  }
+  {
+    // "x" is registered as a weight rather than a tensor: UNIMPLEMENTED.
+    Reset();
+    NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT);
+    AddTestWeights<float>("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestTensor("y", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"x\" for SquaredDifference must be "
+                               "a tensor, at my_squared_diff");
+  }
+  {
+    // Dims {2, 3} vs {7, 5} cannot be broadcast together: INVALID_ARGUMENT.
+    Reset();
+    NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT);
+    AddTestTensor("x", {2, 3});
+    AddTestTensor("y", {7, 5});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Infeasible broadcast scheme");
+  }
+
+  TestConvertSquaredDifference<DT_FLOAT>(this);
+  TestConvertSquaredDifference<DT_HALF>(this);
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index 247d4f0..3b7c586 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -259,7 +259,6 @@
   }
   auto lib = ctx->function_library();
   FunctionLibraryRuntime::Options opts;
-  opts.step_id = ctx->step_id();
   opts.rendezvous = ctx->rendezvous();
   opts.cancellation_manager = ctx->cancellation_manager();
   opts.runner = ctx->runner();
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
index cce4f52..4147a38 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
@@ -34,6 +34,13 @@
 
 class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
  public:
+  // TODO(b/131313301): Delete this when IPluginFactory is fixed upstream.
+  // IPluginFactory defines virtual methods and no virtual destructor. To avoid
+  // a non-virtual-dtor error, we need to add a virtual destructor here. Do not
+  // use a pointer to IPluginFactory because deleting through such a pointer
+  // results in undefined behavior.
+  virtual ~PluginFactoryTensorRT() {}
+
   // TODO(aaroey): this static method has to be inlined to make the singleton a
   // unique global symbol. Find a way to fix it.
   static PluginFactoryTensorRT* GetInstance() {
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
index 378fb1c..0ac0367 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -101,17 +101,14 @@
   }
 
   // Creates n free positions in cache
-  Status DiscardOld(size_t n = 0) {
-    if (n > capacity_) {
-      return errors::Internal("Insufficient capacity in cache (capacity = ",
-                              capacity_, ", requested ", n, ")");
-    }
+  void DiscardOld(size_t n = 0) {
+    DCHECK(capacity_ >= n) << "Insufficient capacity in cache (capacity = "
+                           << capacity_ << ", requested " << n << ")";
     while (objects_.size() > (capacity_ - n)) {
       key_type discard_key = keys_.back();
       keys_.pop_back();
       objects_.erase(discard_key);
     }
-    return Status::OK();
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index b20af4e..c8f7214 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -17,7 +17,10 @@
 package_group(
     name = "friends",
     includes = [":internal"],
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/brain/tools/tf_replay/...",
+        "//tensorflow/...",
+    ],
 )
 
 package(
@@ -286,6 +289,7 @@
     name = "tf2xla_util",
     srcs = ["tf2xla_util.cc"],
     hdrs = ["tf2xla_util.h"],
+    visibility = [":friends"],
     deps = [
         ":sharding_util",
         ":tf2xla_proto",
@@ -507,6 +511,39 @@
 )
 
 cc_library(
+    name = "rearrange_function_argument_pass",
+    srcs = [
+        "rearrange_function_argument_pass.cc",
+    ],
+    hdrs = [
+        "rearrange_function_argument_pass.h",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla:tf2xla_util",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+cc_library(
+    name = "rearrange_function_argument_pass_registration",
+    srcs = [
+        "rearrange_function_argument_pass_registration.cc",
+    ],
+    deps = [
+        ":rearrange_function_argument_pass",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
     name = "functionalize_control_flow_pass_registration",
     srcs = [
         "functionalize_control_flow_pass_registration.cc",
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index 9784985..6c6b6cd 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -1010,13 +1010,14 @@
           ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
       auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
       auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+      auto retval3 = ops::_Retval(scope.WithOpName("_retval3_RetVal"), arg3, 3);
 
       GraphDef expected;
       TF_EXPECT_OK(scope.ToGraphDef(&expected));
 
       EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
                 result.arg_types);
-      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
                 result.ret_types);
       TF_EXPECT_GRAPH_EQ(expected, result.gdef);
     }
@@ -1083,6 +1084,7 @@
       auto retval1 =
           ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
       auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+      auto retval3 = ops::_Retval(scope.WithOpName("_retval3_RetVal"), arg3, 3);
 
       GraphDef expected;
       TF_EXPECT_OK(scope.ToGraphDef(&expected));
@@ -1093,7 +1095,7 @@
 
       EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
                 result.arg_types);
-      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
                 result.ret_types);
       TF_EXPECT_GRAPH_EQ(expected, result.gdef);
     }
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc
index 517924b..fbab280 100644
--- a/tensorflow/compiler/tf2xla/functionalize_while.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_while.cc
@@ -200,33 +200,28 @@
     arg_types->push_back(dtype);
 
     TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, dtype, i));
-
-    if (dtype == DT_RESOURCE) {
-      // The convention of the XLA bridge is that resource variable arguments
-      // are only inputs to the loop body and have no corresponding output.
-      // TODO(b/37741920): change the convention so that DT_RESOURCE variables
-      // are both inputs and outputs, and then remove this case.
-      TF_RET_CHECK(arg.is_loop_invariant);
+    TF_ASSIGN_OR_RETURN(Node * retval_node, BuildRetvalNode(output, dtype, i));
+    if (arg.is_loop_invariant) {
+      // Argument is loop-invariant. Forward it from the Arg to the Retval.
       node_map[arg.enter->id()] = arg_node;
+      output->AddEdge(arg_node, 0, retval_node, 0);
     } else {
-      TF_ASSIGN_OR_RETURN(Node * retval_node,
-                          BuildRetvalNode(output, dtype, i));
-
-      if (arg.is_loop_invariant) {
-        // Argument is loop-invariant. Forward it from the Arg to the Retval.
-        node_map[arg.enter->id()] = arg_node;
-        output->AddEdge(arg_node, 0, retval_node, 0);
-      } else {
-        // Argument is loop-varying.
-        node_map[arg.switch_node->id()] = arg_node;
-        // The Switch node has two outputs, but _Arg only has one. This tells
-        // the CopySubgraph function to rewrite the output number of edges from
-        // the _Arg node to be 0 rather than copying the output number from the
-        // Switch node.
-        squash_src_outputs[arg.switch_node->id()] = true;
-        node_map[arg.next_iteration->id()] = retval_node;
-        next_iterations.push_back(arg.next_iteration);
+      // Argument is loop-varying.
+      if (dtype == DT_RESOURCE) {
+        // DT_RESOURCE arguments should always be loop-invariant in the graphs
+        // generated from TF.
+        return errors::Unimplemented("Loop-varying DT_RESOURCE Enter node ",
+                                     arg.enter->name(), " is currently not",
+                                     " supported.");
       }
+      node_map[arg.switch_node->id()] = arg_node;
+      // The Switch node has two outputs, but _Arg only has one. This tells
+      // the CopySubgraph function to rewrite the output number of edges from
+      // the _Arg node to be 0 rather than copying the output number from the
+      // Switch node.
+      squash_src_outputs[arg.switch_node->id()] = true;
+      node_map[arg.next_iteration->id()] = retval_node;
+      next_iterations.push_back(arg.next_iteration);
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 00abae8..fcc1ea2 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -145,6 +145,7 @@
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:comparators",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:loops",
         "//tensorflow/compiler/xla/client/lib:math",
@@ -253,6 +254,7 @@
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -365,7 +367,7 @@
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
     ],
@@ -378,7 +380,7 @@
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
     ],
diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
index 6fa1126..f34b2ff 100644
--- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -36,19 +36,36 @@
     xla::XlaOp sum;
     switch (kind) {
       case XlaExpression::Kind::kTensorList: {
+        // Check that all TensorLists are initialized.
+        for (int i = 1; i < ctx->num_inputs(); ++i) {
+          xla::XlaOp list = ctx->Input(i);
+          bool is_initialized;
+          OP_REQUIRES_OK(ctx, IsTensorListInitialized(list, &is_initialized));
+          OP_REQUIRES(
+              ctx, is_initialized,
+              errors::InvalidArgument("TensorList input #", i,
+                                      " for AddN op is an uninitialized list"));
+        }
+        // Nested TensorList is not supported.
+        bool is_nested_list;
+        OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested_list));
+        OP_REQUIRES(ctx, !is_nested_list,
+                    errors::Unimplemented(
+                        "Nested TensorList is not supported for AddN op"));
+
         OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(0), &sum));
-        TensorShape sum_shape;
+        xla::Shape sum_shape;
         OP_REQUIRES_OK(ctx,
                        GetTensorListBufferShape(ctx->Input(0), &sum_shape));
         for (int i = 1; i < ctx->num_inputs(); ++i) {
           xla::XlaOp operand;
           OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(i), &operand));
           // Check that the shapes match.
-          TensorShape operand_shape;
+          xla::Shape operand_shape;
           OP_REQUIRES_OK(
               ctx, GetTensorListBufferShape(ctx->Input(i), &operand_shape));
           OP_REQUIRES(
-              ctx, sum_shape.dim_sizes() == operand_shape.dim_sizes(),
+              ctx, sum_shape.dimensions() == operand_shape.dimensions(),
               errors::InvalidArgument(
                   "TensorList arguments to AddN must all have the same ",
                   "shape.\n", "Expected: ", sum_shape.DebugString(), "\n",
@@ -57,7 +74,7 @@
         }
         xla::XlaOp push_index;
         OP_REQUIRES_OK(ctx, GetTensorListPushIndex(ctx->Input(0), &push_index));
-        OP_REQUIRES_OK(ctx, BuildTensorList(sum, push_index, &sum));
+        OP_REQUIRES_OK(ctx, BuildNonNestedTensorList(sum, push_index, &sum));
         ctx->SetTensorListOutput(0, sum);
         break;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 01d5945..f60509b 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -30,11 +30,8 @@
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result =
-        xla::BatchDot(MaybeTransposeInMinorDims(
-                          MaybeConjugate(ctx->Input(0), adj_x_), adj_x_),
-                      MaybeTransposeInMinorDims(
-                          MaybeConjugate(ctx->Input(1), adj_y_), adj_y_));
+    auto result = xla::BatchDot(MaybeConjugate(ctx->Input(0), adj_x_), adj_x_,
+                                MaybeConjugate(ctx->Input(1), adj_y_), adj_y_);
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index e1c3352..a64ce55 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -144,7 +144,8 @@
 class StatelessCategoricalOp : public CategoricalOp {
  public:
   explicit StatelessCategoricalOp(OpKernelConstruction* ctx)
-      : CategoricalOp(ctx) {
+      : CategoricalOp(ctx),
+        device_type_string_(ctx->device_type().type_string()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
   }
 
@@ -160,7 +161,7 @@
     // * log(-log(0)) is ∞.
     // * log(-log(1)) is -∞.
     xla::XlaOp uniforms = StatelessRngUniform(
-        seed, uniform_shape,
+        device_type_string_, seed, uniform_shape,
         xla::MinPositiveNormalValue(builder, uniform_shape.element_type()),
         xla::One(builder, uniform_shape.element_type()));
     return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
@@ -176,6 +177,7 @@
 
  private:
   DataType dtype_;
+  string device_type_string_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp);
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index 29687c7..d801d56 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -17,7 +17,9 @@
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
@@ -99,23 +101,22 @@
     // The following code is equivalent to:
     // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD])
     int64 kernel_size = 1;
-    std::vector<int64> lhs_shape(num_dims, 1);
+    std::vector<int64> kernel_shape(num_dims, 1);
     for (int i = 0; i < num_spatial_dims; ++i) {
       int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
-      lhs_shape[i] = ksizes_[input_dim];
+      kernel_shape[i] = ksizes_[input_dim];
       kernel_size *= ksizes_[input_dim];
     }
-    lhs_shape[num_spatial_dims] = depth;
-    lhs_shape[num_spatial_dims + 1] = 1;
-
-    // Builds an identity matrix as a broadcast equality of iotas.
-    // iota = np.arange(np.prod(ksize), depth)
-    // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32)
-    xla::XlaOp iota = xla::Iota(builder, xla::S32, kernel_size * depth);
-
-    auto lhs = xla::Reshape(iota, lhs_shape);
-    auto filter = xla::ConvertElementType(
-        xla::Eq(lhs, iota, {num_spatial_dims + 1}), type);
+    kernel_shape[num_spatial_dims] = 1;
+    kernel_shape[num_spatial_dims + 1] = kernel_size * depth;
+    xla::Shape iota_kernel_shape =
+        xla::ShapeUtil::MakeShape(xla::S32, {kernel_size, depth, kernel_size});
+    xla::XlaOp filter =
+        xla::Reshape(xla::ConvertElementType(
+                         xla::Eq(xla::Iota(builder, iota_kernel_shape, 0),
+                                 xla::Iota(builder, iota_kernel_shape, 2)),
+                         type),
+                     kernel_shape);
 
     xla::ConvolutionDimensionNumbers dims;
     std::vector<int64> window_strides(num_spatial_dims);
@@ -148,7 +149,7 @@
 
     xla::XlaOp conv =
         xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding,
-                                lhs_dilation, rhs_dilation, dims);
+                                lhs_dilation, rhs_dilation, dims, depth);
     ctx->SetOutput(0, conv);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index 92b20fe..dcd523e 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -20,11 +20,13 @@
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
 
@@ -427,7 +429,8 @@
                 errors::InvalidArgument("XLA compilation requires number of "
                                         "boxes to be <= kint32max, got ",
                                         num_boxes));
-
+    xla::PrimitiveType boxes_xla_type = context->InputXlaType("boxes");
+    xla::PrimitiveType scores_xla_type = context->InputXlaType("scores");
     const xla::XlaOp boxes_input = context->Input("boxes");
     const xla::XlaOp scores_input = context->Input("scores");
     int64 output_size;
@@ -445,15 +448,18 @@
     // Choose a more convenient layout.
     const xla::XlaOp boxes = xla::Transpose(boxes_input, {1, 0});
     const xla::XlaOp boxes_sorted = xla::GetTupleElement(
-        xla::Sort(/*keys=*/-xla::Broadcast(scores_input, {4}),
-                  /*values=*/{boxes},
+        xla::Sort({xla::Broadcast(scores_input, {4}), boxes},
+                  xla::CreateScalarGtComputation(
+                      {scores_xla_type, boxes_xla_type}, builder),
                   /*dimension=*/1),
         1);
     // Track the mapping of indices into sorted domain.
     const xla::XlaOp iota_indices = xla::Iota(builder, xla::S32, num_boxes);
-    const xla::XlaOp indices_sort = xla::Sort(-scores_input, {iota_indices});
+    const xla::XlaOp indices_sort = xla::Sort(
+        {scores_input, iota_indices},
+        xla::CreateScalarGtComputation({scores_xla_type, xla::S32}, builder));
     const xla::XlaOp indices_sorted = xla::GetTupleElement(indices_sort, 1);
-    const xla::XlaOp scores = xla::Neg(xla::GetTupleElement(indices_sort, 0));
+    const xla::XlaOp scores = xla::GetTupleElement(indices_sort, 0);
 
     // Shapes are henceforth [1, num_boxes]. 'c_y0' denotes 'coordinate' y0.
     const xla::XlaOp c_y0 = xla::Reshape(xla::SliceInDim(boxes_sorted,
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 39d96e7..19ec222 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -16,7 +16,7 @@
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/macros.h"
@@ -46,4 +46,4 @@
   tensorflow::argmax_float_1d_xla_impl(out, data);
 }
 
-REGISTER_CUSTOM_CALL_TARGET(argmax_float_1d_xla_impl);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(argmax_float_1d_xla_impl);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
index 9b83392..6e1c122 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
@@ -16,7 +16,7 @@
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/macros.h"
@@ -51,4 +51,4 @@
   tensorflow::argmax_float_2d_xla_impl(out, data);
 }
 
-REGISTER_CUSTOM_CALL_TARGET(argmax_float_2d_xla_impl);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(argmax_float_2d_xla_impl);
diff --git a/tensorflow/compiler/tf2xla/kernels/permute_op.cc b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
index 71920bf..94db561 100644
--- a/tensorflow/compiler/tf2xla/kernels/permute_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -77,7 +78,10 @@
     if (input_rank == 2) {
       keys = xla::BroadcastInDim(keys, {4, 2}, {0});
     }
-    auto sorted = xla::Sort(keys, {ctx->Input(0)}, 0);
+    auto sorted = xla::Sort({keys, ctx->Input(0)},
+                            xla::CreateScalarLtComputation(
+                                {xla::S32, ctx->input_xla_type(0)}, builder),
+                            0);
     auto output = xla::GetTupleElement(sorted, 1);
     ctx->SetOutput(0, output);
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 8716484..507bc8d 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -15,6 +15,7 @@
 
 // XLA specific pooling ops.
 
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -327,6 +328,20 @@
     xla::Padding xla_padding =
         (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
 
+    // Create a MaxPool operation to check the expected resulting shape, and
+    // then throw away the operation because we don't actually need it here.
+    TensorShape expected_out_shape;
+    auto pooling =
+        xla::MaxPool(ctx->Input(0), ksize_, stride_, xla_padding,
+                     XlaTensorFormat(data_format_, tensor_in_shape.dims() - 2));
+    auto status_or_shape = pooling.builder()->GetShape(pooling);
+    OP_REQUIRES_OK(ctx, status_or_shape.status());
+    OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(status_or_shape.ValueOrDie(),
+                                              &expected_out_shape));
+    OP_REQUIRES(ctx, expected_out_shape == out_backprop_shape,
+                errors::Unimplemented("The output dimensions do not match the "
+                                      "other input values."));
+
     xla::PrimitiveType element_type;
     OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(2), &element_type));
     xla::XlaOp init_value = XlaHelpers::Zero(ctx->builder(), input_type(2));
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index d6c70d4..0b54c88 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -25,6 +25,7 @@
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -135,7 +136,9 @@
       xla::XlaOp curr = input;
       for (int i = 0; i < rounds; ++i) {
         xla::XlaOp keys = xla::RngUniform(zero, max_value, key_shape);
-        xla::XlaOp sorted = xla::Sort(keys, {curr});
+        xla::XlaOp sorted = xla::Sort(
+            {keys, curr}, xla::CreateScalarLtComputation(
+                              {xla::U32, ctx->input_xla_type(0)}, builder));
         curr = xla::GetTupleElement(sorted, 1);
       }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h
index c7b1fca..9a6dc37 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h
@@ -27,7 +27,8 @@
 // `bit_generator` and converted to the requested data type and range. This
 // routine requires 2 32-bit integer seeds and currently only supports 'shape's
 // of type F32, S32 and S64.
-xla::XlaOp StatelessRngUniform(xla::XlaOp seeds, const xla::Shape& shape,
+xla::XlaOp StatelessRngUniform(absl::string_view device_type_string,
+                               xla::XlaOp seeds, const xla::Shape& shape,
                                xla::XlaOp minval, xla::XlaOp maxval);
 
 // Converts to bfloat16 if `dtype` equals DT_BFLOAT16, no-op otherwise.
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index bc1ef61..265e7e7 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -26,6 +26,7 @@
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
@@ -227,21 +228,30 @@
   void Compile(XlaOpKernelContext* ctx) override {
     if (IsTensorListInput(ctx, 0)) {
       // Input is a TensorList.
-      // TODO(b/124707753): support nested TensorList.
-      xla::XlaOp tensor_list = ctx->Input(0);
-      TensorShape shape;
-      OP_REQUIRES_OK(ctx, GetTensorListBufferShape(tensor_list, &shape));
-      xla::PrimitiveType type;
-      OP_REQUIRES_OK(ctx, GetTensorListPrimitiveType(tensor_list, &type));
-      xla::XlaOp buffer;
-      OP_REQUIRES_OK(ctx, CreateZerosList(ctx, shape, type, &buffer));
+
+      // Check the TensorList input is initialized.
+      xla::XlaOp list = ctx->Input(0);
+      bool is_initialized;
+      OP_REQUIRES_OK(ctx, IsTensorListInitialized(list, &is_initialized));
+      OP_REQUIRES(
+          ctx, is_initialized,
+          errors::InvalidArgument(
+              "TensorList input for ZerosLike op is an uninitialized list"));
+
+      auto list_shape_or = ctx->builder()->GetShape(list);
+      OP_REQUIRES_OK(ctx, list_shape_or.status());
+      xla::XlaOp new_list;
+      OP_REQUIRES_OK(
+          ctx, CreateZerosTensorListWithShape(
+                   ctx->builder(), list_shape_or.ValueOrDie(), &new_list));
 
       xla::XlaOp push_index;
-      OP_REQUIRES_OK(ctx, GetTensorListPushIndex(tensor_list, &push_index));
+      OP_REQUIRES_OK(ctx, GetTensorListPushIndex(list, &push_index));
 
-      xla::XlaOp output_list;
-      OP_REQUIRES_OK(ctx, BuildTensorList(buffer, push_index, &output_list));
-      ctx->SetTensorListOutput(0, output_list);
+      xla::XlaOp result;
+      OP_REQUIRES_OK(ctx,
+                     SetTensorListPushIndex(new_list, push_index, &result));
+      ctx->SetTensorListOutput(0, result);
     } else {
       const TensorShape input_shape = ctx->InputShape(0);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
index 6cfdf4a..8cfd985 100644
--- a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
@@ -15,6 +15,7 @@
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
@@ -25,7 +26,10 @@
   explicit XlaSortOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
 
   void Compile(XlaOpKernelContext* context) override {
-    context->SetOutput(0, xla::Sort(context->Input("input")));
+    context->SetOutput(0, xla::Sort({context->Input("input")},
+                                    xla::CreateScalarLtComputation(
+                                        {context->InputXlaType("input")},
+                                        context->builder())));
   }
 };
 
@@ -37,8 +41,11 @@
       : XlaOpKernel(context) {}
 
   void Compile(XlaOpKernelContext* context) override {
-    xla::XlaOp result =
-        xla::Sort(context->Input("keys"), {context->Input("values")});
+    xla::XlaOp result = xla::Sort(
+        {context->Input("keys"), context->Input("values")},
+        xla::CreateScalarLtComputation(
+            {context->InputXlaType("keys"), context->InputXlaType("values")},
+            context->builder()));
     context->SetOutput(0, xla::GetTupleElement(result, 0));
     context->SetOutput(1, xla::GetTupleElement(result, 1));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
index cd9a6ee..7e210f5 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
@@ -13,6 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+
 #include <cmath>
 
 #include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h"
@@ -29,26 +31,35 @@
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/stateful_random_ops.h"
 #include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 namespace {
 
-xla::RngOutput StatefulRngUniform(xla::XlaOp key, xla::XlaOp initial_state,
+xla::BitGeneratorTy BitGenerator(Algorithm alg) {
+  if (alg == RNG_ALG_PHILOX) {
+    return [](xla::XlaOp key, xla::XlaOp state, const xla::Shape& shape) {
+      return xla::PhiloxBitGenerator(key, state, shape, /*scramble=*/false);
+    };
+  }
+  return xla::ThreeFryBitGenerator;
+}
+
+xla::RngOutput StatefulRngUniform(Algorithm alg, xla::XlaOp key,
+                                  xla::XlaOp initial_state,
                                   const xla::Shape& shape, xla::XlaOp minval,
                                   xla::XlaOp maxval) {
   xla::PrimitiveType type = shape.element_type();
   switch (type) {
     case xla::F32:
-      return xla::UniformF32Distribution(
-          key, initial_state, xla::ThreeFryBitGenerator, minval, maxval, shape);
+      return xla::UniformF32Distribution(key, initial_state, BitGenerator(alg),
+                                         minval, maxval, shape);
     case xla::U32:
     case xla::S32:
     case xla::U64:
     case xla::S64:
-      return UniformIntDistribution(
-          key, initial_state, xla::ThreeFryBitGenerator, minval, maxval, shape);
+      return UniformIntDistribution(key, initial_state, BitGenerator(alg),
+                                    minval, maxval, shape);
     default:
       return {key.builder()->ReportError(xla::Unimplemented(
                   "Types other than F32, U32, S32, U64 and S64 "
@@ -59,11 +70,11 @@
   }
 }
 
-xla::RngOutput StatefulRngUniformFullInt(xla::XlaOp key,
+xla::RngOutput StatefulRngUniformFullInt(Algorithm alg, xla::XlaOp key,
                                          xla::XlaOp initial_state,
                                          const xla::Shape& shape) {
   xla::PrimitiveType type = shape.element_type();
-  xla::RngOutput output = xla::ThreeFryBitGenerator(key, initial_state, shape);
+  xla::RngOutput output = BitGenerator(alg)(key, initial_state, shape);
   switch (type) {
     case xla::U32:
     case xla::U64:
@@ -82,33 +93,68 @@
   }
 }
 
-template <typename ListB, typename ListA, typename F>
-ListB Map(F f, ListA const& list_a) {
-  ListB list_b;
-  for (auto a : list_a) {
-    list_b.push_back(f(a));
-  }
-  return list_b;
-}
-
-xla::XlaOp ConcatScalars(xla::XlaBuilder* builder,
-                         absl::Span<const xla::XlaOp> scalars) {
-  return ConcatInDim(
-      builder,
-      Map<std::vector<xla::XlaOp>>(
-          [](xla::XlaOp x) { return xla::Reshape(x, {1}); }, scalars),
-      0);
-}
-
 using SamplerReturnType = xla::StatusOr<xla::RngOutput>;
 
+int64 GetMinStateSize(Algorithm alg) {
+  if (alg == RNG_ALG_PHILOX) {
+    return PHILOX_MIN_STATE_SIZE;
+  }
+  return THREEFRY_MIN_STATE_SIZE;
+}
+
+Status CheckStateShape(Algorithm alg, const TensorShape& shape) {
+  if (shape.dims() != 1) {
+    return errors::InvalidArgument(
+        "RNG state must have one and only one dimension, not ", shape.dims());
+  }
+  auto state_size = shape.dim_size(0);
+  auto min_state_size = GetMinStateSize(alg);
+  if (state_size < min_state_size) {
+    return errors::InvalidArgument("The size of the state must be at least ",
+                                   min_state_size, "; got ", state_size);
+  }
+  return Status::OK();
+}
+
+std::pair<xla::XlaOp, xla::XlaOp> StateAndKeyFromVariable(Algorithm alg,
+                                                          xla::XlaOp var) {
+  if (alg == RNG_ALG_THREEFRY) {
+    static constexpr int kStateSize = 1;
+    auto state = BitcastConvertType(
+        xla::Reshape(xla::Slice(var, {0}, {kStateSize}, {1}), {}), xla::U64);
+    auto key = BitcastConvertType(
+        xla::Reshape(xla::Slice(var, {kStateSize}, {kStateSize + 1}, {1}), {}),
+        xla::U64);
+    return std::make_pair(state, key);
+  } else {
+    static constexpr int kStateSize = 2;
+    auto state =
+        BitcastConvertType(xla::Slice(var, {0}, {kStateSize}, {1}), xla::U64);
+    auto key = xla::Reshape(
+        BitcastConvertType(xla::Slice(var, {kStateSize}, {kStateSize + 1}, {1}),
+                           xla::U64),
+        {});
+    return std::make_pair(state, key);
+  }
+}
+
+xla::XlaOp StateAndKeyToVariable(Algorithm alg, xla::XlaOp state,
+                                 xla::XlaOp key) {
+  auto builder = state.builder();
+  if (alg == RNG_ALG_THREEFRY) {
+    return ConcatScalars(builder, {state, key});
+  } else {
+    return ConcatInDim(builder, {state, xla::Reshape(key, {1})}, 0);
+  }
+}
+
 // A helper function containing the common part of several kernels below.
 // Precondition: 'algorithm' and 'shape' are compile-time constants.
 Status CompileImpl(
     XlaOpKernelContext* ctx, int state_input_idx, int alg_input_idx,
     int shape_input_idx,
-    std::function<SamplerReturnType(xla::XlaOp, xla::XlaOp, TensorShape)> const&
-        sampler) {
+    std::function<SamplerReturnType(Algorithm, xla::XlaOp, xla::XlaOp,
+                                    TensorShape)> const& sampler) {
   auto alg_shape = ctx->InputShape(alg_input_idx);
   if (alg_shape.dims() != 0) {
     return errors::InvalidArgument("algorithm must be of shape [], not ",
@@ -117,53 +163,35 @@
   xla::Literal alg_literal;
   TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal));
   auto alg = alg_literal.Get<Algorithm>({});
-
-  if (alg == RNG_ALG_THREEFRY) {
-    xla::XlaOp var;
-    TensorShape var_shape;
-    TF_RETURN_IF_ERROR(ctx->ReadVariableInput(
-        state_input_idx, STATE_ELEMENT_DTYPE, &var_shape, &var));
-    if (var_shape.dims() != 1) {
-      return errors::InvalidArgument(
-          "RNG state must have one and only one dimension, not ",
-          var_shape.dims());
-    }
-    auto state_size = var_shape.dim_size(0);
-    if (state_size < THREEFRY_MIN_STATE_SIZE) {
-      return errors::InvalidArgument(
-          "For the ThreeFry algorithm, the size of state"
-          " must be at least ",
-          THREEFRY_MIN_STATE_SIZE, "; got ", state_size);
-    }
-    TensorShape shape;
-    TF_RETURN_IF_ERROR(ctx->ConstantInputAsShape(shape_input_idx, &shape));
-
-    static constexpr int kStateSize = 1;
-    auto state = BitcastConvertType(
-        xla::Reshape(xla::Slice(var, {0}, {kStateSize}, {1}), {}), xla::U64);
-    auto key = BitcastConvertType(
-        xla::Reshape(xla::Slice(var, {kStateSize}, {kStateSize + 1}, {1}), {}),
-        xla::U64);
-
-    auto status_or_value = sampler(state, key, shape);
-    if (!status_or_value.ok()) {
-      return status_or_value.status();
-    }
-    xla::RngOutput value_state = status_or_value.ConsumeValueOrDie();
-    state = value_state.state;
-    ctx->SetOutput(0, value_state.value);
-    xla::XlaBuilder* builder = ctx->builder();
-    var = ConcatScalars(builder, {state, key});
-    xla::PrimitiveType state_element_type;
-    TF_RETURN_IF_ERROR(
-        DataTypeToPrimitiveType(STATE_ELEMENT_DTYPE, &state_element_type));
-    var = BitcastConvertType(var, state_element_type);
-    TF_RETURN_IF_ERROR(
-        ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var));
-    return Status::OK();
-  } else {
+  if (!(alg == RNG_ALG_THREEFRY || alg == RNG_ALG_PHILOX)) {
     return errors::InvalidArgument("Unsupported algorithm id: ", alg);
   }
+
+  xla::XlaOp var;
+  TensorShape var_shape;
+  TF_RETURN_IF_ERROR(ctx->ReadVariableInput(
+      state_input_idx, STATE_ELEMENT_DTYPE, &var_shape, &var));
+  TF_RETURN_IF_ERROR(CheckStateShape(alg, var_shape));
+  TensorShape shape;
+  TF_RETURN_IF_ERROR(ctx->ConstantInputAsShape(shape_input_idx, &shape));
+  xla::XlaOp state;
+  xla::XlaOp key;
+  std::tie(state, key) = StateAndKeyFromVariable(alg, var);
+  auto status_or_value = sampler(alg, state, key, shape);
+  if (!status_or_value.ok()) {
+    return status_or_value.status();
+  }
+  xla::RngOutput value_state = status_or_value.ConsumeValueOrDie();
+  state = value_state.state;
+  ctx->SetOutput(0, value_state.value);
+  var = StateAndKeyToVariable(alg, state, key);
+  xla::PrimitiveType state_element_type;
+  TF_RETURN_IF_ERROR(
+      DataTypeToPrimitiveType(STATE_ELEMENT_DTYPE, &state_element_type));
+  var = BitcastConvertType(var, state_element_type);
+  TF_RETURN_IF_ERROR(
+      ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var));
+  return Status::OK();
 }
 
 class StatefulUniformOp : public XlaOpKernel {
@@ -174,12 +202,13 @@
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* builder = ctx->builder();
-    auto sampler = [builder, this](xla::XlaOp state, xla::XlaOp key,
+    auto sampler = [builder, this](Algorithm alg, xla::XlaOp state,
+                                   xla::XlaOp key,
                                    TensorShape shape) -> SamplerReturnType {
       xla::Shape xla_shape;
       TF_RETURN_IF_ERROR(TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
       xla::RngOutput uniform_state = StatefulRngUniform(
-          key, state, xla_shape, xla::ConstantR0<float>(builder, 0.0),
+          alg, key, state, xla_shape, xla::ConstantR0<float>(builder, 0.0),
           xla::ConstantR0<float>(builder, 1.0));
       xla::XlaOp uniform = uniform_state.value;
       state = uniform_state.state;
@@ -215,12 +244,12 @@
   void Compile(XlaOpKernelContext* ctx) override {
     auto sampler =
         // Needs explicit lambda return type because it fails to be inferred.
-        [this](xla::XlaOp state, xla::XlaOp key,
+        [this](Algorithm alg, xla::XlaOp state, xla::XlaOp key,
                TensorShape shape) -> SamplerReturnType {
       xla::Shape xla_shape;
       TF_RETURN_IF_ERROR(TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
-      xla::RngOutput value_state = xla::NormalF32Distribution(
-          key, state, xla::ThreeFryBitGenerator, xla_shape);
+      xla::RngOutput value_state =
+          xla::NormalF32Distribution(key, state, BitGenerator(alg), xla_shape);
       xla::XlaOp normal = MaybeConvertF32ToBF16(value_state.value, dtype_);
       return {{normal, value_state.state}};
     };
@@ -254,13 +283,13 @@
     xla::XlaBuilder* builder = ctx->builder();
     auto sampler =
         // Needs explicit lambda return type because it fails to be inferred.
-        [builder, this](xla::XlaOp state, xla::XlaOp key,
+        [builder, this](Algorithm alg, xla::XlaOp state, xla::XlaOp key,
                         TensorShape shape) -> SamplerReturnType {
       xla::Shape xla_shape;
       TF_RETURN_IF_ERROR(TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
 
       xla::RngOutput uniform_result = StatefulRngUniform(
-          key, state, xla_shape,
+          alg, key, state, xla_shape,
           xla::MinPositiveNormalValue(builder, xla_shape.element_type()),
           xla::One(builder, xla_shape.element_type()));
       xla::XlaOp uniform = uniform_result.value;
@@ -297,12 +326,12 @@
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaOp minval = ctx->Input(3);
     xla::XlaOp maxval = ctx->Input(4);
-    auto sample_with_threefry = [minval, maxval, this](
-                                    xla::XlaOp state, xla::XlaOp key,
-                                    TensorShape shape) -> SamplerReturnType {
+    auto sample_with_threefry =
+        [minval, maxval, this](Algorithm alg, xla::XlaOp state, xla::XlaOp key,
+                               TensorShape shape) -> SamplerReturnType {
       xla::Shape xla_shape;
       TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype_, shape, &xla_shape));
-      return StatefulRngUniform(key, state, xla_shape, minval, maxval);
+      return StatefulRngUniform(alg, key, state, xla_shape, minval, maxval);
     };
     OP_REQUIRES_OK(ctx,
                    CompileImpl(ctx, /*state_input_idx=*/0, /*alg_input_idx=*/1,
@@ -330,11 +359,12 @@
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto sample_with_threefry = [this](xla::XlaOp state, xla::XlaOp key,
+    auto sample_with_threefry = [this](Algorithm alg, xla::XlaOp state,
+                                       xla::XlaOp key,
                                        TensorShape shape) -> SamplerReturnType {
       xla::Shape xla_shape;
       TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype_, shape, &xla_shape));
-      return StatefulRngUniformFullInt(key, state, xla_shape);
+      return StatefulRngUniformFullInt(alg, key, state, xla_shape);
     };
     OP_REQUIRES_OK(ctx,
                    CompileImpl(ctx, /*state_input_idx=*/0, /*alg_input_idx=*/1,
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index ea6a260..648181e 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -33,6 +33,24 @@
 
 namespace tensorflow {
 
+namespace {
+
+xla::BitGeneratorTy GetBitGeneratorForDevice(
+    absl::string_view device_type_string) {
+  // The Philox algorithm may cause performance regression on other devices.
+  // Turn on the Philox algorithm for the CPU and GPU backends only.
+  if (device_type_string == DEVICE_GPU_XLA_JIT ||
+      device_type_string == DEVICE_CPU_XLA_JIT) {
+    return [](xla::XlaOp key, xla::XlaOp state, const xla::Shape& shape) {
+      return xla::PhiloxBitGenerator(key, state, shape, /*scramble=*/true);
+    };
+  }
+
+  return xla::ThreeFryBitGenerator;
+}
+
+}  // namespace
+
 xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) {
   if (dtype == DT_BFLOAT16) {
     xla::XlaBuilder* builder = input.builder();
@@ -45,7 +63,8 @@
   }
 }
 
-xla::XlaOp StatelessRngUniform(xla::XlaOp seeds, const xla::Shape& shape,
+xla::XlaOp StatelessRngUniform(absl::string_view device_type_string,
+                               xla::XlaOp seeds, const xla::Shape& shape,
                                xla::XlaOp minval, xla::XlaOp maxval) {
   xla::XlaBuilder* builder = seeds.builder();
 
@@ -58,15 +77,17 @@
   xla::PrimitiveType type = shape.element_type();
   switch (type) {
     case xla::F32:
-      return xla::UniformF32Distribution(key, initial_state,
-                                         xla::ThreeFryBitGenerator, minval,
-                                         maxval, shape)
+      return xla::UniformF32Distribution(
+                 key, initial_state,
+                 GetBitGeneratorForDevice(device_type_string), minval, maxval,
+                 shape)
           .value;
     case xla::S32:  // fall through
     case xla::S64:
-      return UniformIntDistribution(key, initial_state,
-                                    xla::ThreeFryBitGenerator, minval, maxval,
-                                    shape)
+      return UniformIntDistribution(
+                 key, initial_state,
+                 GetBitGeneratorForDevice(device_type_string), minval, maxval,
+                 shape)
           .value;
       break;
     default:
@@ -82,7 +103,8 @@
 class StatelessRandomUniformOp : public XlaOpKernel {
  public:
   explicit StatelessRandomUniformOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx),
+        device_type_string_(ctx->device_type().type_string()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
   }
 
@@ -100,15 +122,17 @@
 
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
-    xla::XlaOp uniform = StatelessRngUniform(
-        seed, xla_shape, xla::ConstantR0<float>(builder, 0.0),
-        xla::ConstantR0<float>(builder, 1.0));
+    xla::XlaOp uniform =
+        StatelessRngUniform(device_type_string_, seed, xla_shape,
+                            xla::ConstantR0<float>(builder, 0.0),
+                            xla::ConstantR0<float>(builder, 1.0));
     uniform = MaybeConvertF32ToBF16(uniform, dtype_);
     ctx->SetOutput(0, uniform);
   }
 
  private:
   DataType dtype_;
+  string device_type_string_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
 };
@@ -123,7 +147,8 @@
 class StatelessRandomUniformIntOp : public XlaOpKernel {
  public:
   explicit StatelessRandomUniformIntOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx),
+        device_type_string_(ctx->device_type().type_string()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
   }
 
@@ -150,13 +175,15 @@
 
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape));
-    xla::XlaOp uniform = StatelessRngUniform(seed, xla_shape, minval, maxval);
+    xla::XlaOp uniform = StatelessRngUniform(device_type_string_, seed,
+                                             xla_shape, minval, maxval);
 
     ctx->SetOutput(0, uniform);
   }
 
  private:
   DataType dtype_;
+  string device_type_string_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp);
 };
@@ -171,7 +198,8 @@
 class StatelessRandomNormalOp : public XlaOpKernel {
  public:
   explicit StatelessRandomNormalOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx),
+        device_type_string_(ctx->device_type().type_string()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
   }
 
@@ -195,8 +223,9 @@
                      ShiftLeft(ConvertElementType(seed1, xla::U64),
                                ConstantR0WithType(builder, xla::U64, 32));
     xla::XlaOp normal =
-        xla::NormalF32Distribution(key, initial_state,
-                                   xla::ThreeFryBitGenerator, xla_shape)
+        xla::NormalF32Distribution(
+            key, initial_state, GetBitGeneratorForDevice(device_type_string_),
+            xla_shape)
             .value;
     normal = MaybeConvertF32ToBF16(normal, dtype_);
     ctx->SetOutput(0, normal);
@@ -204,6 +233,7 @@
 
  private:
   DataType dtype_;
+  string device_type_string_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
 };
@@ -218,7 +248,8 @@
 class StatelessTruncatedNormalOp : public XlaOpKernel {
  public:
   explicit StatelessTruncatedNormalOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx),
+        device_type_string_(ctx->device_type().type_string()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
   }
 
@@ -236,7 +267,7 @@
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
     xla::XlaOp uniform = StatelessRngUniform(
-        seed, xla_shape,
+        device_type_string_, seed, xla_shape,
         xla::MinPositiveNormalValue(builder, xla_shape.element_type()),
         xla::One(builder, xla_shape.element_type()));
     xla::XlaOp truncated_normal = TruncatedNormal(uniform);
@@ -246,6 +277,7 @@
 
  private:
   DataType dtype_;
+  string device_type_string_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp);
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index f4e4904..0e4dd7e 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -27,6 +27,7 @@
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
@@ -47,11 +48,11 @@
   explicit TensorListLengthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape buffer_shape;
-    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &buffer_shape));
+    int64 leading_dim;
+    OP_REQUIRES_OK(ctx,
+                   GetLeadingDimForTensorList(ctx->Input(0), &leading_dim));
     Tensor length_tensor(DT_INT32, {});
-    length_tensor.scalar<int32>()() =
-        static_cast<int32>(buffer_shape.dim_size(0));
+    length_tensor.scalar<int32>()() = static_cast<int32>(leading_dim);
     ctx->SetConstantOutput(0, length_tensor);
   }
 
@@ -61,40 +62,30 @@
 
 REGISTER_XLA_OP(Name("TensorListLength").IsMetadataOp(), TensorListLengthOp);
 
-// Creates an empty list with size (leading_dim, *element_shape) if
-// element_shape is known at compile time. Otherwise creates one with size
-// (leading_dim, 0) which gets initialized later in `GetInitializedList`.
-Status CreateZerosList(XlaOpKernelContext* ctx, int element_shape_index,
-                       int64 leading_dim, DataType dtype, xla::XlaOp* list) {
-  TensorShape list_shape;
-  list_shape.AddDim(leading_dim);
-  xla::XlaOp element_shape_handle = ctx->Input(element_shape_index);
-  TF_ASSIGN_OR_RETURN(
-      bool is_element_shape_compile_time_const,
-      element_shape_handle.builder()->IsConstant(element_shape_handle));
-  PartialTensorShape partial_element_shape;
-  if (is_element_shape_compile_time_const) {
-    TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(
-        element_shape_index, &partial_element_shape));
+// "input" is the shape input (input 0) of EmptyTensorList/TensorListReserve.
+// If it is a compile time constant and fully defined, sets "*got_shape" to
+// true and returns its value in "*shape"; otherwise sets it to false.
+Status TryGetElementShapeFromInput(XlaOpKernelContext* ctx, xla::XlaOp input,
+                                   xla::PrimitiveType dtype, bool* got_shape,
+                                   xla::Shape* shape) {
+  auto is_compile_time_constant_or = input.builder()->IsConstant(input);
+  TF_RETURN_IF_ERROR(is_compile_time_constant_or.status());
+
+  bool is_compile_time_constant = is_compile_time_constant_or.ValueOrDie();
+  if (!is_compile_time_constant) {
+    *got_shape = false;
+    return Status::OK();
   }
-  if (is_element_shape_compile_time_const &&
-      partial_element_shape.IsFullyDefined()) {
-    TensorShape element_shape;
-    partial_element_shape.AsTensorShape(&element_shape);
-    list_shape.AppendShape(element_shape);
-  } else {
-    // If element_shape is not a compile time constant or if it is not fully
-    // defined we will have to wait for the first write call to fully allocate
-    // the array.
-    // TODO(srbs): We are using element_shape of [0] as a proxy to denote an
-    // uninitialized list. A better implementation may be to represent the
-    // list as a 3-tuple containining an explicit "initialized" flag. However,
-    // we would still need to create a dummy tensor for the first tuple
-    // element.
-    list_shape.AddDim(0);
+
+  PartialTensorShape partial_shape;
+  TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(0, &partial_shape));
+  if (!partial_shape.IsFullyDefined()) {
+    *got_shape = false;
+    return Status::OK();
   }
-  *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
-                         list_shape.dim_sizes());
+
+  *shape = xla::ShapeUtil::MakeShape(dtype, partial_shape.dim_sizes());
+  *got_shape = true;
   return Status::OK();
 }
 
@@ -102,21 +93,53 @@
  public:
   explicit TensorListReserveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+    // Only non-nested TensorList is supported for now.
+    OP_REQUIRES(
+        ctx, dtype_ != DT_VARIANT,
+        errors::Unimplemented(
+            "Only non-nested TensorList is supported for TensorListReserve."));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
     int64 num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements));
+    OP_REQUIRES(
+        ctx, num_elements >= 0,
+        errors::InvalidArgument("XLA compilation requires a fixed tensor list "
+                                "size. Set the number of elements."));
 
-    xla::XlaOp buffer;
-    OP_REQUIRES_OK(ctx, CreateZerosList(ctx, 0, num_elements, dtype_, &buffer));
+    // If element shape is a compile time constant and is fully defined,
+    // create an initialized TensorList. Otherwise create an uninitialized
+    // TensorList.
+    xla::XlaOp element_shape_handle = ctx->Input(0);
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type));
+    bool got_shape;
+    xla::Shape element_shape;
+    OP_REQUIRES_OK(ctx,
+                   TryGetElementShapeFromInput(ctx, element_shape_handle, type,
+                                               &got_shape, &element_shape));
+    if (got_shape) {
+      xla::Shape list_shape;
+      OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape(
+                              element_shape, num_elements, &list_shape));
 
-    xla::XlaOp output_list;
-    OP_REQUIRES_OK(
-        ctx, BuildTensorList(
-                 buffer, xla::ConstantR0<int32>(ctx->builder(), num_elements),
-                 &output_list));
-    ctx->SetTensorListOutput(0, output_list);
+      xla::XlaOp new_list;
+      OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape(
+                              ctx->builder(), list_shape, &new_list));
+      xla::XlaOp result;
+      OP_REQUIRES_OK(
+          ctx,
+          SetTensorListPushIndex(
+              new_list, xla::ConstantR0<int32>(ctx->builder(), num_elements),
+              &result));
+      ctx->SetTensorListOutput(0, result);
+      return;
+    }
+
+    xla::XlaOp result =
+        BuildUninitializedTensorList(ctx->builder(), num_elements);
+    ctx->SetTensorListOutput(0, result);
   }
 
  private:
@@ -144,15 +167,37 @@
         errors::InvalidArgument("XLA compilation requires a fixed tensor list "
                                 "size. Set the max number of elements."));
 
-    xla::XlaOp buffer;
-    OP_REQUIRES_OK(ctx,
-                   CreateZerosList(ctx, 0, max_num_elements, dtype_, &buffer));
+    if (dtype_ != DT_VARIANT) {
+      // We are creating a non-nested TensorList.
+      // If element shape is a compile time constant and is fully defined,
+      // create an initialized TensorList. Otherwise create an uninitialized
+      // TensorList.
+      xla::XlaOp element_shape_handle = ctx->Input(0);
+      xla::PrimitiveType type;
+      OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type));
+      bool got_shape;
+      xla::Shape element_shape;
+      OP_REQUIRES_OK(
+          ctx, TryGetElementShapeFromInput(ctx, element_shape_handle, type,
+                                           &got_shape, &element_shape));
+      if (got_shape) {
+        xla::Shape list_shape;
+        OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape(
+                                element_shape, max_num_elements, &list_shape));
 
-    xla::XlaOp output_list;
-    OP_REQUIRES_OK(
-        ctx, BuildTensorList(buffer, xla::ConstantR0<int32>(ctx->builder(), 0),
-                             &output_list));
-    ctx->SetTensorListOutput(0, output_list);
+        xla::XlaOp result;
+        OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape(
+                                ctx->builder(), list_shape, &result));
+        ctx->SetTensorListOutput(0, result);
+        return;
+      }
+    }
+
+    // We are creating a nested TensorList or a non-nested TensorList with
+    // unknown shape. Just create an uninitialized TensorList.
+    xla::XlaOp result =
+        BuildUninitializedTensorList(ctx->builder(), max_num_elements);
+    ctx->SetTensorListOutput(0, result);
   }
 
  private:
@@ -163,7 +208,8 @@
 
 REGISTER_XLA_OP(Name("EmptyTensorList")
                     .CompileTimeConstantInput("element_shape")
-                    .CompileTimeConstantInput("max_num_elements"),
+                    .CompileTimeConstantInput("max_num_elements")
+                    .AllowVariantTypes(),
                 EmptyTensorListOp);
 
 class TensorListElementShapeOp : public XlaOpKernel {
@@ -174,18 +220,34 @@
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
+    // Check that the TensorList is initialized.
+    bool is_initialized;
+    OP_REQUIRES_OK(ctx,
+                   (IsTensorListInitialized(ctx->Input(0), &is_initialized)));
+    OP_REQUIRES(ctx, is_initialized,
+                errors::InvalidArgument("TensorList is not initialized"));
+
+    // Only non-nested TensorList is supported for now.
+    bool is_nested;
+    OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested));
+    OP_REQUIRES(ctx, !is_nested,
+                errors::Unimplemented("Only non-nested TensorList is supported "
+                                      "for TensorListElementShape."));
+
+    // For non-nested TensorList, element shape is the buffer shape without
+    // the first dimension.
     xla::XlaBuilder* b = ctx->builder();
-    TensorShape shape;
-    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
-    shape.RemoveDim(0);
+    xla::Shape list_shape;
+    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &list_shape));
+    list_shape.DeleteDimension(0);
 
     switch (shape_type_) {
       case DT_INT64:
-        ctx->SetOutput(0, xla::ConstantR1<int64>(b, shape.dim_sizes()));
+        ctx->SetOutput(0, xla::ConstantR1<int64>(b, list_shape.dimensions()));
         break;
       case DT_INT32: {
         std::vector<int32> size;
-        for (int64 s : shape.dim_sizes()) {
+        for (int64 s : list_shape.dimensions()) {
           size.push_back(s);
         }
         ctx->SetOutput(0, xla::ConstantR1<int32>(b, size));
@@ -213,28 +275,27 @@
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp state = ctx->Input(0);
+    // Check that the TensorList is initialized.
+    bool is_initialized;
+    OP_REQUIRES_OK(ctx,
+                   (IsTensorListInitialized(ctx->Input(0), &is_initialized)));
+    OP_REQUIRES(ctx, is_initialized,
+                errors::InvalidArgument("TensorList is not initialized"));
 
-    TensorShape shape;
-    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
+    // Only non-nested TensorList is supported for now.
+    bool is_nested;
+    OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested));
+    OP_REQUIRES(ctx, !is_nested,
+                errors::Unimplemented("Only non-nested TensorList is supported "
+                                      "for TensorListGetItem."));
 
-    xla::XlaOp buffer;
-    OP_REQUIRES_OK(ctx, GetTensorListBuffer(state, &buffer));
+    xla::XlaOp list = ctx->Input(0);
     xla::XlaOp index = ctx->Input(1);
 
-    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    std::vector<xla::XlaOp> start_indices(shape.dims(),
-                                          xla::ConstantR0<int32>(b, 0));
-    start_indices[0] = index;
-    auto slice_shape = shape.dim_sizes();
-    slice_shape[0] = 1LL;
+    xla::XlaOp result;
+    OP_REQUIRES_OK(ctx, ExecuteTensorListGetItem(list, index, &result));
 
-    xla::XlaOp read = xla::DynamicSlice(buffer, start_indices, slice_shape);
-    // Remove the leading '1' dimension.
-    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
-
-    ctx->SetOutput(0, xla::Reshape(read, value_shape));
+    ctx->SetOutput(0, result);
   }
 
  private:
@@ -247,19 +308,29 @@
 
 class TensorListStackOp : public XlaOpKernel {
  public:
-  explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
-  }
+  explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
+    // Check that the TensorList is initialized.
+    bool is_initialized;
+    OP_REQUIRES_OK(ctx,
+                   (IsTensorListInitialized(ctx->Input(0), &is_initialized)));
+    OP_REQUIRES(ctx, is_initialized,
+                errors::InvalidArgument("TensorList is not initialized"));
+
+    // Only non-nested TensorList is supported for now.
+    bool is_nested;
+    OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested));
+    OP_REQUIRES(ctx, !is_nested,
+                errors::Unimplemented("Only non-nested TensorList is supported "
+                                      "for TensorListStack."));
+
     xla::XlaOp buffer;
     OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(0), &buffer));
     ctx->SetOutput(0, buffer);
   }
 
  private:
-  DataType dtype_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(TensorListStackOp);
 };
 
@@ -268,42 +339,20 @@
 class TensorListFromTensorOp : public XlaOpKernel {
  public:
   explicit TensorListFromTensorOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
-  }
+      : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    PartialTensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsPartialShape(1, &element_shape));
-
-    const TensorShape tensor_shape = ctx->InputShape(0);
-    // Ensure that tensor_shape is compatible with element_shape.
-    PartialTensorShape unused;
-    OP_REQUIRES_OK(
-        ctx,
-        element_shape.MergeWith(
-            PartialTensorShape(
-                absl::Span<const int64>(tensor_shape.dim_sizes()).subspan(1)),
-            &unused));
-    OP_REQUIRES(ctx, tensor_shape.dims() > 0,
-                errors::InvalidArgument("Input value must be at least a "
-                                        "vector but received shape: ",
-                                        tensor_shape.DebugString()));
-    const int num_elements = tensor_shape.dim_size(0);
-
-    xla::XlaBuilder* b = ctx->builder();
+    const TensorShape& tensor_shape = ctx->InputShape(0);
+    int num_elements = tensor_shape.dim_size(0);
     const xla::XlaOp tensor = ctx->Input(0);
-
-    xla::XlaOp output_list;
-    OP_REQUIRES_OK(
-        ctx, BuildTensorList(tensor, xla::ConstantR0<int32>(b, num_elements),
-                             &output_list));
-    ctx->SetTensorListOutput(0, output_list);
+    xla::XlaOp result;
+    OP_REQUIRES_OK(ctx,
+                   ExecuteTensorListFromTensor(num_elements, tensor, &result));
+    auto list_shape_or = ctx->builder()->GetShape(result);
+    ctx->SetTensorListOutput(0, result);
   }
 
  private:
-  DataType dtype_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(TensorListFromTensorOp);
 };
 
@@ -311,75 +360,34 @@
     Name("TensorListFromTensor").CompileTimeConstantInput("element_shape"),
     TensorListFromTensorOp);
 
-// Returns the 0'th element of `tuple` containing the list tensor if it has been
-// initialized already else creates one lazily. This allows lazy initialization
-// of the list on the first call to SetItem or PushBack.
-Status GetInitializedList(const xla::XlaOp& input_list,
-                          const TensorShape& element_shape, DataType dtype,
-                          xla::XlaOp* output_list_buffer) {
-  bool is_already_initialized;
-  TF_RETURN_IF_ERROR(
-      IsTensorListInitialized(input_list, &is_already_initialized));
-  TensorShape input_list_shape;
-  TF_RETURN_IF_ERROR(GetTensorListBufferShape(input_list, &input_list_shape));
-  TensorShape input_list_element_shape = input_list_shape;
-  input_list_element_shape.RemoveDim(0);
-
-  if (is_already_initialized) {
-    TF_RET_CHECK(element_shape == input_list_element_shape);
-    TF_RETURN_IF_ERROR(GetTensorListBuffer(input_list, output_list_buffer));
-    return Status::OK();
-  }
-
-  int64 leading_dim = input_list_shape.dim_size(0);
-  TensorShape output_list_shape = element_shape;
-  output_list_shape.InsertDim(0, leading_dim);
-
-  xla::XlaOp output_list;
-  TF_RETURN_IF_ERROR(
-      InitializeTensorList(input_list, output_list_shape, &output_list));
-  TF_RETURN_IF_ERROR(GetTensorListBuffer(output_list, output_list_buffer));
-  return Status::OK();
-}
-
 class TensorListSetItemOp : public XlaOpKernel {
  public:
-  explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
-  }
+  explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp tl = ctx->Input(0);
-    TensorShape elem_shape = ctx->InputShape(2);
-
-    xla::XlaOp buffer;
-    OP_REQUIRES_OK(ctx, GetInitializedList(tl, elem_shape, dtype_, &buffer));
-    xla::XlaOp push_index;
-    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(tl, &push_index));
-
+    xla::XlaOp list = ctx->Input(0);
     xla::XlaOp index = ctx->Input(1);
-    xla::XlaOp value = ctx->Input(2);
+    xla::XlaOp element = ctx->Input(2);
+    xla::XlaOp initialized_list;
+    OP_REQUIRES_OK(ctx, GetInitializedTensorListForElement(
+                            list, element, /*element_is_tensor_list=*/false,
+                            &initialized_list));
 
-    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
-                                          xla::ConstantR0<int32>(b, 0));
-    start_indices[0] = index;
+    // Only non-nested TensorList is supported for now.
+    bool is_nested;
+    OP_REQUIRES_OK(ctx, IsNestedTensorList(initialized_list, &is_nested));
+    OP_REQUIRES(ctx, !is_nested,
+                errors::Unimplemented("Only non-nested TensorList is supported "
+                                      "for TensorListSetItem."));
 
-    TensorShape slice_shape = elem_shape;
-    slice_shape.InsertDim(0, 1LL);
-    auto update = xla::Reshape(value, slice_shape.dim_sizes());
+    xla::XlaOp result;
+    OP_REQUIRES_OK(ctx, ExecuteTensorListSetItem(initialized_list, index,
+                                                 element, &result));
 
-    xla::XlaOp output_list;
-    OP_REQUIRES_OK(ctx, BuildTensorList(xla::DynamicUpdateSlice(buffer, update,
-                                                                start_indices),
-                                        push_index, &output_list));
-    ctx->SetTensorListOutput(0, output_list);
+    ctx->SetTensorListOutput(0, result);
   }
 
  private:
-  DataType dtype_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(TensorListSetItemOp);
 };
 
@@ -387,83 +395,57 @@
 
 class TensorListPushBackOp : public XlaOpKernel {
  public:
-  explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
-  }
+  explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp list_tuple = ctx->Input(0);
-    TensorShape elem_shape = ctx->InputShape(1);
-
-    xla::XlaOp buffer;
-    OP_REQUIRES_OK(ctx,
-                   GetInitializedList(list_tuple, elem_shape, dtype_, &buffer));
-
-    xla::XlaOp index;
-    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(list_tuple, &index));
-    xla::XlaOp value = ctx->Input(1);
-
-    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
-                                          xla::ConstantR0<int32>(b, 0));
-    start_indices[0] = index;
-
-    TensorShape slice_shape = elem_shape;
-    slice_shape.InsertDim(0, 1LL);
-    auto update = xla::Reshape(value, slice_shape.dim_sizes());
-
-    xla::XlaOp output_list;
+    xla::XlaOp list = ctx->Input(0);
+    xla::XlaOp element = ctx->Input(1);
+    bool element_is_tensor_list = IsTensorListInput(ctx, 1);
+    xla::XlaOp initialized_list;
     OP_REQUIRES_OK(
-        ctx,
-        BuildTensorList(xla::DynamicUpdateSlice(buffer, update, start_indices),
-                        index + xla::ConstantR0<int32>(b, 1), &output_list));
-    ctx->SetTensorListOutput(0, output_list);
+        ctx, GetInitializedTensorListForElement(
+                 list, element, element_is_tensor_list, &initialized_list));
+
+    xla::XlaOp result;
+    OP_REQUIRES_OK(ctx,
+                   ExecuteTensorListPushBack(initialized_list, element,
+                                             element_is_tensor_list, &result));
+
+    ctx->SetTensorListOutput(0, result);
   }
 
  private:
-  DataType dtype_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(TensorListPushBackOp);
 };
 
-REGISTER_XLA_OP(Name("TensorListPushBack"), TensorListPushBackOp);
+REGISTER_XLA_OP(Name("TensorListPushBack").AllowVariantTypes(),
+                TensorListPushBackOp);
 
 class TensorListPopBackOp : public XlaOpKernel {
  public:
-  explicit TensorListPopBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
-  }
+  explicit TensorListPopBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp state = ctx->Input(0);
+    // Check that the TensorList is initialized.
+    bool is_initialized;
+    OP_REQUIRES_OK(ctx,
+                   (IsTensorListInitialized(ctx->Input(0), &is_initialized)));
+    OP_REQUIRES(ctx, is_initialized,
+                errors::InvalidArgument("TensorList is not initialized"));
 
-    TensorShape shape;
-    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
+    xla::XlaOp list = ctx->Input(0);
+    xla::XlaOp list_result, element_result;
+    bool element_is_tensor_list;
+    OP_REQUIRES_OK(ctx,
+                   ExecuteTensorListPopBack(list, &list_result, &element_result,
+                                            &element_is_tensor_list));
 
-    xla::XlaOp ta;
-    OP_REQUIRES_OK(ctx, GetTensorListBuffer(state, &ta));
-    xla::XlaOp index;
-    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(state, &index));
-
-    index = index - xla::ConstantR0<int32>(b, 1);
-
-    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    std::vector<xla::XlaOp> start_indices(shape.dims(),
-                                          xla::ConstantR0<int32>(b, 0));
-    start_indices[0] = index;
-    auto slice_shape = shape.dim_sizes();
-    slice_shape[0] = 1LL;
-
-    xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
-    // Remove the leading '1' dimension.
-    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
-
-    xla::XlaOp output_list;
-    OP_REQUIRES_OK(ctx, BuildTensorList(ta, index, &output_list));
-    ctx->SetTensorListOutput(0, output_list);
-    ctx->SetOutput(1, xla::Reshape(read, value_shape));
+    ctx->SetTensorListOutput(0, list_result);
+    if (element_is_tensor_list) {
+      ctx->SetTensorListOutput(1, element_result);
+    } else {
+      ctx->SetOutput(1, element_result);
+    }
   }
 
  private:
@@ -472,7 +454,8 @@
   TF_DISALLOW_COPY_AND_ASSIGN(TensorListPopBackOp);
 };
 
-REGISTER_XLA_OP(Name("TensorListPopBack"), TensorListPopBackOp);
+REGISTER_XLA_OP(Name("TensorListPopBack").AllowVariantTypes(),
+                TensorListPopBackOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
index 9a47af2..579c9ac 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
@@ -18,102 +18,476 @@
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+// TensorList is represented by a tuple.
+// - The first part of the tuple is a buffer containing all the tensors,
+// - The following parts are push indices for all nested levels of
+//   TensorLists. The last part is push index for the outermost TensorList.
+//
+// TensorList, as its name suggests, is conceptually a list of tensors. In the
+// actual representation of a non-nested TensorList, the buffer shape is
+// [tensor_list_size, element shape]. We will call tensor_list_size the "leading
+// dimension" below. Notice that the leading dimension must be a compile-time
+// constant, since it's part of the buffer shape.
+//
+// Example: consider a 3-level nested TensorList whose element type is scalar.
+// Assume inner TensorList has leading dimension 4, middle TensorList has 3,
+// and outer TensorList has 3.
+// Assume that a lowercase letter means there is data in that position, and "."
+// means there is no data in that position.
+// First element of outer TensorList:
+// [ a . . . ]
+// [ b c . . ]
+// [ d e f . ]
+// Second element of outer TensorList:
+// [ g h i . ]
+// [ j k . . ]
+// [ . . . . ]
+// Third element: not pushed yet.
+//
+// The first part of the tuple is an array of shape [3, 3, 4] containing data.
+// The second part is an array of shape [3, 3], each element is push index
+// for the inner TensorList. In this case, its values are:
+// [ 1 2 3 ]
+// [ 3 2 . ]
+// [ . . . ]
+// The third part is an array of shape [3], each element is push index for
+// the middle TensorList. In this case, its values are:
+// [ 3 ]
+// [ 2 ]
+// [ . ]
+// The fourth (and last) part is a scalar. It's the push index for the outer
+// TensorList. In this case, its value is 2.
+//
+// Now imagine we need to push the following element to the outer TensorList:
+// [ l . . . ]
+// [ m n . . ]
+// [ . . . . ]
+// This element is represented by a tuple of 3 parts:
+// First part is all data.
+// Second part is push indices for the inner TensorList, which is [ 1 2 . ].
+// Third part is push index for the middle TensorList, which is 2.
+// Now let's do the push.
+// First, we append its data to outer TensorList's data.
+// Then we start to deal with push indices. Similar to data, we append push
+// indices for each level of TensorList.
+// For the inner TensorList: append push indices for the pushed element.
+// [ 1 2 3 ]               [ 1 2 3 ]
+// [ 3 2 . ] +           = [ 3 2 . ]
+// [ . . . ]   [ 1 2 . ]   [ 1 2 . ]
+// For the middle TensorList: append push indices for the pushed element.
+// [ 3 ]           [ 3 ]
+// [ 2 ] +       = [ 2 ]
+// [ . ]   [ 2 ]   [ 2 ]
+// For the outer TensorList: just add 1.
+// 2 + 1 = 3
+//
+// Popping an element from the outer TensorList follows a similar process. The
+// slice position is the last pushed slot: the outer push index (3) minus 1.
+// First part is data. We get it by slicing the data at that position.
+// Second part is push indices for inner TensorList. We get it by slicing
+// push indices for inner TensorList at that same position (push index 3 minus
+// 1).
+// [ 1 2 3 ]
+// [ 3 2 . ]
+// [ 1 2 . ] ===> This is what we want
+// Third part is push index for middle TensorList. We get it by slicing
+// push indices for middle TensorList at the last pushed slot of the outer
+// TensorList (its push index, which is 3, minus 1).
+// [ 3 ]
+// [ 2 ]
+// [ 2 ] ===> This is what we want
+
 namespace tensorflow {
 
 bool IsTensorListInput(XlaOpKernelContext* ctx, int index) {
   return ctx->InputExpression(index).kind() == XlaExpression::Kind::kTensorList;
 }
 
-Status BuildTensorList(const xla::XlaOp& buffer, const xla::XlaOp& push_index,
-                       xla::XlaOp* output_list) {
+Status IsTensorListInitialized(xla::XlaOp list, bool* is_initialized) {
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  *is_initialized = list_shape.IsTuple();
+  return Status::OK();
+}
+
+Status IsNestedTensorList(xla::XlaOp list, bool* is_nested_list) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  *is_nested_list = (xla::ShapeUtil::TupleElementCount(list_shape) > 2);
+  return Status::OK();
+}
+
+Status BuildNonNestedTensorList(xla::XlaOp buffer, xla::XlaOp push_index,
+                                xla::XlaOp* output_list) {
   TF_RET_CHECK(buffer.builder());
   *output_list = xla::Tuple(buffer.builder(), {buffer, push_index});
   return Status::OK();
 }
 
-Status GetTensorListPrimitiveType(const xla::XlaOp& op,
-                                  xla::PrimitiveType* type) {
-  TF_RET_CHECK(op.builder());
-  TF_ASSIGN_OR_RETURN(const xla::Shape& list_tuple_shape,
-                      op.builder()->GetShape(op));
-  xla::Shape buffer_shape =
-      xla::ShapeUtil::GetTupleElementShape(list_tuple_shape, 0);
-  *type = buffer_shape.element_type();
-  return Status::OK();
-}
-
-Status GetTensorListBuffer(const xla::XlaOp& op, xla::XlaOp* buffer) {
-  TF_RET_CHECK(op.builder());
-  *buffer = xla::GetTupleElement(op, 0);
-  return Status::OK();
-}
-
-Status GetTensorListPushIndex(const xla::XlaOp& op, xla::XlaOp* push_index) {
-  TF_RET_CHECK(op.builder());
-  *push_index = xla::GetTupleElement(op, 1);
-  return Status::OK();
-}
-
-Status GetTensorListBufferShape(const xla::XlaOp& op,
-                                TensorShape* buffer_shape) {
-  TF_RET_CHECK(op.builder());
-  TensorShape shape;
-  TF_ASSIGN_OR_RETURN(const xla::Shape& list_tuple_shape,
-                      op.builder()->GetShape(op));
-  return GetTensorListBufferShape(list_tuple_shape, buffer_shape);
-}
-
-Status GetTensorListBufferShape(const xla::Shape& list_shape,
-                                TensorShape* buffer_shape) {
-  TF_RET_CHECK(list_shape.IsTuple());
-  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(
-      xla::ShapeUtil::GetTupleElementShape(list_shape, 0), buffer_shape));
-  return Status::OK();
-}
-
-Status IsTensorListInitialized(const xla::XlaOp& op, bool* is_initialized) {
-  TensorShape list_shape;
-  TF_RETURN_IF_ERROR(GetTensorListBufferShape(op, &list_shape));
-  *is_initialized = !(list_shape.dims() == 2 && list_shape.dim_size(1) == 0);
-  return Status::OK();
-}
-
-Status InitializeTensorList(const xla::XlaOp& uninitialized_list,
-                            const TensorShape& buffer_shape,
-                            xla::XlaOp* output_list) {
-  TensorShape input_buffer_shape;
-  TF_RETURN_IF_ERROR(
-      GetTensorListBufferShape(uninitialized_list, &input_buffer_shape));
-  if (input_buffer_shape.dim_size(0) != buffer_shape.dim_size(0)) {
-    return errors::InvalidArgument(
-        "Number of elements in input list does not match buffer size. ",
-        "input list size: ", input_buffer_shape.dim_size(0),
-        "buffer size: ", buffer_shape.dim_size(0));
+Status GetTensorListBufferShape(xla::XlaOp list, xla::Shape* buffer_shape) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
   }
-  xla::XlaBuilder* builder = uninitialized_list.builder();
-  xla::XlaOp input_buffer;
-  TF_RETURN_IF_ERROR(GetTensorListBuffer(uninitialized_list, &input_buffer));
-  TF_ASSIGN_OR_RETURN(const xla::Shape& input_buffer_xla_shape,
-                      builder->GetShape(input_buffer));
-  auto new_buffer = xla::Broadcast(
-      xla::ConstantLiteral(builder, xla::LiteralUtil::Zero(
-                                        input_buffer_xla_shape.element_type())),
-      buffer_shape.dim_sizes());
-  xla::XlaOp push_index;
-  TF_RETURN_IF_ERROR(GetTensorListPushIndex(uninitialized_list, &push_index));
-  return BuildTensorList(new_buffer, push_index, output_list);
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  *buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0);
+  return Status::OK();
 }
 
-Status CreateZerosList(XlaOpKernelContext* ctx, const TensorShape& buffer_shape,
-                       xla::PrimitiveType type, xla::XlaOp* list) {
+Status GetTensorListBuffer(xla::XlaOp list, xla::XlaOp* buffer) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  *buffer = xla::GetTupleElement(list, 0);
+  return Status::OK();
+}
+
+Status GetTensorListPushIndex(xla::XlaOp list, xla::XlaOp* push_index) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  int tuple_size = xla::ShapeUtil::TupleElementCount(list_shape);
+  *push_index = xla::GetTupleElement(list, tuple_size - 1);
+  return Status::OK();
+}
+
+Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index,
+                              xla::XlaOp* result) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  int tuple_size = xla::ShapeUtil::TupleElementCount(list_shape);
+  std::vector<xla::XlaOp> result_parts;
+  result_parts.reserve(tuple_size);
+  for (int i = 0; i < tuple_size - 1; i++) {
+    result_parts.push_back(xla::GetTupleElement(list, i));
+  }
+  result_parts.push_back(push_index);
+  *result = xla::Tuple(list.builder(), result_parts);
+  return Status::OK();
+}
+
+xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b,
+                                        int64 leading_dimension) {
   auto zero =
-      xla::ConstantLiteral(ctx->builder(), xla::LiteralUtil::Zero(type));
-  *list = xla::Broadcast(zero, buffer_shape.dim_sizes());
+      xla::ConstantLiteral(b, xla::LiteralUtil::Zero(xla::PrimitiveType::S32));
+  return xla::Broadcast(zero, std::vector<int64>{leading_dimension});
+}
+
+Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list));
+  if (is_initialized) {
+    auto buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0);
+    *leading_dim = buffer_shape.dimensions(0);
+  } else {
+    *leading_dim = list_shape.dimensions(0);
+  }
+  return Status::OK();
+}
+
+Status GetTensorListShapeFromElementTensorListShape(
+    const xla::Shape& element_tensor_list_shape, int64 leading_dim,
+    xla::Shape* tensor_list_shape) {
+  std::vector<xla::Shape> shapes;
+  int tuple_size = xla::ShapeUtil::TupleElementCount(element_tensor_list_shape);
+  for (int i = 0; i < tuple_size; i++) {
+    const xla::Shape& shape =
+        xla::ShapeUtil::GetTupleElementShape(element_tensor_list_shape, i);
+    std::vector<int64> dimensions = shape.dimensions();
+    dimensions.insert(dimensions.begin(), leading_dim);
+    shapes.push_back(
+        xla::ShapeUtil::MakeShape(shape.element_type(), dimensions));
+  }
+  shapes.push_back(
+      xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector<int64>{}));
+  *tensor_list_shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  return Status::OK();
+}
+
+Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape,
+                                          int64 leading_dim,
+                                          xla::Shape* tensor_list_shape) {
+  if (!element_shape.IsArray()) {
+    return errors::InvalidArgument(
+        "GetTensorListShapeFromElementShape() only supports normal tensor "
+        "shape. But element shape is ",
+        element_shape.DebugString());
+  }
+
+  std::vector<xla::Shape> shapes;
+  std::vector<int64> dimensions = element_shape.dimensions();
+  dimensions.insert(dimensions.begin(), leading_dim);
+  shapes.push_back(
+      xla::ShapeUtil::MakeShape(element_shape.element_type(), dimensions));
+  shapes.push_back(
+      xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector<int64>{}));
+  *tensor_list_shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  return Status::OK();
+}
+
+Status CreateZerosTensorListWithShape(xla::XlaBuilder* b,
+                                      const xla::Shape& list_shape,
+                                      xla::XlaOp* list) {
+  int tuple_size = xla::ShapeUtil::TupleElementCount(list_shape);
+  std::vector<xla::XlaOp> elements;
+  for (int i = 0; i < tuple_size; i++) {
+    const xla::Shape& shape =
+        xla::ShapeUtil::GetTupleElementShape(list_shape, i);
+    xla::XlaOp zero =
+        xla::ConstantLiteral(b, xla::LiteralUtil::Zero(shape.element_type()));
+    xla::XlaOp zeros = xla::Broadcast(zero, shape.dimensions());
+    elements.push_back(zeros);
+  }
+  *list = xla::Tuple(b, elements);
+  return Status::OK();
+}
+
+Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element,
+                                          bool element_is_tensor_list,
+                                          xla::XlaOp* initialized_list) {
+  int64 leading_dim;
+  TF_RETURN_IF_ERROR(GetLeadingDimForTensorList(list, &leading_dim));
+
+  xla::XlaBuilder* b = list.builder();
+  xla::Shape list_shape;
+  if (element_is_tensor_list) {
+    TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element));
+    TF_RETURN_IF_ERROR(GetTensorListShapeFromElementTensorListShape(
+        element_shape, leading_dim, &list_shape));
+  } else {
+    TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element));
+    TF_RETURN_IF_ERROR(GetTensorListShapeFromElementShape(
+        element_shape, leading_dim, &list_shape));
+  }
+
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (is_initialized) {
+    // Check shape of initialized list is correct.
+    TF_ASSIGN_OR_RETURN(xla::Shape original_list_shape, b->GetShape(list));
+    if (!xla::ShapeUtil::Equal(original_list_shape, list_shape)) {
+      return errors::Internal(
+          "Invalid TensorList shape: ", original_list_shape.DebugString(),
+          ", expected: ", list_shape.DebugString());
+    }
+    *initialized_list = list;
+    return Status::OK();
+  } else {
+    return CreateZerosTensorListWithShape(b, list_shape, initialized_list);
+  }
+}
+
+Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element,
+                                 bool element_is_tensor_list,
+                                 xla::XlaOp* result) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+
+  xla::XlaBuilder* b = list.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list));
+  int list_tuple_size = xla::ShapeUtil::TupleElementCount(list_shape);
+  xla::XlaOp push_index = xla::GetTupleElement(list, list_tuple_size - 1);
+
+  std::vector<xla::XlaOp> result_parts;
+
+  if (element_is_tensor_list) {
+    TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element));
+    int element_tuple_size = xla::ShapeUtil::TupleElementCount(element_shape);
+    for (int i = 0; i < element_tuple_size; i++) {
+      const xla::Shape& element_part_shape =
+          xla::ShapeUtil::GetTupleElementShape(element_shape, i);
+      xla::XlaOp element_part = xla::GetTupleElement(element, i);
+      std::vector<int64> element_part_dims = element_part_shape.dimensions();
+      element_part_dims.insert(element_part_dims.begin(), 1);
+      element_part = xla::Reshape(element_part, element_part_dims);
+
+      std::vector<xla::XlaOp> start_indices(
+          element_part_shape.dimensions_size() + 1,
+          xla::ConstantR0<int32>(b, 0));
+      start_indices[0] = push_index;
+
+      xla::XlaOp list_part = xla::GetTupleElement(list, i);
+      xla::XlaOp updated_list_part =
+          xla::DynamicUpdateSlice(list_part, element_part, start_indices);
+      result_parts.push_back(updated_list_part);
+    }
+  } else {
+    TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element));
+    std::vector<int64> element_dims = element_shape.dimensions();
+    element_dims.insert(element_dims.begin(), 1);
+    xla::XlaOp update = xla::Reshape(element, element_dims);
+
+    std::vector<xla::XlaOp> start_indices(element_shape.dimensions_size() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = push_index;
+
+    xla::XlaOp list_part = xla::GetTupleElement(list, 0);
+    xla::XlaOp updated_list_part =
+        xla::DynamicUpdateSlice(list_part, update, start_indices);
+    result_parts.push_back(updated_list_part);
+  }
+
+  xla::XlaOp updated_push_index = push_index + xla::ConstantR0<int32>(b, 1);
+  result_parts.push_back(updated_push_index);
+
+  *result = xla::Tuple(b, result_parts);
+  return Status::OK();
+}
+
+Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result,
+                                xla::XlaOp* element_result,
+                                bool* element_is_tensor_list) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+
+  // If the TensorList is a nested TensorList, element will be TensorList.
+  TF_RETURN_IF_ERROR(IsNestedTensorList(list, element_is_tensor_list));
+
+  xla::XlaBuilder* b = list.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list));
+  int list_tuple_size = xla::ShapeUtil::TupleElementCount(list_shape);
+  xla::XlaOp push_index = xla::GetTupleElement(list, list_tuple_size - 1);
+  push_index = push_index - xla::ConstantR0<int32>(b, 1);
+
+  std::vector<xla::XlaOp> list_result_parts, element_result_parts;
+  for (int i = 0; i < list_tuple_size - 1; i++) {
+    const xla::Shape& list_part_shape =
+        xla::ShapeUtil::GetTupleElementShape(list_shape, i);
+    std::vector<xla::XlaOp> start_indices(list_part_shape.dimensions_size(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = push_index;
+
+    std::vector<int64> slice_shape = list_part_shape.dimensions();
+    slice_shape[0] = 1LL;
+
+    xla::XlaOp list_part = xla::GetTupleElement(list, i);
+    xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape);
+
+    slice_shape.erase(slice_shape.begin());
+    element_result_parts.push_back(xla::Reshape(read, slice_shape));
+    list_result_parts.push_back(list_part);
+  }
+  list_result_parts.push_back(push_index);
+
+  *list_result = xla::Tuple(b, list_result_parts);
+  if (*element_is_tensor_list) {
+    *element_result = xla::Tuple(b, element_result_parts);
+  } else {
+    *element_result = element_result_parts[0];
+  }
+
+  return Status::OK();
+}
+
+Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index,
+                                xla::XlaOp element, xla::XlaOp* result) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  bool is_nested;
+  TF_RETURN_IF_ERROR(IsNestedTensorList(list, &is_nested));
+  if (is_nested) {
+    return errors::Unimplemented(
+        "ExecuteTensorListSetItem() only supports non-nested TensorList");
+  }
+
+  xla::XlaBuilder* b = list.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape element_shape, b->GetShape(element));
+  std::vector<int64> element_dims = element_shape.dimensions();
+  element_dims.insert(element_dims.begin(), 1);
+  xla::XlaOp update = xla::Reshape(element, element_dims);
+
+  std::vector<xla::XlaOp> start_indices(element_shape.dimensions_size() + 1,
+                                        xla::ConstantR0<int32>(b, 0));
+  start_indices[0] = index;
+
+  xla::XlaOp list_part = xla::GetTupleElement(list, 0);
+  xla::XlaOp updated_list_part =
+      xla::DynamicUpdateSlice(list_part, update, start_indices);
+
+  std::vector<xla::XlaOp> result_parts;
+  result_parts.push_back(updated_list_part);
+  result_parts.push_back(xla::GetTupleElement(list, 1));
+  *result = xla::Tuple(b, result_parts);
+  return Status::OK();
+}
+
+Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index,
+                                xla::XlaOp* result) {
+  bool is_initialized;
+  TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized));
+  if (!is_initialized) {
+    return errors::InvalidArgument("TensorList is not initialized");
+  }
+  bool is_nested;
+  TF_RETURN_IF_ERROR(IsNestedTensorList(list, &is_nested));
+  if (is_nested) {
+    return errors::Unimplemented(
+        "ExecuteTensorListGetItem() only supports non-nested TensorList");
+  }
+
+  xla::XlaBuilder* b = list.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list));
+  const xla::Shape& buffer_shape =
+      xla::ShapeUtil::GetTupleElementShape(list_shape, 0);
+  std::vector<xla::XlaOp> start_indices(buffer_shape.dimensions_size(),
+                                        xla::ConstantR0<int32>(b, 0));
+  start_indices[0] = index;
+
+  std::vector<int64> slice_shape = buffer_shape.dimensions();
+  slice_shape[0] = 1LL;
+
+  xla::XlaOp list_part = xla::GetTupleElement(list, 0);
+  xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape);
+
+  slice_shape.erase(slice_shape.begin());
+  *result = xla::Reshape(read, slice_shape);
+  return Status::OK();
+}
+
+Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor,
+                                   xla::XlaOp* result) {
+  xla::XlaBuilder* b = tensor.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape shape, b->GetShape(tensor));
+  if (!shape.IsArray()) {
+    return errors::InvalidArgument(
+        "ExecuteTensorListFromTensor() only supports normal tensor. But input "
+        "shape is ",
+        shape.DebugString());
+  }
+
+  std::vector<xla::XlaOp> result_parts{tensor,
+                                       xla::ConstantR0<int32>(b, push_index)};
+  *result = xla::Tuple(b, result_parts);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h
index 5ee82d0..7fac2d9 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h
@@ -16,12 +16,6 @@
 #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_
 #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_
 
-// TensorList utilities.
-//
-// Tensor lists are represented as tuple consisting of a pre-allocated buffer
-// consisting of the tensors (and where dim 0 is the list index), along with a
-// scalar telling us the next index to push a value at.
-
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -31,44 +25,97 @@
 // Whether the input expression at `index` corresponds to a TensorList.
 bool IsTensorListInput(XlaOpKernelContext* ctx, int index);
 
-// Builds a TensorList from its constituents, `buffer` and `push_index`.
-Status BuildTensorList(const xla::XlaOp& buffer, const xla::XlaOp& push_index,
-                       xla::XlaOp* output_list);
+// Whether the TensorList is initialized (has known data type and shape).
+Status IsTensorListInitialized(xla::XlaOp list, bool* is_initialized);
 
-// Returns XLA PrimitiveType for the TensorList.
-Status GetTensorListPrimitiveType(const xla::XlaOp& op,
-                                  xla::PrimitiveType* type);
+// Whether the TensorList is a nested TensorList.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status IsNestedTensorList(xla::XlaOp list, bool* is_nested_list);
 
-// Returns the buffer for the TensorList.
-Status GetTensorListBuffer(const xla::XlaOp& op, xla::XlaOp* buffer);
+// Builds a non-nested TensorList from `buffer` and `push_index`.
+Status BuildNonNestedTensorList(xla::XlaOp buffer, xla::XlaOp push_index,
+                                xla::XlaOp* output_list);
 
-// Returns the push_index for the TensorList.
-Status GetTensorListPushIndex(const xla::XlaOp& op, xla::XlaOp* push_index);
+// Returns buffer shape for the TensorList.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status GetTensorListBufferShape(xla::XlaOp list, xla::Shape* buffer_shape);
 
-// Returns the shape of the TensorList buffer.
-Status GetTensorListBufferShape(const xla::XlaOp& op,
-                                TensorShape* buffer_shape);
+// Returns buffer for the TensorList.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status GetTensorListBuffer(xla::XlaOp list, xla::XlaOp* buffer);
 
-// Inputs the TensorList shape and returns the buffer shape.
-Status GetTensorListBufferShape(const xla::Shape& list_shape,
-                                TensorShape* buffer_shape);
+// Returns push index for the TensorList.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status GetTensorListPushIndex(xla::XlaOp list, xla::XlaOp* push_index);
 
-// Returns whether the TensorList has been initialized.
-//
-// A TensorList is considered initialized if its element_shape is completely
-// known.
-Status IsTensorListInitialized(const xla::XlaOp& op, bool* is_initialized);
+// Returns a new TensorList with given push_index.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index,
+                              xla::XlaOp* result);
 
-// Inputs an uninitialized list and a buffer_shape and returns an initialized
-// list. The initialized list uses the dtype and push index of the uninitialized
-// list and is filled with zeros.
-Status InitializeTensorList(const xla::XlaOp& uninitialized_list,
-                            const TensorShape& buffer_shape,
-                            xla::XlaOp* output_list);
+// Returns an uninitialized TensorList.
+xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b,
+                                        int64 leading_dimension);
 
-// Returns a TensorList filled with zero.
-Status CreateZerosList(XlaOpKernelContext* ctx, const TensorShape& buffer_shape,
-                       xla::PrimitiveType type, xla::XlaOp* list);
+// Returns leading dimension for the TensorList.
+// Input can be initialized or uninitialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim);
+
+// Returns TensorList shape for the element shape.
+// Element shape must be a normal tensor shape.
+Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape,
+                                          int64 leading_dim,
+                                          xla::Shape* tensor_list_shape);
+
+// Returns a TensorList filled by zeros with the given shape.
+Status CreateZerosTensorListWithShape(xla::XlaBuilder* b,
+                                      const xla::Shape& list_shape,
+                                      xla::XlaOp* list);
+
+// If the TensorList is initialized, check that its shape matches element shape;
+// If the TensorList is uninitialized, initialize it with the element shape.
+// Input can be initialized or uninitialized TensorList.
+// "element" can be normal tensor or TensorList.
+Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element,
+                                          bool element_is_tensor_list,
+                                          xla::XlaOp* initialized_list);
+
+// Executes TensorListPushBack with given TensorList and element.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element,
+                                 bool element_is_tensor_list,
+                                 xla::XlaOp* result);
+
+// Executes TensorListPopBack with given TensorList.
+// Input must be an initialized TensorList.
+// Non-nested and nested TensorLists are both supported.
+Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result,
+                                xla::XlaOp* element_result,
+                                bool* element_is_tensor_list);
+
+// Executes TensorListSetItem with given TensorList, index and element.
+// Input must be an initialized TensorList.
+// Only non-nested TensorList is supported.
+Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index,
+                                xla::XlaOp element, xla::XlaOp* result);
+
+// Executes TensorListGetItem with given TensorList and index.
+// Input must be an initialized TensorList.
+// Only non-nested TensorList is supported.
+Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index,
+                                xla::XlaOp* result);
+
+// Executes TensorListFromTensor with given tensor and push index.
+// "tensor" must be a normal tensor.
+Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor,
+                                   xla::XlaOp* result);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 6fbf354..f8d33a4 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -479,22 +479,18 @@
       OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource));
       OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], builder));
     } else if (IsTensorListInput(ctx, input_num)) {
-      // If the list received as input is uninitialized but its shape was
-      // inferred in the first compilation pass we create a new list filled
-      // with zeros and used that as the input to the while op.
-      TensorShape input_list_shape;
-      OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(input_num),
-                                                   &input_list_shape));
-      TensorShape body_arg_shape;
-      OP_REQUIRES_OK(ctx,
-                     GetTensorListBufferShape(body_input_shape.tuple_shapes(i),
-                                              &body_arg_shape));
-      // Shape of the input list may differ from the shape of the body/cond
-      // input if the list's shape was inferred after the first compilation and
-      // the body/cond was recompiled with the updated shape of the list.
-      if (input_list_shape != body_arg_shape) {
-        OP_REQUIRES_OK(ctx, InitializeTensorList(ctx->Input(input_num),
-                                                 body_arg_shape, &inputs[i]));
+      xla::XlaOp input = ctx->Input(input_num);
+      auto input_shape_or = ctx->builder()->GetShape(input);
+      OP_REQUIRES_OK(ctx, input_shape_or.status());
+      xla::Shape input_shape = input_shape_or.ValueOrDie();
+      const xla::Shape& list_shape = body_input_shape.tuple_shapes(i);
+      // Shape/datatype of the input list may differ from shape/datatype of the
+      // body/cond input if the list's shape/datatype was inferred after the
+      // first compilation and the body/cond was recompiled with the updated
+      // shape/datatype of the list.
+      if (input_shape != list_shape) {
+        OP_REQUIRES_OK(ctx, CreateZerosTensorListWithShape(
+                                ctx->builder(), list_shape, &inputs[i]));
       } else {
         inputs[i] = ctx->Input(input_num);
       }
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 06eda41..d348d2b 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -111,7 +111,7 @@
       break;
     case xla::TUPLE:
       LOG(FATAL) << "tuple element type is not integral";
-    case xla::OPAQUE:
+    case xla::OPAQUE_TYPE:
       LOG(FATAL) << "opaque element type is not integral";
     default:
       LOG(FATAL) << "unhandled element type " << type;
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index cb6e0fb..4f1f3d7 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -1,5 +1,5 @@
 package(
-    default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
+    default_visibility = ["//tensorflow:internal"],
 )
 
 licenses(["notice"])  # Apache 2.0
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index c98e3d1..c9bf15a 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -562,11 +562,11 @@
             lhs_rhs_split[1], "'", " with size: ", lhs_rhs_split[1].size()));
       }
 
-      for (const char& c : lhs_rhs_split[0]) {
-        left_map[c] = context->Dim(input_a, left_map.size());
+      for (int i = 0; i < lhs_rhs_split[0].size(); ++i) {
+        left_map[lhs_rhs_split[0][i]] = context->Dim(input_a, i);
       }
-      for (const char& c : lhs_rhs_split[1]) {
-        right_map[c] = context->Dim(input_b, right_map.size());
+      for (int i = 0; i < lhs_rhs_split[1].size(); ++i) {
+        right_map[lhs_rhs_split[1][i]] = context->Dim(input_b, i);
       }
 
       for (const char& c : equation_split[1]) {
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 869b234..bedb9a6 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -306,26 +306,6 @@
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
 einsum = gen_xla_ops.xla_einsum
 
-
-@ops.RegisterGradient('XlaEinsum')
-def _einsum_grad(op, grad):
-  equation = op.get_attr('equation')
-  inputs, output = equation.split('->')
-  left, right = inputs.split(',')
-
-  return [
-      gen_xla_ops.xla_einsum(
-          grad,
-          op.inputs[1],
-          equation='{},{}->{}'.format(output, right, left),
-          name=None),
-      gen_xla_ops.xla_einsum(
-          grad,
-          op.inputs[0],
-          equation='{},{}->{}'.format(output, left, right),
-          name=None)
-  ]
-
 # TODO(phawkins): generalize tf.pad to support interior padding, and then remove
 # the XLA-specific pad operator.
 pad = gen_xla_ops.xla_pad
diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc
new file mode 100644
index 0000000..23dd326
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc
@@ -0,0 +1,766 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Given original input types and argument index mapping, return the new input
+// types.
+std::vector<DataType> ShuffleInputDataTypeAttribute(
+    const std::vector<DataType>& in_types,
+    const std::vector<int>& index_mapping) {
+  std::vector<DataType> result(index_mapping.size());
+  for (int i = 0; i < in_types.size(); i++) {
+    result[index_mapping.at(i)] = in_types[i];
+  }
+  return result;
+}
+
+// Given original input types, check if we need to rewrite the function (by
+// checking if all DT_RESOURCE inputs are in the end). If the function needs to
+// be rewritten, `resource_input_count` will be set to number of DT_RESOURCE
+// inputs, and `index_mapping` will hold a mapping for original input index to
+// rearranged input index.
+Status InputTypesNeedsRearrange(const std::vector<DataType>& in_types,
+                                bool* need_rewrite, int* resource_input_count,
+                                std::vector<int>* index_mapping) {
+  int first_resource_index = -1;
+  for (int i = 0; i < in_types.size(); i++) {
+    DataType type = in_types[i];
+    if (type == DT_RESOURCE) {
+      first_resource_index = i;
+      break;
+    }
+  }
+  if (first_resource_index == -1) {
+    // No resource input. No need to rewrite.
+    *need_rewrite = false;
+    return Status::OK();
+  }
+
+  *need_rewrite = false;
+  for (int i = first_resource_index + 1; i < in_types.size(); i++) {
+    if (in_types[i] != DT_RESOURCE) {
+      *need_rewrite = true;
+      break;
+    }
+  }
+  if (!*need_rewrite) {
+    return Status::OK();
+  }
+
+  *resource_input_count = 0;
+  for (int i = 0; i < in_types.size(); i++) {
+    DataType type = in_types[i];
+    if (type == DT_RESOURCE) {
+      ++(*resource_input_count);
+    }
+  }
+  int non_resource_index = 0,
+      resource_index = in_types.size() - *resource_input_count;
+  index_mapping->resize(in_types.size());
+  for (int i = 0; i < in_types.size(); i++) {
+    if (in_types[i] != DT_RESOURCE) {
+      (*index_mapping)[i] = non_resource_index;
+      non_resource_index++;
+    } else {
+      (*index_mapping)[i] = resource_index;
+      resource_index++;
+    }
+  }
+
+  return Status::OK();
+}
+
+// Given mapping between original input index and rearranged input index,
+// reorder input edges for the node.
+Status ReorderInputEdges(Graph* g, Node* n,
+                         const std::vector<int>& index_mapping) {
+  std::vector<const Edge*> input_edges;
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    input_edges.push_back(e);
+  }
+  for (const Edge* e : input_edges) {
+    Node* src = e->src();
+    int src_output = e->src_output();
+    int dst_input = e->dst_input();
+    int new_dst_input = index_mapping.at(dst_input);
+    g->RemoveEdge(e);
+    g->AddEdge(src, src_output, n, new_dst_input)->DebugString();
+  }
+  return Status::OK();
+}
+
+// For While node, given mapping between original input index and rearranged
+// input index, reorder output edges for the node. DT_RESOURCE outputs are
+// removed from the node and we will use the node's corresponding input for the
+// edge.
+Status ReorderOutputEdges(Graph* g, Node* n, int input_count,
+                          int resource_input_count,
+                          const std::vector<int>& index_mapping) {
+  std::vector<const Edge*> output_edges;
+  for (const Edge* e : n->out_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    output_edges.push_back(e);
+  }
+  for (const Edge* e : output_edges) {
+    int src_output = e->src_output();
+    int new_src_output = index_mapping.at(src_output);
+    Node* dst = e->dst();
+    int dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    if (new_src_output < input_count - resource_input_count) {
+      g->AddEdge(n, new_src_output, dst, dst_input);
+    } else {
+      const Edge* input_edge;
+      TF_RETURN_IF_ERROR(n->input_edge(new_src_output, &input_edge));
+      g->AddEdge(input_edge->src(), input_edge->src_output(), dst, dst_input);
+    }
+  }
+  return Status::OK();
+}
+
+// Given mapping between original input index and rearranged input index, change
+// "index" attribute for _Arg nodes.
+void RearrangeArgNodes(gtl::InlinedVector<Node*, 4>* arg_nodes,  // non-absl ok
+                       const std::vector<int>& index_mapping) {
+  for (int i = 0; i < arg_nodes->size(); i++) {
+    Node* n = (*arg_nodes)[i];
+    int new_index = index_mapping.at(i);
+    n->ClearAttr("index");
+    n->AddAttr("index", new_index);
+  }
+}
+
+// Given all _Retval nodes in the function, return if we need to rewrite the
+// function (by checking if we have DT_RESOURCE return values). If we need to
+// rewrite the function, `retval_index_mapping` will hold the mapping from
+// original _Retval to rearranged _Retval, and `resource_retval_to_arg` will
+// hold mapping from DT_RESOURCE _Retval index to its input _Arg index. Here we
+// assume that all DT_RESOURCE _Retval nodes come from _Arg nodes directly.
+Status CalculateRetvalRearrange(
+    const gtl::InlinedVector<Node*, 4>& ret_nodes,  // non-absl ok
+    std::map<int, int>* retval_index_mapping,
+    std::map<int, int>* resource_retval_to_arg) {
+  for (int i = 0; i < ret_nodes.size(); i++) {
+    Node* n = ret_nodes[i];
+    DataType t;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "T", &t));
+    if (t != DT_RESOURCE) {
+      int new_retval_index = retval_index_mapping->size();
+      retval_index_mapping->insert(std::make_pair(i, new_retval_index));
+      continue;
+    }
+
+    const Edge* e;
+    TF_RETURN_IF_ERROR(n->input_edge(0, &e));
+    if (!e->src()->IsArg()) {
+      return errors::Unimplemented(
+          "Resource _Retval node's input does not come from _Arg "
+          "directly: ",
+          e->DebugString());
+    }
+    Node* arg = e->src();
+    int src_index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(arg->def(), "index", &src_index));
+    resource_retval_to_arg->insert(std::make_pair(i, src_index));
+  }
+  return Status::OK();
+}
+
+// Given original output types and return value index mapping, return the new
+// output types. Notice that DT_RESOURCE will be removed.
+std::vector<DataType> ShuffleOutputDataTypeAttribute(
+    const std::vector<DataType>& out_types,
+    const std::map<int, int>& index_mapping) {
+  std::vector<DataType> result(index_mapping.size());
+  for (int i = 0; i < out_types.size(); i++) {
+    auto iter = index_mapping.find(i);
+    if (iter != index_mapping.end()) {
+      result[iter->second] = out_types[i];
+    }
+  }
+  return result;
+}
+
+// For StatefulPartitionedCall node, given mapping between original input index
+// and rearranged input index, reorder output edges for the node. DT_RESOURCE
+// outputs are removed from the node and we will use the node's corresponding
+// input for the edge.
+Status RearrangeOutputEdges(Node* n, Graph* g,
+                            const std::map<int, int>& retval_index_mapping,
+                            const std::map<int, int>& resource_retval_to_arg) {
+  std::vector<const Edge*> out_edges;
+  for (const Edge* e : n->out_edges()) {
+    if (!e->IsControlEdge()) {
+      out_edges.push_back(e);
+    }
+  }
+  for (const Edge* e : out_edges) {
+    Node* dst = e->dst();
+    int dst_input = e->dst_input();
+    int src_output = e->src_output();
+    auto iter = retval_index_mapping.find(src_output);
+    if (iter == retval_index_mapping.end()) {
+      TF_RET_CHECK(resource_retval_to_arg.find(src_output) !=
+                   resource_retval_to_arg.end());
+      g->RemoveEdge(e);
+      const Edge* input_edge;
+      TF_RETURN_IF_ERROR(
+          n->input_edge(resource_retval_to_arg.at(src_output), &input_edge));
+      g->AddEdge(input_edge->src(), input_edge->src_output(), dst, dst_input);
+    } else {
+      g->RemoveEdge(e);
+      g->AddEdge(n, iter->second, dst, dst_input);
+    }
+  }
+  return Status::OK();
+}
+
+// Given mapping between original output index and rearranged output index,
+// change "index" attribute for _Retval nodes. Notice that DT_RESOURCE _Retval
+// nodes will be removed.
+void RearrangeRetvalNodes(
+    const gtl::InlinedVector<Node*, 4>& ret_nodes,  // non-absl ok
+    Graph* g, const std::map<int, int>& retval_index_mapping) {
+  for (int i = 0; i < ret_nodes.size(); i++) {
+    Node* n = ret_nodes[i];
+    auto iter = retval_index_mapping.find(i);
+    if (iter == retval_index_mapping.end()) {
+      g->RemoveNode(n);
+    } else {
+      n->ClearAttr("index");
+      n->AddAttr("index", iter->second);
+    }
+  }
+}
+
+Status MaybeRewriteWhileNode(Graph* g, Node* n, FunctionLibraryDefinition* fld,
+                             bool* node_rewritten) {
+  // Check if this While node needs rewrite.
+  std::vector<DataType> types;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "T", &types));
+  bool input_need_rearrange;
+  int resource_input_count;
+  std::vector<int> index_mapping;
+  TF_RETURN_IF_ERROR(InputTypesNeedsRearrange(
+      types, &input_need_rearrange, &resource_input_count, &index_mapping));
+  if (!input_need_rearrange) {
+    *node_rewritten = false;
+    return Status::OK();
+  }
+
+  *node_rewritten = true;
+
+  // Modify "T" attribute for this While node.
+  std::vector<DataType> new_types =
+      ShuffleInputDataTypeAttribute(types, index_mapping);
+  n->ClearAttr("T");
+  n->AddAttr("T", new_types);
+
+  // Reorder input and output edges.
+  TF_RETURN_IF_ERROR(ReorderInputEdges(g, n, index_mapping));
+  TF_RETURN_IF_ERROR(ReorderOutputEdges(g, n, types.size(),
+                                        resource_input_count, index_mapping));
+
+  // Modify cond and body functions.
+  for (auto const& attr_name : std::vector<string>{"cond", "body"}) {
+    NameAttrList attr_value;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &attr_value));
+    const FunctionDef* fdef = fld->Find(attr_value.name());
+    TF_RET_CHECK(fdef != nullptr);
+    std::unique_ptr<FunctionBody> fbody;
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBodyHelper(*fdef, AttrSlice(), fld, &fbody));
+
+    // Check that resource _Arg nodes for While node are always returned with
+    // the same index, and we don't have cases like this:
+    // tf.while_loop(
+    //     cond,
+    //     lambda resource_var1, resource_var2: [resource_var2, resource_var1],
+    //     [resource_var1, resource_var2])
+    if (attr_name == "body") {
+      for (int i = 0; i < fbody->ret_nodes.size(); i++) {
+        Node* ret_node = fbody->ret_nodes[i];
+        DataType dtype;
+        TF_RETURN_IF_ERROR(GetNodeAttr(ret_node->def(), "T", &dtype));
+        if (dtype != DT_RESOURCE) {
+          continue;
+        }
+
+        Node* input_node;
+        TF_RETURN_IF_ERROR(ret_node->input_node(0, &input_node));
+        while (input_node->IsIdentity()) {
+          TF_RETURN_IF_ERROR(input_node->input_node(0, &input_node));
+        }
+        if (input_node->IsArg()) {
+          int index;
+          TF_RETURN_IF_ERROR(GetNodeAttr(input_node->def(), "index", &index));
+          if (index != i) {
+            return errors::Unimplemented("While node ", n->DebugString(),
+                                         " has resource _Retval[", i,
+                                         "] coming from _Arg[", index, "]");
+          }
+        } else {
+          return errors::Unimplemented("Encountered node ",
+                                       input_node->DebugString(),
+                                       " while tracing _Arg node for _Retval[",
+                                       i, "] of while node ", n->DebugString());
+        }
+      }
+    }
+
+    RearrangeArgNodes(&fbody->arg_nodes, index_mapping);
+    if (attr_name == "body") {
+      for (int i = 0; i < fbody->ret_nodes.size(); i++) {
+        Node* ret_node = fbody->ret_nodes[i];
+        int new_index = index_mapping.at(i);
+        if (new_index < types.size() - resource_input_count) {
+          ret_node->ClearAttr("index");
+          ret_node->AddAttr("index", new_index);
+        } else {
+          fbody->graph->RemoveNode(ret_node);
+        }
+      }
+    }
+
+    // Save the new FunctionDef.
+    FunctionDef new_fdef;
+    string new_name =
+        fld->UniqueFunctionName(absl::StrCat(attr_value.name(), "_rearrange_"));
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef));
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef));
+
+    // Change node to use rewritten function.
+    attr_value.set_name(new_name);
+    n->ClearAttr(attr_name);
+    n->AddAttr(attr_name, attr_value);
+  }
+  return Status::OK();
+}
+
+Status MaybeRewriteCallNode(Graph* g, Node* n, FunctionLibraryDefinition* fld,
+                            bool* node_rewritten) {
+  // This node needs rewrite when either of these is true:
+  // 1) Tin has DT_RESOURCE which requires rearrange (`input_need_rearrange`);
+  // 2) Tout has DT_RESOURCE (`has_resource_output`).
+  std::vector<DataType> in_types;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tin", &in_types));
+  bool input_need_rearrange;
+  int resource_input_count;
+  std::vector<int> index_mapping;
+  TF_RETURN_IF_ERROR(InputTypesNeedsRearrange(
+      in_types, &input_need_rearrange, &resource_input_count, &index_mapping));
+  std::vector<DataType> out_types;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tout", &out_types));
+  bool has_resource_output = std::find(out_types.begin(), out_types.end(),
+                                       DT_RESOURCE) != out_types.end();
+  if (!input_need_rearrange && !has_resource_output) {
+    *node_rewritten = false;
+    return Status::OK();
+  }
+
+  *node_rewritten = true;
+
+  string attr_name = "f";
+  NameAttrList f;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &f));
+  const FunctionDef* fdef = fld->Find(f.name());
+  TF_RET_CHECK(fdef != nullptr);
+  std::unique_ptr<FunctionBody> fbody;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, AttrSlice(), fld, &fbody));
+
+  if (input_need_rearrange) {
+    // Reorder input edges.
+    TF_RETURN_IF_ERROR(ReorderInputEdges(g, n, index_mapping));
+
+    // Change Tin attribute.
+    std::vector<DataType> new_in_types =
+        ShuffleInputDataTypeAttribute(in_types, index_mapping);
+    n->ClearAttr("Tin");
+    n->AddAttr("Tin", new_in_types);
+
+    // Change _Arg node index.
+    RearrangeArgNodes(&fbody->arg_nodes, index_mapping);
+  }
+
+  if (has_resource_output) {
+    // Resource _Retval must come from resource _Arg directly, or we do not
+    // support it.
+    std::map<int, int> resource_retval_to_arg, retval_index_mapping;
+    TF_RETURN_IF_ERROR(CalculateRetvalRearrange(
+        fbody->ret_nodes, &retval_index_mapping, &resource_retval_to_arg));
+
+    // Rearrange output edges.
+    TF_RETURN_IF_ERROR(RearrangeOutputEdges(n, g, retval_index_mapping,
+                                            resource_retval_to_arg));
+
+    // Change Tout attribute for the node.
+    std::vector<DataType> new_out_types =
+        ShuffleOutputDataTypeAttribute(out_types, retval_index_mapping);
+    n->ClearAttr("Tout");
+    n->AddAttr("Tout", new_out_types);
+
+    // Change index for _Retval nodes.
+    RearrangeRetvalNodes(fbody->ret_nodes, fbody->graph, retval_index_mapping);
+  }
+
+  // Save the new FunctionDef.
+  FunctionDef new_fdef;
+  string new_name =
+      fld->UniqueFunctionName(absl::StrCat(f.name(), "_rearrange_"));
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef));
+  TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef));
+
+  // Change node to use rewritten function.
+  f.set_name(new_name);
+  n->ClearAttr(attr_name);
+  n->AddAttr(attr_name, f);
+  return Status::OK();
+}
+
+Status MaybeRewriteIfNode(Graph* g, Node* n, FunctionLibraryDefinition* fld,
+                          bool* node_rewritten) {
+  // This node needs rewrite when either of these is true:
+  // 1) Tin has DT_RESOURCE which requires rearrange;
+  // 2) Tout has DT_RESOURCE.
+  std::vector<DataType> in_types;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tin", &in_types));
+  bool input_need_rearrange;
+  int resource_input_count;
+  std::vector<int> index_mapping;
+  TF_RETURN_IF_ERROR(InputTypesNeedsRearrange(
+      in_types, &input_need_rearrange, &resource_input_count, &index_mapping));
+  std::vector<DataType> out_types;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tout", &out_types));
+  bool has_resource_output = std::find(out_types.begin(), out_types.end(),
+                                       DT_RESOURCE) != out_types.end();
+  if (!input_need_rearrange && !has_resource_output) {
+    *node_rewritten = false;
+    return Status::OK();
+  }
+
+  *node_rewritten = true;
+
+  if (input_need_rearrange) {
+    // Reorder input edges.
+    std::vector<const Edge*> input_edges;
+    for (const Edge* e : n->in_edges()) {
+      if (e->IsControlEdge() || e->dst_input() == 0) {
+        continue;
+      }
+      input_edges.push_back(e);
+    }
+    for (const Edge* e : input_edges) {
+      Node* src = e->src();
+      int src_output = e->src_output();
+      int dst_input = e->dst_input();
+      int new_dst_input = index_mapping.at(dst_input - 1) + 1;
+      g->RemoveEdge(e);
+      g->AddEdge(src, src_output, n, new_dst_input)->DebugString();
+    }
+
+    // Change Tin attribute.
+    std::vector<DataType> new_in_types =
+        ShuffleInputDataTypeAttribute(in_types, index_mapping);
+    n->ClearAttr("Tin");
+    n->AddAttr("Tin", new_in_types);
+  }
+
+  std::map<int, int> resource_retval_to_arg, retval_index_mapping;
+  for (auto const& attr_name :
+       std::vector<string>{"then_branch", "else_branch"}) {
+    NameAttrList f;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &f));
+    const FunctionDef* fdef = fld->Find(f.name());
+    TF_RET_CHECK(fdef != nullptr);
+    std::unique_ptr<FunctionBody> fbody;
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBodyHelper(*fdef, AttrSlice(), fld, &fbody));
+
+    if (input_need_rearrange) {
+      // Change _Arg node index.
+      RearrangeArgNodes(&fbody->arg_nodes, index_mapping);
+    }
+
+    if (has_resource_output) {
+      // Resource _Retval must come from resource _Arg directly, or we do
+      // not support it.
+      TF_RETURN_IF_ERROR(CalculateRetvalRearrange(
+          fbody->ret_nodes, &retval_index_mapping, &resource_retval_to_arg));
+
+      // Change index for _Retval nodes.
+      RearrangeRetvalNodes(fbody->ret_nodes, fbody->graph,
+                           retval_index_mapping);
+    }
+
+    // Save the new FunctionDef.
+    FunctionDef new_fdef;
+    string new_name =
+        fld->UniqueFunctionName(absl::StrCat(f.name(), "_rearrange_"));
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef));
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef));
+
+    // Change node to use rewritten function.
+    f.set_name(new_name);
+    n->ClearAttr(attr_name);
+    n->AddAttr(attr_name, f);
+  }
+
+  if (has_resource_output) {
+    // Rearrange output edges.
+    std::vector<const Edge*> out_edges;
+    for (const Edge* e : n->out_edges()) {
+      if (!e->IsControlEdge()) {
+        out_edges.push_back(e);
+      }
+    }
+    for (const Edge* e : out_edges) {
+      Node* dst = e->dst();
+      int dst_input = e->dst_input();
+      int src_output = e->src_output();
+      auto iter = retval_index_mapping.find(src_output);
+      if (iter == retval_index_mapping.end()) {
+        TF_RET_CHECK(resource_retval_to_arg.find(src_output) !=
+                     resource_retval_to_arg.end());
+        g->RemoveEdge(e);
+        const Edge* input_edge;
+        TF_RETURN_IF_ERROR(n->input_edge(
+            resource_retval_to_arg.at(src_output) + 1, &input_edge));
+        g->AddEdge(input_edge->src(), input_edge->src_output(), dst, dst_input);
+      } else {
+        g->RemoveEdge(e);
+        g->AddEdge(n, iter->second, dst, dst_input);
+      }
+    }
+
+    // Change Tout attribute for the node.
+    std::vector<DataType> new_out_types =
+        ShuffleOutputDataTypeAttribute(out_types, retval_index_mapping);
+    n->ClearAttr("Tout");
+    n->AddAttr("Tout", new_out_types);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+// Rewrites function `func_name` so that it follows the TF/XLA bridge rules on
+// DT_RESOURCE arguments and return values. Functions associated with its
+// nodes are rewritten first (recursively), then its own While /
+// StatefulPartitionedCall / If nodes. If anything changed, `*modified` is set
+// and the result is stored in `fld` under `new_func_name`.
+// `canonicalized_name_to_new_name` memoizes already-processed functions: the
+// new name if the function was rewritten, nullopt if it needed no rewrite.
+Status RearrangeFunctionArgumentForFunction(
+    const string& func_name, const string& new_func_name,
+    const protobuf::Map<string, tensorflow::AttrValue>& attrs,
+    FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr,
+    std::map<string, absl::optional<string>>* canonicalized_name_to_new_name,
+    bool* modified) {
+  *modified = false;
+
+  // Convert the function to Graph.
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle));
+  // Release the handle on every early-return path. A release failure is
+  // ignored there because the error already being returned takes precedence;
+  // the success path releases explicitly so that its status is propagated.
+  auto cleanup_handle = gtl::MakeCleanup(
+      [&]() { flr->ReleaseHandle(handle).IgnoreError(); });
+  const FunctionBody* body = flr->GetFunctionBody(handle);
+  Graph* g = body->graph;
+
+  // If any node has associated functions, rewrite them first.
+  // Gather nodes with associated functions first, because rewriting those nodes
+  // might involve node deletion/addition. Avoid modifying nodes while iterating
+  // it.
+  std::vector<std::pair<Node*, std::vector<AssociatedFunctionInfo>>>
+      nodes_to_associated_functions;
+  for (auto* n : g->nodes()) {
+    auto associated_functions = GetAssociatedFunctions(*n, fld);
+    if (!associated_functions.empty()) {
+      nodes_to_associated_functions.push_back({n, associated_functions});
+    }
+  }
+  // Iterate by reference: each entry owns a vector of AssociatedFunctionInfo.
+  for (auto& entry : nodes_to_associated_functions) {
+    Node* n = entry.first;
+    for (auto& associated_function : entry.second) {
+      const string& name = associated_function.func_name();
+      string canonicalized_name =
+          Canonicalize(name, AttrSlice(&associated_function.attrs()));
+      auto cached = canonicalized_name_to_new_name->find(canonicalized_name);
+      string new_name;
+      bool function_modified;
+      if (cached != canonicalized_name_to_new_name->end()) {
+        // If we already processed this function, check if it was rewritten. If
+        // the function was rewritten, the entry will be non-empty. Otherwise
+        // the entry will be empty.
+        function_modified = cached->second.has_value();
+        if (function_modified) {
+          new_name = cached->second.value();
+        }
+      } else {
+        if (associated_function.type() ==
+            AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) {
+          // For SymbolicGradient, `name` is always "SymbolicGradient",
+          // which is not very informative. Use node name instead.
+          new_name =
+              fld->UniqueFunctionName(absl::StrCat(n->name(), "_rearrange_"));
+        } else {
+          new_name = fld->UniqueFunctionName(absl::StrCat(name, "_rearrange_"));
+        }
+        TF_RETURN_IF_ERROR(RearrangeFunctionArgumentForFunction(
+            name, new_name, associated_function.attrs(), fld, flr,
+            canonicalized_name_to_new_name, &function_modified));
+        if (function_modified) {
+          // If the function was rewritten, add a non-empty entry. So later we
+          // know we have processed this function, and it was rewritten into
+          // another function.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
+        } else {
+          // If the function was not rewritten, add an empty entry. So later
+          // we know we have processed this function, and it does not need to be
+          // rewritten.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt;
+        }
+      }
+      if (function_modified) {
+        *modified = true;
+
+        // Notice that if "n" is a function call, RewriteAssociatedFunction()
+        // will delete it and create a new node instead, making "n" an invalid
+        // pointer. That's fine because in that case, `entry.second` will only
+        // have one member and the loop will only run once.
+        TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
+            g, n, fld, associated_function, new_name));
+      }
+    }
+  }
+
+  // Rewrite control-flow and call nodes in this function's own graph.
+  for (Node* n : g->nodes()) {
+    bool node_rewritten = false;
+    if (n->type_string() == "While") {
+      TF_RETURN_IF_ERROR(MaybeRewriteWhileNode(g, n, fld, &node_rewritten));
+    } else if (n->type_string() == "StatefulPartitionedCall") {
+      TF_RETURN_IF_ERROR(MaybeRewriteCallNode(g, n, fld, &node_rewritten));
+    } else if (n->type_string() == "If") {
+      TF_RETURN_IF_ERROR(MaybeRewriteIfNode(g, n, fld, &node_rewritten));
+    }
+    if (node_rewritten) {
+      *modified = true;
+    }
+  }
+
+  if (*modified) {
+    // Add rewritten FunctionDef into library.
+    FunctionDef functionalized_fdef;
+    TF_RETURN_IF_ERROR(
+        GraphToFunctionDef(*g, new_func_name, &functionalized_fdef));
+    if (func_name == new_func_name) {
+      VLOG(2) << "Replacing function " << func_name;
+      TF_RETURN_IF_ERROR(
+          fld->ReplaceFunction(new_func_name, functionalized_fdef));
+    } else {
+      VLOG(2) << "Adding function " << new_func_name;
+      TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
+    }
+  }
+
+  // Release explicitly (instead of via the cleanup) so a failure is reported.
+  cleanup_handle.release();
+  return flr->ReleaseHandle(handle);
+}
+
+// Runs the pass: for every XLA compile node in `options.graph`, rearranges
+// the function it refers to and, if rewritten, retargets the node to it.
+Status RearrangeFunctionArgumentPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  Graph* graph = options.graph->get();
+  auto* flib_def = options.flib_def;
+  if (VLOG_IS_ON(4)) {
+    DumpGraphToFile("rearrange_function_argument_before", *graph, flib_def);
+  }
+  // A function library runtime is needed to instantiate functions as graphs.
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
+      new ProcessFunctionLibraryRuntime(
+          /*device_mgr=*/nullptr, options.session_options->env,
+          TF_GRAPH_DEF_VERSION, flib_def, OptimizerOptions()));
+  FunctionLibraryRuntime* flr =
+      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+
+  // XLA compile node types, mapped to the attribute naming their function.
+  static std::map<string, string>* kFunctionAttrForNodeType =
+      new std::map<string, string>{
+          // TPUReplicate ops are generated by EncapsulateTPUComputationsPass.
+          {"TPUReplicate", "computation"},
+          // XlaLaunch ops are generated by EncapsulateXlaComputationsPass.
+          {"XlaLaunch", "function"},
+      };
+  // Shared across nodes so each nested function is processed only once.
+  std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
+  for (Node* node : graph->nodes()) {
+    const auto attr_it = kFunctionAttrForNodeType->find(node->type_string());
+    if (attr_it == kFunctionAttrForNodeType->end()) continue;
+    const string& func_attr = attr_it->second;
+    NameAttrList func;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), func_attr, &func));
+    VLOG(2) << "Graph has node " << node->type_string()
+            << ". Corresponding function: " << func.name();
+    const string new_func_name =
+        flib_def->UniqueFunctionName(absl::StrCat(func.name(), "_rearrange_"));
+    bool modified = false;
+    TF_RETURN_IF_ERROR(RearrangeFunctionArgumentForFunction(
+        func.name(), new_func_name, func.attr(), flib_def, flr,
+        &canonicalized_name_to_new_name, &modified));
+    if (!modified) continue;
+    node->ClearAttr(func_attr);
+    func.set_name(new_func_name);
+    node->AddAttr(func_attr, func);
+  }
+
+  if (VLOG_IS_ON(4)) {
+    DumpGraphToFile("rearrange_function_argument_after", *graph, flib_def);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h
new file mode 100644
index 0000000..98ffd62
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_PASS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_PASS_H_
+
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Rewrites StatefulPartitionedCall/If/While nodes (and their FunctionDefs) in
+// function `func_name` that break the rules listed below, fixing up edges.
+// If anything changed, sets `*modified` and stores the result in `fld` as
+// `new_func_name`; `canonicalized_name_to_new_name` caches processed callees.
+Status RearrangeFunctionArgumentForFunction(
+    const string& func_name, const string& new_func_name,
+    const protobuf::Map<string, tensorflow::AttrValue>& attrs,
+    FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr,
+    std::map<string, absl::optional<string>>* canonicalized_name_to_new_name,
+    bool* modified);
+
+// The TF/XLA bridge expects every FunctionDef to satisfy the following rules:
+// 1. DT_RESOURCE arguments always come last;
+// 2. DT_RESOURCE values are never returned.
+// Functions defined by TensorFlow might not satisfy them, so this pass
+// rewrites the functions of TPUReplicate/XlaLaunch nodes to follow the
+// rules, using RearrangeFunctionArgumentForFunction() above.
+class RearrangeFunctionArgumentPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_PASS_H_
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass_registration.cc
similarity index 62%
copy from tensorflow/core/kernels/logging_ops.h
copy to tensorflow/compiler/tf2xla/rearrange_function_argument_pass_registration.cc
index 92a8d63..0661902 100644
--- a/tensorflow/core/kernels/logging_ops.h
+++ b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass_registration.cc
@@ -13,21 +13,13 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
-#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
-
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
+#include "tensorflow/compiler/tf2xla/rearrange_function_argument_pass.h"
 
 namespace tensorflow {
 
-namespace logging {
+// This pass is required for some AOT backends and all JIT backends, so this
+// file exists as a separate lib and will be linked to both AOT and JIT.
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 28,
+                      RearrangeFunctionArgumentPass);
 
-// Register a listener method to call on any printed messages.
-// Returns true if it is successfully registered.
-bool RegisterListener(void (*listener)(const char*));
-
-}  // namespace logging
 }  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla.proto b/tensorflow/compiler/tf2xla/tf2xla.proto
index 26ba540..3093a0b 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.proto
+++ b/tensorflow/compiler/tf2xla/tf2xla.proto
@@ -38,6 +38,10 @@
 message Fetch {
   TensorId id = 1;
   string name = 2;  // Optional name for generated code.
+
+  // Optional shape and data type. If specified, may be used for validation.
+  TensorShapeProto shape = 3;
+  DataType type = 4;
 }
 
 // Variable represents a resource variable with the given name, shape and type.
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 403e579..c14519c 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -58,18 +58,13 @@
 
   // Make sure that even tensors with 0 elements have allocated
   // buffers, so they get ids to track.
-  bool ShouldAllocateEmptyTensors() const override { return true; }
-
- private:
-  // Don't run any constructors or destructors for complex objects,
-  // since there is no backing store for the tensor to run them
-  // on. strings are the only complex objects currently stored in
-  // Tensors. If others are added, this set of overrides must be
-  // extended to include them.
-  void RunStringCtor(string* p, size_t n) override {}
-  void RunStringDtor(string* p, size_t n) override {}
-  void RunResourceCtor(ResourceHandle* p, size_t n) override {}
-  void RunResourceDtor(ResourceHandle* p, size_t n) override {}
+  //
+  // NOTE: It is the caller's responsibility to track whether an allocated
+  // object is a buffer or an opaque handle. In particular, when this allocator
+  // is used, the caller must not run any constructors or destructors for
+  // complex objects, since there is no backing store for the tensor in which to
+  // place their outputs.
+  bool AllocatesOpaqueHandle() const override { return true; }
 };
 
 XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options,
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index 2047132..5420cf3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -33,7 +33,7 @@
       program_shape_(static_data.program_shape_),
       hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) {
   bool allocate_entry_params =
-      alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS;
+      alloc_mode == AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS;
   // Allocate arg and temp buffers.
   alloc_buffer_table_ = xla::cpu_function_runtime::MallocContiguousBuffers(
       static_data.buffer_infos_, static_data.num_buffers_,
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 9fe4873..5e452b5 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -105,7 +105,7 @@
   // AllocMode controls the buffer allocation mode.
   enum class AllocMode {
     // Allocate all buffers - args, results, profile and temps.
-    ARGS_RESULTS_PROFILES_AND_TEMPS,
+    ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS,
 
     // Only allocate result, profile and temp buffers.
     // Use set_arg_data to set argument buffers before Run is called.
@@ -114,7 +114,8 @@
 
   explicit XlaCompiledCpuFunction(
       const StaticData& static_data,
-      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
+      AllocMode alloc_mode =
+          AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
 
   XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 456f7ab..3b87b52 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -106,7 +106,6 @@
       int index;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       TF_RET_CHECK(index >= 0) << "Negative _Retval index";
-      TF_ASSIGN_OR_RETURN(retval_cores[index], get_sharding_for_node(n));
       retval_cores[index] = core;
     }
   }
@@ -234,6 +233,11 @@
       }
 
       case XlaExpression::Kind::kResource:
+        // Resources are pushed into elems later when processing resource
+        // arguments. This is correct as long as the input and output resources
+        // are in the same order. In the case of functionalized while body,
+        // this property is guaranteed since a corresponding output is always
+        // created for a DT_RESOURCE input in a corresponding location.
         output.is_constant = false;
         output.input_index = retval.resource()->arg_num();
         output.shape = retval.resource()->shape();
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 406d5ba..1cc5d8d 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -339,7 +339,7 @@
     // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
     // allocate most or all available memory on the device, leaving none for the
     // compiler to access, unless it can use TensorFlow's allocator.
-    xla::DeviceMemoryAllocator* device_allocator = nullptr;
+    se::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 3df014f..16f18c0 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -1576,5 +1576,120 @@
   ASSERT_TRUE(output1.is_tensor_list);
 }
 
+// Tests that the compiler supports a While op whose body takes DT_RESOURCE
+// variables as inputs and passes them through as outputs.
+TEST_F(XlaCompilerTest, WhileWithResources) {
+  FunctionDefLibrary fdef_lib;
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
+  // Build cond fn for While: arg0 < 10. The resource args are unused.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_RESOURCE, 2);
+    auto less = ops::Less(scope, arg0, ops::Const<int32>(scope, 10));
+    (void)ops::_Retval(scope.WithOpName("ret"), less, 0);
+    TF_ASSERT_OK(scope.ToGraph(graph.get()));
+    FunctionDef fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(*graph, "cond", &fdef));
+    TF_ASSERT_OK(flib_def.AddFunctionDef(fdef));
+  }
+  // Build body fn: ret0 = arg0 + read1 - read2; resources pass through.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_RESOURCE, 2);
+    auto read1 = ops::ReadVariableOp(scope.WithOpName("read1"), arg1, DT_INT32);
+    auto plus_read1 = ops::Add(scope, arg0, read1);
+    auto read2 = ops::ReadVariableOp(scope.WithOpName("read2"), arg2, DT_INT32);
+    auto minus_read2 = ops::Sub(scope, plus_read1, read2);
+    (void)ops::_Retval(scope.WithOpName("ret0"), minus_read2, 0);
+    (void)ops::_Retval(scope.WithOpName("ret1"), arg1, 1);
+    (void)ops::_Retval(scope.WithOpName("ret2"), arg2, 2);
+    TF_ASSERT_OK(scope.ToGraph(graph.get()));
+    FunctionDef fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(*graph, "body", &fdef));
+    TF_ASSERT_OK(flib_def.AddFunctionDef(fdef));
+  }
+
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
+  auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1);
+  auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_RESOURCE, 2);
+
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op = ops::While(
+      scope, std::initializer_list<Input>{arg0, arg1, arg2}, cond_fn, body_fn);
+
+  (void)ops::_Retval(scope.WithOpName("ret0"), while_op.output[0], 0);
+  (void)ops::_Retval(scope.WithOpName("ret1"), while_op.output[1], 1);
+  (void)ops::_Retval(scope.WithOpName("ret2"), while_op.output[2], 2);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Describe the arguments: an int32 scalar and two initialized variables.
+  std::vector<XlaCompiler::Argument> args(3);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({});
+  args[2].kind = XlaCompiler::Argument::kResource;
+  args[2].resource_kind = XlaResource::kVariable;
+  args[2].initialized = true;
+  args[2].type = DT_INT32;
+  args[2].shape = TensorShape({});
+
+  // Compiles the graph.
+  XlaCompiler::Options options = DefaultOptions();
+  options.flib_def = &flib_def;
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompileOptions compile_options = XlaCompiler::CompileOptions();
+  compile_options.return_updated_values_for_all_resources = true;
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "tested_while_with_vars",
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
+  ASSERT_EQ(result.outputs.size(), 3);
+  const XlaCompiler::OutputDescription& output1 = result.outputs[1];
+  ASSERT_EQ(output1.input_index, 1);
+  const XlaCompiler::OutputDescription& output2 = result.outputs[2];
+  ASSERT_EQ(output2.input_index, 2);
+
+  // Run with arg0=0, var1=2, var2=1; arg0 gains 1 per iteration until 10.
+  xla::Literal literal0 = xla::LiteralUtil::CreateR0<int32>(0);
+  xla::Literal literal1 = xla::LiteralUtil::CreateR0<int32>(2);
+  xla::Literal literal2 = xla::LiteralUtil::CreateR0<int32>(1);
+  std::unique_ptr<xla::GlobalData> data0 =
+      client_->TransferToServer(literal0).ConsumeValueOrDie();
+  std::unique_ptr<xla::GlobalData> data1 =
+      client_->TransferToServer(literal1).ConsumeValueOrDie();
+  std::unique_ptr<xla::GlobalData> data2 =
+      client_->TransferToServer(literal2).ConsumeValueOrDie();
+
+  std::unique_ptr<xla::GlobalData> actual =
+      client_
+          ->Execute(*result.computation,
+                    {data0.get(), data1.get(), data2.get()})
+          .ConsumeValueOrDie();
+  xla::Literal actual_literal = client_->Transfer(*actual).ConsumeValueOrDie();
+
+  xla::Literal expected0 = xla::LiteralUtil::CreateR0<int32>(10);
+  xla::Literal expected1 = xla::LiteralUtil::CreateR0<int32>(2);
+  xla::Literal expected2 = xla::LiteralUtil::CreateR0<int32>(1);
+  xla::Literal expected_literal =
+      xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2});
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index d21ecb6..dd5a52e 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -126,6 +126,16 @@
   return type;
 }
 
+// Returns the XLA primitive type of input `name`. If the type has no XLA
+// equivalent, sets an error status and returns xla::PRIMITIVE_TYPE_INVALID.
+xla::PrimitiveType XlaOpKernelContext::InputXlaType(absl::string_view name) {
+  xla::PrimitiveType xla_type;
+  Status conversion = DataTypeToPrimitiveType(InputType(name), &xla_type);
+  if (conversion.ok()) return xla_type;
+  SetStatus(conversion);
+  return xla::PRIMITIVE_TYPE_INVALID;
+}
+
 Status XlaOpKernelContext::ConstantInput(int index,
                                          xla::Literal* constant_literal) {
   return ConstantInputReshaped(
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index cc2d5e8..7794786 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -81,6 +81,11 @@
   // xla::PRIMITIVE_TYPE_INVALID.
   xla::PrimitiveType input_xla_type(int index);
 
+  // Returns the type of input `name` as an xla::PrimitiveType. If the type
+  // is not representable as an XLA type, sets an error status and returns
+  // xla::PRIMITIVE_TYPE_INVALID.
+  xla::PrimitiveType InputXlaType(absl::string_view name);
+
   // Returns the shape of input `index`.
   TensorShape InputShape(int index);
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 490feb6..95d1bf2 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -115,6 +115,10 @@
 
     // If we should cluster operations returning DT_VARIANT.
     bool cluster_variant_ops = false;
+
+    // Whether ops known to be slow or to have correctness issues should be
+    // auto-clustered.
+    bool cluster_slow_and_inaccurate_ops = false;
   };
 
   // Registers an XLA backend. `compilation_device_name` is the name of the
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 1e5d170..91f33ff 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -18,8 +18,7 @@
     ],
 )
 
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
@@ -85,8 +84,8 @@
     ],
     visibility = [":friends"],
     deps = [
+        ":debug_options_flags",
         ":xla_proto",
-        "//tensorflow/compiler/xla:debug_options_flags",
     ],
 )
 
@@ -190,6 +189,7 @@
         ":xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -812,7 +812,7 @@
     hdrs = ["parse_flags_from_env.h"],
     deps =
         [
-            "//tensorflow/compiler/xla:types",
+            ":types",
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
             "@com_google_absl//absl/strings",
@@ -827,7 +827,7 @@
     deps =
         [
             ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
+            ":types",
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
             "//tensorflow/core:test",
@@ -847,7 +847,7 @@
         [
             ":parse_flags_from_env",
             ":status",
-            "//tensorflow/compiler/xla:xla_proto",
+            ":xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
             "@com_google_absl//absl/strings",
@@ -871,7 +871,7 @@
     ],
     deps =
         [
-            "//tensorflow/compiler/xla:xla_proto",
+            ":xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index d5ade8f..b800229 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -96,7 +96,7 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -117,7 +117,6 @@
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -126,6 +125,7 @@
         "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
         "@llvm//:support",
@@ -165,11 +165,11 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compile_only_service",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
     ],
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 62d225c..33d1de3 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -31,7 +31,6 @@
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,6 +38,7 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index f2d124d..d5de53a 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -22,12 +22,12 @@
 namespace xla {
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   device_allocator_ = allocator;
   return *this;
 }
 
-DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }
 
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 1d85fb3..e2e2319 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -18,11 +18,11 @@
 
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -57,11 +57,11 @@
   // want to run various algorithms on the device and pick the fastest one -- it
   // might allocate buffers for use by these algorithms using this allocator.
   //
-  // This does not need to be the same as the DeviceMemoryAllocator passed when
-  // running the executable.
+  // This does not need to be the same as the se::DeviceMemoryAllocator passed
+  // when running the executable.
   ExecutableBuildOptions& set_device_allocator(
-      DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* device_allocator() const;
+      se::DeviceMemoryAllocator* allocator);
+  se::DeviceMemoryAllocator* device_allocator() const;
 
   // Returns a string representation of the build options, suitable for
   // debugging.
@@ -77,7 +77,7 @@
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   int num_replicas_ = 1;
 };
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 1ddd3c2..8e306a6 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -471,6 +471,7 @@
 xla_test(
     name = "svd_test",
     srcs = ["svd_test.cc"],
+    # Blacklisted because the tests are flaky.
     blacklisted_backends = [
         "cpu",
         "gpu",
diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc
index c620c98..11a79a2 100644
--- a/tensorflow/compiler/xla/client/lib/comparators.cc
+++ b/tensorflow/compiler/xla/client/lib/comparators.cc
@@ -32,8 +32,7 @@
 namespace xla {
 namespace {
 
-using XlaOpGenerator = XlaOp (*)(const XlaOp&, const XlaOp&,
-                                 absl::Span<const int64>);
+using XlaOpGenerator = XlaOp (*)(XlaOp, XlaOp, absl::Span<const int64>);
 
 XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
                                             int64 bit_width) {
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 75bda22..3d15101 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -528,28 +528,149 @@
 
 XlaOp Atan(XlaOp x) { return Atan2(x, ScalarLike(x, 1.0)); }
 
-XlaOp Tan(XlaOp x) { return Sin(x) / Cos(x); }
+XlaOp Tan(XlaOp x) {
+  return DoWithUpcastToF32(x, {F16}, [](XlaOp x) { return Sin(x) / Cos(x); });
+}
 
 // Hyperbolic trigonometric functions.
 
-// acosh(x) = log(x + sqrt(x^2 - 1))
+// acosh(x) = log(x + sqrt(x^2 - 1))      if x >= -1
 //          = log(x + sqrt((x+1)*(x-1)))
+// acosh(x) = nan                         if x < -1
+//
+// If x^2 will overflow, we approximate sqrt(x^2 - 1) == x and compute as
+// log(2*x) = log(2) + log(x).  (Note this works because negative x never
+// overflows; x < -1 simply yields nan.  This is quite different than asinh!)
 XlaOp Acosh(XlaOp x) {
-  return Log(x + Sqrt((x + ScalarLike(x, 1.0)) * (x - ScalarLike(x, 1.0))));
+  XlaBuilder* b = x.builder();
+  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x));
+
+    auto one = ScalarLike(x, 1);
+    auto neg_one = ScalarLike(x, -1);
+    auto nan = FullLike(x, std::numeric_limits<float>::quiet_NaN());
+
+    // return
+    //
+    //   nan                        if x < -1
+    //   log(x) + log(2)            if x >= sqrt_max_value
+    //   log(x + sqrt((x+1)*(x-1))) otherwise
+    //
+    // TODO(jlebar): For now, we ignore the question of overflow if x is a
+    // complex type, because we don't yet have exhaustive tests for complex trig
+    // functions.
+    auto naive_result = Log(x + Sqrt((x + one) * (x - one)));
+    if (primitive_util::IsComplexType(shape.element_type())) {
+      return naive_result;
+    }
+    auto overflow_result = Log(x) + Log(ScalarLike(x, 2));
+
+    auto sqrt_max_value = Sqrt(MaxFiniteValue(b, shape.element_type()));
+    return Select(Lt(x, neg_one), nan,
+                  Select(Ge(x, sqrt_max_value), overflow_result, naive_result));
+  });
 }
 
 // asinh(x) = log(x + sqrt(x^2 + 1))
-XlaOp Asinh(XlaOp x) { return Log(x + Sqrt(x * x + ScalarLike(x, 1.0))); }
+//
+// If x^2 will overflow and x is positive, we can approximate x + sqrt(x^2 + 1)
+// as 2*x and return log(2) + log(x).
+//
+// If x is negative, the above would give us some trouble; we can't approximate
+// the result as x + abs(x) = 0!  But we're saved by the fact that asinh(-x) =
+// -asinh(x).
+XlaOp Asinh(XlaOp x) {
+  XlaBuilder* b = x.builder();
+  auto do_it = [&](XlaOp x) -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x));
+    auto one = ScalarLike(x, 1);
 
-// atanh(x) = 0.5 * log((1 + x) / (1 - x))
-XlaOp Atanh(XlaOp x) {
-  return Log((ScalarLike(x, 1.0) + x) / (ScalarLike(x, 1.0) - x)) *
-         ScalarLike(x, 0.5);
+    // Let a = abs(x).  Compute
+    //
+    //   y = log(a + sqrt(a*a + 1))  if a < sqrt_max_value, or
+    //   y = log(a) + log(2)         otherwise
+    //
+    // and then return
+    //
+    //   y * sign(x).
+    //
+    // TODO(jlebar): For now, we ignore the question of overflow if x is a
+    // complex type, because we don't yet have exhaustive tests for complex trig
+    // functions.
+    if (primitive_util::IsComplexType(shape.element_type())) {
+      return Log(x + Sqrt(x * x + one));
+    }
+    auto a = Abs(x);
+    auto naive_result = Log(a + Sqrt(a * a + one));
+    auto overflow_result = Log(Abs(a)) + Log(ScalarLike(a, 2));
+    auto sqrt_max_value = Sqrt(MaxFiniteValue(b, shape.element_type()));
+    return Sign(x) *
+           Select(Ge(a, sqrt_max_value), overflow_result, naive_result);
+  };
+  // These upcasts are not strictly necessary on all platforms to get within our
+  // error tolerances, so we could relax this if it ever mattered.
+  return DoWithUpcastToF32(x, {BF16, F16}, [&](XlaOp x) {
+    return b->ReportErrorOrReturn(do_it(x));
+  });
 }
 
-XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); }
+// atanh(x) = 0.5 * log((1 + x) / (1 - x)) if abs(x) <= 1
+// atanh(x) = nan                          otherwise
+XlaOp Atanh(XlaOp x) {
+  XlaBuilder* b = x.builder();
+  auto do_it = [&](XlaOp x) -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x));
+    auto naive_result =
+        Log((ScalarLike(x, 1.0) + x) / (ScalarLike(x, 1.0) - x)) *
+        ScalarLike(x, 0.5);
 
-XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); }
+    // TODO(jlebar): For now, we ignore the nan edge case for complex inputs,
+    // because we don't yet have exhaustive tests for complex trig functions.
+    if (primitive_util::IsComplexType(shape.element_type())) {
+      return naive_result;
+    }
+
+    auto nan = FullLike(x, std::numeric_limits<float>::quiet_NaN());
+    return Select(Gt(Abs(x), ScalarLike(x, 1)), nan, naive_result);
+  };
+  return DoWithUpcastToF32(x, {BF16}, [&](XlaOp x) {  //
+    return b->ReportErrorOrReturn(do_it(x));
+  });
+}
+
+// Cosh(x) = (e^x + e^-x) / 2
+//         = e^(x + log(1/2)) + e^(-x + log(1/2)).
+//
+// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not
+// inf.
+//
+// This incorrectly overflows to inf for two f32 input values, namely
+// +/-89.4159851, due to rounding error when computing x +/- log(1/2).  The
+// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so
+// we deem this acceptable.
+XlaOp Cosh(XlaOp x) {
+  return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) {
+    auto log_one_half = Log(ScalarLike(x, 0.5));
+    return Exp(x + log_one_half) + Exp(-x + log_one_half);
+  });
+}
+
+// Sinh(x) = (e^x - e^-x) / 2
+//         = e^(x + log(1/2)) - e^(-x + log(1/2)).
+//
+// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not
+// inf.
+//
+// This incorrectly overflows to +/-inf for two f32 input values, namely
+// +/-89.4159851, due to rounding error when computing x +/- log(1/2).  The
+// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so
+// we deem this acceptable.
+XlaOp Sinh(XlaOp x) {
+  return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) {
+    auto log_one_half = Log(ScalarLike(x, 0.5));
+    return Exp(x + log_one_half) - Exp(-x + log_one_half);
+  });
+}
 
 XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
   XlaBuilder* builder = x.builder();
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 50613ce..d042999 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -180,7 +180,7 @@
       shape = ShapeUtil::MakeShape(ty, {42});
     } else if (ty == PrimitiveType::TUPLE) {
       shape = ShapeUtil::MakeTupleShape({});
-    } else if (ty == PrimitiveType::OPAQUE) {
+    } else if (ty == PrimitiveType::OPAQUE_TYPE) {
       shape = ShapeUtil::MakeOpaqueShape();
     } else if (ty == PrimitiveType::TOKEN) {
       shape = ShapeUtil::MakeTokenShape();
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
index 9500f9a..93f3d3a 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -253,6 +253,11 @@
 }
 
 XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  return BatchDot(x, false, y, false, precision);
+}
+
+XlaOp BatchDot(XlaOp x, bool transpose_x, XlaOp y, bool transpose_y,
+               PrecisionConfig::Precision precision) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
@@ -296,10 +301,20 @@
         return InvalidArgument("Expected batch dot dimension to be equal or 1");
       }
     }
-    x_config.push_back(ndims - 2);
-    x_config.push_back(ndims);
-    y_config.push_back(ndims);
-    y_config.push_back(ndims - 1);
+    if (transpose_x) {
+      x_config.push_back(ndims);
+      x_config.push_back(ndims - 2);
+    } else {
+      x_config.push_back(ndims - 2);
+      x_config.push_back(ndims);
+    }
+    if (transpose_y) {
+      y_config.push_back(ndims - 1);
+      y_config.push_back(ndims);
+    } else {
+      y_config.push_back(ndims);
+      y_config.push_back(ndims - 1);
+    }
     output_config.push_back(ndims - 2);
     output_config.push_back(ndims - 1);
     if (!x_implicit_broadcast.empty()) {
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
index 60c41ec..5f1ca96 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -17,6 +17,7 @@
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
 #include <array>
+
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -73,6 +74,9 @@
 xla::XlaOp BatchDot(
     xla::XlaOp x, xla::XlaOp y,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+xla::XlaOp BatchDot(
+    xla::XlaOp x, bool transpose_x, xla::XlaOp y, bool transpose_y,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
 
 // Parse an einsum string into dimension numbers:
 //   "ab,cb->ac"
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index 13ad598..77ebb75 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -24,6 +24,15 @@
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
+
+xla::XlaOp ConcatScalars(xla::XlaBuilder* builder,
+                         absl::Span<const xla::XlaOp> scalars) {
+  std::vector<xla::XlaOp> vectors;
+  absl::c_transform(scalars, std::back_inserter(vectors),
+                    [](xla::XlaOp x) { return xla::Reshape(x, {1}); });
+  return ConcatInDim(builder, vectors, 0);
+}
+
 namespace {
 
 // Rotates a 32-bit integer 'v' left by 'distance' bits.
@@ -109,7 +118,7 @@
 }
 
 // Converts a uint64 to two uint32s.
-ThreeFry2x32State Uint64ToUint32s(XlaOp u64) {
+std::array<XlaOp, 2> Uint64ToUint32s(XlaOp u64) {
   XlaBuilder* builder = u64.builder();
   XlaOp const32 = ConstantR0WithType(builder, U64, 32);
   XlaOp fst = ConvertElementType(u64, U32);
@@ -118,7 +127,7 @@
 }
 
 // Converts two uint32s to a uint64.
-XlaOp Uint32sToUint64(ThreeFry2x32State u32s) {
+XlaOp Uint32sToUint64(std::array<XlaOp, 2> u32s) {
   XlaBuilder* builder = u32s[0].builder();
   return ConvertElementType(u32s[0], U64) |
          ShiftLeft(ConvertElementType(u32s[1], U64),
@@ -168,6 +177,215 @@
           inputs_state.second};
 }
 
+// The key of the Philox random number generator.
+using Philox4x32Key = std::array<XlaOp, 2>;
+// The internal state of the Philox random number generator.
+using Philox4x32State = std::array<XlaOp, 4>;
+
+// Computes the Philox4x32 algorithm using 10 rounds.
+Philox4x32State Philox4x32(Philox4x32State state, Philox4x32Key key) {
+  // Constants specified by the Philox algorithm.
+  static const uint32 kPhiloxW32A = 0x9E3779B9;
+  static const uint32 kPhiloxW32B = 0xBB67AE85;
+  static const uint32 kPhiloxM4x32A = 0xD2511F53;
+  static const uint32 kPhiloxM4x32B = 0xCD9E8D57;
+
+  struct HighLowPair {
+    XlaOp high;
+    XlaOp low;
+  };
+
+  // Compute the high and low words from multiplying two 32-bit integers.
+  auto mul_hi_low = [](XlaOp x, uint32 k) {
+    auto product =
+        ConvertElementType(x, U64) * ConstantR0<uint64>(x.builder(), k);
+    auto low = ConvertElementType(product, U32);
+    auto high =
+        ConvertElementType(product >> ConstantR0<uint64>(x.builder(), 32), U32);
+    return HighLowPair{high, low};
+  };
+
+  // Perform a single round of the Philox algorithm.
+  auto philox_round = [&](Philox4x32State x, Philox4x32Key key) {
+    auto product0 = mul_hi_low(x[0], kPhiloxM4x32A);
+    auto product1 = mul_hi_low(x[2], kPhiloxM4x32B);
+    return Philox4x32State{product1.high ^ x[1] ^ key[0], product1.low,
+                           product0.high ^ x[3] ^ key[1], product0.low};
+  };
+
+  // Update the key after a round of Philox algorithm.
+  auto raise_key = [](Philox4x32Key key) {
+    XlaBuilder* builder = key[0].builder();
+    return Philox4x32Key{key[0] + ConstantR0<uint32>(builder, kPhiloxW32A),
+                         key[1] + ConstantR0<uint32>(builder, kPhiloxW32B)};
+  };
+
+  static const int kNumRounds = 10;
+  for (int round = 0; round < kNumRounds; ++round, key = raise_key(key)) {
+    state = philox_round(state, key);
+  }
+  return state;
+}
+
+// Scrambles the input key so that users don't need to worry about which part
+// of the key needs to be strong.
+std::pair<Philox4x32State, Philox4x32Key> ScramblePhiloxKey(Philox4x32Key key) {
+  XlaBuilder* builder = key[0].builder();
+  XlaOp key0 = ConvertElementType(key[0], U64);
+  XlaOp key1 = ConvertElementType(key[1], U64);
+
+  Philox4x32State state = {
+      ConvertElementType(key0, U32),
+      ConvertElementType(key0 >> ScalarLike(key0, 32), U32),
+      ConvertElementType(key1, U32),
+      ConvertElementType(key1 >> ScalarLike(key1, 32), U32),
+  };
+  key = {ConstantR0<uint32>(builder, 0x3ec8f720),
+         ConstantR0<uint32>(builder, 0x02461e29)};
+  state = Philox4x32(state, key);
+  XlaOp zero = ConstantR0<uint32>(builder, 0);
+  return {Philox4x32State{zero, zero, state[2], state[3]},
+          Philox4x32Key{state[0], state[1]}};
+}
+
+// Adds a U64 tensor to a U128 tensor. The U128 tensor is represented as two
+// U64s with the low 64bits in the front. This routine supports explicit
+// broadcasting of the U128 tensor, with `broadcast_sizes` representing the
+// dimensions prepended to its shape.
+std::array<XlaOp, 2> Uint128AddUint64(
+    const std::array<XlaOp, 2>& u128, XlaOp u64,
+    absl::Span<const int64> broadcast_sizes = {}) {
+  auto u128_low = u128[0];
+  auto u128_high = u128[1];
+  XlaOp new_u128_low = u128_low + u64;
+  XlaOp one = ConstantR0<uint64>(u128[0].builder(), 1);
+  XlaOp new_u128_high = Select(Lt(new_u128_low, u128_low),
+                               Broadcast(u128_high + one, broadcast_sizes),
+                               Broadcast(u128_high, broadcast_sizes));
+  return {new_u128_low, new_u128_high};
+}
+
+std::array<XlaOp, 2> Uint32sToUint128(const std::array<XlaOp, 4>& u32s) {
+  return {Uint32sToUint64({u32s[0], u32s[1]}),
+          Uint32sToUint64({u32s[2], u32s[3]})};
+}
+
+std::array<XlaOp, 4> Uint128ToUint32s(const std::array<XlaOp, 2>& u128) {
+  std::array<XlaOp, 2> u128_low_32s = Uint64ToUint32s(u128[0]);
+  std::array<XlaOp, 2> u128_high_32s = Uint64ToUint32s(u128[1]);
+  return {u128_low_32s[0], u128_low_32s[1], u128_high_32s[0], u128_high_32s[1]};
+}
+
+std::array<XlaOp, 2> Uint128FromOp(XlaOp op) {
+  auto u128_low = xla::Reshape(xla::Slice(op, {0}, {1}, {1}), {});
+  auto u128_high = xla::Reshape(xla::Slice(op, {1}, {2}, {1}), {});
+  return {u128_low, u128_high};
+}
+
+XlaOp Uint128ToOp(std::array<XlaOp, 2> u128) {
+  return ConcatScalars(u128[0].builder(), {u128[0], u128[1]});
+}
+
+// Returns the pair (state + [0, 1, ..., n-1], state + n), which should be used
+// as the inputs fed to `Philox4x32` and the updated state. `state` is a U128
+// represented as 4 U32s in the order from the least significant one to the most
+// significant one.
+std::pair<Philox4x32State, XlaOp> GetPhiloxInputsAndUpdatedState(
+    const Philox4x32State& state, int64 n) {
+  XlaBuilder* builder = state[0].builder();
+  XlaOp iota = Iota(builder, U64, n);
+  auto state_u128 = Uint32sToUint128(state);
+  auto inputs = Uint128ToUint32s(Uint128AddUint64(state_u128, iota, {n}));
+  XlaOp new_state =
+      Uint128ToOp(Uint128AddUint64(state_u128, ConstantR0<uint64>(builder, n)));
+  return std::make_pair(inputs, new_state);
+}
+
+// Generates CeilOfRatio(num_elems, 4)*4 32bit Philox random numbers, as Philox
+// numbers are generated in the unit of 128bits.
+std::pair<Philox4x32State, XlaOp> GeneratePhiloxBits(int64 num_elems,
+                                                     XlaOp initial_state,
+                                                     Philox4x32Key key,
+                                                     bool scramble) {
+  Philox4x32State state;
+  if (scramble) {
+    // When `scramble` is true, `initial_state` is not used. This is because
+    // scramble is true only when this function is called by stateless random
+    // ops, for which `initial_state` is always zero.
+    std::tie(state, key) = ScramblePhiloxKey(key);
+  } else {
+    state = Uint128ToUint32s(Uint128FromOp(initial_state));
+  }
+  const int64 num_vector4 = CeilOfRatio<int64>(num_elems, 4);
+  Philox4x32State inputs;
+  XlaOp new_state;
+  std::tie(inputs, new_state) =
+      GetPhiloxInputsAndUpdatedState(state, num_vector4);
+  auto outputs = Philox4x32(inputs, key);
+  return std::make_pair(outputs, new_state);
+}
+
+// Generates an array of primitive type U32 with the given shape containing
+// random bits generated by the Philox algorithm. Returns the array and the new
+// state of the random number generator.
+RngOutput PhiloxRngBit32(XlaOp op_key, XlaOp initial_state, const Shape& shape,
+                         bool scramble) {
+  XlaBuilder* builder = op_key.builder();
+  const int64 num_elems = ShapeUtil::ElementsIn(shape);
+
+  Philox4x32Key key = Uint64ToUint32s(op_key);
+  Philox4x32State bits;
+  XlaOp new_state;
+  std::tie(bits, new_state) =
+      GeneratePhiloxBits(num_elems, initial_state, key, scramble);
+  // Combining bits[i] in a round-robin fashion, to align with non-XLA
+  // implementations
+  int64 bits_len = (num_elems + 3) / 4;
+  for (auto i = 0; i < 4; ++i) {
+    bits[i] = Reshape(bits[i], {bits_len, 1});
+  }
+  XlaOp numbers = ConcatInDim(builder, {bits[0], bits[1], bits[2], bits[3]},
+                              /*dimension=*/1);
+  numbers = Reshape(numbers, {bits_len * 4});
+  numbers = Slice(numbers, /*start_indices=*/{0},
+                  /*limit_indices=*/{num_elems},
+                  /*strides=*/{1});
+  return {Reshape(numbers, AsInt64Slice(shape.dimensions())), new_state};
+}
+
+// Generates an array of primitive type U64 with the given shape containing
+// random bits generated by the Philox algorithm. Returns the array and the new
+// state of the random number generator.
+RngOutput PhiloxRngBit64(XlaOp op_key, XlaOp initial_state, const Shape& shape,
+                         bool scramble) {
+  XlaBuilder* builder = op_key.builder();
+  const int64 num_elems = ShapeUtil::ElementsIn(shape);
+
+  Philox4x32Key key = Uint64ToUint32s(op_key);
+  Philox4x32State bits32;
+  XlaOp new_state;
+  std::tie(bits32, new_state) =
+      GeneratePhiloxBits(num_elems * 2, initial_state, key, scramble);
+
+  std::array<XlaOp, 2> bits64;
+  bits64[0] = Uint32sToUint64({bits32[0], bits32[1]});
+  bits64[1] = Uint32sToUint64({bits32[2], bits32[3]});
+
+  // Combining bits64[i] in a round-robin fashion, to align with non-XLA
+  // implementations
+  int64 bits64_len = (num_elems + 1) / 2;
+  for (auto i = 0; i < 2; ++i) {
+    bits64[i] = Reshape(bits64[i], {bits64_len, 1});
+  }
+  XlaOp numbers = ConcatInDim(builder, {bits64[0], bits64[1]},
+                              /*dimension=*/1);
+  numbers = Reshape(numbers, {bits64_len * 2});
+  numbers = Slice(numbers, /*start_indices=*/{0},
+                  /*limit_indices=*/{num_elems},
+                  /*strides=*/{1});
+  return {Reshape(numbers, AsInt64Slice(shape.dimensions())), new_state};
+}
+
 XlaOp ConvertRandomBitsToUniformF32(XlaOp bits, XlaOp minval, XlaOp maxval) {
   XlaBuilder* builder = bits.builder();
   // Form 23 random mantissa bits, with a leading 1 bit. The leading 1 bit
@@ -235,6 +453,26 @@
   }
 }
 
+RngOutput PhiloxBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape,
+                             bool scramble) {
+  PrimitiveType type = shape.element_type();
+  switch (type) {  // Dispatch on the bit width of the requested element type.
+    case F32:
+    case U32:
+    case S32:
+      return PhiloxRngBit32(key, initial_state, shape, scramble);
+    case U64:
+    case S64:
+      return PhiloxRngBit64(key, initial_state, shape, scramble);
+    default:  // Unsupported type: report the error, leave the state unchanged.
+      return {key.builder()->ReportError(Unimplemented(
+                  "Types other than F32, U32, S32, U64 and S64 "
+                  "are not implemented by PhiloxBitGenerator; got %s",
+                  primitive_util::LowercasePrimitiveTypeName(type))),
+              initial_state};
+  }
+}
+
 RngOutput UniformF32Distribution(XlaOp key, XlaOp initial_state,
                                  BitGeneratorTy bit_generator, XlaOp minval,
                                  XlaOp maxval, const Shape& shape) {
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h
index 4cca47c..fcd1dbf 100644
--- a/tensorflow/compiler/xla/client/lib/prng.h
+++ b/tensorflow/compiler/xla/client/lib/prng.h
@@ -50,6 +50,19 @@
 RngOutput ThreeFryBitGenerator(XlaOp key, XlaOp initial_state,
                                const xla::Shape& shape);
 
+// Implements the Philox algorithm to generate random numbers in parallel.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+//   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+//
+// The paper presents a few variants of the Philox algorithm, we picked the
+// 4x32_10 version of the algorithm for the following reasons:
+//   . 4x32 uses 32-bit multiplication which is fast on GPUs.
+//   . The authors recommend the 10-round variant, and TensorFlow also uses it.
+// 'scramble' controls whether to scramble 'key' and 'initial_state' to form
+// the actual key and state fed to the Philox algorithm.
+RngOutput PhiloxBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape,
+                             bool scramble);
+
 // Uses the given bit generator to generate random bits and then converts the
 // random bits to random numbers of uniform distribution in the given range.
 // Returns the random numbers and the state of the random number generator.
@@ -70,6 +83,10 @@
                                 BitGeneratorTy bit_generator,
                                 const xla::Shape& shape);
 
+// Concatenates scalars into a vector.
+xla::XlaOp ConcatScalars(xla::XlaBuilder* builder,
+                         absl::Span<const xla::XlaOp> scalars);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_
diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc
index 640412e..5a7c826 100644
--- a/tensorflow/compiler/xla/client/lib/qr.cc
+++ b/tensorflow/compiler/xla/client/lib/qr.cc
@@ -101,7 +101,7 @@
 
   auto sigma_is_zero = Eq(sigma, zero);
 
-  *beta = Select(sigma_is_zero, alpha, -Sign(alpha) * mu);
+  *beta = Select(sigma_is_zero, alpha, Select(Lt(alpha, zero), one, -one) * mu);
   *tau = Select(sigma_is_zero, Broadcast(zero, batch_dims),
                 (*beta - alpha) / *beta);
   auto divisor =
@@ -192,7 +192,7 @@
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
     auto vva = BatchDot(v_broadcast, a, precision);
-    vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision);
+    vva = BatchDot(v_broadcast, true, vva, false, precision);
     a = a - Mul(tau, vva,
                 /*broadcast_dimensions=*/batch_dim_indices);
 
@@ -271,7 +271,7 @@
     auto beta = DynamicSliceInMinorDims(taus, {j}, {1});
 
     // yv has shape [..., n, 1]
-    auto yv = BatchDot(TransposeInMinorDims(y), v, precision);
+    auto yv = BatchDot(y, true, v, false, precision);
     // wyv has shape [..., m, 1]
     auto wyv = BatchDot(w, yv, precision);
 
@@ -365,7 +365,7 @@
 
     // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:]))
     auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n});
-    auto a_update = BatchDot(TransposeInMinorDims(w), a_panel, precision);
+    auto a_update = BatchDot(w, true, a_panel, false, precision);
     a_update = BatchDot(y, a_update, precision);
     a_panel = a_panel + a_update;
     a = UpdateSliceInMinorDims(a, a_panel, {i, i + k});
@@ -373,7 +373,7 @@
     // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T))
     auto q_panel = SliceInMinorDims(q, {0, i}, {m, m});
     auto q_update = BatchDot(q_panel, w, precision);
-    q_update = BatchDot(q_update, TransposeInMinorDims(y), precision);
+    q_update = BatchDot(q_update, false, y, true, precision);
     q_panel = q_panel + q_update;
     q = UpdateSliceInMinorDims(q, q_panel, {0, i});
   }
diff --git a/tensorflow/compiler/xla/client/lib/qr_test.cc b/tensorflow/compiler/xla/client/lib/qr_test.cc
index b27d364..a61f243 100644
--- a/tensorflow/compiler/xla/client/lib/qr_test.cc
+++ b/tensorflow/compiler/xla/client/lib/qr_test.cc
@@ -60,6 +60,33 @@
                              xla::ErrorSpec(1e-4, 1e-4));
 }
 
+XLA_TEST_F(QrTest, ZeroDiagonal) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {0, 1, 1},
+      {1, 0, 1},
+      {1, 1, 0},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      xla::QRDecomposition(a, /*full_matrices=*/true, /*block_size=*/8));
+
+  // Verifies that the decomposition composes back to the original matrix.
+  //
+  // This isn't a terribly demanding test, (e.g., we should verify that Q is
+  // orthonormal and R is upper-triangular) but it's awkward to write such tests
+  // without more linear algebra libraries. It's easier to test the numerics
+  // from Python, anyway, where we have access to numpy and scipy.
+  xla::BatchDot(result.q, result.r, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR2<float>(&builder, a_vals, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
 XLA_TEST_F(QrTest, SimpleBatched) {
   xla::XlaBuilder builder(TestName());
 
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index ddc39f4..49b3a4f 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -31,11 +31,9 @@
         ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
     XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    // TODO(b/122298745): Get rid of Neg() and use CreateScalarGtComputation
-    // once the TPU backend supports the comparison computations.
     XlaOp sort_result =
-        Sort({Neg(input), iota_s32},
-             CreateScalarLtComputation({input_shape.element_type(), S32},
+        Sort({input, iota_s32},
+             CreateScalarGtComputation({input_shape.element_type(), S32},
                                        iota_s32.builder()),
              last_dim, /*is_stable=*/true);
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
@@ -43,8 +41,8 @@
     limit_indices[last_dim] = k;
     std::vector<int64> strides(input_shape.dimensions_size(), 1);
 
-    XlaOp values = Neg(Slice(GetTupleElement(sort_result, 0), start_indices,
-                             limit_indices, strides));
+    XlaOp values = Slice(GetTupleElement(sort_result, 0), start_indices,
+                         limit_indices, strides);
     XlaOp indices = Slice(GetTupleElement(sort_result, 1), start_indices,
                           limit_indices, strides);
     return Tuple(builder, {values, indices});
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index 0fbd138..3bba84d 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -44,8 +44,7 @@
   ComputeAndCompareR1<int>(&builder, {0, 1, 2}, {});
 }
 
-// TODO(b/119930279): enable this test.
-XLA_TEST_F(SortingTest, DISABLED_TopKFullSortMinInt) {
+XLA_TEST_F(SortingTest, TopKFullSortMinInt) {
   XlaBuilder builder(TestName());
   auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
                                           std::numeric_limits<int>::min() + 1,
@@ -54,18 +53,6 @@
   ComputeAndCompareR1<int>(&builder, {2, 1, 0}, {});
 }
 
-XLA_TEST_F(SortingTest, NOT_TopKFullSortMinInt) {
-  XlaBuilder builder(TestName());
-  auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
-                                          std::numeric_limits<int>::min() + 1,
-                                          std::numeric_limits<int>::max()});
-  xla::GetTupleElement(xla::TopK(x_rev, 3), 1);
-  // TopK currently negates the keys, which doesn't work correctly for
-  // std::numeric_limits<int>::min(). Therefore, it will sort this key to the
-  // front instead of to the back.
-  ComputeAndCompareR1<int>(&builder, {0, 2, 1}, {});
-}
-
 XLA_TEST_F(SortingTest, TopKFullSort) {
   XlaBuilder builder(TestName());
   const int kSize = 16;
diff --git a/tensorflow/compiler/xla/client/lib/svd.cc b/tensorflow/compiler/xla/client/lib/svd.cc
index dd4547d..53a2387 100644
--- a/tensorflow/compiler/xla/client/lib/svd.cc
+++ b/tensorflow/compiler/xla/client/lib/svd.cc
@@ -163,9 +163,8 @@
   HouseHolderResult result;
   result.v = v;
   result.beta = beta;
-  result.a =
-      Sub(a, Mul(beta, BatchDot(BatchDot(a, TransposeInMinorDims(v), precision),
-                                v, precision)));
+  result.a = Sub(a, Mul(beta, BatchDot(BatchDot(a, false, v, true, precision),
+                                       v, precision)));
 
   return result;
 }
@@ -231,8 +230,8 @@
   result.v = v;
   result.beta = beta;
   result.a = Sub(
-      a, Mul(beta, BatchDot(v, BatchDot(TransposeInMinorDims(v), a, precision),
-                            precision)));
+      a, Mul(beta, BatchDot(v, false, BatchDot(v, true, a, false, precision),
+                            false, precision)));
 
   return result;
 }
@@ -290,18 +289,16 @@
 
     TF_ASSIGN_OR_RETURN(HouseHolderResult house_col,
                         HouseCol(a, i, i, eps, precision));
-    u = Sub(u, Mul(house_col.beta,
-                   BatchDot(BatchDot(u, house_col.v, precision),
-                            TransposeInMinorDims(house_col.v), precision)));
+    u = Sub(u,
+            Mul(house_col.beta, BatchDot(BatchDot(u, house_col.v, precision),
+                                         false, house_col.v, true, precision)));
     a = house_col.a;
 
     TF_ASSIGN_OR_RETURN(HouseHolderResult house_row,
                         HouseRow(a, i, i + one, eps, precision));
-    v = Sub(
-        v,
-        Mul(house_row.beta,
-            BatchDot(BatchDot(v, TransposeInMinorDims(house_row.v), precision),
-                     house_row.v, precision)));
+    v = Sub(v, Mul(house_row.beta,
+                   BatchDot(BatchDot(v, false, house_row.v, true, precision),
+                            house_row.v, precision)));
     a = house_row.a;
 
     std::vector<XlaOp> updated_values;
@@ -331,11 +328,10 @@
       XlaOp index = ScalarLike(values[0], n - k);
       TF_ASSIGN_OR_RETURN(HouseHolderResult house_col,
                           HouseCol(values[3], index, index, eps, precision));
-      values[1] =
-          Sub(values[1],
-              Mul(house_col.beta,
-                  BatchDot(BatchDot(values[1], house_col.v, precision),
-                           TransposeInMinorDims(house_col.v), precision)));
+      values[1] = Sub(values[1],
+                      Mul(house_col.beta,
+                          BatchDot(BatchDot(values[1], house_col.v, precision),
+                                   false, house_col.v, true, precision)));
       values[3] = house_col.a;
     }
   }
@@ -751,23 +747,20 @@
 
   d = BroadcastInDim(d, dimensions, broadcast_dims);
 
-  // As m >= n, only first m columns vectors are needed to be permuted, and the
-  // rest of m - n vectors are appended after the sorting is done.
+  // As m >= n, only the first n column vectors need to be permuted; the
+  // remaining m - n vectors are appended after the sorting is done.
   XlaOp sort_u_result =
-      Sort({-d, SliceInMinorDims(result.u, {0, 0}, {m, n})},
-           CreateScalarLtComputation(
+      Sort({d, SliceInMinorDims(result.u, {0, 0}, {m, n})},
+           CreateScalarGtComputation(
                {shape.element_type(), shape.element_type()}, builder),
            num_dims - 1);
 
-  // TODO(kuny): using CreateScalarGtComputation after b/124862300 is fixed.
   XlaOp sort_v_result =
-      Sort({SliceInMinorDims(-d, {0, 0}, {n, n}), result.v},
-           CreateScalarLtComputation(
+      Sort({SliceInMinorDims(d, {0, 0}, {n, n}), result.v},
+           CreateScalarGtComputation(
                {shape.element_type(), shape.element_type()}, builder),
            num_dims - 1);
-  // Make sure all the signular values are non-negative.
-  result.d = Max(-GetMatrixDiagonal(GetTupleElement(sort_v_result, 0)),
-                 ScalarLike(d, 0.0));
+  result.d = GetMatrixDiagonal(GetTupleElement(sort_v_result, 0));
 
   result.v = GetTupleElement(sort_v_result, 1);
   result.v = Mul(
diff --git a/tensorflow/compiler/xla/client/lib/svd_test.cc b/tensorflow/compiler/xla/client/lib/svd_test.cc
index c3c6ae9..a987f7f 100644
--- a/tensorflow/compiler/xla/client/lib/svd_test.cc
+++ b/tensorflow/compiler/xla/client/lib/svd_test.cc
@@ -77,11 +77,10 @@
     auto u = result.u;
     auto d = result.d;
 
-    auto zero = Zero(builder, S32);
     if (m > n) {
-      u = DynamicSliceInMinorDims(u, {zero, zero}, {m, n});
+      u = SliceInMinorDims(u, {0, 0}, {m, n});
     } else if (m < n) {
-      v = DynamicSliceInMinorDims(v, {zero, zero}, {n, m});
+      v = SliceInMinorDims(v, {0, 0}, {n, m});
     }
 
     int num_dims = u_shape.rank();
@@ -92,25 +91,6 @@
                     PrecisionConfig::HIGHEST);
   }
 
-  Array3D<float> ExtractTriangularMatrix(const Array3D<float>& matrix,
-                                         bool lower) {
-    Array3D<float> result(matrix);
-    for (int i = 0; i < result.n1(); ++i) {
-      for (int j = 0; j < result.n2(); ++j) {
-        if (lower) {
-          for (int k = j + 1; k < result.n3(); ++k) {
-            result({i, j, k}) = 0.0;
-          }
-        } else {
-          for (int k = 0; k < j; ++k) {
-            result({i, j, k}) = 0.0;
-          }
-        }
-      }
-    }
-    return result;
-  }
-
   XlaOp GetAverageAbsoluteError(XlaOp m1, XlaOp m2, XlaBuilder* builder) {
     Shape shape = builder->GetShape(m1).ValueOrDie();
     int64 size = 1;
@@ -268,7 +248,7 @@
   Array2D<float> a_val = GenerateRandomMatrix(512, 512);
   XlaOp a;
   auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
-  auto result = SVD(a, 100, 1e-6);
+  auto result = SVD(a, 100, 1e-4);
   GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
 
   ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index dcb9d77..1bd9d7b 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -140,7 +140,8 @@
   return Status::OK();
 }
 
-StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
+StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>>
+LocalExecutable::RunHelper(
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutableRunOptions run_options) {
   TF_RETURN_IF_ERROR(
@@ -149,7 +150,7 @@
   StreamPool::Ptr stream;
   if (run_options.stream() == nullptr) {
     // NB!  The lifetime of `stream` needs to match the lifetime of
-    // `actual_options` (otherwise we will end up using a returned stream in
+    // `service_options` (otherwise we will end up using a returned stream in
     // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if"
     // scope.
     TF_ASSIGN_OR_RETURN(
@@ -167,12 +168,29 @@
   //    backend_->eigen_intra_op_thread_pool().
   ServiceExecutableRunOptions service_options(run_options,
                                               backend_->StreamBorrower());
+  return std::make_pair(std::move(service_options), std::move(stream));
+}
+
+StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
+    const absl::Span<const ShapedBuffer* const> arguments,
+    ExecutableRunOptions run_options) {
+  TF_ASSIGN_OR_RETURN(auto options_and_stream,
+                      RunHelper(arguments, run_options));
 
   if (executable_->dumping_snapshot()) {
-    return ExecuteAndDump(&service_options, arguments);
+    return ExecuteAndDump(&options_and_stream.first, arguments);
   }
   return executable_->ExecuteOnStreamWrapper(
-      &service_options, run_options.execution_profile(), arguments);
+      &options_and_stream.first, run_options.execution_profile(), arguments);
+}
+
+StatusOr<ScopedShapedBuffer> LocalExecutable::RunAsync(
+    const absl::Span<const ShapedBuffer* const> arguments,
+    ExecutableRunOptions run_options) {
+  TF_ASSIGN_OR_RETURN(auto options_and_stream,
+                      RunHelper(arguments, run_options));
+  return executable_->ExecuteAsyncOnStream(&options_and_stream.first,
+                                           arguments);
 }
 
 StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
@@ -261,7 +279,7 @@
 
 StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index c9f8b26..1e7c97d 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -24,7 +24,6 @@
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -32,6 +31,7 @@
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -43,6 +43,12 @@
       const absl::Span<const ShapedBuffer* const> arguments,
       ExecutableRunOptions run_options);
 
+  // Similar to Run(), but need not block the host waiting for the computation
+  // to complete before returning.
+  StatusOr<ScopedShapedBuffer> RunAsync(
+      const absl::Span<const ShapedBuffer* const> arguments,
+      ExecutableRunOptions run_options);
+
   // Return the options used to build the executable.
   const ExecutableBuildOptions& build_options() const { return build_options_; }
 
@@ -86,6 +92,10 @@
   // Returns a literal containing the contents of the given ShapedBuffer.
   StatusOr<Literal> LiteralFromShapedBuffer(const ShapedBuffer& shaped_buffer);
 
+  StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>> RunHelper(
+      const absl::Span<const ShapedBuffer* const> arguments,
+      ExecutableRunOptions run_options);
+
   // The ordinal of the device which this executable was compiled for. The
   // executable can run on all equivalent devices (as determined by
   // Backend::devices_equivalent).
@@ -127,7 +137,7 @@
   // device is used.
   StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      DeviceMemoryAllocator* allocator = nullptr);
+      se::DeviceMemoryAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
   StatusOr<TransferToServerResponse> TransferToLocalServer(
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index cc0e02d..1fa52a1 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -35,6 +35,7 @@
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -554,33 +555,50 @@
                             const XlaOp& ehs) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
-    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs));
-    TF_ASSIGN_OR_RETURN(
-        Shape shape, ShapeInference::InferTernaryOpShape(triop, lhs_shape,
-                                                         rhs_shape, ehs_shape));
-    *instr.mutable_shape() = shape.ToProto();
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
-    if (!shape.IsTuple()) {
-      if (!lhs_shape.IsTuple() &&
-          !ShapeUtil::SameDimensions(shape, lhs_shape)) {
-        // lhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs));
+    // The client API supports implicit broadcast for kSelect and kClamp, but
+    // XLA does not support implicit broadcast. Make implicit broadcast explicit
+    // and update the operands.
+    if (triop == HloOpcode::kSelect || triop == HloOpcode::kClamp) {
+      TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+      TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+      TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs));
+
+      // Remember the last non-scalar array operand shape; scalars broadcast to
+      // it. Iterate over pointers so the operand shapes are not deep-copied.
+      absl::optional<Shape> non_scalar_shape;
+      for (const Shape* shape : {&lhs_shape, &rhs_shape, &ehs_shape}) {
+        if (shape->IsArray() && shape->rank() != 0) non_scalar_shape = *shape;
       }
-      if (!rhs_shape.IsTuple() &&
-          !ShapeUtil::SameDimensions(shape, rhs_shape)) {
-        // rhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs));
-      }
-      if (!ehs_shape.IsTuple() &&
-          !ShapeUtil::SameDimensions(shape, ehs_shape)) {
-        // ehs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs));
+      if (non_scalar_shape.has_value()) {
+        if (ShapeUtil::IsScalar(lhs_shape)) {
+          TF_ASSIGN_OR_RETURN(updated_lhs,
+                              AddBroadcastSequence(*non_scalar_shape, lhs));
+        }
+        if (ShapeUtil::IsScalar(rhs_shape)) {
+          TF_ASSIGN_OR_RETURN(updated_rhs,
+                              AddBroadcastSequence(*non_scalar_shape, rhs));
+        }
+        if (ShapeUtil::IsScalar(ehs_shape)) {
+          TF_ASSIGN_OR_RETURN(updated_ehs,
+                              AddBroadcastSequence(*non_scalar_shape, ehs));
+        }
       }
     }
+
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(updated_lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(updated_rhs));
+    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(updated_ehs));
+    StatusOr<Shape> status_or_shape = ShapeInference::InferTernaryOpShape(
+        triop, lhs_shape, rhs_shape, ehs_shape);
+    if (!status_or_shape.status().ok()) {
+      return InvalidArgument(
+          "%s Input scalar shapes may have been changed to non-scalar shapes.",
+          status_or_shape.status().error_message());
+    }
+    *instr.mutable_shape() = status_or_shape.ConsumeValueOrDie().ToProto();
     return AddInstruction(std::move(instr), triop,
                           {updated_lhs, updated_rhs, updated_ehs});
   });
@@ -1034,18 +1052,6 @@
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    // If one operand is a scalar, just multiply the two operands.
-    if (ShapeUtil::IsScalar(lhs_shape) || ShapeUtil::IsScalar(rhs_shape)) {
-      if (dimension_numbers.rhs_batch_dimensions_size() != 0 ||
-          dimension_numbers.lhs_batch_dimensions_size() != 0 ||
-          dimension_numbers.rhs_contracting_dimensions_size() != 0 ||
-          dimension_numbers.lhs_contracting_dimensions_size() != 0) {
-        return InvalidArgument(
-            "Dots with scalar operands must have no contracting or batch "
-            "dimensions");
-      }
-      return xla::Mul(lhs, rhs);
-    }
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
@@ -1511,7 +1517,7 @@
     }
     *instr.mutable_shape() = shape.ToProto();
     instr.set_custom_call_target(call_target_name);
-    instr.set_custom_call_opaque(opaque);
+    instr.set_backend_config(opaque);
     if (operand_shapes_with_layout.has_value()) {
       if (!LayoutUtil::HasLayout(shape)) {
         return InvalidArgument(
@@ -1571,122 +1577,6 @@
   });
 }
 
-namespace {
-// Switch from a floating point value to a integer value in such a way that when
-// using the integer value to compare, we get the same result for normal values,
-// and -Nan is treated as the smallest value, and Nan is treated as the largest
-// value.
-// If f is a float, and
-// x = bit_cast<int32>(f);
-// y = x < 0 ? numeric_limits<int32>::max() - x : x;
-// then y is ordered as an int32 such that finite values have the obvious order,
-// -0 is ordered before 0, and -NaN and NaN appear at the beginning and end of
-// the ordering.
-// Note that in order to avoid -x to overflow, we calculate
-// numeric_limits<int32>::max() - x as unsigned, and then convert back to
-// signed.
-XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
-                                            int64 bit_width) {
-  PrimitiveType signed_type;
-  PrimitiveType unsigned_type;
-  XlaOp max_value;
-  switch (bit_width) {
-    case 16:
-      max_value =
-          ConstantR0(value.builder(),
-                     static_cast<uint16>(std::numeric_limits<int16>::max()));
-      signed_type = S16;
-      unsigned_type = U16;
-      break;
-    case 32:
-      max_value =
-          ConstantR0(value.builder(),
-                     static_cast<uint32>(std::numeric_limits<int32>::max()));
-      signed_type = S32;
-      unsigned_type = U32;
-      break;
-    case 64:
-      max_value =
-          ConstantR0(value.builder(),
-                     static_cast<uint64>(std::numeric_limits<int64>::max()));
-      signed_type = S64;
-      unsigned_type = U64;
-      break;
-    default:
-      return value.builder()->ReportError(
-          InvalidArgument("Invalid bit width %lld for Comparator floating "
-                          "point parameter.",
-                          bit_width));
-  }
-  auto signed_value = BitcastConvertType(value, signed_type);
-  auto unsigned_value = BitcastConvertType(value, unsigned_type);
-  auto flipped_value =
-      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
-  auto is_negative =
-      Lt(signed_value,
-         ConstantLiteral(value.builder(), LiteralUtil::Zero(signed_type)));
-  return Select(is_negative, flipped_value, signed_value);
-}
-}  // namespace
-
-XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
-                       int64 dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    std::vector<XlaOp> operands{keys};
-    for (const XlaOp& value : values) {
-      operands.push_back(value);
-    }
-    // Build the default less-than comparator (copied from lib/comparators.cc).
-    // TODO(b/122298745): Remove the deprecated API method so that this code
-    // duplication can be deleted.
-    auto b = this->CreateSubBuilder("comparator");
-    std::vector<PrimitiveType> operand_types;
-    for (const XlaOp& operand : operands) {
-      TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(operand));
-      operand_types.push_back(operand_shape.element_type());
-    }
-
-    int64 parameter_count = 0;
-    XlaOp first_lhs_param;
-    XlaOp first_rhs_param;
-
-    for (auto operand_type : operand_types) {
-      auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
-      auto lhs_param =
-          b->Parameter(parameter_count * 2, scalar_shape,
-                       absl::StrCat("p.", parameter_count, ".lhs"));
-      auto rhs_param =
-          b->Parameter(parameter_count * 2 + 1, scalar_shape,
-                       absl::StrCat("p.", parameter_count, ".rhs"));
-      if (parameter_count == 0) {
-        first_lhs_param = lhs_param;
-        first_rhs_param = rhs_param;
-      }
-      ++parameter_count;
-    }
-    if (primitive_util::IsFloatingPointType(operand_types[0])) {
-      PrimitiveType compare_type = operand_types[0];
-      // Special-case handling for BF16. We currently do not support direct
-      // comparisons with BF16, so we convert to F32 and then use the F32
-      // comparison logic.
-      if (compare_type == BF16) {
-        compare_type = F32;
-        first_lhs_param = b->ConvertElementType(first_lhs_param, F32);
-        first_rhs_param = b->ConvertElementType(first_rhs_param, F32);
-      }
-      int64 bit_width = primitive_util::BitWidth(compare_type);
-      first_lhs_param =
-          BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
-      first_rhs_param =
-          BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
-    }
-    Lt(first_lhs_param, first_rhs_param);
-
-    TF_ASSIGN_OR_RETURN(auto comparator, b->Build());
-    return Sort(operands, comparator, dimension, /*is_stable=*/false);
-  });
-}
-
 XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
                        const XlaComputation& comparator, int64 dimension,
                        bool is_stable) {
@@ -1899,10 +1789,20 @@
                               const XlaComputation& true_computation,
                               const XlaOp& false_operand,
                               const XlaComputation& false_computation) {
-  // The index of true_computation must be 0 and that of false computation
-  // must be 1.
-  return Conditional(predicate, {&true_computation, &false_computation},
-                     {true_operand, false_operand});
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(xla::Shape shape, GetShape(predicate));
+
+    if (!ShapeUtil::IsScalar(shape) || shape.element_type() != PRED) {
+      return InvalidArgument(
+          "Argument to predicated-Conditional is not a scalar of PRED type "
+          "(%s).",
+          ShapeUtil::HumanString(shape));
+    }
+    // The index of true_computation must be 0 and that of false computation
+    // must be 1.
+    return ConditionalImpl(predicate, {&true_computation, &false_computation},
+                           {true_operand, false_operand});
+  });
 }
 
 XlaOp XlaBuilder::Conditional(
@@ -1910,6 +1810,22 @@
     absl::Span<const XlaComputation* const> branch_computations,
     absl::Span<const XlaOp> branch_operands) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(xla::Shape shape, GetShape(branch_index));
+
+    if (!ShapeUtil::IsScalar(shape) || shape.element_type() != S32) {
+      return InvalidArgument(
+          "Argument to indexed-Conditional is not a scalar of S32 type (%s).",
+          ShapeUtil::HumanString(shape));
+    }
+    return ConditionalImpl(branch_index, branch_computations, branch_operands);
+  });
+}
+
+XlaOp XlaBuilder::ConditionalImpl(
+    const XlaOp& branch_index,
+    absl::Span<const XlaComputation* const> branch_computations,
+    absl::Span<const XlaOp> branch_operands) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape& branch_index_shape,
@@ -2851,63 +2767,63 @@
   return builder->ConstantLiteral(literal);
 }
 
-XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes) {
+XlaOp Broadcast(const XlaOp operand, absl::Span<const int64> broadcast_sizes) {
   return operand.builder()->Broadcast(operand, broadcast_sizes);
 }
 
-XlaOp BroadcastInDim(const XlaOp& operand,
+XlaOp BroadcastInDim(const XlaOp operand,
                      const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions) {
   return operand.builder()->BroadcastInDim(operand, out_dim_size,
                                            broadcast_dimensions);
 }
 
-XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+XlaOp Pad(const XlaOp operand, const XlaOp padding_value,
           const PaddingConfig& padding_config) {
   return operand.builder()->Pad(operand, padding_value, padding_config);
 }
 
-XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
+XlaOp Reshape(const XlaOp operand, absl::Span<const int64> dimensions,
               absl::Span<const int64> new_sizes) {
   return operand.builder()->Reshape(operand, dimensions, new_sizes);
 }
 
-XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes) {
+XlaOp Reshape(const XlaOp operand, absl::Span<const int64> new_sizes) {
   return operand.builder()->Reshape(operand, new_sizes);
 }
 
-XlaOp Collapse(const XlaOp& operand, absl::Span<const int64> dimensions) {
+XlaOp Collapse(const XlaOp operand, absl::Span<const int64> dimensions) {
   return operand.builder()->Collapse(operand, dimensions);
 }
 
-XlaOp Slice(const XlaOp& operand, absl::Span<const int64> start_indices,
+XlaOp Slice(const XlaOp operand, absl::Span<const int64> start_indices,
             absl::Span<const int64> limit_indices,
             absl::Span<const int64> strides) {
   return operand.builder()->Slice(operand, start_indices, limit_indices,
                                   strides);
 }
 
-XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+XlaOp SliceInDim(const XlaOp operand, int64 start_index, int64 limit_index,
                  int64 stride, int64 dimno) {
   return operand.builder()->SliceInDim(operand, start_index, limit_index,
                                        stride, dimno);
 }
 
-XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+XlaOp DynamicSlice(const XlaOp operand, const XlaOp start_indices,
                    absl::Span<const int64> slice_sizes) {
   return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
 }
-XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+XlaOp DynamicSlice(const XlaOp operand, absl::Span<const XlaOp> start_indices,
                    absl::Span<const int64> slice_sizes) {
   return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
 }
 
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                         const XlaOp& start_indices) {
+XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update,
+                         const XlaOp start_indices) {
   return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
 }
 
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update,
                          absl::Span<const XlaOp> start_indices) {
   return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
 }
@@ -2917,11 +2833,11 @@
   return builder->ConcatInDim(operands, dimension);
 }
 
-void Trace(const string& tag, const XlaOp& operand) {
+void Trace(const string& tag, const XlaOp operand) {
   return operand.builder()->Trace(tag, operand);
 }
 
-XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false) {
+XlaOp Select(const XlaOp pred, const XlaOp on_true, const XlaOp on_false) {
   return pred.builder()->Select(pred, on_true, on_false);
 }
 
@@ -2929,60 +2845,60 @@
   return builder->Tuple(elements);
 }
 
-XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index) {
+XlaOp GetTupleElement(const XlaOp tuple_data, int64 index) {
   return tuple_data.builder()->GetTupleElement(tuple_data, index);
 }
 
-XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Eq(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq);
 }
 
-XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Ne(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe);
 }
 
-XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Ge(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe);
 }
 
-XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Gt(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt);
 }
 
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Le(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe);
 }
 
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Lt(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt);
 }
 
-XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Compare(const XlaOp lhs, const XlaOp rhs,
               absl::Span<const int64> broadcast_dimensions,
               ComparisonDirection direction) {
   return lhs.builder()->BinaryOp(HloOpcode::kCompare, lhs, rhs,
                                  broadcast_dimensions, direction);
 }
 
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Dot(const XlaOp lhs, const XlaOp rhs,
           const PrecisionConfig* precision_config) {
   return lhs.builder()->Dot(lhs, rhs, precision_config);
 }
 
-XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp DotGeneral(const XlaOp lhs, const XlaOp rhs,
                  const DotDimensionNumbers& dimension_numbers,
                  const PrecisionConfig* precision_config) {
   return lhs.builder()->DotGeneral(lhs, rhs, dimension_numbers,
                                    precision_config);
 }
 
-XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Conv(const XlaOp lhs, const XlaOp rhs,
            absl::Span<const int64> window_strides, Padding padding,
            int64 feature_group_count, int64 batch_group_count,
            const PrecisionConfig* precision_config) {
@@ -2991,7 +2907,7 @@
                              precision_config);
 }
 
-XlaOp ConvWithGeneralPadding(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ConvWithGeneralPadding(const XlaOp lhs, const XlaOp rhs,
                              absl::Span<const int64> window_strides,
                              absl::Span<const std::pair<int64, int64>> padding,
                              int64 feature_group_count, int64 batch_group_count,
@@ -3002,7 +2918,7 @@
 }
 
 XlaOp ConvWithGeneralDimensions(
-    const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
+    const XlaOp lhs, const XlaOp rhs, absl::Span<const int64> window_strides,
     Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
     int64 feature_group_count, int64 batch_group_count,
     const PrecisionConfig* precision_config) {
@@ -3011,7 +2927,7 @@
       batch_group_count, precision_config);
 }
 
-XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ConvGeneral(const XlaOp lhs, const XlaOp rhs,
                   absl::Span<const int64> window_strides,
                   absl::Span<const std::pair<int64, int64>> padding,
                   const ConvolutionDimensionNumbers& dimension_numbers,
@@ -3022,7 +2938,7 @@
                                     batch_group_count, precision_config);
 }
 
-XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ConvGeneralDilated(const XlaOp lhs, const XlaOp rhs,
                          absl::Span<const int64> window_strides,
                          absl::Span<const std::pair<int64, int64>> padding,
                          absl::Span<const int64> lhs_dilation,
@@ -3036,7 +2952,7 @@
       precision_config);
 }
 
-XlaOp Fft(const XlaOp& operand, FftType fft_type,
+XlaOp Fft(const XlaOp operand, FftType fft_type,
           absl::Span<const int64> fft_length) {
   return operand.builder()->Fft(operand, fft_type, fft_length);
 }
@@ -3083,7 +2999,7 @@
   return builder->Infeed(shape, config);
 }
 
-void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+void Outfeed(const XlaOp operand, const Shape& shape_with_layout,
              const string& outfeed_config) {
   return operand.builder()->Outfeed(operand, shape_with_layout, outfeed_config);
 }
@@ -3108,103 +3024,103 @@
                              operand_shapes_with_layout);
 }
 
-XlaOp Complex(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Complex(const XlaOp lhs, const XlaOp rhs,
               absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kComplex, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Conj(const XlaOp& operand) {
+XlaOp Conj(const XlaOp operand) {
   return Complex(Real(operand), Neg(Imag(operand)));
 }
 
-XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Add(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kAdd, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Sub(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kSubtract, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Mul(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kMultiply, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Div(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kDivide, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Rem(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kRemainder, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Max(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kMaximum, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Min(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kMinimum, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp And(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kAnd, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Or(const XlaOp lhs, const XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kOr, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Xor(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kXor, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Not(const XlaOp& operand) {
+XlaOp Not(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kNot, operand);
 }
 
-XlaOp PopulationCount(const XlaOp& operand) {
+XlaOp PopulationCount(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kPopulationCount, operand);
 }
 
-XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftLeft(const XlaOp lhs, const XlaOp rhs,
                 absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kShiftLeft, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftRightArithmetic(const XlaOp lhs, const XlaOp rhs,
                            absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftRightLogical(const XlaOp lhs, const XlaOp rhs,
                         absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+XlaOp Reduce(const XlaOp operand, const XlaOp init_value,
              const XlaComputation& computation,
              absl::Span<const int64> dimensions_to_reduce) {
   return operand.builder()->Reduce(operand, init_value, computation,
@@ -3221,12 +3137,12 @@
                          dimensions_to_reduce);
 }
 
-XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+XlaOp ReduceAll(const XlaOp operand, const XlaOp init_value,
                 const XlaComputation& computation) {
   return operand.builder()->ReduceAll(operand, init_value, computation);
 }
 
-XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+XlaOp ReduceWindow(const XlaOp operand, const XlaOp init_value,
                    const XlaComputation& computation,
                    absl::Span<const int64> window_dimensions,
                    absl::Span<const int64> window_strides, Padding padding) {
@@ -3236,7 +3152,7 @@
 }
 
 XlaOp ReduceWindowWithGeneralPadding(
-    const XlaOp& operand, const XlaOp& init_value,
+    const XlaOp operand, const XlaOp init_value,
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
@@ -3248,19 +3164,19 @@
       base_dilations, window_dilations, padding);
 }
 
-XlaOp CrossReplicaSum(const XlaOp& operand,
+XlaOp CrossReplicaSum(const XlaOp operand,
                       absl::Span<const ReplicaGroup> replica_groups) {
   return operand.builder()->CrossReplicaSum(operand, replica_groups);
 }
 
-XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
+XlaOp CrossReplicaSum(const XlaOp operand, const XlaComputation& computation,
                       absl::Span<const ReplicaGroup> replica_groups,
                       const absl::optional<ChannelHandle>& channel_id) {
   return operand.builder()->CrossReplicaSum(operand, computation,
                                             replica_groups, channel_id);
 }
 
-XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+XlaOp AllToAll(const XlaOp operand, int64 split_dimension,
                int64 concat_dimension, int64 split_count,
                const std::vector<ReplicaGroup>& replica_groups) {
   return operand.builder()->AllToAll(operand, split_dimension, concat_dimension,
@@ -3268,17 +3184,17 @@
 }
 
 XlaOp CollectivePermute(
-    const XlaOp& operand,
+    const XlaOp operand,
     const std::vector<std::pair<int64, int64>>& source_target_pairs) {
   return operand.builder()->CollectivePermute(operand, source_target_pairs);
 }
 
 XlaOp ReplicaId(XlaBuilder* builder) { return builder->ReplicaId(); }
 
-XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+XlaOp SelectAndScatter(const XlaOp operand, const XlaComputation& select,
                        absl::Span<const int64> window_dimensions,
                        absl::Span<const int64> window_strides, Padding padding,
-                       const XlaOp& source, const XlaOp& init_value,
+                       const XlaOp source, const XlaOp init_value,
                        const XlaComputation& scatter) {
   return operand.builder()->SelectAndScatter(operand, select, window_dimensions,
                                              window_strides, padding, source,
@@ -3286,116 +3202,112 @@
 }
 
 XlaOp SelectAndScatterWithGeneralPadding(
-    const XlaOp& operand, const XlaComputation& select,
+    const XlaOp operand, const XlaComputation& select,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
-    absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
-    const XlaOp& init_value, const XlaComputation& scatter) {
+    absl::Span<const std::pair<int64, int64>> padding, const XlaOp source,
+    const XlaOp init_value, const XlaComputation& scatter) {
   return operand.builder()->SelectAndScatterWithGeneralPadding(
       operand, select, window_dimensions, window_strides, padding, source,
       init_value, scatter);
 }
 
-XlaOp Abs(const XlaOp& operand) {
+XlaOp Abs(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kAbs, operand);
 }
 
-XlaOp Atan2(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Atan2(const XlaOp lhs, const XlaOp rhs,
             absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kAtan2, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp Exp(const XlaOp& operand) {
+XlaOp Exp(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kExp, operand);
 }
-XlaOp Expm1(const XlaOp& operand) {
+XlaOp Expm1(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kExpm1, operand);
 }
-XlaOp Floor(const XlaOp& operand) {
+XlaOp Floor(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kFloor, operand);
 }
-XlaOp Ceil(const XlaOp& operand) {
+XlaOp Ceil(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kCeil, operand);
 }
-XlaOp Round(const XlaOp& operand) {
+XlaOp Round(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kRoundNearestAfz, operand);
 }
-XlaOp Log(const XlaOp& operand) {
+XlaOp Log(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kLog, operand);
 }
-XlaOp Log1p(const XlaOp& operand) {
+XlaOp Log1p(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kLog1p, operand);
 }
-XlaOp Sign(const XlaOp& operand) {
+XlaOp Sign(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kSign, operand);
 }
-XlaOp Clz(const XlaOp& operand) {
+XlaOp Clz(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kClz, operand);
 }
-XlaOp Cos(const XlaOp& operand) {
+XlaOp Cos(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kCos, operand);
 }
-XlaOp Sin(const XlaOp& operand) {
+XlaOp Sin(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kSin, operand);
 }
-XlaOp Tanh(const XlaOp& operand) {
+XlaOp Tanh(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kTanh, operand);
 }
-XlaOp Real(const XlaOp& operand) {
+XlaOp Real(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kReal, operand);
 }
-XlaOp Imag(const XlaOp& operand) {
+XlaOp Imag(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kImag, operand);
 }
-XlaOp Sqrt(const XlaOp& operand) {
+XlaOp Sqrt(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kSqrt, operand);
 }
-XlaOp Rsqrt(const XlaOp& operand) {
+XlaOp Rsqrt(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kRsqrt, operand);
 }
 
-XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Pow(const XlaOp lhs, const XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions) {
   return lhs.builder()->BinaryOp(HloOpcode::kPower, lhs, rhs,
                                  broadcast_dimensions);
 }
 
-XlaOp IsFinite(const XlaOp& operand) {
+XlaOp IsFinite(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kIsFinite, operand);
 }
 
-XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type) {
+XlaOp ConvertElementType(const XlaOp operand, PrimitiveType new_element_type) {
   return operand.builder()->ConvertElementType(operand, new_element_type);
 }
 
-XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type) {
+XlaOp BitcastConvertType(const XlaOp operand, PrimitiveType new_element_type) {
   return operand.builder()->BitcastConvertType(operand, new_element_type);
 }
 
-XlaOp Neg(const XlaOp& operand) {
+XlaOp Neg(const XlaOp operand) {
   return operand.builder()->UnaryOp(HloOpcode::kNegate, operand);
 }
 
-XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation) {
+XlaOp Transpose(const XlaOp operand, absl::Span<const int64> permutation) {
   return operand.builder()->Transpose(operand, permutation);
 }
 
-XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions) {
+XlaOp Rev(const XlaOp operand, absl::Span<const int64> dimensions) {
   return operand.builder()->Rev(operand, dimensions);
 }
 
-XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values, int64 dimension) {
-  return keys.builder()->Sort(keys, values, dimension);
-}
-
 XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
            int64 dimension, bool is_stable) {
   return operands[0].builder()->Sort(operands, comparator, dimension,
                                      is_stable);
 }
 
-XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) {
+XlaOp Clamp(const XlaOp min, const XlaOp operand, const XlaOp max) {
   return min.builder()->Clamp(min, operand, max);
 }
 
@@ -3405,56 +3317,56 @@
   return builder->Map(operands, computation, dimensions, static_operands);
 }
 
-XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape) {
+XlaOp RngNormal(const XlaOp mu, const XlaOp sigma, const Shape& shape) {
   return mu.builder()->RngNormal(mu, sigma, shape);
 }
 
-XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape) {
+XlaOp RngUniform(const XlaOp a, const XlaOp b, const Shape& shape) {
   return a.builder()->RngUniform(a, b, shape);
 }
 
 XlaOp While(const XlaComputation& condition, const XlaComputation& body,
-            const XlaOp& init) {
+            const XlaOp init) {
   return init.builder()->While(condition, body, init);
 }
 
-XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+XlaOp Conditional(const XlaOp predicate, const XlaOp true_operand,
                   const XlaComputation& true_computation,
-                  const XlaOp& false_operand,
+                  const XlaOp false_operand,
                   const XlaComputation& false_computation) {
   return predicate.builder()->Conditional(predicate, true_operand,
                                           true_computation, false_operand,
                                           false_computation);
 }
 
-XlaOp Conditional(const XlaOp& branch_index,
+XlaOp Conditional(const XlaOp branch_index,
                   absl::Span<const XlaComputation* const> branch_computations,
                   absl::Span<const XlaOp> branch_operands) {
   return branch_index.builder()->Conditional(branch_index, branch_computations,
                                              branch_operands);
 }
 
-XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+XlaOp ReducePrecision(const XlaOp operand, const int exponent_bits,
                       const int mantissa_bits) {
   return operand.builder()->ReducePrecision(operand, exponent_bits,
                                             mantissa_bits);
 }
 
-XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
+XlaOp Gather(const XlaOp input, const XlaOp start_indices,
              const GatherDimensionNumbers& dimension_numbers,
              absl::Span<const int64> slice_sizes) {
   return input.builder()->Gather(input, start_indices, dimension_numbers,
                                  slice_sizes);
 }
 
-XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
-              const XlaOp& updates, const XlaComputation& update_computation,
+XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices,
+              const XlaOp updates, const XlaComputation& update_computation,
               const ScatterDimensionNumbers& dimension_numbers) {
   return input.builder()->Scatter(input, scatter_indices, updates,
                                   update_computation, dimension_numbers);
 }
 
-void Send(const XlaOp& operand, const ChannelHandle& handle) {
+void Send(const XlaOp operand, const ChannelHandle& handle) {
   return operand.builder()->Send(operand, handle);
 }
 
@@ -3463,33 +3375,33 @@
   return builder->Recv(shape, handle);
 }
 
-XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+XlaOp SendWithToken(const XlaOp operand, const XlaOp token,
                     const ChannelHandle& handle) {
   return operand.builder()->SendWithToken(operand, token, handle);
 }
 
-XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+XlaOp RecvWithToken(const XlaOp token, const Shape& shape,
                     const ChannelHandle& handle) {
   return token.builder()->RecvWithToken(token, shape, handle);
 }
 
-XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+XlaOp SendToHost(const XlaOp operand, const XlaOp token,
                  const Shape& shape_with_layout, const ChannelHandle& handle) {
   return operand.builder()->SendToHost(operand, token, shape_with_layout,
                                        handle);
 }
 
-XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+XlaOp RecvFromHost(const XlaOp token, const Shape& shape,
                    const ChannelHandle& handle) {
   return token.builder()->RecvFromHost(token, shape, handle);
 }
 
-XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+XlaOp InfeedWithToken(const XlaOp token, const Shape& shape,
                       const string& config) {
   return token.builder()->InfeedWithToken(token, shape, config);
 }
 
-XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+XlaOp OutfeedWithToken(const XlaOp operand, const XlaOp token,
                        const Shape& shape_with_layout,
                        const string& outfeed_config) {
   return operand.builder()->OutfeedWithToken(operand, token, shape_with_layout,
@@ -3502,24 +3414,24 @@
   return builder->AfterAll(tokens);
 }
 
-XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                        const XlaOp& offset, float epsilon,
+XlaOp BatchNormTraining(const XlaOp operand, const XlaOp scale,
+                        const XlaOp offset, float epsilon,
                         int64 feature_index) {
   return operand.builder()->BatchNormTraining(operand, scale, offset, epsilon,
                                               feature_index);
 }
 
-XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                         const XlaOp& offset, const XlaOp& mean,
-                         const XlaOp& variance, float epsilon,
+XlaOp BatchNormInference(const XlaOp operand, const XlaOp scale,
+                         const XlaOp offset, const XlaOp mean,
+                         const XlaOp variance, float epsilon,
                          int64 feature_index) {
   return operand.builder()->BatchNormInference(
       operand, scale, offset, mean, variance, epsilon, feature_index);
 }
 
-XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                    const XlaOp& batch_mean, const XlaOp& batch_var,
-                    const XlaOp& grad_output, float epsilon,
+XlaOp BatchNormGrad(const XlaOp operand, const XlaOp scale,
+                    const XlaOp batch_mean, const XlaOp batch_var,
+                    const XlaOp grad_output, float epsilon,
                     int64 feature_index) {
   return operand.builder()->BatchNormGrad(operand, scale, batch_mean, batch_var,
                                           grad_output, epsilon, feature_index);
@@ -3533,7 +3445,7 @@
   return builder->Iota(shape, iota_dimension);
 }
 
-XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension) {
+XlaOp GetDimensionSize(const XlaOp operand, int64 dimension) {
   return operand.builder()->GetDimensionSize(operand, dimension);
 }
 
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index d43ae89..508f16a 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -515,9 +515,6 @@
 
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  ABSL_DEPRECATED("Use form with comparator computation instead")
-  XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
-             int64 dimension = -1);
   XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
              int64 dimension = -1, bool is_stable = false);
 
@@ -719,104 +716,98 @@
   friend XlaOp ConstantLiteral(XlaBuilder* builder,
                                const LiteralSlice& literal);
 
-  friend XlaOp Broadcast(const XlaOp& operand,
+  friend XlaOp Broadcast(XlaOp operand,
                          absl::Span<const int64> broadcast_sizes);
 
   friend XlaOp BroadcastInDim(
-      const XlaOp& operand, const absl::Span<const int64> out_dim_size,
+      XlaOp operand, const absl::Span<const int64> out_dim_size,
       const absl::Span<const int64> broadcast_dimensions);
 
-  friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+  friend XlaOp Pad(XlaOp operand, XlaOp padding_value,
                    const PaddingConfig& padding_config);
 
-  friend XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
+  friend XlaOp Reshape(XlaOp operand, absl::Span<const int64> dimensions,
                        absl::Span<const int64> new_sizes);
 
-  friend XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes);
+  friend XlaOp Reshape(XlaOp operand, absl::Span<const int64> new_sizes);
 
-  friend XlaOp Collapse(const XlaOp& operand,
-                        absl::Span<const int64> dimensions);
+  friend XlaOp Collapse(XlaOp operand, absl::Span<const int64> dimensions);
 
-  friend XlaOp Slice(const XlaOp& operand,
-                     absl::Span<const int64> start_indices,
+  friend XlaOp Slice(XlaOp operand, absl::Span<const int64> start_indices,
                      absl::Span<const int64> limit_indices,
                      absl::Span<const int64> strides);
 
-  friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index,
-                          int64 limit_index, int64 stride, int64 dimno);
+  friend XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index,
+                          int64 stride, int64 dimno);
 
-  friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+  friend XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices,
                             absl::Span<const int64> slice_sizes);
-  friend XlaOp DynamicSlice(const XlaOp& operand,
+  friend XlaOp DynamicSlice(XlaOp operand,
                             absl::Span<const XlaOp> start_indices,
                             absl::Span<const int64> slice_sizes);
 
-  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                                  const XlaOp& start_indices);
-  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+  friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update,
+                                  XlaOp start_indices);
+  friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update,
                                   absl::Span<const XlaOp> start_indices);
 
   friend XlaOp ConcatInDim(XlaBuilder* builder,
                            absl::Span<const XlaOp> operands, int64 dimension);
 
-  friend void Trace(const string& tag, const XlaOp& operand);
+  friend void Trace(const string& tag, XlaOp operand);
 
-  friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true,
-                      const XlaOp& on_false);
+  friend XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false);
   friend XlaOp Tuple(XlaBuilder* builder, absl::Span<const XlaOp> elements);
-  friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-  friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp GetTupleElement(XlaOp tuple_data, int64 index);
+  friend XlaOp Eq(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Ne(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Ge(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Gt(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Lt(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Le(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Compare(XlaOp lhs, XlaOp rhs,
                        absl::Span<const int64> broadcast_dimensions,
                        ComparisonDirection direction);
-  friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Dot(XlaOp lhs, XlaOp rhs,
                    const PrecisionConfig* precision_config);
-  friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp DotGeneral(XlaOp lhs, XlaOp rhs,
                           const DotDimensionNumbers& dimension_number,
                           const PrecisionConfig* precision_config);
-  friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Conv(XlaOp lhs, XlaOp rhs,
                     absl::Span<const int64> window_strides, Padding padding,
                     int64 feature_group_count, int64 batch_group_count,
                     const PrecisionConfig* precision_config);
   friend XlaOp ConvWithGeneralPadding(
-      const XlaOp& lhs, const XlaOp& rhs,
-      absl::Span<const int64> window_strides,
+      XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64>> padding,
       int64 feature_group_count, int64 batch_group_count,
       const PrecisionConfig* precision_config);
   friend XlaOp ConvWithGeneralDimensions(
-      const XlaOp& lhs, const XlaOp& rhs,
-      absl::Span<const int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers,
+      XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
+      Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
       int64 feature_group_count, int64 batch_group_count,
       const PrecisionConfig* precision_config);
-  friend XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp ConvGeneral(XlaOp lhs, XlaOp rhs,
                            absl::Span<const int64> window_strides,
                            absl::Span<const std::pair<int64, int64>> padding,
                            const ConvolutionDimensionNumbers& dimension_numbers,
                            int64 feature_group_count, int64 batch_group_count,
                            const PrecisionConfig* precision_config);
   friend XlaOp ConvGeneralDilated(
-      const XlaOp& lhs, const XlaOp& rhs,
-      absl::Span<const int64> window_strides,
+      XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64>> padding,
       absl::Span<const int64> lhs_dilation,
       absl::Span<const int64> rhs_dilation,
       const ConvolutionDimensionNumbers& dimension_numbers,
       int64 feature_group_count, int64 batch_group_count,
       const PrecisionConfig* precision_config);
-  friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
+  friend XlaOp Fft(XlaOp operand, FftType fft_type,
                    absl::Span<const int64> fft_length);
   friend XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
                                bool unit_diagonal,
@@ -824,7 +815,7 @@
   friend XlaOp Cholesky(XlaOp a, bool lower);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
                       const string& config);
-  friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+  friend void Outfeed(XlaOp operand, const Shape& shape_with_layout,
                       const string& outfeed_config);
   friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
                     absl::Span<const XlaOp> operands);
@@ -835,183 +826,180 @@
       XlaBuilder* builder, const string& call_target_name,
       absl::Span<const XlaOp> operands, const Shape& shape_with_layout,
       absl::Span<const Shape> operand_shapes_with_layout, const string& opaque);
-  friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+  friend XlaOp Complex(XlaOp real, XlaOp imag,
                        absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Conj(const XlaOp& operand);
-  friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Conj(XlaOp operand);
+  friend XlaOp Add(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Sub(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Mul(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Div(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Rem(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Max(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Min(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp And(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Or(XlaOp lhs, XlaOp rhs,
                   absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Xor(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Not(const XlaOp& operand);
-  friend XlaOp PopulationCount(const XlaOp& operand);
-  friend XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Not(XlaOp operand);
+  friend XlaOp PopulationCount(XlaOp operand);
+  friend XlaOp ShiftLeft(XlaOp lhs, XlaOp rhs,
                          absl::Span<const int64> broadcast_dimensions);
   friend XlaOp ShiftRightArithmetic(
-      const XlaOp& lhs, const XlaOp& rhs,
-      absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
+      XlaOp lhs, XlaOp rhs, absl::Span<const int64> broadcast_dimensions);
+  friend XlaOp ShiftRightLogical(XlaOp lhs, XlaOp rhs,
                                  absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+  friend XlaOp Reduce(XlaOp operand, XlaOp init_value,
                       const XlaComputation& computation,
                       absl::Span<const int64> dimensions_to_reduce);
   friend XlaOp Reduce(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                       absl::Span<const XlaOp> init_values,
                       const XlaComputation& computation,
                       absl::Span<const int64> dimensions_to_reduce);
-  friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+  friend XlaOp ReduceAll(XlaOp operand, XlaOp init_value,
                          const XlaComputation& computation);
-  friend XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+  friend XlaOp ReduceWindow(XlaOp operand, XlaOp init_value,
                             const XlaComputation& computation,
                             absl::Span<const int64> window_dimensions,
                             absl::Span<const int64> window_strides,
                             Padding padding);
   friend XlaOp ReduceWindowWithGeneralPadding(
-      const XlaOp& operand, const XlaOp& init_value,
-      const XlaComputation& computation,
+      XlaOp operand, XlaOp init_value, const XlaComputation& computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const int64> base_dilations,
       absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
-  friend XlaOp CrossReplicaSum(const XlaOp& operand,
+  friend XlaOp CrossReplicaSum(XlaOp operand,
                                absl::Span<const ReplicaGroup> replica_groups);
-  friend XlaOp CrossReplicaSum(const XlaOp& operand,
-                               const XlaComputation& computation,
+  friend XlaOp CrossReplicaSum(XlaOp operand, const XlaComputation& computation,
                                absl::Span<const ReplicaGroup> replica_groups,
                                const absl::optional<ChannelHandle>& channel_id);
-  friend XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+  friend XlaOp AllToAll(XlaOp operand, int64 split_dimension,
                         int64 concat_dimension, int64 split_count,
                         const std::vector<ReplicaGroup>& replica_groups);
   friend XlaOp CollectivePermute(
-      const XlaOp& operand,
+      XlaOp operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
   friend XlaOp ReplicaId(XlaBuilder* builder);
-  friend XlaOp SelectAndScatter(const XlaOp& operand,
-                                const XlaComputation& select,
+  friend XlaOp SelectAndScatter(XlaOp operand, const XlaComputation& select,
                                 absl::Span<const int64> window_dimensions,
                                 absl::Span<const int64> window_strides,
-                                Padding padding, const XlaOp& source,
-                                const XlaOp& init_value,
+                                Padding padding, XlaOp source, XlaOp init_value,
                                 const XlaComputation& scatter);
   friend XlaOp SelectAndScatterWithGeneralPadding(
-      const XlaOp& operand, const XlaComputation& select,
+      XlaOp operand, const XlaComputation& select,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
-      absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
-      const XlaOp& init_value, const XlaComputation& scatter);
-  friend XlaOp Abs(const XlaOp& operand);
-  friend XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+      absl::Span<const std::pair<int64, int64>> padding, XlaOp source,
+      XlaOp init_value, const XlaComputation& scatter);
+  friend XlaOp Abs(XlaOp operand);
+  friend XlaOp Atan2(XlaOp y, XlaOp x,
                      absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp Exp(const XlaOp& operand);
-  friend XlaOp Expm1(const XlaOp& operand);
-  friend XlaOp Floor(const XlaOp& operand);
-  friend XlaOp Ceil(const XlaOp& operand);
-  friend XlaOp Round(const XlaOp& operand);
-  friend XlaOp Log(const XlaOp& operand);
-  friend XlaOp Log1p(const XlaOp& operand);
-  friend XlaOp Sign(const XlaOp& operand);
-  friend XlaOp Clz(const XlaOp& operand);
-  friend XlaOp Cos(const XlaOp& operand);
-  friend XlaOp Sin(const XlaOp& operand);
-  friend XlaOp Tanh(const XlaOp& operand);
-  friend XlaOp Real(const XlaOp& operand);
-  friend XlaOp Imag(const XlaOp& operand);
-  friend XlaOp Sqrt(const XlaOp& operand);
-  friend XlaOp Rsqrt(const XlaOp& operand);
-  friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+  friend XlaOp Exp(XlaOp operand);
+  friend XlaOp Expm1(XlaOp operand);
+  friend XlaOp Floor(XlaOp operand);
+  friend XlaOp Ceil(XlaOp operand);
+  friend XlaOp Round(XlaOp operand);
+  friend XlaOp Log(XlaOp operand);
+  friend XlaOp Log1p(XlaOp operand);
+  friend XlaOp Sign(XlaOp operand);
+  friend XlaOp Clz(XlaOp operand);
+  friend XlaOp Cos(XlaOp operand);
+  friend XlaOp Sin(XlaOp operand);
+  friend XlaOp Tanh(XlaOp operand);
+  friend XlaOp Real(XlaOp operand);
+  friend XlaOp Imag(XlaOp operand);
+  friend XlaOp Sqrt(XlaOp operand);
+  friend XlaOp Rsqrt(XlaOp operand);
+  friend XlaOp Pow(XlaOp lhs, XlaOp rhs,
                    absl::Span<const int64> broadcast_dimensions);
-  friend XlaOp IsFinite(const XlaOp& operand);
+  friend XlaOp IsFinite(XlaOp operand);
   friend XlaOp Iota(XlaBuilder* builder, const Shape& shape,
                     int64 iota_dimension);
   friend XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size);
-  friend XlaOp ConvertElementType(const XlaOp& operand,
+  friend XlaOp ConvertElementType(XlaOp operand,
                                   PrimitiveType new_element_type);
-  friend XlaOp BitcastConvertType(const XlaOp& operand,
+  friend XlaOp BitcastConvertType(XlaOp operand,
                                   PrimitiveType new_element_type);
-  friend XlaOp Neg(const XlaOp& operand);
-  friend XlaOp Transpose(const XlaOp& operand,
-                         absl::Span<const int64> permutation);
-  friend XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
-  friend XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
-                    int64 dimension);
+  friend XlaOp Neg(XlaOp operand);
+  friend XlaOp Transpose(XlaOp operand, absl::Span<const int64> permutation);
+  friend XlaOp Rev(XlaOp operand, absl::Span<const int64> dimensions);
   friend XlaOp Sort(absl::Span<const XlaOp> operands,
                     const XlaComputation& comparator, int64 dimension,
                     bool is_stable);
-  friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+  friend XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max);
   friend XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                    const XlaComputation& computation,
                    absl::Span<const int64> dimensions,
                    absl::Span<const XlaOp> static_operands);
-  friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma,
-                         const Shape& shape);
-  friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+  friend XlaOp RngNormal(XlaOp mu, XlaOp sigma, const Shape& shape);
+  friend XlaOp RngUniform(XlaOp a, XlaOp b, const Shape& shape);
   friend XlaOp While(const XlaComputation& condition,
-                     const XlaComputation& body, const XlaOp& init);
-  friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                     const XlaComputation& body, XlaOp init);
+  friend XlaOp Conditional(XlaOp predicate, XlaOp true_operand,
                            const XlaComputation& true_computation,
-                           const XlaOp& false_operand,
+                           XlaOp false_operand,
                            const XlaComputation& false_computation);
   friend XlaOp Conditional(
+      XlaOp branch_index,
+      absl::Span<const XlaComputation* const> branch_computations,
+      absl::Span<const XlaOp> branch_operands);
+  friend XlaOp ConditionalImpl(
       const XlaOp& branch_index,
       absl::Span<const XlaComputation* const> branch_computations,
       absl::Span<const XlaOp> branch_operands);
-  friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+  friend XlaOp ReducePrecision(XlaOp operand, const int exponent_bits,
                                const int mantissa_bits);
-  friend XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
+  friend XlaOp Gather(XlaOp input, XlaOp start_indices,
                       const GatherDimensionNumbers& dimension_numbers,
                       absl::Span<const int64> slice_sizes);
-  friend XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
-                       const XlaOp& updates,
+  friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates,
                        const XlaComputation& update_computation,
                        const ScatterDimensionNumbers& dimension_numbers);
-  friend void Send(const XlaOp& operand, const ChannelHandle& handle);
+  friend void Send(XlaOp operand, const ChannelHandle& handle);
   friend XlaOp Recv(XlaBuilder* builder, const Shape& shape,
                     const ChannelHandle& handle);
-  friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                                 const XlaOp& offset, float epsilon,
-                                 int64 feature_index);
-  friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                                  const XlaOp& offset, const XlaOp& mean,
-                                  const XlaOp& variance, float epsilon,
+  friend XlaOp BatchNormTraining(XlaOp operand, XlaOp scale, XlaOp offset,
+                                 float epsilon, int64 feature_index);
+  friend XlaOp BatchNormInference(XlaOp operand, XlaOp scale, XlaOp offset,
+                                  XlaOp mean, XlaOp variance, float epsilon,
                                   int64 feature_index);
-  friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                             const XlaOp& batch_mean, const XlaOp& batch_var,
-                             const XlaOp& grad_output, float epsilon,
+  friend XlaOp BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean,
+                             XlaOp batch_var, XlaOp grad_output, float epsilon,
                              int64 feature_index);
-  friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+  friend XlaOp SendWithToken(XlaOp operand, XlaOp token,
                              const ChannelHandle& handle);
-  friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+  friend XlaOp RecvWithToken(XlaOp token, const Shape& shape,
                              const ChannelHandle& handle);
-  friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+  friend XlaOp SendToHost(XlaOp operand, XlaOp token,
                           const Shape& shape_with_layout,
                           const ChannelHandle& handle);
-  friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+  friend XlaOp RecvFromHost(XlaOp token, const Shape& shape,
                             const ChannelHandle& handle);
-  friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+  friend XlaOp InfeedWithToken(XlaOp token, const Shape& shape,
                                const string& config);
-  friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+  friend XlaOp OutfeedWithToken(XlaOp operand, XlaOp token,
                                 const Shape& shape_with_layout,
                                 const string& outfeed_config);
   friend XlaOp CreateToken(XlaBuilder* builder);
   friend XlaOp AfterAll(XlaBuilder* builder, absl::Span<const XlaOp> tokens);
 
-  friend XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+  friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension);
+
+ private:
+  XlaOp ConditionalImpl(
+      const XlaOp& branch_index,
+      absl::Span<const XlaComputation* const> branch_computations,
+      absl::Span<const XlaOp> branch_operands);
 };
 
 // RAII-style object: sets the current sharding assignment in builder on
@@ -1130,7 +1118,7 @@
 // The new dimensions index into copies of the operand, i.e.
 //
 //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes);
+XlaOp Broadcast(XlaOp operand, absl::Span<const int64> broadcast_sizes);
 
 // This op broadcasts the `operand` to an output with the given `shape`.
 // `broadcast_dimensions` are the dimensions to be broadcasting into, i.e., the
@@ -1147,14 +1135,13 @@
 //   will generate output
 //   {{1 , 1},
 //    {2 , 2}}
-XlaOp BroadcastInDim(const XlaOp& operand,
-                     const absl::Span<const int64> out_dim_size,
+XlaOp BroadcastInDim(XlaOp operand, const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions);
 
 // Enqueues a pad operation onto the computation that pads the given value on
 // the edges as well as between the elements of the input. padding_config
 // specifies the padding amount for each dimension.
-XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+XlaOp Pad(XlaOp operand, XlaOp padding_value,
           const PaddingConfig& padding_config);
 
 // Enqueues an operation onto the computation that flattens the operand based
@@ -1162,13 +1149,13 @@
 // given, followed by reshaping it into the shape with the given dimension
 // sizes (also major to minor). Conceptually, this is a limited form of
 // "shape casting".
-XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
+XlaOp Reshape(XlaOp operand, absl::Span<const int64> dimensions,
               absl::Span<const int64> new_sizes);
 
 // Enqueues an operation onto the computation that collapses the operand, from
 // first to last dimension (C order), then reshapes it to the given dimension
 // sizes. Conceptually, this is a limited form of "shape casting".
-XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes);
+XlaOp Reshape(XlaOp operand, absl::Span<const int64> new_sizes);
 
 // Wrapper for Reshape.
 // Enqueues an operation to collapse the provided dimensions; e.g. an
@@ -1188,7 +1175,7 @@
 //
 // This could potentially cause data to be moved -- it provides a more
 // structured form of reshaping than an arbitrary Reshape operation.
-XlaOp Collapse(const XlaOp& operand, absl::Span<const int64> dimensions);
+XlaOp Collapse(XlaOp operand, absl::Span<const int64> dimensions);
 
 // Enqueues a slice operation onto the computation that slices the operand
 // from the start indices to the limit indices; e.g.
@@ -1201,7 +1188,7 @@
 // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
 // range notation.
 // The strides parameter determines the stride over the slice
-XlaOp Slice(const XlaOp& operand, absl::Span<const int64> start_indices,
+XlaOp Slice(XlaOp operand, absl::Span<const int64> start_indices,
             absl::Span<const int64> limit_indices,
             absl::Span<const int64> strides);
 
@@ -1211,7 +1198,7 @@
 // for:
 //
 //  array[:, 2:4:1, :]
-XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index,
                  int64 stride, int64 dimno);
 
 // Enqueues a slice operation onto the computation that slices the 'operand'
@@ -1224,11 +1211,11 @@
 // have the same shape.
 // Slice index calculations are computed modulo input dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+XlaOp DynamicSlice(XlaOp operand, absl::Span<const XlaOp> start_indices,
                    absl::Span<const int64> slice_sizes);
 
 ABSL_DEPRECATED("Use span-of-indices form instead")
-XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices,
                    absl::Span<const int64> slice_sizes);
 
 // Enqueues a dynamic update slice operation onto the computation, which
@@ -1248,12 +1235,11 @@
 // have the same shape.
 // Slice index calculations are computed modulo update dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update,
                          absl::Span<const XlaOp> start_indices);
 
 ABSL_DEPRECATED("Use span-of-indices form instead")
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                         const XlaOp& start_indices);
+XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices);
 
 // Enqueues a concatenate instruction onto the computation. 'operands' must
 // have >= 1 entry.
@@ -1262,66 +1248,66 @@
 
 // Enqueue a tracing operation onto the computation; the computation will emit
 // a logging message with the operand.
-void Trace(const string& tag, const XlaOp& operand);
+void Trace(const string& tag, XlaOp operand);
 
 // Enqueues a conditional-move-like select operation onto the computation;
 // predicated on pred, selects between on_true and on_false.
-XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false);
 
 // Enqueues a tuple-creation instruction onto the computation.
 XlaOp Tuple(XlaBuilder* builder, absl::Span<const XlaOp> elements);
 
 // Enqueues a tuple-element-get instruction onto the computation.
-XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+XlaOp GetTupleElement(XlaOp tuple_data, int64 index);
 
 // Enqueues an equal-to comparison instruction onto the computation.
-XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Eq(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a not-equal comparison instruction onto the computation.
-XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Ne(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a greater-or-equal comparison instruction onto the computation.
-XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Ge(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a greater-than comparison instruction onto the computation.
-XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Gt(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a less-than comparison instruction onto the computation.
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Lt(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a less-or-equal comparison instruction onto the computation.
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Le(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a comparison instruction onto the computation.
-XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Compare(XlaOp lhs, XlaOp rhs,
               absl::Span<const int64> broadcast_dimensions,
               ComparisonDirection direction);
 
 // Enqueues a dot instruction onto the computation.
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Dot(XlaOp lhs, XlaOp rhs,
           const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a general dot instruction onto the computation.
-XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp DotGeneral(XlaOp lhs, XlaOp rhs,
                  const DotDimensionNumbers& dimension_numbers,
                  const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, which uses the
 // default convolution dimension numbers.
-XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> window_strides, Padding padding,
-           int64 feature_group_count = 1, int64 batch_group_count = 1,
+XlaOp Conv(XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
+           Padding padding, int64 feature_group_count = 1,
+           int64 batch_group_count = 1,
            const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration in the format returned by MakePadding().
-XlaOp ConvWithGeneralPadding(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ConvWithGeneralPadding(XlaOp lhs, XlaOp rhs,
                              absl::Span<const int64> window_strides,
                              absl::Span<const std::pair<int64, int64>> padding,
                              int64 feature_group_count = 1,
@@ -1331,15 +1317,14 @@
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided dimension numbers configuration.
 XlaOp ConvWithGeneralDimensions(
-    const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
+    XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
     Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
     int64 feature_group_count = 1, int64 batch_group_count = 1,
     const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration as well as the dimension numbers.
-XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                  absl::Span<const int64> window_strides,
+XlaOp ConvGeneral(XlaOp lhs, XlaOp rhs, absl::Span<const int64> window_strides,
                   absl::Span<const std::pair<int64, int64>> padding,
                   const ConvolutionDimensionNumbers& dimension_numbers,
                   int64 feature_group_count = 1, int64 batch_group_count = 1,
@@ -1347,7 +1332,7 @@
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration, dilation factors and dimension numbers.
-XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ConvGeneralDilated(XlaOp lhs, XlaOp rhs,
                          absl::Span<const int64> window_strides,
                          absl::Span<const std::pair<int64, int64>> padding,
                          absl::Span<const int64> lhs_dilation,
@@ -1359,8 +1344,7 @@
 
 // Enqueues an FFT instruction onto the computation, of the given type and
 // with the given FFT length.
-XlaOp Fft(const XlaOp& operand, FftType fft_type,
-          absl::Span<const int64> fft_length);
+XlaOp Fft(XlaOp operand, FftType fft_type, absl::Span<const int64> fft_length);
 
 // Solves systems of linear equations with lower or upper triangular coefficient
 // matrices by forward- or back-substitution. Broadcasting along leading
@@ -1410,7 +1394,7 @@
 // two-element tuple containing the data value and a token-shaped value.
 // Tokens are used for ordering side-effecting operations.
 // TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+XlaOp InfeedWithToken(XlaOp token, const Shape& shape,
                       const string& config = "");
 
 // Enqueues an outfeed instruction onto the computation. This instruction
@@ -1419,13 +1403,13 @@
 // shape_with_layout communicates the laid out shape that we want to outfeed
 // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
 // will occur.
-void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+void Outfeed(XlaOp operand, const Shape& shape_with_layout,
              const string& outfeed_config);
 
 // Variant of Outfeed which takes a token-shaped operand and produces a
 // token-shaped value. Tokens are used for ordering side-effecting operations.
 // TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+XlaOp OutfeedWithToken(XlaOp operand, XlaOp token,
                        const Shape& shape_with_layout,
                        const string& outfeed_config);
 
@@ -1462,89 +1446,86 @@
 // (see g3doc for more details).
 
 // Enqueues a complex compose instruction onto the computation.
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+XlaOp Complex(XlaOp real, XlaOp imag,
               absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a complex conjugate instruction onto the computation.
-XlaOp Conj(const XlaOp& operand);
+XlaOp Conj(XlaOp operand);
 
 // Enqueues an add instruction onto the computation.
-XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Add(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a subtract instruction onto the computation.
-XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Sub(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a multiply instruction onto the computation.
-XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Mul(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a divide instruction onto the computation.
-XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Div(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a remainder instruction onto the computation.
-XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Rem(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a max instruction onto the computation.
-XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Max(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues a min instruction onto the computation.
-XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Min(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Element-wise logical operators
-XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp And(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Overload to call And with 3 or more operands.  We need the following somewhat
 // convoluted overload set to disambiguate with the overload that takes the
 // `broadcast_dimensions` optional param.
-inline XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+inline XlaOp And(XlaOp op1, XlaOp op2, XlaOp op3) {
   return And(op1, And(op2, op3));
 }
 template <typename... XlaOpTs>
-XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
-          const XlaOpTs&... operands) {
+XlaOp And(XlaOp op1, XlaOp op2, XlaOp op3, const XlaOpTs&... operands) {
   return And(op1, And(op2, And(op3, operands...)));
 }
 
-XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Or(XlaOp lhs, XlaOp rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
 // Overload to call Or with 3 or more operands.  As with `And`, we need the
 // following complicated overload set to handle the default arg in the `Or`
 // overload above.
-inline XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+inline XlaOp Or(XlaOp op1, XlaOp op2, XlaOp op3) {
   return Or(op1, Or(op2, op3));
 }
 template <typename... XlaOpTs>
-XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
-         const XlaOpTs&... operands) {
+XlaOp Or(XlaOp op1, XlaOp op2, XlaOp op3, const XlaOpTs&... operands) {
   return Or(op1, Or(op2, Or(op3, operands...)));
 }
 
-XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Xor(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
-XlaOp Not(const XlaOp& operand);
+XlaOp Not(XlaOp operand);
 
-XlaOp PopulationCount(const XlaOp& operand);
+XlaOp PopulationCount(XlaOp operand);
 
-XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftLeft(XlaOp lhs, XlaOp rhs,
                 absl::Span<const int64> broadcast_dimensions = {});
-XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftRightArithmetic(XlaOp lhs, XlaOp rhs,
                            absl::Span<const int64> broadcast_dimensions = {});
-XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp ShiftRightLogical(XlaOp lhs, XlaOp rhs,
                         absl::Span<const int64> broadcast_dimensions = {});
 
 // Reduces an array among the provided dimensions, given "computation" as a
 // reduction operator.
-XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
-             const XlaComputation& computation,
+XlaOp Reduce(XlaOp operand, XlaOp init_value, const XlaComputation& computation,
              absl::Span<const int64> dimensions_to_reduce);
 
 // Reduces several arrays simultaneously among the provided dimensions, given
@@ -1556,11 +1537,11 @@
 
 // Convenience wrapper around the above that reduces all the dimensions in the
 // operand shape.
-XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+XlaOp ReduceAll(XlaOp operand, XlaOp init_value,
                 const XlaComputation& computation);
 
 // Enqueues a windowed reduce instruction onto the computation.
-XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+XlaOp ReduceWindow(XlaOp operand, XlaOp init_value,
                    const XlaComputation& computation,
                    absl::Span<const int64> window_dimensions,
                    absl::Span<const int64> window_strides, Padding padding);
@@ -1568,8 +1549,7 @@
 // As ReduceWindow(), but the padding is given in the format
 // returned by MakePadding().
 XlaOp ReduceWindowWithGeneralPadding(
-    const XlaOp& operand, const XlaOp& init_value,
-    const XlaComputation& computation,
+    XlaOp operand, XlaOp init_value, const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const int64> base_dilations,
@@ -1579,7 +1559,7 @@
 // Returns the sum of the operand value within each subgroup of replicas. All
 // replicas supply one input to the sum and all replicas receive the resulting
 // sum for each subgroup.
-XlaOp CrossReplicaSum(const XlaOp& operand,
+XlaOp CrossReplicaSum(XlaOp operand,
                       absl::Span<const ReplicaGroup> replica_groups = {});
 
 // Enqueues an operation that do an AllReduce of the operand cross cores. Here
@@ -1600,13 +1580,13 @@
 //
 // TODO(b/117564385): Rename this to AllReduce when it's ready to use.
 XlaOp CrossReplicaSum(
-    const XlaOp& operand, const XlaComputation& computation,
+    XlaOp operand, const XlaComputation& computation,
     absl::Span<const ReplicaGroup> replica_groups = {},
     const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
 // Enqueues an operation that do an Alltoall of the operand cross cores.
-XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
-               int64 concat_dimension, int64 split_count,
+XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension,
+               int64 split_count,
                const std::vector<ReplicaGroup>& replica_groups = {});
 
 // Enqueues an collective operation that sends and receives data cross replicas.
@@ -1618,7 +1598,7 @@
 // is not a target in any pair, then the output on that replica is a tensor
 // consists of 0(s) with the same shape as the input.
 XlaOp CollectivePermute(
-    const XlaOp& operand,
+    XlaOp operand,
     const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
 // Enqueues an operation that returns the replica ID.
@@ -1626,79 +1606,79 @@
 
 // Enqueues an operation that scatters the `source` array to the selected
 // indices of each window.
-XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+XlaOp SelectAndScatter(XlaOp operand, const XlaComputation& select,
                        absl::Span<const int64> window_dimensions,
                        absl::Span<const int64> window_strides, Padding padding,
-                       const XlaOp& source, const XlaOp& init_value,
+                       XlaOp source, XlaOp init_value,
                        const XlaComputation& scatter);
 
 // As SelectAndScatter(), but the padding is given in the format
 // returned by MakePadding().
 XlaOp SelectAndScatterWithGeneralPadding(
-    const XlaOp& operand, const XlaComputation& select,
+    XlaOp operand, const XlaComputation& select,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
-    absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
-    const XlaOp& init_value, const XlaComputation& scatter);
+    absl::Span<const std::pair<int64, int64>> padding, XlaOp source,
+    XlaOp init_value, const XlaComputation& scatter);
 
 // Enqueues an abs instruction onto the computation.
-XlaOp Abs(const XlaOp& operand);
+XlaOp Abs(XlaOp operand);
 
 // Enqueues a atan2 instruction onto the computation.
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+XlaOp Atan2(XlaOp y, XlaOp x,
             absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues an exp instruction onto the computation.
-XlaOp Exp(const XlaOp& operand);
+XlaOp Exp(XlaOp operand);
 
 // Enqueues an expm1 instruction onto the computation.
-XlaOp Expm1(const XlaOp& operand);
+XlaOp Expm1(XlaOp operand);
 
 // Enqueues a floor instruction onto the computation.
-XlaOp Floor(const XlaOp& operand);
+XlaOp Floor(XlaOp operand);
 
 // Enqueues a ceil instruction onto the computation.
-XlaOp Ceil(const XlaOp& operand);
+XlaOp Ceil(XlaOp operand);
 
 // Enqueues a round instruction onto the computation, rounding to nearest even
 // with half-way cases rounding away from zero.
-XlaOp Round(const XlaOp& operand);
+XlaOp Round(XlaOp operand);
 
 // Enqueues an log instruction (natural logarithm) onto the computation.
-XlaOp Log(const XlaOp& operand);
+XlaOp Log(XlaOp operand);
 
 // Enqueues an log1p instruction (log(x+1)) onto the computation.
-XlaOp Log1p(const XlaOp& operand);
+XlaOp Log1p(XlaOp operand);
 
 // Enqueues a sign instruction onto the computation.
-XlaOp Sign(const XlaOp& operand);
+XlaOp Sign(XlaOp operand);
 
 // Enqueues a count leading zeros instruction onto the computation.
-XlaOp Clz(const XlaOp& operand);
+XlaOp Clz(XlaOp operand);
 
 // Enqueues a cosine instruction onto the computation.
-XlaOp Cos(const XlaOp& operand);
+XlaOp Cos(XlaOp operand);
 
 // Enqueues a sine instruction onto the computation.
-XlaOp Sin(const XlaOp& operand);
+XlaOp Sin(XlaOp operand);
 
 // Enqueues a tanh instruction onto the computation.
-XlaOp Tanh(const XlaOp& operand);
+XlaOp Tanh(XlaOp operand);
 
 // Enqueues a real-part instruction onto the computation.
-XlaOp Real(const XlaOp& operand);
+XlaOp Real(XlaOp operand);
 
 // Enqueues an imaginary-part instruction onto the computation.
-XlaOp Imag(const XlaOp& operand);
+XlaOp Imag(XlaOp operand);
 
 // Enqueues a sqrt computation onto the computation.
-XlaOp Sqrt(const XlaOp& operand);
+XlaOp Sqrt(XlaOp operand);
 
 // Enqueues a rsqrt computation onto the computation.
-XlaOp Rsqrt(const XlaOp& operand);
+XlaOp Rsqrt(XlaOp operand);
 
 // Enqueues a lhs^rhs computation onto the computation.
-XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Pow(XlaOp lhs, XlaOp rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
 // Enqueues an operator that tests if the operand's values are finite, i.e., not
@@ -1709,7 +1689,7 @@
 // an error for other types.
 //
 // See also IsInf, IsPosInf, IsNegInf, and IsNan in lib/math.h.
-XlaOp IsFinite(const XlaOp& operand);
+XlaOp IsFinite(XlaOp operand);
 
 // Enqueues an iota operation onto the computation.
 XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension);
@@ -1719,44 +1699,24 @@
 
 // Enqueues a convert instruction onto the computation that changes the
 // element type of the operand array to primitive_type.
-XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type);
+XlaOp ConvertElementType(XlaOp operand, PrimitiveType new_element_type);
 
 // Enqueues a no-op instruction onto the computation that changes
 // the element type of the operand array to primitive_type. The
 // bit-widths of the source and destination element types must be
 // identical.
-XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type);
+XlaOp BitcastConvertType(XlaOp operand, PrimitiveType new_element_type);
 
 // Enqueues a negate instruction onto the computation.
-XlaOp Neg(const XlaOp& operand);
+XlaOp Neg(XlaOp operand);
 
 // Enqueues a transpose instruction onto the computation.
-XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
+XlaOp Transpose(XlaOp operand, absl::Span<const int64> permutation);
 
 // Enqueues a reverse instruction onto the computation. The order of the
 // elements in the given dimensions is reversed (i.e., the element at index i
 // is moved to index dimension_size - 1 - i).
-XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
-
-// Enqueues a sort (as increasing order) instruction onto the computation.
-// If only keys are provided:
-// * If the keys are an rank-1 tensor (an array), the result is a sorted array
-// of keys, in ascending order.
-// * If the keys have higher rank, the keys are sorted along the provided
-// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will independently sort every column, and a dimension value of 1
-// will independently sort each row. If no dimension number is provided, then
-// the last dimension is chosen by default.
-//
-// If both keys and values are provided:
-// * The keys and all values must be tensors with the same dimensions. The
-// element types of the tensors may be different.
-// * The result is a tuple that consists of a sorted tensor of keys (along the
-// provided dimension, as above) as the first element, and tensors with their
-// corresponding values as the other elements.
-ABSL_DEPRECATED("Use form with comparator computation instead")
-XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
-           int64 dimension = -1);
+XlaOp Rev(XlaOp operand, absl::Span<const int64> dimensions);
 
 // Enqueues a sort instruction onto the computation, using 'comparator' for
 // comparisons. 'comparator' needs to define a strict weak order. 'is_stable'
@@ -1788,7 +1748,7 @@
            int64 dimension = -1, bool is_stable = false);
 
 // Enqueues a clamp instruction onto the computation.
-XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max);
 
 // Enqueues a map instruction onto the computation.
 XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
@@ -1797,20 +1757,19 @@
 
 // Enqueues a N(mu, sigma) random number generation instruction onto the
 // computation.
-XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+XlaOp RngNormal(XlaOp mu, XlaOp sigma, const Shape& shape);
 
 // Enqueues a U(a, b) random number generation instruction onto the
 // computation. Returns values in the semi-open interval [a, b).
-XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+XlaOp RngUniform(XlaOp a, XlaOp b, const Shape& shape);
 
 // Enqueues a while node onto the computation.
 XlaOp While(const XlaComputation& condition, const XlaComputation& body,
-            const XlaOp& init);
+            XlaOp init);
 
 // Enqueues a conditional node onto the computation.
-XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
-                  const XlaComputation& true_computation,
-                  const XlaOp& false_operand,
+XlaOp Conditional(XlaOp predicate, XlaOp true_operand,
+                  const XlaComputation& true_computation, XlaOp false_operand,
                   const XlaComputation& false_computation);
 
 // Enqueues either a predicated (if/else) or indexed (switch/case/default)
@@ -1818,35 +1777,34 @@
 // branch_operands are matched by index. branch_index selects the branch that
 // will be executed. Out of range branch_index uses the N-1'th
 // branch_computation as default.
-XlaOp Conditional(const XlaOp& branch_index,
+XlaOp Conditional(XlaOp branch_index,
                   absl::Span<const XlaComputation* const> branch_computations,
                   absl::Span<const XlaOp> branch_operands);
 
 // Enqueues a ReducePrecision node onto the computation.
-XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+XlaOp ReducePrecision(XlaOp operand, const int exponent_bits,
                       const int mantissa_bits);
 
 // Enqueues a Gather node onto the computation.
-XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
+XlaOp Gather(XlaOp input, XlaOp start_indices,
              const GatherDimensionNumbers& dimension_numbers,
              absl::Span<const int64> slice_sizes);
 
 // Enqueues a Scatter node onto the computation.
-XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
-              const XlaOp& updates, const XlaComputation& update_computation,
+XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates,
+              const XlaComputation& update_computation,
               const ScatterDimensionNumbers& dimension_numbers);
 
 // Enqueues a Send node onto the computation for device-to-device
 // communication. This operation sends the given operand to
 // a Recv instruction in a different computation that shares the same channel
 // handle.
-void Send(const XlaOp& operand, const ChannelHandle& handle);
+void Send(XlaOp operand, const ChannelHandle& handle);
 
 // Variant of Send which takes a token-shaped operand and produces a
 // token-shaped value.  Tokens are used for ordering side-effecting operations.
 // TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
-                    const ChannelHandle& handle);
+XlaOp SendWithToken(XlaOp operand, XlaOp token, const ChannelHandle& handle);
 
 // Enqueues a Recv node onto the computation for device-to-device
 // communication. The data comes from a Send instruction in a different
@@ -1859,7 +1817,7 @@
 // tuple containing the data value and a token-shaped value. Tokens are used
 // for ordering side-effecting operations.
 // TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+XlaOp RecvWithToken(XlaOp token, const Shape& shape,
                     const ChannelHandle& handle);
 
 // Enqueues a Send node which transfers data from the device to the host. The
@@ -1867,13 +1825,13 @@
 // shape must be compatible with the shape of the operand. The operand must be
 // array-shaped.
 // TODO(b/111544877): Support tuple shapes.
-XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
-                 const Shape& shape_with_layout, const ChannelHandle& handle);
+XlaOp SendToHost(XlaOp operand, XlaOp token, const Shape& shape_with_layout,
+                 const ChannelHandle& handle);
 
 // Enqueues a Recv node which transfers data from the host to the device. The
 // given shape must contain a layout and must be an array.
 // TODO(b/111544877): Support tuple shapes.
-XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+XlaOp RecvFromHost(XlaOp token, const Shape& shape,
                    const ChannelHandle& handle);
 
 // Enqueues an operation (AfterAll) with no operands that produces a
@@ -1894,8 +1852,7 @@
 // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
 // is the normalized result and batch_mean and batch_var are the mean and
 // variance, respectively, across batch for the operand.
-XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                        const XlaOp& offset, float epsilon,
+XlaOp BatchNormTraining(XlaOp operand, XlaOp scale, XlaOp offset, float epsilon,
                         int64 feature_index);
 
 // Normalizes operand across spatial and batch dimensions for each feature.
@@ -1908,10 +1865,8 @@
 //
 // The output has the same shape as `operand`, and contains the normalized
 // values for each batch.
-XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                         const XlaOp& offset, const XlaOp& mean,
-                         const XlaOp& variance, float epsilon,
-                         int64 feature_index);
+XlaOp BatchNormInference(XlaOp operand, XlaOp scale, XlaOp offset, XlaOp mean,
+                         XlaOp variance, float epsilon, int64 feature_index);
 
 // Calculates the gradients of a batch norm op.
 //
@@ -1922,14 +1877,13 @@
 //   - grad_operand: Gradient with respect to input `operand`
 //   - grad_offset: Gradient with respect to input `offset`
 //   - grad_scale: Gradient with respect to input `scale`
-XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                    const XlaOp& batch_mean, const XlaOp& batch_var,
-                    const XlaOp& grad_output, float epsilon,
+XlaOp BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean,
+                    XlaOp batch_var, XlaOp grad_output, float epsilon,
                     int64 feature_index);
 
 // Returns the size of the given dimension of the operand. The operand must be
 // array shaped.
-XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+XlaOp GetDimensionSize(XlaOp operand, int64 dimension);
 
 // Implementation details below this point.
 //
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 230f3b2..39c90b6 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -26,12 +26,13 @@
 int ExecutableRunOptions::device_ordinal() const { return device_ordinal_; }
 
 ExecutableRunOptions& ExecutableRunOptions::set_allocator(
-    DeviceMemoryAllocator* allocator) {
+    stream_executor::DeviceMemoryAllocator* allocator) {
   allocator_ = allocator;
   return *this;
 }
 
-DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
+stream_executor::DeviceMemoryAllocator* ExecutableRunOptions::allocator()
+    const {
   return allocator_;
 }
 
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 1ac26a0..8462959 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -23,6 +23,7 @@
 namespace stream_executor {
 class Stream;
 class Platform;
+class DeviceMemoryAllocator;
 }  // namespace stream_executor
 
 namespace Eigen {
@@ -31,7 +32,6 @@
 
 namespace xla {
 
-class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
 
@@ -39,8 +39,9 @@
 class ExecutableRunOptions {
  public:
   // Specifies the allocator to use during execution.
-  ExecutableRunOptions& set_allocator(DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* allocator() const;
+  ExecutableRunOptions& set_allocator(
+      stream_executor::DeviceMemoryAllocator* allocator);
+  stream_executor::DeviceMemoryAllocator* allocator() const;
 
   // If set, this is the device to run the computation on. Valid device_ordinal
   // values are: 0 to # of devices - 1. These values are identical to the device
@@ -87,7 +88,7 @@
   int rng_seed() const;
 
  private:
-  DeviceMemoryAllocator* allocator_ = nullptr;
+  stream_executor::DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index d756cd7..dafc334 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -29,6 +29,8 @@
         path: /xla/tiled_layout
       - title: Using AOT compilation
         path: /xla/tfcompile
+      - title: Writing custom calls
+        path: /xla/custom_call
       - heading: Tutorials
       - title: XLA compile API
         path: /xla/tutorials/xla_compile
diff --git a/tensorflow/compiler/xla/g3doc/custom_call.md b/tensorflow/compiler/xla/g3doc/custom_call.md
new file mode 100644
index 0000000..e4cf0bf
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/custom_call.md
@@ -0,0 +1,284 @@
+# XLA Custom Calls
+
+This document describes how to write and use XLA "custom calls". Custom calls
+let you invoke code written in a programming language like C++ or CUDA from an
+XLA program.
+
+Warning: Custom calls are a low-level power-user feature. It is easy to break
+your program in difficult-to-debug (and even difficult-to-notice) ways using
+custom-calls. You shouldn't use custom calls unless you're prepared to debug XLA
+yourself when something goes wrong, and you should expect relatively less
+assistance from XLA developers if you run into trouble.
+
+Warning: The custom-call API/ABI is not currently stable. We don't intend to
+change it capriciously, but it may change. Some possible future changes are
+described below.
+
+## Custom-call on CPU
+
+You can create an HLO instruction which represents a custom-call via XLA's
+client API. This is not exposed via TensorFlow as of writing.
+
+For example, the following code uses a custom-call to compute
+`A[i] = B[i % 128] + C[i]` on the CPU. (Of course you could -- and should! --
+do this with regular HLO.)
+
+```c++
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
+
+void do_it() {
+  xla::XlaBuilder b("do_it");
+  xla::XlaOp param0 =
+      xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla::F32, {128}), "p0");
+  xla::XlaOp param1 =
+      xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla::F32, {2048}), "p1");
+  xla::XlaOp custom_call =
+      xla::CustomCall(&b, "do_custom_call", /*operands=*/{param0, param1},
+                      /*output_shape=*/xla::ShapeUtil::MakeShape(xla::F32, {2048}));
+}
+
+void do_custom_call(void* out, const void** in) {
+  float* out_buf = reinterpret_cast<float*>(out);
+  const float* in0 = reinterpret_cast<const float*>(in[0]);
+  const float* in1 = reinterpret_cast<const float*>(in[1]);
+  for (int i = 0; i < 2048; ++i) {
+    out_buf[i] = in0[i % 128] + in1[i];
+  }
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(do_custom_call, "Host");
+```
+
+Notice that the function `do_custom_call` needs to know the dimensions of the
+buffers it operates over. In this example we hardcode the sizes 128 and 2048. If
+you don't want to do this, you can pass the dimensions in as parameters to the
+call.
+
+## Custom-call on GPU
+
+The GPU custom call framework is somewhat different than that on the CPU. Here
+is a CUDA example that does the same `A[i] = B[i % 128] + C[i]` computation as
+the CPU code above.
+
+```c++
+void do_it() { /* same implementation as above */ }
+
+__global__ void custom_call_kernel(const float* in0, const float* in1, float* out) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  out[idx] = in0[idx % 128] + in1[idx];
+}
+
+void do_custom_call(CUstream stream, void** buffers,
+                    const char* opaque, size_t opaque_len) {
+  const float* in0 = reinterpret_cast<const float*>(buffers[0]);
+  const float* in1 = reinterpret_cast<const float*>(buffers[1]);
+  float* out = reinterpret_cast<float*>(buffers[2]);
+
+  const int64 block_dim = 64;
+  const int64 grid_dim = 2048 / block_dim;
+  custom_call_kernel<<<grid_dim, block_dim,
+                       /*dynamic_shared_mem_bytes=*/0, stream>>>(in0, in1, out);
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(do_custom_call, "CUDA");
+```
+
+Notice first that the GPU custom call function *is still a function executed on
+the CPU*. Our `do_custom_call` CPU function is responsible for enqueueing work
+on the GPU. Here it launches a CUDA kernel, but it could also do something else,
+like call cublas.
+
+`buffers` is an array of pointers which lives on the host, and each element it
+contains points to device (i.e. GPU) memory. The parameters come first, followed
+by the output value. This is notably different from the CPU calling convention,
+which has two params, `ins` and `out`. The main reason we diverge is to make it
+possible to handle tuple-shaped inputs/outputs efficiently; see the section
+below.
+
+As in the CPU example, we've hardcoded the input and output buffer sizes into
+our custom call. However unlike in the CPU case, passing the buffer sizes in as
+operands to the custom call would not work well. Usually we need the buffer
+sizes available to us on the CPU; e.g. when launching a kernel, we need to know
+the block/grid dimensions to use. But if we were to pass the buffer sizes as
+operands to our custom call, their values would live in GPU memory. We'd then
+have to do an expensive synchronous device-to-host memcpy at the start of our
+operation just to read the sizes.
+
+To let you work around this, we provide the `opaque` parameter. You can set this
+to an arbitrary string of bytes when you create the custom call:
+
+```c++
+std::string opaque = "...";
+xla::CustomCall(&b, "do_custom_call", /*operands=*/{param0, param1},
+                /*output_shape=*/xla::ShapeUtil::MakeShape(xla::F32, {2048}),
+                opaque);
+```
+
+Since `xla::Shape` has a protocol buffer representation, you could store this
+serialized proto inside of `opaque` and deserialize it within your GPU
+custom-call. Note however that although `xla::ShapeProto` does not change
+frequently, it *does* change. Check the git log to see how it has changed in the
+past.
+
+## Passing tuples to custom-calls
+
+Consider the following custom-call.
+
+```c++
+using xla::ShapeUtil;
+Shape p0_shape = ShapeUtil::MakeTuple({
+    ShapeUtil::MakeShape(F32, {32}),
+    ShapeUtil::MakeTuple({
+        ShapeUtil::MakeShape(F32, {64}),
+        ShapeUtil::MakeShape(F32, {128}),
+    }),
+    ShapeUtil::MakeShape(F32, {256}),
+});
+xla::XlaOp p0 = xla::Parameter(0, p0_shape, "p0");
+
+Shape out_shape = ShapeUtil::MakeTuple({
+  ShapeUtil::MakeShape(F32, {512}),
+  ShapeUtil::MakeShape(F32, {1024}),
+});
+xla::CustomCall(&b, "do_custom_call", /*operands=*/{p0}, out_shape);
+```
+
+On both CPU and GPU, a tuple is represented in memory as an array of pointers.
+In C++-pseudocode, parameter 0 above is laid out as follows.
+
+```c++
+// In-memory layout of parameter 0 from custom-call above.  True on both CPU
+// and GPU.
+float* subbuf0 = new float[32];
+float* subbuf1 = new float[64];
+float* subbuf2 = new float[128];
+float* subbuf3 = new float[256];
+
+void* subtuple = new void*[2];
+(*subtuple)[0] = subbuf1;
+(*subtuple)[1] = subbuf2;
+
+void* p0 = new void*[3];
+(*p0)[0] = subbuf0;
+(*p0)[1] = subtuple;
+(*p0)[2] = subbuf3;
+```
+
+Although the in-memory representation of tuples is the same in CPU and GPU, they
+are handled differently in the CPU and GPU custom-call calling conventions.
+
+### Tuple outputs as temp buffers
+
+Tuple inputs to custom-calls are a convenience, but they aren't strictly
+necessary. If we didn't support tuple inputs to custom calls, you could always
+unpack the tuples using get-tuple-element before passing them to the custom
+call.
+
+On the other hand, tuple *outputs* do let you do things you couldn't otherwise.
+
+The obvious reason to have tuple outputs is, that's how a custom call (or any
+other XLA op) returns multiple independent arrays.
+
+But less obviously, a tuple output is also a way to give your custom call temp
+memory. Yes, an *output* can represent a temp buffer. Consider, an output buffer
+has the property that the op can write to it, and it can read from it after it's
+been written to. That's exactly what you want from a temp buffer.
+
+In the example above, suppose we wanted to use the `F32[1024]` as a temp buffer.
+Then we'd write the HLO just as above, and we'd simply never read tuple index 1
+of the custom call's output.
+
+### Tuples in CPU custom-calls
+
+In CPU code, we have a function `do_custom_call(void* out, const void** ins)`.
+`ins` is an array with just one element, which points to `param0`. The
+subbuffers of param0 are accessible by dereferencing that pointer.
+
+### Tuples in GPU custom-calls
+
+In GPU code, we have a function `do_custom_call(..., void** buffers, ...)`. In
+this case `buffers` is a host array of *nine* device pointers, one for each
+nested buffer. To generate the flat list, we iterate over the parameters and
+output, and then do preorder traversal of their shapes. Concretely:
+
+```c++
+// Layout of `buffers` parameter to GPU custom call function for custom-call
+// above.
+buffers[0] == param0
+buffers[1] == subbuf0 or null
+buffers[2] == subtuple or null
+buffers[3] == subbuf1 or null
+buffers[4] == subbuf2 or null
+buffers[5] == subbuf3 or null
+buffers[6] == output_tuple
+buffers[7] == output_subbuf0
+buffers[8] == output_subbuf1
+```
+
+The `or null` part is significant. A sub-buffer of a tuple will be non-null in
+the `buffers` list if XLA is able to statically analyze the program and figure
+out the address of the sub-buffer. This is usually the case, but may not be in
+programs with control flow and/or `select` ops over tuples.
+
+A correct custom-call implementation that accepts a tuple as input must always
+handle null sub-buffers, by dereferencing the root tuple.
+
+The rule is reversed for output buffers. The output sub-buffers will always be
+populated, but it's up to the op to populate the root tuple at the end.
+
+See the following code.
+
+```c++
+void do_custom_call(CUstream stream, void** buffers, const char* opaque,
+                    size_t opaque_len) {
+  bool needs_sync = false;
+  const float* subbuf0 = reinterpret_cast<const float*>(buffers[1]);
+  if (subbuf0 == nullptr) {
+    needs_sync = true;
+    cudaMemcpyAsync(&subbuf0, buffers[0], sizeof(void*),
+                    cudaMemcpyDeviceToHost, stream);
+  }
+  const void** subtuple = reinterpret_cast<const void**>(buffers[2]);
+  if (subtuple == nullptr) {
+    needs_sync = true;
+    cudaMemcpyAsync(&subtuple, reinterpret_cast<void**>(buffers[0]) + 1, ...);
+  }
+
+  // ... similarly for other params ...
+
+  // Wait for copies enqueued above to complete.
+  if (needs_sync) {
+    cudaStreamSynchronize(stream);
+  }
+  needs_sync = false;
+
+  // Now that we have `subtuple`, we can get subbuf1 and subbuf2.
+  float* subbuf1 = reinterpret_cast<float*>(buffers[3]);
+  if (subbuf1 == nullptr) {
+    needs_sync = true;
+    cudaMemcpyAsync(&subbuf1, subtuple, ...);
+  }
+  float* subbuf2 = reinterpret_cast<float*>(buffers[4]);
+  if (subbuf2 == nullptr) {
+    needs_sync = true;
+    cudaMemcpyAsync(&subbuf2, subtuple + 1, ...);
+  }
+
+  // Wait for copies enqueued above to complete.
+  if (needs_sync) {
+    cudaStreamSynchronize(stream);
+  }
+
+  // ... actually run the kernel ...
+
+  // Fill the output tuple.
+  void* outputs[2] = {buffers[7], buffers[8]};
+  cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice,
+                  stream);
+
+  // A cudaStreamSynchronize call is technically required here, because
+  // cudaMemcpyAsync may continue running after `outputs` goes out of scope.
+  // This synchronization is expensive.  One way you could work around this
+  // problem would be to make `outputs` a global variable and protect
+  // do_custom_call by a mutex.
+}
+```
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index 632d006..4ce7703 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -595,8 +595,7 @@
 
 | Arguments             | Type                  | Semantics                    |
 | --------------------- | --------------------- | ---------------------------- |
-| `branch_index`        | `XlaOp`               | Scalar of type `PRED` or     |
-:                       :                       : `S32`                        :
+| `branch_index`        | `XlaOp`               | Scalar of type `S32`         |
 | `branch_computations` | sequence of N         | XlaComputations of type $$   |
 :                       : `XlaComputation`      : T_0 \to S , T_1 \to S , ..., :
 :                       :                       : T_{N-1} \to S $$             :
@@ -604,9 +603,8 @@
 :                       :                       : T_1 , ..., T_{N-1} $$        :
 
 Executes `branch_computations[branch_index]`, and returns the result. If
-`branch_index` is a `PRED`, then the `true` branch is in position 0 and the
-`false` branch is in position 1. If `branch_index` is an `S32` which is < 0
-or >= N, then `branch_computations[N-1]` is executed as the default branch.
+`branch_index` is an `S32` which is < 0 or >= N, then `branch_computations[N-1]`
+is executed as the default branch.
 
 Each `branch_computations[b]` must take in a single argument of type `T_b` and
 will be invoked with `branch_operands[b]` which must be of the same type. The
@@ -928,11 +926,11 @@
 | matrix [m x k] `dot`    | matrix [m x n]        | matrix-matrix           |
 : matrix [k x n]          :                       : multiplication          :
 
-The operation performs sum of products over the last dimension of `lhs` and the
-one-before-last dimension of `rhs`. These are the "contracted" dimensions. The
-contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
-it can be used to perform dot products between vectors, vector/matrix
-multiplications or matrix/matrix multiplications.
+The operation performs sum of products over the second dimension of `lhs` (or
+the first if it has rank 1) and the first dimension of `rhs`. These are the
+"contracted" dimensions. The contracted dimensions of `lhs` and `rhs` must be of
+the same size. In practice, it can be used to perform dot products between
+vectors, vector/matrix multiplications or matrix/matrix multiplications.
 
 ## DotGeneral
 
@@ -2537,43 +2535,58 @@
 See also
 [`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
-There are two versions of the Sort instruction: a single-operand and a
-multi-operand version.
+<b>`Sort(operands, comparator, dimension, is_stable)`</b>
 
-<b>`Sort(operand, dimension)`</b>
+Arguments    | Type                | Semantics
+------------ | ------------------- | --------------------
+`operands`   | `ArraySlice<XlaOp>` | The operands to sort.
+`comparator` | `XlaComputation`    | The comparator computation to use.
+`dimension`  | `int64`             | The dimension along which to sort.
+`is_stable`  | `bool`              | Whether stable sorting should be used.
 
-Arguments   | Type    | Semantics
------------ | ------- | --------------------
-`operand`   | `XlaOp` | The operand to sort.
-`dimension` | `int64` | The dimension along which to sort.
+If only one operand is provided:
 
-Sorts the elements in the operand in ascending order along the provided
-dimension. For example, for a rank-2 (matrix) operand, a `dimension` value of 0
-will sort each column independently, and a `dimension` value of 1 will sort each
-row independently. If the operand's elements have floating point type, and the
-operand contains NaN elements, the order of elements in the output is
-implementation-defined.
+* If the operand is a rank-1 tensor (an array), the result is a sorted array.
+  If you want to sort the array into ascending order, the comparator should
+  perform a less-than comparison. Formally, after the array is sorted, it holds
+  for all index positions `i, j` with `i < j` that either
+  `comparator(value[i], value[j]) = comparator(value[j], value[i]) = false` or
+  `comparator(value[i], value[j]) = true`.
 
-<b>`Sort(keys, values, ... values, dimension)`</b>
+* If the operand has higher rank, the operand is sorted along the provided
+  dimension. For example, for a rank-2 tensor (a matrix), a dimension value of
+  `0` will independently sort every column, and a dimension value of `1` will
+  independently sort each row. If no dimension number is provided, then the last
+  dimension is chosen by default. For the dimension which is sorted, the same
+  sorting order applies as in the rank-1 case.
 
-Sorts both the key and one or more value operands. The keys are sorted as in the
-single-operand version. Each of the values inputs is sorted according to the
-order of the corresponding keys. For example, if the three inputs are `keys =
-[3, 1]`, `values0 = [42, 50]`, `values1 = [-3.0, 1.1]`, then the output of the
-sort is the tuple `{[1, 3], [50, 42], [1.1, -3.0]}`.
+If `n > 1` operands are provided:
 
-The sort is not guaranteed to be stable, that is, if the keys array contains
-duplicates, the order of values corresponding to these keys may not be
-preserved.
+* All `n` operands must be tensors with the same dimensions. The element types
+  of the tensors may be different.
 
-Arguments   | Type                   | Semantics
------------ | ---------------------- | ----------------------------------
-`keys`      | `XlaOp`                | The sort keys.
-`values`    | Sequence of N `XlaOp`s | The values to sort.
-`dimension` | `int64`                | The dimension along which to sort.
+* All operands are sorted together, not individually. Conceptually the operands
+  are treated as a tuple. When checking whether the elements of each operand at
+  index positions `i` and `j` need to be swapped, the comparator is called with
+  `2 * n` scalar parameters, where parameter `2 * k` corresponds to the value at
+  position `i` from the `k-th` operand, and parameter `2 * k + 1` corresponds to
+  the value at position `j` from the `k-th` operand. Usually, the comparator
+  would thus compare parameters `2 * k` and `2 * k + 1` with each other and
+  possibly use other parameter pairs as tie breakers.
 
-The `keys` and each of the `values` inputs must have the same dimensions, but
-may have different element types.
+* The result is a tuple that consists of the operands in sorted order (along
+  the provided dimension, as above). The `i-th` element of the tuple corresponds
+  to the `i-th` operand of Sort.
+
+For example, if there are three operands `operand0 = [3, 1]`,
+`operand1 = [42, 50]`, `operand2 = [-3.0, 1.1]`, and the comparator compares
+only the values of `operand0` with less-than, then the output of the sort is the
+tuple `([1, 3], [50, 42], [1.1, -3.0])`.
+
+If `is_stable` is set to true, the sort is guaranteed to be stable, that is, if
+there are elements which are considered to be equal by the comparator, the
+relative order of the equal values is preserved. By default, `is_stable` is set
+to false.
 
 ## Transpose
 
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 854607d..23eaf31 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -293,8 +293,9 @@
     return InvalidArgument("LiteralProto has no shape");
   }
   Shape shape(proto.shape());
-  if (ShapeUtil::HasPrimitiveType(shape, OPAQUE)) {
-    return InvalidArgument("Literal shape cannot include OPAQUE sub-shape");
+  if (ShapeUtil::HasPrimitiveType(shape, OPAQUE_TYPE)) {
+    return InvalidArgument(
+        "Literal shape cannot include OPAQUE_TYPE sub-shape");
   }
   if (!LayoutUtil::HasLayout(shape)) {
     return InvalidArgument("LiteralProto has no layout");
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 26b029c..3234814 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -136,7 +136,7 @@
       return LiteralUtil::CreateR0<bool>(false);
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 0";
-    case OPAQUE:
+    case OPAQUE_TYPE:
       LOG(FATAL) << "opaque element type cannot take on value of 0";
     default:
       LOG(FATAL) << "Unhandled primitive type " << primitive_type;
@@ -176,7 +176,7 @@
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
-    case OPAQUE:
+    case OPAQUE_TYPE:
       LOG(FATAL) << "opaque element type cannot take on value of 1";
     default:
       LOG(FATAL) << "Unhandled primitive type " << primitive_type;
@@ -220,7 +220,7 @@
           static_cast<bfloat16>(-std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no minimum value";
-    case OPAQUE:
+    case OPAQUE_TYPE:
       LOG(FATAL) << "opaque element type has no minimum value";
     default:
       LOG(FATAL) << "Unhandled primitive type " << primitive_type;
@@ -260,7 +260,7 @@
           static_cast<bfloat16>(std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no maximum value";
-    case OPAQUE:
+    case OPAQUE_TYPE:
       LOG(FATAL) << "opaque element type has no maximum value";
     default:
       LOG(FATAL) << "Unhandled primitive type " << primitive_type;
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 1eedddf..2143d1d 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -89,8 +89,8 @@
     case TUPLE:
       LOG(FATAL) << "TUPLE is an invalid type for BitWidth";
 
-    case OPAQUE:
-      LOG(FATAL) << "OPAQUE is an invalid type for BitWidth";
+    case OPAQUE_TYPE:
+      LOG(FATAL) << "OPAQUE_TYPE is an invalid type for BitWidth";
 
     default:
       LOG(FATAL) << "Unhandled primitive type " << type;
@@ -126,17 +126,22 @@
 
 bool IsArrayType(PrimitiveType primitive_type) {
   return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
-         primitive_type != OPAQUE && primitive_type != TOKEN;
+         primitive_type != OPAQUE_TYPE && primitive_type != TOKEN;
 }
 
 // Class to memoize the computation of
 //   absl::AsciiStrToLower(PrimitiveType_Name(p))
 // for all PrimitiveType values "p"
+//
+// xla::OPAQUE_TYPE canonically maps to the string "opaque" -- the only reason
+// it's called OPAQUE_TYPE is to avoid clashing with a windows.h macro.
 class PrimitiveTypeNameGenerator {
  public:
   PrimitiveTypeNameGenerator() {
     for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
-      if (PrimitiveType_IsValid(i)) {
+      if (i == static_cast<int>(OPAQUE_TYPE)) {
+        lowercase_name_[i] = "opaque";
+      } else if (PrimitiveType_IsValid(i)) {
         lowercase_name_[i] = absl::AsciiStrToLower(
             PrimitiveType_Name(static_cast<PrimitiveType>(i)));
       }
@@ -158,6 +163,9 @@
 namespace {
 
 // Returns a map from lower-case primitive type name to primitive type.
+//
+// Due to Postel's Law considerations, both "opaque" and "opaque_type" map to
+// the xla::OPAQUE_TYPE enumerator.
 const std::unordered_map<string, PrimitiveType>& GetPrimitiveTypeStringMap() {
   static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
     static auto* map = new std::unordered_map<string, PrimitiveType>;
@@ -167,6 +175,7 @@
         (*map)[LowercasePrimitiveTypeName(value)] = value;
       }
     }
+    (*map)["opaque"] = OPAQUE_TYPE;
     return map;
   }();
   return *name_to_type;
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index a5d1319..d7b69dc 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -5,6 +5,7 @@
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins")
 load("//tensorflow:tensorflow.bzl", "tf_pybind_extension")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 py_library(
     name = "xla_client",
@@ -27,6 +28,7 @@
     name = "xla_client_test",
     srcs = ["xla_client_test.py"],
     main = "xla_client_test.py",
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_oss"],
     deps = [
@@ -38,6 +40,16 @@
 )
 
 cc_library(
+    name = "worker_thread",
+    srcs = ["worker_thread.cc"],
+    hdrs = ["worker_thread.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
     name = "types",
     srcs = ["types.cc"],
     hdrs = ["types.h"],
@@ -55,8 +67,8 @@
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@pybind11",
@@ -70,6 +82,7 @@
     copts = [
         "-fexceptions",
         "-fno-strict-aliasing",
+        "-Wno-c++98-c++11-compat",
     ],
     features = ["-use_header_modules"],
     deps = [
@@ -90,6 +103,34 @@
     ],
 )
 
+cc_library(
+    name = "shared_device_buffer",
+    srcs = ["shared_device_buffer.cc"],
+    hdrs = ["shared_device_buffer.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:transfer_manager",
+        "//tensorflow/stream_executor:device_memory_allocator",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "shared_device_buffer_test",
+    srcs = ["shared_device_buffer_test.cc"],
+    deps = [
+        ":shared_device_buffer",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_pybind_extension(
     name = "xla_extension",
     srcs = [
@@ -105,8 +146,11 @@
     features = ["-use_header_modules"],
     module_name = "xla_extension",
     deps = [
+        ":shared_device_buffer",
         ":types",
+        ":worker_thread",
         ":xrt",
+        "@com_google_absl//absl/hash",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -129,17 +173,21 @@
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:comparators",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:qr",
         "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:svd",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/stream_executor:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/lib:traceme",
         # Do NOT remove this dependency. The XLA Python extension must not
diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc
index 4f9d2a4..40d737a 100644
--- a/tensorflow/compiler/xla/python/local_client.cc
+++ b/tensorflow/compiler/xla/python/local_client.cc
@@ -13,6 +13,51 @@
 limitations under the License.
 ==============================================================================*/
 
+// Implementation notes:
+//
+// Asynchronous execution:
+// -----------------------
+//
+// If 'asynchronous' is set when constructing the client, computations and
+// host-to-device transfers do not block the host waiting for the operation to
+// complete but instead return control to the host immediately. This allows
+// Python logic to overlap with device-side computation.
+//
+// For a good user experience, we must be careful only to enqueue operations
+// that are unlikely to fail; as a rule, error checking must be done eagerly
+// before returning control to the client.
+//
+// Multi-stream execution:
+// -----------------------
+//
+// On certain platforms (e.g., TPU), we use a multistream execution design,
+// where different Streams are used for host-to-device transfers,
+// device-to-host transfers, and compute. This allows us to overlap transfers on
+// and off the device with computation.
+//
+// Synchronization between streams occurs via BufferDefinitionEvents that
+// describe when the contents of a logical buffer are known to be valid on
+// a particular stream.
+//
+// Synchronous vs asynchronous deallocation:
+// -----------------------------------------
+//
+// In asynchronous deallocation mode (currently only enabled on TPU), the client
+// need only keep buffers alive from its perspective until all operations that
+// touch those buffers have been enqueued.
+// The allocator and lower-level runtime are responsible for keeping buffers
+// alive (if that is needed) from the perspective of the device until any
+// device-side work actually completes. The client's use of the device allocator
+// thereby corresponds to a view of the tail of the compute stream instead of
+// its head.
+//
+// In synchronous deallocation mode the client is responsible for keeping
+// buffers alive until all device-side activity that consumes those buffers has
+// ceased. This is the case for CPU since HostExecutor performs allocation
+// and deallocation eagerly. In this mode, the client's use of the device
+// allocator is logically synchronized to the head of the compute stream, not
+// the tail.
+
 #include "tensorflow/compiler/xla/python/local_client.h"
 
 #include <memory>
@@ -23,6 +68,7 @@
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "absl/time/time.h"
 #include "include/pybind11/pybind11.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -30,9 +76,9 @@
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/python/shared_device_buffer.h"
 #include "tensorflow/compiler/xla/python/types.h"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -41,7 +87,6 @@
 #include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace xla {
-namespace xla_python {
 
 namespace py = pybind11;
 
@@ -56,41 +101,105 @@
         "Argument to RegisterCpuCustomCallTargetRegistry was not a "
         "xla._CPU_CUSTOM_CALL_TARGET capsule.");
   }
-  cpu::CustomCallTargetRegistry::Global()->Register(
-      std::string(fn_name.begin(), fn_name.end()), static_cast<void*>(capsule));
+  CustomCallTargetRegistry::Global()->Register(
+      fn_name, static_cast<void*>(capsule), "Host");
   return Status::OK();
 }
 
+std::shared_ptr<py::object> PythonRefManager::ManageReference(
+    const py::object& object) {
+  auto deleter = [this](py::object* x) {
+    {
+      absl::MutexLock lock(&mu_);
+      python_garbage_.push_back(std::move(*x));
+    }
+    delete x;
+  };
+  return std::shared_ptr<py::object>(new py::object(object), deleter);
+}
+
+void PythonRefManager::CollectGarbage() {
+  // TODO(phawkins): ideally we would assert that the GIL is held, but there is
+  // no API to do this across all Python versions.
+  absl::MutexLock lock(&mu_);
+  python_garbage_.clear();
+}
+
+Device::Device(se::StreamExecutor* executor, bool use_multiple_streams,
+               bool synchronous_deallocation, bool asynchronous)
+    : use_multiple_streams_(use_multiple_streams),
+      synchronous_deallocation_(synchronous_deallocation),
+      asynchronous_(asynchronous) {
+  compute_stream_ = std::make_shared<se::Stream>(executor);
+  compute_stream_->Init();
+  if (use_multiple_streams) {
+    host_to_device_stream_ = std::make_shared<se::Stream>(executor);
+    device_to_host_stream_ = std::make_shared<se::Stream>(executor);
+    callback_stream_ = std::make_shared<se::Stream>(executor);
+    host_to_device_stream_->Init();
+    device_to_host_stream_->Init();
+    callback_stream_->Init();
+  } else {
+    callback_stream_ = host_to_device_stream_ = device_to_host_stream_ =
+        compute_stream_;
+  }
+  worker_thread_ = absl::make_unique<WorkerThread>(tensorflow::Env::Default(),
+                                                   "py_xla_execute");
+}
+
+Device::~Device() { compute_stream_->parent()->SynchronizeAllActivity(); }
+
+void Device::ThenExecuteOnWorkerThread(se::Stream* stream,
+                                       std::function<void()> callback) const {
+  stream->ThenDoHostCallback(
+      [this, callback]() { worker_thread_->Schedule(std::move(callback)); });
+}
+
 StatusOr<std::unique_ptr<PyLocalClient>> PyLocalClient::Get(
-    const std::string& platform_name) {
+    const std::string& platform_name, const std::string& xla_platform_name,
+    bool asynchronous) {
   TF_ASSIGN_OR_RETURN(se::Platform * platform,
-                      PlatformUtil::GetPlatform(platform_name));
+                      PlatformUtil::GetPlatform(xla_platform_name));
   if (platform->VisibleDeviceCount() <= 0) {
-    return InvalidArgument("Platform %s has no visible devices.",
-                           platform_name);
+    return InvalidArgument("Platform %s (%s) has no visible devices.",
+                           platform_name, xla_platform_name);
   }
   LocalClientOptions options;
   options.set_platform(platform);
   TF_ASSIGN_OR_RETURN(LocalClient * client,
                       ClientLibrary::GetOrCreateLocalClient(options));
-  return absl::make_unique<PyLocalClient>(client);
+  return absl::make_unique<PyLocalClient>(platform_name, client, asynchronous);
 }
 
-PyLocalClient::PyLocalClient(LocalClient* client)
-    : client_(client),
+PyLocalClient::PyLocalClient(std::string platform_name, LocalClient* client,
+                             bool asynchronous)
+    : platform_name_(std::move(platform_name)),
+      client_(client),
       h2d_transfer_pool_(tensorflow::Env::Default(), "py_xla_h2d_transfer",
-                         client->device_count()),
-      execute_pool_(tensorflow::Env::Default(), "py_xla_execute",
-                    client->device_count()) {}
+                         client->device_count()) {
+  devices_.reserve(client->device_count());
+  // TODO(phawkins): enable multistream mode on GPU too.
+  bool use_multiple_streams = (platform_name_ == "tpu");
+  bool synchronous_deallocation = !use_multiple_streams;
+  for (int i = 0; i < client->device_count(); ++i) {
+    se::StreamExecutor* executor =
+        client_->backend().stream_executor(i).ValueOrDie();
+    devices_.push_back(absl::make_unique<Device>(executor, use_multiple_streams,
+                                                 synchronous_deallocation,
+                                                 asynchronous));
+  }
+}
 
 Status PyLocalClient::TransferToInfeed(const LiteralSlice& literal,
                                        int device_ordinal) {
+  py_ref_manager().CollectGarbage();
   py::gil_scoped_release gil_release;
   return client_->TransferToInfeedLocal(literal, device_ordinal);
 }
 
 StatusOr<pybind11::object> PyLocalClient::TransferFromOutfeed(
     const Shape& shape, int device_ordinal) {
+  py_ref_manager().CollectGarbage();
   Literal literal;
   {
     py::gil_scoped_release gil_release;
@@ -100,10 +209,10 @@
   return LiteralToPython(absl::make_unique<Literal>(std::move(literal)));
 }
 
-static StatusOr<LocalShapedBuffer> TransferHostToDeviceAsync(
+static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync(
     const PythonBufferTree& tree, int device_ordinal, PyLocalClient* client,
-    se::Stream* stream) {
-  DeviceMemoryAllocator* allocator =
+    const Device& device) {
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -112,8 +221,8 @@
   TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer,
                       transfer_manager->AllocateScopedShapedBuffer(
                           shape, allocator, device_ordinal));
-  TF_RETURN_IF_ERROR(
-      transfer_manager->WriteTupleIndexTablesAsync(stream, buffer));
+  TF_RETURN_IF_ERROR(transfer_manager->WriteTupleIndexTablesAsync(
+      device.host_to_device_stream(), buffer));
 
   auto it = tree.leaves.begin();
   for (const ShapeUtil::IndexedShape& indexed_shape :
@@ -124,70 +233,93 @@
         transfer_manager->HostShapeToDeviceShape(indexed_shape.shape),
         client->client()->platform(), device_ordinal);
     leaf.buffers().CopySubtreeFrom(buffer.buffers(), indexed_shape.index, {});
-    TF_RETURN_IF_ERROR(
-        transfer_manager->TransferLiteralToDeviceAsync(stream, *it, leaf));
+    if (device.use_multiple_streams() &&
+        !transfer_manager->CanShapedBufferBeAccessedNow(
+            device.host_to_device_stream()->parent(), leaf)) {
+      device.host_to_device_stream()->ThenWaitFor(device.compute_stream());
+    }
+    TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDeviceAsync(
+        device.host_to_device_stream(), *it, leaf));
     ++it;
   }
-  return LocalShapedBuffer(std::move(buffer), client);
+  std::shared_ptr<BufferDefinitionEvent> definition_event;
+  if (device.use_multiple_streams()) {
+    definition_event = std::make_shared<BufferDefinitionEvent>(
+        device.host_to_device_stream()->parent());
+    definition_event->RecordOnStream(device.host_to_device_stream());
+  }
+  std::shared_ptr<PySharedDeviceBuffer> device_buffer =
+      PySharedDeviceBuffer::FromScopedShapedBuffer(std::move(buffer),
+                                                   definition_event);
+  if (device.synchronous_deallocation()) {
+    device.ThenReleaseOnWorkerThread(device.host_to_device_stream(),
+                                     device_buffer);
+  }
+  return PyLocalBuffer(shape, std::move(device_buffer), client);
 }
 
 /* static */
-StatusOr<LocalShapedBuffer> LocalShapedBuffer::FromPython(
-    const py::object& argument, PyLocalClient* client, int device_ordinal) {
-  tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::FromPython");
+StatusOr<PyLocalBuffer> PyLocalBuffer::FromPython(const py::object& argument,
+                                                  PyLocalClient* client,
+                                                  int device_ordinal) {
+  tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPython");
   TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument));
 
+  client->py_ref_manager().CollectGarbage();
+
+  // Take a reference to the buffer to ensure that the inputs in host memory
+  // remain live until the transfer is complete.
+  auto py_buffer_ref = client->py_ref_manager().ManageReference(argument);
+
   // We are done manipulating Python objects; release the GIL.
   py::gil_scoped_release gil_release;
-  VLOG(1) << "LocalShapedBuffer::FromPython: shape: " << tree.shape.ToString()
+  VLOG(1) << "PyLocalBuffer::FromPython: shape: " << tree.shape.ToString()
           << " device ordinal: " << device_ordinal;
 
+  const Device& device = client->device(device_ordinal);
   TF_ASSIGN_OR_RETURN(
-      StreamPool::Ptr stream,
-      client->client()->mutable_backend()->BorrowStream(device_ordinal));
-  TF_ASSIGN_OR_RETURN(
-      LocalShapedBuffer buffer,
-      TransferHostToDeviceAsync(tree, device_ordinal, client, stream.get()));
-  stream->BlockHostUntilDone();
+      PyLocalBuffer buffer,
+      TransferHostToDeviceAsync(tree, device_ordinal, client, device));
+
+  device.ThenRelease(device.host_to_device_stream(), std::move(py_buffer_ref));
+  if (!device.asynchronous()) {
+    device.host_to_device_stream()->BlockHostUntilDone();
+  }
   return buffer;
 }
 
-/*static */ StatusOr<std::vector<LocalShapedBuffer>>
-LocalShapedBuffer::FromPythonValues(
+/*static */ StatusOr<std::vector<PyLocalBuffer>>
+PyLocalBuffer::FromPythonValues(
     const std::vector<std::pair<py::object, int>>& arguments,
     PyLocalClient* client) {
-  tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::FromPythonValues");
+  tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPythonValues");
   int num_arguments = static_cast<int>(arguments.size());
-  std::vector<LocalShapedBuffer> outputs(num_arguments);
+  std::vector<PyLocalBuffer> outputs(num_arguments);
   if (num_arguments == 0) {
     return outputs;
   }
 
   struct H2DTransfer {
     PythonBufferTree tree;
-    StreamPool::Ptr stream;
-    StatusOr<LocalShapedBuffer> buffer;
+    StatusOr<PyLocalBuffer> buffer;
+    std::shared_ptr<py::object> py_buffer_ref;
   };
 
   std::vector<H2DTransfer> transfers(num_arguments);
   for (int i = 0; i < num_arguments; ++i) {
     TF_ASSIGN_OR_RETURN(transfers[i].tree,
                         GetPythonBufferTree(arguments[i].first));
+    transfers[i].py_buffer_ref =
+        client->py_ref_manager().ManageReference(arguments[i].first);
   }
+  client->py_ref_manager().CollectGarbage();
   // We are done manipulating Python objects; release the GIL.
   py::gil_scoped_release gil_release;
 
-  for (int i = 0; i < num_arguments; ++i) {
-    int device_ordinal = arguments[i].second;
-    TF_ASSIGN_OR_RETURN(
-        transfers[i].stream,
-        client->client()->mutable_backend()->BorrowStream(device_ordinal));
-  }
-
-  auto transfer_h2d = [&](int i) -> StatusOr<LocalShapedBuffer> {
+  auto transfer_h2d = [&](int i) -> StatusOr<PyLocalBuffer> {
     int device_ordinal = arguments[i].second;
     return TransferHostToDeviceAsync(transfers[i].tree, device_ordinal, client,
-                                     transfers[i].stream.get());
+                                     client->device(device_ordinal));
   };
 
   // We perform the transfers on a thread pool in case XLA needs to do any
@@ -195,106 +327,141 @@
   if (num_arguments == 1) {
     transfers[0].buffer = transfer_h2d(0);
   } else {
-    absl::BlockingCounter counter(num_arguments - 1);
-    for (int i = 1; i < num_arguments; ++i) {
+    absl::BlockingCounter counter(num_arguments);
+    for (int i = 0; i < num_arguments; ++i) {
       client->h2d_transfer_pool()->Schedule([&, i]() {
         transfers[i].buffer = transfer_h2d(i);
         counter.DecrementCount();
       });
     }
-    // Perform the first transfer on the main thread.
-    transfers[0].buffer = transfer_h2d(0);
     counter.Wait();
   }
 
-  // First, wait for all transfers to complete. We wait for all to complete
-  // since currently we maintain the invariant that the device's view of the
-  // state matches the host's view of the state. Returning early would mean that
-  // we might deallocate device-side memory before a transfer completes, which
-  // violates that invariant.
+  // Release our references once the transfers have completed.
   for (int i = 0; i < num_arguments; ++i) {
-    transfers[i].stream->BlockHostUntilDone();
+    int device_ordinal = arguments[i].second;
+    const Device& device = client->device(device_ordinal);
+    device.ThenRelease(device.host_to_device_stream(),
+                       std::move(transfers[i].py_buffer_ref));
+    if (!device.asynchronous()) {
+      device.host_to_device_stream()->BlockHostUntilDone();
+    }
   }
+
   for (int i = 0; i < num_arguments; ++i) {
     TF_ASSIGN_OR_RETURN(outputs[i], std::move(transfers[i].buffer));
   }
   return outputs;
 }
 
-LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer,
-                                     PyLocalClient* client)
-    : shaped_buffer_(std::move(shaped_buffer)), client_(client) {}
+/* static */ StatusOr<PyLocalBuffer> PyLocalBuffer::MakeTuple(
+    const std::vector<PyLocalBuffer> buffers, PyLocalClient* client,
+    int device_ordinal) {
+  std::vector<xla::Shape> host_shapes;
+  std::vector<std::shared_ptr<PySharedDeviceBuffer>> device_buffers;
+  host_shapes.reserve(buffers.size());
+  device_buffers.reserve(buffers.size());
+  for (const PyLocalBuffer& buffer : buffers) {
+    TF_RET_CHECK(buffer.device_buffer()->device_memory().device_ordinal() ==
+                 device_ordinal);
+    host_shapes.push_back(buffer.on_host_shape());
+    device_buffers.push_back(buffer.device_buffer());
+  }
+  se::DeviceMemoryAllocator* allocator =
+      client->client()->backend().memory_allocator();
+  TransferManager* transfer_manager =
+      client->client()->backend().transfer_manager();
+  const Device& device = client->device(device_ordinal);
+  std::shared_ptr<BufferDefinitionEvent> definition_event;
+  if (device.use_multiple_streams()) {
+    definition_event = std::make_shared<BufferDefinitionEvent>(
+        device.host_to_device_stream()->parent());
+  }
+  TF_ASSIGN_OR_RETURN(std::shared_ptr<PySharedDeviceBuffer> tuple_buffer,
+                      PySharedDeviceBuffer::MakeTuple(
+                          device_buffers, transfer_manager, allocator,
+                          device_ordinal, definition_event));
+  PyLocalBuffer buffer(ShapeUtil::MakeTupleShape(host_shapes), tuple_buffer,
+                       client);
 
-const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
-  return &shaped_buffer_.value();
+  // TODO(phawkins): extend TransferManager so we do not need to form a full
+  // ShapedBuffer just to write the root tuple index table.
+  ShapedBuffer shaped_buffer = buffer.AsShapedBuffer();
+  if (device.use_multiple_streams() &&
+      !transfer_manager->CanShapedBufferBeAccessedNow(
+          device.host_to_device_stream()->parent(), shaped_buffer)) {
+    // Wait for the compute stream so that memory allocations are synchronized.
+    device.host_to_device_stream()->ThenWaitFor(device.compute_stream());
+  }
+  transfer_manager->WriteRootTupleIndexTable(device.host_to_device_stream(),
+                                             shaped_buffer);
+  if (definition_event) {
+    definition_event->RecordOnStream(device.host_to_device_stream());
+  }
+
+  if (device.synchronous_deallocation()) {
+    device.ThenReleaseOnWorkerThread(device.host_to_device_stream(),
+                                     std::move(tuple_buffer));
+  }
+  if (!device.asynchronous()) {
+    device.host_to_device_stream()->BlockHostUntilDone();
+  }
+
+  return buffer;
 }
 
-ScopedShapedBuffer LocalShapedBuffer::Release() {
-  ScopedShapedBuffer result = std::move(*shaped_buffer_);
-  shaped_buffer_ = absl::nullopt;
-  return result;
-}
+PyLocalBuffer::PyLocalBuffer(
+    Shape on_host_shape, std::shared_ptr<PySharedDeviceBuffer> device_buffer,
+    PyLocalClient* client)
+    : on_host_shape_(std::move(on_host_shape)),
+      device_buffer_(std::move(device_buffer)),
+      client_(client) {}
 
-const Shape& LocalShapedBuffer::shape() const {
-  return shaped_buffer()->on_device_shape();
-}
-
-StatusOr<py::object> LocalShapedBuffer::ToPython() const {
-  tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::ToPython");
-  auto literal = absl::make_unique<Literal>();
+StatusOr<py::object> PyLocalBuffer::ToPython() const {
+  tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToPython");
+  auto literal = absl::make_unique<Literal>(on_host_shape());
+  client_->py_ref_manager().CollectGarbage();
   {
     py::gil_scoped_release gil_release;
-    TF_ASSIGN_OR_RETURN(
-        *literal, client_->client()->ShapedBufferToLiteral(*shaped_buffer()));
+    se::Stream* stream = client_->device(device_buffer_->device_ordinal())
+                             .device_to_host_stream();
+    WaitForBufferDefinitionEventsOnStream(*device_buffer_, stream);
+    absl::Notification done;
+    Status status;
+    client_->client()->backend().transfer_manager()->TransferLiteralFromDevice(
+        stream, AsShapedBuffer(), *literal,
+        [&](Status done_status) { status = done_status; done.Notify(); });
+    done.WaitForNotification();
+    // Report a failed transfer instead of converting the literal anyway.
+    TF_RETURN_IF_ERROR(status);
   }
   return LiteralToPython(std::move(literal));
 }
 
-StatusOr<std::vector<LocalShapedBuffer>> LocalShapedBuffer::DestructureTuple() {
-  tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::DestructureTuple");
-  const Shape tuple_shape = shape();
+ShapedBuffer PyLocalBuffer::AsShapedBuffer() const {
+  return device_buffer_->AsShapedBuffer(on_host_shape_);
+}
 
-  if (!tuple_shape.IsTuple()) {
+StatusOr<std::vector<PyLocalBuffer>> PyLocalBuffer::DestructureTuple() {
+  tensorflow::profiler::TraceMe traceme("PyLocalBuffer::DestructureTuple");
+  if (!on_host_shape().IsTuple()) {
     return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "Attempted to destructure a PyLocalBuffer that did not have a tuple "
         "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
+        ShapeUtil::HumanString(on_host_shape()));
   }
-
-  DeviceMemoryAllocator* allocator = shaped_buffer()->memory_allocator();
-  ScopedShapedBuffer tuple_buffer = Release();
-
-  // Extract some metadata we use to construct scoped buffers.
-  const se::Platform* platform = tuple_buffer.platform();
-  int device_ordinal = tuple_buffer.device_ordinal();
-
-  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
-  std::vector<LocalShapedBuffer> results;
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    // Create a shaped buffer for this destructured tuple element.
-    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
-    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
-    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
-
-    ShapeUtil::ForEachSubshape(
-        subshape, [&](const Shape& s, const ShapeIndex& index) {
-          ShapeIndex original(index);
-          original.push_front(i);
-          se::DeviceMemoryBase* device_memory =
-              shape_tree.mutable_element(original);
-          shaped_buffer.set_buffer(*device_memory, index);
-          *device_memory = se::DeviceMemoryBase();
-        });
-
-    VLOG(3) << "Completed tuple element: " << i;
-    results.push_back(LocalShapedBuffer(
-        ScopedShapedBuffer(std::move(shaped_buffer), allocator), client_));
+  const int64 num_children = ShapeUtil::TupleElementCount(on_host_shape());
+  std::vector<PyLocalBuffer> results;
+  results.reserve(num_children);
+  for (int64 i = 0; i < num_children; ++i) {
+    results.push_back(PyLocalBuffer(on_host_shape().tuple_shapes(i),
+                                    device_buffer_->children().at(i), client_));
   }
   return results;
 }
 
 PyLocalExecutable::PyLocalExecutable(
-    std::unique_ptr<LocalExecutable> executable,
+    std::shared_ptr<LocalExecutable> executable,
     DeviceAssignment device_assignment, PyLocalClient* client)
     : executable_(std::move(executable)),
       device_assignment_(std::move(device_assignment)),
@@ -310,43 +477,89 @@
   return device_ordinals;
 }
 
-StatusOr<LocalShapedBuffer> PyLocalExecutable::Execute(
-    absl::Span<LocalShapedBuffer* const> argument_handles) {
+StatusOr<PyLocalBuffer> PyLocalExecutable::ExecuteHelper(
+    absl::Span<PyLocalBuffer* const> argument_handles, int replica) {
+  const int device_ordinal = device_assignment_(replica, 0);
   tensorflow::profiler::TraceMe traceme("LocalExecutable::Execute");
-  if (num_replicas() != 1) {
-    return InvalidArgument(
-        "Attempted to execute computation with %d replicas using Execute()",
-        num_replicas());
-  }
-  StatusOr<ScopedShapedBuffer> result_buffer_status;
-  const int device_ordinal = device_assignment_(0, 0);
-  VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
-          << device_ordinal;
+  VLOG(3) << "Replica " << replica
+          << " mapped to device ordinal for execution: " << device_ordinal;
 
-  std::vector<const ShapedBuffer*> argument_buffers;
+  absl::flat_hash_set<BufferDefinitionEvent*> events;
+  std::vector<ShapedBuffer> argument_buffers;
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
   argument_buffers.reserve(argument_handles.size());
+  argument_buffer_ptrs.reserve(argument_handles.size());
   for (auto& handle : argument_handles) {
-    argument_buffers.push_back(handle->shaped_buffer());
+    argument_buffers.push_back(handle->AsShapedBuffer());
+    argument_buffer_ptrs.push_back(&argument_buffers.back());
+    GetDeviceBufferDefinitionEvents(*handle->device_buffer(), &events);
+    VLOG(4) << "Argument " << argument_buffers.size() - 1
+            << " buffer: " << argument_buffers.back().ToString();
+  }
+
+  const Device& device = client_->device(device_ordinal);
+  for (BufferDefinitionEvent* event : events) {
+    event->WaitForEventOnStream(device.compute_stream());
   }
 
   ExecutableRunOptions options;
-  options.set_device_ordinal(device_ordinal);
+  options.set_stream(device.compute_stream());
+  options.set_host_to_device_stream(device.host_to_device_stream());
   options.set_allocator(client_->client()->backend().memory_allocator());
   options.set_intra_op_thread_pool(
       client_->client()->backend().eigen_intra_op_thread_pool_device());
   options.set_device_assignment(&device_assignment_);
 
-  result_buffer_status = executable_->Run(argument_buffers, options);
+  StatusOr<ScopedShapedBuffer> result_buffer =
+      executable_->RunAsync(argument_buffer_ptrs, options);
 
-  if (!result_buffer_status.ok()) {
-    return result_buffer_status.status();
+  VLOG(1) << "Replica " << replica << " completed; ok=" << result_buffer.ok();
+  if (!result_buffer.ok()) {
+    LOG(ERROR) << "Execution of replica " << replica
+               << " failed: " << result_buffer.status();
+    return result_buffer.status();
   }
-  return LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie(),
-                           client_);
+
+  std::shared_ptr<BufferDefinitionEvent> definition_event;
+  if (device.use_multiple_streams()) {
+    definition_event = std::make_shared<BufferDefinitionEvent>(
+        device.compute_stream()->parent());
+    definition_event->RecordOnStream(device.compute_stream());
+  }
+  Shape on_host_shape = result_buffer.ValueOrDie().on_host_shape();
+  std::shared_ptr<PySharedDeviceBuffer> out_buffer =
+      PySharedDeviceBuffer::FromScopedShapedBuffer(
+          std::move(result_buffer.ValueOrDie()), definition_event);
+
+  if (device.synchronous_deallocation()) {
+    std::vector<std::shared_ptr<PySharedDeviceBuffer>> buffers;
+    buffers.reserve(argument_handles.size() + 1);
+    for (auto& handle : argument_handles) {
+      buffers.push_back(handle->device_buffer());
+    }
+    buffers.push_back(out_buffer);
+    device.ThenReleaseOnWorkerThread(device.compute_stream(),
+                                     std::move(buffers));
+    device.ThenReleaseOnWorkerThread(device.compute_stream(), executable_);
+  }
+  if (!device.asynchronous()) {
+    device.compute_stream()->BlockHostUntilDone();
+  }
+  return PyLocalBuffer(on_host_shape, std::move(out_buffer), client_);
 }
 
-StatusOr<std::vector<LocalShapedBuffer>> PyLocalExecutable::ExecutePerReplica(
-    absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
+StatusOr<PyLocalBuffer> PyLocalExecutable::Execute(
+    absl::Span<PyLocalBuffer* const> argument_handles) {
+  if (num_replicas() != 1) {
+    return InvalidArgument(
+        "Attempted to execute computation with %d replicas using Execute()",
+        num_replicas());
+  }
+  return ExecuteHelper(argument_handles, /*replica=*/0);
+}
+
+StatusOr<std::vector<PyLocalBuffer>> PyLocalExecutable::ExecutePerReplica(
+    absl::Span<const std::vector<PyLocalBuffer*>> argument_handles) {
   tensorflow::profiler::TraceMe traceme("LocalExecutable::ExecutePerReplica");
   const int num_devices = client_->device_count();
 
@@ -361,61 +574,34 @@
         argument_handles.size(), num_devices);
   }
 
-  VLOG(1) << "Executing with " << num_replicas() << " replicas.";
-
-  auto execute =
-      [this, &argument_handles](int replica) -> StatusOr<ScopedShapedBuffer> {
-    const int device_ordinal = device_assignment_(replica, 0);
-    VLOG(3) << "Replica " << replica
-            << " mapped to device ordinal for execution: " << device_ordinal;
-
-    std::vector<const ShapedBuffer*> argument_buffers;
-    argument_buffers.reserve(argument_handles[replica].size());
-    for (auto& handle : argument_handles[replica]) {
-      argument_buffers.push_back(handle->shaped_buffer());
-    }
-
-    ExecutableRunOptions options;
-    options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client_->client()->backend().memory_allocator());
-    options.set_intra_op_thread_pool(
-        client_->client()->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment_);
-    StatusOr<ScopedShapedBuffer> result_buffer_status =
-        executable_->Run(argument_buffers, options);
-
-    VLOG(1) << "Replica " << replica
-            << " completed; ok=" << result_buffer_status.ok();
-    if (!result_buffer_status.ok()) {
-      LOG(ERROR) << "Execution of replica " << replica
-                 << " failed: " << result_buffer_status.status();
-    }
-    return result_buffer_status;
-  };
-
   VLOG(1) << "Executing replicated computation; num_replicas="
           << num_replicas();
-  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas());
+  std::vector<StatusOr<PyLocalBuffer>> results(num_replicas());
   if (num_replicas() == 1) {
     // Fast-path if there is only one replica — run the computation on the
     // current thread.
-    results[0] = execute(0);
+    results[0] = ExecuteHelper(argument_handles[0], /*replica=*/0);
   } else {
     absl::Mutex mu;
     int running GUARDED_BY(mu) = num_replicas();
     int failed GUARDED_BY(mu) = 0;
+    Status first_failure_status GUARDED_BY(mu);
 
     for (int replica = 0; replica < num_replicas(); ++replica) {
-      client_->execute_pool()->Schedule(
-          [&execute, &mu, &running, &failed, &results, replica] {
-            results[replica] = execute(replica);
+      const int device_ordinal = device_assignment_(replica, 0);
+      const Device& device = client_->device(device_ordinal);
+      device.worker_thread()->Schedule([&, replica] {
+        results[replica] = ExecuteHelper(argument_handles[replica], replica);
 
-            absl::MutexLock lock(&mu);
-            --running;
-            if (!results[replica].ok()) {
-              ++failed;
-            }
-          });
+        absl::MutexLock lock(&mu);
+        --running;
+        if (!results[replica].ok()) {
+          if (failed == 0) {
+            first_failure_status = results[replica].status();
+          }
+          ++failed;
+        }
+      });
     }
 
     auto done_running_or_failed = [&]() {
@@ -437,14 +623,16 @@
                                absl::Seconds(10))) {
         LOG(FATAL)
             << "Replicated computation launch failed, but not all replicas "
-               "terminated. Aborting process to work around deadlock. See the "
-               "error log for details of the failure.";
+               "terminated. Aborting process to work around deadlock. Failure "
+               "message (there may have been multiple failures, see the "
+               "error log for all failures): \n\n"
+            << first_failure_status.error_message();
       }
     }
   }
   VLOG(1) << "Replicated execution complete.";
 
-  std::vector<LocalShapedBuffer> wrapped_results(num_replicas());
+  std::vector<PyLocalBuffer> wrapped_results(num_replicas());
   for (int replica = 0; replica < num_replicas(); ++replica) {
     auto& statusor = results[replica];
     if (!statusor.ok()) {
@@ -455,47 +643,11 @@
               "replicas may have failed as well).",
               replica));
     }
-    wrapped_results[replica] =
-        LocalShapedBuffer(std::move(statusor).ValueOrDie(), client_);
+    wrapped_results[replica] = std::move(statusor.ValueOrDie());
   }
   return wrapped_results;
 }
 
-StatusOr<py::bytes> GetComputationSerializedProto(
-    const XlaComputation& computation) {
-  std::string result;
-  if (!computation.proto().SerializeToString(&result)) {
-    return Unknown("Failed to serialize the HloModuleProto.");
-  }
-  return py::bytes(result);
-}
-
-StatusOr<std::string> GetComputationHloText(const XlaComputation& computation) {
-  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
-                      HloModule::CreateModuleConfigFromProto(
-                          computation.proto(), GetDebugOptionsFromFlags()));
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloModule::CreateFromProto(computation.proto(), module_config));
-  HloPrintOptions options;
-  options = HloPrintOptions::ShortParsable();
-  options.set_print_large_constants(false);
-  return hlo_module->ToString(options);
-}
-
-StatusOr<std::string> GetComputationHloDotGraph(
-    const XlaComputation& computation) {
-  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
-                      HloModule::CreateModuleConfigFromProto(
-                          computation.proto(), GetDebugOptionsFromFlags()));
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloModule::CreateFromProto(computation.proto(), module_config));
-  return RenderGraph(*hlo_module->entry_computation(), /*label=*/"",
-                     hlo_module->config().debug_options(),
-                     RenderedGraphFormat::kDot);
-}
-
 /*static*/ StatusOr<std::unique_ptr<PyLocalExecutable>>
 PyLocalExecutable::Compile(const XlaComputation& computation,
                            std::vector<Shape> argument_layouts,
@@ -523,7 +675,7 @@
 
   for (Shape& layout : argument_layouts) {
     argument_layout_pointers.push_back(&layout);
-    assign_layouts(&layout);
+    TF_RETURN_IF_ERROR(assign_layouts(&layout));
   }
 
   ExecutableBuildOptions options;
@@ -540,7 +692,7 @@
     result_layout = program_shape.result();
     LayoutUtil::ClearLayout(&result_layout);
   }
-  assign_layouts(&result_layout);
+  TF_RETURN_IF_ERROR(assign_layouts(&result_layout));
   options.set_result_layout(result_layout);
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<LocalExecutable> local_executable,
@@ -552,8 +704,8 @@
           options.num_replicas(), /*computation_count=*/1));
 
   return absl::make_unique<PyLocalExecutable>(
-      std::move(local_executable), std::move(device_assignment), client);
+      std::shared_ptr<LocalExecutable>(std::move(local_executable)),
+      std::move(device_assignment), client);
 }
 
-}  // namespace xla_python
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h
index e4cee0c..83cb9bb 100644
--- a/tensorflow/compiler/xla/python/local_client.h
+++ b/tensorflow/compiler/xla/python/local_client.h
@@ -16,6 +16,7 @@
 #ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_
 #define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_
 
+#include <deque>
 #include <string>
 #include <vector>
 
@@ -24,6 +25,8 @@
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/python/shared_device_buffer.h"
+#include "tensorflow/compiler/xla/python/worker_thread.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape.h"
@@ -31,7 +34,6 @@
 #include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
-namespace xla_python {
 
 // Registers a 'fn_capsule' as a CPU custom call target.
 // 'fn_capsule' is a void* pointer encapsulated in a PyCapsule object, with name
@@ -39,69 +41,209 @@
 Status RegisterCpuCustomCallTarget(const std::string& fn_name,
                                    pybind11::capsule capsule);
 
+// Class that manages destruction of Python objects.
+//
+// We must not destroy Python objects without holding the GIL. However, we
+// frequently want to hold references to Python objects for the duration of
+// an asynchronous transfer on a Stream, and release our reference when the
+// transfer completes.
+//
+// This class holds references to Python objects outside a GIL scope, that can
+// be collected later when the GIL is held by calling CollectGarbage().
+class PythonRefManager {
+ public:
+  PythonRefManager() = default;
+
+  // Creates a managed std::shared_ptr to an object. When the shared_ptr is
+  // destroyed, the reference to 'object' will be added to python_garbage_,
+  // and collected next time CollectGarbage() is called.
+  std::shared_ptr<pybind11::object> ManageReference(
+      const pybind11::object& object);
+
+  // Releases the contents of python_garbage_. Requires that the GIL is held.
+  // The client calls this method during API entry points where the GIL is held
+  // to free any garbage that has accumulated.
+  void CollectGarbage();
+
+ private:
+  absl::Mutex mu_;
+  std::deque<pybind11::object> python_garbage_ GUARDED_BY(mu_);
+};
+
+// Class that encapsulates state relating to a device (e.g., a GPU) on which we
+// can perform computation and transfers.
+class Device {
+ public:
+  // If use_multiple_streams is true, we allocate separate streams for compute
+  // and transfers. If it is false, we share a single stream for compute and
+  // transfers. The CPU device does not support multiple streams, and this is
+  // a workaround until it does.
+  //
+  // If synchronous_deallocation is true, the host must not free buffers until
+  // compute/transfers that use those buffers have completed. For example, this
+  // typically is the case for the "platform" where compute/transfers are
+  // operations that take place on another thread.
+  //
+  // If asynchronous is false, the host will synchronize to the device after
+  // each execution or transfer. This is intended for debugging only.
+  Device(se::StreamExecutor* executor, bool use_multiple_streams,
+         bool synchronous_deallocation, bool asynchronous);
+  ~Device();
+
+  bool use_multiple_streams() const { return use_multiple_streams_; }
+  bool synchronous_deallocation() const { return synchronous_deallocation_; }
+  bool asynchronous() const { return asynchronous_; }
+  se::Stream* compute_stream() const { return compute_stream_.get(); }
+  se::Stream* host_to_device_stream() const {
+    return host_to_device_stream_.get();
+  }
+  se::Stream* device_to_host_stream() const {
+    return device_to_host_stream_.get();
+  }
+
+  // A worker thread, used for replicated computation launches and callbacks.
+  WorkerThread* worker_thread() const { return worker_thread_.get(); }
+
+  // Enqueues a host callback on 'stream', to be executed by worker_thread_.
+  // ThenDoHostCallback is often constrained in what it can do, in particular,
+  // on GPU the callback runs on a thread belonging to the GPU runtime and
+  // cannot perform GPU operations itself.
+  void ThenExecuteOnWorkerThread(se::Stream* stream,
+                                 std::function<void()> callback) const;
+
+  // Helper for releasing values from a callback at the tail of a stream.
+  // This is only permitted if object's destructor will not free any device
+  // objects, since the callback may be called from a device thread pool on
+  // GPU.
+  template <typename T>
+  void ThenRelease(se::Stream* stream, std::shared_ptr<T> object) const {
+    if (callback_stream_.get() != stream) {
+      callback_stream_->ThenWaitFor(stream);
+    }
+    callback_stream_->ThenDoHostCallback([object]() { /* releases object */ });
+  }
+
+  // Helpers for releasing values on the worker thread once execution reaches
+  // the tail of a stream.
+  template <typename T>
+  void ThenReleaseOnWorkerThread(se::Stream* stream,
+                                 std::shared_ptr<T> object) const {
+    // We use a non-smart pointer here because we want to ensure that the worker
+    // thread is the only caller of the shared_ptr destructor, and if we passed
+    // object by lambda capture we have a race where the worker thread might
+    // run and release its reference first.
+    auto* ref = new std::shared_ptr<T>(std::move(object));
+    if (callback_stream_.get() != stream) {
+      callback_stream_->ThenWaitFor(stream);
+    }
+    ThenExecuteOnWorkerThread(callback_stream_.get(), [ref]() { delete ref; });
+  }
+  template <typename T>
+  void ThenReleaseOnWorkerThread(se::Stream* stream,
+                                 std::vector<std::shared_ptr<T>> object) const {
+    auto* ref = new std::vector<std::shared_ptr<T>>(std::move(object));
+    if (callback_stream_.get() != stream) {
+      callback_stream_->ThenWaitFor(stream);
+    }
+    ThenExecuteOnWorkerThread(callback_stream_.get(), [ref]() { delete ref; });
+  }
+
+ private:
+  bool use_multiple_streams_;
+  bool synchronous_deallocation_;
+  bool asynchronous_;
+  std::shared_ptr<se::Stream> compute_stream_;
+  std::shared_ptr<se::Stream> host_to_device_stream_;
+  std::shared_ptr<se::Stream> device_to_host_stream_;
+
+  // Callback stream is used for running short host-side callbacks after device
+  // side events, without preventing the device-side stream from doing useful
+  // work.
+  std::shared_ptr<se::Stream> callback_stream_;
+
+  std::unique_ptr<WorkerThread> worker_thread_;
+};
+
+// Encapsulates the state of a Python session with XLA.
 class PyLocalClient {
  public:
   // Initializes a local XLA client for `platform_name`. Returns an error if no
   // such platform exists, or if the platform has no visible devices.
   static StatusOr<std::unique_ptr<PyLocalClient>> Get(
-      const std::string& platform_name);
+      const std::string& platform_name, const std::string& xla_platform_id,
+      bool asynchronous);
 
-  explicit PyLocalClient(LocalClient* client);
+  explicit PyLocalClient(std::string platform_name, LocalClient* client,
+                         bool asynchronous);
 
   Status TransferToInfeed(const LiteralSlice& literal, int device_ordinal);
   StatusOr<pybind11::object> TransferFromOutfeed(const Shape& shape,
                                                  int device_ordinal);
 
   int device_count() const { return client_->device_count(); }
+  const Device& device(int device_ordinal) const {
+    return *devices_.at(device_ordinal);
+  }
   LocalClient* client() const { return client_; }
 
   tensorflow::thread::ThreadPool* h2d_transfer_pool() {
     return &h2d_transfer_pool_;
   }
-  tensorflow::thread::ThreadPool* execute_pool() { return &execute_pool_; }
+
+  PythonRefManager& py_ref_manager() { return py_ref_manager_; }
 
  private:
+  std::string platform_name_;
   LocalClient* client_;
+  std::vector<std::unique_ptr<Device>> devices_;
+
   tensorflow::thread::ThreadPool h2d_transfer_pool_;
-  tensorflow::thread::ThreadPool execute_pool_;
+
+  PythonRefManager py_ref_manager_;
 };
 
-// Represents a reference to literals that live in a device-allocated buffer via
-// XLA. Specifically, wraps a ScopedShapedBuffer produced by transferring a
-// literal to device via the local client.
-class LocalShapedBuffer {
+// Holds a reference from Python to one or more device buffers.
+class PyLocalBuffer {
  public:
-  static StatusOr<LocalShapedBuffer> FromPython(
-      const pybind11::object& argument, PyLocalClient* client,
-      int device_ordinal);
+  static StatusOr<PyLocalBuffer> FromPython(const pybind11::object& argument,
+                                            PyLocalClient* client,
+                                            int device_ordinal);
 
   // Converts multiple (python object, device ordinal) pairs into
-  // LocalShapedBuffers in parallel.
-  static StatusOr<std::vector<LocalShapedBuffer>> FromPythonValues(
+  // PyLocalBuffers in parallel.
+  static StatusOr<std::vector<PyLocalBuffer>> FromPythonValues(
       const std::vector<std::pair<pybind11::object, int>>& argument,
       PyLocalClient* client);
 
-  LocalShapedBuffer() = default;
-  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer, PyLocalClient* client);
-  StatusOr<pybind11::object> ToPython() const;
-  const Shape& shape() const;
-  const ScopedShapedBuffer* shaped_buffer() const;
+  static StatusOr<PyLocalBuffer> MakeTuple(
+      const std::vector<PyLocalBuffer> buffers, PyLocalClient* client,
+      int device_ordinal);
 
-  // Transfers ownership of the encapsulated ShapedBuffer to the caller,
-  // analogous to std::unique_ptr::release().
-  ScopedShapedBuffer Release();
+  PyLocalBuffer() = default;
+  PyLocalBuffer(Shape on_host_shape,
+                std::shared_ptr<PySharedDeviceBuffer> device_buffer,
+                PyLocalClient* client);
+  StatusOr<pybind11::object> ToPython() const;
+  const Shape& on_host_shape() const { return on_host_shape_; }
+  const std::shared_ptr<PySharedDeviceBuffer>& device_buffer() const {
+    return device_buffer_;
+  }
 
   void Delete() {
-    shaped_buffer_ = absl::nullopt;
+    device_buffer_ = nullptr;
     client_ = nullptr;
   }
 
-  // Destructures a tuple-valued LocalShapedBuffer into its constituent
-  // elements in LocalShapedBufferTuple form.
-  StatusOr<std::vector<LocalShapedBuffer>> DestructureTuple();
+  // Returns a view of the PyLocalBuffer DAG as a ShapedBuffer. The
+  // PyLocalBuffer retains ownership of the device buffers.
+  ShapedBuffer AsShapedBuffer() const;
+
+  // Destructures a tuple-valued PyLocalBuffer into its constituent elements.
+  StatusOr<std::vector<PyLocalBuffer>> DestructureTuple();
 
  private:
-  absl::optional<ScopedShapedBuffer> shaped_buffer_;
+  Shape on_host_shape_;
+  std::shared_ptr<PySharedDeviceBuffer> device_buffer_;
   PyLocalClient* client_ = nullptr;
 };
 
@@ -114,7 +256,7 @@
       const XlaComputation& computation, std::vector<Shape> argument_layouts,
       const ExecutableBuildOptions* build_options, PyLocalClient* client);
 
-  PyLocalExecutable(std::unique_ptr<LocalExecutable> executable,
+  PyLocalExecutable(std::shared_ptr<LocalExecutable> executable,
                     DeviceAssignment device_assignment, PyLocalClient* client);
 
   int num_replicas() const {
@@ -128,35 +270,26 @@
     return device_assignment_;
   }
 
-  StatusOr<LocalShapedBuffer> Execute(
-      absl::Span<LocalShapedBuffer* const> argument_handles);
+  StatusOr<PyLocalBuffer> Execute(
+      absl::Span<PyLocalBuffer* const> argument_handles);
 
   // Execute on many replicas. Takes a sequence of argument lists (one argument
   // list per replica) and returns a tuple of results (one result per replica).
   // The number of argument lists must be equal to the replica count.
-  StatusOr<std::vector<LocalShapedBuffer>> ExecutePerReplica(
-      absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles);
+  StatusOr<std::vector<PyLocalBuffer>> ExecutePerReplica(
+      absl::Span<const std::vector<PyLocalBuffer*>> argument_handles);
 
   void Delete() { executable_ = nullptr; }
 
  private:
-  std::unique_ptr<LocalExecutable> executable_;
+  StatusOr<PyLocalBuffer> ExecuteHelper(
+      absl::Span<PyLocalBuffer* const> argument_handles, int replica);
+
+  std::shared_ptr<LocalExecutable> executable_;
   const DeviceAssignment device_assignment_;
   PyLocalClient* const client_;
 };
 
-// Converts a computation to a serialized HloModuleProto
-StatusOr<pybind11::bytes> GetComputationSerializedProto(
-    const XlaComputation& computation);
-
-// Converts a computation to textual HLO form.
-StatusOr<std::string> GetComputationHloText(const XlaComputation& computation);
-
-// Converts a computation to HLO dot graph form.
-StatusOr<std::string> GetComputationHloDotGraph(
-    const XlaComputation& computation);
-
-}  // namespace xla_python
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_CLIENT_H_
diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/python/shared_device_buffer.cc
new file mode 100644
index 0000000..f9fbd9e
--- /dev/null
+++ b/tensorflow/compiler/xla/python/shared_device_buffer.cc
@@ -0,0 +1,184 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/shared_device_buffer.h"
+
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+
+namespace xla {
+
+BufferDefinitionEvent::BufferDefinitionEvent(se::StreamExecutor* executor)
+    : event_(executor) {}
+
+void BufferDefinitionEvent::RecordOnStream(se::Stream* stream) {
+  absl::MutexLock lock(&mu_);
+  CHECK(streams_defined_on_.empty());  // No prior record or wait is allowed.
+  stream->ThenRecordEvent(&event_);
+  streams_defined_on_.push_back(stream);  // 'stream' itself never has to wait.
+}
+
+void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) {
+  absl::MutexLock lock(&mu_);
+
+  // Only a handful of streams (typically one or two) are ever recorded here,
+  // so scanning the list linearly is cheap.
+  const auto known_end = streams_defined_on_.end();
+  const bool already_defined =
+      std::find(streams_defined_on_.begin(), known_end, stream) != known_end;
+  if (!already_defined) {
+    // First wait from this stream: enqueue it and remember the stream.
+    stream->ThenWaitFor(&event_);
+    streams_defined_on_.push_back(stream);
+  }
+}
+
+static std::shared_ptr<PySharedDeviceBuffer>
+BufferFromScopedShapedBufferIterator(
+    const Shape& on_device_shape, int device_ordinal,
+    se::DeviceMemoryAllocator* allocator,
+    ShapeTree<se::DeviceMemoryBase>::iterator* iterator,
+    const ShapeTree<se::DeviceMemoryBase>::iterator& end,
+    const std::shared_ptr<BufferDefinitionEvent>& definition_event) {
+  CHECK(*iterator != end);
+  // Take ownership of this node's memory and clear the source tree's entry.
+  se::OwningDeviceMemory device_memory((*iterator)->second, device_ordinal,
+                                       allocator);
+  (*iterator)->second = se::DeviceMemoryBase();
+  ++*iterator;
+  std::vector<std::shared_ptr<PySharedDeviceBuffer>> children;
+  if (on_device_shape.IsTuple()) {  // Recurse once per tuple element.
+    int num_children = ShapeUtil::TupleElementCount(on_device_shape);
+    children.reserve(num_children);
+    for (int i = 0; i < num_children; ++i) {
+      children.push_back(BufferFromScopedShapedBufferIterator(
+          on_device_shape.tuple_shapes(i), device_ordinal, allocator, iterator,
+          end, definition_event));
+    }
+  }
+  return std::make_shared<PySharedDeviceBuffer>(
+      on_device_shape, std::move(device_memory), std::move(children),
+      definition_event);  // 'children' is moved, not copied, into the node.
+}
+
+/* static */ std::shared_ptr<PySharedDeviceBuffer>
+PySharedDeviceBuffer::FromScopedShapedBuffer(
+    ScopedShapedBuffer shaped_buffer,
+    const std::shared_ptr<BufferDefinitionEvent>& definition_event) {
+  ShapeTree<se::DeviceMemoryBase>::iterator iterator =
+      shaped_buffer.buffers().begin();
+  std::shared_ptr<PySharedDeviceBuffer> output =
+      BufferFromScopedShapedBufferIterator(
+          shaped_buffer.on_device_shape(), shaped_buffer.device_ordinal(),
+          shaped_buffer.memory_allocator(), &iterator,
+          shaped_buffer.buffers().end(), definition_event);
+  CHECK(iterator == shaped_buffer.buffers().end());  // Whole tree was consumed.
+  return output;
+}
+
+/* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
+PySharedDeviceBuffer::MakeTuple(
+    std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
+    TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
+    int device_ordinal,
+    std::shared_ptr<BufferDefinitionEvent> definition_event) {
+  std::vector<Shape> element_shapes;
+  element_shapes.reserve(children.size());
+  for (const auto& child : children) {
+    TF_RET_CHECK(child->device_memory().device_ordinal() == device_ordinal);
+    element_shapes.push_back(child->on_device_shape());
+  }
+
+  // Allocate memory for the tuple node itself; the children are reused as-is.
+  Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes);
+  const int64 size = transfer_manager->GetByteSizeRequirement(tuple_shape);
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory tuple_memory,
+                      allocator->Allocate(device_ordinal, size));
+  return std::make_shared<PySharedDeviceBuffer>(
+      std::move(tuple_shape), std::move(tuple_memory), std::move(children),
+      std::move(definition_event));
+}
+
+/* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
+PySharedDeviceBuffer::MakeArray(
+    Shape on_device_shape, TransferManager* transfer_manager,
+    se::DeviceMemoryAllocator* allocator, int device_ordinal,
+    std::shared_ptr<BufferDefinitionEvent> definition_event) {
+  // Size the allocation for the on-device shape; nothing is written to it.
+  const int64 size = transfer_manager->GetByteSizeRequirement(on_device_shape);
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory array_memory,
+                      allocator->Allocate(device_ordinal, size));
+  // An array node is created with an empty child list.
+  std::vector<std::shared_ptr<PySharedDeviceBuffer>> no_children;
+  return std::make_shared<PySharedDeviceBuffer>(
+      std::move(on_device_shape), std::move(array_memory),
+      std::move(no_children), std::move(definition_event));
+}
+
+// Populates ShapeTree entries, starting at *iterator, from a buffer tree.
+static void PopulateShapedBufferFromBuffer(
+    const PySharedDeviceBuffer& buffer,
+    ShapeTree<se::DeviceMemoryBase>::iterator* iterator,
+    const ShapeTree<se::DeviceMemoryBase>::iterator& end) {
+  CHECK(*iterator != end);
+  (*iterator)->second = buffer.device_memory().AsDeviceMemoryBase();
+  ++*iterator;
+  for (const auto& child : buffer.children()) {
+    PopulateShapedBufferFromBuffer(*child, iterator, end);
+  }
+}
+
+ShapedBuffer PySharedDeviceBuffer::AsShapedBuffer(
+    const Shape& on_host_shape) const {
+  ShapedBuffer shaped_buffer(on_host_shape, on_device_shape_,
+                             device_memory_.allocator()->platform(),
+                             device_memory_.device_ordinal());
+  ShapeTree<se::DeviceMemoryBase>::iterator iterator =
+      shaped_buffer.buffers().begin();
+  PopulateShapedBufferFromBuffer(*this, &iterator,
+                                 shaped_buffer.buffers().end());
+  CHECK(iterator == shaped_buffer.buffers().end());  // Node counts must match.
+  return shaped_buffer;
+}
+
+PySharedDeviceBuffer::PySharedDeviceBuffer(
+    Shape on_device_shape, se::OwningDeviceMemory device_memory,
+    std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
+    std::shared_ptr<BufferDefinitionEvent> definition_event)
+    : on_device_shape_(std::move(on_device_shape)),
+      device_memory_(std::move(device_memory)),
+      children_(std::move(children)),
+      definition_event_(std::move(definition_event)) {}
+
+void GetDeviceBufferDefinitionEvents(
+    const PySharedDeviceBuffer& buffer,
+    absl::flat_hash_set<BufferDefinitionEvent*>* events) {
+  // A node without a definition event adds nothing; the set drops duplicates.
+  BufferDefinitionEvent* own_event = buffer.definition_event().get();
+  if (own_event != nullptr) events->insert(own_event);
+  for (const auto& node : buffer.children()) {
+    GetDeviceBufferDefinitionEvents(*node, events);
+  }
+}
+
+void WaitForBufferDefinitionEventsOnStream(const PySharedDeviceBuffer& buffer,
+                                           se::Stream* stream) {
+  // Collect the distinct events of the whole DAG, then wait on each of them.
+  absl::flat_hash_set<BufferDefinitionEvent*> pending;
+  GetDeviceBufferDefinitionEvents(buffer, &pending);
+  for (BufferDefinitionEvent* definition : pending)
+    definition->WaitForEventOnStream(stream);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/python/shared_device_buffer.h
new file mode 100644
index 0000000..6a57d7f
--- /dev/null
+++ b/tensorflow/compiler/xla/python/shared_device_buffer.h
@@ -0,0 +1,155 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
+
+namespace xla {
+
+// A BufferDefinitionEvent describes whether a buffer is valid from the
+// viewpoint of each stream that may access it.
+//
+// Each logical buffer in an XLA computation may be defined (i.e., written to)
+// at most once, although the same physical piece of memory may be reused for
+// multiple logical buffers. We call the operation that writes the buffer's
+// value on some stream (e.g., a transfer or compute kernel) the buffer's
+// definition event.
+//
+// After the operation that populates the value of a buffer has been enqueued on
+// 'stream', RecordOnStream(stream) should also be called to trigger the
+// definition event after the operation has completed.
+//
+// Since different streams are not necessarily synchronized with one another,
+// if we wish to consume the value of the buffer on a different stream, we
+// should first call WaitForEventOnStream(stream), which adds a cross-stream
+// dependency from 'stream' to the buffer's definition event, causing 'stream'
+// to pause until the definition event has been triggered, if needed. Operations
+// on 'stream' may then assume that the buffer is valid and its contents
+// correspond to the desired buffer.
+//
+// The dependency logic caches the set of streams at the tail of which the
+// definition event is known to have occurred; waiting for the same event on the
+// same stream causes no additional waiting.
+class BufferDefinitionEvent {
+ public:
+  // Creates a new definition event whose event has not yet been triggered.
+  explicit BufferDefinitionEvent(se::StreamExecutor* executor);
+
+  // Records the definition event on the tail of 'stream'.
+  void RecordOnStream(se::Stream* stream);
+
+  // Adds synchronization events to 'stream' that wait for this event to be
+  // defined on 'stream'. Does nothing if the event is already known to have
+  // occurred by the tail of 'stream'.
+  void WaitForEventOnStream(se::Stream* stream);
+
+ private:
+  // An event that is triggered when the content of one or more buffers is
+  // ready. The event is stored by value, so it is never null; it is recorded
+  // on a stream by RecordOnStream().
+  se::Event event_;
+
+  absl::Mutex mu_;  // Guards streams_defined_on_.
+
+  // A list of all streams for which the buffer's content is known to be defined
+  // at the tail of the queue, i.e., for any newly enqueued command.
+  absl::InlinedVector<se::Stream*, 2> streams_defined_on_ GUARDED_BY(mu_);
+};
+
+// Class that represents a node in a reference-counted DAG of device buffers.
+// Unlike a ShapedBuffer, which owns none of its buffers, and
+// ScopedShapedBuffer, which owns an entire buffer tree, the reference counting
+// in a PySharedDeviceBuffer DAG is done at the level of individual device
+// buffers. Reference counting buffers individually is more convenient when
+// manipulating on-device tuples where a tuple and its elements may have
+// different lifetimes.
+class PySharedDeviceBuffer {
+ public:
+  // Converts a ScopedShapedBuffer into a Buffer tree. Takes ownership of the
+  // contents of the shaped_buffer.
+  static std::shared_ptr<PySharedDeviceBuffer> FromScopedShapedBuffer(
+      ScopedShapedBuffer shaped_buffer,
+      const std::shared_ptr<BufferDefinitionEvent>& definition_event);
+
+  // Makes a tuple buffer. Does not initialize the tuple table.
+  static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeTuple(
+      std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
+      TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
+      int device_ordinal,
+      std::shared_ptr<BufferDefinitionEvent> definition_event);
+
+  // Makes an uninitialized array buffer.
+  static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeArray(
+      Shape on_device_shape, TransferManager* transfer_manager,
+      se::DeviceMemoryAllocator* allocator, int device_ordinal,
+      std::shared_ptr<BufferDefinitionEvent> definition_event);
+
+  // Builds a ShapedBuffer view onto the buffers of this DAG node. Since
+  // PySharedDeviceBuffer does not maintain the on-host shape, the caller must
+  // provide it. We require but do not verify that
+  // TransferManager::HostShapeToDeviceShape(on_host_shape) == on_device_shape()
+  ShapedBuffer AsShapedBuffer(const Shape& on_host_shape) const;
+
+  const Shape& on_device_shape() const { return on_device_shape_; }
+  const std::vector<std::shared_ptr<PySharedDeviceBuffer>>& children() const {
+    return children_;
+  }
+  const se::OwningDeviceMemory& device_memory() const { return device_memory_; }
+  int device_ordinal() const { return device_memory_.device_ordinal(); }
+  const std::shared_ptr<BufferDefinitionEvent>& definition_event() const {
+    return definition_event_;  // May be nullptr; returned without a copy.
+  }
+
+  PySharedDeviceBuffer() = default;
+  PySharedDeviceBuffer(
+      Shape on_device_shape, se::OwningDeviceMemory device_memory,
+      std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
+      std::shared_ptr<BufferDefinitionEvent> definition_event);
+
+ private:
+  // We only represent the on-device shape. The on-host shape may not be
+  // one-to-one with the tree of device buffers, so to avoid representational
+  // awkwardness we maintain on-host shapes separately.
+  Shape on_device_shape_;
+  se::OwningDeviceMemory device_memory_;
+  std::vector<std::shared_ptr<PySharedDeviceBuffer>> children_;
+
+  // An event that is triggered when the content of one or more buffers is
+  // ready during multistream execution. May be nullptr, which is used in the
+  // single-stream execution case where events are not necessary for buffer
+  // event sequencing.
+  std::shared_ptr<BufferDefinitionEvent> definition_event_;
+};
+
+// Populates 'events' with the set of buffer definition events for all buffers
+// in the buffer DAG rooted at 'buffer'.
+void GetDeviceBufferDefinitionEvents(
+    const PySharedDeviceBuffer& buffer,
+    absl::flat_hash_set<BufferDefinitionEvent*>* events);
+
+// Waits for all of the buffer definition events in a buffer DAG on 'stream'.
+void WaitForBufferDefinitionEventsOnStream(const PySharedDeviceBuffer& buffer,
+                                           se::Stream* stream);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_
diff --git a/tensorflow/compiler/xla/python/shared_device_buffer_test.cc b/tensorflow/compiler/xla/python/shared_device_buffer_test.cc
new file mode 100644
index 0000000..28151f9
--- /dev/null
+++ b/tensorflow/compiler/xla/python/shared_device_buffer_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/shared_device_buffer.h"
+
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+
+namespace xla {
+namespace {
+
+TEST(PySharedDeviceBufferTest, MakeArray) {
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+
+  Shape shape = ShapeUtil::MakeShape(F32, {3, 101, 4});
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto buffer, PySharedDeviceBuffer::MakeArray(
+                       shape, client->backend().transfer_manager(),
+                       client->backend().memory_allocator(), 0, nullptr));
+  EXPECT_EQ(
+      buffer->on_device_shape(),
+      client->backend().transfer_manager()->HostShapeToDeviceShape(shape));
+  EXPECT_EQ(buffer->children().size(), 0);
+  EXPECT_EQ(buffer->device_memory().device_ordinal(), 0);
+  EXPECT_EQ(buffer->device_memory().allocator(),
+            client->backend().memory_allocator());
+  EXPECT_FALSE(buffer->device_memory().is_null());
+}
+
+TEST(PySharedDeviceBufferTest, MakeTuple) {
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+
+  Shape a_shape = ShapeUtil::MakeShape(F32, {3, 101, 4});
+  Shape b_shape = ShapeUtil::MakeShape(S8, {77});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({a_shape, b_shape});
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto a_buffer, PySharedDeviceBuffer::MakeArray(
+                         a_shape, client->backend().transfer_manager(),
+                         client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto b_buffer, PySharedDeviceBuffer::MakeArray(
+                         b_shape, client->backend().transfer_manager(),
+                         client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto tuple_buffer,
+      PySharedDeviceBuffer::MakeTuple(
+          {a_buffer, b_buffer}, client->backend().transfer_manager(),
+          client->backend().memory_allocator(), 0, nullptr));
+  EXPECT_EQ(tuple_buffer->on_device_shape(),
+            client->backend().transfer_manager()->HostShapeToDeviceShape(
+                tuple_shape));
+  ASSERT_EQ(tuple_buffer->children().size(), 2);
+  EXPECT_EQ(tuple_buffer->children()[0], a_buffer);
+  EXPECT_EQ(tuple_buffer->children()[1], b_buffer);
+  EXPECT_EQ(tuple_buffer->device_memory().device_ordinal(), 0);
+  EXPECT_EQ(tuple_buffer->device_memory().allocator(),
+            client->backend().memory_allocator());
+  EXPECT_FALSE(tuple_buffer->device_memory().is_null());
+}
+
+TEST(PySharedDeviceBufferTest, AsShapedBuffer) {
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+
+  Shape a_shape = ShapeUtil::MakeShape(F32, {3, 101, 4});
+  Shape b_shape = ShapeUtil::MakeShape(S8, {77});
+  Shape ab_tuple_shape = ShapeUtil::MakeTupleShape({a_shape, b_shape});
+  Shape c_shape = ShapeUtil::MakeShape(S64, {});
+  Shape abc_tuple_shape = ShapeUtil::MakeTupleShape({c_shape, ab_tuple_shape});
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto a_buffer, PySharedDeviceBuffer::MakeArray(
+                         a_shape, client->backend().transfer_manager(),
+                         client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto b_buffer, PySharedDeviceBuffer::MakeArray(
+                         b_shape, client->backend().transfer_manager(),
+                         client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto ab_tuple_buffer,
+      PySharedDeviceBuffer::MakeTuple(
+          {a_buffer, b_buffer}, client->backend().transfer_manager(),
+          client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto c_buffer, PySharedDeviceBuffer::MakeArray(
+                         c_shape, client->backend().transfer_manager(),
+                         client->backend().memory_allocator(), 0, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto abc_tuple_buffer,
+      PySharedDeviceBuffer::MakeTuple(
+          {c_buffer, ab_tuple_buffer}, client->backend().transfer_manager(),
+          client->backend().memory_allocator(), 0, nullptr));
+  EXPECT_EQ(abc_tuple_buffer->on_device_shape(),
+            client->backend().transfer_manager()->HostShapeToDeviceShape(
+                abc_tuple_shape));
+
+  ShapedBuffer shaped_buffer =
+      abc_tuple_buffer->AsShapedBuffer(abc_tuple_shape);
+  EXPECT_EQ(shaped_buffer.on_host_shape(), abc_tuple_shape);
+  EXPECT_EQ(shaped_buffer.on_device_shape(),
+            abc_tuple_buffer->on_device_shape());
+
+  std::vector<se::DeviceMemoryBase> expected_buffer_sequence = {
+      abc_tuple_buffer->device_memory().AsDeviceMemoryBase(),  // root tuple
+      c_buffer->device_memory().AsDeviceMemoryBase(),          // element {0}
+      ab_tuple_buffer->device_memory().AsDeviceMemoryBase(),   // element {1}
+      a_buffer->device_memory().AsDeviceMemoryBase(),          // element {1, 0}
+      b_buffer->device_memory().AsDeviceMemoryBase(),          // element {1, 1}
+  };
+  auto it = shaped_buffer.buffers().begin();
+  auto expected_it = expected_buffer_sequence.begin();
+  while (it != shaped_buffer.buffers().end()) {
+    ASSERT_TRUE(expected_it != expected_buffer_sequence.end());
+    EXPECT_TRUE(expected_it->IsSameAs(it->second));
+    ++it;
+    ++expected_it;
+  }
+  EXPECT_TRUE(expected_it == expected_buffer_sequence.end());
+}
+
+TEST(PySharedDeviceBufferTest, FromScopedShapedBuffer) {
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+
+  Literal literal = LiteralUtil::MakeTupleOwned(
+      LiteralUtil::CreateFullWithDescendingLayout<float>({10, 3, 7}, 33.4f),
+      LiteralUtil::One(S64));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ScopedShapedBuffer shaped_buffer,
+      client->LiteralToShapedBuffer(literal, /*device_ordinal=*/0));
+  std::shared_ptr<PySharedDeviceBuffer> device_buffer =
+      PySharedDeviceBuffer::FromScopedShapedBuffer(std::move(shaped_buffer),
+                                                   nullptr);
+
+  EXPECT_EQ(device_buffer->on_device_shape(),
+            client->backend().transfer_manager()->HostShapeToDeviceShape(
+                literal.shape()));
+  ASSERT_EQ(device_buffer->children().size(), 2);
+  EXPECT_EQ(device_buffer->children()[0]->on_device_shape(),
+            client->backend().transfer_manager()->HostShapeToDeviceShape(
+                ShapeUtil::MakeShape(F32, {10, 3, 7})));
+  EXPECT_EQ(device_buffer->children()[1]->on_device_shape(),
+            client->backend().transfer_manager()->HostShapeToDeviceShape(
+                ShapeUtil::MakeShape(S64, {})));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/types.cc b/tensorflow/compiler/xla/python/types.cc
index 2d0eb8a..2e76b89 100644
--- a/tensorflow/compiler/xla/python/types.cc
+++ b/tensorflow/compiler/xla/python/types.cc
@@ -16,15 +16,14 @@
 #include "tensorflow/compiler/xla/python/types.h"
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
 namespace py = pybind11;
 
-xla::StatusOr<PrimitiveType> NumpyTypeToPrimitiveType(
-    const py::dtype& np_type) {
+xla::StatusOr<PrimitiveType> DtypeToPrimitiveType(const py::dtype& np_type) {
   static auto* types =
       new absl::flat_hash_map<std::pair<char, int>, PrimitiveType>({
           {{'b', 1}, PRED},
@@ -50,6 +49,42 @@
   return it->second;
 }
 
+xla::StatusOr<py::dtype> PrimitiveTypeToDtype(PrimitiveType type) {
+  switch (type) {
+    case PRED:
+      return py::dtype::of<bool>();
+    case S8:
+      return py::dtype::of<int8>();
+    case S16:
+      return py::dtype::of<int16>();
+    case S32:
+      return py::dtype::of<int32>();
+    case S64:
+      return py::dtype::of<int64>();
+    case U8:
+      return py::dtype::of<uint8>();
+    case U16:
+      return py::dtype::of<uint16>();
+    case U32:
+      return py::dtype::of<uint32>();
+    case U64:
+      return py::dtype::of<uint64>();
+    case F16:
+      return py::dtype("e");  // NumPy type code 'e' denotes float16.
+    case F32:
+      return py::dtype::of<float>();
+    case F64:
+      return py::dtype::of<double>();
+    case C64:
+      return py::dtype::of<std::complex<float>>();
+    case C128:
+      return py::dtype::of<std::complex<double>>();
+    default:  // Any other primitive type has no dtype mapping here.
+      return Unimplemented("Unimplemented primitive type %s",
+                           PrimitiveType_Name(type));
+  }
+}
+
 // Returns a numpy-style format descriptor string for `type`.
 StatusOr<std::string> FormatDescriptorForPrimitiveType(PrimitiveType type) {
   switch (type) {
@@ -159,4 +194,20 @@
   return tree;
 }
 
+py::tuple IntSpanToTuple(absl::Span<int64 const> xs) {
+  py::tuple out(xs.size());  // Pre-sized; every slot is assigned below.
+  for (size_t i = 0; i < xs.size(); ++i) {  // size_t matches Span::size().
+    out[i] = py::int_(xs[i]);
+  }
+  return out;
+}
+
+std::vector<int64> IntSequenceToVector(const py::object& sequence) {
+  // Iterate the Python object and convert each element to int64 in order.
+  std::vector<int64> values;
+  for (py::handle element : sequence)
+    values.push_back(py::cast<int64>(element));
+  return values;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/types.h b/tensorflow/compiler/xla/python/types.h
index 7a46bb7..c2be8c6 100644
--- a/tensorflow/compiler/xla/python/types.h
+++ b/tensorflow/compiler/xla/python/types.h
@@ -32,29 +32,48 @@
 
 namespace xla {
 
-// Converts a pybind11-style NumPy dtype to a PrimitiveType.
-StatusOr<PrimitiveType> NumpyTypeToPrimitiveType(
-    const pybind11::dtype& np_type);
+// Helper that converts a failing StatusOr to an exception.
+// For use only inside pybind11 code.
+template <typename T>
+T ValueOrThrow(StatusOr<T> v) {
+  if (!v.ok()) {
+    throw std::runtime_error(v.status().ToString());
+  }
+  return v.ConsumeValueOrDie();
+}
+
+// Converts a NumPy dtype to a PrimitiveType.
+StatusOr<PrimitiveType> DtypeToPrimitiveType(const pybind11::dtype& np_type);
+
+// Converts a PrimitiveType to a NumPy dtype.
+StatusOr<pybind11::dtype> PrimitiveTypeToDtype(PrimitiveType type);
 
 // Converts a literal to (possibly-nested tuples of) NumPy arrays.
 // The literal's leaf arrays are not copied; instead the NumPy arrays share
 // buffers with the literals. Takes ownership of `literal` and keeps the
 // necessary pieces alive using Python reference counting.
 // Requires the GIL.
-StatusOr<pybind11::object> LiteralToPython(
-    std::unique_ptr<xla::Literal> literal);
+StatusOr<pybind11::object> LiteralToPython(std::unique_ptr<Literal> literal);
 
 // Converts a Python object into an XLA shape and a vector of leaf buffers.
 // The leaf buffers correspond to a depth-first, left-to-right traversal of
 // the Python value.
 // Requires the GIL.
 struct PythonBufferTree {
-  absl::InlinedVector<xla::BorrowingLiteral, 1> leaves;
-  xla::Shape shape;
+  absl::InlinedVector<BorrowingLiteral, 1> leaves;
+  Shape shape;
 };
 StatusOr<PythonBufferTree> GetPythonBufferTree(
     const pybind11::object& argument);
 
+// Converts a sequence of int64s to a Python tuple of ints.
+// Pybind11 by default converts a std::vector<int64> to a Python list; for
+// shapes we frequently want a tuple instead.
+pybind11::tuple IntSpanToTuple(absl::Span<int64 const> xs);
+
+// Converts a Python sequence of integers to a std::vector<int64>.
+std::vector<int64> IntSequenceToVector(const pybind11::object& sequence);
+
 }  // namespace xla
 
 // This namespace is a documented pybind11 extension point.
@@ -64,12 +83,16 @@
 namespace pybind11 {
 namespace detail {
 
+// When absl::optional is an alias for std::optional, the type_caster
+// specializations are provided by pybind11.
+#ifndef ABSL_HAVE_STD_OPTIONAL
 // absl::optional
 template <typename T>
 struct type_caster<absl::optional<T>> : optional_caster<absl::optional<T>> {};
 
 template <>
 struct type_caster<absl::nullopt_t> : public void_caster<absl::nullopt_t> {};
+#endif
 
 // absl::Span
 template <typename T>
@@ -157,7 +180,7 @@
     for (int i = 0; i < array.ndim(); ++i) {
       dims[i] = array.shape(i);
     }
-    auto type = xla::NumpyTypeToPrimitiveType(array.dtype());
+    auto type = xla::DtypeToPrimitiveType(array.dtype());
     if (!type.ok()) {
       throw std::runtime_error(type.status().ToString());
     }
diff --git a/tensorflow/compiler/xla/python/worker_thread.cc b/tensorflow/compiler/xla/python/worker_thread.cc
new file mode 100644
index 0000000..d3fb020
--- /dev/null
+++ b/tensorflow/compiler/xla/python/worker_thread.cc
@@ -0,0 +1,54 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/worker_thread.h"
+
+namespace xla {
+
+WorkerThread::WorkerThread(tensorflow::Env* env, const std::string& name) {
+  auto run_loop = [this]() { WorkLoop(); };  // Body of the worker thread.
+  thread_.reset(env->StartThread(tensorflow::ThreadOptions(), name, run_loop));
+}
+
+WorkerThread::~WorkerThread() {
+  absl::MutexLock lock(&mu_);
+  work_queue_.push(nullptr);  // Null closure: tells WorkLoop() to return.
+}
+
+void WorkerThread::Schedule(std::function<void()> fn) {
+  CHECK(fn != nullptr);  // A null closure is reserved as the stop sentinel.
+  absl::MutexLock queue_lock(&mu_);
+  work_queue_.emplace(std::move(fn));
+}
+
+bool WorkerThread::WorkAvailable() { return work_queue_.size() > 0; }
+
+void WorkerThread::WorkLoop() {
+  for (;;) {
+    std::function<void()> task;
+    {
+      // Block until the queue is non-empty, then dequeue the oldest closure.
+      absl::MutexLock lock(&mu_);
+      mu_.Await(absl::Condition(this, &WorkerThread::WorkAvailable));
+      task = std::move(work_queue_.front());
+      work_queue_.pop();
+    }
+    // A null closure is the stop sentinel; anything else runs unlocked.
+    if (task == nullptr) return;
+    task();
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/worker_thread.h b/tensorflow/compiler/xla/python/worker_thread.h
new file mode 100644
index 0000000..bc7dd39
--- /dev/null
+++ b/tensorflow/compiler/xla/python/worker_thread.h
@@ -0,0 +1,54 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_
+
+#include <functional>
+#include <memory>
+#include <queue>
+#include <string>
+
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace xla {
+
+// A worker thread that runs a sequence of closures. Equivalent to a thread
+// pool of size 1.
+class WorkerThread {
+ public:
+  // Starts a thread on 'env'; 'name' names the thread for debugging purposes.
+  WorkerThread(tensorflow::Env* env, const std::string& name);
+
+  // Blocks until all enqueued closures have completed.
+  ~WorkerThread();
+
+  // Enqueues 'fn' (must be non-null) to be run by the worker thread.
+  void Schedule(std::function<void()> fn);
+
+ private:
+  bool WorkAvailable() EXCLUSIVE_LOCKS_REQUIRED(mu_);  // Queue non-empty?
+  void WorkLoop();  // Thread body: runs closures until a null one is dequeued.
+
+  absl::Mutex mu_;
+  std::queue<std::function<void()>> work_queue_ GUARDED_BY(mu_);
+  // Declared last so it is destroyed first, before mu_ and work_queue_.
+  std::unique_ptr<tensorflow::Thread> thread_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_WORKER_THREAD_H_
diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc
index de5c2e1..edc3b1f 100644
--- a/tensorflow/compiler/xla/python/xla.cc
+++ b/tensorflow/compiler/xla/python/xla.cc
@@ -13,11 +13,18 @@
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
+#include "absl/hash/hash.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "include/pybind11/numpy.h"
 #include "include/pybind11/pybind11.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/qr.h"
 #include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
@@ -28,17 +35,22 @@
 #include "tensorflow/compiler/xla/python/local_client.h"
 #include "tensorflow/compiler/xla/python/types.h"
 #include "tensorflow/compiler/xla/python/xrt.h"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
-namespace xla_python {
 
 namespace py = pybind11;
 
+namespace {
+
 struct Uniquer {
   absl::Mutex mu;
   NameUniquer name_uniquer GUARDED_BY(mu);
@@ -55,6 +67,46 @@
   return uniquer->name_uniquer.GetUniqueName(name);
 }
 
+// Converts a computation to a serialized HloModuleProto, returned to Python
+// as bytes; reports an Unknown error if proto serialization fails.
+StatusOr<py::bytes> GetComputationSerializedProto(
+    const XlaComputation& computation) {
+  std::string serialized;
+  const bool ok = computation.proto().SerializeToString(&serialized);
+  if (!ok) return Unknown("Failed to serialize the HloModuleProto.");
+  return py::bytes(serialized);
+}
+
+// Builds an HloModule from a computation's HloModuleProto.
+StatusOr<std::unique_ptr<HloModule>> GetComputationHloModule(
+    const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation.proto(), GetDebugOptionsFromFlags()));
+  return HloModule::CreateFromProto(computation.proto(), module_config);
+}
+
+// Converts a computation to textual HLO form.
+StatusOr<std::string> GetComputationHloText(const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                      GetComputationHloModule(computation));
+  HloPrintOptions options = HloPrintOptions::ShortParsable();
+  options.set_print_large_constants(false);
+  return hlo_module->ToString(options);
+}
+
+// Converts a computation to HLO dot graph form.
+StatusOr<std::string> GetComputationHloDotGraph(
+    const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                      GetComputationHloModule(computation));
+  return RenderGraph(*hlo_module->entry_computation(), /*label=*/"",
+                     hlo_module->config().debug_options(),
+                     RenderedGraphFormat::kDot);
+}
+
+}  // namespace
+
 PYBIND11_MODULE(xla_extension, m) {
   // Types
   py::enum_<PrimitiveType>(m, "PrimitiveType")
@@ -75,38 +127,99 @@
       .value("C64", C64)
       .value("C128", C128)
       .value("TUPLE", TUPLE)
-      .value("OPAQUE", OPAQUE)
+      .value("OPAQUE_TYPE", OPAQUE_TYPE)
       .value("TOKEN", TOKEN);
 
   // Shapes
-  py::class_<Shape>(m, "Shape")
+  py::class_<Shape> shape_class(m, "Shape");
+  shape_class
       .def_static(
-          "Tuple",
+          "tuple_shape",
           [](std::vector<Shape> shapes) -> Shape {
             return ShapeUtil::MakeTupleShape(shapes);
           },
-          "Makes a tuple shape.")
+          "Constructs a tuple shape.")
       .def_static(
-          "Array",
-          [](PrimitiveType type, std::vector<int64> dims,
-             absl::optional<std::vector<int64>> layout) -> Shape {
-            if (layout) {
-              return ShapeUtil::MakeShapeWithLayout(type, dims, *layout);
+          "array_shape",
+          [](PrimitiveType type, py::object dims_seq,
+             absl::optional<py::object> layout_seq) -> Shape {
+            std::vector<int64> dims = IntSequenceToVector(dims_seq);
+            if (layout_seq) {
+              std::vector<int64> layout = IntSequenceToVector(*layout_seq);
+              return ShapeUtil::MakeShapeWithLayout(type, dims, layout);
             } else {
               Shape shape = ShapeUtil::MakeShape(type, dims);
               shape.clear_layout();
               return shape;
             }
           },
-          "Makes an array shape.", py::arg("type"), py::arg("dims"),
+          "Constructs an array shape.", py::arg("type"), py::arg("dims"),
+          py::arg("layout") = absl::nullopt)
+      .def_static(
+          "array_shape",
+          [](py::dtype dtype, py::object dims_seq,
+             absl::optional<py::object> layout_seq) -> Shape {
+            PrimitiveType type = ValueOrThrow(DtypeToPrimitiveType(dtype));
+            std::vector<int64> dims = IntSequenceToVector(dims_seq);
+            if (layout_seq) {
+              std::vector<int64> layout = IntSequenceToVector(*layout_seq);
+              return ShapeUtil::MakeShapeWithLayout(type, dims, layout);
+            } else {
+              Shape shape = ShapeUtil::MakeShape(type, dims);
+              shape.clear_layout();
+              return shape;
+            }
+          },
+          "Constructs an array shape.", py::arg("type"), py::arg("dims"),
           py::arg("layout") = absl::nullopt)
       .def("dimensions",
-           static_cast<const std::vector<int64>& (Shape::*)() const>(
-               &Shape::dimensions))
-      .def("element_type", &Shape::element_type)
+           [](const Shape& shape) -> py::tuple {
+             return IntSpanToTuple(shape.dimensions());
+           })
+      .def("xla_element_type", &Shape::element_type)
+      .def("element_type",
+           [](const Shape& shape) {
+             return ValueOrThrow(PrimitiveTypeToDtype(shape.element_type()));
+           })
+      .def("numpy_dtype",
+           [](const Shape& shape) {
+             if (shape.IsTuple()) {
+               return py::dtype("O");
+             }
+             return ValueOrThrow(PrimitiveTypeToDtype(shape.element_type()));
+           })
+      .def("is_tuple", &Shape::IsTuple)
+      .def("is_array", &Shape::IsArray)
+      .def("rank", &Shape::rank)
+      .def("to_serialized_proto",
+           [](const Shape& shape) {
+             ShapeProto proto = shape.ToProto();
+             return py::bytes(proto.SerializeAsString());
+           })
       .def("tuple_shapes",
-           static_cast<const std::vector<Shape>& (Shape::*)() const>(
-               &Shape::tuple_shapes))
+           [](const Shape& shape) {
+             return std::vector<Shape>(shape.tuple_shapes());
+           })
+      .def(
+          "with_major_to_minor_layout_if_absent",
+          [](const Shape& shape) {
+            Shape out = shape;
+            ShapeUtil::ForEachMutableSubshape(
+                &out, [](Shape* subshape, const ShapeIndex&) {
+                  if (!subshape->has_layout()) {
+                    LayoutUtil::SetToDefaultLayout(subshape);
+                  }
+                });
+            return out;
+          },
+          "Returns a copy of a shape with missing layouts set to "
+          "major-to-minor.")
+      .def("__eq__", [](const Shape& shape,
+                        const Shape& other) { return shape == other; })
+      .def("__ne__", [](const Shape& shape,
+                        const Shape& other) { return shape != other; })
+      .def("__hash__",
+           [](const Shape& shape) { return absl::Hash<Shape>()(shape); })
       .def("__repr__", [](const Shape& shape) {
         return shape.ToString(/*print_layouts=*/true);
       });
@@ -121,10 +234,10 @@
             *program_shape.mutable_result() = result;
             return program_shape;
           }))
-      .def("Parameters",
+      .def("parameter_shapes",
            static_cast<const std::vector<Shape>& (ProgramShape::*)() const>(
                &ProgramShape::parameters))
-      .def("Result", &ProgramShape::result)
+      .def("result_shape", &ProgramShape::result)
       .def("__repr__", &ProgramShape::ToString);
 
   // Literals
@@ -161,19 +274,22 @@
   // CPU custom-call targets.
   m.def("RegisterCpuCustomCallTarget", &RegisterCpuCustomCallTarget);
 
-  py::class_<PyLocalClient>(m, "LocalClient")
+  // LocalClient supports dynamic attributes so that external backends (e.g.,
+  // TPU) can stash private data in the client.
+  py::class_<PyLocalClient>(m, "LocalClient", py::dynamic_attr())
       .def_static("Get", &PyLocalClient::Get)
       .def("DeviceCount", &PyLocalClient::device_count)
       .def("TransferToInfeed", &PyLocalClient::TransferToInfeed)
       .def("TransferFromOutfeed", &PyLocalClient::TransferFromOutfeed);
 
-  py::class_<LocalShapedBuffer>(m, "LocalShapedBuffer")
-      .def_static("FromPython", &LocalShapedBuffer::FromPython)
-      .def_static("FromPythonValues", &LocalShapedBuffer::FromPythonValues)
-      .def("Delete", &LocalShapedBuffer::Delete)
-      .def("DestructureTuple", &LocalShapedBuffer::DestructureTuple)
-      .def("ToPython", &LocalShapedBuffer::ToPython)
-      .def("shape", &LocalShapedBuffer::shape);
+  py::class_<PyLocalBuffer>(m, "PyLocalBuffer")
+      .def_static("FromPython", &PyLocalBuffer::FromPython)
+      .def_static("FromPythonValues", &PyLocalBuffer::FromPythonValues)
+      .def_static("MakeTuple", &PyLocalBuffer::MakeTuple)
+      .def("Delete", &PyLocalBuffer::Delete)
+      .def("DestructureTuple", &PyLocalBuffer::DestructureTuple)
+      .def("ToPython", &PyLocalBuffer::ToPython)
+      .def("shape", &PyLocalBuffer::on_host_shape);
 
   py::class_<PyLocalExecutable>(m, "LocalExecutable")
       .def_static("Compile", &PyLocalExecutable::Compile,
@@ -185,6 +301,17 @@
       .def("ExecutePerReplica", &PyLocalExecutable::ExecutePerReplica,
            py::call_guard<py::gil_scoped_release>());
 
+  py::class_<DebugOptions>(m, "DebugOptions")
+      .def_property("xla_cpu_enable_fast_math",
+                    &DebugOptions::xla_cpu_enable_fast_math,
+                    &DebugOptions::set_xla_cpu_enable_fast_math)
+      .def_property("xla_cpu_fast_math_honor_infs",
+                    &DebugOptions::xla_cpu_fast_math_honor_infs,
+                    &DebugOptions::set_xla_cpu_fast_math_honor_infs)
+      .def_property("xla_cpu_fast_math_honor_nans",
+                    &DebugOptions::xla_cpu_fast_math_honor_nans,
+                    &DebugOptions::set_xla_cpu_fast_math_honor_nans);
+
   py::class_<ExecutableBuildOptions>(m, "ExecutableBuildOptions")
       .def(py::init<>())
       .def_property(
@@ -196,7 +323,10 @@
           },
           &ExecutableBuildOptions::set_result_layout)
       .def_property("num_replicas", &ExecutableBuildOptions::num_replicas,
-                    &ExecutableBuildOptions::set_num_replicas);
+                    &ExecutableBuildOptions::set_num_replicas)
+      .def_property_readonly(
+          "debug_options", &ExecutableBuildOptions::mutable_debug_options,
+          py::return_value_policy::reference, py::keep_alive<1, 0>());
 
   py::class_<XlaComputation>(m, "XlaComputation")
       .def("GetProgramShape", &XlaComputation::GetProgramShape)
@@ -234,9 +364,14 @@
   // XlaBuilder.
   py::module ops = m.def_submodule("ops", "XLA operations");
 
+  ops.def("AllReduce",
+          static_cast<XlaOp (*)(
+              XlaOp, const XlaComputation&, absl::Span<const ReplicaGroup>,
+              const absl::optional<ChannelHandle>&)>(&CrossReplicaSum));
   ops.def("AllToAll", &AllToAll);
+  ops.def("CollectivePermute", &CollectivePermute);
   ops.def("CrossReplicaSum",
-          static_cast<XlaOp (*)(const XlaOp&, absl::Span<const ReplicaGroup>)>(
+          static_cast<XlaOp (*)(XlaOp, absl::Span<const ReplicaGroup>)>(
               &CrossReplicaSum));
   ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"),
           py::arg("new_element_type"));
@@ -249,14 +384,11 @@
   ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions"));
   ops.def("ConcatInDim", &ConcatInDim);
   ops.def("Conditional",
-          static_cast<XlaOp (*)(const XlaOp&,
-                                absl::Span<const XlaComputation* const>,
+          static_cast<XlaOp (*)(XlaOp, absl::Span<const XlaComputation* const>,
                                 absl::Span<const XlaOp>)>(&Conditional));
-  ops.def(
-      "Conditional",
-      static_cast<XlaOp (*)(const XlaOp&, const XlaOp&, const XlaComputation&,
-                            const XlaOp&, const XlaComputation&)>(
-          &Conditional));
+  ops.def("Conditional",
+          static_cast<XlaOp (*)(XlaOp, XlaOp, const XlaComputation&, XlaOp,
+                                const XlaComputation&)>(&Conditional));
   ops.def("ConstantLiteral", &ConstantLiteral);
   ops.def("ConvGeneralDilated", &ConvGeneralDilated, py::arg("lhs"),
           py::arg("rhs"), py::arg("window_strides"), py::arg("padding"),
@@ -272,11 +404,19 @@
   ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"),
           py::arg("dimension_numbers"), py::arg("precision_config") = nullptr);
   ops.def("DynamicSlice",
-          static_cast<XlaOp (*)(const XlaOp&, absl::Span<const XlaOp>,
+          static_cast<XlaOp (*)(XlaOp, absl::Span<const XlaOp>,
                                 absl::Span<const int64>)>(&DynamicSlice));
   ops.def("DynamicUpdateSlice",
-          static_cast<XlaOp (*)(const XlaOp&, const XlaOp&,
-                                absl::Span<const XlaOp>)>(&DynamicUpdateSlice));
+          static_cast<XlaOp (*)(XlaOp, XlaOp, absl::Span<const XlaOp>)>(
+              &DynamicUpdateSlice));
+
+  ops.def("Fft", &Fft);
+  py::enum_<FftType>(m, "FftType")
+      .value("FFT", FftType::FFT)
+      .value("IFFT", FftType::IFFT)
+      .value("RFFT", FftType::RFFT)
+      .value("IRFFT", FftType::IRFFT);
+
   ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"),
           py::arg("dimension_numbers"), py::arg("slice_sizes"));
   ops.def("GetTupleElement", &GetTupleElement);
@@ -322,12 +462,10 @@
                                 absl::Span<const int64>)>(&Reduce));
   ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding);
   ops.def("ReplicaId", &ReplicaId);
+  ops.def("Reshape", static_cast<XlaOp (*)(XlaOp, absl::Span<const int64>,
+                                           absl::Span<const int64>)>(&Reshape));
   ops.def("Reshape",
-          static_cast<XlaOp (*)(const XlaOp&, absl::Span<const int64>,
-                                absl::Span<const int64>)>(&Reshape));
-  ops.def(
-      "Reshape",
-      static_cast<XlaOp (*)(const XlaOp&, absl::Span<const int64>)>(&Reshape));
+          static_cast<XlaOp (*)(XlaOp, absl::Span<const int64>)>(&Reshape));
   ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions"));
   ops.def("RngNormal", &RngNormal);
   ops.def("RngUniform", &RngUniform);
@@ -338,10 +476,22 @@
   ops.def("Slice", &Slice);
   ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"),
           py::arg("limit_index"), py::arg("stride"), py::arg("dimno"));
-  ops.def("Sort",
-          static_cast<XlaOp (*)(const XlaOp&, absl::Span<const XlaOp>, int64)>(
-              &Sort),
-          py::arg("keys"), py::arg("values"), py::arg("dimension") = -1);
+  ops.def(
+      "Sort",
+      [](XlaBuilder* builder, absl::Span<const XlaOp> operands,
+         int64 dimension) -> XlaOp {
+        return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+          std::vector<PrimitiveType> operand_types;
+          for (const auto& operand : operands) {
+            TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand));
+            operand_types.push_back(operand_shape.element_type());
+          }
+          return Sort(operands,
+                      CreateScalarLtComputation(operand_types, builder),
+                      dimension);
+        });
+      },
+      py::arg("builder"), py::arg("operands"), py::arg("dimension") = -1);
   ops.def("Transpose", &Transpose);
   ops.def("TriangularSolve", &TriangularSolve);
   ops.def("Tuple", &Tuple);
@@ -433,5 +583,4 @@
   tensorflow::AddXrtSubmodule(&m);
 }
 
-}  // namespace xla_python
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index c37b512..ced268b 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -28,7 +28,6 @@
 import numpy as np
 
 import six
-from six.moves import xrange
 
 # Note this module does *not* depend on any Python protocol buffers. The XLA
 # Python bindings are currently packaged both as part of jaxlib and as part
@@ -76,6 +75,10 @@
     """Deletes buffer `c_buffer`."""
 
   @abc.abstractmethod
+  def make_tuple(self, c_buffers, device_ordinal):
+    """Builds and returns a tuple buffer from a sequence of backend buffers."""
+
+  @abc.abstractmethod
   def destructure_tuple(self, c_buffer):
     """Destructures a tuple buffer into a sequence of buffers."""
 
@@ -99,29 +102,33 @@
 class LocalBackend(Backend):
   """XLA backend implemented using the in-process xla::LocalClient API."""
 
-  def __init__(self, platform=None, xla_platform_id=None):
+  def __init__(self, platform=None, xla_platform_id=None, asynchronous=False):
     """Creates a new LocalBackend.
 
     Args:
       platform: A string; the user-visible platform name, e.g. 'gpu'.
       xla_platform_id: A string; XLA's name for the platform, e.g., 'CUDA'.
+      asynchronous: A boolean; should we enable asynchronous execution?
+        (Experimental.)
     """
     super(LocalBackend, self).__init__(platform)
-    self.client = _xla.LocalClient.Get(xla_platform_id)
+    self.client = _xla.LocalClient.Get(platform, xla_platform_id, asynchronous)
 
   def device_count(self):
     return self.client.DeviceCount()
 
   def buffer_from_pyval(self, pyval, device=0):
-    return _xla.LocalShapedBuffer.FromPython(pyval, self.client, device)
+    return _xla.PyLocalBuffer.FromPython(pyval, self.client, device)
 
   def buffers_from_pyvals(self, pyvals_and_devices):
-    return _xla.LocalShapedBuffer.FromPythonValues(pyvals_and_devices,
-                                                   self.client)
+    return _xla.PyLocalBuffer.FromPythonValues(pyvals_and_devices, self.client)
 
   def delete_buffer(self, c_buffer):
     c_buffer.Delete()
 
+  def make_tuple(self, c_buffers, device_ordinal):
+    return _xla.PyLocalBuffer.MakeTuple(c_buffers, self.client, device_ordinal)
+
   def destructure_tuple(self, c_buffer):
     return c_buffer.DestructureTuple()
 
@@ -129,13 +136,13 @@
     options = _xla.ExecutableBuildOptions()
     options.num_replicas = compile_options.num_replicas
     if compile_options.argument_layouts:
-      argument_layouts = [
-          s.as_xla_shape() for s in compile_options.argument_layouts
-      ]
+      argument_layouts = compile_options.argument_layouts
     else:
-      argument_layouts = c_computation.GetProgramShape().Parameters()
+      argument_layouts = c_computation.GetProgramShape().parameter_shapes()
     if compile_options.result_layout:
-      options.result_layout = compile_options.result_layout.as_xla_shape()
+      options.result_layout = compile_options.result_layout
+    options.debug_options.xla_cpu_fast_math_honor_infs = True
+    options.debug_options.xla_cpu_fast_math_honor_nans = True
     return _xla.LocalExecutable.Compile(c_computation, argument_layouts,
                                         options, self.client)
 
@@ -149,10 +156,18 @@
     return executable.ExecutePerReplica(per_replica_args)
 
 
+def _cpu_backend_factory():
+  return LocalBackend(platform='cpu', xla_platform_id='Host', asynchronous=True)
+
+
+def _gpu_backend_factory():
+  return LocalBackend(platform='gpu', xla_platform_id='CUDA')
+
+
 # Backend factories, keyed by user-visible name, in increasing priority order.
 _local_backend_factories = collections.OrderedDict([
-    ('cpu', lambda: LocalBackend(platform='cpu', xla_platform_id='Host')),
-    ('gpu', lambda: LocalBackend(platform='gpu', xla_platform_id='CUDA')),
+    ('cpu', _cpu_backend_factory),
+    ('gpu', _gpu_backend_factory),
 ])
 
 
@@ -281,6 +296,66 @@
   return DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))]
 
 
+Shape = _xla.Shape  # Implemented in C++; exposed by the _xla extension.
+Shape.__doc__ = """
+A Shape is an object defined in C++ that duck types like the following class:
+
+class Shape(object):
+  '''Represents an XLA shape.
+
+  A shape is either an array shape, having rank-many integer
+  dimensions and an element type (represented by a Numpy dtype), or it
+  is a tuple shape, having a shape for every tuple component:
+
+    type shape =
+        TupleShape of shape list
+      | ArrayShape of { dimensions: int list; element_type: dtype }
+  '''
+
+  @staticmethod
+  def tuple_shape(tuple_shapes) -> Shape:
+    "Construct a tuple shape."
+
+  @staticmethod
+  def array_shape(type, dims, layout=None) -> Shape:
+
+  @staticmethod
+  def from_pyval(pyval) -> Shape:
+    "Returns a Shape that describes a tuple-tree of Numpy arrays."
+
+  def __eq__(self, other: Shape) -> bool:
+  def __ne__(self, other: Shape) -> bool:
+  def __hash__(self):
+  def __repr__(self):
+  def is_tuple(self) -> bool:
+  def is_array(self) -> bool:
+  def tuple_shapes(self) -> [Shape]:
+  def numpy_dtype(self) -> np.dtype:
+    "Like element_type(), but returns dtype('O') for a tuple shape."
+  def xla_element_type(self) -> PrimitiveType:
+  def element_type(self) -> np.dtype:
+  def dimensions(self) -> (int, int, ...):
+  def rank(self) -> int:
+  def minor_to_major(self) -> [int]:
+  def with_major_to_minor_layout_if_absent(self) -> Shape:
+    "Returns a copy with missing layouts set to major-to-minor."
+
+  def to_serialized_proto(self) -> bytes:
+    "Returns 'shape' as a serialized proto."
+"""
+
+ProgramShape = _xla.ProgramShape
+ProgramShape.__doc__ = """
+A ProgramShape is a C++ object that duck types like the following class.
+
+class ProgramShape(object):
+  def __init__(self, parameter_shapes, result_shape):
+  def parameter_shapes(self) -> [Shape]:
+  def result_shape(self) -> Shape:
+  def __repr__(self):
+"""
+
+
 class Buffer(object):
   """Represents a handle to data owned by XLA.
 
@@ -307,10 +382,11 @@
     """Copies multiple Python values to freshly allocated on-device buffers.
 
     Arguments:
-      pyvals_and_devices: a list of `(pyval, device)` pairs, where `pyval` is
-      a Python value to copy (e.g., a NumPy array), and `device` is an integer
-      device ordinal.
+      pyvals_and_devices: a list of `(pyval, device)` pairs, where `pyval` is a
+        Python value to copy (e.g., a NumPy array), and `device` is an integer
+        device ordinal.
       backend: a Backend object, or `None` to use the default local backend.
+
     Returns:
       A list of `Buffer` objects corresponding to `pyvals_and_devices`.
     """
@@ -323,11 +399,18 @@
         for cbuf, (_, device) in zip(cbufs, pyvals_and_devices)
     ]
 
+  @staticmethod
+  def make_tuple(buffers, backend=None, device=0):
+    backend = backend or get_local_backend()  # Default: the local backend.
+    handles = [b.c_buffer for b in buffers]  # Unwrap the backend buffers.
+    tuple_buf = backend.make_tuple(handles, device_ordinal=device)
+    return Buffer(tuple_buf, backend, device)
+
   def to_py(self):
     return self.c_buffer.ToPython()
 
   def shape(self):
-    return _wrap_shape(self.c_buffer.shape())
+    return self.c_buffer.shape()
 
   def device(self):
     return self._device
@@ -341,7 +424,6 @@
     """Assuming a tuple buffer, unpack it into constituent tuple elements."""
     assert self.c_buffer is not None
     result = self._backend.destructure_tuple(self.c_buffer)
-    self.delete()
     return tuple(
         Buffer(sub_buffer, device=self._device, backend=self._backend)
         for sub_buffer in result)
@@ -355,205 +437,17 @@
 LocalBuffer = Buffer
 
 
-class Format(enum.IntEnum):
-  """Python copy of the Format protocol buffer enum."""
-  INVALID_FORMAT = 0
-  DENSE = 1
-  SPARSE = 2
+def shape_from_pyval(pyval):
+  """Returns a Shape that describes a tuple-tree of Numpy arrays."""
 
-
-class Shape(object):
-  """Represents an XLA shape.
-
-  A shape is either an array shape, having rank-many integer
-  dimensions and an element type (represented by a Numpy dtype), or it
-  is a tuple shape, having a shape for every tuple component:
-
-    type shape =
-        TupleShape of shape list
-      | ArrayShape of { dimensions: int list; element_type: dtype }
-
-  Callers are expected to instantiate this class only via the static
-  constructors: tuple_shape, array_shape, and from_pyval.
-  """
-
-  @staticmethod
-  def tuple_shape(tuple_shapes):
-    """Construct a tuple shape."""
-    if (not isinstance(tuple_shapes, (tuple, list)) or
-        not all(isinstance(t, Shape) for t in tuple_shapes)):
-      raise TypeError('tuple_shapes must be a tuple of Shapes')
-    return Shape(tuple_shapes, tuple)
-
-  @staticmethod
-  def array_shape(element_type, dimensions, minor_to_major=None):
-    """Construct an array shape."""
-    if (not isinstance(dimensions, tuple) or
-        not all(isinstance(i, int) for i in dimensions)):
-      dimensions = tuple(int(i) for i in dimensions)
-    return Shape(
-        dimensions, np.dtype(element_type), minor_to_major=minor_to_major)
-
-  @staticmethod
-  def from_pyval(pyval):
-    """Returns a Shape that describes a tuple-tree of Numpy arrays."""
-
-    def convert(pyval):
-      if isinstance(pyval, tuple):
-        return Shape.tuple_shape(tuple(convert(elt) for elt in pyval))
-      else:
-        pyval = require_numpy_array_layout(pyval)
-        return Shape.array_shape(pyval.dtype, np.shape(pyval))
-
-    return convert(pyval)
-
-  def __init__(self, dimensions, dtype, minor_to_major=None):
-    assert isinstance(dimensions, tuple)
-    self._dimensions = dimensions
-    self._dtype = dtype
-    self._is_tuple = dtype == tuple
-    self._minor_to_major = minor_to_major
-    self._check_minor_to_major()
-
-  def __eq__(self, other):
-    # pylint: disable=protected-access
-    return (self._dtype == other._dtype and
-            self._dimensions == other._dimensions and
-            self._minor_to_major == other._minor_to_major)
-
-  def __ne__(self, other):
-    return not self == other
-
-  def __hash__(self):
-    return hash((self._dtype, self._dimensions, self._minor_to_major))
-
-  def __repr__(self):
-    return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
-            '_is_tuple={!r}, _minor_to_major={!r})').format(
-                self._dtype, self._dimensions, self._is_tuple,
-                self._minor_to_major)
-
-  def is_tuple(self):
-    return self._is_tuple
-
-  def is_array(self):
-    return not self._is_tuple
-
-  def tuple_shapes(self):
-    if not self.is_tuple():
-      raise ValueError('not a tuple shape')
-    return self._dimensions
-
-  def numpy_dtype(self):
-    """Like element_type(), but returns dtype('O') in case of a tuple shape."""
-    if self.is_tuple():
-      return np.dtype(np.object)
+  def convert(pyval):
+    if isinstance(pyval, tuple):
+      return Shape.tuple_shape(tuple(convert(elt) for elt in pyval))
     else:
-      return self.element_type()
+      pyval = require_numpy_array_layout(pyval)
+      return Shape.array_shape(pyval.dtype, np.shape(pyval))
 
-  def xla_element_type(self):
-    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())]
-
-  def element_type(self):
-    if not self.is_array():
-      raise ValueError('not an array shape')
-    return self._dtype
-
-  def dimensions(self):
-    if not self.is_array():
-      raise ValueError('not an array shape')
-    return self._dimensions
-
-  def rank(self):
-    return len(self.dimensions())
-
-  def minor_to_major(self):
-    return self._minor_to_major
-
-  def map_leaves(self, f):
-    """Map f over each leaf-level array subshape.
-
-    Args:
-      f: The function to apply. Whenever f returns None, the identity is applied
-        instead.
-
-    Returns:
-      A new Shape with the mapped leaves.
-    """
-    if self.is_tuple():
-      children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
-      return Shape.tuple_shape(children)
-    else:
-      mapped = f(self)
-      return self if mapped is None else mapped
-
-  def _check_minor_to_major(self):
-    mtm = self._minor_to_major
-    if self.is_tuple():
-      assert mtm is None, self
-    if mtm is not None:
-      assert self.rank() == len(mtm), self
-      assert sorted(mtm) == list(range(len(mtm))), self
-
-  def update_minor_to_major(self, minor_to_major):
-    if not self.is_array():
-      raise ValueError('not an array shape')
-    if not isinstance(minor_to_major, tuple):
-      raise TypeError('minor_to_major must be a tuple')
-    updated = Shape.array_shape(self.element_type(), self.dimensions(),
-                                minor_to_major)
-    updated._check_minor_to_major()  # pylint: disable=protected-access
-    return updated
-
-  def with_major_to_minor_layout_if_absent(self):
-    """Returns a copy of a shape with missing layouts set to major-to-minor."""
-
-    def f(a):
-      if a.minor_to_major():
-        return None
-      return a.update_minor_to_major(tuple(xrange(a.rank() - 1, -1, -1)))
-
-    return self.map_leaves(f)
-
-  def serialize(self, proto):
-    """Serializes 'shape' into proto."""
-    if self.is_tuple():
-      proto.element_type = int(PrimitiveType.TUPLE)
-      for shape in self.tuple_shapes():
-        shape.serialize(proto.tuple_shapes.add())
-    else:
-      proto.element_type = int(self.xla_element_type())
-      proto.dimensions.extend(self.dimensions())
-      proto.is_dynamic_dimension.extend([False for _ in self.dimensions()])
-      if self.minor_to_major():
-        proto.layout.format = Format.DENSE
-        proto.layout.minor_to_major.extend(self.minor_to_major())
-
-  def as_xla_shape(self):
-    if self.is_tuple():
-      return _xla.Shape.Tuple([x.as_xla_shape() for x in self.tuple_shapes()])
-
-    return _xla.Shape.Array(self.xla_element_type(), self.dimensions(),
-                            self.minor_to_major())
-
-
-ProgramShape = collections.namedtuple('ProgramShape',
-                                      ('parameter_shapes', 'result_shape'))
-
-
-def _wrap_shape(xla_shape):
-  element_type = xla_shape.element_type()
-  if element_type == PrimitiveType.TUPLE:
-    shapes = tuple(_wrap_shape(sub) for sub in xla_shape.tuple_shapes())
-    return Shape.tuple_shape(shapes)
-  else:
-    dtype = XLA_ELEMENT_TYPE_TO_DTYPE[element_type]
-    return Shape.array_shape(dtype, xla_shape.dimensions())
-
-
-def _wrap_program_shape(program_shape):
-  return ProgramShape([_wrap_shape(arg) for arg in program_shape.Parameters()],
-                      _wrap_shape(program_shape.Result()))
+  return convert(pyval)
 
 
 def require_numpy_array_layout(value):
@@ -596,8 +490,7 @@
   # TODO(phawkins): support non-default backends.
   backend = get_local_backend()
   return backend.client.TransferFromOutfeed(
-      shape.with_major_to_minor_layout_if_absent().as_xla_shape(),
-      device_ordinal)
+      shape.with_major_to_minor_layout_if_absent(), device_ordinal)
 
 
 class CompileOptions(object):
@@ -683,10 +576,10 @@
     return Executable(c, backend=backend)
 
   def GetProgramShape(self):
-    return _wrap_program_shape(self._c_computation.GetProgramShape())
+    return self._c_computation.GetProgramShape()
 
   def GetReturnValueShape(self):
-    return _wrap_shape(self._c_computation.GetProgramShape().Result())
+    return self._c_computation.GetProgramShape().result_shape()
 
 
 class Executable(object):
@@ -861,7 +754,7 @@
       return Computation(self._builder.Build(), backend=backend)
 
   def GetShape(self, operand):
-    return _wrap_shape(self._builder.GetShape(operand))
+    return self._builder.GetShape(operand)
 
   def SetOpMetadata(self, op_metadata):
     """Set metadata for operations that are about to be enqueued."""
@@ -880,9 +773,8 @@
     Returns:
       An XlaOp.
     """
-    return ops.Infeed(
-        self._builder,
-        shape.with_major_to_minor_layout_if_absent().as_xla_shape())
+    return ops.Infeed(self._builder,
+                      shape.with_major_to_minor_layout_if_absent())
 
   def Outfeed(self, operand):
     """Enqueues an outfeed op onto the computation.
@@ -979,10 +871,9 @@
     if parameter_num is None:
       parameter_num = next(self._parameter_numbering)
 
-    return ops.Parameter(
-        self._builder, parameter_num,
-        shape.with_major_to_minor_layout_if_absent().as_xla_shape(),
-        name.encode('utf8'))
+    return ops.Parameter(self._builder, parameter_num,
+                         shape.with_major_to_minor_layout_if_absent(),
+                         name.encode('utf8'))
 
   def ParameterFromNumpy(self, value, name=None, parameter_num=None):
     """Enqueues a Parameter op onto the computation.
@@ -997,7 +888,7 @@
       An XlaOp.
     """
     return self.ParameterWithShape(
-        Shape.from_pyval(value), name=name, parameter_num=parameter_num)
+        shape_from_pyval(value), name=name, parameter_num=parameter_num)
 
   def Iota(self, dtype, size):
     """Enqueues an iota constant onto the computation.
@@ -1024,7 +915,7 @@
       An XlaOp representing the added broadcasted iota constant.
     """
     element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))]
-    xla_shape = _xla.Shape.Array(element_type, shape, None)
+    xla_shape = _xla.Shape.array_shape(element_type, shape, None)
     return ops.Iota(self._builder, xla_shape, dimension)
 
   def Concatenate(self, operands, dimension):
@@ -1081,6 +972,24 @@
       dimensions = tuple(range(ndim))
     return ops.Reshape(operand, dimensions, new_sizes)
 
+  def AllReduce(self, operand, computation, replica_groups=None):
+    """AllReduce op.
+
+    Args:
+      operand: XlaOp representing the input array
+      computation: a Computation object - binary reduction function.
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas - 1} into equally-sized replica groups
+        within which the all-reduce is performed. If not supplied or None (the
+        default), all replicas belong to the same group.
+
+    Returns:
+      An XlaOp that represents the all-reduced result.
+    """
+    replica_groups_protos = _get_replica_groups_protos(replica_groups)
+    return ops.AllReduce(operand, computation.computation,
+                         replica_groups_protos, None)
+
   def AllToAll(self,
                operand,
                split_dimension,
@@ -1101,13 +1010,7 @@
     Returns:
       An XlaOp that represents the all-to-all concatenation.
     """
-    if replica_groups is None:
-      replica_groups_protos = []  # special value for XLA API
-    else:
-      replica_groups = list(replica_groups)
-      replica_groups_protos = [
-          _make_replica_group_proto(group) for group in replica_groups
-      ]
+    replica_groups_protos = _get_replica_groups_protos(replica_groups)
     if not replica_groups:
       split_count = 1
     else:
@@ -1282,10 +1185,9 @@
       An XlaOp representing the added custom call op.
     """
     opaque = opaque or b''
-    return ops.CustomCall(
-        self._builder, call_target_name, list(operands),
-        shape_with_layout.as_xla_shape(),
-        [s.as_xla_shape() for s in operand_shapes_with_layout], opaque)
+    return ops.CustomCall(self._builder, call_target_name,
+                          list(operands), shape_with_layout,
+                          list(operand_shapes_with_layout), opaque)
 
   def Map(self, operands, computation_to_apply, dimensions):
     """Enqueues a map operation onto the computation.
@@ -1373,7 +1275,7 @@
       dims: A 1D array-like of nonnegative integers specifying the dimensions.
     Returns: a XlaOp to the generated array of F32 values.
     """
-    shape = _xla.Shape.Array(self.GetShape(mu).xla_element_type(), dims)
+    shape = _xla.Shape.array_shape(self.GetShape(mu).xla_element_type(), dims)
     return ops.RngNormal(mu, sigma, shape)
 
   def RngUniform(self, a, b, dims):
@@ -1390,7 +1292,7 @@
     Returns: a XlaOp to the generated array of values with the same numeric type
       (F32, S32, or U32) as the arguments a and b.
     """
-    shape = _xla.Shape.Array(self.GetShape(a).xla_element_type(), dims)
+    shape = _xla.Shape.array_shape(self.GetShape(a).xla_element_type(), dims)
     return ops.RngUniform(a, b, shape)
 
   def While(self, cond, body, init):
@@ -1593,11 +1495,11 @@
 
   def Sort(self, operand, dimension=-1):
     """Enqueues a sort operation onto the computation."""
-    return ops.Sort(operand, [], dimension)
+    return ops.Sort(self._builder, [operand], dimension)
 
   def SortKeyVal(self, keys, values, dimension=-1):
     """Enqueues a key-value sort operation onto the computation."""
-    return ops.Sort(keys, [values], dimension)
+    return ops.Sort(self._builder, [keys, values], dimension)
 
   def QR(self, a, full_matrices=True):
     """Enqueues a QR decomposition onto the computation."""
@@ -1636,6 +1538,12 @@
     return ops.Scatter(a, scatter_indices, updates,
                        update_computation.computation, dimension_numbers)
 
+  def Fft(self, operand, fft_type, fft_lengths):
+    """Enqueues an FFT operation onto the computation."""
+    return ops.Fft(operand, fft_type, fft_lengths)
+
+
+FftType = _xla.FftType
 
 _UNARY_OPS = [
     'Not',
@@ -1709,6 +1617,7 @@
     'Cholesky',
     'Clamp',
     'Collapse',
+    'CollectivePermute',
     'ConvertElementType',
     'Dot',
     'Gather',
@@ -1862,3 +1771,14 @@
   replica_group_proto = ReplicaGroup()
   replica_group_proto.replica_ids.extend(replica_group)
   return replica_group_proto
+
+
+def _get_replica_groups_protos(replica_groups):
+  if replica_groups is None:
+    replica_groups_protos = []  # special value for XLA API
+  else:
+    replica_groups = list(replica_groups)
+    replica_groups_protos = [
+        _make_replica_group_proto(group) for group in replica_groups
+    ]
+  return replica_groups_protos
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 036df7d..b08089b 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -315,10 +315,11 @@
     c.CustomCall(
         b"test_subtract_f32",
         operands=(c.ConstantF32Scalar(1.25), c.ConstantF32Scalar(0.5)),
-        shape_with_layout=xla_client.Shape.array_shape(np.float32, (), ()),
+        shape_with_layout=xla_client.Shape.array_shape(
+            np.dtype(np.float32), (), ()),
         operand_shapes_with_layout=(
-            xla_client.Shape.array_shape(np.float32, (), ()),
-            xla_client.Shape.array_shape(np.float32, (), ()),
+            xla_client.Shape.array_shape(np.dtype(np.float32), (), ()),
+            xla_client.Shape.array_shape(np.dtype(np.float32), (), ()),
         ))
     self._ExecuteAndCompareClose(c, expected=0.75)
 
@@ -403,12 +404,12 @@
         expected=[-4.3, 1.3, -6.3, 3.3])
 
 
-class LocalBufferTest(ComputationTest):
-  """Tests focusing on execution with LocalBuffers."""
+class BufferTest(ComputationTest):
+  """Tests focusing on execution with Buffers."""
 
   def _Execute(self, c, arguments):
     compiled_c = c.Build().Compile()
-    arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments]
+    arg_buffers = [xla_client.Buffer.from_pyval(arg) for arg in arguments]
     result_buffer = compiled_c.Execute(arg_buffers)
     return result_buffer.to_py()
 
@@ -437,23 +438,23 @@
     c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
     arg = NumpyArrayF32(1.11)
     compiled_c = c.Build().Compile()
-    arg_buffer = xla_client.LocalBuffer.from_pyval(arg)
+    arg_buffer = xla_client.Buffer.from_pyval(arg)
     arg_buffer.delete()
     with self.assertRaises(ValueError):
       compiled_c.Execute([arg_buffer])
 
   def testDestructureTupleEmpty(self):
     t = ()
-    local_buffer = xla_client.LocalBuffer.from_pyval(t)
+    local_buffer = xla_client.Buffer.from_pyval(t)
     pieces = local_buffer.destructure()
-    self.assertTrue(local_buffer.is_deleted())
+    self.assertFalse(local_buffer.is_deleted())
     self.assertEqual(len(pieces), 0)
 
   def testDestructureTupleOneArrayElement(self):
     t = (np.array([1, 2, 3, 4], dtype=np.int32),)
-    local_buffer = xla_client.LocalBuffer.from_pyval(t)
+    local_buffer = xla_client.Buffer.from_pyval(t)
     pieces = local_buffer.destructure()
-    self.assertTrue(local_buffer.is_deleted())
+    self.assertFalse(local_buffer.is_deleted())
     self.assertEqual(len(pieces), 1)
     array = pieces[0]
     got = array.to_py()
@@ -461,25 +462,30 @@
     np.testing.assert_equal(want, got)
 
   def testDestructureTupleTwoArrayElementDifferentType(self):
-    t = (np.array([1.0, 2.0, 3.0, 4.0],
-                  dtype=np.float32), np.array([2, 3, 4, 5], dtype=np.int32))
-    local_buffer = xla_client.LocalBuffer.from_pyval(t)
-    pieces = local_buffer.destructure()
-    self.assertTrue(local_buffer.is_deleted())
-    self.assertEqual(len(pieces), 2)
-    array0, array1 = pieces
-    got = array0.to_py()
-    want = NumpyArrayF32([1.0, 2.0, 3.0, 4.0])
-    np.testing.assert_equal(want, got)
-    got = array1.to_py()
-    want = NumpyArrayS32([2, 3, 4, 5])
-    np.testing.assert_equal(want, got)
+    t = (
+        np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32),
+        np.array([2, 3, 4, 5], dtype=np.int32),
+    )
+    local_buffer = xla_client.Buffer.from_pyval(t)
+    # Run the test twice to verify that the original tuple buffer remains valid
+    # even after destructuring.
+    for _ in range(2):
+      pieces = local_buffer.destructure()
+      self.assertFalse(local_buffer.is_deleted())
+      self.assertEqual(len(pieces), 2)
+      array0, array1 = pieces
+      got = array0.to_py()
+      want = NumpyArrayF32([1.0, 2.0, 3.0, 4.0])
+      np.testing.assert_equal(want, got)
+      got = array1.to_py()
+      want = NumpyArrayS32([2, 3, 4, 5])
+      np.testing.assert_equal(want, got)
 
   def testDestructureTupleNested(self):
     t = ((NumpyArrayF32([1.0, 2.0]), NumpyArrayS32([3, 4])), NumpyArrayS32([5]))
-    local_buffer = xla_client.LocalBuffer.from_pyval(t)
+    local_buffer = xla_client.Buffer.from_pyval(t)
     pieces = local_buffer.destructure()
-    self.assertTrue(local_buffer.is_deleted())
+    self.assertFalse(local_buffer.is_deleted())
     self.assertEqual(len(pieces), 2)
     tuple0, array1 = pieces
     got = array1.to_py()
@@ -491,9 +497,25 @@
     np.testing.assert_equal(NumpyArrayF32([1.0, 2.0]), got[0])
     np.testing.assert_equal(NumpyArrayS32([3, 4]), got[1])
 
+  def testMakeTuple(self):
+    t = (
+        np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32),
+        np.array([2, 3, 4, 5], dtype=np.int32),
+    )
+    b0 = xla_client.Buffer.from_pyval(t[0])
+    b1 = xla_client.Buffer.from_pyval(t[1])
+    btup = xla_client.Buffer.make_tuple([b0, b1], device=0)
+    pieces = btup.destructure()
+    self.assertEqual(len(pieces), 2)
+    array0, array1 = pieces
+    np.testing.assert_equal(
+        np.array([1, 2, 3, 4], dtype=np.float32), array0.to_py())
+    np.testing.assert_equal(
+        np.array([2, 3, 4, 5], dtype=np.int32), array1.to_py())
+
   def testShape(self):
     pyval = np.array([[1., 2.]], np.float32)
-    local_buffer = xla_client.LocalBuffer.from_pyval(pyval)
+    local_buffer = xla_client.Buffer.from_pyval(pyval)
     xla_shape = local_buffer.shape()
     self.assertEqual(xla_shape.dimensions(), (1, 2))
     self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32))
@@ -1162,6 +1184,23 @@
     c.Cholesky(c.Constant(np.dot(l, l.T)))
     self._ExecuteAndCompareClose(c, expected=l, rtol=1e-4)
 
+  def testSort(self):
+    keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32)
+    c = self._NewComputation()
+    c.Sort(c.Constant(keys))
+    self._ExecuteAndCompareClose(
+        c, expected=np.array([[1, 2, 3, 4], [1, 2, 3, 4]], dtype=np.float32))
+
+  def testSortKeyVal(self):
+    keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32)
+    values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32)
+    c = self._NewComputation()
+    c.SortKeyVal(c.Constant(keys), c.Constant(values), dimension=0)
+    result = c.Build().Compile().ExecuteWithPythonValues()
+    self.assertIsInstance(result, tuple)
+    np.testing.assert_allclose(result[0], [[2, 1, 1, 2], [3, 4, 4, 3]])
+    np.testing.assert_equal(result[1], [[0, 5, 2, 7], [4, 1, 6, 3]])
+
   def testQR(self):
     a = np.array(
         [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]],
@@ -1242,6 +1281,33 @@
     expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32)
     np.testing.assert_allclose(g, expected, rtol=1e-4)
 
+  def testFft(self):
+    shape = [2, 3, 4, 5]
+    rng = np.random.RandomState(0)
+    a = rng.randn(*shape) + 1.0j * rng.randn(*shape)
+    a = a.astype(np.complex64)
+    # FFT
+    c = self._NewComputation()
+    c.Fft(c.Constant(a), xla_client.FftType.FFT, shape[-3:])
+    self._ExecuteAndCompareClose(c, expected=np.fft.fftn(a, axes=(1, 2, 3)),
+                                 rtol=1e-4)
+    # IFFT
+    c = self._NewComputation()
+    c.Fft(c.Constant(a), xla_client.FftType.IFFT, shape[-3:])
+    self._ExecuteAndCompareClose(c, expected=np.fft.ifftn(a, axes=(1, 2, 3)),
+                                 rtol=1e-4)
+    # RFFT
+    b = rng.randn(*shape).astype(np.float32)
+    c = self._NewComputation()
+    c.Fft(c.Constant(b), xla_client.FftType.RFFT, shape[-3:])
+    self._ExecuteAndCompareClose(c, expected=np.fft.rfftn(b, axes=(1, 2, 3)),
+                                 rtol=1e-4)
+    # IRFFT
+    c = self._NewComputation()
+    c.Fft(c.Constant(a), xla_client.FftType.IRFFT, [3, 4, 8])
+    self._ExecuteAndCompareClose(c, expected=np.fft.irfftn(a, axes=(1, 2, 3)),
+                                 rtol=1e-4)
+
 
 class EmbeddedComputationsTest(ComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
@@ -1680,7 +1746,7 @@
   def testInfeedS32Values(self):
     to_infeed = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    c.Infeed(xla_client.Shape.from_pyval(to_infeed[0]))
+    c.Infeed(xla_client.shape_from_pyval(to_infeed[0]))
     compiled_c = c.Build().Compile()
     for item in to_infeed:
       xla_client.transfer_to_infeed(item)
@@ -1692,7 +1758,7 @@
   def testInfeedThenOutfeedS32(self):
     to_round_trip = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0]))
+    x = c.Infeed(xla_client.shape_from_pyval(to_round_trip[0]))
     c.Outfeed(x)
 
     compiled_c = c.Build().Compile()
@@ -1702,7 +1768,7 @@
       execution.start()
       xla_client.transfer_to_infeed(want)
       got = xla_client.transfer_from_outfeed(
-          xla_client.Shape.from_pyval(to_round_trip[0]))
+          xla_client.shape_from_pyval(to_round_trip[0]))
       execution.join()
       self.assertEqual(want, got)
 
@@ -1738,7 +1804,9 @@
     c.ClearOpMetadata()
 
     options = xla_client.CompileOptions()
-    options.argument_layouts = [xla_client.Shape.array_shape(np.float32, [])]
+    options.argument_layouts = [
+        xla_client.Shape.array_shape(np.dtype(np.float32), [])
+    ]
 
     def TestFun():
       return c.Build().Compile(compile_options=options)
diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc
index e5a478e..9403717 100644
--- a/tensorflow/compiler/xla/python/xrt.cc
+++ b/tensorflow/compiler/xla/python/xrt.cc
@@ -85,6 +85,7 @@
 
   py::class_<XrtBuffer, std::shared_ptr<XrtBuffer>>(m, "XrtBuffer")
       .def_static("FromLiteral", &XrtBuffer::FromLiteral)
+      .def_static("MakeTuple", &XrtBuffer::MakeTuple)
       .def("ToPython",
            [](std::shared_ptr<XrtBuffer> buffer) -> xla::StatusOr<py::object> {
              auto literal = absl::make_unique<xla::Literal>();
diff --git a/tensorflow/compiler/xla/python/xrt.py b/tensorflow/compiler/xla/python/xrt.py
index 54d1a7e..f9ab71e 100644
--- a/tensorflow/compiler/xla/python/xrt.py
+++ b/tensorflow/compiler/xla/python/xrt.py
@@ -31,13 +31,6 @@
 # pylint: enable=g-direct-tensorflow-import
 
 
-def _make_xla_shape(shape):
-  if shape.is_tuple():
-    return _xla.Shape.Tuple([_make_xla_shape(s) for s in shape.tuple_shapes()])
-  return _xla.Shape.Array(shape.xla_element_type(), shape.dimensions(),
-                          shape.minor_to_major())
-
-
 def get_tf_context(target, worker):
   """Returns a TensorFlow RPC client object.
 
@@ -60,7 +53,8 @@
     tf_device_type: the type of TensorFlow device to use for XRT (e.g. `"TPU"`).
   """
 
-  def __init__(self, tf_context, tf_device_type):
+  def __init__(self, tf_context, tf_device_type, platform="tpu"):
+    super(XrtBackend, self).__init__(platform)
     self.tf_device_type = tf_device_type
 
     self.context = _xla.xrt.XrtContext.Create(tf_context, tf_device_type)
@@ -77,19 +71,21 @@
   def destructure_tuple(self, c_buffer):
     return c_buffer.DestructureTuple()
 
+  def make_tuple(self, buffers, device_ordinal):
+    return _xla.xrt.XrtBuffer.MakeTuple(self.context, buffers)
+
   def compile(self, computation, compile_options):
     # pylint: disable=protected-access
-    program_shape = xla_client._wrap_program_shape(
-        computation.GetProgramShape())
+    program_shape = computation.GetProgramShape()
     # pylint: enable=protected-access
     proto = computation.GetSerializedProto()
     # TODO(phawkins): use the layouts in compile_options.
     arg_shapes = [
-        _make_xla_shape(shape.with_major_to_minor_layout_if_absent())
-        for shape in program_shape.parameter_shapes
+        shape.with_major_to_minor_layout_if_absent()
+        for shape in program_shape.parameter_shapes()
     ]
-    result_shape = _make_xla_shape(
-        program_shape.result_shape.with_major_to_minor_layout_if_absent())
+    result_shape = (
+        program_shape.result_shape().with_major_to_minor_layout_if_absent())
     device_assignment = _xla.xrt.AssignDevices(compile_options.num_replicas, 1)
     return _xla.xrt.XrtExecutable.Compile(self.context, proto, arg_shapes,
                                           result_shape, device_assignment)
diff --git a/tensorflow/compiler/xla/python/xrt_test.py b/tensorflow/compiler/xla/python/xrt_test.py
index c53ac10..8e0691f 100644
--- a/tensorflow/compiler/xla/python/xrt_test.py
+++ b/tensorflow/compiler/xla/python/xrt_test.py
@@ -48,12 +48,29 @@
     b = np.arange(10)
 
     c = BuildAddAndScaleComputation(
-        xla_client.Shape.from_pyval(a), xla_client.Shape.from_pyval(b))
+        xla_client.shape_from_pyval(a), xla_client.shape_from_pyval(b))
 
     executable = c.Compile(backend=backend)
     output = executable.ExecuteWithPythonValues((a, b))
     self.assertAllEqual(output, (a + b) * 3)
 
+  def testTuples(self):
+    (worker,), _ = test.create_local_cluster(num_workers=1, num_ps=0)
+    self.assertTrue(worker.target.startswith("grpc://"))
+    tf_context = xrt.get_tf_context(worker.target[len("grpc://"):], "worker")
+    backend = xrt.XrtBackend(tf_context, "XLA_CPU")
+
+    a = np.random.randn(10)
+    b = np.random.randn(15, 3)
+    pieces = [
+        xla_client.Buffer.from_pyval(a, backend=backend),
+        xla_client.Buffer.from_pyval(b, backend=backend)
+    ]
+    t = xla_client.Buffer.make_tuple(pieces, backend=backend)
+    a_out, b_out = t.destructure()
+    self.assertAllEqual(a, a_out.to_py())
+    self.assertAllEqual(b, b_out.to_py())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 282420e..1e7a924 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -18,6 +18,9 @@
     includes = [
         "//tensorflow/compiler/xla:friends",
     ],
+    packages = [
+        "//learning/brain/experimental/tf_runtime/...",
+    ],
 )
 
 xla_proto_library(
@@ -434,10 +437,10 @@
     srcs = ["pattern_matcher_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -505,8 +508,8 @@
     hdrs = ["hlo_matchers.h"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -549,13 +552,13 @@
     srcs = ["hlo_sharding_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -583,6 +586,7 @@
     srcs = ["call_graph_test.cc"],
     deps = [
         ":call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -590,7 +594,6 @@
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -653,6 +656,7 @@
     deps = [
         ":call_graph",
         ":flatten_call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -660,7 +664,6 @@
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -691,7 +694,6 @@
     deps = [
         ":compiler",
         ":computation_placer",
-        ":device_memory_allocator",
         ":platform_util",
         ":stream_pool",
         ":transfer_manager",
@@ -701,6 +703,7 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//third_party/eigen3",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
@@ -721,7 +724,6 @@
         ":compiler",
         ":computation_layout",
         ":computation_placer",
-        ":device_memory_allocator",
         ":dump",
         ":dynamic_dimension_inference",
         ":executable",
@@ -751,6 +753,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -767,7 +770,6 @@
         ":backend",
         ":compiler",
         ":computation_layout",
-        ":device_memory_allocator",
         ":executable",
         ":hlo",
         ":hlo_execution_profile",
@@ -787,6 +789,7 @@
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -855,7 +858,6 @@
     srcs = ["shaped_buffer.cc"],
     hdrs = ["shaped_buffer.h"],
     deps = [
-        ":device_memory_allocator",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -865,6 +867,7 @@
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -878,7 +881,6 @@
     srcs = ["shaped_buffer_test.cc"],
     deps = [
         ":cpu_plugin",
-        ":device_memory_allocator",
         ":platform_util",
         ":shaped_buffer",
         "//tensorflow/compiler/xla:shape_util",
@@ -888,6 +890,7 @@
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -901,7 +904,6 @@
     ],
     deps = [
         ":computation_layout",
-        ":device_memory_allocator",
         ":dump",
         ":hlo",
         ":hlo_execution_profile",
@@ -922,6 +924,7 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -988,7 +991,6 @@
     hdrs = ["allocation_tracker.h"],
     deps = [
         ":backend",
-        ":device_memory_allocator",
         ":transfer_manager",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -997,6 +999,7 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1156,6 +1159,7 @@
         ":hlo",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1163,7 +1167,6 @@
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1205,10 +1208,10 @@
         ":hlo_dataflow_analysis",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -1455,8 +1458,8 @@
     srcs = ["instruction_fusion_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":instruction_fusion",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -1467,11 +1470,11 @@
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":hlo",
+        ":hlo_pass",
         ":hlo_reachability",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1663,6 +1666,7 @@
         ":hlo_pass",
         ":hlo_query",
         ":pattern_matcher",
+        "//tensorflow/compiler/xla:comparison_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -1712,6 +1716,44 @@
 )
 
 cc_library(
+    name = "all_reduce_simplifier",
+    srcs = ["all_reduce_simplifier.cc"],
+    hdrs = ["all_reduce_simplifier.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_replication_analysis",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
+tf_cc_test(
+    name = "all_reduce_simplifier_test",
+    srcs = ["all_reduce_simplifier_test.cc"],
+    deps = [
+        ":all_reduce_simplifier",
+        ":hlo",
+        ":hlo_parser",
+        ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
     name = "batch_dot_simplification",
     srcs = ["batch_dot_simplification.cc"],
     hdrs = ["batch_dot_simplification.h"],
@@ -1750,8 +1792,8 @@
     srcs = ["gather_expander_test.cc"],
     deps = [
         ":gather_expander",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:test_macros_header",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
     ],
@@ -1849,9 +1891,9 @@
     name = "while_loop_analysis_test",
     srcs = ["while_loop_analysis_test.cc"],
     deps = [
+        ":hlo_parser",
         ":while_loop_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2201,6 +2243,7 @@
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -2255,7 +2298,7 @@
         ":cpu_plugin",
         ":hlo_cost_analysis",
         ":hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:hlo_parser",
+        ":hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2268,14 +2311,14 @@
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
         ":pattern_matcher",
         ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -2480,13 +2523,13 @@
     deps = [
         ":hlo",
         ":hlo_liveness_analysis",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2590,6 +2633,7 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -2869,12 +2913,12 @@
     deps = [
         ":hlo",
         ":hlo_module_dce",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -3000,12 +3044,12 @@
         ":hlo",
         ":hlo_cse",
         ":hlo_matchers",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -3190,27 +3234,6 @@
 )
 
 cc_library(
-    name = "device_memory_allocator",
-    srcs = [
-        "device_memory_allocator.cc",
-        "owning_device_memory.cc",
-    ],
-    hdrs = [
-        "device_memory_allocator.h",
-        "owning_device_memory.h",
-    ],
-    deps = [
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-cc_library(
     name = "maybe_owning_device_memory",
     srcs = [
         "maybe_owning_device_memory.cc",
@@ -3219,7 +3242,7 @@
         "maybe_owning_device_memory.h",
     ],
     deps = [
-        ":device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:variant",
     ],
@@ -3262,10 +3285,10 @@
         "gpu",
     ],
     deps = [
+        ":hlo_parser",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -3388,6 +3411,7 @@
     deps = [
         ":hlo",
         ":hlo_matchers",
+        ":hlo_parser",
         ":shape_inference",
         ":transpose_folding",
         "//tensorflow/compiler/xla:literal",
@@ -3396,7 +3420,6 @@
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -3639,10 +3662,10 @@
     name = "tuple_util_test",
     srcs = ["tuple_util_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":tuple_util",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -3668,11 +3691,11 @@
     name = "while_util_test",
     srcs = ["while_util_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":while_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/algorithm:container",
     ],
@@ -3703,9 +3726,9 @@
     srcs = ["while_loop_invariant_code_motion_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -3731,9 +3754,9 @@
     srcs = ["while_loop_constant_sinking_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":while_loop_constant_sinking",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -3933,6 +3956,8 @@
     hdrs = ["ar_crs_combiner.h"],
     deps = [
         ":call_graph",
+        ":hlo",
+        ":hlo_pass",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -3940,8 +3965,6 @@
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
@@ -3965,11 +3988,11 @@
     srcs = ["dynamic_index_splitter.cc"],
     hdrs = ["dynamic_index_splitter.h"],
     deps = [
+        ":hlo",
         ":hlo_casting_utils",
+        ":hlo_pass",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -4067,3 +4090,49 @@
         "@com_google_absl//absl/memory",
     ],
 )
+
+cc_library(
+    name = "slice_sinker",
+    srcs = ["slice_sinker.cc"],
+    hdrs = ["slice_sinker.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "custom_call_target_registry",
+    srcs = ["custom_call_target_registry.cc"],
+    hdrs = ["custom_call_target_registry.h"],
+    visibility = ["//visibility:public"],
+)
+
+tf_cc_test(
+    name = "slice_sinker_test",
+    srcs = ["slice_sinker_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_parser",
+        ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":slice_sinker",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index dc20123..2441e64 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -33,6 +33,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/comparison_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -183,6 +184,8 @@
 
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
+  Status HandleCompare(HloInstruction* compare) override;
+
   Status HandleConcatenate(HloInstruction* concatenate) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -234,6 +237,7 @@
   Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
   Status HandleDynamicUpdateSlice(
       HloInstruction* dynamic_update_slice) override;
+  Status HandleScatter(HloInstruction* scatter) override;
 
   Status HandleSelect(HloInstruction* select) override;
 
@@ -259,31 +263,8 @@
                                       AlgebraicSimplifier* simplifier)
       : computation_(computation), options_(options), simplifier_(simplifier) {}
 
-  // Transforms Dots where at least one input is a vector or has a degenerate
-  // dimension and converts it into a multiply and reduce. This should enable
-  // more fusion than leaving the nodes as Dot operations.
-  StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
-
-  // Removes dimension dim from hlo.
-  HloInstruction* StripDim(HloInstruction* hlo, int64 dim) {
-    CHECK_EQ(hlo->shape().dimensions(dim), 1);
-    return computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::DeleteDimension(dim, hlo->shape()), hlo));
-  }
-
-  // Reshapes an instruction to rank 1 if it is not already rank 1.
-  HloInstruction* Flatten(HloInstruction* hlo) {
-    if (hlo->shape().rank() == 1) {
-      return hlo;
-    }
-    auto hlo_instruction =
-        computation_->AddInstruction(HloInstruction::CreateReshape(
-            ShapeUtil::MakeShape(hlo->shape().element_type(),
-                                 {ShapeUtil::ElementsIn(hlo->shape())}),
-            hlo));
-    simplifier_->UpdateLayout(hlo_instruction->mutable_shape());
-    return hlo_instruction;
-  }
+  // Removes degenerate (size-1) dimensions from the operands of a dot.
+  StatusOr<bool> RemoveDegenerateDimensionFromDot(HloInstruction* dot);
 
   // Converts to primitive type if the input hlo is not that type, otherwise
   // returns the original hlo.
@@ -299,7 +280,7 @@
         HloInstruction::CreateConvert(changed_shape, hlo));
   }
 
-  // Transposes a dot operand such that the batch dimensions are the msot major,
+  // Transposes a dot operand such that the batch dimensions are the most major,
   // and the contracting dimensions are most minor.
   StatusOr<HloInstruction*> NormalizeDotOperandToBatchMajorAndContractingMinor(
       HloInstruction* dot_operand, absl::Span<const int64> batch_dimensions,
@@ -315,6 +296,9 @@
     transpose_dimensions.insert(transpose_dimensions.end(),
                                 contracting_dimensions.begin(),
                                 contracting_dimensions.end());
+    if (absl::c_is_sorted(transpose_dimensions)) {
+      return dot_operand;
+    }
     return MakeTransposeHlo(dot_operand, transpose_dimensions);
   }
 
@@ -332,10 +316,6 @@
         shape, hlo, zero, dims, AddReduce_computation));
   }
 
-  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
-    return AddReduce(hlo, std::vector<int64>{dim});
-  }
-
   // Convenience method for replacing an instruction with a bitcast. If operand
   // is not null, then the bitcast will use the specified operand instead of the
   // operand of the instruction.
@@ -405,6 +385,9 @@
 
   StatusOr<HloInstruction*> OptimizeDotOfGather(HloInstruction* dot);
 
+  StatusOr<HloInstruction*> OptimizeDotOfReorderContractingDims(
+      HloInstruction* dot);
+
   HloComputation* GetOrCreateScalarAddComputation() {
     if (scalar_add_computation_) {
       return scalar_add_computation_;
@@ -531,9 +514,17 @@
   VLOG(10) << "trying transform [(A + C1) + C2 => A + (C1 + C2)]";
   HloInstruction *a, *c1, *c2;
   if (Match(add, m::Add(m::Add(m::NonConstant(&a), m::Constant(&c1)),
-                        m::Constant(&c2)))) {
+                        m::Constant(&c2))) ||
+      Match(add, m::Add(m::Add(m::NonConstant(&a),
+                               m::Broadcast(m::ConstantScalar(&c1))),
+                        m::Broadcast(m::ConstantScalar(&c2))))) {
     TF_ASSIGN_OR_RETURN(auto* sum_of_constants,
                         MakeBinaryHlo(HloOpcode::kAdd, c1, c2));
+    if (ShapeUtil::IsScalar(sum_of_constants->shape()) &&
+        !ShapeUtil::IsScalar(add->shape())) {
+      sum_of_constants = computation_->AddInstruction(
+          HloInstruction::CreateBroadcast(add->shape(), sum_of_constants, {}));
+    }
     return ReplaceWithNewInstruction(
         add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, a,
                                           sum_of_constants));
@@ -861,9 +852,17 @@
 
   // Canonicalize subtraction of a constant to addition.
   VLOG(10) << "trying transform [A - Const => A + (-Const)]";
-  if (Match(sub, m::Subtract(m::NonConstant(&lhs), m::Constant(&rhs)))) {
+  if (Match(sub, m::Subtract(m::NonConstant(&lhs), m::Constant(&rhs))) ||
+      Match(sub, m::Subtract(m::NonConstant(&lhs),
+                             m::Broadcast(m::Constant(&rhs))))) {
     HloInstruction* negative_const = computation_->AddInstruction(
         HloInstruction::CreateUnary(rhs->shape(), HloOpcode::kNegate, rhs));
+    if (const HloInstruction* broadcast =
+            DynCast<HloBroadcastInstruction>(sub->operand(1))) {
+      negative_const =
+          computation_->AddInstruction(HloInstruction::CreateBroadcast(
+              broadcast->shape(), negative_const, broadcast->dimensions()));
+    }
     return ReplaceWithNewInstruction(
         sub, HloInstruction::CreateBinary(sub->shape(), HloOpcode::kAdd, lhs,
                                           negative_const));
@@ -1157,240 +1156,81 @@
   return Status::OK();
 }
 
-StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
+StatusOr<bool> AlgebraicSimplifierVisitor::RemoveDegenerateDimensionFromDot(
     HloInstruction* dot) {
-  HloInstruction *lhs, *rhs;
-  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-
-  const auto kept_dim = [](int64 rank, int64 contracting_dimension,
-                           absl::Span<const int64> batch_dimensions) -> int64 {
-    for (int64 i = 0; i < rank; ++i) {
-      if (i != contracting_dimension &&
-          !absl::c_linear_search(batch_dimensions, i)) {
-        return i;
-      }
+  const Shape& lhs_shape = dot->operand(0)->shape();
+  int64 num_degenerate_lhs_dims = 0;
+  std::vector<int64> lhs_dimension_map(lhs_shape.rank(), -1);
+  for (int64 i = 0; i < lhs_shape.rank(); ++i) {
+    if (lhs_shape.dimensions(i) == 1) {
+      ++num_degenerate_lhs_dims;
+    } else {
+      lhs_dimension_map[i] = i - num_degenerate_lhs_dims;
     }
-    return -1;
-  };
-
-  const int64 dot_rank = dot->shape().rank();
-  const int64 rhs_rank = rhs->shape().rank();
-  const int64 lhs_rank = lhs->shape().rank();
-  const auto& dnums = dot->dot_dimension_numbers();
-  if (dnums.rhs_contracting_dimensions_size() != 1) {
-    return false;
-  }
-  if (dot_rank > 2 && (lhs_rank != rhs_rank || lhs_rank != dot_rank)) {
-    return false;
-  }
-  int64 lhs_collapsing_dim = dnums.lhs_contracting_dimensions(0);
-  int64 lhs_kept_dim = kept_dim(lhs_rank, lhs_collapsing_dim,
-                                AsInt64Slice(dnums.lhs_batch_dimensions()));
-  // If there is no non-contracting dimension in rank 2, do not strength reduce.
-  if (lhs_kept_dim == -1 && lhs_rank > 1) {
-    return false;
-  }
-  if (lhs->IsRank2Transpose()) {
-    lhs = lhs->mutable_operand(0);
-    std::swap(lhs_collapsing_dim, lhs_kept_dim);
   }
 
-  int64 rhs_collapsing_dim = dnums.rhs_contracting_dimensions(0);
-  int64 rhs_kept_dim = kept_dim(rhs_rank, rhs_collapsing_dim,
-                                AsInt64Slice(dnums.rhs_batch_dimensions()));
-  // If there is no non-contracting dimension in rank 2, do not strength reduce.
-  if (rhs_kept_dim == -1 && rhs_rank > 1) {
-    return false;
-  }
-  if (rhs->IsRank2Transpose()) {
-    rhs = rhs->mutable_operand(0);
-    std::swap(rhs_collapsing_dim, rhs_kept_dim);
-  }
-
-  auto reshape_if_necessary = [&](HloInstruction* hlo) {
-    hlo = AsType(hlo, dot->shape().element_type());
-    if (!ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
-      hlo = computation_->AddInstruction(
-          HloInstruction::CreateReshape(dot->shape(), hlo));
+  const Shape& rhs_shape = dot->operand(1)->shape();
+  int64 num_degenerate_rhs_dims = 0;
+  std::vector<int64> rhs_dimension_map(rhs_shape.rank(), -1);
+  for (int64 i = 0; i < rhs_shape.rank(); ++i) {
+    if (rhs_shape.dimensions(i) == 1) {
+      ++num_degenerate_rhs_dims;
+    } else {
+      rhs_dimension_map[i] = i - num_degenerate_rhs_dims;
     }
-    return hlo;
-  };
-
-  auto add_reduce_in_f32 = [&](HloInstruction* hlo, const int64 dim) {
-    return AddReduce(AsType(hlo, F32), dim);
-  };
-
-  auto broadcast = [&](HloInstruction* hlo, const Shape& shape,
-                       absl::Span<const int64> dims) {
-    return computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(shape, hlo, dims));
-  };
-
-  auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
-                              int64 dim) {
-    return broadcast(hlo, shape, {dim});
-  };
-
-  auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
-    return computation_->AddInstruction(HloInstruction::CreateBinary(
-        local_lhs->shape(), HloOpcode::kMultiply, local_lhs, local_rhs));
-  };
-
-  // Strength reduce dot(a[K] , b[K]) =
-  //  reshape(result.shape,
-  //          reduce_sum(multiply(a, b), {0}))
-  if (rhs_rank == 1 && lhs_rank == 1) {
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, rhs), 0))));
-    return true;
   }
-
-  if (ShapeUtil::IsEffectiveScalar(rhs->shape()) &&
-      ShapeUtil::IsEffectiveScalar(lhs->shape())) {
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(multiply(Flatten(lhs), Flatten(rhs)))));
-    return true;
-  }
-
-  // Simplify outer product into multiply with broadcasting.
-  //
-  // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (rhs_rank == 2 && rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
-                      broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
-    return true;
-  }
-
-  // Strength reduce dot(a[1, K], b) =
-  //    reshape(result.shape,
-  //      reduce_sum(
-  //        multiply(broadcast(reshape(a, [K]), {0}), b),
-  //        {0})
-  //      )
-  //    )
-  if (lhs_rank == 1 ||
-      (lhs_rank == 2 && lhs->shape().dimensions(lhs_kept_dim) == 1)) {
-    if (rhs->shape().rank() == 1) {
-      TF_RETURN_IF_ERROR(
-          ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
-                                      multiply(Flatten(lhs), rhs), 0))));
-      return true;
-    }
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(add_reduce_in_f32(
-                 multiply(broadcast_to_dim(Flatten(lhs), rhs->shape(),
-                                           rhs_collapsing_dim),
-                          rhs),
-                 rhs_collapsing_dim))));
-    return true;
-  }
-
-  // Strength reduce dot(a, b[K, 1]) =
-  //  reshape(result.shape,
-  //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
-  //  )
-  if (rhs_rank == 1 ||
-      (rhs_rank == 2 && rhs->shape().dimensions(rhs_kept_dim) == 1)) {
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(add_reduce_in_f32(
-                 multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
-                                                lhs_collapsing_dim)),
-                 lhs_collapsing_dim))));
-    return true;
-  }
-
-  // Only consider kDot with batch dimension.
-  if (dot_rank <= 2) {
+  if (num_degenerate_lhs_dims == 0 && num_degenerate_rhs_dims == 0) {
     return false;
   }
-
-  CHECK_EQ(rhs_rank, lhs_rank);
-  CHECK_EQ(dot_rank, lhs_rank);
-  // If there is more than one non-contracting dimension or the batch dimensions
-  // are not equal, bail out since transposes may be required to do a strength
-  // reduction.
-  if (dnums.rhs_batch_dimensions_size() + 2 != dot_rank ||
-      !absl::c_equal(dnums.lhs_batch_dimensions(),
-                     dnums.rhs_batch_dimensions())) {
-    return false;
-  }
-
-  auto broadcast_dims = [](int64 rank, int64 non_broadcast_dim) {
-    absl::InlinedVector<int64, 8> dims;
-    for (int64 i = 0; i < rank; ++i) {
-      if (i != non_broadcast_dim) {
-        dims.push_back(i);
-      }
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  DotDimensionNumbers new_dnums;
+  for (int64 dim : dnums.lhs_batch_dimensions()) {
+    int64 new_dim = lhs_dimension_map[dim];
+    if (new_dim != -1) {
+      new_dnums.add_lhs_batch_dimensions(new_dim);
     }
-    return dims;
-  };
-
-  // If the contracting dimension is 1, remove the degnerate dimnensions from
-  // the lhs and rhs, broadcast each to the result shape and multiply.
-  if (lhs->shape().dimensions(lhs_collapsing_dim) == 1 &&
-      (rhs_kept_dim == rhs_rank - 1 ||
-       (rhs_collapsing_dim == rhs_rank - 1 && rhs_kept_dim == rhs_rank - 2))) {
-    CHECK_EQ(rhs->shape().dimensions(rhs_collapsing_dim), 1);
-    const int64 lhs_kept_dim_in_output =
-        lhs_kept_dim > lhs_collapsing_dim ? (lhs_kept_dim - 1) : lhs_kept_dim;
-    absl::InlinedVector<int64, 8> lhs_broadcast_dims;
-    for (const int64 dim : dnums.lhs_batch_dimensions()) {
-      lhs_broadcast_dims.push_back(dim > lhs_collapsing_dim ? (dim - 1) : dim);
+  }
+  for (int64 dim : dnums.lhs_contracting_dimensions()) {
+    int64 new_dim = lhs_dimension_map[dim];
+    if (new_dim != -1) {
+      new_dnums.add_lhs_contracting_dimensions(new_dim);
     }
-    absl::InlinedVector<int64, 8> rhs_broadcast_dims = lhs_broadcast_dims;
-    lhs_broadcast_dims.push_back(lhs_kept_dim_in_output);
-    absl::c_sort(lhs_broadcast_dims);
-    rhs_broadcast_dims.push_back(dot_rank - 1);
-    absl::c_sort(rhs_broadcast_dims);
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(
-                 multiply(broadcast(StripDim(lhs, lhs_collapsing_dim),
-                                    dot->shape(), lhs_broadcast_dims),
-                          broadcast(StripDim(rhs, rhs_collapsing_dim),
-                                    dot->shape(), rhs_broadcast_dims)))));
-    return true;
   }
 
-  // If the lhs and rhs non-contracting dimensions are both one, strip each one,
-  // multiply and then reduce the collapsing dimension
-  if (lhs->shape().dimensions(lhs_kept_dim) == 1 &&
-      rhs->shape().dimensions(rhs_kept_dim) == 1 &&
-      lhs_kept_dim == rhs_kept_dim) {
-    auto new_lhs = StripDim(lhs, lhs_kept_dim);
-    auto new_rhs = StripDim(rhs, rhs_kept_dim);
-    const int64 reduce_dim = rhs_kept_dim < rhs_collapsing_dim
-                                 ? (rhs_collapsing_dim - 1)
-                                 : rhs_collapsing_dim;
-    TF_RETURN_IF_ERROR(
-        ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
-                                    multiply(new_lhs, new_rhs), reduce_dim))));
-    return true;
+  for (int64 dim : dnums.rhs_batch_dimensions()) {
+    int64 new_dim = rhs_dimension_map[dim];
+    if (new_dim != -1) {
+      new_dnums.add_rhs_batch_dimensions(new_dim);
+    }
+  }
+  for (int64 dim : dnums.rhs_contracting_dimensions()) {
+    int64 new_dim = rhs_dimension_map[dim];
+    if (new_dim != -1) {
+      new_dnums.add_rhs_contracting_dimensions(new_dim);
+    }
   }
 
-  // If the lhs  non-contracting dimensions is one, strip the one, brodcast to
-  // the rhs shape, multiply and then reduce the collapsing dimension
-  if (lhs->shape().dimensions(lhs_kept_dim) == 1) {
-    auto new_lhs = broadcast(StripDim(lhs, lhs_kept_dim), rhs->shape(),
-                             broadcast_dims(rhs_rank, rhs_kept_dim));
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(add_reduce_in_f32(multiply(new_lhs, rhs),
-                                                    rhs_collapsing_dim))));
-    return true;
+  HloInstruction* new_lhs =
+      num_degenerate_lhs_dims > 0
+          ? dot->parent()->AddInstruction(HloInstruction::CreateReshape(
+                ShapeUtil::DropDegenerateDimensions(lhs_shape),
+                dot->mutable_operand(0)))
+          : dot->mutable_operand(0);
+  HloInstruction* new_rhs =
+      num_degenerate_rhs_dims > 0
+          ? dot->parent()->AddInstruction(HloInstruction::CreateReshape(
+                ShapeUtil::DropDegenerateDimensions(rhs_shape),
+                dot->mutable_operand(1)))
+          : dot->mutable_operand(1);
+  TF_ASSIGN_OR_RETURN(auto new_dot, MakeDotHlo(new_lhs, new_rhs, new_dnums,
+                                               dot->precision_config()));
+  if (ShapeUtil::Compatible(dot->shape(), new_dot->shape())) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(dot, new_dot));
+  } else {
+    TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateReshape(dot->shape(), new_dot)));
   }
-
-  // If the rhs  non-contracting dimensions is one, strip the one, brodcast to
-  // the lhs shape, multiply and then reduce the collapsing dimension
-  if (rhs->shape().dimensions(rhs_kept_dim) == 1) {
-    auto new_rhs = broadcast(StripDim(rhs, rhs_kept_dim), lhs->shape(),
-                             broadcast_dims(lhs_rank, lhs_kept_dim));
-    TF_RETURN_IF_ERROR(ReplaceInstruction(
-        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, new_rhs),
-                                                    lhs_collapsing_dim))));
-    return true;
-  }
-
-  return false;
+  return true;
 }
 
 StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
@@ -1665,6 +1505,211 @@
   return memoized_lookup;
 }
 
+// This function tries to transform
+//   dot(reshape(transpose(A)), Const) to
+//   dot(reshape(A), reshape(transpose(reshape(Const)))),
+// so that the reshape and transpose on the Const side can be constant folded.
+//
+// The basic idea is that the accumulation in the dot operation is
+// associative, so as long as we permute the elements of the contracting
+// dimensions on both sides of the dot in the same way, the result of the
+// dot is not affected.
+StatusOr<HloInstruction*>
+AlgebraicSimplifierVisitor::OptimizeDotOfReorderContractingDims(
+    HloInstruction* dot) {
+  // This transformation assumes layout is not assigned yet.
+  if (options_.is_layout_sensitive()) {
+    return nullptr;
+  }
+
+  // Canonicalize dot(<constant>, rhs) to dot(rhs, <constant>) to make the
+  // remainder of this function easier.
+  auto dnums = dot->dot_dimension_numbers();
+  auto lhs_contracting_dims = dnums.lhs_contracting_dimensions();
+  auto rhs_contracting_dims = dnums.rhs_contracting_dimensions();
+  auto* lhs = dot->mutable_operand(0);
+  auto* rhs = dot->mutable_operand(1);
+  if (dot->operand(0)->IsConstant()) {
+    std::swap(lhs, rhs);
+    std::swap(lhs_contracting_dims, rhs_contracting_dims);
+  }
+
+  // Require single contracting dim to make the implementation easier to
+  // track contracting dims.
+  if (dnums.lhs_contracting_dimensions_size() != 1) {
+    return nullptr;
+  }
+
+  // Pattern match Dot(reshape(transpose(input)), constant).
+  HloInstruction* reshape;
+  HloInstruction* transpose;
+  HloInstruction* input;
+  HloInstruction* constant;
+  if (!Match(lhs,
+             m::Reshape(&reshape, m::Transpose(&transpose, m::Op(&input)))) ||
+      !Match(rhs, m::Constant(&constant))) {
+    return nullptr;
+  }
+
+  // Check that reshape squishes some dims into one dim and that this one
+  // dim is the dot's lhs contracting dim. The size of unmodified_dims should
+  // be N - 1, where N is the rank of the reshape output. This means that the
+  // reshape squishes some dims into one dim. lhs contracting dim should not
+  // be in unmodified_dims. This means that the squishing target dim is the
+  // lhs contracting dim.
+  auto unmodified_dims = ShapeUtil::DimensionsUnmodifiedByReshape(
+      reshape->operand(0)->shape(), reshape->shape());
+  CHECK_EQ(lhs_contracting_dims.size(), 1);
+  if ((unmodified_dims.size() != reshape->shape().rank() - 1) ||
+      absl::c_any_of(unmodified_dims, [&](const std::pair<int64, int64>& p) {
+        return p.second == lhs_contracting_dims[0];
+      })) {
+    return nullptr;
+  }
+
+  // Virtually pull the reshape into the dot so the dot operates on the
+  // transpose, with "unsquished" lhs contracting dims.  The new contracting
+  // dims are all of the dims that are modified by the reshape -- that is, every
+  // dimension that's not in `unmodified_dims[i].first`.
+  //
+  // (We don't need to actually create a new dot instruction. We can just keep
+  // track of lhs and lhs_contracting_dims.)
+  absl::flat_hash_set<int64> unmodified_transpose_dims;
+  for (const auto& pair : unmodified_dims) {
+    unmodified_transpose_dims.insert(pair.first);
+  }
+  lhs_contracting_dims.Clear();
+  for (int64 i = 0; i < transpose->shape().dimensions_size(); ++i) {
+    if (!unmodified_transpose_dims.contains(i)) {
+      lhs_contracting_dims.Add(i);
+    }
+  }
+  lhs = lhs->mutable_operand(0);
+
+  // Check that the transpose only permutes the contracting dims.
+  const auto& transpose_dims = transpose->dimensions();
+  for (int64 i = 0; i < transpose_dims.size(); ++i) {
+    if (transpose_dims[i] != i &&
+        !absl::c_linear_search(lhs_contracting_dims, i)) {
+      return nullptr;
+    }
+  }
+  // Virtually pull the transpose into the dot. Now the dot is equivalent to
+  // a new dot with "permuted" lhs contracting dims.
+  std::vector<int64> permutation;
+  for (auto dim : lhs_contracting_dims) {
+    permutation.push_back(transpose_dims[dim] - lhs_contracting_dims[0]);
+  }
+  auto new_lhs_contracting_dims =
+      ComposePermutations(AsInt64Slice(lhs_contracting_dims), permutation);
+  lhs_contracting_dims.Clear();
+  for (auto dim : new_lhs_contracting_dims) {
+    lhs_contracting_dims.Add(dim);
+  }
+  lhs = lhs->mutable_operand(0);
+
+  // All checks are passed at this point.
+  //
+  // Transform lhs. Remove the transpose and reshape by sorting the lhs
+  // contracting dims and squishing them into a single one. We don't actually
+  // squish the lhs_contracting_dims here because we still need the unsquished
+  // contracting dims to invert reshape and transpose.
+  absl::c_sort(lhs_contracting_dims);
+  lhs = computation_->AddInstruction(
+      HloInstruction::CreateReshape(reshape->shape(), lhs));
+
+  // Transform rhs. Say the input HLO is:
+  //
+  //   t0 = f32[2, 2, 3] parameter(0)
+  //   t1 = f32[2, 3, 2] transpose(t0) dimensions={0, 2, 1}
+  //   t2 = f32[2, 6] reshape(t1)
+  //   t3 = f32[6, 2] constant(...)
+  //   dot = f32[2, 2] dot(t2, t3) lhs_contracting_dims={1},
+  //                               rhs_contracting_dims={0}
+  //
+  // At this point in the function, we have decided that the second and third
+  // dims of t0 can be switched to remove the transpose, and we have
+  // "virtually decomposed" the input HLO to:
+  //
+  //   t0 = f32[2, 2, 3] parameter(0)
+  //   t2' = f32[2, 6] reshape(t0)
+  //   t3' = f32[6, 2] ops-to-be-filled ...
+  //   dot = f32[2, 2] dot(t2', t3') lhs_contracting_dims={1},
+  //                                 rhs_contracting_dims={0}
+  //
+  // The rest of this function is to fill in the ops of t3'. To do this, we
+  // unsquish the contracting dimensions in t3 and then apply the inverse of
+  // the transpose from t1.
+
+  // Invert reshape.
+  CHECK_EQ(rhs_contracting_dims.size(), 1);
+  auto rhs_unsquished_shape_dims = constant->shape().dimensions();
+  auto it = rhs_unsquished_shape_dims.erase(rhs_unsquished_shape_dims.begin() +
+                                            rhs_contracting_dims[0]);
+  for (auto dim : lhs_contracting_dims) {
+    it = rhs_unsquished_shape_dims.insert(it,
+                                          transpose->shape().dimensions(dim));
+    ++it;
+  }
+  HloInstruction* rhs_reshape =
+      computation_->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(constant->shape().element_type(),
+                               rhs_unsquished_shape_dims),
+          constant));
+  rhs = rhs_reshape;
+
+  // Rhs reshape "unsquishes" the single rhs contracting dim into multiple dims.
+  rhs_contracting_dims.Resize(lhs_contracting_dims.size(),
+                              rhs_contracting_dims[0]);
+  absl::c_iota(rhs_contracting_dims, rhs_contracting_dims[0]);
+
+  // Invert transpose. First compute the shape.
+  auto rhs_transpose_shape_dims = rhs_reshape->shape().dimensions();
+  it = rhs_transpose_shape_dims.erase(
+      rhs_transpose_shape_dims.begin() + rhs_contracting_dims[0],
+      rhs_transpose_shape_dims.begin() + rhs_contracting_dims[0] +
+          rhs_contracting_dims.size());
+  for (auto dim : lhs_contracting_dims) {
+    it = rhs_transpose_shape_dims.insert(it, input->shape().dimensions(dim));
+    ++it;
+  }
+  // Then compute the transpose dims.
+  std::vector<int64> rhs_transpose_dims(rhs_reshape->shape().rank());
+  absl::c_iota(rhs_transpose_dims, 0);
+  it = rhs_transpose_dims.erase(
+      rhs_transpose_dims.begin() + rhs_contracting_dims[0],
+      rhs_transpose_dims.begin() + rhs_contracting_dims[0] +
+          rhs_contracting_dims.size());
+  auto inverse_lhs_transpose_dims = InversePermutation(transpose_dims);
+  for (auto dim : lhs_contracting_dims) {
+    it = rhs_transpose_dims.insert(it, inverse_lhs_transpose_dims[dim] -
+                                           lhs_contracting_dims[0] +
+                                           rhs_contracting_dims[0]);
+    ++it;
+  }
+  HloInstruction* rhs_transpose =
+      computation_->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(constant->shape().element_type(),
+                               rhs_transpose_shape_dims),
+          rhs_reshape, rhs_transpose_dims));
+  rhs = rhs_transpose;
+
+  // Squish the multiple rhs contracting dims into a single one.
+  rhs = computation_->AddInstruction(
+      HloInstruction::CreateReshape(constant->shape(), rhs));
+
+  // If we virtually swapped lhs and rhs, we need to swap them back before
+  // creating the new dot.
+  if (dot->operand(0)->IsConstant()) {
+    std::swap(lhs, rhs);
+  }
+
+  HloInstruction* new_dot =
+      computation_->AddInstruction(HloInstruction::CreateDot(
+          dot->shape(), lhs, rhs, dnums, dot->precision_config()));
+  return new_dot;
+}
+
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
@@ -1682,8 +1727,7 @@
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
 
-  // Only optimize F32 or BF16 dot operations where the dot, rhs and lhs are
-  // rank 2 or below.
+  // Only optimize F32 or BF16 dot operations.
   if (dot->shape().element_type() != F32 &&
       dot->shape().element_type() != BF16) {
     return Status::OK();
@@ -1799,13 +1843,16 @@
     return ReplaceInstruction(dot, new_dot);
   }
 
-  if (lhs->shape().rank() > 2 || rhs->shape().rank() > 2 ||
-      dot->shape().rank() > 2) {
-    if (options_.enable_dot_strength_reduction() &&
-        !options_.is_layout_sensitive()) {
-      TF_RETURN_IF_ERROR(HandleDotStrengthReduction(dot).status());
-    }
-    return Status::OK();
+  // Simplify dot(reshape(transpose(A)), Const) to:
+  // dot(reshape(A), reshape(transpose(reshape(Const)))), so that the reshape
+  // and transpose on the Const side can be constant folded.
+  TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_reorder_optimized,
+                      OptimizeDotOfReorderContractingDims(dot));
+  if (dot_of_reorder_optimized) {
+    VLOG(10) << " Replaced dot " << dot->ToString()
+             << " with new dot operation: "
+             << dot_of_reorder_optimized->ToString();
+    return ReplaceInstruction(dot, dot_of_reorder_optimized);
   }
 
   TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_concat_optimized,
@@ -1827,11 +1874,10 @@
     return ReplaceInstruction(dot, dot_of_gather_optimized);
   }
 
-  if (options_.enable_dot_strength_reduction() &&
-      !options_.is_layout_sensitive()) {
-    TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
-                        HandleDotStrengthReduction(dot));
-    if (did_strength_reduction) {
+  if (options_.enable_dot_strength_reduction()) {
+    TF_ASSIGN_OR_RETURN(bool removed_degenerate_dimensions,
+                        RemoveDegenerateDimensionFromDot(dot));
+    if (removed_degenerate_dimensions) {
       return Status::OK();
     }
   }
@@ -1883,6 +1929,29 @@
     return Status::OK();
   }
 
+  VLOG(10) << "trying transform [(A * C1) * C2 => A * (C1 * C2)]";
+  HloInstruction *a, *c1, *c2;
+  if (Match(multiply,
+            m::Multiply(m::Multiply(m::NonConstant(&a), m::Constant(&c1)),
+                        m::Constant(&c2))) ||
+      Match(multiply,
+            m::Multiply(
+                m::Multiply(m::Op(&a), m::Broadcast(m::ConstantScalar(&c1))),
+                m::Broadcast(m::ConstantScalar(&c2))))) {
+    TF_ASSIGN_OR_RETURN(auto* product_of_constants,
+                        MakeBinaryHlo(HloOpcode::kMultiply, c1, c2));
+    if (ShapeUtil::IsScalar(product_of_constants->shape()) &&
+        !ShapeUtil::IsScalar(multiply->shape())) {
+      product_of_constants =
+          computation_->AddInstruction(HloInstruction::CreateBroadcast(
+              multiply->shape(), product_of_constants, {}));
+    }
+    return ReplaceWithNewInstruction(
+        multiply,
+        HloInstruction::CreateBinary(multiply->shape(), HloOpcode::kMultiply, a,
+                                     product_of_constants));
+  }
+
   // exp(A) * exp(B) => exp(A+B)
   if (Match(multiply, m::Multiply(m::Exp(m::Op(&lhs)), m::Exp(m::Op(&rhs))))) {
     auto add = computation_->AddInstruction(HloInstruction::CreateBinary(
@@ -2147,6 +2216,49 @@
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleCompare(HloInstruction* compare) {
+  HloInstruction* lhs;
+  HloInstruction* rhs;
+  CHECK(Match(compare, m::Compare(m::Op(&lhs), m::Op(&rhs))));
+
+  auto replace_with_pred_broadcast = [&](bool value) {
+    return ReplaceWithNewInstruction(
+        compare,
+        HloInstruction::CreateBroadcast(
+            compare->shape(),
+            computation_->AddInstruction(
+                HloInstruction::CreateConstant(LiteralUtil::CreateR0(value))),
+            {}));
+  };
+  if (compare->comparison_direction() == ComparisonDirection::kLt &&
+      lhs->opcode() == HloOpcode::kIota && IsAll(rhs, 0)) {
+    return replace_with_pred_broadcast(false);
+  } else if (compare->comparison_direction() == ComparisonDirection::kGt &&
+             IsAll(lhs, 0) && rhs->opcode() == HloOpcode::kIota) {
+    return replace_with_pred_broadcast(false);
+  } else if (compare->comparison_direction() == ComparisonDirection::kGe &&
+             lhs->opcode() == HloOpcode::kIota && IsAll(rhs, 0)) {
+    return replace_with_pred_broadcast(true);
+  } else if (compare->comparison_direction() == ComparisonDirection::kLe &&
+             IsAll(lhs, 0) && rhs->opcode() == HloOpcode::kIota) {
+    return replace_with_pred_broadcast(true);
+  }
+  if (lhs == rhs &&
+      primitive_util::IsIntegralType(lhs->shape().element_type())) {
+    switch (compare->comparison_direction()) {
+      case ComparisonDirection::kGt:
+      case ComparisonDirection::kLt:
+      case ComparisonDirection::kNe:
+        return replace_with_pred_broadcast(false);
+      case ComparisonDirection::kEq:
+      case ComparisonDirection::kGe:
+      case ComparisonDirection::kLe:
+        return replace_with_pred_broadcast(true);
+    }
+  }
+  return Status::OK();
+}
+
 // A conversion to the same element type as the operand is a nop and can be
 // removed.  A conversion of a constant can be simplified by making a new
 // constant.
@@ -2975,7 +3087,7 @@
     }
   }
 
-  // TODO(b/112040122): Most of those optimizations below can be done for
+  // TODO(b/131122694): Most of those optimizations below can be done for
   // multi-output reduces.
   if (multi_output_reduce) {
     return Status::OK();
@@ -3337,6 +3449,22 @@
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleScatter(HloInstruction* scatter) {
+  if (ShapeUtil::IsZeroElementArray(scatter->operand(2)->shape()) &&
+      ReplaceInstructionIfSameShape(scatter, scatter->mutable_operand(0))) {
+    return Status::OK();
+  }
+  if (ShapeUtil::IsZeroElementArray(scatter->operand(1)->shape()) &&
+      SameShape(scatter, scatter->operand(0)) &&
+      SameShape(scatter, scatter->operand(2))) {
+    return ReplaceWithNewInstruction(
+        scatter, HloInstruction::CreateMap(
+                     scatter->shape(),
+                     {scatter->mutable_operand(0), scatter->mutable_operand(2)},
+                     scatter->to_apply()));
+  }
+  return Status::OK();
+}
 Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
   auto operand = sort->mutable_operand(0);
   int64 dimension_to_sort = sort->dimensions(0);
@@ -3353,8 +3481,8 @@
 }
 
 namespace {
-bool OnlyPermutesMoreThanOneDegenerateDim(const Shape& shape,
-                                          absl::Span<const int64> perm) {
+bool OnlyPermutesDegenerateDims(const Shape& shape,
+                                absl::Span<const int64> perm) {
   std::vector<int64> new_permutation;
   int64 degenerate_count = 0;
   for (int64 i = 0; i < perm.size(); ++i) {
@@ -3364,7 +3492,7 @@
       ++degenerate_count;
     }
   }
-  return degenerate_count > 1 && absl::c_is_sorted(new_permutation);
+  return degenerate_count > 0 && absl::c_is_sorted(new_permutation);
 }
 }  // namespace
 
@@ -3386,8 +3514,7 @@
 
   // Replace transpose with a reshape if more than one degenerate method is
   // permuted.
-  if (OnlyPermutesMoreThanOneDegenerateDim(transpose->shape(),
-                                           transpose->dimensions())) {
+  if (OnlyPermutesDegenerateDims(transpose->shape(), transpose->dimensions())) {
     return ReplaceWithNewInstruction(
         transpose, HloInstruction::CreateReshape(
                        transpose->shape(), transpose->mutable_operand(0)));
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index fed8d26..fee95ae 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -295,6 +295,70 @@
   EXPECT_EQ(computation->root_instruction(), zero);
 }
 
+TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMergeConstants) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      c0 = f32[] constant(2.0)
+      c1 = f32[] constant(3.0)
+      multiply0 = f32[] multiply(p0, c0)
+      ROOT multiply1 = f32[] multiply(multiply0, c1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Multiply(m::Parameter(0),
+                                     m::Multiply(m::ConstantScalar(2.0),
+                                                 m::ConstantScalar(3.0)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMergeBroadcastedConstants) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      c0 = f32[] constant(2.0)
+      c1 = f32[] constant(3.0)
+      b0 = f32[4] broadcast(c0), dimensions={}
+      b1 = f32[4] broadcast(c1), dimensions={}
+      multiply0 = f32[4] multiply(p0, b0)
+      ROOT multiply1 = f32[4] multiply(multiply0, b1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::Multiply(
+          m::Parameter(0), m::Broadcast(m::Multiply(m::ConstantScalar(2.0),
+                                                    m::ConstantScalar(3.0))))));
+}
+
+TEST_F(AlgebraicSimplifierTest,
+       MultiplyReassociateMultiplyOfConstantAndBroadcast) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      c0 = f32[4] constant({2.0, 3.0, 4.0, 5.0})
+      c1 = f32[] constant(3.0)
+      c2 = f32[] constant(4.0)
+      b0 = f32[4] broadcast(c1), dimensions={}
+      b1 = f32[4] broadcast(c2), dimensions={}
+      multiply0 = f32[4] multiply(c0, b0)
+      ROOT multiply1 = f32[4] multiply(multiply0, b1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::Multiply(
+          m::Constant(), m::Broadcast(m::Multiply(m::ConstantScalar(3.0),
+                                                  m::ConstantScalar(4.0))))));
+}
+
 // Test that select(true, a, b) is simplified to a
 TEST_F(AlgebraicSimplifierTest, SelectTrue) {
   Shape r0s32 = ShapeUtil::MakeShape(S32, {});
@@ -446,6 +510,27 @@
                         m::Add(m::Op().Is(constant1), m::Op().Is(constant2)))));
 }
 
+TEST_F(AlgebraicSimplifierTest, AddReassociateMergeBroadcastedConstants) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      c0 = f32[] constant(1.0)
+      c1 = f32[] constant(2.0)
+      b0 = f32[4] broadcast(c0), dimensions={}
+      b1 = f32[4] broadcast(c1), dimensions={}
+      add0 = f32[4] add(p0, b0)
+      ROOT add1 = f32[4] add(add0, b1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Add(m::Parameter(0),
+                                m::Broadcast(m::Add(m::ConstantScalar(1.0),
+                                                    m::ConstantScalar(2.0))))));
+}
+
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
@@ -640,6 +725,25 @@
                                       m::Negate(m::Op().Is(constant)))));
 }
 
+// Test that A - Broadcast(Const) is canonicalized to A + Broadcast(-Const).
+TEST_F(AlgebraicSimplifierTest, SubBroadcastConstCanonicalization) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      c = f32[] constant(0.125)
+      b = f32[4] broadcast(c), dimensions={}
+      ROOT sub = f32[4] subtract(p0, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::Add(m::Parameter(0),
+                        m::Broadcast(m::Negate(m::ConstantScalar(0.125))))));
+}
+
 // Test that (A/B)/C is simplified to A/(B*C).
 TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   auto m = CreateNewVerifiedModule();
@@ -4337,7 +4441,7 @@
   int m, k, n;
   PrimitiveType element_type;
   std::tie(m, k, n, element_type) = GetParam();
-  std::vector<int64> lhs_dims = {1, 3, 5};
+  std::vector<int64> lhs_dims = {2, 3, 5};
   std::vector<int64> rhs_dims = lhs_dims;
   std::vector<int64> output_dims = lhs_dims;
   if (m > 0) {
@@ -4380,6 +4484,7 @@
   const bool dot_should_be_transformed =
       m == 1 || k == 1 || n == 1 || m == -1 || k == -1 || n == -1;
   EXPECT_EQ(changed, dot_should_be_transformed);
+  TF_ASSERT_OK_AND_ASSIGN(changed, simplifier.Run(module.get()));
   bool has_no_dot = true;
   for (const auto& hlo : computation->instructions()) {
     if (hlo->opcode() == HloOpcode::kDot) {
@@ -4434,11 +4539,17 @@
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
   auto computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
+  // First pass of algebraic simplifier will remove degenerate dimensions
+  // and optimize dot(transpose(x),transpose(y))
   TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
   const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
   const bool computation_should_be_modified =
       dot_should_be_transformed || (transpose_lhs && transpose_rhs);
   EXPECT_EQ(changed, computation_should_be_modified);
+  // The second pass of the algebraic simplifier will remove dots without
+  // non-contracting dimensions or contracting dimensions.
+  TF_ASSERT_OK_AND_ASSIGN(changed, simplifier.Run(module.get()));
+  EXPECT_EQ(changed, dot_should_be_transformed);
   bool has_no_dot = true;
   for (const auto& hlo : computation->instructions()) {
     if (hlo->opcode() == HloOpcode::kDot) {
@@ -5000,11 +5111,315 @@
       [](const Shape&, const Shape&) { return false; });
   options.set_is_layout_sensitive(true);
   ASSERT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie());
-  LOG(INFO) << "\n" << m->ToString();
   EXPECT_THAT(
       m->entry_computation()->root_instruction(),
       GmockMatch(m::Reshape(m::Parameter(0)).WithShapeEqualTo(&result_shape)));
 }
 
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_RL) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[6, 2] constant({{1, 2},{3, 4},{5, 6},{1, 1},{1, 1},{1, 1}})
+      t0 = f32[2, 2, 3] parameter(0)
+      t1 = f32[2, 3, 2] transpose(t0), dimensions={0, 2, 1}
+      lhs = f32[2, 6] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {2, 6});
+  auto shape2 = ShapeUtil::MakeShape(F32, {3, 2, 2});
+  auto shape3 = ShapeUtil::MakeShape(F32, {2, 3, 2});
+  // The transformation of moving transpose and reshape to the constant side
+  // is layout insensitive. We ignore layout when checking shapes.
+  const HloInstruction* transpose;
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot(
+                  m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+                  m::Reshape(m::Transpose(&transpose,
+                                          m::Reshape(m::Constant())
+                                              .WithShapeCompatibleTo(&shape2))
+                                 .WithShapeCompatibleTo(&shape3)))));
+  EXPECT_THAT(transpose->dimensions(), ElementsAre(1, 0, 2));
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_RR) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[2, 6] constant({{1, 2, 3, 4, 5, 6},
+                                {1, 1, 1, 1, 1, 1}})
+      t0 = f32[2, 2, 3] parameter(0)
+      t1 = f32[2, 3, 2] transpose(t0), dimensions={0, 2, 1}
+      lhs = f32[2, 6] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {2, 6});
+  auto shape2 = ShapeUtil::MakeShape(F32, {2, 3, 2});
+  auto shape3 = ShapeUtil::MakeShape(F32, {2, 2, 3});
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot(
+                  m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+                  m::Reshape(m::Transpose(m::Reshape(m::Constant())
+                                              .WithShapeCompatibleTo(&shape2))
+                                 .WithShapeCompatibleTo(&shape3)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_LR) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[2, 6] constant({{1, 2, 3, 4, 5, 6},
+                                {1, 1, 1, 1, 1, 1}})
+      t0 = f32[2, 3, 2] parameter(0)
+      t1 = f32[3, 2, 2] transpose(t0), dimensions={1, 0, 2}
+      lhs = f32[6, 2] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={0}, rhs_contracting_dims={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {6, 2});
+  auto shape2 = ShapeUtil::MakeShape(F32, {2, 3, 2});
+  auto shape3 = ShapeUtil::MakeShape(F32, {2, 2, 3});
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot(
+                  m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+                  m::Reshape(m::Transpose(m::Reshape(m::Constant())
+                                              .WithShapeCompatibleTo(&shape2))
+                                 .WithShapeCompatibleTo(&shape3)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_LR2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[8, 2] constant({{1, 1},{2, 2},{3, 3},{4, 4},{5, 5},{6, 6},{7, 7},{8, 8}})
+      t0 = f32[2, 2, 2, 2] parameter(0)
+      t1 = f32[2, 2, 2, 2] transpose(t0), dimensions={0, 2, 3, 1}
+      lhs = f32[2, 8] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1},
+                                            rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {2, 8});
+  auto shape2 = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
+  const HloInstruction* transpose;
+  ASSERT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::Dot(
+          m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+          m::Reshape(m::Transpose(
+              &transpose,
+              m::Reshape(m::Constant()).WithShapeCompatibleTo(&shape2))))));
+  EXPECT_THAT(transpose->dimensions(), ElementsAre(2, 0, 1, 3));
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_MM) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[2, 6, 2] constant({{{1, 1},{2, 2},{3, 3},{4, 4},{5, 5},{6, 6}},
+                                   {{1, 1},{2, 2},{3, 3},{4, 4},{5, 5},{6, 6}}})
+      t0 = f32[2, 2, 3, 2] parameter(0)
+      t1 = f32[2, 3, 2, 2] transpose(t0), dimensions={0, 2, 1, 3}
+      lhs = f32[2, 6, 2] reshape(t1)
+      ROOT dot.5 = f32[2, 2, 2] dot(lhs, rhs), lhs_batch_dims={0}, lhs_contracting_dims={1},
+                                               rhs_batch_dims={0}, rhs_contracting_dims={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {2, 6, 2});
+  auto shape2 = ShapeUtil::MakeShape(F32, {2, 3, 2, 2});
+  auto shape3 = ShapeUtil::MakeShape(F32, {2, 2, 3, 2});
+  const HloInstruction* transpose;
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot(
+                  m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+                  m::Reshape(m::Transpose(&transpose,
+                                          m::Reshape(m::Constant())
+                                              .WithShapeCompatibleTo(&shape2))
+                                 .WithShapeCompatibleTo(&shape3)))));
+  EXPECT_THAT(transpose->dimensions(), ElementsAre(0, 2, 1, 3));
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_NegTranspose) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[12, 2] constant({{1, 1},{2, 2},{3, 3},{4, 4},{5, 5},{6, 6},{1, 1},{2, 2},{3, 3},{4, 4},{5, 5},{6, 6}})
+      t0 = f32[3, 4, 2] parameter(0)
+      t1 = f32[2, 3, 4] transpose(t0), dimensions={2, 0, 1}
+      lhs = f32[2, 12] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  // Transpose affects non-contracting dimension. The transpose and reshape
+  // should not be moved to the constant side.
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_NegReshape) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[8, 2] constant({{1, 1},{2, 2},{3, 3},{4, 4},{1, 1},{2, 2},{3, 3},{4, 4}})
+      t0 = f32[2, 4, 3] parameter(0)
+      t1 = f32[2, 3, 4] transpose(t0), dimensions={0, 2, 1}
+      lhs = f32[3, 8] reshape(t1)
+      ROOT dot.5 = f32[3, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  // Reshape affects non-contracting dimensions. The transpose and reshape
+  // should not be moved to the constant side.
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_NegConstant) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      t0 = f32[2, 3, 4] parameter(0)
+      t1 = f32[2, 4, 3] transpose(t0), dimensions={0, 2, 1}
+      lhs = f32[2, 12] reshape(t1)
+      rhs = f32[12, 2] parameter(1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  // Both operands are non-constant, so the optimization should not happen.
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_NegLayout) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      rhs = f32[6, 2] constant({{1, 2},{3, 4},{5, 6},{1, 1},{1, 1},{1, 1}})
+      t0 = f32[2, 2, 3] parameter(0)
+      t1 = f32[2, 3, 2] transpose(t0), dimensions={0, 2, 1}
+      lhs = f32[2, 6] reshape(t1)
+      ROOT dot.5 = f32[2, 2] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  // We disable converting reshape to bitcast to make sure algsimp pass does
+  // not catch the reshape in this test, then we can simply check if algsimp
+  // pass does not make any change.
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
+  options.set_is_layout_sensitive(true);
+  // The transformation of moving transpose and reshape to the constant side is
+  // layout insensitive. It should not happen if AlgebraicSimplifier is set up
+  // to be layout sensitive.
+  ASSERT_FALSE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_SizeOneDimsNoChange) {
+  // This isn't transformed (notice that the relative order of the `2` and `3`
+  // dims doesn't change, so there's no opportunity here), but it's nonetheless
+  // an interesting testcase because of the presence of the size-1 dimensions.
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+     param = f32[1,2,5,3] parameter(0)
+     transpose = f32[1,5,2,3] transpose(param), dimensions={0,2,1,3}
+     reshape = f32[5,6] reshape(transpose)
+     constant = f32[6,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+     ROOT dot = f32[5,4] dot(reshape, constant),
+       lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DotContractingReorder_SizeOneDims) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+     param = f32[1,2,3,5] parameter(0)
+     transpose = f32[1,3,2,5] transpose(param), dimensions={0,2,1,3}
+     reshape = f32[6,5] reshape(transpose)
+     constant = f32[6,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+     ROOT dot = f32[5,4] dot(reshape, constant),
+       lhs_contracting_dims={0}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto shape1 = ShapeUtil::MakeShape(F32, {6, 5});
+  auto shape2 = ShapeUtil::MakeShape(F32, {1, 3, 2, 4});
+  auto shape3 = ShapeUtil::MakeShape(F32, {1, 2, 3, 4});
+  const HloInstruction* transpose;
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot(
+                  m::Reshape(m::Parameter(0)).WithShapeCompatibleTo(&shape1),
+                  m::Reshape(m::Transpose(&transpose,
+                                          m::Reshape(m::Constant())
+                                              .WithShapeCompatibleTo(&shape2))
+                                 .WithShapeCompatibleTo(&shape3)))));
+  EXPECT_THAT(transpose->dimensions(), ElementsAre(0, 2, 1, 3));
+}
+
+// This test exposes a real bug: It tries to read an out-of-bounds array index
+// from within ComposePermutations().  TODO(b/132330723): Fix this.
+TEST_F(AlgebraicSimplifierTest,
+       DISABLED_DotContractingReorder_NoChangeInContractingDimsOrder) {
+  // No optimization opportunity here because the transpose does not reorder the
+  // contracting dims.
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      param = f32[2,5,1,3] parameter(0)
+      transpose = f32[1,5,2,3] transpose(param), dimensions={2,1,0,3}
+      reshape = f32[5,6] reshape(transpose)
+      constant = f32[6,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+      ROOT dot = f32[5,4] dot(reshape, constant),
+        lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, CompareIota) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      zero = s32[] constant(0)
+      iota = s32[128] iota(), iota_dimension=0
+      broad = s32[128] broadcast(zero), dimensions={}
+      ROOT compare = pred[128] compare(iota, broad), direction=LT
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Broadcast(m::ConstantScalar(false))));
+}
+
+TEST_F(AlgebraicSimplifierTest, CompareSame) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      param = s32[123] parameter(0)
+      ROOT compare = pred[123] compare(param, param), direction=GE
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Broadcast(m::ConstantScalar(true))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc
new file mode 100644
index 0000000..e541bfe
--- /dev/null
+++ b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc
@@ -0,0 +1,121 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/all_reduce_simplifier.h"
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_replication_analysis.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+StatusOr<bool> AllReduceSimplifier::Run(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(auto replication, HloReplicationAnalysis::Run(module));
+  std::vector<HloInstruction*> all_reduces_to_replace;
+  for (auto computation : module->computations()) {
+    for (HloInstruction* inst : computation->MakeInstructionPostOrder()) {
+      if (!inst->shape().IsArray()) {
+        // We currently do not change tuple-shaped all-reduce.
+        continue;
+      }
+      if (inst->IsCrossReplicaAllReduce() &&
+          replication->HloInstructionIsReplicatedAt(inst->operand(0), {})) {
+        all_reduces_to_replace.push_back(inst);
+      }
+    }
+  }
+
+  bool changed = false;
+  if (all_reduces_to_replace.empty()) {
+    return changed;
+  }
+
+  // Returns the common replica group size (replica_count_ when no groups are
+  // specified), or -1 if the groups have different sizes.
+  auto get_replica_group_size =
+      [this](const HloInstruction* all_reduce) -> int64 {
+    if (all_reduce->replica_groups().empty()) {
+      return replica_count_;
+    }
+    int64 replica_group_size = -1;
+    for (const auto& group : all_reduce->replica_groups()) {
+      if (replica_group_size == -1) {
+        replica_group_size = group.replica_ids_size();
+      } else if (replica_group_size != group.replica_ids_size()) {
+        return -1;
+      }
+    }
+    return replica_group_size;
+  };
+
+  for (auto all_reduce : all_reduces_to_replace) {
+    if (all_reduce->to_apply()->instruction_count() != 3 ||
+        all_reduce->to_apply()->num_parameters() != 2) {
+      continue;
+    }
+    HloInstruction* replacement;
+    switch (all_reduce->to_apply()->root_instruction()->opcode()) {
+      case HloOpcode::kAdd: {
+        int64 replica_group_size = get_replica_group_size(all_reduce);
+        if (replica_group_size == -1) {
+          continue;
+        }
+        // Create the multiplier:
+        //   broadcast(convert_to_matching_type(s32 group size))
+        auto multiplier =
+            all_reduce->parent()->AddInstruction(HloInstruction::CreateConstant(
+                LiteralUtil::CreateR0<int32>(replica_group_size)));
+        if (all_reduce->shape().element_type() != S32) {
+          multiplier = all_reduce->parent()->AddInstruction(
+              HloInstruction::CreateConvert(
+                  ShapeUtil::ChangeElementType(
+                      multiplier->shape(), all_reduce->shape().element_type()),
+                  multiplier));
+        }
+        if (all_reduce->shape().rank() > 0) {
+          multiplier = all_reduce->parent()->AddInstruction(
+              HloInstruction::CreateBroadcast(all_reduce->shape(), multiplier,
+                                              {}));
+        }
+        replacement =
+            all_reduce->parent()->AddInstruction(HloInstruction::CreateBinary(
+                all_reduce->shape(), HloOpcode::kMultiply,
+                all_reduce->mutable_operand(0), multiplier));
+        break;
+      }
+      case HloOpcode::kMinimum:
+      case HloOpcode::kMaximum:
+      case HloOpcode::kOr:
+      case HloOpcode::kAnd:
+        replacement = all_reduce->mutable_operand(0);
+        break;
+      default:
+        continue;
+    }
+    VLOG(2) << "Replacing " << all_reduce->ToString() << " with "
+            << replacement->ToString();
+    TF_RETURN_IF_ERROR(all_reduce->ReplaceAllUsesWith(replacement));
+    changed = true;
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier.h b/tensorflow/compiler/xla/service/all_reduce_simplifier.h
new file mode 100644
index 0000000..f2d2294
--- /dev/null
+++ b/tensorflow/compiler/xla/service/all_reduce_simplifier.h
@@ -0,0 +1,46 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_SIMPLIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// A pass that detects all-reduces whose inputs are already the same across
+// replicas using the replication analysis, then replaces those all-reduces with
+// local computations. E.g., a sum all-reduce on replicated input will be
+// replaced by a multiply with the replica count (or replica group size).
+class AllReduceSimplifier : public HloModulePass {
+ public:
+  explicit AllReduceSimplifier(int64 replica_count)
+      : replica_count_(replica_count) {}
+  ~AllReduceSimplifier() override = default;
+  absl::string_view name() const override { return "all-reduce-simp"; }
+
+  // Run all-reduce simplification on the given module. Returns whether the
+  // module was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  int64 replica_count_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc
new file mode 100644
index 0000000..2e03e67
--- /dev/null
+++ b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc
@@ -0,0 +1,171 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/all_reduce_simplifier.h"
+
+#include <memory>
+#include <utility>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using AllReduceSimplifierTest = HloTestBase;
+
+TEST_F(AllReduceSimplifierTest, ReplicatedParameters) {
+  const char* kModuleStr = R"(
+HloModule m
+
+sum {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT add.2 = f32[] add(a, b)
+}
+
+max {
+  a.1 = f32[] parameter(0)
+  b.1 = f32[] parameter(1)
+  ROOT max = f32[] maximum(a.1, b.1)
+}
+
+min {
+  a.2 = f32[] parameter(0)
+  b.2 = f32[] parameter(1)
+  ROOT min = f32[] minimum(a.2, b.2)
+}
+
+sum.1 {
+  a.3 = f32[] parameter(0)
+  b.3 = f32[] parameter(1)
+  ROOT add.1 = f32[] add(a.3, b.3)
+}
+
+test {
+  p0 = f32[8,16] parameter(0), parameter_replication={true}
+  p1 = f32[8,16] parameter(1), parameter_replication={false}
+  p2 = f32[] parameter(2), parameter_replication={true}
+  all-reduce = f32[8,16] all-reduce(p0), replica_groups={}, to_apply=sum
+  all-reduce.1 = f32[8,16] all-reduce(p0), replica_groups={}, to_apply=max
+  all-reduce.2 = f32[8,16] all-reduce(p1), replica_groups={}, to_apply=min
+  all-reduce.3 = f32[] all-reduce(p2), replica_groups={}, to_apply=sum.1
+  ROOT tuple = (f32[8,16], f32[8,16], f32[8,16], f32[]) tuple(all-reduce, all-reduce.1, all-reduce.2, all-reduce.3)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  AllReduceSimplifier simplifier(/*replica_count=*/8);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Tuple(
+          m::MultiplyAnyOrder(m::Parameter(0),
+                              m::Broadcast(m::Convert(m::ConstantScalar(8)))),
+          m::Parameter(0), m::AllReduce(m::Parameter(1)),
+          m::MultiplyAnyOrder(m::Parameter(2),
+                              m::Convert(m::ConstantScalar(8))))));
+}
+
+TEST_F(AllReduceSimplifierTest, AllReduceAfterAllReduce) {
+  const char* kModuleStr = R"(
+HloModule m
+
+max {
+  a.1 = f32[] parameter(0)
+  b.1 = f32[] parameter(1)
+  ROOT max = f32[] maximum(a.1, b.1)
+}
+
+sum {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT add.2 = f32[] add(a, b)
+}
+
+test {
+  p0 = f32[8,16] parameter(0), parameter_replication={false}
+  all-reduce = f32[8,16] all-reduce(p0), replica_groups={}, to_apply=max
+  ROOT all-reduce.1 = f32[8,16] all-reduce(all-reduce), replica_groups={}, to_apply=sum
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  AllReduceSimplifier simplifier(/*replica_count=*/8);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AllReduce(m::Parameter(0)),
+                  m::Broadcast(m::Convert(m::ConstantScalar(8))))));
+}
+
+TEST_F(AllReduceSimplifierTest, SubgroupAllReduce) {
+  const char* kModuleStr = R"(
+HloModule m
+
+sum {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT add.2 = f32[] add(a, b)
+}
+
+max {
+  a.1 = f32[] parameter(0)
+  b.1 = f32[] parameter(1)
+  ROOT max = f32[] maximum(a.1, b.1)
+}
+
+min {
+  a.2 = f32[] parameter(0)
+  b.2 = f32[] parameter(1)
+  ROOT min = f32[] minimum(a.2, b.2)
+}
+
+test {
+  p0 = f32[8,16] parameter(0), parameter_replication={true}
+  p1 = f32[8,16] parameter(1), parameter_replication={false}
+  all-reduce = f32[8,16] all-reduce(p0), replica_groups={{0,1,2,3},{4,5,6,7}}, to_apply=sum
+  all-reduce.1 = f32[8,16] all-reduce(p0), replica_groups={{0,1,2,3},{4,5,6,7}}, to_apply=max
+  all-reduce.2 = f32[8,16] all-reduce(p1), replica_groups={{0,1,2,3},{4,5,6,7}}, to_apply=min
+  ROOT tuple = (f32[8,16], f32[8,16], f32[8,16]) tuple(all-reduce, all-reduce.1, all-reduce.2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  AllReduceSimplifier simplifier(/*replica_count=*/8);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Tuple(
+          m::MultiplyAnyOrder(m::Parameter(0),
+                              m::Broadcast(m::Convert(m::ConstantScalar(4)))),
+          m::Parameter(0), m::AllReduce(m::Parameter(1)))));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 6cb0e98..ea56c75 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -20,13 +20,13 @@
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -221,8 +221,8 @@
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
     allocation_map[device_memory.opaque()] = {
-        OwningDeviceMemory(device_memory, device_ordinal,
-                           backend_->memory_allocator()),
+        se::OwningDeviceMemory(device_memory, device_ordinal,
+                               backend_->memory_allocator()),
         /*ref_count=*/1};
   } else {
     it->second.ref_count++;
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 98d1a30..6e7f9fd 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -77,7 +77,7 @@
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    OwningDeviceMemory device_memory;
+    se::OwningDeviceMemory device_memory;
 
     // This is the number of times this memory allocation is referred to by
     // registered data handles.
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index 787fe16..1ca2280 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -107,44 +107,90 @@
   return absl::nullopt;
 }
 
+absl::optional<HloInstruction*> ArCrsCombiner::ConditionalFromBodyParameter(
+    HloInstruction* instruction) {
+  CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
+  HloComputation* computation = instruction->parent();
+  auto caller_instructions = call_graph_->GetComputationCallers(computation);
+  if (caller_instructions.size() == 1) {
+    auto caller_instruction = caller_instructions[0];
+    if (caller_instruction->opcode() == HloOpcode::kConditional) {
+      return caller_instruction;
+    }
+  }
+  return absl::nullopt;
+}
+
 std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
     HloInstruction* instruction) {
-  if (instruction->opcode() == HloOpcode::kTuple) {
-    return {instruction};
-  }
-  if (instruction->opcode() == HloOpcode::kDomain) {
-    return GetAllTuples(instruction->operands()[0]);
-  }
-  if (instruction->opcode() == HloOpcode::kParameter) {
-    auto maybe_while = WhileFromBodyParameter(instruction);
-    if (!maybe_while) {
-      return {};
-    }
-    auto while_instr = *maybe_while;
-    auto init_tuples = GetAllTuples(while_instr->while_init());
-    auto body_tuples =
-        GetAllTuples(while_instr->while_body()->root_instruction());
-    if (init_tuples.empty() || body_tuples.empty()) {
-      return {};
-    }
-    init_tuples.insert(init_tuples.end(), body_tuples.begin(),
-                       body_tuples.end());
-    return init_tuples;
-  }
-  if (instruction->opcode() == HloOpcode::kGetTupleElement) {
-    std::vector<HloInstruction*> result_tuples;
-    for (auto tuple : GetAllTuples(instruction->operands()[0])) {
-      auto tmp_tuples =
-          GetAllTuples(tuple->mutable_operand(instruction->tuple_index()));
-      if (tmp_tuples.empty()) {
-        return {};
+  switch (instruction->opcode()) {
+    case HloOpcode::kTuple:
+      return {instruction};
+    case HloOpcode::kDomain:
+      return GetAllTuples(instruction->operands()[0]);
+    case HloOpcode::kParameter: {
+      auto maybe_while = WhileFromBodyParameter(instruction);
+      if (maybe_while) {
+        auto while_instr = *maybe_while;
+        auto init_tuples = GetAllTuples(while_instr->while_init());
+        auto body_tuples =
+            GetAllTuples(while_instr->while_body()->root_instruction());
+        if (init_tuples.empty() || body_tuples.empty()) {
+          return {};
+        }
+        init_tuples.insert(init_tuples.end(), body_tuples.begin(),
+                           body_tuples.end());
+        return init_tuples;
       }
-      result_tuples.insert(result_tuples.end(), tmp_tuples.begin(),
-                           tmp_tuples.end());
+      auto maybe_conditional = ConditionalFromBodyParameter(instruction);
+      if (maybe_conditional) {
+        auto cond_instr = *maybe_conditional;
+        std::vector<HloInstruction*> tuples;
+        for (int64 i = 0; i < cond_instr->branch_computations().size(); ++i) {
+          if (cond_instr->branch_computation(i)->parameter_instruction(0) ==
+              instruction) {
+            // If the same computation is used for more than one branch of the
+            // conditional, we collect the arguments that flow to the
+            // computation from all branches.
+            auto branch_tuples =
+                GetAllTuples(cond_instr->mutable_operand(i + 1));
+            if (branch_tuples.empty()) {
+              return {};
+            }
+            tuples.insert(tuples.end(), branch_tuples.begin(),
+                          branch_tuples.end());
+          }
+        }
+        return tuples;
+      }
+      return {};
     }
-    return result_tuples;
+    case HloOpcode::kGetTupleElement: {
+      std::vector<HloInstruction*> result_tuples;
+      for (auto tuple : GetAllTuples(instruction->operands()[0])) {
+        auto tmp_tuples =
+            GetAllTuples(tuple->mutable_operand(instruction->tuple_index()));
+        if (tmp_tuples.empty()) {
+          return {};
+        }
+        result_tuples.insert(result_tuples.end(), tmp_tuples.begin(),
+                             tmp_tuples.end());
+      }
+      return result_tuples;
+    }
+    case HloOpcode::kConditional: {
+      std::vector<HloInstruction*> result_tuples;
+      for (HloComputation* body : instruction->branch_computations()) {
+        if (body->root_instruction()->opcode() != HloOpcode::kTuple) {
+          return {};
+        }
+        result_tuples.push_back(body->root_instruction());
+      }
+      return result_tuples;
+    }
+    default:
+      return {};
   }
-  return {};
 }
 
 bool ArCrsCombiner::TupleElementsComputeSameValue(
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index 2ae5560..e5926c7 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -119,6 +119,12 @@
   absl::optional<HloInstruction*> WhileFromBodyParameter(
       HloInstruction* instruction);
 
+  // If the passed instruction is a parameter in one of the branch computations,
+  // and the branch body is only called by a single instruction, return the
+  // conditional instruction.
+  absl::optional<HloInstruction*> ConditionalFromBodyParameter(
+      HloInstruction* instruction);
+
   // Returns a vector of tuple instructions.
   // If all instructions that flow to "instruction" are tuples, return them.
   // Otherwise, return an empty vector.
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index b972b12..e972e3c 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -1173,5 +1173,47 @@
   EXPECT_FALSE(changed);
 }
 
+TEST_F(ArCrsCombinerTest, SameValueTestConditional) {
+  const char* module_str = R"(
+HloModule foobar
+
+branch_true {
+  pt = (f32[2,4], f32[2,4]) parameter(0)
+  gte.0 = f32[2,4] get-tuple-element(pt), index=0
+  gte.1 = f32[2,4] get-tuple-element(pt), index=1
+  ROOT tuple.t = (f32[2,4], f32[2,4]) tuple(gte.1, gte.0)
+}
+
+branch_false {
+  pf = (f32[2,4], f32[2,4]) parameter(0)
+  gte.0 = f32[2,4] get-tuple-element(pf), index=0
+  gte.1 = f32[2,4] get-tuple-element(pf), index=1
+  add = f32[2,4] add(gte.1, gte.1)
+  ROOT tuple.f = (f32[2,4], f32[2,4]) tuple(gte.0, add)
+}
+
+ENTRY Parameters1.v4 {
+  constant = pred[] constant(true)
+  p = f32[2,4] parameter(0)
+  tuple = (f32[2,4], f32[2,4]) tuple(p, p)
+  ROOT conditional = (f32[2,4], f32[2,4]) conditional(constant, tuple, tuple), true_computation=branch_true, false_computation=branch_false
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto cond = module->entry_computation()->root_instruction();
+
+  auto branch_true = cond->branch_computation(0)->root_instruction();
+  auto t0 = branch_true->mutable_operand(0);
+  auto t1 = branch_true->mutable_operand(1);
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(t0, t1));
+
+  auto branch_false = cond->branch_computation(1)->root_instruction();
+  auto f0 = branch_false->mutable_operand(0);
+  auto f1 = branch_false->mutable_operand(1);
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(f0, f1));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 1528ec6..d859f64 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -134,7 +134,7 @@
     }
   }
   // Create a memory allocator for the valid stream executors.
-  memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>(
+  memory_allocator_ = absl::make_unique<se::StreamExecutorMemoryAllocator>(
       platform, stream_executors);
   CHECK(!stream_executors_.empty())
       << "Service found no devices for backend " << platform_->Name() << '.';
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index e7f29a0..79fdeb2 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -27,7 +27,6 @@
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +34,7 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace Eigen {
 struct ThreadPoolDevice;
@@ -88,7 +88,7 @@
   // Accessors for the various objects.
   se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
-  DeviceMemoryAllocator* memory_allocator() const {
+  se::DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
   }
   TransferManager* transfer_manager() const { return transfer_manager_; }
@@ -179,7 +179,7 @@
       stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
-  std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
+  std::unique_ptr<se::StreamExecutorMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
   struct IntraOpThreadPool;
diff --git a/tensorflow/compiler/xla/service/cholesky_expander.cc b/tensorflow/compiler/xla/service/cholesky_expander.cc
index 1c39cf9..c4979ad 100644
--- a/tensorflow/compiler/xla/service/cholesky_expander.cc
+++ b/tensorflow/compiler/xla/service/cholesky_expander.cc
@@ -99,7 +99,7 @@
       // a[..., i, i]
       auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1});
       // np.dot(row, np.swapaxes(row, -1, -2))
-      auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision);
+      auto diag_dot = BatchDot(row, false, row, true, precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
       auto l_ii = Sqrt(a_ii - diag_dot);
@@ -114,7 +114,7 @@
       // The columns in [i, n] are zeroed out in `row`, so we just have to
       // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
       // r.T)
-      auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision);
+      auto dot = BatchDot(body_l, false, row, true, precision);
       // np.dot(l[..., i+1:, :i], r.T)
       auto dot_ip1 = Select(Le(mask_range_col, i), mask_zeros_col, dot);
 
@@ -178,7 +178,7 @@
         // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i]))
         auto lhs = SliceInMinorDims(l, {i, 0}, {n, i});
         auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i});
-        auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision);
+        auto delta = BatchDot(lhs, false, rhs, true, precision);
         auto before = SliceInMinorDims(a, {i, i}, {n, i + k});
         a = UpdateSliceInMinorDims(a, before - delta, {i, i});
       }
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 9b483bd..631a7dd 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -75,8 +75,10 @@
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
-  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
-  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+  se::DeviceMemoryAllocator* device_allocator() const {
+    return device_allocator_;
+  }
+  void set_device_allocator(se::DeviceMemoryAllocator* device_allocator) {
     device_allocator_ = device_allocator;
   }
 
@@ -98,7 +100,7 @@
   AotCompilationOptions();
 
  private:
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   DebugOptions debug_options_;
   absl::optional<DeviceAssignment> static_device_assignment_;
 };
@@ -147,14 +149,14 @@
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Optimizes a HLO module group, a set of module which runs concurrently on
   // multiple devices potentially communicating data between the modules.
   virtual Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -168,7 +170,7 @@
   // device_allocator is optional; see RunHloPasses.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules.
@@ -176,7 +178,7 @@
   RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
@@ -189,7 +191,7 @@
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Returns the backend configurations that the backend will consider for the
   // given HLO. Returns no configurations if the backend does not support
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 127b08a..4b8d20f 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -1088,17 +1088,6 @@
   return Status::OK();
 }
 
-Status CopyInsertion::VerifyNoLiveRangeInterference(const HloOrdering& ordering,
-                                                    HloModule* module) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
-                      HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
-  // TODO(b/122263061): This function appears to be incorrect, saying that
-  // live-range interference is occurring when it isn't.  We've disabled it for
-  // now while we investigate.
-  // TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
-  return Status::OK();
-}
-
 Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
                                               HloModule* module) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
@@ -1183,10 +1172,8 @@
   DumpHloModuleDuringPassIfEnabled(
       name(), "after adding copies to resolve interference", *module);
 
-  DependencyHloOrdering dep_ordering(module);
-  TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module));
-
-  TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module));
+  TF_RETURN_IF_ERROR(
+      RemoveUnnecessaryCopies(DependencyHloOrdering(module), module));
   DumpHloModuleDuringPassIfEnabled(name(), "after removing unnecessary copies",
                                    *module);
 
@@ -1196,8 +1183,6 @@
 
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
-  TF_DCHECK_OK(
-      VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module));
 
   if (VLOG_IS_ON(1)) {
     int64 num_total_copies = 0;
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index 8866b50..f7e1997 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -89,11 +89,6 @@
   //
   Status AddSpecialCaseCopies(HloModule* module);
 
-  // Verifies that no HLO values have interfering live ranges using the given
-  // ordering.
-  Status VerifyNoLiveRangeInterference(const HloOrdering& ordering,
-                                       HloModule* module);
-
  protected:
   // Override which requires the caller to pass in a call graph.
   virtual Status AddSpecialCaseCopies(const CallGraph& call_graph,
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 445265d..09f5c85 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -99,6 +99,7 @@
         "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:conditional_to_select",
         "//tensorflow/compiler/xla/service:scatter_expander",
+        "//tensorflow/compiler/xla/service:slice_sinker",
         "//tensorflow/compiler/xla:cpu_function_runtime",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -181,7 +182,6 @@
     deps = [
         ":compiler_functor",
         ":cpu_runtime",
-        ":custom_call_target_registry",
         ":disassembler",
         ":orc_jit_memory_mapper",
         ":runtime_fp16",
@@ -202,6 +202,7 @@
         "@llvm//:orc_jit",
         "@llvm//:support",
         "@llvm//:target",  # fixdeps: keep
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
@@ -244,7 +245,6 @@
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:computation_layout",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
@@ -254,6 +254,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/stream_executor/host:host_stream",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -946,17 +947,6 @@
 )
 
 cc_library(
-    name = "custom_call_target_registry",
-    srcs = [
-        "custom_call_target_registry.cc",
-    ],
-    hdrs = [
-        "custom_call_target_registry.h",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
     name = "orc_jit_memory_mapper",
     srcs = ["orc_jit_memory_mapper.cc"],
     hdrs = ["orc_jit_memory_mapper.h"],
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 5a5327e..7dab505 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -123,7 +123,10 @@
 
   CHECK(!llvm::verifyModule(module, &llvm::dbgs()));
 
-  runtime::RewriteIRRuntimeFunctions(&module, enable_fast_math_);
+  const auto& opts = target_machine_->Options;
+  bool fast_math_enabled = opts.UnsafeFPMath && opts.NoInfsFPMath &&
+                           opts.NoNaNsFPMath && opts.NoSignedZerosFPMath;
+  runtime::RewriteIRRuntimeFunctions(&module, fast_math_enabled);
 
   // Buffer for holding machine code prior to constructing the ObjectFile.
   llvm::SmallVector<char, 0> stream_buffer;
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
index edcd47e..fdaba45 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
@@ -32,8 +32,7 @@
  public:
   explicit CompilerFunctor(
       llvm::TargetMachine* target_machine, int opt_level,
-      bool optimize_for_size, bool enable_fast_math,
-      bool disable_expensive_passes,
+      bool optimize_for_size, bool disable_expensive_passes,
       LLVMCompiler::ModuleHook pre_optimization_hook = nullptr,
       LLVMCompiler::ModuleHook post_optimization_hook = nullptr,
       std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook =
@@ -41,7 +40,6 @@
       : target_machine_(target_machine),
         opt_level_(opt_level),
         optimize_for_size_(optimize_for_size),
-        enable_fast_math_(enable_fast_math),
         disable_expensive_passes_(disable_expensive_passes),
         pre_optimization_hook_(std::move(pre_optimization_hook)),
         post_optimization_hook_(std::move(post_optimization_hook)),
@@ -64,7 +62,6 @@
   llvm::TargetMachine* target_machine_;
   const unsigned opt_level_;
   const bool optimize_for_size_;
-  const bool enable_fast_math_;
   const bool disable_expensive_passes_;
   LLVMCompiler::ModuleHook pre_optimization_hook_;
   LLVMCompiler::ModuleHook post_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 43b9501..06ea1e2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -17,6 +17,7 @@
 
 #include <stddef.h>
 #include <string.h>
+
 #include <map>
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <string>
@@ -97,6 +98,7 @@
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
+#include "tensorflow/compiler/xla/service/slice_sinker.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
@@ -314,6 +316,7 @@
     pass.AddPass<TupleSimplifier>();
     pass.AddPass<WhileLoopConstantSinking>();
     pass.AddPass<WhileLoopSimplifier>();
+    pass.AddPass<SliceSinker>();
     pass.AddPass<HloDCE>();
     pass.AddPass<ReshapeMover>();
     pass.AddPass<HloConstantFolding>();
@@ -424,9 +427,9 @@
   if (module_config.debug_options().xla_cpu_enable_fast_math()) {
     target_options.UnsafeFPMath = true;
     target_options.NoInfsFPMath =
-        module_config.debug_options().xla_cpu_fast_math_honor_infs();
+        !module_config.debug_options().xla_cpu_fast_math_honor_infs();
     target_options.NoNaNsFPMath =
-        module_config.debug_options().xla_cpu_fast_math_honor_nans();
+        !module_config.debug_options().xla_cpu_fast_math_honor_nans();
     target_options.NoSignedZerosFPMath = true;
   } else {
     target_options.UnsafeFPMath = false;
@@ -534,7 +537,7 @@
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   std::unique_ptr<llvm::TargetMachine> jit_target_machine =
       SimpleOrcJIT::InferTargetMachineForJIT(
           CompilerTargetOptions(module->config()),
@@ -594,7 +597,7 @@
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Compiling: " << module->name();
   XLA_SCOPED_LOGGING_TIMER(
       absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
@@ -618,7 +621,6 @@
       CompilerTargetOptions(module->config()),
       CodeGenOptLevel(module->config()),
       options::OptimizeForSizeRequested(module->config()),
-      module->config().debug_options().xla_cpu_enable_fast_math(),
       module->config().debug_options().xla_llvm_disable_expensive_passes(),
       pre_optimization_ir_hook, post_optimization_ir_hook,
       OrcJITPostCompilationHook::Create(module.get()));
@@ -658,12 +660,6 @@
                           BufferSizeBytesFunction(), memory_alignment,
                           /*allow_input_output_aliasing=*/false,
                           /*allocate_buffers_for_constants=*/true));
-  // BufferAssignment::ToString() includes a header, so no need for us to
-  // print one ourselves.
-  if (DumpingEnabledForHloModule(*module)) {
-    DumpToFileInDirOrStdout(*module, "buffer_assignment",
-                            assignment->ToString());
-  }
   DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
 
   // Each computation is a single function.  Emit all embedded computations
@@ -749,15 +745,29 @@
 
   // We can pass just one llvm::TargetOptions when we compile the LLVM module,
   // so we bail if the configs have conflicting flags. At the moment, the only
-  // flag that needs to be consistent is fast-math.
-  const bool fast_math_enabled =
-      modules[0]->config().debug_options().xla_cpu_enable_fast_math();
-  for (const auto& module : modules) {
-    if (module->config().debug_options().xla_cpu_enable_fast_math() !=
-        fast_math_enabled) {
-      return InvalidArgument(
-          "All HLO module configs must have the same value for "
-          "xla_enable_fast_math.");
+  // flags that need to be consistent are for fast-math.
+  for (const auto& fn_and_name :
+       {std::make_pair(&DebugOptions::xla_cpu_enable_fast_math,
+                       "xla_cpu_enable_fast_math"),
+        std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_infs,
+                       "xla_cpu_fast_math_honor_infs"),
+        std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_nans,
+                       "xla_cpu_fast_math_honor_nans")}) {
+    // This only works because each of the method pointers above returns a bool.
+    // Otherwise we'd have to do some template magic.
+    const auto& field_method_ptr = fn_and_name.first;
+    const auto& field_name = fn_and_name.second;
+    bool first_module_val =
+        (modules[0]->config().debug_options().*field_method_ptr)();
+    for (int64 i = 0; i < modules.size(); ++i) {
+      bool cur_module_val =
+          (modules[i]->config().debug_options().*field_method_ptr)();
+      if (first_module_val != cur_module_val) {
+        return InvalidArgument(
+            "All HLO module configs must have the same value for %s, but "
+            "module 0 and %d have different values (%d vs %d).",
+            field_name, i, first_module_val, cur_module_val);
+      }
     }
   }
 
@@ -927,7 +937,6 @@
     CompilerFunctor compiler_functor(
         target_machine.get(), opt_level,
         options::OptimizeForSizeRequested(module->config()),
-        module->config().debug_options().xla_cpu_enable_fast_math(),
         module->config().debug_options().xla_llvm_disable_expensive_passes(),
         pre_optimization_ir_hook, post_optimization_ir_hook, post_codegen_hook);
     std::unique_ptr<llvm::MemoryBuffer> object_file =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 8ff0fd5..dd15891 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -133,11 +133,11 @@
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 23d0af3..cc0f808 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -73,13 +73,13 @@
 }
 
 StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
-                   std::vector<OwningDeviceMemory>>>
+                   std::vector<se::OwningDeviceMemory>>>
 CpuExecutable::CreateBufferTable(
-    DeviceMemoryAllocator* memory_allocator, int device_ordinal,
+    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
     absl::Span<const ShapedBuffer* const> arguments) {
   std::vector<se::DeviceMemoryBase> unowning_buffers(
       assignment_->Allocations().size());
-  std::vector<OwningDeviceMemory> owning_buffers(
+  std::vector<se::OwningDeviceMemory> owning_buffers(
       assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -207,7 +207,7 @@
 
 StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<OwningDeviceMemory> buffers) {
+    absl::Span<se::OwningDeviceMemory> buffers) {
   se::Stream* stream = run_options->stream();
   ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(),
@@ -216,7 +216,7 @@
   const HloInputOutputAliasConfig& input_output_alias =
       module().input_output_alias_config();
 
-  // Move OwningDeviceMemory values which contain the array(s) of the result
+  // Move se::OwningDeviceMemory values which contain the array(s) of the result
   // into the respective location in ScopedShapedBuffer which is returned to the
   // caller.
   TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
@@ -235,7 +235,7 @@
             const BufferAllocation::Slice slice,
             this->assignment_->GetUniqueSlice(src, buffer_source->index()));
         const BufferAllocation::Index buffer_index = slice.index();
-        OwningDeviceMemory& buffer = buffers[buffer_index];
+        se::OwningDeviceMemory& buffer = buffers[buffer_index];
         if (!slice.allocation()->is_entry_computation_parameter()) {
           // If the buffer coming out of the result is from a parameter, the
           // owning buffer will be null, and that means the caller aliased some
@@ -297,8 +297,8 @@
   auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<OwningDeviceMemory> owning_buffers;
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  std::vector<se::OwningDeviceMemory> owning_buffers;
   std::vector<se::DeviceMemoryBase> unowning_buffers;
   TF_ASSIGN_OR_RETURN(
       std::tie(unowning_buffers, owning_buffers),
@@ -326,7 +326,7 @@
     CpuExecutable* executable;
     ServiceExecutableRunOptions run_options;
     std::vector<se::DeviceMemoryBase> unowning_buffers;
-    std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+    std::shared_ptr<std::vector<se::OwningDeviceMemory>> buffers;
     HloExecutionProfile* hlo_execution_profile;
 
     void operator()() {
@@ -338,7 +338,7 @@
   };
   host_stream->EnqueueTask(
       AsyncRunTask{this, *run_options, std::move(unowning_buffers),
-                   std::make_shared<std::vector<OwningDeviceMemory>>(
+                   std::make_shared<std::vector<se::OwningDeviceMemory>>(
                        std::move(owning_buffers)),
                    hlo_execution_profile});
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 3b91b15..735a207 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -25,7 +25,6 @@
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -37,6 +36,7 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace cpu {
@@ -111,8 +111,9 @@
   //    storage and the live-out buffer into which the computation writes it
   //    result.
   StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
-                     std::vector<OwningDeviceMemory>>>
-  CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal,
+                     std::vector<se::OwningDeviceMemory>>>
+  CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator,
+                    int device_ordinal,
                     absl::Span<const ShapedBuffer* const> arguments);
 
   // Calls the generated function performing the computation with the given
@@ -126,7 +127,7 @@
   // The addresses are set according to buffer assignment.
   StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<OwningDeviceMemory> buffers);
+      absl::Span<se::OwningDeviceMemory> buffers);
 
   // Returns the points-to set of the root instruction of the entry
   // computation. Uses points-to analysis from buffer assignment.
diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
deleted file mode 100644
index 664125e..0000000
--- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
-
-// This file is depended on by kernels that have to build for mobile devices.
-// For this reason, we avoid relying on TensorFlow and instead only use the
-// standard C++ library.
-
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-
-namespace xla {
-namespace cpu {
-
-// The CPU JIT compiler uses this registry to resolve symbolic CustomCall
-// targets; so when using the CPU JIT, CustomCall targets need to be registered
-// here with the symbol name used in the CustomCall.
-//
-// The XLA AOT compiler links using a standard offline linker; so when compiling
-// in AOT mode, you *also* need to make sure the name of the callee (presumably
-// implemented in C++) matches up with the symbolic name used in the CustomCall.
-//
-// We maintain the registry in both the JIT and the AOT cases for simplicity,
-// but we only use it when running in JIT mode.
-class CustomCallTargetRegistry {
- public:
-  static CustomCallTargetRegistry* Global();
-
-  void Register(const std::string& symbol, void* address);
-  void* Lookup(const std::string& symbol) const;
-
- private:
-  std::unordered_map<std::string, void*> registered_symbols_;
-  mutable std::mutex mu_;
-};
-
-class RegisterCustomCallTarget {
- public:
-  explicit RegisterCustomCallTarget(const std::string& name, void* address) {
-    CustomCallTargetRegistry::Global()->Register(name, address);
-  }
-};
-
-#define REGISTER_CUSTOM_CALL_CONCAT(a, b) a##b
-
-#define REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, counter) \
-  static ::xla::cpu::RegisterCustomCallTarget REGISTER_CUSTOM_CALL_CONCAT(    \
-      custom_call_target_register, counter)(symbol,                           \
-                                            reinterpret_cast<void*>(address))
-
-#define REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address) \
-  REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, __COUNTER__)
-
-#define REGISTER_CUSTOM_CALL_TARGET(function) \
-  REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function)
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 93ef517..a4bb5f7 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -119,13 +119,9 @@
                              int32 vector_width) {
   VectorSupportLibrary vsl(F32, vector_width, b, "exp_f32");
 
-  // This implements the same polynomial approximation as implemented in Eigen3.
-
+  // This implements the same polynomial approximation as implemented in Cephes.
   const llvm::APFloat half = GetIeeeF32(0.5);
-  const llvm::APFloat one = GetIeeeF32(1.0);
-
-  const llvm::APFloat exp_hi = GetIeeeF32(88.3762626647950);
-  const llvm::APFloat exp_lo = GetIeeeF32(-88.3762626647949);
+  const llvm::APFloat one = GetIeeeF32(1);
 
   const llvm::APFloat cephes_LOG2EF = GetIeeeF32(1.44269504088896341);
   const llvm::APFloat cephes_exp_C1 = GetIeeeF32(0.693359375);
@@ -138,39 +134,79 @@
   const llvm::APFloat cephes_exp_p4 = GetIeeeF32(1.6666665459E-1);
   const llvm::APFloat cephes_exp_p5 = GetIeeeF32(5.0000001201E-1);
 
-  llvm::Value* input_clamped =
-      vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi);
-  llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, half));
-  llvm::Value* tmp = vsl.Mul(cephes_exp_C1, fx);
-  llvm::Value* z = vsl.Mul(cephes_exp_C2, fx);
-  llvm::Value* x = vsl.Sub(input_clamped, tmp);
-  x = vsl.Sub(x, z);
-  z = vsl.Mul(x, x);
+  // To compute e^input, we re-express it as
+  //
+  //   e^input = e^(a + b)
+  //           = e^(a + n log(2))
+  //           = e^a * 2^n.
+  //
+  // We choose n = floor(input / log(2) + 0.5), restricting the value of `a` to
+  // (-0.5, 0.5).  We then use a polynomial to compute e^a.
 
-  llvm::Value* y = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1);
-  y = vsl.MulAdd(y, x, cephes_exp_p2);
-  y = vsl.MulAdd(y, x, cephes_exp_p3);
-  y = vsl.MulAdd(y, x, cephes_exp_p4);
-  y = vsl.MulAdd(y, x, cephes_exp_p5);
-  y = vsl.MulAdd(y, z, x);
-  y = vsl.Add(one, y);
+  // Restrict input to a small range, including some values whose exp is +inf
+  // or 0.  Our computations below aren't particularly sensitive to the exact
+  // choices here, so we choose values a bit larger/smaller than
+  //
+  //   log(F32_MAX) =          88.723...
+  //   log(F32_DENORM_MIN) = -103.279....
+  //
+  input = vsl.Clamp(input, GetIeeeF32(-104), GetIeeeF32(88.8));
 
-  // VectorSupportLibrary (intentionally) can't juggle more than one type at a
-  // time so drop down to IRBuilder for this bit.
-  llvm::Value* vector_constant_0x7f =
-      b->CreateVectorSplat(vector_width, b->getInt32(0x7f));
-  llvm::Value* vector_constant_23 =
-      b->CreateVectorSplat(vector_width, b->getInt32(23));
-  llvm::Type* i32_vector_type =
-      llvm::VectorType::get(b->getInt32Ty(), vector_width);
-  // fx is clamped so we don't have to worry about it being out of range for
-  // i32.
-  llvm::Value* emm0 = b->CreateFPToSI(fx, i32_vector_type);
-  emm0 = b->CreateAdd(emm0, vector_constant_0x7f);
-  emm0 = b->CreateShl(emm0, vector_constant_23);
-  llvm::Value* emm0_f32 = b->CreateBitCast(emm0, vsl.vector_type());
+  llvm::Value* x = input;
+  llvm::Value* n = vsl.Floor(vsl.MulAdd(input, cephes_LOG2EF, half));
 
-  return vsl.Max(vsl.Mul(y, emm0_f32), input);
+  // When we eventually do the multiplication in e^a * 2^n, we need to handle
+  // the case when n > 127, the max fp32 exponent (so 2^n == inf) but e^a < 1
+  // (so e^a * 2^n != inf).  There's a similar problem for n < -126, the
+  // smallest fp32 exponent.
+  //
+  // A straightforward solution would be to detect n out of range and split it
+  // up, doing
+  //
+  //   e^a * 2^n = e^a * 2^(n1 + n2)
+  //             = (2^n1 * e^a) * 2^n2.
+  //
+  // But it turns out this approach is quite slow.  It's not clear why; our
+  // hypothesis is that the integer operations on the exponent `n` have nonlocal
+  // effects on the pipeline.
+  //
+  // The approach we use instead is to clamp n to [-126, 127] so 2^n doesn't
+  // over/underflow.  This causes `a` to be outside the range (-0.5, 0.5), which
+  // means that our polynomial for e^a will give a less-accurate result.  In
+  // practice this seems to work well enough; it passes our exhaustive tests,
+  // breaking only one result, and by one ulp (we return exp(88.7228394) =
+  // max-float but we should return inf).
+  n = vsl.Clamp(n, GetIeeeF32(-126), GetIeeeF32(127));
+
+  // Polynomial to compute z = e^a, accurate for a in (-0.5, 0.5).
+  x = vsl.Sub(x, vsl.Mul(cephes_exp_C1, n));
+  x = vsl.Sub(x, vsl.Mul(cephes_exp_C2, n));
+  llvm::Value* z = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1);
+  z = vsl.MulAdd(z, x, cephes_exp_p2);
+  z = vsl.MulAdd(z, x, cephes_exp_p3);
+  z = vsl.MulAdd(z, x, cephes_exp_p4);
+  z = vsl.MulAdd(z, x, cephes_exp_p5);
+  z = vsl.MulAdd(z, vsl.Mul(x, x), x);
+  z = vsl.Add(one, z);
+
+  // Convert n to an i32.  This is safe because we clamped it above.
+  llvm::Value* n_i32 =
+      b->CreateFPToSI(n, llvm::VectorType::get(b->getInt32Ty(), vector_width));
+
+  // Create 2^n as an fp32.  This works because -126 <= n <= 127 means that n is
+  // within the bounds for an fp32 exponent.
+  auto splat_i32 = [&](int32 v) {
+    return b->CreateVectorSplat(vector_width, b->getInt32(v));
+  };
+  const int32 kF32SignificandBits = 23;
+  llvm::Value* exp_bias = splat_i32(0x7f);
+  llvm::Value* pow2 =
+      b->CreateBitCast(b->CreateShl(b->CreateAdd(n_i32, exp_bias),
+                                    splat_i32(kF32SignificandBits)),
+                       vsl.vector_type());
+
+  // Return z * 2^n.
+  return vsl.Mul(z, pow2);
 }
 
 llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input,
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h
index 85af63b..193c25f 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h
@@ -26,15 +26,17 @@
 
 template <typename EigenDevice, typename ScalarType>
 void EigenConvImpl(const EigenDevice& device, ScalarType* out, ScalarType* lhs,
-                   ScalarType* rhs, int64 input_batch, int64 input_rows,
-                   int64 input_cols, int64 input_channels, int64 kernel_rows,
-                   int64 kernel_cols, int64 kernel_channels,
-                   int64 kernel_filters, int64 output_rows, int64 output_cols,
-                   int64 row_stride, int64 col_stride, int64 padding_top,
-                   int64 padding_bottom, int64 padding_left,
-                   int64 padding_right, int64 lhs_row_dilation,
-                   int64 lhs_col_dilation, int64 rhs_row_dilation,
-                   int64 rhs_col_dilation) {
+                   ScalarType* rhs, Eigen::Index input_batch,
+                   Eigen::Index input_rows, Eigen::Index input_cols,
+                   Eigen::Index input_channels, Eigen::Index kernel_rows,
+                   Eigen::Index kernel_cols, Eigen::Index kernel_channels,
+                   Eigen::Index kernel_filters, Eigen::Index output_rows,
+                   Eigen::Index output_cols, Eigen::Index row_stride,
+                   Eigen::Index col_stride, Eigen::Index padding_top,
+                   Eigen::Index padding_bottom, Eigen::Index padding_left,
+                   Eigen::Index padding_right, Eigen::Index lhs_row_dilation,
+                   Eigen::Index lhs_col_dilation, Eigen::Index rhs_row_dilation,
+                   Eigen::Index rhs_col_dilation) {
   const Eigen::TensorMap<Eigen::Tensor<const ScalarType, 4, Eigen::RowMajor>,
                          Eigen::Aligned>
       input(lhs, input_batch, input_rows, input_cols, input_channels);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 02b2c32..bf55e9e 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 
 #include <stdint.h>
+
 #include <algorithm>
 #include <list>
 #include <utility>
@@ -28,7 +29,6 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Host.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h"
@@ -42,6 +42,7 @@
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -94,7 +95,7 @@
 SimpleOrcJIT::SimpleOrcJIT(
     const llvm::TargetOptions& target_options,
     llvm::CodeGenOpt::Level opt_level, bool optimize_for_size,
-    bool enable_fast_math, bool disable_expensive_passes,
+    bool disable_expensive_passes,
     LLVMCompiler::ModuleHook pre_optimization_hook,
     LLVMCompiler::ModuleHook post_optimization_hook,
     std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook)
@@ -130,11 +131,10 @@
           }),
       compile_layer_(
           object_layer_,
-          CompilerFunctor(target_machine_.get(), opt_level, optimize_for_size,
-                          enable_fast_math, disable_expensive_passes,
-                          std::move(pre_optimization_hook),
-                          std::move(post_optimization_hook),
-                          std::move(post_codegen_hook))),
+          CompilerFunctor(
+              target_machine_.get(), opt_level, optimize_for_size,
+              disable_expensive_passes, std::move(pre_optimization_hook),
+              std::move(post_optimization_hook), std::move(post_codegen_hook))),
       gdb_jit_event_listener_(
           llvm::JITEventListener::createGDBRegistrationListener()) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
@@ -147,16 +147,18 @@
     // On Mac OS X, 'name' may have a leading underscore prefix, even though the
     // registered name may not.
     std::string stripped_name(name.begin() + 1, name.end());
-    func_addr = CustomCallTargetRegistry::Global()->Lookup(stripped_name);
+    func_addr =
+        xla::CustomCallTargetRegistry::Global()->Lookup(stripped_name, "Host");
   } else {
-    func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+    func_addr = xla::CustomCallTargetRegistry::Global()->Lookup(name, "Host");
   }
 
   if (func_addr == nullptr) {
     LOG(ERROR)
         << "Unable to resolve runtime symbol: `" << name
         << "'.  Hint: if the symbol a custom call target, make sure you've "
-           "registered it with the JIT using REGISTER_CUSTOM_CALL_TARGET.";
+           "registered it with the JIT using "
+           "XLA_CPU_REGISTER_CUSTOM_CALL_TARGET.";
     return nullptr;
   }
   llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr),
@@ -210,14 +212,15 @@
 namespace {
 // Register some known symbols with the CustomCallTargetRegistry.
 bool RegisterKnownJITSymbols() {
-  CustomCallTargetRegistry* registry = CustomCallTargetRegistry::Global();
+  xla::CustomCallTargetRegistry* registry =
+      xla::CustomCallTargetRegistry::Global();
 
 #define REGISTER_CPU_RUNTIME_SYMBOL(base_name)                               \
   do {                                                                       \
     auto* function_address =                                                 \
         reinterpret_cast<void*>(__xla_cpu_runtime_##base_name);              \
     registry->Register(xla::cpu::runtime::k##base_name##SymbolName,          \
-                       function_address);                                    \
+                       function_address, "Host");                            \
     CHECK_EQ(absl::string_view(xla::cpu::runtime::k##base_name##SymbolName), \
              "__xla_cpu_runtime_" #base_name);                               \
   } while (false)
@@ -248,8 +251,10 @@
   REGISTER_CPU_RUNTIME_SYMBOL(TracingStart);
   REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd);
 
-  registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee));
-  registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee));
+  registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee),
+                     "Host");
+  registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee),
+                     "Host");
 
 #undef REGISTER_CPU_RUNTIME_SYMBOL
 
@@ -257,11 +262,12 @@
 // Unfortunately the double versions are overloaded on some systems, e.g.
 // Mac so we need an explicit cast. This requires passing the function signature
 // for that case.
-#define REGISTER_LIBM_SYMBOL(name, double_sig)                          \
-  do {                                                                  \
-    registry->Register(#name "f", reinterpret_cast<void*>(name##f));    \
-    registry->Register(                                                 \
-        #name, reinterpret_cast<void*>(static_cast<double_sig>(name))); \
+#define REGISTER_LIBM_SYMBOL(name, double_sig)                                 \
+  do {                                                                         \
+    registry->Register(#name "f", reinterpret_cast<void*>(name##f), "Host");   \
+    registry->Register(#name,                                                  \
+                       reinterpret_cast<void*>(static_cast<double_sig>(name)), \
+                       "Host");                                                \
   } while (false)
 
   REGISTER_LIBM_SYMBOL(acos, double (*)(double));
@@ -319,8 +325,9 @@
 #ifdef __APPLE__
   REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
   registry->Register("__sincosf_stret",
-                     reinterpret_cast<void*>(__sincosf_stret));
-  registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret));
+                     reinterpret_cast<void*>(__sincosf_stret), "Host");
+  registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret),
+                     "Host");
 #else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
 #endif
@@ -333,19 +340,19 @@
 
 #undef REGISTER_LIBM_SYMBOL
 
-  registry->Register("memcpy", reinterpret_cast<void*>(memcpy));
-  registry->Register("memmove", reinterpret_cast<void*>(memmove));
-  registry->Register("memset", reinterpret_cast<void*>(memset));
+  registry->Register("memcpy", reinterpret_cast<void*>(memcpy), "Host");
+  registry->Register("memmove", reinterpret_cast<void*>(memmove), "Host");
+  registry->Register("memset", reinterpret_cast<void*>(memset), "Host");
 
 #ifdef __APPLE__
-  registry->Register("__bzero", reinterpret_cast<void*>(bzero));
+  registry->Register("__bzero", reinterpret_cast<void*>(bzero), "Host");
   registry->Register("memset_pattern16",
-                     reinterpret_cast<void*>(memset_pattern16));
+                     reinterpret_cast<void*>(memset_pattern16), "Host");
 #endif
 
 #ifdef MEMORY_SANITIZER
   registry->Register("__msan_unpoison",
-                     reinterpret_cast<void*>(__msan_unpoison));
+                     reinterpret_cast<void*>(__msan_unpoison), "Host");
 #endif
 
   return true;
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 075a018..f9e845b 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -57,7 +57,7 @@
   SimpleOrcJIT(
       const llvm::TargetOptions& target_options,
       llvm::CodeGenOpt::Level opt_level, bool optimize_for_size,
-      bool enable_fast_math, bool disable_expensive_passes,
+      bool disable_expensive_passes,
       LLVMCompiler::ModuleHook pre_optimization_hook,
       LLVMCompiler::ModuleHook post_optimization_hook,
       std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook);
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 1bd4b59..b15ad1e 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -107,13 +107,19 @@
 llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a,
                                          const llvm::APFloat& low,
                                          const llvm::APFloat& high) {
+  CHECK(!low.isNaN());
+  CHECK(!high.isNaN());
+  CHECK(low.compare(high) == llvm::APFloat::cmpLessThan);
+
   AssertCorrectTypes({a});
   llvm::Type* type = a->getType();
-  CHECK(low.compare(high) == llvm::APFloat::cmpLessThan);
   CHECK(scalar_type_->isFloatingPointTy());
-  return llvm_ir::EmitFloatMin(
-      llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), b_),
-      GetConstantFloat(type, high), b_);
+
+  llvm::Value* low_value = GetConstantFloat(type, low);
+  llvm::Value* high_value = GetConstantFloat(type, high);
+  a = b_->CreateSelect(b_->CreateFCmpUGE(a, low_value), a, low_value);
+  a = b_->CreateSelect(b_->CreateFCmpULE(a, high_value), a, high_value);
+  return a;
 }
 
 llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs,
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index c444fd7..2f8be8c 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -100,8 +100,10 @@
 
   llvm::Value* Floor(llvm::Value* a);
 
+  // Precondition: Neither `low` nor `high` is nan.
   llvm::Value* Clamp(llvm::Value* a, const llvm::APFloat& low,
                      const llvm::APFloat& high);
+
   llvm::Value* SplatFloat(const llvm::APFloat& d) {
     return GetConstantFloat(vector_type(), d);
   }
diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc b/tensorflow/compiler/xla/service/custom_call_target_registry.cc
similarity index 73%
rename from tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc
rename to tensorflow/compiler/xla/service/custom_call_target_registry.cc
index 5f58038..e6a7021 100644
--- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc
+++ b/tensorflow/compiler/xla/service/custom_call_target_registry.cc
@@ -13,10 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 
 namespace xla {
-namespace cpu {
 
 CustomCallTargetRegistry* CustomCallTargetRegistry::Global() {
   static auto* registry = new CustomCallTargetRegistry;
@@ -24,16 +23,17 @@
 }
 
 void CustomCallTargetRegistry::Register(const std::string& symbol,
-                                        void* address) {
+                                        void* address,
+                                        const std::string& platform) {
   std::lock_guard<std::mutex> lock(mu_);
-  registered_symbols_[symbol] = address;
+  registered_symbols_[std::make_pair(symbol, platform)] = address;
 }
 
-void* CustomCallTargetRegistry::Lookup(const std::string& symbol) const {
+void* CustomCallTargetRegistry::Lookup(const std::string& symbol,
+                                       const std::string& platform) const {
   std::lock_guard<std::mutex> lock(mu_);
-  auto it = registered_symbols_.find(symbol);
+  auto it = registered_symbols_.find(std::make_pair(symbol, platform));
   return it == registered_symbols_.end() ? nullptr : it->second;
 }
 
-}  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/custom_call_target_registry.h b/tensorflow/compiler/xla/service/custom_call_target_registry.h
new file mode 100644
index 0000000..0623968
--- /dev/null
+++ b/tensorflow/compiler/xla/service/custom_call_target_registry.h
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_
+
+// This file is depended on by kernels that have to build for mobile devices.
+// For this reason, we avoid relying on TensorFlow and instead only use the
+// standard C++ library.
+
+#include <map>
+#include <mutex>  // NOLINT
+#include <string>
+
+namespace xla {
+
+// XLA JIT compilers use this registry to resolve symbolic CustomCall targets;
+// so when using XLA as a JIT, CustomCall targets need to be registered here
+// with the symbol name used in the CustomCall.
+//
+// The XLA:CPU ahead-of-time (AOT) compiler links using a standard offline
+// linker; so when compiling in CPU AOT mode, you *also* need to make sure the
+// name of the callee (presumably implemented in C++) matches up with the
+// symbolic name used in the CustomCall.
+//
+// We maintain the registry in both the JIT and the AOT cases for simplicity,
+// but we only use it when running in JIT mode.
+class CustomCallTargetRegistry {
+ public:
+  static CustomCallTargetRegistry* Global();
+
+  void Register(const std::string& symbol, void* address,
+                const std::string& platform);
+  void* Lookup(const std::string& symbol, const std::string& platform) const;
+
+ private:
+  // Maps the pair (symbol, platform) to a C function implementing a custom call
+  // named `symbol` for StreamExecutor platform `platform`.
+  //
+  // Different platforms have different ABIs.  TODO(jlebar): Describe them!
+  //
+  // (We use std::map rather than std::unordered_map because the STL doesn't
+  // provide a default hasher for pair<string, string>, and we want to avoid
+  // pulling in dependencies that might define this.)
+  std::map<std::pair<std::string, std::string>, void*> registered_symbols_;
+  mutable std::mutex mu_;
+};
+
+class RegisterCustomCallTarget {
+ public:
+  explicit RegisterCustomCallTarget(const std::string& name, void* address,
+                                    const std::string& platform) {
+    CustomCallTargetRegistry::Global()->Register(name, address, platform);
+  }
+};
+
+#define XLA_REGISTER_CUSTOM_CALL_CONCAT(a, b) a##b
+
+#define XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address,   \
+                                                        platform, counter) \
+  static ::xla::RegisterCustomCallTarget XLA_REGISTER_CUSTOM_CALL_CONCAT(  \
+      custom_call_target_register, counter)(                               \
+      symbol, reinterpret_cast<void*>(address), platform)
+
+#define XLA_REGISTER_CUSTOM_CALL_TARGET(function, platform) \
+  XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function, platform)
+
+#define XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address, platform)  \
+  XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, platform, \
+                                                  __COUNTER__)
+
+// Convenience overloads for registering custom-call targets on the CPU.
+#define XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(function) \
+  XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function, "Host")
+
+#define XLA_CPU_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address) \
+  XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address, "Host")
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
deleted file mode 100644
index e1e3b15..0000000
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-
-#include <string>
-
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-
-namespace xla {
-
-StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const se::Platform* platform,
-    absl::Span<se::StreamExecutor* const> stream_executors)
-    : DeviceMemoryAllocator(platform),
-      stream_executors_(stream_executors.begin(), stream_executors.end()) {}
-
-StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
-    int device_ordinal, uint64 size, bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                      GetStreamExecutor(device_ordinal));
-  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
-  if (size > 0 && result == nullptr) {
-    return ResourceExhausted(
-        "Failed to allocate request for %s (%uB) on device ordinal %d",
-        tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal);
-  }
-  VLOG(3) << absl::StreamFormat(
-      "Allocated %s (%uB) on device ordinal %d: %p",
-      tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal,
-      result.opaque());
-  return OwningDeviceMemory(result, device_ordinal, this);
-}
-
-Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
-                                                 se::DeviceMemoryBase mem) {
-  if (!mem.is_null()) {
-    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                        GetStreamExecutor(device_ordinal));
-    VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
-                                  mem.opaque(), device_ordinal);
-    stream_executor->Deallocate(&mem);
-  }
-  return Status::OK();
-}
-
-StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
-    int device_ordinal) {
-  if (device_ordinal < 0) {
-    return InvalidArgument("device ordinal value (%d) must be non-negative",
-                           device_ordinal);
-  }
-  if (device_ordinal >= stream_executors_.size()) {
-    return InvalidArgument(
-        "device ordinal value (%d) >= number of devices (%u)", device_ordinal,
-        stream_executors_.size());
-  }
-  if (stream_executors_[device_ordinal] == nullptr) {
-    return NotFound("Device %s:%d present but not supported",
-                    platform()->Name(), device_ordinal);
-  }
-  return stream_executors_[device_ordinal];
-}
-
-bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const {
-  return false;
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 874f9da..f45cda8 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -117,6 +117,7 @@
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
   virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0;
+  virtual Status HandlePartitionId(HloInstructionPtr hlo) = 0;
   virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
@@ -291,7 +292,7 @@
   // This call is purely a performance hint and can be omitted without
   // affecting correctness.
   void ReserveVisitStates(int num) { visit_state_.reserve(num); }
-  size_t VisitStateSize() const { return visit_state_.size(); }
+  size_t VisitStateCapacity() const { return visit_state_.capacity(); }
 
   // Useful when we want to visit the same computation more than once with the
   // same visitor.
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 79ce3f8..756ba90 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -109,6 +109,9 @@
   Status HandleReplicaId(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
+  Status HandlePartitionId(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc
index 06d0456..d251c82 100644
--- a/tensorflow/compiler/xla/service/dump.cc
+++ b/tensorflow/compiler/xla/service/dump.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/dump.h"
+
 #include "absl/strings/ascii.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -190,6 +191,10 @@
   if (opts.dump_as_text) {
     DumpToFileInDirOrStdoutImpl(StrCat(filename, ".txt"), module.ToString(),
                                 opts);
+    if (buffer_assn) {
+      DumpToFileInDirOrStdoutImpl(StrCat(filename, "-buffer-assignment.txt"),
+                                  buffer_assn->ToString(), opts);
+    }
   }
 
   if (opts.dump_as_proto) {
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index 1269e06..b2563f9 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -178,13 +179,14 @@
                int64 operand_index, HloInstruction* dynamic_size) {
         HloInstruction* reduce = hlo;
         int64 operand_count = reduce->operand_count();
+        bool is_variadic_reduce = operand_count > 2;
         CHECK_EQ(operand_count % 2, 0);
         if (operand_index >= operand_count / 2) {
           // Init values doesn't have dynamic size.
           return Status::OK();
         }
         if ((absl::c_count(reduce->dimensions(), dimension) != 0)) {
-          // Dimension is to be reduce, stop tracing.
+          // Dimension is to be reduced, stop tracing.
           return Status::OK();
         }
 
@@ -192,8 +194,21 @@
         int64 dimensions_not_reduced_count = 0;
         for (int i = 0; i < operand->shape().rank(); ++i) {
           if (dimension == i) {
-            parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
-                                    dynamic_size);
+            ShapeIndex result_index = {};
+
+            if (is_variadic_reduce) {
+              // The dimensions of all data operands of a variadic reduce have
+              // to be the same.  This means that if one operand of a variadic
+              // reduce has a dynamic dimension, we set all outputs to use the
+              // same dynamic size in corresponding dimensions.
+              for (int64 i = 0; i < operand_count / 2; ++i) {
+                parent_->SetDynamicSize(
+                    reduce, {i}, dimensions_not_reduced_count, dynamic_size);
+              }
+            } else {
+              parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
+                                      dynamic_size);
+            }
 
             return Status::OK();
           }
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
index d0f2998..a77aaca 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -83,6 +83,12 @@
   // by a scalar instruction `size`.
   void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim,
                       HloInstruction* size) {
+    Shape subshape = ShapeUtil::GetSubshape(inst->shape(), index);
+    CHECK(!subshape.IsTuple())
+        << "Can't set a tuple shape to dynamic dimension";
+    CHECK(dim < subshape.rank() && dim >= 0)
+        << "Asked to set invalid dynamic dimension. Shape: "
+        << subshape.ToString() << ", Dimension: " << dim;
     dynamic_mapping_.try_emplace(DynamicDimension{inst, index, dim}, size);
     auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst);
     iter.first->second.emplace(DynamicDimension{inst, index, dim});
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
index 46db5f7..a18c017 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -61,6 +61,26 @@
     return module_->AddEmbeddedComputation(embedded_builder.Build());
   }
 
+  HloComputation* GetAddTuple() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto lhs_1 =
+        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+            1, ShapeUtil::MakeShape(F32, {}), "lhs.1"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        2, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    auto rhs_1 =
+        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+            3, ShapeUtil::MakeShape(F32, {}), "rhs.1"));
+    auto add = embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    auto add_1 = embedded_builder.AddInstruction(HloInstruction::CreateBinary(
+        lhs->shape(), HloOpcode::kAdd, lhs_1, rhs_1));
+    embedded_builder.AddInstruction(HloInstruction::CreateTuple({add, add_1}));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
   HloComputation* GetGe() {
     auto embedded_builder = HloComputation::Builder("ge");
     auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
@@ -239,6 +259,47 @@
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
 }
 
+TEST_F(DynamicDimensionInferenceTest, VariadicReduce) {
+  // Handle variadic reduce where output is a tuple.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {1, 2});
+
+  auto data_param_dynamic = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto data_param_static = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, input_shape, "data_param.2"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, scalar_shape_, "size_param"));
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  auto dynamic_negate = builder.AddInstruction(HloInstruction::CreateUnary(
+      input_shape, HloOpcode::kNegate, data_param_dynamic));
+
+  auto static_negate = builder.AddInstruction(HloInstruction::CreateUnary(
+      input_shape, HloOpcode::kNegate, data_param_static));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      ShapeUtil::MakeTupleShape({reduce_shape, reduce_shape}),
+      {dynamic_negate, static_negate}, {init, init}, {1}, GetAddTuple()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  SCOPED_TRACE(module_->ToString());
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {0}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {1}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {0}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {1}, 0), nullptr);
+}
+
 TEST_F(DynamicDimensionInferenceTest, DotTest) {
   auto builder = HloComputation::Builder(TestName());
   constexpr int xdim = 3;
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc
index d2db921..95405cd 100644
--- a/tensorflow/compiler/xla/service/dynamic_padder.cc
+++ b/tensorflow/compiler/xla/service/dynamic_padder.cc
@@ -19,7 +19,6 @@
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
-
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -28,18 +27,19 @@
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
-
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
 
 namespace {
 
-// ChooseIdentityValue looks at the instruction and returns a identity value
-// which, when padded, doesn't change the result of the instruction.
+// ChooseIdentityValue looks at the instruction's operand and returns an
+// identity value which, when padded, doesn't change the result of the
+// instruction.
 //
 // nullopt is returned if padding doesn't need to be reset.
-StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
+StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst,
+                                              int64 operand_number) {
   HloComputation* comp = inst->parent();
   // Padding on elementwise operation doesn't affect the result of the effective
   // data.
@@ -48,7 +48,14 @@
   }
 
   switch (inst->opcode()) {
-    case HloOpcode::kReduce:
+    case HloOpcode::kReduce: {
+      TF_RET_CHECK(operand_number < inst->operand_count() / 2)
+          << "Only data operand with dynamic dimension is valid.";
+      // Variadic reduce has a different init value for each operand; given a
+      // data operand number, find the init value index.
+      int64 init_value_index = inst->operand_count() / 2 + operand_number;
+      return inst->mutable_operand(init_value_index);
+    }
     case HloOpcode::kReduceWindow: {
       // Because of the way we do reduce, we already require the `init` operand
       // of hlo reduce instruction to be identity value. Here we reuse the
@@ -137,7 +144,7 @@
           }
 
           TF_ASSIGN_OR_RETURN(HloInstruction * identity_value,
-                              ChooseIdentityValue(inst));
+                              ChooseIdentityValue(inst, operand_num));
           if (identity_value == nullptr) {
             continue;
           }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index d10e37a..664fdca 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -442,7 +442,9 @@
                                           {operand_value},
                                           {operand_value->getType()}, b_);
     case HloOpcode::kRoundNearestAfz:
-      return EmitRoundNearestAfz(op->shape().element_type(), operand_value);
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round,
+                                          {operand_value},
+                                          {operand_value->getType()}, b_);
     case HloOpcode::kSign: {
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
@@ -1139,12 +1141,6 @@
   return Select(x_is_small, for_small_x, for_large_x);
 }
 
-StatusOr<llvm::Value*> ElementalIrEmitter::EmitRoundNearestAfz(
-    PrimitiveType /*prim_type*/, llvm::Value* value) {
-  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round, {value},
-                                      {value->getType()}, b_);
-}
-
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                                    llvm::Value* lhs,
                                                    llvm::Value* rhs) {
@@ -1470,11 +1466,16 @@
       }
     }
     case RNG_NORMAL: {
+      // Convert uniform x in (0, 1] to normal using formula:
+      //   Normal(x, mu, sigma) = mu + sqrt(2)*sigma*ErfcInv(2x)
+      //                        = mu + sqrt(2)*sigma*ErfInv(1-2x)
       TF_ASSIGN_OR_RETURN(
           llvm::Value * r,
           EmitErfcInv(elem_prim_ty, FMul(llvm::ConstantFP::get(elem_ir_ty, 2.0),
                                          elem_value)));
-      return FAdd(FMul(r, b_or_sigma), a_or_mean);
+      return FAdd(FMul(llvm::ConstantFP::get(r->getType(), std::sqrt(2.0)),
+                       FMul(r, b_or_sigma)),
+                  a_or_mean);
     }
     default:
       return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 6b1c85b..6b3844c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -146,9 +146,6 @@
   virtual StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
                                           llvm::Value* value);
 
-  virtual StatusOr<llvm::Value*> EmitRoundNearestAfz(PrimitiveType prim_type,
-                                                     llvm::Value* value);
-
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x);
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index a08ec18..e716295 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -24,13 +24,11 @@
 #include "absl/types/variant.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -40,6 +38,8 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
@@ -47,13 +47,13 @@
 // leftover buffers to be released by the caller.
 struct ExecutionOutput {
   ExecutionOutput(ScopedShapedBuffer result,
-                  std::vector<OwningDeviceMemory> to_be_released)
+                  std::vector<se::OwningDeviceMemory> to_be_released)
       : result(std::move(result)), to_be_released(std::move(to_be_released)) {}
   ScopedShapedBuffer result;
 
   // Leftover buffers for the caller to release. Elements in this list are
   // donated input memory buffers that are not reused by XLA as outputs.
-  std::vector<OwningDeviceMemory> to_be_released;
+  std::vector<se::OwningDeviceMemory> to_be_released;
 };
 
 // A given platform's compiler will produce an Executable -- this is a uniform
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 2cc4784..8305fe9 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1,7 +1,6 @@
 # Description:
 #   GPU-specific components in XLA service implementation.
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -86,6 +85,24 @@
 #    ],
 #)
 
+tf_cc_test(
+    name = "custom_call_test",
+    srcs = ["custom_call_test.cc"],
+    tags = ["requires-gpu-sm35"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:test",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
 cc_library(
     name = "stream_assignment",
     srcs = ["stream_assignment.cc"],
@@ -195,6 +212,7 @@
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:buffer_assignment",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
@@ -283,10 +301,10 @@
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:buffer_assignment",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
@@ -329,6 +347,7 @@
         ":buffer_allocations",
         ":hlo_execution_profiler",
         ":thunk",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/synchronization",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:buffer_assignment",
@@ -350,6 +369,7 @@
         "convolution_thunk.cc",
         "copy_thunk.cc",
         "cudnn_batchnorm_thunk.cc",
+        "custom_call_thunk.cc",
         "fft_thunk.cc",
         "for_thunk.cc",
         "gemm_thunk.cc",
@@ -370,6 +390,7 @@
         "convolution_thunk.h",
         "copy_thunk.h",
         "cudnn_batchnorm_thunk.h",
+        "custom_call_thunk.h",
         "fft_thunk.h",
         "for_thunk.h",
         "gemm_thunk.h",
@@ -408,7 +429,6 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:buffer_assignment",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
@@ -420,14 +440,18 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core/kernels:gpu_utils",
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
+        "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:blas",
         "//tensorflow/stream_executor:device_memory",
+        "//tensorflow/stream_executor:device_memory_allocator",
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor/cuda:cuda_stream",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -437,6 +461,7 @@
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@local_config_cuda//cuda:cuda_headers",
     ],
 )
 
@@ -475,7 +500,6 @@
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
@@ -483,8 +507,8 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:logger",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core/kernels:conv_ops",
         "//tensorflow/core/util/proto:proto_utils",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -500,8 +524,8 @@
     deps = [
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -511,10 +535,19 @@
     hdrs = ["redzone_allocator.h"],
     deps = [
         ":gpu_constants",
+        ":partition_assignment",
+        ":stream_executor_util",
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory",
+        "//tensorflow/stream_executor:device_memory_allocator",
+        "//tensorflow/stream_executor:stream_executor_headers",
     ],
 )
 
@@ -526,11 +559,13 @@
         ":redzone_allocator",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:kernel",
         "//tensorflow/stream_executor/cuda:cuda_activation",
@@ -623,12 +658,12 @@
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/types:optional",
     ],
 )
@@ -923,6 +958,7 @@
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:slice_sinker",
         "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:stable_sort_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
@@ -938,6 +974,8 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor/cuda:cuda_diagnostics",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
@@ -1111,7 +1149,17 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/core:cuda_libdevice_path",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:kernel_spec",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -1135,30 +1183,29 @@
     hdrs = ["buffer_comparator.h"],
     deps = [
         ":gpu_executable",
+        ":partition_assignment",
+        ":stream_executor_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
-        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/strings",
     ],
 )
 
-xla_test(
+tf_cc_test(
     name = "buffer_comparator_test",
     srcs = ["buffer_comparator_test.cc"],
-    backends = [
-        "cpu",
-        "gpu",
-    ],
+    tags = tf_cuda_tests_tags(),
     deps = [
         ":buffer_comparator",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/stream_executor:device_memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index f46a1bc..3afc18d 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -39,7 +39,7 @@
 
 StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
     const BufferAssignment* buffer_assignment, int device_ordinal,
-    DeviceMemoryAllocator* memory_allocator) {
+    se::DeviceMemoryAllocator* memory_allocator) {
   const int64 num_buffers = buffer_assignment->Allocations().size();
   auto buffer_allocations = absl::WrapUnique(new BufferAllocations(
       num_buffers, device_ordinal, memory_allocator, buffer_assignment));
@@ -77,7 +77,7 @@
       const int64 buffer_size = allocation.size();
       se::DeviceMemoryBase buffer_address;
       if (buffer_size > 0) {
-        OwningDeviceMemory buffer;
+        se::OwningDeviceMemory buffer;
         TF_ASSIGN_OR_RETURN(
             buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
         if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 9413ac2..cf78b92 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -23,9 +23,9 @@
 #include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -50,7 +50,7 @@
     // memory on.
     StatusOr<std::unique_ptr<BufferAllocations>> Build(
         const BufferAssignment* buffer_assignment, int device_ordinal,
-        DeviceMemoryAllocator* memory_allocator);
+        se::DeviceMemoryAllocator* memory_allocator);
 
    private:
     absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
@@ -62,7 +62,9 @@
   BufferAllocations(const BufferAllocations&) = delete;
   BufferAllocations& operator=(const BufferAllocations&) = delete;
 
-  DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_; }
+  se::DeviceMemoryAllocator* memory_allocator() const {
+    return memory_allocator_;
+  }
   int device_ordinal() const { return device_ordinal_; }
 
   // Returns the device address of buffer `buffer_index`. `buffer_index` must be
@@ -84,7 +86,7 @@
 
  private:
   BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
-                    DeviceMemoryAllocator* memory_allocator,
+                    se::DeviceMemoryAllocator* memory_allocator,
                     const BufferAssignment* buffer_assignment)
       : buffers_(buffer_count),
         device_ordinal_(device_ordinal),
@@ -104,7 +106,7 @@
   se::DeviceMemoryBase temp_buffer_base_;
 
   int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
+  se::DeviceMemoryAllocator* memory_allocator_;
   const BufferAssignment* buffer_assignment_;
   bool torn_down_ = false;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
index 24272cf..5f3b3b4 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
@@ -15,233 +15,387 @@
 
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 
+#include <algorithm>
 #include <cmath>
+
 #include "absl/strings/str_replace.h"
-#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 namespace xla {
 namespace gpu {
 
 static constexpr double kTolerance = 0.1f;
 
-static StatusOr<string> GetCompHloText(const Shape& shape) {
-  // Implements the textual format of the comparison routine, as it's more
-  // readable.
-  //
-  // This text template takes three substitution parameters:
-  // ${ORIG_TYPE}: buffer element type.
-  // ${CMP_TYPE}: intermediate element type for calculating numeric differences.
-  // ${SIZE}: number of elements.
-  // ${CLAMP_TO}: Clamp the value to [-$CLAMP_TO, $CLAMP_TO].
-  static constexpr char kCompHloText[] = R"(
-HloModule Compare_${ORIG_TYPE}_${CMP_TYPE}_${SIZE}_${CLAMP_TO}
-
-Max {
-  %lhs = ${CMP_TYPE}[] parameter(0)
-  %rhs = ${CMP_TYPE}[] parameter(1)
-  ROOT %max = ${CMP_TYPE}[] maximum(%lhs, %rhs)
-}
-
-Canonicalize (aparam: ${ORIG_TYPE}[${SIZE}]) -> ${CMP_TYPE}[${SIZE}] {
-  %min_constant = ${CMP_TYPE}[] constant(-${CLAMP_TO})
-  %max_constant = ${CMP_TYPE}[] constant(${CLAMP_TO})
-  %min_values = ${CMP_TYPE}[${SIZE}] broadcast(%min_constant), dimensions={}
-  %max_values = ${CMP_TYPE}[${SIZE}] broadcast(%max_constant), dimensions={}
-
-  %a = ${ORIG_TYPE}[${SIZE}] parameter(0)
-  %converted = ${CMP_TYPE}[${SIZE}] convert(%a)
-  ROOT %clamped = ${CMP_TYPE}[${SIZE}] clamp(%min_values, %converted, %max_values)
-}
-
-// RelError(x, y) = abs(x - y) / (max(abs(x), abs(y)) + 1)
-// x and y must be finite.
-RelError (aparam: ${CMP_TYPE}[${SIZE}], bparam: ${CMP_TYPE}[${SIZE}]) -> ${CMP_TYPE}[${SIZE}] {
-  %lhs = ${CMP_TYPE}[${SIZE}] parameter(0)
-  %rhs = ${CMP_TYPE}[${SIZE}] parameter(1)
-  %one_constant = ${CMP_TYPE}[] constant(1.0)
-  %ones = ${CMP_TYPE}[${SIZE}] broadcast(%one_constant), dimensions={}
-
-  %sub = ${CMP_TYPE}[${SIZE}] subtract(%lhs, %rhs)
-  %sub_abs = ${CMP_TYPE}[${SIZE}] abs(%sub)
-  %lhs_abs = ${CMP_TYPE}[${SIZE}] abs(%lhs)
-  %rhs_abs = ${CMP_TYPE}[${SIZE}] abs(%rhs)
-  %max = ${CMP_TYPE}[${SIZE}] maximum(%lhs_abs, %rhs_abs)
-  %denominator = ${CMP_TYPE}[${SIZE}] add(%max, %ones)
-  ROOT %error = ${CMP_TYPE}[${SIZE}] divide(%sub_abs, %denominator)
-}
-
-// Here is the chain-style definition of this function:
-//   Error(NaN, NaN) = 0
-//   Error(Inf, Inf) = 0
-//   Error(-Inf, -Inf) = 0
-//   Error(NonFinite, x) = Inf
-//   Error(x, NonFinite) = Inf
-//   Error(x, y) = RelError(x, y)
-// , where the early matched pattern takes precedence.
+// Comparison kernel code: compare two buffers of fp16/fp32/fp64 of length
+// buffer_length. Two elements match when their relative error does not exceed
+// the passed rel_error_threshold. The number of mismatches is added to the out
+// parameter mismatch_count.
 //
-// To implement this, we start from the bottom, and keep using select to
-// overwrite previously picked values. The last value produced by a matched
-// pattern is the final value.
-Error (aparam: ${CMP_TYPE}[${SIZE}], bparam: ${CMP_TYPE}[${SIZE}]) -> ${CMP_TYPE}[${SIZE}] {
-  %lhs = ${CMP_TYPE}[${SIZE}] parameter(0)
-  %rhs = ${CMP_TYPE}[${SIZE}] parameter(1)
-  %zero_constant = ${CMP_TYPE}[] constant(0.0)
-  %inf_constant = ${CMP_TYPE}[] constant(inf)
-  %zeros = ${CMP_TYPE}[${SIZE}] broadcast(%zero_constant), dimensions={}
-  %infs = ${CMP_TYPE}[${SIZE}] broadcast(%inf_constant), dimensions={}
+// NaNs are considered equal, and for fp16 inputs all values are clamped to
+// [-65505, 65505] to avoid miscomparisons due to overflows.
+//
+// The PTX below is compiled from the following CUDA code:
+//
+// #include<cuda_fp16.h>
+// extern "C" { // avoid name mangling
+// __device__ float canonicalize(float input) {
+//   // All fp16 infinities are treated as 65505 or -65505, in order to avoid
+//   // differences due to overflows.
+//   return isnan(input) ? input : max(-65505.0f, min(input, 65505.0f));
+// }
+//
+// __global__ void __xla_fp16_comparison(__half* buffer_a, __half* buffer_b,
+//                                       float rel_error_threshold,
+//                                       unsigned long long buffer_length,
+//                                       int* mismatch_count) {
+//   int idx = threadIdx.x + blockIdx.x * blockDim.x;
+//   if (idx >= buffer_length) return;
+//   float elem_a = __half2float(buffer_a[idx]);
+//   float elem_b = __half2float(buffer_b[idx]);
+//   elem_a = canonicalize(elem_a);
+//   elem_b = canonicalize(elem_b);
+//   if (isnan(elem_a) && isnan(elem_b)) return;
+//   float rel_error = abs(elem_a - elem_b)
+//       / (max(abs(elem_a), abs(elem_b)) + 1);
+//   if (rel_error > rel_error_threshold || isnan(rel_error))
+//     atomicAdd(mismatch_count, 1);
+// }
+//
+// __global__ void __xla_fp32_comparison(float* buffer_a, float* buffer_b,
+//                                       float rel_error_threshold,
+//                                       unsigned long long buffer_length,
+//                                       int* mismatch_count) {
+//   int idx = threadIdx.x + blockIdx.x * blockDim.x;
+//   if (idx >= buffer_length) return;
+//   float elem_a = buffer_a[idx];
+//   float elem_b = buffer_b[idx];
+//   if (isnan(elem_a) && isnan(elem_b)) return;
+//   if (isinf(elem_a) && isinf(elem_b) && signbit(elem_a) == signbit(elem_b))
+//     return;
+//   float rel_error = abs(elem_a - elem_b)
+//       / (max(abs(elem_a), abs(elem_b)) + 1);
+//   if (rel_error > rel_error_threshold || isnan(rel_error))
+//     atomicAdd(mismatch_count, 1);
+// }
+//
+// __global__ void __xla_fp64_comparison(double* buffer_a, double* buffer_b,
+//                                       float rel_error_threshold,
+//                                       unsigned long long buffer_length,
+//                                       int* mismatch_count) {
+//   int idx = threadIdx.x + blockIdx.x * blockDim.x;
+//   if (idx >= buffer_length) return;
+//   double elem_a = buffer_a[idx];
+//   double elem_b = buffer_b[idx];
+//   if (isnan(elem_a) && isnan(elem_b)) return;
+//   if (isinf(elem_a) && isinf(elem_b) && signbit(elem_a) == signbit(elem_b))
+//     return;
+//   double rel_error = abs(elem_a - elem_b)
+//       / (max(abs(elem_a), abs(elem_b)) + 1);
+//   if (rel_error > rel_error_threshold || isnan(rel_error))
+//     atomicAdd(mismatch_count, 1);
+// }
+// } // end extern declaration.
+static const char* buffer_compare_ptx = R"(
+.version 4.2
+.target sm_30
+.address_size 64
 
-  %lhs_is_finite = pred[${SIZE}] is-finite(%lhs)
-  %lhs_is_not_finite = pred[${SIZE}] not(%lhs_is_finite)
-  %lhs_is_not_nan = pred[${SIZE}] compare(%lhs, %lhs), direction=EQ
-  %lhs_is_nan = pred[${SIZE}] not(%lhs_is_not_nan)
-  %lhs_is_inf = pred[${SIZE}] and(%lhs_is_not_finite, %lhs_is_not_nan)
-  %lhs_is_non_neg = pred[${SIZE}] compare(%lhs, %zeros), direction=GE
+.visible .entry __xla_fp16_comparison(
+  .param .u64 __xla_fp16_comparison_param_0,
+  .param .u64 __xla_fp16_comparison_param_1,
+  .param .f32 __xla_fp16_comparison_param_2,
+  .param .u64 __xla_fp16_comparison_param_3,
+  .param .u64 __xla_fp16_comparison_param_4
+)
+{
+  .reg .pred   %p<10>;
+  .reg .b16   %rs<3>;
+  .reg .f32   %f<20>;
+  .reg .b32   %r<6>;
+  .reg .b64   %rd<12>;
+  ld.param.u64   %rd8, [__xla_fp16_comparison_param_3];
+  mov.u32   %r1, %tid.x;
+  mov.u32   %r2, %ctaid.x;
+  mov.u32   %r3, %ntid.x;
+  mad.lo.s32   %r4, %r3, %r2, %r1;
+  cvt.s64.s32   %rd4, %r4;
+  setp.ge.u64   %p1, %rd4, %rd8;
+  @%p1 bra   LBB7_4;
+  ld.param.u64   %rd5, [__xla_fp16_comparison_param_0];
+  ld.param.u64   %rd7, [__xla_fp16_comparison_param_1];
+  cvta.to.global.u64   %rd2, %rd7;
+  cvta.to.global.u64   %rd3, %rd5;
+  shl.b64   %rd9, %rd4, 1;
+  add.s64   %rd10, %rd3, %rd9;
+  ld.global.u16   %rs1, [%rd10];
+  // begin inline asm
+  {  cvt.f32.f16 %f6, %rs1;}
 
-  %rhs_is_finite = pred[${SIZE}] is-finite(%rhs)
-  %rhs_is_not_finite = pred[${SIZE}] not(%rhs_is_finite)
-  %rhs_is_not_nan = pred[${SIZE}] compare(%rhs, %rhs), direction=EQ
-  %rhs_is_nan = pred[${SIZE}] not(%rhs_is_not_nan)
-  %rhs_is_inf = pred[${SIZE}] and(%rhs_is_not_finite, %rhs_is_not_nan)
-  %rhs_is_non_neg = pred[${SIZE}] compare(%rhs, %zeros), direction=GE
+  // end inline asm
+  add.s64   %rd11, %rd2, %rd9;
+  ld.global.u16   %rs2, [%rd11];
+  // begin inline asm
+  {  cvt.f32.f16 %f7, %rs2;}
 
-  %both_same_sign = pred[${SIZE}] and(%lhs_is_non_neg, %rhs_is_non_neg)
-  %both_inf = pred[${SIZE}] and(%lhs_is_inf, %rhs_is_inf)
-  %both_same_sign_inf = pred[${SIZE}] and(%both_same_sign, %both_inf)
-  %both_nan = pred[${SIZE}] and(%lhs_is_nan, %rhs_is_nan)
+  // end inline asm
+  abs.f32   %f8, %f6;
+  setp.gtu.f32   %p2, %f8, 0f7F800000;
+  min.f32   %f9, %f6, 0f477FE100;
+  max.f32   %f10, %f9, 0fC77FE100;
+  selp.f32   %f1, %f6, %f10, %p2;
+  abs.f32   %f11, %f7;
+  setp.gtu.f32   %p3, %f11, 0f7F800000;
+  min.f32   %f12, %f7, 0f477FE100;
+  max.f32   %f13, %f12, 0fC77FE100;
+  selp.f32   %f2, %f7, %f13, %p3;
+  abs.f32   %f3, %f1;
+  setp.gtu.f32   %p4, %f3, 0f7F800000;
+  abs.f32   %f4, %f2;
+  setp.gtu.f32   %p5, %f4, 0f7F800000;
+  and.pred    %p6, %p4, %p5;
+  @%p6 bra   LBB7_4;
+  ld.param.f32   %f5, [__xla_fp16_comparison_param_2];
+  sub.f32   %f14, %f1, %f2;
+  abs.f32   %f15, %f14;
+  max.f32   %f16, %f3, %f4;
+  add.f32   %f17, %f16, 0f3F800000;
+  div.rn.f32   %f18, %f15, %f17;
+  setp.leu.f32   %p7, %f18, %f5;
+  abs.f32   %f19, %f18;
+  setp.le.f32   %p8, %f19, 0f7F800000;
+  and.pred    %p9, %p7, %p8;
+  @%p9 bra   LBB7_4;
+  ld.param.u64   %rd6, [__xla_fp16_comparison_param_4];
+  cvta.to.global.u64   %rd1, %rd6;
+  atom.global.add.u32   %r5, [%rd1], 1;
+LBB7_4:
+  ret;
 
-  // Reverse-order selections
-
-  // Error(x, y) = RelError(x, y)
-  %rel_error = ${CMP_TYPE}[${SIZE}] call(%lhs, %rhs), to_apply=RelError
-  // Error(x, NonFinite) = Inf
-  %after_x_non_finite = ${CMP_TYPE}[${SIZE}] select(%rhs_is_not_finite, %infs, %rel_error)
-  // Error(NonFinite, x) = Inf
-  %after_non_finite_x = ${CMP_TYPE}[${SIZE}] select(%lhs_is_not_finite, %infs, %after_x_non_finite)
-  // Error(-Inf, -Inf) = 0
-  // Error(Inf, Inf) = 0
-  %after_both_same_sign_inf = ${CMP_TYPE}[${SIZE}] select(%both_same_sign_inf, %zeros, %after_non_finite_x)
-  // Error(NaN, NaN) = 0
-  ROOT %after_both_nan = ${CMP_TYPE}[${SIZE}] select(%both_nan, %zeros, %after_both_same_sign_inf)
 }
+  // .globl  __xla_fp32_comparison
+.visible .entry __xla_fp32_comparison(
+  .param .u64 __xla_fp32_comparison_param_0,
+  .param .u64 __xla_fp32_comparison_param_1,
+  .param .f32 __xla_fp32_comparison_param_2,
+  .param .u64 __xla_fp32_comparison_param_3,
+  .param .u64 __xla_fp32_comparison_param_4
+)
+{
+  .reg .pred   %p<12>;
+  .reg .f32   %f<12>;
+  .reg .b32   %r<9>;
+  .reg .b64   %rd<12>;
 
-ENTRY MaxDifference {
-  %zero_constant = ${CMP_TYPE}[] constant(0.0)
+  ld.param.u64   %rd8, [__xla_fp32_comparison_param_3];
+  mov.u32   %r1, %tid.x;
+  mov.u32   %r2, %ctaid.x;
+  mov.u32   %r3, %ntid.x;
+  mad.lo.s32   %r4, %r3, %r2, %r1;
+  cvt.s64.s32   %rd4, %r4;
+  setp.ge.u64   %p1, %rd4, %rd8;
+  @%p1 bra   LBB8_6;
+  ld.param.u64   %rd5, [__xla_fp32_comparison_param_0];
+  ld.param.u64   %rd7, [__xla_fp32_comparison_param_1];
+  cvta.to.global.u64   %rd2, %rd7;
+  cvta.to.global.u64   %rd3, %rd5;
+  shl.b64   %rd9, %rd4, 2;
+  add.s64   %rd10, %rd3, %rd9;
+  ld.global.f32   %f1, [%rd10];
+  add.s64   %rd11, %rd2, %rd9;
+  ld.global.f32   %f2, [%rd11];
+  abs.f32   %f3, %f1;
+  setp.gtu.f32   %p2, %f3, 0f7F800000;
+  abs.f32   %f4, %f2;
+  setp.gtu.f32   %p3, %f4, 0f7F800000;
+  and.pred    %p4, %p2, %p3;
+  @%p4 bra   LBB8_6;
+  setp.neu.f32   %p5, %f3, 0f7F800000;
+  setp.neu.f32   %p6, %f4, 0f7F800000;
+  or.pred    %p7, %p5, %p6;
+  @%p7 bra   LBB8_4;
+  mov.b32   %r5, %f1;
+  mov.b32   %r6, %f2;
+  xor.b32    %r7, %r6, %r5;
+  setp.gt.s32   %p8, %r7, -1;
+  @%p8 bra   LBB8_6;
+LBB8_4:
+  ld.param.f32   %f5, [__xla_fp32_comparison_param_2];
+  sub.f32   %f6, %f1, %f2;
+  abs.f32   %f7, %f6;
+  max.f32   %f8, %f3, %f4;
+  add.f32   %f9, %f8, 0f3F800000;
+  div.rn.f32   %f10, %f7, %f9;
+  setp.leu.f32   %p9, %f10, %f5;
+  abs.f32   %f11, %f10;
+  setp.le.f32   %p10, %f11, 0f7F800000;
+  and.pred    %p11, %p9, %p10;
+  @%p11 bra   LBB8_6;
+  ld.param.u64   %rd6, [__xla_fp32_comparison_param_4];
+  cvta.to.global.u64   %rd1, %rd6;
+  atom.global.add.u32   %r8, [%rd1], 1;
+LBB8_6:
+  ret;
 
-  %lhs = ${ORIG_TYPE}[${SIZE}] parameter(0)
-  %rhs = ${ORIG_TYPE}[${SIZE}] parameter(1)
-  %lhs_canonical = ${CMP_TYPE}[${SIZE}] call(%lhs), to_apply=Canonicalize
-  %rhs_canonical = ${CMP_TYPE}[${SIZE}] call(%rhs), to_apply=Canonicalize
-  %error = ${CMP_TYPE}[${SIZE}] call(%lhs_canonical, %rhs_canonical), to_apply=Error
-  %max_diff = ${CMP_TYPE}[] reduce(%error, %zero_constant), dimensions={0}, to_apply=Max
-  ROOT %converted_max_diff = f64[] convert(%max_diff)
-})";
+}
+  // .globl  __xla_fp64_comparison
+.visible .entry __xla_fp64_comparison(
+  .param .u64 __xla_fp64_comparison_param_0,
+  .param .u64 __xla_fp64_comparison_param_1,
+  .param .f32 __xla_fp64_comparison_param_2,
+  .param .u64 __xla_fp64_comparison_param_3,
+  .param .u64 __xla_fp64_comparison_param_4
+)
+{
+  .reg .pred   %p<16>;
+  .reg .f32   %f<2>;
+  .reg .b32   %r<13>;
+  .reg .f64   %fd<12>;
+  .reg .b64   %rd<12>;
 
-  absl::string_view orig_type;
-  absl::string_view cmp_type;
-  string clamp_to;
-
-  switch (shape.element_type()) {
-    case xla::F16:
-      orig_type = "f16";
-      cmp_type = "f32";
-      // Clamp fp16s to 65505, since they actually overflow a lot in practice.
-      // This way, +infs and values like 65504 are considered be within
-      // tolerance.
-      clamp_to = "65505";
-      break;
-    case xla::F32:
-      orig_type = "f32";
-      cmp_type = "f32";
-      clamp_to = "inf";
-      break;
-    case xla::F64:
-      orig_type = "f64";
-      cmp_type = "f64";
-      clamp_to = "inf";
-      break;
-    default:
-      return Unimplemented("Unimplemented element type");
+  ld.param.u64   %rd8, [__xla_fp64_comparison_param_3];
+  mov.u32   %r2, %tid.x;
+  mov.u32   %r3, %ctaid.x;
+  mov.u32   %r4, %ntid.x;
+  mad.lo.s32   %r5, %r4, %r3, %r2;
+  cvt.s64.s32   %rd4, %r5;
+  setp.ge.u64   %p1, %rd4, %rd8;
+  @%p1 bra   LBB9_6;
+  ld.param.u64   %rd5, [__xla_fp64_comparison_param_0];
+  ld.param.u64   %rd7, [__xla_fp64_comparison_param_1];
+  cvta.to.global.u64   %rd2, %rd7;
+  cvta.to.global.u64   %rd3, %rd5;
+  shl.b64   %rd9, %rd4, 3;
+  add.s64   %rd10, %rd3, %rd9;
+  ld.global.f64   %fd1, [%rd10];
+  add.s64   %rd11, %rd2, %rd9;
+  ld.global.f64   %fd2, [%rd11];
+  abs.f64   %fd3, %fd1;
+  setp.gtu.f64   %p2, %fd3, 0d7FF0000000000000;
+  abs.f64   %fd4, %fd2;
+  setp.gtu.f64   %p3, %fd4, 0d7FF0000000000000;
+  and.pred    %p4, %p2, %p3;
+  @%p4 bra   LBB9_6;
+  {
+  .reg .b32 %temp; 
+  mov.b64   {%r6, %temp}, %fd1;
   }
-
-  string size_str = absl::StrCat(ShapeUtil::ElementsIn(shape));
-  return absl::StrReplaceAll(kCompHloText, {
-                                               {"${ORIG_TYPE}", orig_type},
-                                               {"${CMP_TYPE}", cmp_type},
-                                               {"${SIZE}", size_str},
-                                               {"${CLAMP_TO}", clamp_to},
-                                           });
+  {
+  .reg .b32 %temp; 
+  mov.b64   {%temp, %r1}, %fd1;
+  }
+  and.b32    %r7, %r1, 2147483647;
+  setp.ne.s32   %p5, %r7, 2146435072;
+  setp.ne.s32   %p6, %r6, 0;
+  or.pred    %p7, %p6, %p5;
+  @%p7 bra   LBB9_4;
+  {
+  .reg .b32 %temp; 
+  mov.b64   {%r8, %temp}, %fd2;
+  }
+  {
+  .reg .b32 %temp; 
+  mov.b64   {%temp, %r9}, %fd2;
+  }
+  and.b32    %r10, %r9, 2147483647;
+  setp.eq.s32   %p8, %r10, 2146435072;
+  setp.eq.s32   %p9, %r8, 0;
+  and.pred    %p10, %p8, %p9;
+  xor.b32    %r11, %r9, %r1;
+  setp.gt.s32   %p11, %r11, -1;
+  and.pred    %p12, %p11, %p10;
+  @%p12 bra   LBB9_6;
+LBB9_4:
+  ld.param.f32   %f1, [__xla_fp64_comparison_param_2];
+  sub.f64   %fd5, %fd1, %fd2;
+  abs.f64   %fd6, %fd5;
+  max.f64   %fd7, %fd3, %fd4;
+  add.f64   %fd8, %fd7, 0d3FF0000000000000;
+  div.rn.f64   %fd9, %fd6, %fd8;
+  cvt.f64.f32   %fd10, %f1;
+  setp.leu.f64   %p13, %fd9, %fd10;
+  abs.f64   %fd11, %fd9;
+  setp.le.f64   %p14, %fd11, 0d7FF0000000000000;
+  and.pred    %p15, %p13, %p14;
+  @%p15 bra   LBB9_6;
+  ld.param.u64   %rd6, [__xla_fp64_comparison_param_4];
+  cvta.to.global.u64   %rd1, %rd6;
+  atom.global.add.u32   %r12, [%rd1], 1;
+LBB9_6:
+  ret;
 }
+)";
 
-StatusOr<BufferComparator> BufferComparator::Create(
-    const Shape& shape, se::StreamExecutor* stream_exec, Compiler* compiler) {
-  // One may consider using hlo_runner to do all the compilation and execution.
-  // However, as of the time hlo_runner doesn't support injection for Compiler*,
-  // or Stream*. We may revisit this in the future if it
-  // proves to be a maintenance burden.
-  TF_ASSIGN_OR_RETURN(
-      auto exec, ([&]() -> StatusOr<std::unique_ptr<Executable>> {
-        HloModuleConfig config;
-        DebugOptions debug_options;
-        debug_options.set_xla_backend_optimization_level(2);
-        config.set_debug_options(debug_options);
-        TF_ASSIGN_OR_RETURN(string hlo_text, GetCompHloText(shape));
-        TF_ASSIGN_OR_RETURN(auto module, ParseHloString(hlo_text, config));
-        TF_ASSIGN_OR_RETURN(
-            module,
-            compiler->RunHloPasses(std::move(module), stream_exec, nullptr));
-        return compiler->RunBackend(std::move(module), stream_exec, nullptr);
-      }()));
+template <typename ElementT>
+using ComparisonKernelT =
+    se::TypedKernel<se::DeviceMemory<ElementT>, se::DeviceMemory<ElementT>,
+                    float, uint64, se::DeviceMemory<uint64>>;
 
-  return BufferComparator(shape, std::move(exec));
-}
+// Compares two buffers on the GPU.
+//
+// Returns `true` if two buffers are equal, `false` otherwise.
+template <typename ElementT>
+static StatusOr<bool> DeviceCompare(se::Stream* stream,
+                                    se::DeviceMemoryBase lhs,
+                                    se::DeviceMemoryBase rhs,
+                                    const Shape& buffer_shape,
+                                    const HloModuleConfig& config,
+                                    absl::string_view kernel_name) {
+  se::StreamExecutor* executor = stream->parent();
 
-StatusOr<bool> BufferComparator::CompareEqualImpl(
-    se::Stream* stream, DeviceMemoryAllocator* allocator,
-    se::DeviceMemoryBase lhs, se::DeviceMemoryBase rhs) {
+  se::ScopedDeviceMemory<uint64> out_param =
+      executor->AllocateOwnedScalar<uint64>();
+
+  stream->ThenMemZero(out_param.ptr(), sizeof(uint64));
   if (lhs.size() != rhs.size()) {
-    return InternalError("Mismatched buffer size: %d bytes vs %d bytes",
+    return InternalError("Mismatched buffer size: %d bytes vs. %d bytes",
                          lhs.size(), rhs.size());
   }
 
-  auto stream_exec = stream->parent();
-  auto to_shaped_buffer =
-      [stream_exec,
-       this](se::DeviceMemoryBase buffer) -> StatusOr<ShapedBuffer> {
-    auto device_ordinal = stream_exec->device_ordinal();
-    ShapedBuffer shaped(shape_, shape_, stream_exec->platform(),
-                        device_ordinal);
-    shaped.set_buffer(buffer, {});
-    return std::move(shaped);
-  };
+  se::DeviceMemory<ElementT> lhs_typed(lhs);
+  se::DeviceMemory<ElementT> rhs_typed(rhs);
+  uint64 buffer_size = lhs_typed.ElementCount();
 
-  TF_ASSIGN_OR_RETURN(auto shaped_lhs, to_shaped_buffer(lhs));
-  TF_ASSIGN_OR_RETURN(auto shaped_rhs, to_shaped_buffer(rhs));
+  PtxCompilationOptions opts(config);
+  TF_ASSIGN_OR_RETURN(
+      absl::Span<const uint8> compiled_ptx,
+      CompilePtxOrGetCached(executor, buffer_compare_ptx, opts));
 
-  ExecutableRunOptions run_options;
-  run_options.set_device_ordinal(stream_exec->device_ordinal());
-  run_options.set_stream(stream);
-  run_options.set_allocator(allocator);
-  ServiceExecutableRunOptions service_run_options(run_options);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ComparisonKernelT<ElementT>> comparison_kernel,
+      (CreateTypedKernel<se::DeviceMemory<ElementT>, se::DeviceMemory<ElementT>,
+                         float, uint64, se::DeviceMemory<uint64>>(
+          kernel_name, buffer_compare_ptx, compiled_ptx, executor)));
 
-  const ShapedBuffer* arg_buffers[] = {&shaped_lhs, &shaped_rhs};
-  TF_ASSIGN_OR_RETURN(auto result_buffer,
-                      comparator_exec_->ExecuteOnStream(&service_run_options,
-                                                        arg_buffers, nullptr));
+  LaunchDimensions dim =
+      CalculateLaunchDimensions(buffer_shape, executor->GetDeviceDescription());
 
-  double result;
-  CHECK(result_buffer.root_buffer().size() == sizeof(result));
-  stream->ThenMemcpy(&result, result_buffer.root_buffer(), sizeof(result));
+  stream->ThenLaunch(se::ThreadDim(dim.threads_per_block()),
+                     se::BlockDim(dim.block_count()), *comparison_kernel,
+                     lhs_typed, rhs_typed, static_cast<float>(kTolerance),
+                     buffer_size, out_param.cref());
+
+  uint64 result = -1;
+  CHECK_EQ(out_param->size(), sizeof(result));
+  stream->ThenMemcpy(&result, *out_param, sizeof(result));
   TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
-  return result < kTolerance;
+  return result == 0;
 }
 
 // Host side comparison code that does the same thing, but reports some of the
 // differences as well. It only print logs for debugging.
+//
+// Returns true if no differences were seen, false otherwise.
 template <typename ElementType, typename ComparisonType>
-Status HostCompare(se::Stream* stream, se::DeviceMemoryBase lhs,
-                   se::DeviceMemoryBase rhs) {
+StatusOr<bool> HostCompare(se::Stream* stream, se::DeviceMemoryBase lhs,
+                           se::DeviceMemoryBase rhs) {
   int64 n = lhs.size() / sizeof(ElementType);
   std::vector<ElementType> host_lhs(n), host_rhs(n);
   stream->ThenMemcpy(host_lhs.data(), lhs, lhs.size());
@@ -250,14 +404,11 @@
 
   const auto canonicalize = [](ComparisonType a) -> ComparisonType {
     if (std::is_same<ElementType, Eigen::half>::value && a) {
-      constexpr float kMaxFp16Value = 65504.;
+      constexpr ComparisonType kMaxFp16Value = 65505.;
       if (std::isnan(a)) {
         return a;
       }
-      if (a < 0) {
-        return -(kMaxFp16Value + 1);
-      }
-      return kMaxFp16Value + 1;
+      return std::max(-kMaxFp16Value, std::min(a, kMaxFp16Value));
     }
     return a;
   };
@@ -281,35 +432,48 @@
                  << original_rhs;
     }
   }
-  return Status::OK();
+  return differences_seen == 0;
 }
 
-StatusOr<bool> BufferComparator::CompareEqual(se::Stream* stream,
-                                              DeviceMemoryAllocator* allocator,
-                                              se::DeviceMemoryBase lhs,
-                                              se::DeviceMemoryBase rhs) {
-  TF_ASSIGN_OR_RETURN(auto result,
-                      CompareEqualImpl(stream, allocator, lhs, rhs));
+template <typename ElementT, typename ComparisonT>
+static StatusOr<bool> CompareEqualParameterized(se::Stream* stream,
+                                                se::DeviceMemoryBase lhs,
+                                                se::DeviceMemoryBase rhs,
+                                                const Shape& shape,
+                                                const HloModuleConfig& config,
+                                                absl::string_view kernel_name) {
+  XLA_SCOPED_LOGGING_TIMER("BufferComparator::CompareEqual");
+  TF_ASSIGN_OR_RETURN(
+      bool result,
+      DeviceCompare<ElementT>(stream, lhs, rhs, shape, config, kernel_name));
 
   if (result) {
     return true;
   }
 
+  TF_ASSIGN_OR_RETURN(bool host_return,
+                      (HostCompare<ElementT, ComparisonT>(stream, lhs, rhs)));
+  CHECK(host_return == result) << "Different comparison result on GPU vs host";
+
+  return false;
+}
+
+StatusOr<bool> BufferComparator::CompareEqual(se::Stream* stream,
+                                              se::DeviceMemoryBase lhs,
+                                              se::DeviceMemoryBase rhs) {
   switch (shape_.element_type()) {
     case xla::F16:
-      TF_RETURN_IF_ERROR(HostCompare<Eigen::half, float>(stream, lhs, rhs));
-      break;
+      return CompareEqualParameterized<Eigen::half, float>(
+          stream, lhs, rhs, shape_, config_, "__xla_fp16_comparison");
     case xla::F32:
-      TF_RETURN_IF_ERROR(HostCompare<float, float>(stream, lhs, rhs));
-      break;
+      return CompareEqualParameterized<float, float>(
+          stream, lhs, rhs, shape_, config_, "__xla_fp32_comparison");
     case xla::F64:
-      TF_RETURN_IF_ERROR(HostCompare<double, double>(stream, lhs, rhs));
-      break;
+      return CompareEqualParameterized<double, double>(
+          stream, lhs, rhs, shape_, config_, "__xla_fp64_comparison");
     default:
       return Unimplemented("Unimplemented element type");
   }
-
-  return false;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.h b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
index 3e6695a..e77dfe0 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
@@ -16,9 +16,8 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
 
-#include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -31,9 +30,8 @@
   BufferComparator(const BufferComparator&) = delete;
   BufferComparator(BufferComparator&&) = default;
 
-  static StatusOr<BufferComparator> Create(const Shape& buffer_shape,
-                                           se::StreamExecutor* stream_exec,
-                                           Compiler* compiler);
+  BufferComparator(const Shape& shape, const HloModuleConfig& config)
+      : shape_(shape), config_(config) {}
 
   // Returns true if the two buffers compare equal. The definition of "equal"
   // is:
@@ -45,21 +43,12 @@
   //
   // See the implementation for the tolerance value.
   StatusOr<bool> CompareEqual(se::Stream* stream,
-                              DeviceMemoryAllocator* allocator,
                               se::DeviceMemoryBase lhs,
                               se::DeviceMemoryBase rhs);
 
  private:
-  BufferComparator(const Shape& shape, std::unique_ptr<Executable> exec)
-      : shape_(shape), comparator_exec_(std::move(exec)) {}
-
-  StatusOr<bool> CompareEqualImpl(se::Stream* stream,
-                                  DeviceMemoryAllocator* allocator,
-                                  se::DeviceMemoryBase lhs,
-                                  se::DeviceMemoryBase rhs);
-
   Shape shape_;
-  std::unique_ptr<Executable> comparator_exec_;
+  HloModuleConfig config_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
index 274c218..4bca6e7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
@@ -16,10 +16,11 @@
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 
 #include <limits>
-#include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/backend.h"
+
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/stream_executor/device_memory.h"
 
 namespace xla {
 namespace gpu {
@@ -28,11 +29,9 @@
 class BufferComparatorTest : public testing::Test {
  protected:
   BufferComparatorTest()
-      : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()),
-        stream_exec_(backend_->default_stream_executor()),
-        allocator_(stream_exec_->platform(), {stream_exec_}),
-        compiler_(Compiler::GetForPlatform(stream_exec_->platform())
-                      .ConsumeValueOrDie()) {}
+      : platform_(
+            se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie()),
+        stream_exec_(platform_->ExecutorForDevice(0).ValueOrDie()) {}
 
   // Take floats only for convenience. Still uses ElementType internally.
   template <typename ElementType>
@@ -43,49 +42,26 @@
     se::Stream stream(stream_exec_);
     stream.Init();
 
-    auto owning_lhs_buffer = allocator_
-                                 .Allocate(stream_exec_->device_ordinal(),
-                                           lhs.size() * sizeof(ElementType))
-                                 .ConsumeValueOrDie();
+    se::ScopedDeviceMemory<ElementType> lhs_buffer =
+        stream_exec_->AllocateOwnedArray<ElementType>(lhs.size());
+    se::ScopedDeviceMemory<ElementType> rhs_buffer =
+        stream_exec_->AllocateOwnedArray<ElementType>(lhs.size());
 
-    auto owning_rhs_buffer = allocator_
-                                 .Allocate(stream_exec_->device_ordinal(),
-                                           rhs.size() * sizeof(ElementType))
-                                 .ConsumeValueOrDie();
-
-    auto lhs_buffer =
-        se::DeviceMemory<ElementType>(owning_lhs_buffer.AsDeviceMemoryBase());
-    auto rhs_buffer =
-        se::DeviceMemory<ElementType>(owning_rhs_buffer.AsDeviceMemoryBase());
-
-    stream.ThenMemcpy(&lhs_buffer, lhs.data(), lhs_buffer.size());
-    stream.ThenMemcpy(&rhs_buffer, rhs.data(), rhs_buffer.size());
-
+    stream.ThenMemcpy(lhs_buffer.ptr(), lhs.data(), lhs_buffer->size());
+    stream.ThenMemcpy(rhs_buffer.ptr(), rhs.data(), rhs_buffer->size());
     TF_CHECK_OK(stream.BlockHostUntilDone());
 
-    static auto* cmp_cache =
-        new absl::flat_hash_map<std::pair<PrimitiveType, int64>,
-                                std::unique_ptr<BufferComparator>>();
-    auto key =
-        std::make_pair(primitive_util::NativeToPrimitiveType<ElementType>(),
-                       static_cast<int64>(lhs_buffer.ElementCount()));
-    std::unique_ptr<BufferComparator>& comparator = (*cmp_cache)[key];
-    if (!comparator) {
-      comparator.reset(new BufferComparator(
-          BufferComparator::Create(
-              ShapeUtil::MakeShape(key.first, {key.second}), stream.parent(),
-              compiler_)
-              .ConsumeValueOrDie()));
-    }
-    return comparator
-        ->CompareEqual(&stream, &allocator_, lhs_buffer, rhs_buffer)
+    BufferComparator comparator(
+        ShapeUtil::MakeShape(
+            primitive_util::NativeToPrimitiveType<ElementType>(),
+            {static_cast<int64>(lhs_buffer->ElementCount())}),
+        HloModuleConfig());
+    return comparator.CompareEqual(&stream, *lhs_buffer, *rhs_buffer)
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<Backend> backend_;
+  se::Platform* platform_;
   se::StreamExecutor* stream_exec_;
-  StreamExecutorMemoryAllocator allocator_;
-  Compiler* compiler_;
 };
 
 TEST_F(BufferComparatorTest, TestNaNs) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 812e31f..b3f274e 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -256,9 +256,9 @@
   const auto device_ordinal = stream_exec_->device_ordinal();
 
   // allocator either points to this->allocator_ or, if that's null, to a
-  // StreamExecutorMemoryAllocator for stream_exec_.
-  DeviceMemoryAllocator* allocator;
-  optional<StreamExecutorMemoryAllocator> se_allocator;
+  // se::StreamExecutorMemoryAllocator for stream_exec_.
+  se::DeviceMemoryAllocator* allocator;
+  optional<se::StreamExecutorMemoryAllocator> se_allocator;
   if (allocator_ != nullptr) {
     allocator = allocator_;
   } else {
@@ -306,8 +306,11 @@
     }
   };
 
+  const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
+
   // Allocate space for the input, filter, and output of the convolution.
-  RedzoneAllocator input_output_allocator(device_ordinal, allocator);
+  RedzoneAllocator input_output_allocator(device_ordinal, allocator,
+                                          hlo_module_config);
   std::vector<se::DeviceMemoryBase> operand_buffers;
   for (const auto* operand : instr->operands()) {
     TF_ASSIGN_OR_RETURN(auto buffer,
@@ -346,7 +349,8 @@
                      AlgorithmToString(alg)),
         2);
 
-    RedzoneAllocator scratch_allocator(device_ordinal, allocator);
+    RedzoneAllocator scratch_allocator(device_ordinal, allocator,
+                                       hlo_module_config);
     se::dnn::ProfileResult profile_result;
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
@@ -388,7 +392,7 @@
     if (comparator.has_value()) {
       XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::CompareEqual", 2);
       StatusOr<bool> compare_result = comparator->CompareEqual(
-          &stream, allocator, reference_result_buffer, result_buffer);
+          &stream, reference_result_buffer, result_buffer);
       if (!compare_result.ok()) {
         LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
                    << " against " << AlgorithmToString(alg) << " for "
@@ -416,21 +420,13 @@
       }
     } else {
       XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::Create", 2);
-      auto comp =
-          BufferComparator::Create(result_shape, stream.parent(), compiler_);
-      if (comp.ok()) {
-        comparator.emplace(comp.ConsumeValueOrDie());
-        reference_result_buffer = result_buffer;
-        TF_ASSIGN_OR_RETURN(result_buffer,
-                            input_output_allocator.AllocateBytes(
-                                &stream, reference_result_buffer.size()));
-        initialize_buffer(result_buffer);
-        first_algorithm = alg;
-      } else {
-        LOG(ERROR) << "Fail to initialize buffer comparator: " << comp.status()
-                   << ", instruction: " << instr->ToString();
-        CHECK(!crash_on_checking_failure);
-      }
+      comparator.emplace(result_shape, hlo_module_config);
+      reference_result_buffer = result_buffer;
+      TF_ASSIGN_OR_RETURN(result_buffer,
+                          input_output_allocator.AllocateBytes(
+                              &stream, reference_result_buffer.size()));
+      initialize_buffer(result_buffer);
+      first_algorithm = alg;
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 664fd7f..9e8a797 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -19,13 +19,13 @@
 #include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -38,7 +38,8 @@
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
   CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec,
-                           DeviceMemoryAllocator* allocator, Compiler* compiler)
+                           se::DeviceMemoryAllocator* allocator,
+                           Compiler* compiler)
       : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
 
   absl::string_view name() const override {
@@ -56,7 +57,7 @@
       const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
-  DeviceMemoryAllocator* allocator_;                  // may be null
+  se::DeviceMemoryAllocator* allocator_;              // may be null
   Compiler* compiler_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc
index 7861eb1ef..2ba6e8f 100644
--- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc
@@ -174,9 +174,9 @@
   const auto device_ordinal = stream_exec_->device_ordinal();
 
   // allocator either points to this->allocator_ or, if that's null, to a
-  // StreamExecutorMemoryAllocator for stream_exec_.
-  DeviceMemoryAllocator* allocator;
-  absl::optional<StreamExecutorMemoryAllocator> se_allocator;
+  // se::StreamExecutorMemoryAllocator for stream_exec_.
+  se::DeviceMemoryAllocator* allocator;
+  absl::optional<se::StreamExecutorMemoryAllocator> se_allocator;
   if (allocator_ != nullptr) {
     allocator = allocator_;
   } else {
@@ -200,7 +200,7 @@
 }
 
 CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec,
-                                   DeviceMemoryAllocator* allocator)
+                                   se::DeviceMemoryAllocator* allocator)
     : stream_exec_(stream_exec), allocator_(allocator) {}
 
 StatusOr<bool> CusolverRewriter::Run(HloModule* module) {
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h
index c822331..d8c2cc5 100644
--- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h
@@ -16,12 +16,12 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -30,7 +30,7 @@
 class CusolverRewriter : public HloModulePass {
  public:
   CusolverRewriter(se::StreamExecutor* stream_exec,
-                   DeviceMemoryAllocator* allocator);
+                   se::DeviceMemoryAllocator* allocator);
   absl::string_view name() const override { return "cusolver-rewriter"; }
 
   StatusOr<bool> Run(HloModule* module) override;
@@ -39,7 +39,7 @@
   StatusOr<bool> RunOnComputation(HloComputation* computation);
 
   se::StreamExecutor* stream_exec_;   // never null
-  DeviceMemoryAllocator* allocator_;  // may be null
+  se::DeviceMemoryAllocator* allocator_;  // may be null
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc
new file mode 100644
index 0000000..b60ace6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "cuda/includes/cuda_headers/third_party/gpus/cuda/include/driver_types.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class CustomCallTest : public ClientLibraryTestBase {};
+
+bool is_invoked_called = false;
+void Callback_IsInvoked(CUstream /*stream*/, void** /*buffers*/,
+                        const char* /*opaque*/, size_t /*opaque_len*/) {
+  is_invoked_called = true;
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_IsInvoked, "CUDA");
+
+TEST_F(CustomCallTest, IsInvoked) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "Callback_IsInvoked", /*operands=*/{},
+             ShapeUtil::MakeShape(F32, {}),
+             /*opaque=*/"");
+  EXPECT_FALSE(is_invoked_called);
+  TF_ASSERT_OK(Execute(&b, {}).status());
+  EXPECT_TRUE(is_invoked_called);
+}
+
+TEST_F(CustomCallTest, UnknownTarget) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "UknownTarget", /*operands=*/{}, ShapeUtil::MakeShape(F32, {}),
+             /*opaque=*/"");
+  ASSERT_FALSE(Execute(&b, {}).ok());
+}
+
+void Callback_Memcpy(CUstream stream, void** buffers, const char* /*opaque*/,
+                     size_t /*opaque_len*/) {
+  void* src = buffers[0];
+  void* dst = buffers[1];
+  auto err = cudaMemcpyAsync(dst, src, /*count=*/sizeof(float) * 128,
+                             cudaMemcpyDeviceToDevice, stream);
+  CHECK_EQ(err, cudaSuccess);
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_Memcpy, "CUDA");
+TEST_F(CustomCallTest, Memcpy) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "Callback_Memcpy",
+             /*operands=*/{Broadcast(ConstantR0WithType(&b, F32, 42.0), {128})},
+             ShapeUtil::MakeShape(F32, {128}), /*opaque=*/"");
+  TF_ASSERT_OK_AND_ASSIGN(auto result, ExecuteAndTransfer(&b, {}));
+  EXPECT_THAT(result.data<float>(), ::testing::Each(42));
+}
+
+// Check that opaque handles nulls within the string.
+std::string& kExpectedOpaque = *new std::string("abc\0def", 7);
+void Callback_Opaque(CUstream /*stream*/, void** /*buffers*/,
+                     const char* opaque, size_t opaque_len) {
+  std::string opaque_str(opaque, opaque_len);
+  CHECK_EQ(opaque_str, kExpectedOpaque);
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_Opaque, "CUDA");
+TEST_F(CustomCallTest, Opaque) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "Callback_Opaque", /*operands=*/{},
+             ShapeUtil::MakeShape(F32, {}), kExpectedOpaque);
+  TF_ASSERT_OK(Execute(&b, {}).status());
+}
+
+void Callback_SubBuffers(CUstream stream, void** buffers,
+                         const char* /*opaque*/, size_t /*opaque_len*/) {
+  // `buffers` is a flat array containing device pointers to the following.
+  //
+  //   0: root tuple of param 0
+  //   1:   param 0 at tuple index {0}, shape f32[128]
+  //   2:   param 0 at tuple index {1}, shape f32[256]
+  //   3: root tuple of param 1
+  //   4:   param 1 at tuple index {0}, shape f32[1024]
+  //   5:   param 1 at tuple index {1}, shape f32[8]
+  //   6: root tuple of custom-call result
+  //   7:   result at tuple index {0}, shape f32[8]
+  //   8:   result at tuple index {1}, shape (f32[128], f32[256])
+  //   9:     result at tuple index {1, 0}, shape f32[128]
+  //  10:     result at tuple index {1, 1}, shape f32[256]
+  //  11:   result at tuple index {2}, shape f32[1024]
+  //
+  // It's the contract of custom-call that the non-root pointers (i.e.
+  // everything other than indices 0, 3, and 6) may be null, if XLA is unable to
+  // analyze the program well enough to determine for sure what's in those
+  // buffers.  For this simple example, all of the buffers should be non-null.
+
+  // Check the param 0 tuple, namely that
+  //
+  //   (*buffers[0])[0] == buffers[1] and
+  //   (*buffers[0])[1] == buffers[2].
+  //
+  // because buffers contains pointers to device memory, we have to retrieve
+  // these values via cudaMemcpy.
+  void* p0[2];
+  cudaMemcpy(p0, buffers[0], 2 * sizeof(void*), cudaMemcpyDeviceToHost);
+  CHECK_EQ(p0[0], buffers[1]);
+  CHECK_EQ(p0[1], buffers[2]);
+
+  // Check the param 1 tuple, namely that
+  //
+  //   (*buffers[3])[0] == buffers[4]
+  //   (*buffers[3])[1] == buffers[5].
+  void* p1[2];
+  cudaMemcpy(p1, buffers[3], 2 * sizeof(void*), cudaMemcpyDeviceToHost);
+  CHECK_EQ(p1[0], buffers[4]);
+  CHECK_EQ(p1[1], buffers[5]);
+
+  // We don't have an equivalent check for the output tuple (i.e. we don't check
+  // (*buffers[6])[0] == buffers[7]) because it's up to us to set the tuple
+  // as part of this custom-call.
+
+  // Write the results.  First set the root tuple buffer to {b7, b8, b11}.
+  void* root[3] = {buffers[7], buffers[8], buffers[11]};
+  cudaMemcpy(buffers[6], root, 3 * sizeof(void*), cudaMemcpyHostToDevice);
+
+  // Now set the sub-tuple output buffer at index 8 to {b9, b10}.  `sub_tuple`
+  // lives in host memory, so this must be a host-to-device copy.
+  void* sub_tuple[2] = {buffers[9], buffers[10]};
+  cudaMemcpy(buffers[8], sub_tuple, 2 * sizeof(void*), cudaMemcpyHostToDevice);
+
+  // Now set output leaf buffers 7, 9, 10, and 11, copying data from the
+  // corresponding same-sized inputs.
+  cudaMemcpyAsync(buffers[7], buffers[5], 8 * sizeof(float),
+                  cudaMemcpyDeviceToDevice, stream);
+  cudaMemcpyAsync(buffers[9], buffers[1], 128 * sizeof(float),
+                  cudaMemcpyDeviceToDevice, stream);
+  cudaMemcpyAsync(buffers[10], buffers[2], 256 * sizeof(float),
+                  cudaMemcpyDeviceToDevice, stream);
+  cudaMemcpyAsync(buffers[11], buffers[4], 1024 * sizeof(float),
+                  cudaMemcpyDeviceToDevice, stream);
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_SubBuffers, "CUDA");
+TEST_F(CustomCallTest, SubBuffers) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "Callback_SubBuffers", /*operands=*/
+             {
+                 Tuple(&b,
+                       {
+                           Broadcast(ConstantR0WithType(&b, F32, 1), {128}),
+                           Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                       }),
+                 Tuple(&b,
+                       {
+                           Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                           Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                       }),
+             },
+             ShapeUtil::MakeTupleShape({
+                 ShapeUtil::MakeShape(F32, {8}),
+                 ShapeUtil::MakeTupleShape({
+                     ShapeUtil::MakeShape(F32, {128}),
+                     ShapeUtil::MakeShape(F32, {256}),
+                 }),
+                 ShapeUtil::MakeShape(F32, {1024}),
+             }),
+             /*opaque=*/"");
+  TF_ASSERT_OK_AND_ASSIGN(auto result, ExecuteAndTransfer(&b, {}));
+  EXPECT_THAT(result.data<float>({0}), ::testing::Each(4));
+  EXPECT_THAT(result.data<float>({1, 0}), ::testing::Each(1));
+  EXPECT_THAT(result.data<float>({1, 1}), ::testing::Each(2));
+  EXPECT_THAT(result.data<float>({2}), ::testing::Each(3));
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc
new file mode 100644
index 0000000..f0f3152
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h"
+
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+
+namespace xla {
+namespace gpu {
+
+CustomCallThunk::CustomCallThunk(
+    void* call_target,
+    std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices,
+    ShapeTree<BufferAllocation::Slice> result_slices, std::string opaque,
+    const HloInstruction* instr)
+    : Thunk(Thunk::kCustomCall, instr),
+      call_target_(call_target),
+      operand_slices_(std::move(operand_slices)),
+      result_slices_(std::move(result_slices)),
+      opaque_(std::move(opaque)) {  // Slices were moved; check the members.
+  CHECK_EQ(instr->operand_count(), operand_slices_.size());
+  for (int64 i = 0; i < instr->operand_count(); ++i) {
+    const auto& s1 = operand_slices_[i].shape();
+    const auto& s2 = instr->operand(i)->shape();
+    CHECK(ShapeUtil::Equal(s1, s2)) << absl::StreamFormat(
+        "Shape mismatch between instr->operand(%d) and "
+        "operand_slices[%d].shape(): %s vs %s",
+        i, i, s1.ToString(), s2.ToString());
+  }
+  CHECK(ShapeUtil::Equal(instr->shape(), result_slices_.shape()))
+      << absl::StreamFormat(
+             "Shape mismatch between instr->shape() and result_slices.shape(): "
+             "%s vs %s.",
+             instr->shape().ToString(), result_slices_.shape().ToString());
+}
+
+Status CustomCallThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  // gpu_stream is CUstream or e.g. the equivalent type in ROCm.
+  auto gpu_stream = se::gpu::AsGpuStreamValue(stream);
+  auto typed_call_target =
+      reinterpret_cast<void (*)(decltype(gpu_stream), void** /*buffers*/,
+                                const char* /*opaque*/, size_t /*opaque_len*/)>(
+          call_target_);
+
+  std::vector<void*> buffers;
+  auto append_buffers = [&](const ShapeTree<BufferAllocation::Slice>& slices) {
+    slices.ForEachElement([&](const ShapeIndex& /*index*/,
+                              const BufferAllocation::Slice& slice) {
+      // An unassigned slice contributes exactly one null entry; never look it up.
+      if (slice.allocation() == nullptr) {
+        buffers.push_back(nullptr);
+        return;
+      }
+      buffers.push_back(buffer_allocations.GetDeviceAddress(slice).opaque());
+    });
+  };
+  for (const auto& slices : operand_slices_) append_buffers(slices);
+  append_buffers(result_slices_);
+
+  typed_call_target(gpu_stream, buffers.data(), opaque_.data(), opaque_.size());
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h
new file mode 100644
index 0000000..9011fa2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+
+namespace xla {
+namespace gpu {
+
+// Thunk to run a GPU custom call.
+//
+// This thunk's `ExecuteOnStream` implementation executes a host function
+// `call_target` which is expected to enqueue operations onto the GPU.
+//
+// For information about the calling convention, see xla/g3doc/custom_call.md
+//
+// Note that not all kCustomCall HLOs in XLA:GPU end up being run by this thunk.
+// XLA itself creates kCustomCall instructions when lowering kConvolution HLOs
+// into calls to cudnn.  These internally-created custom-calls are run using
+// ConvolutionThunk, not CustomCallThunk.  There's no ambiguity because they
+// have special call target names (e.g. "__cudnn$convForward") that only the
+// compiler is allowed to create.
+class CustomCallThunk : public Thunk {
+ public:
+  CustomCallThunk(
+      void* call_target,
+      std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices,
+      ShapeTree<BufferAllocation::Slice> result_slices, std::string opaque,
+      const HloInstruction* instr);
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  void* call_target_;
+  std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices_;
+  ShapeTree<BufferAllocation::Slice> result_slices_;
+  std::string opaque_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 551f7d7..ffa60da 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -71,7 +71,6 @@
     const HloModuleConfig& hlo_module_config, llvm::Module* module,
     llvm::IRBuilder<>* b, NestedComputer compute_nested)
     : ElementalIrEmitter(hlo_module_config, module, b),
-      hlo_module_config_(hlo_module_config),
       compute_nested_(std::move(compute_nested)) {}
 
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
@@ -271,16 +270,6 @@
   return FPCast(fast_tanh, value->getType());
 }
 
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitRoundNearestAfz(
-    PrimitiveType prim_type, llvm::Value* value) {
-  // Use libdevice __nv_round instead of llvm.round. This is to workaround a
-  // bug in the PTX backend, which implements llvm.round with PTX cvt.rni.
-  // When the llvm.round is fixed, we may still want to use __nv_round here as
-  // expanding the non-trivial implementation early while inlining allows better
-  // optimizations.
-  return EmitLibdeviceMathCall("__nv_round", {value}, {prim_type}, prim_type);
-}
-
 llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
     const string& callee_name, absl::Span<llvm::Value* const> operands,
     absl::Span<const PrimitiveType> input_types, PrimitiveType output_type,
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index e9d0817..466543a 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -91,9 +91,6 @@
   StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
                                   llvm::Value* value) override;
 
-  StatusOr<llvm::Value*> EmitRoundNearestAfz(PrimitiveType prim_type,
-                                             llvm::Value* value) override;
-
   llvm::Value* EmitThreadId() override;
 
  private:
@@ -129,7 +126,6 @@
       const string& callee_name, absl::Span<llvm::Value* const> operands,
       absl::Span<const PrimitiveType> input_types, PrimitiveType output_type);
 
-  const HloModuleConfig& hlo_module_config_;
   NestedComputer compute_nested_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index ca4a605..8f40010 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -29,7 +29,7 @@
 namespace gpu {
 
 FftScratchAllocator::FftScratchAllocator(
-    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+    int device_ordinal, se::DeviceMemoryAllocator* memory_allocator)
     : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
 int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
@@ -48,7 +48,7 @@
             byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                       memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                   /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 2be50e0..f653e4f 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -38,7 +38,7 @@
 class FftScratchAllocator : public se::ScratchAllocator {
  public:
   FftScratchAllocator(int device_ordinal,
-                      DeviceMemoryAllocator* memory_allocator);
+                      se::DeviceMemoryAllocator* memory_allocator);
 
   int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
@@ -49,8 +49,8 @@
 
  private:
   const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
+  se::DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 24aeda3..dec40c5 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -36,6 +36,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace xla {
 namespace gpu {
@@ -98,14 +99,12 @@
                                 sub_streams, hlo_module_->entry_computation());
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  // This top-level trace serves two purposes:
-  //  1) It marks the scope of the whole XLA module.
-  //  2) It tells us whether tracing is enabled.  We use this to avoid the
-  //     expensive HloInstruction::ToString() calls inside the loop below if
-  //     tracing is disabled.
-  ScopedAnnotation top_level_annotation(hlo_module_->name(), "XLA GPU module");
+  tensorflow::profiler::TraceMe hlo_module_activity(
+      [&] { return absl::StrCat(hlo_module_->name(), ":XLA GPU module"); },
+      tensorflow::profiler::TraceMeLevel::kInfo);
 
   std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
+  bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled();
   for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
     // Annotate execution of this op if tracing was enabled when we started
     // running this module.  If tracing is enabled *while* we're running the
@@ -114,12 +113,13 @@
     // TODO(jlebar): Should we cache the results of HloInstruction::ToString(),
     // since we expect it to be an expensive call?
     absl::optional<ScopedAnnotation> op_annotation;
-    if (top_level_annotation.IsEnabled()) {
+    CHECK(thunk->hlo_instruction());
+    if (scoped_annotation_enabled) {
+      auto hlo = thunk->hlo_instruction();
       op_annotation.emplace(
-          thunk->hlo_instruction() != nullptr
-              ? thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical())
-              : "<unknown>",
-          "XLA op");
+          thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical()),
+          absl::StrCat("#tf_op=", hlo->metadata().op_name(),
+                       ",hlo_op=", hlo->name(), "#"));
     }
 
     TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
@@ -226,11 +226,11 @@
   return &module_globals_.emplace(executor, std::move(globals)).first->second;
 }
 
-StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::Execute(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+    HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) {
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
 
   if (GetRootPointsToSet().IsAmbiguous()) {
     return Unimplemented("Points-to set of root instruction is ambiguous");
@@ -272,8 +272,6 @@
       buffer_allocations_builder.Build(
           assignment_.get(), executor->device_ordinal(), memory_allocator));
 
-  bool block_host_until_done =
-      !memory_allocator->AllowsAsynchronousDeallocation();
   TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                    block_host_until_done,
                                    hlo_execution_profile));
@@ -339,12 +337,22 @@
   return std::move(shaped_buffer);
 }
 
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    absl::Span<const ShapedBuffer* const> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  return Execute(run_options, arguments, hlo_execution_profile,
+                 /*block_host_until_done=*/true);
+}
+
 StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments) {
-  // TODO(b/30671675): Implement asynchronous execution mode.
-  return Unimplemented(
-      "Asynchronous execution on stream is not yet supported on GPU.");
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  // Force synchronous execution if the allocator requires it.
+  bool block_host_until_done =
+      !memory_allocator->AllowsAsynchronousDeallocation();
+  return Execute(run_options, arguments, nullptr, block_host_until_done);
 }
 
 const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 2b3c77f..b1f63bc 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -24,7 +24,6 @@
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
@@ -38,6 +37,7 @@
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -86,6 +86,11 @@
       absl::Span<const ShapedBuffer* const> arguments) override;
 
  private:
+  StatusOr<ScopedShapedBuffer> Execute(
+      const ServiceExecutableRunOptions* run_options,
+      absl::Span<const ShapedBuffer* const> arguments,
+      HloExecutionProfile* hlo_execution_profile, bool block_host_until_done);
+
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
   // clients, such as Tensorflow, that use a single stream of execution for
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 4182b02..d5b351f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -84,6 +84,11 @@
 }
 
 bool IsInputFusibleReduction(const HloInstruction& instr) {
+  // TODO(b/129089333): Don't fuse variadic reduce.
+  if (instr.opcode() == HloOpcode::kReduce && instr.shape().IsTuple()) {
+    return false;
+  }
+
   return IsReduceInputFusion(instr) ||
          IsReductionFromOrToContiguousDimensions(instr);
 }
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index b8727e6..54cab21 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -255,7 +255,7 @@
   }
   auto producer = consumer->operand(operand_index);
 
-  // Don't fuse variadic reduce.
+  // TODO(b/129089333): Don't fuse variadic reduce.
   if (consumer->opcode() == HloOpcode::kReduce && consumer->shape().IsTuple()) {
     return false;
   }
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
index a78b4ff..b9d944b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
@@ -32,10 +32,12 @@
  public:
   IrEmitterContext(const HloModule* hlo_module,
                    const BufferAssignment* buffer_assignment,
+                   const se::Platform* platform,
                    const se::DeviceDescription* device_desc,
                    llvm::Module* llvm_module)
       : hlo_module_(hlo_module),
         buffer_assignment_(buffer_assignment),
+        platform_(platform),
         device_desc_(device_desc),
         llvm_module_(llvm_module) {}
   // Disallow copy and assign.
@@ -47,6 +49,7 @@
   const BufferAssignment& buffer_assignment() const {
     return *buffer_assignment_;
   }
+  const se::Platform* platform() const { return platform_; }
   const se::DeviceDescription& device_description() const {
     return *device_desc_;
   }
@@ -56,6 +59,7 @@
  private:
   const HloModule* hlo_module_;
   const BufferAssignment* buffer_assignment_;
+  const se::Platform* platform_;
   const se::DeviceDescription* device_desc_;
   llvm::Module* llvm_module_;
   NameUniquer name_uniquer_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 6793aa8..774c2b8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -13,6 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
+
 #include <algorithm>
 #include <cstring>
 #include <iterator>
@@ -20,8 +22,6 @@
 #include <string>
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
-
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -37,6 +37,7 @@
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h"
@@ -45,6 +46,7 @@
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
+#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/for_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
@@ -527,7 +529,35 @@
     return Status::OK();
   }
 
-  return IrEmitter::HandleCustomCall(custom_call);
+  if (void* call_target = CustomCallTargetRegistry::Global()->Lookup(
+          custom_call->custom_call_target(),
+          ir_emitter_context_->platform()->Name())) {
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    auto get_slices_for_instr = [&](const HloInstruction* instr) {
+      ShapeTree<BufferAllocation::Slice> slices(instr->shape());
+      slices.ForEachMutableElement([&](const ShapeIndex& index,
+                                       BufferAllocation::Slice* slice) {
+        StatusOr<BufferAllocation::Slice> s = assn.GetUniqueSlice(instr, index);
+        if (s.ok()) {
+          *slice = s.ValueOrDie();
+        }
+      });
+      return slices;
+    };
+    std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices;
+    for (const auto* operand : custom_call->operands()) {
+      operand_slices.push_back(get_slices_for_instr(operand));
+    }
+    ShapeTree<BufferAllocation::Slice> result_slices =
+        get_slices_for_instr(custom_call);
+    AddThunkToThunkSequence(absl::make_unique<CustomCallThunk>(
+        call_target, std::move(operand_slices), std::move(result_slices),
+        Cast<HloCustomCallInstruction>(custom_call)->opaque(), custom_call));
+    return Status::OK();
+  }
+
+  return Unimplemented("No registered implementation for custom call to \"%s\"",
+                       custom_call->custom_call_target());
 }
 
 Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index e09b8fb..fbe22e3 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -16,13 +16,19 @@
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
 
 #include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
 
 namespace xla {
 namespace gpu {
@@ -39,16 +45,6 @@
 Status KernelThunk::Initialize(const GpuExecutable& executable,
                                se::StreamExecutor* executor) {
   tensorflow::mutex_lock lock(mutex_);
-  if (!loader_spec_) {
-    loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size()));
-    loader_spec_->AddCudaPtxInMemory(executable.ptx(), kernel_name_);
-
-    if (!executable.cubin().empty()) {
-      loader_spec_->AddCudaCubinInMemory(
-          reinterpret_cast<const char*>(executable.cubin().data()),
-          kernel_name_);
-    }
-  }
 
   // Load the kernel into the device if necessary.
   //
@@ -57,10 +53,12 @@
   // profiles.
   auto it = kernel_cache_.find(executor);
   if (kernel_cache_.end() == it) {
-    it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first;
-    if (!executor->GetKernel(*loader_spec_, &it->second)) {
-      return InternalError("Unable to load kernel %s", kernel_name_);
-    }
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<se::KernelBase> kernel,
+        CreateKernel(kernel_name_, args_.size(), executable.ptx(),
+                     executable.cubin(), executor));
+
+    kernel_cache_.emplace(executor, std::move(kernel));
   }
 
   return Status::OK();
@@ -85,27 +83,22 @@
     CHECK(it != kernel_cache_.end())
         << "Initialize() not called for StreamExecutor " << executor;
     launch_dimensions = launch_dimensions_;
-    kernel = &it->second;
+    kernel = it->second.get();
   }
 
   VLOG(3) << "Launching " << kernel->name();
-  // Launch the kernel with potentially multiple blocks and threads.
-  static constexpr int kKernelArgsLimit = 1024;
-  auto kernel_args = absl::make_unique<se::KernelArgsArray<kKernelArgsLimit>>();
+  absl::InlinedVector<se::DeviceMemoryBase, 4> buffer_args;
   for (const BufferAllocation* arg : args_) {
-    const auto& buf = buffer_allocations.GetDeviceAddress(arg->index());
-    kernel_args->add_device_memory_argument(buf);
-    VLOG(3) << "  Arg: alloc #" << arg->index() << ": " << buf.opaque() << " ("
+    se::DeviceMemoryBase buf =
+        buffer_allocations.GetDeviceAddress(arg->index());
+    VLOG(3) << "  Arg: alloc #" << arg->index() << ": " << buf.opaque() << "  ("
             << buf.size() << "B)";
+    buffer_args.push_back(buf);
   }
   auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction());
-  if (!stream->parent()->Launch(
-          stream, se::ThreadDim(launch_dimensions.threads_per_block()),
-          se::BlockDim(launch_dimensions.block_count()), *kernel,
-          *kernel_args)) {
-    return InternalError("Unable to launch kernel %s", kernel_name_);
-  }
-  return Status::OK();
+  return ExecuteKernelOnStream(*kernel, buffer_args,
+                               launch_dimensions.threads_per_block(),
+                               launch_dimensions.block_count(), stream);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index f63db5c..2cea89e 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -84,12 +84,11 @@
   // Describes how to load this kernel. ExecuteOnStream reuses this loader
   // specification for all executions.
   mutable tensorflow::mutex mutex_;
-  std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
 
   // Loaded kernels for each `StreamExecutor`.  Requires pointer stability of
   // values.
-  std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
-      GUARDED_BY(mutex_);
+  std::unordered_map<se::StreamExecutor*, std::unique_ptr<se::KernelBase>>
+      kernel_cache_ GUARDED_BY(mutex_);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index 698d2d5..ca42807 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -34,6 +34,7 @@
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/profiler/lib:traceme",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 12b6056..34966b1 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -21,12 +21,6 @@
 #include <utility>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
-#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/util.h"
-
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "llvm/ADT/STLExtras.h"
@@ -55,11 +49,17 @@
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Scalar.h"
+#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
+#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace xla {
 namespace gpu {
@@ -487,9 +487,9 @@
 
   string ptx;
   {
-    tensorflow::tracing::ScopedActivity activity("Compiling IR",
-                                                 module->getName().str(),
-                                                 /*is_expensive=*/true);
+    tensorflow::profiler::TraceMe activity(
+        [&] { return absl::StrCat("Compiling IR:", module->getName().str()); },
+        tensorflow::profiler::TraceMeLevel::kInfo);
     XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
     TF_ASSIGN_OR_RETURN(
         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
index 3051db3..c00edae 100644
--- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
@@ -18,6 +18,7 @@
 #include "tensorflow/compiler/xla/util.h"
 
 #if GOOGLE_CUDA
+#include "absl/container/flat_hash_set.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
@@ -76,6 +77,42 @@
 // This manager is responsible for establishing communication channels and
 // ultimately enqueueing the NCCL library operation onto the participating
 // streams.
+//
+// Implementation note: We make an effort to avoid initializing nccl
+// communication channels too often, as this is expensive.
+//
+// Ideally, we'd set up a nccl channel between each pair of devices that needs
+// to communicate, and close each channel when the GPUs won't be communicating
+// again "for a long time" (because channels hold memory on the GPU).  As a
+// simplification to this ideal, we adopt the following policy.
+//
+//  - We maintain a set of GPUs that are "actively participating" in
+//    cross-device communications.  That set of GPUs is always connected as a
+//    clique, using ncclCommInitAll.
+//
+//  - When a NcclAllReduceThunk touches a new GPU, we tear down the old clique
+//    and build a new, bigger one.
+//
+//  - All GPUs ever touched by a thunk are considered "actively in use" by that
+//    thunk until the thunk is destroyed.  Destroying the thunk decrements the
+//    refcount of the GPUs it's touched, and if that refcount goes to 0
+//    (meaning, some GPUs are no longer in use by any thunk), we tear down the
+//    clique and build a new, smaller one.
+//
+// This approximation is justified because:
+//
+//  - Currently the only collective operation we support is AllReduce, which
+//    requires a clique.  When we support point-to-point operations, we may not
+//    want to build a communication clique.
+//
+//  - Tearing down and creating a new thunk is tantamount to running the whole
+//    XLA:GPU compiler.  This is expensive, so shouldn't happen "too often" to
+//    cause thrashing here.
+//
+//  - XLA executables already keep resources on the GPU tied to the lifetime of
+//    the executable (e.g. constants stored in GPU memory), so tying the
+//    lifetime of the nccl communication channels to the lifetime of the
+//    executable is consistent.
 class GlobalRendezvousManager {
  public:
   // The GpuExecutable-executing threads call this in order to a) establish the
@@ -98,18 +135,38 @@
     return current_generation_;
   }
 
- private:
-  // Called by the primary thread to set up the communication links.
+  // Increments the refcount of a GPU in our accounting of which devices are
+  // "actively participating" in cross-device operations.
   //
-  // TODO(b/125951860): This performs lots of (presumably) unnecessary host-side
-  // synchronization so that we can be paranoid about semantics in the earliest
-  // implementation. In the limit we should only need to synchronize host
-  // replica threads when the "number of replicas" or "participating device
-  // ordinals" change, to set up a new NCCL "communication" context, at which
-  // point we can enqueue onto device streams without host synchronization in
-  // our code -- this will likely be helpful for "lots of little AllReduce"
-  // cases.
-  Status InitializeCommunicationChannels() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  // This doesn't actually do anything other than increment the refcount.  If
+  // the GPU added here is novel, we'll rebuild the nccl communication clique
+  // when we actually go do the communication.
+  void AddrefParticipatingDevice(int device_ordinal);
+
+  // Decrements the refcount of a set of GPUs in our accounting of which devices
+  // are "actively participating" in cross-device operations.
+  //
+  // If one or more GPUs' refcounts go to 0, we immediately destroy the whole
+  // nccl communication clique.  We'll rebuild a new, smaller clique the next
+  // time it's used.
+  void DecrefParticipatingDevices(absl::Span<const int> device_ordinals);
+
+  // Gets the set of devices that have a NCCL channel currently open.  This is
+  // primarily for testing.
+  absl::flat_hash_set<int> DevicesWithOpenNcclChannels() const {
+    absl::flat_hash_set<int> devices;
+    tensorflow::mutex_lock lock(mutex_);
+    for (const auto& kv : comms_) {
+      devices.insert(kv.first);
+    }
+    return devices;
+  }
+
+ private:
+  // Destroys the current nccl communication clique and builds a new one
+  // connecting the given devices.
+  Status ReinitializeNcclClique(const absl::flat_hash_set<int>& device_ordinals)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Called when all necessary participants are present, the functionality
   // that's implemented by all executing threads lives in here.
@@ -118,28 +175,51 @@
   // Puts all state back into a "reset" state for the next generation of
   // AllReduce requests.
   void DeinitializeGeneration() EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
-    for (ncclComm_t& comm : comms_) {
-      ncclCommDestroy(comm);
-    }
-    comms_.clear();
     participants_.clear();
     current_generation_++;
     initialized_ = false;
     done_ = absl::nullopt;
   }
 
-  tensorflow::mutex mutex_;
+  mutable tensorflow::mutex mutex_;
   tensorflow::condition_variable all_participants_present_;
   tensorflow::condition_variable deinitialized_;
 
-  // Communication handles that correspond to the participants below.
-  std::vector<ncclComm_t> comms_ GUARDED_BY(mutex_);
-
   Status initialize_status_ GUARDED_BY(mutex_);
   std::vector<ParticipantData> participants_ GUARDED_BY(mutex_);
   int64 current_generation_ GUARDED_BY(mutex_) = 0;
   bool initialized_ GUARDED_BY(mutex_) = false;
 
+  struct Comm {
+    explicit Comm(ncclComm_t nccl_comm) : nccl_comm(nccl_comm) {}
+
+    // Movable, but not copyable.
+    Comm(Comm&& c) : nccl_comm(c.nccl_comm) { c.nccl_comm.reset(); }
+    Comm& operator=(Comm&& c) {
+      // Swap so c's destructor destroys our old comm instead of leaking it.
+      nccl_comm.swap(c.nccl_comm);
+      return *this;
+    }
+    Comm(const Comm&) = delete;
+    Comm& operator=(const Comm&) = delete;
+
+    absl::optional<ncclComm_t> nccl_comm;
+
+    ~Comm() {
+      if (nccl_comm.has_value()) {
+        VLOG(3) << absl::StreamFormat("Destroying comm %p", *nccl_comm);
+        ncclCommDestroy(*nccl_comm);
+      }
+    }
+  };
+  // Communication handles for our NCCL clique.  Key is device ordinal.
+  absl::flat_hash_map<int, Comm> comms_ GUARDED_BY(mutex_);
+
+  // Refcounts of which devices are "actively participating" in all-reduces.
+  // These devices don't necessarily have an open comm, but the next time we run
+  // an operation, we'll create a NCCL clique between all of them.
+  absl::flat_hash_map<int, int64> device_refcounts_ GUARDED_BY(mutex_);
+
   // The participating threads wait for this to count down in order to know we
   // can begin the teardown process.
   absl::optional<tensorflow::BlockingCounter> done_;
@@ -151,11 +231,6 @@
     return participants_.size() >= participant.replica_count;
   };
 
-  // We remember the participant index at which we are inserted and use that
-  // same index for referring to auxiliary metadata (e.g. the ncclComm_t handle
-  // index) below.
-  int64 index;
-
   {
     tensorflow::mutex_lock lock(mutex_);
 
@@ -171,7 +246,6 @@
           "participants; existing: %s; submitted: %s)",
           participants_.back().ToString(), participant.ToString());
     }
-    index = participants_.size();
     participants_.push_back(participant);
 
     if (all_participants_present()) {
@@ -205,11 +279,35 @@
       VLOG(3) << "Primary initializing accounting data.";
       initialized_ = true;
       done_.emplace(participant.replica_count);
-      initialize_status_ = InitializeCommunicationChannels();
-      VLOG(3) << "Done initializing communication channels; status: "
-              << initialize_status_;
-      if (!initialize_status_.ok()) {
-        DeinitializeGeneration();
+
+      // Check if all participants_ are in comms_.  If not, we will rebuild the
+      // clique to include them.  (This can't be spelled using absl::c_any_of
+      // because it needs to touch comms_ and tensorflow::mutex lacks an
+      // AssertHeld() function that would let us assert that the lambda is run
+      // while holding the lock.)
+      bool new_devices_found = false;
+      for (const auto& p : participants_) {
+        if (!comms_.contains(p.device_ordinal)) {
+          new_devices_found = true;
+          break;
+        }
+      }
+
+      if (new_devices_found) {
+        absl::flat_hash_set<int> new_clique_device_ordinals;
+        for (const auto& kv : comms_) {
+          new_clique_device_ordinals.insert(kv.first);
+        }
+        for (const auto& p : participants_) {
+          new_clique_device_ordinals.insert(p.device_ordinal);
+        }
+
+        initialize_status_ = ReinitializeNcclClique(new_clique_device_ordinals);
+        VLOG(3) << "Done initializing communication channels; status: "
+                << initialize_status_;
+        if (!initialize_status_.ok()) {
+          DeinitializeGeneration();
+        }
       }
     }
 
@@ -218,7 +316,7 @@
       return initialize_status_;
     }
 
-    comm = comms_[index];
+    comm = *comms_.at(participant.device_ordinal).nccl_comm;
 
     // Drop the lock at the end of scope so other participants may enter.
   }
@@ -259,22 +357,30 @@
   return all_reduce_status;
 }
 
-Status GlobalRendezvousManager::InitializeCommunicationChannels() {
-  std::vector<int> ordinals;
-  for (ParticipantData& data : participants_) {
-    ordinals.push_back(data.device_ordinal);
-  }
-  comms_.resize(ordinals.size());
-  VLOG(3) << "Participants: " << participants_.size()
-          << "; initializing comms.";
-  ncclResult_t result = ncclCommInitAll(comms_.data(), comms_.size(),
-                                        /*devlist=*/ordinals.data());
+Status GlobalRendezvousManager::ReinitializeNcclClique(
+    const absl::flat_hash_set<int>& device_ordinals) {
+  comms_.clear();
+
+  std::vector<int> ordinals_vec(device_ordinals.begin(), device_ordinals.end());
+  std::vector<ncclComm_t> comm_vec;
+  comm_vec.resize(device_ordinals.size());
+
+  VLOG(3) << absl::StreamFormat(
+      "Initializing nccl comms for participant devices {%s}",
+      absl::StrJoin(ordinals_vec, ", "));
+  ncclResult_t result = ncclCommInitAll(comm_vec.data(), comm_vec.size(),
+                                        /*devlist=*/ordinals_vec.data());
   if (result != ncclSuccess) {
-    comms_.clear();
     return InternalError(
         "Failed to initialize NCCL communication channels for %d participants: "
         "%s",
-        participants_.size(), ncclGetErrorString(result));
+        ordinals_vec.size(), ncclGetErrorString(result));
+  }
+
+  for (int64 i = 0; i < ordinals_vec.size(); ++i) {
+    VLOG(3) << absl::StreamFormat("Device ordinal %d assigned ncclComm %p",
+                                  ordinals_vec[i], comm_vec[i]);
+    CHECK(comms_.emplace(ordinals_vec[i], Comm{comm_vec[i]}).second);
   }
   return Status::OK();
 }
@@ -289,6 +395,11 @@
           << " on device: " << participant.device_ordinal;
   void* send_buffer = participant.source_data.opaque();
   void* recv_buffer = participant.destination_data.opaque();
+  VLOG(3) << absl::StreamFormat(
+      "Calling ncclAllReduce(send_buffer=%p, recv_buffer=%p, count=%d, "
+      "datatype=ncclFloat, op=ncclSum, comm=%p, stream=%p)",
+      send_buffer, recv_buffer, participant.element_count,
+      static_cast<const void*>(comm), cu_stream);
   ncclResult_t result = ncclAllReduce(send_buffer, recv_buffer,
                                       /*count=*/participant.element_count,
                                       /*datatype=*/ncclFloat,
@@ -304,6 +415,36 @@
   return Status::OK();
 }
 
+void GlobalRendezvousManager::AddrefParticipatingDevice(int device_ordinal) {
+  // Addref'ing a device doesn't do anything other than increment its refcount.
+  // We'll update our nccl clique if necessary during the next call to
+  // SubmitParticipant.
+  tensorflow::mutex_lock lock(mutex_);
+  device_refcounts_[device_ordinal]++;
+}
+
+void GlobalRendezvousManager::DecrefParticipatingDevices(
+    absl::Span<const int> device_ordinals) {
+  // Decref'ing devices causes us to destroy the nccl clique if any devices were
+  // removed due to having refcount 0.  We'll rebuild the new, smaller clique
+  // during the next call to SubmitParticipant.
+  tensorflow::mutex_lock lock(mutex_);
+  bool removed_device = false;
+  for (int device_ordinal : device_ordinals) {
+    auto it = device_refcounts_.find(device_ordinal);
+    CHECK(it != device_refcounts_.end());
+    it->second--;
+    if (it->second == 0) {
+      device_refcounts_.erase(it);
+      removed_device = true;
+    }
+  }
+
+  if (removed_device) {
+    comms_.clear();
+  }
+}
+
 static GlobalRendezvousManager* GetGlobalRendezvous() {
   static auto* manager = new GlobalRendezvousManager;
   return manager;
@@ -311,6 +452,11 @@
 
 }  // namespace
 
+/*static*/ absl::flat_hash_set<int>
+NcclAllReduceThunk::DevicesWithOpenNcclChannels() {
+  return GetGlobalRendezvous()->DevicesWithOpenNcclChannels();
+}
+
 Status NcclAllReduceThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream,
     HloExecutionProfiler* profiler) {
@@ -327,8 +473,32 @@
   participant.stream = stream;
   participant.originator = this;
 
+  // We currently say that all GPUs this thunk has ever touched are
+  // "actively participating" in cross-device operations, until the thunk itself
+  // is destroyed.
+  //
+  // This policy is an attempt to avoid thrashing the GPU (ncclCommInitAll is
+  // very expensive) while also freeing resources on the GPUs when we can.  The
+  // idea is, creating new thunks is tantamount to running the whole XLA:GPU
+  // compiler stack, so that shouldn't happen terribly often.
+  bool new_device;
+  {
+    tensorflow::mutex_lock lock(mu_);
+    new_device = devices_seen_.insert(participant.device_ordinal).second;
+  }
+  if (new_device) {
+    GetGlobalRendezvous()->AddrefParticipatingDevice(
+        participant.device_ordinal);
+  }
+
   return GetGlobalRendezvous()->SubmitParticipant(std::move(participant));
 }
+
+NcclAllReduceThunk::~NcclAllReduceThunk() {
+  GetGlobalRendezvous()->DecrefParticipatingDevices(
+      std::vector<int>(devices_seen_.begin(), devices_seen_.end()));
+}
+
 #else
 
 Status NcclAllReduceThunk::ExecuteOnStream(
@@ -339,6 +509,13 @@
       "compiler, which is necessary to build the NCCL source library.");
 }
 
+NcclAllReduceThunk::~NcclAllReduceThunk() = default;
+
+/*static*/ absl::flat_hash_set<int>
+NcclAllReduceThunk::DevicesWithOpenNcclChannels() {
+  return {};
+}
+
 #endif  // GOOGLE_CUDA
 
 NcclAllReduceThunk::NcclAllReduceThunk(
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h
index 1a8d135..9ff4fb1 100644
--- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h
@@ -16,11 +16,13 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NCCL_ALL_REDUCE_THUNK_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NCCL_ALL_REDUCE_THUNK_H_
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -38,12 +40,21 @@
   // error.
   static bool NcclIsEnabled();
 
+  // Gets the set of devices that have a NCCL channel open.  This is primarily
+  // for testing.
+  //
+  // (Indeed, because the NCCL channels are a global variable, in the real
+  // world, the value returned here is stale as soon as you read it, so it's not
+  // clear how you *could* use it for anything other than tests.)
+  static absl::flat_hash_set<int> DevicesWithOpenNcclChannels();
+
   // TODO(b/125951860): Plumb more datatypes / reduction operators. Initial
   // implementation is simply F32 summation.
   NcclAllReduceThunk(int64 replica_count, int64 element_count,
                      const BufferAllocation::Slice& source_buffer,
                      const BufferAllocation::Slice& destination_buffer,
                      const HloInstruction* all_reduce);
+  ~NcclAllReduceThunk() override;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
                          se::Stream* stream,
@@ -54,6 +65,10 @@
   const int64 element_count_;
   const BufferAllocation::Slice source_buffer_;
   const BufferAllocation::Slice destination_buffer_;
+
+  tensorflow::mutex mu_;
+  // Set of GPUs that ExecuteOnStream has been called on.
+  absl::flat_hash_set<int> devices_seen_ GUARDED_BY(mu_);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 31d7690..d028557 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
 
 #include <stdlib.h>
+
 #include <atomic>
 #include <functional>
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
@@ -83,6 +84,7 @@
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/slice_sinker.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/stable_sort_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
@@ -104,6 +106,7 @@
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
 namespace xla {
@@ -117,30 +120,13 @@
 
 namespace tracing = tensorflow::tracing;
 
-// Returns a vector of potential locations of the CUDA root directory.
-std::vector<string> GetCudaRootCandidates(
-    const HloModuleConfig& hlo_module_config) {
-  std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
-
-  // "." is our last resort, even though it probably won't work.
-  potential_cuda_roots.push_back(".");
-
-  // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
-  // highest priority.
-  string xla_gpu_cuda_data_dir =
-      hlo_module_config.debug_options().xla_gpu_cuda_data_dir();
-  if (!xla_gpu_cuda_data_dir.empty()) {
-    potential_cuda_roots.insert(potential_cuda_roots.begin(),
-                                xla_gpu_cuda_data_dir);
-  }
-  return potential_cuda_roots;
-}
-
 void PrintCantFindCudaMessage(absl::string_view msg,
                               const HloModuleConfig& hlo_module_config) {
   LOG(WARNING) << msg;
   LOG(WARNING) << "Searched in the following directories:";
-  for (const auto& dir : GetCudaRootCandidates(hlo_module_config)) {
+
+  for (const auto& dir :
+       GetCudaRootCandidates(PtxCompilationOptions(hlo_module_config))) {
     LOG(WARNING) << "  " << dir;
   }
   LOG(WARNING)
@@ -151,7 +137,8 @@
 
 // Returns the directory containing nvvm libdevice files.
 string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
-  const auto& candidate_dirs = GetCudaRootCandidates(hlo_module_config);
+  const auto& candidate_dirs =
+      GetCudaRootCandidates(PtxCompilationOptions(hlo_module_config));
   for (const string& cuda_root : candidate_dirs) {
     string libdevice_dir =
         tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
@@ -177,7 +164,7 @@
 // It takes a compiler pointer, as passes may compile and execute HLOs on the
 // fly for cuDNN verification or other purposes.
 Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
-                         DeviceMemoryAllocator* device_allocator,
+                         se::DeviceMemoryAllocator* device_allocator,
                          Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
@@ -239,6 +226,7 @@
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
+      pass.AddPass<SliceSinker>();
       pass.AddPass<HloDCE>();
       pass.AddPass<ReshapeMover>();
       pass.AddPass<HloConstantFolding>();
@@ -426,78 +414,6 @@
   return pipeline.Run(hlo_module).status();
 }
 
-// Prints a warning if the ptxas at ptxas_path has known bugs.
-//
-// Only prints a warning the first time it's called for a particular value of
-// ptxas_path.
-void WarnIfBadPtxasVersion(const string& ptxas_path) {
-  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-  static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) =
-      new std::unordered_set<string>();
-
-  tensorflow::mutex_lock lock(mu);
-  if (!seen_ptxas_paths->insert(ptxas_path).second) {
-    // Already checked this ptx binary, nothing to do.
-    return;
-  }
-
-  tensorflow::SubProcess ptxas;
-  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
-  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
-  if (!ptxas.Start()) {
-    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
-    return;
-  }
-
-  string out;
-  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
-                                    /*stderr_output=*/nullptr);
-  if (exit_code != 0) {
-    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
-                 << exit_code;
-    return;
-  }
-
-  int64 vmaj, vmin, vdot;
-  string vmaj_str, vmin_str, vdot_str;
-  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
-                         &vmin_str, &vdot_str) ||
-      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
-      !absl::SimpleAtoi(vmin_str, &vmin) ||
-      !absl::SimpleAtoi(vdot_str, &vdot)) {
-    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
-                 << " --version:\n"
-                 << out;
-    return;
-  }
-
-  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
-  // PTX 6.0.  An older ptxas will just fail to compile any of our code.
-  //
-  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
-  // address calculations with large offsets (e.g. "load ptr + large_constant"),
-  // b/70245379.
-  //
-  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
-  // that appears related to address calculations, b/111107644.  ptxas 9.2.88
-  // appears to work, as far as we can tell.
-  if (vmaj < 9) {
-    LOG(ERROR)
-        << "You are using ptxas 8.x, but XLA requires ptxas 9.x (and strongly "
-           "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
-           "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
-           "binary is sufficient.";
-  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
-    LOG(WARNING)
-        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
-        << vdot
-        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
-           "miscompile XLA code, leading to incorrect results or "
-           "invalid-address errors.\n\nYou do not need to update to CUDA "
-           "9.2.88; cherry-picking the ptxas binary is sufficient.";
-  }
-}
-
 // Prints a warning if the ptx->sass JIT in the driver has known bugs.
 //
 // Using such a driver only a problem if we fail to use ptxas to compile our ptx
@@ -538,80 +454,6 @@
   });
 }
 
-// Compiles the given PTX string using ptxas and returns the resulting machine
-// code (i.e. a cubin) as a byte array.
-StatusOr<std::vector<uint8>> CompilePtx(
-    const string& ptx, int cc_major, int cc_minor,
-    const HloModuleConfig& hlo_module_config) {
-  tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
-  auto env = tensorflow::Env::Default();
-  string ptxas_path;
-  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
-    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
-    VLOG(2) << "Looking for ptxas at " << ptxas_path;
-    if (env->FileExists(ptxas_path).ok()) {
-      break;
-    }
-  }
-  TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
-  VLOG(2) << "Using ptxas at " << ptxas_path;
-
-  WarnIfBadPtxasVersion(ptxas_path);
-
-  // Write ptx into a temporary file.
-  string ptx_path;
-  if (!env->LocalTempFilename(&ptx_path)) {
-    return InternalError("couldn't get temp PTX file name");
-  }
-  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
-    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
-  });
-
-  TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx));
-  VLOG(2) << "ptx written to: " << ptx_path;
-
-  // Invoke ptxas and collect its output.
-  string cubin_path;
-  if (!env->LocalTempFilename(&cubin_path)) {
-    return InternalError("couldn't get temp CUBIN file name");
-  }
-  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
-    // CUBIN file may never be created, so the failure to delete it should not
-    // produce TF error.
-    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
-  });
-  tensorflow::SubProcess ptxas_info_dumper;
-  std::vector<string> ptxas_args = {
-      ptxas_path, ptx_path, "-o", cubin_path,
-      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
-  if (VLOG_IS_ON(2)) {
-    ptxas_args.push_back("-v");
-  }
-  if (hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations()) {
-    ptxas_args.push_back("-O0");
-  }
-  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
-  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
-                                     tensorflow::ACTION_PIPE);
-  if (!ptxas_info_dumper.Start()) {
-    return InternalError("Failed to launch ptxas");
-  }
-  string stderr_output;
-  int exit_status = ptxas_info_dumper.Communicate(
-      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
-  if (exit_status != 0) {
-    return InternalError("ptxas exited with non-zero error code %d",
-                         exit_status);
-  }
-
-  // Read in the result of compilation and return it as a byte vector.
-  string cubin;
-  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
-                                                  cubin_path, &cubin));
-  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
-  return cubin_vector;
-}
 
 }  // namespace
 
@@ -621,11 +463,12 @@
 
 StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   // We dump the post-optimization HLO in RunBackend so no need to dump it here.
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
-  tracing::ScopedActivity activity("HLO Transforms", module->name(),
-                                   /*is_expensive=*/true);
+  tensorflow::profiler::TraceMe activity(
+      [&] { return absl::StrCat("HLO Transforms:", module->name()); },
+      tensorflow::profiler::TraceMeLevel::kInfo);
   TF_RETURN_IF_ERROR(
       OptimizeHloModule(module.get(), stream_exec, device_allocator, this));
 
@@ -636,7 +479,7 @@
 
 StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend");
 
   TF_RET_CHECK(stream_exec != nullptr);
@@ -676,15 +519,11 @@
           [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; },
           /*allow_input_output_aliasing=*/false,
           /*allocate_buffers_for_constants=*/true));
-  if (DumpingEnabledForHloModule(*module)) {
-    DumpToFileInDirOrStdout(*module, "buffer_assignment",
-                            buffer_assignment->ToString());
-  }
   DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations");
 
-  IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
-                                      &stream_exec->GetDeviceDescription(),
-                                      &llvm_module);
+  IrEmitterContext ir_emitter_context(
+      module.get(), buffer_assignment.get(), stream_exec->platform(),
+      &stream_exec->GetDeviceDescription(), &llvm_module);
 
   HloComputation* entry_computation = module->entry_computation();
   IrEmitterUnnested ir_emitter(module->config(), entry_computation,
@@ -764,8 +603,8 @@
     DumpToFileInDirOrStdout(*module, "ptx", ptx);
   }
 
-  const std::vector<uint8> cubin =
-      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor, module->config());
+  const std::vector<uint8> cubin = CompilePtxOrGetCachedResult(
+      stream_exec, ptx, cc_major, cc_minor, module->config());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -805,10 +644,11 @@
 }
 
 std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
-    const string& ptx, int cc_major, int cc_minor,
-    const HloModuleConfig& hlo_module_config) {
+    se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
+    int cc_minor, const HloModuleConfig& hlo_module_config) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
-  tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
+  tensorflow::profiler::TraceMe activity(
+      "PTX->CUBIN", tensorflow::profiler::TraceMeLevel::kInfo);
   bool inserted;
   decltype(compilation_cache_.begin()) iter;
   // Pointers into compilation_cache_ where the ptx and (optional) cubin are
@@ -834,8 +674,8 @@
     if (inserted) {
       CHECK(!cache_value->compilation_done);
       if (!ptx.empty()) {
-        StatusOr<std::vector<uint8>> maybe_cubin =
-            CompilePtx(*cache_ptx, cc_major, cc_minor, hlo_module_config);
+        StatusOr<std::vector<uint8>> maybe_cubin = CompilePtx(
+            stream_exec, *cache_ptx, PtxCompilationOptions(hlo_module_config));
         if (maybe_cubin.ok()) {
           cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
           VLOG(2) << "Compiled PTX size:" << ptx.size()
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index b2077f4..25e4b94 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -33,6 +33,7 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 namespace xla {
 namespace gpu {
@@ -52,11 +53,11 @@
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
@@ -98,8 +99,8 @@
   // Tries to compile the given ptx string to cubin.  Returns a vector with the
   // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
   std::vector<uint8> CompilePtxOrGetCachedResult(
-      const string& ptx, int cc_major, int cc_minor,
-      const HloModuleConfig& hlo_module_config);
+      se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
+      int cc_minor, const HloModuleConfig& hlo_module_config);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice.  This is important for
diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc
index a70ad5a..b222997 100644
--- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc
+++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc
@@ -14,7 +14,21 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"
+
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/kernel_spec.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 namespace xla {
 namespace gpu {
@@ -36,7 +50,7 @@
 
   int64 rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size;
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory allocated_buffer,
+      se::OwningDeviceMemory allocated_buffer,
       memory_allocator_->Allocate(device_ordinal_,
                                   byte_size + 2 * redzone_size_ + rhs_slop,
                                   /*retry_on_failure=*/false));
@@ -78,67 +92,204 @@
   return data_chunk;
 }
 
-Status RedzoneAllocator::CheckRedzones(se::Stream* stream) const {
-  for (const auto& buf_and_size : allocated_buffers_) {
-    const auto& allocated_buf = buf_and_size.first;
-    int64 user_alloc_size = buf_and_size.second;
-    char* addr =
-        reinterpret_cast<char*>(allocated_buf.AsDeviceMemoryBase().opaque());
-    // user_alloc_size isn't necessarily the same as
-    // allocated_buf.size() - 2 * redzone_size_ because if user_alloc_size was
-    // not a multiple of kRhsRedzoneAlign, we rounded it up.
-    se::DeviceMemoryBase buf(addr + redzone_size_, user_alloc_size);
-    TF_RETURN_IF_ERROR(CheckBufferRedzones(buf, stream));
+// PTX blob for the function which checks that every byte in
+// input_buffer (length is buffer_length) is equal to redzone_pattern.
+//
+// On mismatch, increment the counter pointed to by out_mismatch_cnt_ptr.
+//
+// Generated from:
+// __global__ void redzone_checker(unsigned char* input_buffer,
+//                                 unsigned char redzone_pattern,
+//                                 unsigned long long buffer_length,
+//                                 int* out_mismatch_cnt_ptr) {
+//   unsigned long long idx = threadIdx.x + blockIdx.x * blockDim.x;
+//   if (idx >= buffer_length) return;
+//   if (input_buffer[idx] != redzone_pattern) atomicAdd(out_mismatch_cnt_ptr, 1);
+// }
+//
+// Code must compile for the oldest GPU XLA may be compiled for.
+static const char* redzone_checker_ptx = R"(
+.version 4.2
+.target sm_30
+.address_size 64
+
+.visible .entry redzone_checker(
+  .param .u64 input_buffer,
+  .param .u8 redzone_pattern,
+  .param .u64 buffer_length,
+  .param .u64 out_mismatch_cnt_ptr
+)
+{
+  .reg .pred   %p<3>;
+  .reg .b16   %rs<3>;
+  .reg .b32   %r<6>;
+  .reg .b64   %rd<8>;
+
+  ld.param.u64   %rd6, [buffer_length];
+  mov.u32   %r1, %tid.x;
+  mov.u32   %r2, %ctaid.x;
+  mov.u32   %r3, %ntid.x;
+  mad.lo.s32   %r4, %r3, %r2, %r1;
+  cvt.u64.u32   %rd3, %r4;
+  setp.ge.u64   %p1, %rd3, %rd6;
+  @%p1 bra   LBB6_3;
+  ld.param.u8   %rs1, [redzone_pattern];
+  ld.param.u64   %rd4, [input_buffer];
+  cvta.to.global.u64   %rd2, %rd4;
+  add.s64   %rd7, %rd2, %rd3;
+  ld.global.u8   %rs2, [%rd7];
+  setp.eq.s16   %p2, %rs2, %rs1;
+  @%p2 bra   LBB6_3;
+  ld.param.u64   %rd5, [out_mismatch_cnt_ptr];
+  cvta.to.global.u64   %rd1, %rd5;
+  atom.global.add.u32   %r5, [%rd1], 1;
+LBB6_3:
+  ret;
+}
+)";
+
+// The PTX in redzone_checker_ptx has to be launched with specified types
+// in the specified order.
+using ComparisonKernelT = se::TypedKernel<se::DeviceMemory<uint8>, uint8,
+                                          uint64, se::DeviceMemory<uint64>>;
+
+// Copies `redzone` to the host and compares it against redzone_pattern, first
+// eight bytes at a time and then byte by byte for the tail.  Slower than the
+// device-side check, but the error names the first mismatching offset.
+static Status CheckRedzoneHost(se::DeviceMemoryBase redzone,
+                               se::DeviceMemoryBase user_allocation,
+                               absl::string_view name, se::Stream* stream,
+                               uint8 redzone_pattern, int64 redzone_size) {
+  uint64 size = redzone.size();
+  auto redzone_data = absl::make_unique<uint8[]>(size);
+  TF_RETURN_IF_ERROR(stream->ThenMemcpy(redzone_data.get(), redzone, size)
+                         .BlockHostUntilDone());
+  XLA_SCOPED_LOGGING_TIMER("RedzoneAllocator::CheckBufferRedzones CPU loop.");
+  // Replicate the one-byte pattern into a 64-bit word for the wide compare.
+  std::array<uint8, sizeof(uint64)> pattern_arr;
+  pattern_arr.fill(redzone_pattern);
+  uint64 pattern64;
+  std::memcpy(&pattern64, pattern_arr.data(), sizeof(uint64));
+  // `i` outlives the first loop so the tail loop can resume from it.
+  int64 i;
+  for (i = 0; i + 7 < size; i += sizeof(uint64)) {
+    uint64 rz_value = *reinterpret_cast<uint64*>(&redzone_data[i]);
+    if (rz_value != pattern64) {
+      return InternalError(
+          "Redzone mismatch in %s redzone of buffer %p at offset %d; "
+          "expected %08x but was %08x.",
+          name, user_allocation.opaque(), i, pattern64, rz_value);
+    }
+  }
+  for (; i < size; ++i) {
+    uint8 rz_value = redzone_data[i];
+    if (rz_value != redzone_pattern) {
+      return InternalError(
+          "Redzone mismatch in %s redzone of buffer %p at offset %d; "
+          "expected %08x but was %08x.",
+          name, user_allocation.opaque(), i, redzone_pattern, rz_value);
+    }
   }
   return Status::OK();
 }
 
-Status RedzoneAllocator::CheckBufferRedzones(se::DeviceMemoryBase buf,
-                                             se::Stream* stream) const {
-  XLA_SCOPED_LOGGING_TIMER("RedzoneAllocator::CheckBufferRedzones.");
-  char* buf_start = reinterpret_cast<char*>(buf.opaque());
-  auto check_redzone = [&](int64 offset, int64 size, absl::string_view name) {
-    se::DeviceMemoryBase redzone(buf_start + offset, size);
-    auto redzone_data = absl::make_unique<uint8[]>(size);
-    TF_RETURN_IF_ERROR(stream->ThenMemcpy(redzone_data.get(), redzone, size)
-                           .BlockHostUntilDone());
-    XLA_SCOPED_LOGGING_TIMER("RedzoneAllocator::CheckBufferRedzones CPU loop.");
+// Enqueues the redzone checker kernel on `stream` for the given redzone.
+//
+// The kernel increments the counter behind out_param per mismatched byte.
+static Status RunRedzoneChecker(se::Stream* stream,
+                                const se::DeviceMemory<uint8>& redzone,
+                                uint8 redzone_pattern,
+                                const se::DeviceMemory<uint64>& out_param,
+                                const ComparisonKernelT& comparison_kernel) {
+  se::StreamExecutor* executor = stream->parent();
+  Shape redzone_shape = ShapeUtil::MakeShape(
+      PrimitiveType::U8, {static_cast<int64>(redzone.size())});
+  LaunchDimensions dim = CalculateLaunchDimensions(
+      redzone_shape, executor->GetDeviceDescription());
 
-    std::array<uint8, sizeof(uint64)> pattern_arr;
-    pattern_arr.fill(redzone_pattern_);
-    uint64 pattern64;
-    std::memcpy(&pattern64, pattern_arr.data(), sizeof(uint64));
+  stream->ThenLaunch(se::ThreadDim(dim.threads_per_block()),
+                     se::BlockDim(dim.block_count()), comparison_kernel,
+                     redzone, redzone_pattern, redzone.size(), out_param);
 
-    int64 i;
-    for (i = 0; i + 7 < size; i += sizeof(uint64)) {
-      uint64 rz_value = *reinterpret_cast<uint64*>(&redzone_data[i]);
-      if (rz_value != pattern64) {
-        return InternalError(
-            "Redzone mismatch in %s redzone of buffer %p at offset %d; "
-            "expected %08x but was %08x.",
-            name, buf.opaque(), i, pattern64, rz_value);
-      }
-    }
-    for (; i < size; ++i) {
-      uint8 rz_value = redzone_data[i];
-      if (rz_value != redzone_pattern_) {
-        return InternalError(
-            "Redzone mismatch in %s redzone of buffer %p at offset %d; "
-            "expected %08x but was %08x.",
-            name, buf.opaque(), i, redzone_pattern_, rz_value);
-      }
-    }
-    return Status::OK();
-  };
+  return Status::OK();
+}
 
-  // `buf` points to the buffer returned to the user, so the LHS redzone starts
-  // before `buf`.
-  TF_RETURN_IF_ERROR(check_redzone(-redzone_size_, redzone_size_, "LHS"));
-
+// Checks the LHS and RHS redzones around one user allocation.
+//
+// On a device-detected mismatch, rechecks on the host for a detailed error.
+static Status CheckRedzonesForBuffer(se::Stream* stream,
+                                     se::DeviceMemoryBase memory,
+                                     const se::DeviceMemory<uint64>& out_param,
+                                     const ComparisonKernelT& comparison_kernel,
+                                     int64 user_allocation_size,
+                                     uint64 redzone_size,
+                                     uint8 redzone_pattern) {
+  se::StreamExecutor* executor = stream->parent();
   int64 rhs_slop =
-      RoundUpToNearest<int64>(buf.size(), kRhsRedzoneAlign) - buf.size();
-  TF_RETURN_IF_ERROR(
-      check_redzone(buf.size(), redzone_size_ + rhs_slop, "RHS"));
+      RoundUpToNearest<int64>(user_allocation_size, kRhsRedzoneAlign) -
+      user_allocation_size;
+  CHECK_EQ(memory.size(), user_allocation_size + rhs_slop + 2 * redzone_size);
+
+  se::DeviceMemory<uint8> buffer_uint8(memory);
+  se::DeviceMemory<uint8> lhs_redzone =
+      executor->GetSubBuffer(&buffer_uint8, 0, redzone_size);
+  se::DeviceMemory<uint8> user_allocation =
+      executor->GetSubBuffer(&buffer_uint8, redzone_size, user_allocation_size);
+  se::DeviceMemory<uint8> rhs_redzone =
+      executor->GetSubBuffer(&buffer_uint8, redzone_size + user_allocation_size,
+                             redzone_size + rhs_slop);
+
+  TF_RETURN_IF_ERROR(RunRedzoneChecker(stream, lhs_redzone, redzone_pattern,
+                                       out_param, comparison_kernel));
+  TF_RETURN_IF_ERROR(RunRedzoneChecker(stream, rhs_redzone, redzone_pattern,
+                                       out_param, comparison_kernel));
+  int64 result;
+  CHECK_EQ(out_param.size(), sizeof(result));
+  stream->ThenMemcpy(&result, out_param, sizeof(result));
+  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+
+  if (result != 0) {
+    TF_RETURN_IF_ERROR(CheckRedzoneHost(lhs_redzone, user_allocation, "LHS",
+                                        stream, redzone_pattern, redzone_size));
+    TF_RETURN_IF_ERROR(CheckRedzoneHost(rhs_redzone, user_allocation, "RHS",
+                                        stream, redzone_pattern, redzone_size));
+    LOG(FATAL) << "Mismatched results with host and device comparison";
+  }
+
+  return Status::OK();
+}
+
+Status RedzoneAllocator::CheckRedzones(se::Stream* stream) const {
+  XLA_SCOPED_LOGGING_TIMER("Redzone checking");
+
+  se::StreamExecutor* executor = stream->parent();
+  // NOTE(review): presumably compiles the checker PTX once and caches it.
+  TF_ASSIGN_OR_RETURN(
+      absl::Span<const uint8> compiled_ptx,
+      CompilePtxOrGetCached(executor, redzone_checker_ptx,
+                            PtxCompilationOptions(hlo_module_config_)));
+  // Device-side mismatch counter, zeroed here and shared by every launch.
+  se::ScopedDeviceMemory<uint64> out_param =
+      executor->AllocateOwnedScalar<uint64>();
+  stream->ThenMemZero(out_param.ptr(), sizeof(uint64));
+  // Build the typed kernel object from the PTX and its compiled form.
+  auto typed_or = CreateTypedKernel<se::DeviceMemory<uint8>, uint8, uint64,
+                                    se::DeviceMemory<uint64>>(
+      "redzone_checker", redzone_checker_ptx, compiled_ptx, executor);
+
+  // TF_ASSIGN_OR_RETURN does not work due to complex template.
+  if (!typed_or.ok()) {
+    return typed_or.status();
+  }
+  std::unique_ptr<ComparisonKernelT> comparison_kernel =
+      std::move(typed_or.ValueOrDie());
+  // Check every allocation; the first failure is returned immediately.
+  for (const auto& buf_and_size : allocated_buffers_) {
+    TF_RETURN_IF_ERROR(CheckRedzonesForBuffer(
+        stream, buf_and_size.first.AsDeviceMemoryBase(), out_param.cref(),
+        *comparison_kernel, buf_and_size.second, redzone_size_,
+        redzone_pattern_));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h
index d8b438c..4e3438c 100644
--- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h
+++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h
@@ -18,11 +18,12 @@
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 namespace gpu {
@@ -40,14 +41,17 @@
 // memory for cudnn convolutions.
 class RedzoneAllocator : public se::ScratchAllocator {
  public:
-  RedzoneAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator,
+  RedzoneAllocator(int device_ordinal,
+                   se::DeviceMemoryAllocator* memory_allocator,
+                   const HloModuleConfig& hlo_module_config,
                    int64 redzone_size = 1 << 23,  // 8MiB per side, 16MiB total
                    uint8 redzone_pattern = -1)
       : device_ordinal_(device_ordinal),
         redzone_size_(
             RoundUpToNearest(redzone_size, kXlaAllocatedBufferAlignBytes)),
         redzone_pattern_(redzone_pattern),
-        memory_allocator_(memory_allocator) {}
+        memory_allocator_(memory_allocator),
+        hlo_module_config_(hlo_module_config) {}
 
   // Redzones don't count towards the memory limit.
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
@@ -64,11 +68,6 @@
   Status CheckRedzones(se::Stream* stream) const;
 
  private:
-  // Checks that one buffer's redzones are unmodified.  buf should point to the
-  // user-editable buffer, i.e. it should not include redzones.
-  Status CheckBufferRedzones(se::DeviceMemoryBase buf,
-                             se::Stream* stream) const;
-
   const int device_ordinal_;
 
   // Redzone size on *one side* of allocation.
@@ -78,13 +77,14 @@
   const int64 redzone_size_;
 
   const uint8 redzone_pattern_;
-  DeviceMemoryAllocator* memory_allocator_;
+  se::DeviceMemoryAllocator* memory_allocator_;
+  const HloModuleConfig& hlo_module_config_;
 
   // The second element of the pair is the size of the user allocation.  This
   // isn't necessarily just first.size() - 2 * redzone_size_ because when the
   // user allocation size is not a multiple of 4 bytes, we round up the size of
   // the RHS redzone.
-  std::vector<std::pair<OwningDeviceMemory, int64>> allocated_buffers_;
+  std::vector<std::pair<se::OwningDeviceMemory, int64>> allocated_buffers_;
 
   int64 allocated_bytes_excluding_redzones_ = 0;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc
index 4eebab9..a3b0ac3 100644
--- a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc
@@ -14,12 +14,14 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 
@@ -39,9 +41,10 @@
   se::Platform* platform =
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
-  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
-  RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, kRedzoneSize,
-                             kRedzonePattern);
+  HloModuleConfig config;
+  se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
+                             kRedzoneSize, kRedzonePattern);
 
   se::Stream stream(stream_exec);
   stream.Init();
@@ -103,6 +106,27 @@
   modify_redzone(rhs_redzone, /*offset=*/kRedzoneSize - 1, "rhs");
 }
 
+// Older CUDA compute capabilities (<= 2.0) have a limitation that grid
+// dimension X cannot be larger than 65535.
+//
+// Make sure we can launch kernels on sizes larger than that, given that the
+// maximum number of threads per block is 1024.
+TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
+  // Make sure the redzone size would require grid dimension > 65535.
+  constexpr int64 kRedzoneSize = 65535 * 1024 + 1;
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
+  HloModuleConfig config;
+  se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
+                             kRedzoneSize, /*redzone_pattern=*/-1);
+  se::Stream stream(stream_exec);
+  stream.Init();
+  (void)allocator.AllocateBytes(&stream, /*byte_size=*/1);
+  TF_EXPECT_OK(allocator.CheckRedzones(&stream));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc
index 197367e..7a32204 100644
--- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc
+++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc
@@ -29,7 +29,7 @@
             byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                       memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                   /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h
index 620c7e7..a22e7f5 100644
--- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h
+++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h
@@ -18,18 +18,19 @@
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 namespace gpu {
 
 class ScratchAllocator : public se::ScratchAllocator {
  public:
-  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+  ScratchAllocator(int device_ordinal,
+                   se::DeviceMemoryAllocator* memory_allocator)
       : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
@@ -50,8 +51,8 @@
 
  private:
   const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
+  se::DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
index c8d3916..efdcf15 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
@@ -15,9 +15,18 @@
 
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/cuda_libdevice_path.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/stream_executor/kernel_spec.h"
 
 namespace xla {
 namespace gpu {
@@ -180,5 +189,243 @@
   return tensorflow::mutex_lock{it->second};
 }
 
+StatusOr<std::unique_ptr<se::KernelBase>> CreateKernel(
+    absl::string_view kernel_name, uint64 num_args, absl::string_view ptx,
+    absl::Span<const uint8> cubin_data, se::StreamExecutor* stream_exec) {
+  se::MultiKernelLoaderSpec loader_spec(num_args);
+  loader_spec.AddCudaPtxInMemory(ptx, kernel_name);
+
+  if (!cubin_data.empty()) {
+    loader_spec.AddCudaCubinInMemory(
+        reinterpret_cast<const char*>(cubin_data.data()), kernel_name);
+  }
+
+  auto kernel_base = absl::make_unique<se::KernelBase>(stream_exec);
+  if (!stream_exec->GetKernel(loader_spec, kernel_base.get())) {
+    return InternalError("Unable to load kernel '%s'", kernel_name);
+  }
+
+  return std::move(kernel_base);
+}
+
+Status ExecuteKernelOnStream(const se::KernelBase& kernel,
+                             absl::Span<const se::DeviceMemoryBase> args,
+                             int64 threads_per_block, int64 block_count,
+                             se::Stream* stream) {
+  static constexpr int kKernelArgsLimit = 1024;
+  auto kernel_args = absl::make_unique<se::KernelArgsArray<kKernelArgsLimit>>();
+  for (const se::DeviceMemoryBase& buf : args) {
+    kernel_args->add_device_memory_argument(buf);
+  }
+
+  if (!stream->parent()->Launch(stream, se::ThreadDim(threads_per_block),
+                                se::BlockDim(block_count), kernel,
+                                *kernel_args)) {
+    return InternalError("Unable to launch kernel");
+  }
+  return Status::OK();
+}
+
+// Prints a warning if the ptxas at ptxas_path has known bugs.
+//
+// Only prints a warning the first time it's called for a particular value of
+// ptxas_path.
+//
+// Holds a function-local mutex for the entire call, including the ptxas run.
+void WarnIfBadPtxasVersion(const string& ptxas_path) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) =
+      new std::unordered_set<string>();
+
+  tensorflow::mutex_lock lock(mu);
+  if (!seen_ptxas_paths->insert(ptxas_path).second) {
+    // Already checked this ptxas binary, nothing to do.
+    return;
+  }
+
+  tensorflow::SubProcess ptxas;
+  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
+  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
+  if (!ptxas.Start()) {
+    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
+    return;
+  }
+
+  string out;
+  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
+                                    /*stderr_output=*/nullptr);
+  if (exit_code != 0) {
+    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
+                 << exit_code;
+    return;
+  }
+
+  int64 vmaj, vmin, vdot;
+  string vmaj_str, vmin_str, vdot_str;
+  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
+                         &vmin_str, &vdot_str) ||
+      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
+      !absl::SimpleAtoi(vmin_str, &vmin) ||
+      !absl::SimpleAtoi(vdot_str, &vdot)) {
+    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
+                 << " --version:\n"
+                 << out;
+    return;
+  }
+
+  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
+  // PTX 6.0.  An older ptxas will just fail to compile any of our code.
+  //
+  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
+  // address calculations with large offsets (e.g. "load ptr + large_constant"),
+  // b/70245379.
+  //
+  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
+  // that appears related to address calculations, b/111107644.  ptxas 9.2.88
+  // appears to work, as far as we can tell.
+  if (vmaj < 9) {
+    LOG(ERROR) << "You are using ptxas " << vmaj << "." << vmin << "." << vdot
+               << ", but XLA requires ptxas 9.x (and strongly prefers >= "
+                  "9.2.88).  Compilation of XLA kernels below will likely "
+                  "fail.\n\nYou do not need to update CUDA; cherry-picking "
+                  "the ptxas binary is sufficient.";
+  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
+    LOG(WARNING)
+        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
+        << vdot
+        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
+           "miscompile XLA code, leading to incorrect results or "
+           "invalid-address errors.\n\nYou do not need to update to CUDA "
+           "9.2.88; cherry-picking the ptxas binary is sufficient.";
+  }
+}
+
+// Returns the ordered list of directories that may hold a CUDA installation.
+// The location given in compile_ptx_options (when non-empty) comes first, then
+// the tensorflow CUDA locations, and finally "." as a last resort.
+std::vector<string> GetCudaRootCandidates(
+    PtxCompilationOptions compile_ptx_options) {
+  std::vector<string> roots;
+  // A CUDA location explicitly specified by the user via
+  // --xla_gpu_cuda_data_dir has the highest priority.
+  const string& user_dir = compile_ptx_options.xla_gpu_cuda_data_dir;
+  if (!user_dir.empty()) {
+    roots.push_back(user_dir);
+  }
+  for (const string& root : tensorflow::CandidateCudaRoots()) {
+    roots.push_back(root);
+  }
+  // "." is our last resort, even though it probably won't work.
+  roots.push_back(".");
+  return roots;
+}
+
+StatusOr<absl::Span<const uint8>> CompilePtxOrGetCached(
+    se::StreamExecutor* executor, absl::string_view ptx,
+    PtxCompilationOptions compilation_options) {
+  using PtxCacheKey = std::tuple<se::StreamExecutor*, std::string,
+                                 PtxCompilationOptions::PtxOptionsTuple>;
+  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
+  static auto& ptx_cache GUARDED_BY(ptx_cache_mutex) =
+      *new absl::flat_hash_map<PtxCacheKey, std::vector<uint8>>();
+
+  tensorflow::mutex_lock lock(ptx_cache_mutex);
+  PtxCacheKey cache_key{executor, std::string(ptx),
+                        compilation_options.ToTuple()};
+  auto it = ptx_cache.find(cache_key);
+  if (it == ptx_cache.end()) {
+    TF_ASSIGN_OR_RETURN(std::vector<uint8> compiled,
+                        CompilePtx(executor, ptx, compilation_options));
+    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
+  }
+
+  CHECK(it != ptx_cache.end());
+  const std::vector<uint8>& compiled = it->second;
+  return absl::MakeSpan(compiled);
+}
+
+// Runs the first ptxas found at <cuda_root>/bin/ptxas among the CUDA root
+// candidates on `ptx` and returns the cubin it writes; temp files are deleted.
+StatusOr<std::vector<uint8>> CompilePtx(
+    se::StreamExecutor* stream_exec, absl::string_view ptx,
+    PtxCompilationOptions compile_ptx_options) {
+  int cc_major, cc_minor;
+  if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                   &cc_minor)) {
+    LOG(WARNING)
+        << "Couldn't get compute capability for device; assuming sm_20.";
+    cc_major = 2;
+    cc_minor = 0;
+  }
+
+  tensorflow::profiler::TraceMe activity(
+      "Compile PTX", tensorflow::profiler::TraceMeLevel::kInfo);
+  auto env = tensorflow::Env::Default();
+  string ptxas_path;
+  for (const string& cuda_root : GetCudaRootCandidates(compile_ptx_options)) {
+    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
+    VLOG(2) << "Looking for ptxas at " << ptxas_path;
+    if (env->FileExists(ptxas_path).ok()) {
+      break;
+    }
+  }
+  TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
+  VLOG(2) << "Using ptxas at " << ptxas_path;
+
+  WarnIfBadPtxasVersion(ptxas_path);
+
+  // Write ptx into a temporary file.
+  string ptx_path;
+  if (!env->LocalTempFilename(&ptx_path)) {
+    return InternalError("couldn't get temp PTX file name");
+  }
+  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
+    // Best effort: the file may not exist if writing the PTX below failed.
+    tensorflow::Env::Default()->DeleteFile(ptx_path).IgnoreError();
+  });
+
+  TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx));
+  VLOG(2) << "ptx written to: " << ptx_path;
+
+  // Invoke ptxas and collect its output.
+  string cubin_path;
+  if (!env->LocalTempFilename(&cubin_path)) {
+    return InternalError("couldn't get temp CUBIN file name");
+  }
+  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
+    // The CUBIN may never be created, so a failed delete is not an error.
+    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
+  });
+  tensorflow::SubProcess ptxas_info_dumper;
+  std::vector<string> ptxas_args = {
+      ptxas_path, ptx_path, "-o", cubin_path,
+      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
+  if (VLOG_IS_ON(2)) {
+    ptxas_args.push_back("-v");
+  }
+  if (compile_ptx_options.xla_gpu_disable_ptxas_optimizations) {
+    ptxas_args.push_back("-O0");
+  }
+  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
+  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
+                                     tensorflow::ACTION_PIPE);
+  if (!ptxas_info_dumper.Start()) {
+    return InternalError("Failed to launch ptxas");
+  }
+  string stderr_output;
+  int exit_status = ptxas_info_dumper.Communicate(
+      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
+  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
+  if (exit_status != 0) {
+    return InternalError("ptxas exited with non-zero error code %d",
+                         exit_status);
+  }
+
+  // Read in the result of compilation and return it as a byte vector.
+  string cubin;
+  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, cubin_path, &cubin));
+  return std::vector<uint8>(cubin.begin(), cubin.end());
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
index 8dbd7d7..2f9bcba 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
@@ -16,11 +16,15 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/kernel_spec.h"
 
 // Helper functions for interacting with StreamExecutor.
 
@@ -53,6 +57,100 @@
 // device while another thread is using it.
 tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec);
 
+// Creates a kernel which can be launched with stream.ThenLaunch, such that
+// the types of the arguments provided for launch would have to match
+// types of the arguments provided at creation time.
+//
+// The kernel has a name kernel_name, and is based from provided PTX in ptx,
+// and (optional) compiled PTX in cubin_data.
+// The canonical storage for both ptx and cubin_data should outlive the
+// lifetime of the kernel.
+//
+// This is a preferred API since it provides type safety for kernel launches.
+template <typename... Args>
+StatusOr<std::unique_ptr<se::TypedKernel<Args...>>> CreateTypedKernel(
+    absl::string_view kernel_name, absl::string_view ptx,
+    absl::Span<const uint8> cubin_data, se::StreamExecutor* stream_exec) {
+  auto kernel_base = absl::make_unique<se::TypedKernel<Args...>>(stream_exec);
+  se::MultiKernelLoaderSpec loader_spec(kernel_base->kNumberOfParameters);
+  loader_spec.AddCudaPtxInMemory(ptx, kernel_name);
+
+  if (!cubin_data.empty()) {
+    loader_spec.AddCudaCubinInMemory(
+        reinterpret_cast<const char*>(cubin_data.data()), kernel_name);
+  }
+
+  if (!stream_exec->GetKernel(loader_spec, kernel_base.get())) {
+    return InternalError("Unable to load kernel '%s'", kernel_name);
+  }
+
+  return std::move(kernel_base);
+}
+
+// Creates a kernel with a provided name, based from provided PTX in ptx.
+// The kernel should be executed using the provided executor.
+// The argument cubin_data represents compiled PTX and may be left empty.
+//
+// The canonical storage for both ptx and cubin_data should outlive
+// the lifetime of the kernel.
+StatusOr<std::unique_ptr<se::KernelBase>> CreateKernel(
+    absl::string_view kernel_name, uint64 num_args, absl::string_view ptx,
+    absl::Span<const uint8> cubin_data, se::StreamExecutor* stream_exec);
+
+// Runs loaded kernel on the stream with the provided arguments.
+Status ExecuteKernelOnStream(const se::KernelBase& kernel,
+                             absl::Span<const se::DeviceMemoryBase> args,
+                             int64 threads_per_block, int64 block_count,
+                             se::Stream* stream);
+
+// Options that control how PTX is compiled by ptxas.
+struct PtxCompilationOptions {
+  bool xla_gpu_disable_ptxas_optimizations;  // If true, ptxas runs with -O0.
+  std::string xla_gpu_cuda_data_dir;  // Preferred CUDA root; may be empty.
+
+  using PtxOptionsTuple = std::tuple<bool, std::string>;
+
+  explicit PtxCompilationOptions(const HloModuleConfig& hlo_module_config)
+      : xla_gpu_disable_ptxas_optimizations(
+            hlo_module_config.debug_options()
+                .xla_gpu_disable_ptxas_optimizations()),
+        xla_gpu_cuda_data_dir(
+            hlo_module_config.debug_options().xla_gpu_cuda_data_dir()) {}
+
+  // For comparison and hashing; const so it works on const options objects.
+  PtxOptionsTuple ToTuple() const {
+    return std::make_tuple(xla_gpu_disable_ptxas_optimizations,
+                           xla_gpu_cuda_data_dir);
+  }
+};
+
+// Compiles the given PTX string using ptxas and returns the resulting machine
+// code (i.e. a cubin) as a byte array.
+//
+// Queries stream executor stream_exec to get CUDA compute capability from the
+// device.
+//
+// compile_ptx_options is used to query for the CUDA location in case it is
+// customized in a passed flag, and for controlling ptxas optimizations.
+// It can be constructed from HloModuleConfig.
+StatusOr<std::vector<uint8>> CompilePtx(
+    se::StreamExecutor* stream_exec, absl::string_view ptx,
+    PtxCompilationOptions compile_ptx_options);
+
+// Same as CompilePtx, but caches the result, and returns unowned view of
+// the compiled binary.
+//
+// A copy of the string provided in ptx will be made.
+StatusOr<absl::Span<const uint8>> CompilePtxOrGetCached(
+    se::StreamExecutor* executor, absl::string_view ptx,
+    PtxCompilationOptions compilation_options);
+
+// Returns a vector of potential locations of the CUDA root directory.
+// Searches through tensorflow CUDA locations AND through the CUDA location
+// specified in compile_ptx_options (can be constructed from HloModuleConfig).
+std::vector<string> GetCudaRootCandidates(
+    PtxCompilationOptions compile_ptx_options);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index d798b31..b6ce15b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -200,8 +200,8 @@
     tags = tf_cuda_tests_tags(),
     deps = [
         ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla/service:gpu_plugin",
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
         "//tensorflow/compiler/xla/tests:filecheck",
         "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
index 672c68e..914b81c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
@@ -16,7 +16,7 @@
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
index f43e059..5a9b7bd 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -34,6 +34,8 @@
       return "kCudnnBatchNormForwardInference";
     case Thunk::kCudnnBatchNormForwardTraining:
       return "kCudnnBatchNormForwardTraining";
+    case Thunk::kCustomCall:
+      return "kCustomCall";
     case Thunk::kNcclAllReduce:
       return "kNcclAllReduce";
     case Thunk::kFft:
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 6a35f22..bdd0671 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -49,13 +49,14 @@
     kCudnnBatchNormBackward,
     kCudnnBatchNormForwardInference,
     kCudnnBatchNormForwardTraining,
-    kNcclAllReduce,
+    kCustomCall,
     kFft,
     kGemm,
     kInfeed,
     kKernel,
     kMemset32BitValue,
     kMemzero,
+    kNcclAllReduce,
     kOutfeed,
     kSequential,
     kTriangularSolve,
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 4fca981..2af8e1d 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -251,7 +251,26 @@
       // We can only share with the operand buffer if it is about to be freed;
       // we must be the last user of the buffer.
       bool shared = false;
-      if (options_.may_reuse_operand_buffers) {
+      auto shared_it = shared_buffers_.find(buffer);
+      if (shared_it != shared_buffers_.end()) {
+        std::shared_ptr<SharedGroup> group = shared_it->second;
+        if (group->refcount != 0) {
+          // This buffer has a shared group with already some instructions
+          // scheduled (refcount > 0), find and share buffer with the
+          // canonical instruction.
+          shared = true;
+          VLOG(3) << "  Sharing: " << buffer->ToString()
+                  << " with must aliased buffer "
+                  << group->canonical->ToString();
+          FillDebugTrace(HeapSimulatorTrace::Event::SHARE_WITH, buffer,
+                         instruction, group->canonical);
+        } else {
+          VLOG(3) << "  New shared group, canonical buffer: "
+                  << buffer->ToString();
+          group->canonical = buffer;
+        }
+        group->refcount++;
+      } else if (options_.may_reuse_operand_buffers) {
         for (const BufferValue* operand_buffer : operand_buffers_to_free) {
           if (reused_buffers.contains(operand_buffer)) {
             continue;
@@ -261,12 +280,17 @@
               points_to_analysis.CanShareOperandBufferWithUser(
                   operand_buffer->instruction(), operand_buffer->index(),
                   buffer->instruction(), buffer->index())) {
-            VLOG(3) << "  Sharing: " << buffer->ToString() << " with "
-                    << operand_buffer->ToString();
-            ShareBuffer(buffer, operand_buffer, instruction);
-            shared = true;
-            reused_buffers.insert(operand_buffer);
-            break;
+            // Make sure the two buffers belong to the same shared groups.
+            // Otherwise we'd need to merge those shared groups which is not
+            // supported.
+            if (InSameSharedGroup(buffer, operand_buffer)) {
+              VLOG(3) << "  Sharing: " << buffer->ToString() << " with "
+                      << operand_buffer->ToString();
+              ShareBuffer(buffer, operand_buffer, instruction);
+              shared = true;
+              reused_buffers.insert(operand_buffer);
+              break;
+            }
           }
         }
       }
@@ -358,6 +382,17 @@
       options_(options),
       schedule_(schedule),
       memory_by_computation_(memory_by_computation) {
+  for (const BufferValueFlatSet& value_set : options.must_alias_sets) {
+    auto group = std::make_shared<SharedGroup>();
+    group->refcount = 0;
+    VLOG(2) << "Shared buffers:";
+    for (const BufferValue* buffer_value : value_set) {
+      VLOG(2) << "    " << buffer_value->ToString();
+      shared_buffers_.emplace(buffer_value, group);
+      // Refcounts are not incremented here as buffers are shared but not
+      // referenced yet.
+    }
+  }
   debug_trace_.set_whole_module_simulation(schedule_ != nullptr);
 }
 
@@ -402,9 +437,13 @@
   if (shared_it != shared_buffers_.end()) {
     std::shared_ptr<SharedGroup> group = shared_it->second;
     --group->refcount;
+    VLOG(3) << "    Decrementing refcount : " << group->canonical->ToString();
     if (group->refcount > 0) {
+      // Another buffer still holds the reference to this shared group, don't
+      // free the underlying canonical buffer.
       return;
     }
+    VLOG(3) << "    Ref == 0 " << group->canonical->ToString();
     CHECK_EQ(group->refcount, 0)
         << "Free caused negative refcount on shared buffer: " << *buffer;
     buffer = group->canonical;
@@ -423,6 +462,21 @@
   FillDebugTrace(HeapSimulatorTrace::Event::FREE, buffer, instruction, nullptr);
 }
 
+bool HeapSimulator::InSameSharedGroup(const BufferValue* left,
+                                      const BufferValue* right) {
+  // Look up the shared group (if any) recorded for each buffer.
+  const auto lhs_it = shared_buffers_.find(left);
+  const auto rhs_it = shared_buffers_.find(right);
+
+  // A buffer that has no shared group never conflicts with the other one.
+  const bool both_grouped =
+      lhs_it != shared_buffers_.end() && rhs_it != shared_buffers_.end();
+  if (!both_grouped) {
+    return true;
+  }
+  return lhs_it->second == rhs_it->second;
+}
+
 // ShareBuffer associates buffers with their SharedGroup in shared_buffers_.
 // The 'buffer' must be a non-allocated, non-freed buffer, just like in calls to
 // Alloc.  The 'shared' buffer must be a previously allocated or shared buffer.
@@ -445,6 +499,12 @@
     // The 'shared' buffer already has a group; it might be the canonical, but
     // also might not be.  Just add 'buffer' to the existing group.
     std::shared_ptr<SharedGroup> group = shared_it->second;
+
+    if (group->refcount == 0) {
+      // Nothing is scheduled at the shared group yet. This must be the
+      // canonical.
+      group->canonical = shared;
+    }
     canonical = group->canonical;
     ++group->refcount;
     shared_buffers_.emplace(buffer, group);
@@ -475,7 +535,7 @@
     for (const auto& share_pair : shared_buffers_) {
       const BufferValue* buffer = share_pair.first;
       std::shared_ptr<SharedGroup> group = share_pair.second;
-      if (buffer != group->canonical) {
+      if (buffer != group->canonical && group->canonical != nullptr) {
         // The canonical must already exist in the chunk_map, since we called
         // Alloc(canonical) on the underlying algorithm.  Add non-canonical
         // chunks with the same offset as the canonical.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 3e0631a..ef1a62e 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -85,6 +85,9 @@
     // If 'buffers_to_assign' is provided, only those buffers are assigned
     // offsets, otherwise all buffers defined by the instructions are assigned.
     const BufferValueFlatSet* buffers_to_assign;
+    // A vector of multiple buffer value sets. Each set enforces a must-alias
+    // relationship for all buffers inside them.
+    std::vector<BufferValueFlatSet> must_alias_sets;
   };
 
   // Returns the minimum memory required to compute an HLO module where all
@@ -153,6 +156,11 @@
   void Free(const BufferValue* buffer, const HloInstruction* instruction);
   void ShareBuffer(const BufferValue* buffer, const BufferValue* shared,
                    const HloInstruction* instruction);
+
+  // Returns true if:
+  //  Two buffers belong to the same shared group.
+  //  Either of the buffers has no shared group assigned.
+  bool InSameSharedGroup(const BufferValue* left, const BufferValue* right);
   Result Finish();
 
   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 2f16280..8cb70a1 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -259,7 +259,8 @@
   // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
-      const std::vector<HloInstruction*>& instruction_sequence) {
+      const std::vector<HloInstruction*>& instruction_sequence,
+      const std::vector<HloInstruction*>& must_alias_set = {}) {
     HloModuleConfig config;
     module_ = absl::make_unique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
@@ -272,10 +273,19 @@
     auto zero_size = [](const BufferValue& buffer) { return 0; };
     auto algorithm = absl::make_unique<DecreasingSizeRunsHeap>(
         absl::make_unique<HeapCallRecorder>(&actual_calls_));
+    BufferValueFlatSet must_alias_buffer_value_set;
+
+    for (HloInstruction* hlo : must_alias_set) {
+      must_alias_buffer_value_set.insert(
+          points_to_analysis_->GetBufferDefinedAt(hlo, {}).ValueOrDie());
+    }
+
+    HeapSimulator::Options options;
+    options.must_alias_sets = {must_alias_buffer_value_set};
     result_ =
         HeapSimulator::Run(std::move(algorithm), *module_->entry_computation(),
                            HloInstructionSequence(instruction_sequence),
-                           *points_to_analysis_, zero_size)
+                           *points_to_analysis_, zero_size, options)
             .ConsumeValueOrDie();
   }
 
@@ -410,6 +420,46 @@
   });
 }
 
+TEST_F(HeapSimulatorTest, MustAliasBuffers) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32vec4_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto add_1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY));
+
+  auto add_2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY));
+
+  auto add_3 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, add_1, add_2));
+
+  // Check that mul and add_2 are collocated as requested by the user.
+  HeapSimulatorTracker tracker(
+      TestName(), builder.Build(),
+      {paramA, paramX, mul, paramY, add_1, add_2, add_3}, {mul, add_2});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      {kAlloc, tracker.BufferAt(add_1, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(mul, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFree, tracker.BufferAt(add_1, {})},
+      {kFinish, nullptr},
+  });
+  tracker.ExpectSharedBuffers(add_2, {}, mul, {});
+}
+
 TEST_F(HeapSimulatorTest, MultiplyAdd) {
   auto builder = HloComputation::Builder(TestName());
   auto paramA = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 54ee929..18c5f1e 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -49,6 +49,9 @@
   reserved "called_computation_names";
   reserved 44;
   reserved "replica_group_ids";
+  // Use backend_config instead for custom_call_opaque.
+  reserved 53;
+  reserved "custom_call_opaque";
 
   string name = 1;
   string opcode = 2;
@@ -131,9 +134,6 @@
   // kCustomCall.
   string custom_call_target = 28;
 
-  // Opaque string, only present for kCustomCall.
-  string custom_call_opaque = 53;
-
   // Shape of outfeed request.
   xla.ShapeProto outfeed_shape = 29;
 
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 7d02f4b..8e10d6a 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -59,7 +59,7 @@
   // construction process.
   using BufferNumber = int64;
 
-  explicit BufferValueMap(HloModule* module,
+  explicit BufferValueMap(const HloModule* module,
                           const HloDataflowAnalysis& dataflow)
       : module_(module), dataflow_(dataflow) {
     buffers_.reserve(dataflow_.values().size());
@@ -325,7 +325,7 @@
     return aliased_buffers;
   }
 
-  HloModule* module_;
+  const HloModule* module_ = nullptr;
 
   // Dataflow analysis used to construct the buffer map.
   const HloDataflowAnalysis& dataflow_;
@@ -341,7 +341,7 @@
   BufferNumber next_buffer_number_ = 0;
 };
 
-HloAliasAnalysis::HloAliasAnalysis(HloModule* module) : module_(module) {}
+HloAliasAnalysis::HloAliasAnalysis(const HloModule* module) : module_(module) {}
 
 const HloBuffer& HloAliasAnalysis::GetUniqueBufferAt(
     const HloInstruction* instruction, const ShapeIndex& index) const {
@@ -488,8 +488,9 @@
 
 /* static */
 StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
-    HloModule* module, const HloDataflowAnalysis::FusionCanShareBufferFunction&
-                           fusion_can_share_buffer) {
+    const HloModule* module,
+    const HloDataflowAnalysis::FusionCanShareBufferFunction&
+        fusion_can_share_buffer) {
   VLOG(2) << "HloAliasAnalysis::Run on module " << module->name();
   XLA_VLOG_LINES(2, module->ToString());
 
@@ -523,10 +524,75 @@
 
   TF_DCHECK_OK(alias_analysis->Verify());
 
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  ShapeUtil::ForEachSubshape(
+      root->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+        for (const HloBuffer* buffer :
+             alias_analysis->ComputeBuffersAt(root, index)) {
+          alias_analysis->live_out_buffers_.insert(buffer);
+        }
+      });
+
   XLA_VLOG_LINES(2, alias_analysis->ToString());
   return std::move(alias_analysis);
 }
 
+void HloAliasAnalysis::MergeBuffers(const HloBuffer& to,
+                                    const HloBuffer& from) {
+  CHECK(to.id() != from.id());  // A buffer cannot be merged into itself.
+  VLOG(2) << "Merge buffer: " << from.ToString() << " into :" << to.ToString();
+
+  CHECK(from.id() < buffers_.size());
+  CHECK(to.id() < buffers_.size());
+
+  // Concatenate the values of `to` and `from`, sort them by value id, and
+  // rebuild the `to` slot of buffers_ from the merged list.
+  std::vector<const HloValue*> merged_values(to.values().begin(),
+                                             to.values().end());
+
+  merged_values.insert(merged_values.end(), from.values().begin(),
+                       from.values().end());
+  absl::c_sort(merged_values, [](const HloValue* a, const HloValue* b) {
+    return a->id() < b->id();
+  });
+
+  buffers_[to.id()] = HloBuffer(to.id(), merged_values);
+  for (const HloValue* value : merged_values) {
+    // Point every merged value at the rebuilt `to` buffer.
+    value_to_buffer_[value] = &buffers_[to.id()];
+  }
+
+  if (live_out_buffers_.count(&from) > 0) {
+    // `from` was live out: drop it from the set and mark `to` live out.
+    live_out_buffers_.erase(&from);
+    live_out_buffers_.insert(&buffers_[to.id()]);
+  }
+
+  int64 from_id = from.id();
+  if (from_id != buffers_.size() - 1) {
+    // Now `from` is invalid, move the last element of buffers to replace `from`
+    // and update references to the last element.
+    const HloBuffer& last_elem = buffers_.back();
+    buffers_[from.id()] = HloBuffer(from_id, last_elem.values());
+
+    if (live_out_buffers_.count(&last_elem) > 0) {
+      // Update live out set to redirect the last element to its new position.
+      live_out_buffers_.erase(&last_elem);
+      live_out_buffers_.insert(&buffers_[from_id]);
+    }
+
+    // Re-point the relocated buffer's values at its new slot.
+    for (const HloValue* value : buffers_[from_id].values()) {
+      value_to_buffer_[value] = &buffers_[from_id];
+    }
+  }
+
+  // Drop the last slot: it is either `from` itself or the relocated tail.
+  buffers_.pop_back();
+
+  CHECK(Verify().ok());  // Re-check the analysis invariants after the rebuild.
+}
+
 bool HloAliasAnalysis::HasLiveRangeInterference(
     const HloOrdering& ordering) const {
   for (const HloBuffer& buffer : buffers()) {
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
index 372f99f..d09ec15 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -41,7 +41,7 @@
   // The callgraph of the given HloModule must be flattened
   // (xla::FlattenCallGraph) prior to running the analysis.
   static StatusOr<std::unique_ptr<HloAliasAnalysis>> Run(
-      HloModule* module,
+      const HloModule* module,
       const HloDataflowAnalysis::FusionCanShareBufferFunction&
           fusion_can_share_buffer);
 
@@ -82,9 +82,7 @@
   const std::vector<HloBuffer>& buffers() const { return buffers_; }
 
   // Returns the underlying dataflow analysis used by this alias analysis.
-  const HloDataflowAnalysis& dataflow_analysis() const {
-    return *dataflow_analysis_;
-  }
+  HloDataflowAnalysis& dataflow_analysis() const { return *dataflow_analysis_; }
 
   // Returns true if any index in the output of the given instruction has more
   // than one buffer. That is, ComputeBuffersAt returns a vector with more than
@@ -95,17 +93,44 @@
   // output of the given instruction.
   bool InstructionBuffersAreDistinct(const HloInstruction* instruction) const;
 
+  // Merge buffer `from` into buffer `to`. Caller has to make sure no
+  // interference will be introduced after merging. This rebuilds internal data
+  // structure, and invalidates references to all existing buffers.
+  void MergeBuffers(const HloBuffer& to, const HloBuffer& from);
+
   // Returns true if any HLO values in the module have interfering live ranges
   // assuming the given ordering.
   bool HasLiveRangeInterference(const HloOrdering& ordering) const;
 
+  // Reports whether `buffer` is in the set of buffers live out of the module.
+  bool BufferLivesOut(const HloBuffer& buffer) const {
+    return live_out_buffers_.count(&buffer) > 0;
+  }
+
+  // Reports whether the buffer containing `value` lives out of the module.
+  bool ValueLivesOut(const HloValue& value) const {
+    return live_out_buffers_.count(&GetBufferContainingValue(value)) > 0;
+  }
+
+  std::vector<const HloBuffer*> LiveOutBuffers() const {
+    std::vector<const HloBuffer*> sorted(live_out_buffers_.begin(),
+                                         live_out_buffers_.end());
+    absl::c_sort(sorted, [](const HloBuffer* lhs, const HloBuffer* rhs) {
+      return lhs->id() < rhs->id();  // Ascending buffer id.
+    });
+    return sorted;
+  }
+
  protected:
-  explicit HloAliasAnalysis(HloModule* module);
+  explicit HloAliasAnalysis(const HloModule* module);
 
   // Verify various invariants of the alias analysis.
   Status Verify() const;
 
-  HloModule* module_;
+  const HloModule* module_;
+
+  // A set of buffers that live out the module.
+  absl::flat_hash_set<const HloBuffer*> live_out_buffers_;
 
   // The underlying dataflow analysis used by this alias analysis.
   std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index e344fbc..89eda85 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -1022,6 +1022,54 @@
             analysis.GetUniqueBufferAt(bitcast));
 }
 
+TEST_F(HloAliasAnalysisTest, MergeBuffers) {
+  // Merging every buffer of a negate chain must leave one live-out buffer.
+  Shape elem_shape = ShapeUtil::MakeShape(F32, {8});
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, elem_shape, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(elem_shape, HloOpcode::kNegate, param0));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(elem_shape, HloOpcode::kNegate, negate));
+
+  module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
+  HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_EQ(analysis.buffers().size(), 3);
+  analysis.MergeBuffers(analysis.buffers()[0], analysis.buffers()[1]);
+  EXPECT_EQ(analysis.buffers().size(), 2);
+  analysis.MergeBuffers(analysis.buffers()[0], analysis.buffers()[1]);
+  EXPECT_EQ(analysis.buffers().size(), 1);
+  EXPECT_TRUE(analysis.BufferLivesOut(analysis.buffers()[0]));
+}
+
+TEST_F(HloAliasAnalysisTest, MergeBuffersReverse) {
+  // Same as MergeBuffers, but merges into the higher-indexed buffer each time.
+  Shape elem_shape = ShapeUtil::MakeShape(F32, {8});
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, elem_shape, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(elem_shape, HloOpcode::kNegate, param0));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(elem_shape, HloOpcode::kNegate, negate));
+
+  module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
+  HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_EQ(analysis.buffers().size(), 3);
+  analysis.MergeBuffers(analysis.buffers()[2], analysis.buffers()[1]);
+  EXPECT_EQ(analysis.buffers().size(), 2);
+  analysis.MergeBuffers(analysis.buffers()[1], analysis.buffers()[0]);
+  EXPECT_EQ(analysis.buffers().size(), 1);
+  EXPECT_TRUE(analysis.BufferLivesOut(analysis.buffers()[0]));
+}
+
 TEST_F(HloAliasAnalysisTest, BitcastInterference) {
   // A bitcast value simultaneously live with its operand should not cause
   // interference.
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h
index a88c87e..a81078f 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.h
+++ b/tensorflow/compiler/xla/service/hlo_buffer.h
@@ -109,11 +109,11 @@
 
  private:
   // Unique identifier for this HloBuffer.
-  const Id id_;
+  Id id_;
 
   // The set of values contained in this buffer. Vector contains no duplicates
   // and is sorted stably by HloValue::Id.
-  const std::vector<const HloValue*> values_;
+  std::vector<const HloValue*> values_;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer);
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index e42808b..89dbe93 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -198,6 +198,13 @@
       const HloComputationProto& proto,
       const absl::flat_hash_map<int64, HloComputation*>& computation_map);
 
+  using InstructionSequence = tensorflow::gtl::iterator_range<
+      UnwrappingIterator<std::list<std::unique_ptr<HloInstruction>>::iterator>>;
+
+  using ConstInstructionSequence =
+      tensorflow::gtl::iterator_range<UnwrappingIterator<
+          std::list<std::unique_ptr<HloInstruction>>::const_iterator>>;
+
   // Gets the instructions in this computation.
   //
   // The returned type is a range of HloInstruction*s, so you can iterate over
@@ -205,15 +212,11 @@
   //
   //   for (HloInstruction* instr : computation->instructions()) { ... }
   //
-  tensorflow::gtl::iterator_range<UnwrappingIterator<
-      std::list<std::unique_ptr<HloInstruction>>::const_iterator>>
-  instructions() const {
+  ConstInstructionSequence instructions() const {
     return {MakeUnwrappingIterator(instructions_.begin()),
             MakeUnwrappingIterator(instructions_.end())};
   }
-  tensorflow::gtl::iterator_range<
-      UnwrappingIterator<std::list<std::unique_ptr<HloInstruction>>::iterator>>
-  instructions() {
+  InstructionSequence instructions() {
     return {MakeUnwrappingIterator(instructions_.begin()),
             MakeUnwrappingIterator(instructions_.end())};
   }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index e7ed858..e0f18c4 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -130,7 +130,7 @@
         int64 elements_in_constant =
             ShapeUtil::ElementsIn(instruction->shape());
 
-        static const int64 kMaximumConstantSizeElements = 2 * 1000 * 1000;
+        static const int64 kMaximumConstantSizeElements = 45 * 1000 * 1000;
         if (elements_in_constant > elements_in_removed_operands &&
             elements_in_constant > kMaximumConstantSizeElements) {
           continue;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 372f015..8c1b22e 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -17,6 +17,10 @@
 
 #include <cmath>
 
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -129,6 +133,42 @@
   return shape_size_(shape);
 }
 
+int64 HloCostAnalysis::FusionParameterReadBytes(
+    const HloInstruction* hlo) const {
+  int64 size = 0;
+  bool seen_trivial_user = false;  // Set once a shared full read is counted.
+  CHECK(hlo->IsFused() && hlo->opcode() == HloOpcode::kParameter);
+  for (const HloInstruction* user : hlo->users()) {
+    switch (user->opcode()) {
+      case HloOpcode::kFusion: {  // Recurse into the matching nested params.
+        for (int64 idx : user->OperandIndices(hlo)) {
+          size += FusionParameterReadBytes(user->fused_parameter(idx));
+        }
+        break;
+      }
+      case HloOpcode::kSlice:  // Charge only the slice's output size.
+        size += GetShapeSize(user->shape());
+        break;
+      case HloOpcode::kDynamicSlice:  // Operand 0: output size; else full hlo.
+        size += hlo == user->operand(0) ? GetShapeSize(user->shape())
+                                        : GetShapeSize(hlo->shape());
+        break;
+      case HloOpcode::kBroadcast:
+      case HloOpcode::kReshape:  // Each such user charges the full parameter.
+        size += GetShapeSize(hlo->shape());
+        break;
+      default:
+        // Other instructions reading this parameter are assumed to be able to
+        // share the read from memory.
+        if (!seen_trivial_user) {
+          seen_trivial_user = true;
+          size += GetShapeSize(hlo->shape());
+        }
+    }
+  }
+  return size;
+}
+
 Status HloCostAnalysis::HandleElementwiseUnary(const HloInstruction* hlo) {
   return HandleElementwiseOp(hlo);
 }
@@ -598,6 +638,10 @@
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandlePartitionId(const HloInstruction* /*hlo*/) {
+  return Status::OK();  // This handler itself records nothing.
+}
+
 Status HloCostAnalysis::HandleReplicaId(const HloInstruction* /*hlo*/) {
   return Status::OK();
 }
@@ -612,6 +656,17 @@
 }
 
 Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
+  if (fusion->IsCustomFusion()) {
+    for (const HloInstruction* hlo :
+         fusion->fused_instructions_computation()->instructions()) {
+      if (hlo->opcode() == HloOpcode::kGather) {
+        return HandleGather(hlo);
+      }
+      if (hlo->opcode() == HloOpcode::kScatter) {
+        return HandleScatter(hlo);
+      }
+    }
+  }
   TF_ASSIGN_OR_RETURN(
       current_properties_,
       ProcessNestedSubcomputation(fusion->fused_instructions_computation()));
@@ -622,12 +677,34 @@
   current_properties_[kBytesAccessedKey] = 0;
   ShapeUtil::ForEachSubshape(
       fusion->shape(),
-      [this](const Shape& subshape, const ShapeIndex& /*shape_index*/) {
+      [this, fusion](const Shape& subshape, const ShapeIndex& shape_index) {
+        if (!subshape.IsArray()) {
+          return;
+        }
+        if (shape_index.empty()) {
+          if (fusion->fused_expression_root()->opcode() ==
+              HloOpcode::kDynamicUpdateSlice) {
+            current_properties_[kBytesAccessedKey] += GetShapeSize(
+                fusion->fused_expression_root()->operand(0)->shape());
+            return;
+          }
+        } else if (shape_index.size() == 1) {
+          if (fusion->fused_expression_root()
+                  ->operand(shape_index[0])
+                  ->opcode() == HloOpcode::kDynamicUpdateSlice) {
+            current_properties_[kBytesAccessedKey] +=
+                GetShapeSize(fusion->fused_expression_root()
+                                 ->operand(shape_index[0])
+                                 ->operand(0)
+                                 ->shape());
+            return;
+          }
+        }
         current_properties_[kBytesAccessedKey] += GetShapeSize(subshape);
       });
 
-  for (const HloInstruction* operand : fusion->operands()) {
-    current_properties_[kBytesAccessedKey] += GetShapeSize(operand->shape());
+  for (const HloInstruction* operand : fusion->fused_parameters()) {
+    current_properties_[kBytesAccessedKey] += FusionParameterReadBytes(operand);
   }
 
   return Status::OK();
@@ -779,6 +856,7 @@
 StatusOr<HloCostAnalysis::Properties>
 HloCostAnalysis::ProcessNestedSubcomputation(HloComputation* computation) {
   HloCostAnalysis visitor(shape_size_, per_second_rates_);
+  visitor.ReserveVisitStates(computation->instruction_count());
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
   return visitor.properties();
 }
@@ -786,6 +864,7 @@
 StatusOr<HloCostAnalysis::Properties>
 HloCostAnalysis::ProcessUnnestedSubcomputation(HloComputation* computation) {
   HloCostAnalysis visitor(shape_size_, per_second_rates_);
+  visitor.ReserveVisitStates(computation->instruction_count());
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
   hlo_properties_.insert(visitor.hlo_properties_.begin(),
                          visitor.hlo_properties_.end());
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 4480554..b764655 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -77,6 +77,7 @@
   Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleCollectivePermute(const HloInstruction* hlo) override;
   Status HandleReplicaId(const HloInstruction* hlo) override;
+  Status HandlePartitionId(const HloInstruction* hlo) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
   Status HandleRng(const HloInstruction* random) override;
@@ -196,6 +197,10 @@
   // a layout.
   int64 GetShapeSize(const Shape& shape) const;
 
+  // Traverses a fusion operand to find the actual bytes accessed by the fusion
+  // node.
+  int64 FusionParameterReadBytes(const HloInstruction* hlo) const;
+
   // Function which computes the size of the top-level of a given shape (not
   // including nested elements, if any). If null then bytes_accessed methods
   // return an error.
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 849cac2..1e7e125 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -143,7 +143,9 @@
     for (auto instruction : computation->MakeInstructionPostOrder()) {
       // If the instruction has zero operands (constants, parameters, etc.) skip
       // over it.
-      if (instruction->operand_count() == 0) {
+      if (instruction->operand_count() == 0 &&
+          instruction->opcode() != HloOpcode::kPartitionId &&
+          instruction->opcode() != HloOpcode::kReplicaId) {
         continue;
       }
       // Skip instructions which have side effects.
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
index 3746fbb..5b388bc 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -47,7 +47,11 @@
         HloInstruction* domain = (*creator)(instruction, root, operand);
         if (domain != nullptr) {
           VLOG(4) << "New domain: " << domain->ToString();
-          TF_RETURN_IF_ERROR(operand->ReplaceUseWith(instruction, domain));
+          // Call ReplaceUseWithDifferentShape even though the shapes are
+          // expected to match to avoid an expensive shape check between the
+          // original and the new instruction.
+          TF_RETURN_IF_ERROR(
+              operand->ReplaceUseWithDifferentShape(instruction, domain));
           ++added_domains;
         }
       }
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 0eb4610..0320979 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -27,6 +27,7 @@
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -215,10 +216,10 @@
         return Unimplemented(
             "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE.");
       });
-  typed_visitors_[OPAQUE] =
+  typed_visitors_[OPAQUE_TYPE] =
       absl::make_unique<FunctionVisitor>([](HloInstruction*) {
         return Unimplemented(
-            "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE.");
+            "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE_TYPE.");
       });
   typed_visitors_[TOKEN] =
       absl::make_unique<FunctionVisitor>([](HloInstruction*) {
@@ -497,7 +498,7 @@
   switch (elem_ty) {
     case PRED:
     case TUPLE:
-    case OPAQUE:
+    case OPAQUE_TYPE:
     case TOKEN:
     case S8:
     case S16:
@@ -779,6 +780,545 @@
   return Status::OK();
 }
 
+namespace {
+
+// Straightforward implementation of 1D DFT transform. Uses passed-in start
+// index and stride to gather inputs from the data vector into the preallocated
+// buffer, computes the result, and writes it back to the same locations in the
+// data vector. Runs in O(length^2) time.
+//
+// Parameters contract_output and expand_input are used to avoid unnecessary
+// calculations. When contract_output is set to true, then only (length / 2) + 1
+// output values are computed. When expand_input is set to true, then
+// (length / 2) + 1 values from the data set are used to re-create the full set
+// of size 'length', on which the transform is then performed.
+//
+void NaiveDft1D(int64 length, int64 start, int64 stride, bool inverse,
+                bool contract_output, bool expand_input,
+                absl::Span<complex128> data, absl::Span<complex128> buffer) {
+  CHECK_GT(data.size(), start + (length - 1) * stride);
+  CHECK_GT(buffer.size(), length - 1);
+
+  // Copy input data to 1D vector.
+  bool input_is_zero = true;
+  const int64 ub = expand_input ? length / 2 + 1 : length;
+  for (int64 k = 0; k < ub; k++) {
+    complex128 value = data[start + k * stride];
+    input_is_zero &= value == complex128(0.0, 0.0);
+    buffer[k] = value;
+    if (expand_input) {
+      // Use conjugates of the values at indices [1 ... (ub - 2)] when the
+      // length is even and at indices [1 ... (ub - 1)] when the length is odd
+      // to calculate missing values at indices [(length - 1) ... ub].
+      if (k > 0 && k < (length - ub + 1)) {
+        buffer[length - k] = std::conj(value);
+      }
+    }
+  }
+
+  // Do 1D transformation with double precision. All-zero input is skipped.
+  if (!input_is_zero) {
+    const int64 out_ub = contract_output ? length / 2 + 1 : length;
+    for (int64 k = 0; k < out_ub; k++) {
+      complex128 value = complex128(0.0, 0.0);
+      for (int64 n = 0; n < length; n++) {
+        auto coeff = std::exp(complex128(0.0, -2.0 * M_PI * n * k / length));
+        value += (inverse ? std::conj(buffer[n]) : buffer[n]) * coeff;
+      }
+      data[start + k * stride] =
+          inverse ? std::conj(value) / complex128(length, 0.0) : value;
+    }
+  }
+}
+
+// Returns a copy of the literal's dimension lengths in reverse order.
+std::vector<int64> GetDimensionLengths(const Literal& literal) {
+  std::vector<int64> lengths = literal.shape().dimensions();
+  absl::c_reverse(lengths);
+  return lengths;
+}
+
+// Helper to compute strides for creating linear indices into multidimensional
+// data from the dimension lengths and the layout. Returns a new vector of size
+// lengths.size() + 1. The last element of the returned vector at index
+// [lengths.size()] contains the product of all dimension lengths.
+std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths,
+                                  const Layout& layout) {
+  const int64 num_dimensions = lengths.size();
+
+  // Make sure that the layout length matches the number of dimensions.
+  CHECK_EQ(num_dimensions, layout.minor_to_major_size());
+
+  // Calculate strides using layout-specified ordering of the dimensions and
+  // place the stride for axis 0 at index 0, for axis 1 at index 1, etc.
+  std::vector<int64> strides(num_dimensions + 1);
+  int64 stride = 1;
+  for (int64 i = 0; i < num_dimensions; i++) {
+    // Mirror the layout's dimension number so it indexes lengths[]/strides[].
+    const int64 index = (num_dimensions - 1) - layout.minor_to_major(i);
+    strides[index] = stride;
+    stride *= lengths[index];
+  }
+  strides[num_dimensions] = stride;  // Product of all lengths.
+
+  return strides;
+}
+
+// Compute strides as above using the default layout for lengths.size() dims.
+std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths) {
+  return ComputeStrides(lengths,
+                        LayoutUtil::GetDefaultLayoutForRank(lengths.size()));
+}
+
+// Compute strides as above using the literal's layout, else the default one.
+std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths,
+                                  const Literal& literal) {
+  return literal.shape().has_layout()
+             ? ComputeStrides(lengths, literal.shape().layout())
+             : ComputeStrides(lengths);
+}
+
+// Make 1D sweeps along each transform axis.
+void Sweep(int64 fft_rank, FftType fft_type,
+           const absl::Span<const int64> fft_lengths,
+           const absl::Span<const int64> fft_strides,
+           absl::Span<complex128> data, absl::Span<complex128> buffer) {
+  const bool inverse = fft_type == FftType::IFFT || fft_type == FftType::IRFFT;
+  const bool input_is_truncated = fft_type == FftType::IRFFT;
+  const bool output_is_truncated = fft_type == FftType::RFFT;
+
+  // Recursively visit each column of the data along the sweep_axis. Calculate
+  // linearized index of that column's first element and the stride, then invoke
+  // 1D transform.
+  // For RFFT, avoid calculating unused output values: first, compute only
+  // (length_x / 2) + 1 values along the X axis, then limit the X coordinate to
+  // [0 ... (length / 2)] during the sweeps along other axes. Similarly, for
+  // IRFFT sweep along higher dimensions first, while keeping the X coordinate
+  // in the [0 ... (length / 2)] range, then re-create negative frequencies
+  // omitted in the input and perform the full-length transform along the X axis
+  // in the last sweep.
+  std::function<void(int64, int64, int64)> sweep = [&](int64 sweep_axis,
+                                                       int64 axis,
+                                                       int64 start) {
+    if (axis < 0) {
+      // Base case: invoke 1D transform.
+      const int64 length = fft_lengths[sweep_axis];
+      const int64 stride = fft_strides[sweep_axis];
+      const bool expand_input = input_is_truncated && sweep_axis == 0;
+      const bool contract_output = output_is_truncated && sweep_axis == 0;
+      NaiveDft1D(length, start, stride, inverse, contract_output, expand_input,
+                 data, buffer);
+    } else if (axis == sweep_axis) {
+      // Visit only the elements with coordinate 0 along the sweep axis.
+      sweep(sweep_axis, axis - 1, start);
+    } else {
+      const int64 length = fft_lengths[axis];
+      const bool is_truncated = input_is_truncated || output_is_truncated;
+      const int64 ub = is_truncated && axis == 0 ? (length / 2) + 1 : length;
+      for (int64 i = 0; i < ub; i++) {
+        sweep(sweep_axis, axis - 1, start + i * fft_strides[axis]);
+      }
+    }
+  };
+  if (input_is_truncated) {
+    // Sweep along the X axis last for IRFFT.
+    for (int64 sweep_axis = fft_rank - 1; sweep_axis >= 0; sweep_axis--) {
+      sweep(sweep_axis, fft_rank - 1, 0);
+    }
+  } else {
+    // Sweep along the X axis first for RFFT. The order does not matter for FFT
+    // and IFFT types; handle them here as well.
+    for (int64 sweep_axis = 0; sweep_axis < fft_rank; sweep_axis++) {
+      sweep(sweep_axis, fft_rank - 1, 0);
+    }
+  }
+}
+
+// These templates convert the data from the input data type to the type used in
+// calculations and then to the output data type. They are intended to be used
+// only within the DFT implementation. One special case is IRFFT, where the
+// specialization drops imaginary parts of complex values (which are expected to
+// be 0) and returns real numbers.
+template <typename ToType, typename FromType>
+ToType GetAs(FromType value) {
+  return static_cast<ToType>(value);
+}
+
+template <>
+float GetAs<float, complex128>(complex128 value) {
+  return static_cast<float>(value.real());
+}
+
+// This template generates two linearized indices, which can be used to access
+// multidimensional arrays. It uses a recursive function, which passes the
+// indices to the user-supplied callback function. The destination index is
+// always within dst_lengths[] bounds. The boolean parameter within_src_bounds
+// indicates whether the source index is within src_lengths[] bounds.
+//
+// The value returned from the callback function controls the recursion depth.
+// Returning true indicates that the base case had been hit and the recursion
+// stops. Otherwise, the recursion proceeds along the next less-major axis.
+//
+// For example, the base case when the axis value becomes negative invokes the
+// callback function for each possible index within dst_lengths[] bounds. The
+// base case when the axis value is equal to zero limits the indices to point
+// only to first elements along the minor-most dimension, allowing the callback
+// function to handle all values along the X axis.
+//
+template <typename BaseFn>
+void GenerateIndices(const absl::Span<const int64> dst_lengths,
+                     const absl::Span<const int64> dst_strides,
+                     const absl::Span<const int64> src_lengths,
+                     const absl::Span<const int64> src_strides, int64 fft_rank,
+                     int64 dst_start, int64 src_start, BaseFn&& base) {
+  CHECK_EQ(dst_lengths.size() + 1, dst_strides.size());
+  CHECK_GE(dst_lengths.size(), fft_rank);
+  CHECK_EQ(src_lengths.size() + 1, src_strides.size());
+  CHECK_GE(src_lengths.size(), fft_rank);
+
+  std::function<void(int64, int64, int64, bool)> generate =
+      [&](int64 axis, int64 dst_index, int64 src_index,
+          bool within_src_bounds) {
+        if (!base(axis, dst_index, src_index, within_src_bounds)) {
+          for (int64 i = 0; i < dst_lengths[axis]; i++) {
+            // Because the loop goes over dst_lengths[], the source index may be
+            // out of src_lengths[] bounds; within_src_bounds then turns false
+            // and stays false for the remaining iterations along this axis.
+            within_src_bounds &= i < src_lengths[axis];
+            generate(axis - 1, dst_index, src_index, within_src_bounds);
+            dst_index += dst_strides[axis];
+            src_index += src_strides[axis];
+          }
+        }
+      };
+  generate(fft_rank - 1, dst_start, src_start, true);
+}
+
+// Copies the input data from a literal to a pre-allocated vector. The sizes of
+// the input and the transform do not need to match. For each axis of the
+// transform, any extra input values beyond the transform length are ignored.
+// Conversely, if the input does not contain enough elements along any axis, the
+// data is padded with zeroes.
+//
+// For IRFFT transforms, we use (length_x / 2) + 1 elements from the input,
+// where length_x is the size of the full transform along the X axis.
+//
+// The input literal may have a rank higher than the rank of the transform.
+// Passed-in input_start value points to the first element of the input literal
+// to be copied.
+//
+// Returns true if all values in the work data set are zeroes.
+//
+template <typename InputType>
+bool CopyDataFromInput(const Literal& input_literal, int64 input_start,
+                       int64 fft_rank, FftType fft_type, int64 fft_size,
+                       const absl::Span<const int64> fft_lengths,
+                       const absl::Span<const int64> fft_strides,
+                       const absl::Span<const int64> input_lengths,
+                       const absl::Span<const int64> input_strides,
+                       absl::Span<complex128> data) {
+  CHECK_GE(data.size(), fft_size);
+
+  const bool input_is_truncated = fft_type == FftType::IRFFT;
+
+  // Recursively visit each transform dimension to copy input values to the
+  // working data set. The base case handles inputs along the X axis.
+  bool input_is_zero = true;
+  const InputType* input_data = input_literal.data<InputType>().data();
+  auto base_case = [&](int64 axis, int64 dst_index, int64 src_index,
+                       bool within_src_bounds) {
+    if (axis == 0) {
+      // For IRFFT, the negative frequencies are only needed for the sweep along
+      // the X axis, which is performed last. Leave this part of the working set
+      // uninitialized until then.
+      const int64 length = fft_lengths[axis];
+      const int64 ub = input_is_truncated ? (length / 2) + 1 : length;
+      for (int64 i = 0; i < ub; i++) {
+        complex128 value = InputType(0);
+        // Read input value only if the index is within bounds.
+        if (within_src_bounds && i < input_lengths[axis]) {
+          value = GetAs<complex128, InputType>(
+              input_data[src_index + i * input_strides[axis]]);
+          input_is_zero &= value == complex128(0.0, 0.0);
+        }
+        data[dst_index + i * fft_strides[axis]] = value;
+      }
+      return true;
+    }
+    return false;
+  };
+  GenerateIndices(fft_lengths, fft_strides, input_lengths, input_strides,
+                  fft_rank, 0, input_start, base_case);
+  return input_is_zero;
+}
+
+// Copies the result of the transform to the literal output. The sizes of the
+// transform and output must match.
+//
+// For RFFT transforms, we copy (length_x / 2) + 1 elements, where length_x is
+// the size of the full transform along the X axis (the most minor dimension).
+//
+// The output literal may have a rank higher than the rank of the transform.
+// Passed-in output_start value points to the first element of the output
+// literal to be filled in.
+//
+template <typename OutputType>
+void CopyDataToOutput(const absl::Span<complex128> data, int64 output_start,
+                      int64 fft_rank, FftType fft_type,
+                      const absl::Span<const int64> fft_lengths,
+                      const absl::Span<const int64> fft_strides,
+                      const absl::Span<const int64> output_lengths,
+                      const absl::Span<const int64> output_strides,
+                      Literal* output_literal) {
+  const bool output_is_truncated = fft_type == FftType::RFFT;
+
+  // Base case for recursive copy of the results to the output. The code avoids
+  // making a recursive call for each output element by handling axis 0 in the
+  // loop (as opposed to making "axis < 0" to be the base case).
+  OutputType* output_data = output_literal->data<OutputType>().data();
+  auto base_case = [&](int64 axis, int64 dst_index, int64 src_index,
+                       bool within_src_bounds) {
+    if (axis == 0) {
+      // Drop negative frequencies for RFFT.
+      const int64 length = fft_lengths[axis];
+      const int64 ub = output_is_truncated ? (length / 2) + 1 : length;
+      for (int64 i = 0; i < output_lengths[axis]; i++) {
+        OutputType value = OutputType(0);
+        // Read data only if the index is within bounds.
+        if (within_src_bounds && i < ub) {
+          value = GetAs<OutputType, complex128>(
+              data[src_index + i * fft_strides[axis]]);
+        }
+        output_data[dst_index + i * output_strides[axis]] = value;
+      }
+      return true;
+    }
+    return false;
+  };
+  GenerateIndices(output_lengths, output_strides, fft_lengths, fft_strides,
+                  fft_rank, output_start, 0, base_case);
+}
+
+// Selects the element type for the CopyDataFromInput<> template above: RFFT
+// reads float input, every other transform reads complex64 input.
+bool CopyDataFromInput(const Literal& input_literal, int64 input_start,
+                       int64 fft_rank, FftType fft_type, int64 fft_size,
+                       const absl::Span<const int64> fft_lengths,
+                       const absl::Span<const int64> fft_strides,
+                       const absl::Span<const int64> input_lengths,
+                       const absl::Span<const int64> input_strides,
+                       absl::Span<complex128> data) {
+  if (fft_type != FftType::RFFT) {
+    return CopyDataFromInput<complex64>(
+        input_literal, input_start, fft_rank, fft_type, fft_size, fft_lengths,
+        fft_strides, input_lengths, input_strides, data);
+  }
+  // Only RFFT takes real-valued input.
+  return CopyDataFromInput<float>(
+      input_literal, input_start, fft_rank, fft_type, fft_size, fft_lengths,
+      fft_strides, input_lengths, input_strides, data);
+}
+
+// Selects the element type for the CopyDataToOutput<> template above: IRFFT
+// produces float output, every other transform produces complex64 output.
+void CopyDataToOutput(const absl::Span<complex128> data, int64 output_start,
+                      int64 fft_rank, FftType fft_type,
+                      const absl::Span<const int64> fft_lengths,
+                      const absl::Span<const int64> fft_strides,
+                      const absl::Span<const int64> output_lengths,
+                      const absl::Span<const int64> output_strides,
+                      Literal* output_literal) {
+  if (fft_type != FftType::IRFFT) {
+    CopyDataToOutput<complex64>(data, output_start, fft_rank, fft_type,
+                                fft_lengths, fft_strides, output_lengths,
+                                output_strides, output_literal);
+    return;
+  }
+  CopyDataToOutput<float>(data, output_start, fft_rank, fft_type, fft_lengths,
+                          fft_strides, output_lengths, output_strides,
+                          output_literal);
+}
+
+Status CheckParameters(const Shape& input_shape, const Shape& output_shape,
+                       int64 fft_rank, FftType fft_type,
+                       const absl::Span<const int64> fft_lengths) {
+  // Check FFT parameters. The first violated requirement is returned.
+  if (fft_rank <= 0) {
+    return InvalidArgument("Zero or negative FFT rank.");
+  }
+  if (*absl::c_min_element(fft_lengths) < 0) {
+    return InvalidArgument("Negative FFT length.");
+  }
+
+  // Check input-related values. A malformed shape is returned as an error.
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(input_shape));
+  if (!input_shape.IsArray()) {
+    return Unimplemented("Only array input shapes are supported.");
+  }
+  auto input_elt_type = input_shape.element_type();
+  if (fft_type == FftType::RFFT && input_elt_type != PrimitiveType::F32) {
+    return InvalidArgument("Invalid input type: %d, must be %d (float).",
+                           input_elt_type, PrimitiveType::F32);
+  }
+  if (fft_type != FftType::RFFT && input_elt_type != PrimitiveType::C64) {
+    return InvalidArgument("Invalid input type: %d, must be %d (complex64).",
+                           input_elt_type, PrimitiveType::C64);
+  }
+  const int64 input_rank = input_shape.rank();
+  if (input_rank < fft_rank) {
+    return InvalidArgument("Input shape rank is smaller than FFT rank.");
+  }
+
+  // Check output-related values. A malformed shape is returned as an error.
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(output_shape));
+  if (!output_shape.IsArray()) {
+    return Unimplemented("Only array output shapes are supported.");
+  }
+  auto output_elt_type = output_shape.element_type();
+  if (fft_type == FftType::IRFFT && output_elt_type != PrimitiveType::F32) {
+    return InvalidArgument("Invalid output type: %d, must be %d (float).",
+                           output_elt_type, PrimitiveType::F32);
+  }
+  if (fft_type != FftType::IRFFT && output_elt_type != PrimitiveType::C64) {
+    return InvalidArgument("Invalid output type: %d, must be %d (complex64).",
+                           output_elt_type, PrimitiveType::C64);
+  }
+  const int64 output_rank = output_shape.rank();
+  if (output_rank < fft_rank) {
+    return InvalidArgument("Output shape rank is smaller than FFT rank.");
+  }
+
+  // Consistency of input and output parameters.
+  if (input_rank != output_rank) {
+    return InvalidArgument(
+        "Ranks of input shape and output shape do not match.");
+  }
+  for (int64 dim = 0; dim < input_rank - fft_rank; dim++) {
+    if (ShapeUtil::GetDimension(input_shape, dim) !=
+        ShapeUtil::GetDimension(output_shape, dim)) {
+      return InvalidArgument(
+          "Higher dimension lengths of input shape and output shape do not "
+          "match.");
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+// Flexible but slow implementation of the discrete Fourier transform. All
+// transform types (FFT, IFFT, RFFT, and IRFFT) are supported, as well as the
+// arbitrary rank and length of each dimension of the transform, and arbitrary
+// layouts of the input and output literals.
+//
+// The input literal in operand 0 provides input data, which must be complex64
+// for FFT, IFFT, IRFFT transforms and float for RFFT. The transform is computed
+// over the innermost dimensions of the input, thus the rank of the input data
+// must be same as fft_rank or larger. The input is expected to provide Ni
+// values along each transform axis with one exception: for IRFFT, only
+// (N0 / 2) + 1 values are needed along the X axis (the innermost index). To
+// increase flexibility, this implementation can handle mismatches between the
+// input size and transform lengths by either dropping extra input values or
+// using zeroes in place of missing input values as necessary. If the input data
+// has rank higher than the transform, the transform is applied for each valid
+// combination of the higher-ranking indices.
+//
+// The output contains complex64 values for FFT, IFFT, RFFT, and float values
+// for IRFFT. The rank of the output as well as the sizes of the dimensions
+// above the rank of the transform must match those of the input. Sizes of the
+// output's "fft_rank" innermost dimensions are expected to match the length of
+// the transform along respective axes with one exception: for RFFT, the output
+// is trimmed along the X axis to have only (N0 / 2) + 1 values. In case the
+// length(s) mismatch, the FFT output is trimmed to fit into the provided output
+// shape, or the output is padded with zero values appropriately.
+//
+// For example, 2D FFT transform of size 16x16 applied to complex64[2][15][17]
+// input array will perform two transforms over the [][15][17] data in the sub
+// arrays [0][][] and [1][][], dropping the values along axis X and padding axis
+// Y with zeroes to create 16x16 working sets, and generating
+// complex64[2][16][16] output. 3D IRFFT transform of size 64x16x16 applied to
+// complex64[64][16][9] input array will use all input values and will produce
+// float[64][16][16] output.
+//
+// The implementation of the 1D transform is a straightforward loop nest. The
+// transforms of higher ranks apply sets of 1D transforms along each axis. For
+// example, the 2D transform is computed by applying 1D transforms to each
+// column followed by applying 1D transforms to each row.
+//
+// In general, a transform of rank n runs in O(N0*N1*...*Nn*(N0+N1+...+Nn))
+// time, where Ni is the length of the transform's i-th dimension. It is
+// possible to reduce the run time to O(N0*N1*...(log(N0)+log(N1)+...)) by
+// plugging in a more efficient 1D implementation.
+//
+Status HloEvaluator::HandleFft(HloInstruction* fft) {
+  const FftType fft_type = fft->fft_type();
+  std::vector<int64> fft_lengths = fft->fft_length();
+  const int64 fft_rank = fft_lengths.size();
+  const Literal& input_literal = GetEvaluatedLiteralFor(fft->operand(0));
+  const Shape& input_shape = input_literal.shape();
+  const Shape& output_shape = fft->shape();
+  Literal output_literal = Literal::CreateFromShape(output_shape);
+
+  // Make fft_lengths[0] the minor-most dimension.
+  absl::c_reverse(fft_lengths);
+
+  TF_RETURN_IF_ERROR(CheckParameters(input_shape, output_shape, fft_rank,
+                                     fft_type, fft_lengths));
+
+  const auto fft_strides = ComputeStrides(fft_lengths);
+  // Working set size.
+  const int64 fft_size = fft_strides[fft_rank];
+
+  if (fft_size > 0) {
+    // Linearized working data set.
+    std::vector<complex128> data(fft_size);
+
+    // Temporary buffer allocated once and used in 1D sweeps.
+    std::vector<complex128> buffer(*absl::c_max_element(fft_lengths));
+
+    // Sizes of each axis of input and output literals.
+    const auto input_lengths = GetDimensionLengths(input_literal);
+    const auto output_lengths = GetDimensionLengths(output_literal);
+
+    // Strides for generating linearized indices into multidimensional arrays.
+    const auto input_strides = ComputeStrides(input_lengths, input_literal);
+    const auto output_strides = ComputeStrides(output_lengths, output_literal);
+
+    // Visit all elements in the dimensions with ranks above the FFT rank. For
+    // each such element invoke the transform. Use separate indices for the
+    // input and the output to allow different layouts.
+    auto base_case = [&](int64 axis, int64 output_index, int64 input_index,
+                         bool within_src_bounds) {
+      if (axis == fft_rank - 1) {
+        // Base case: read the input, apply the transform, write the output.
+        CHECK(within_src_bounds);
+        bool input_is_zero =
+            CopyDataFromInput(input_literal, input_index, fft_rank, fft_type,
+                              fft_size, fft_lengths, fft_strides, input_lengths,
+                              input_strides, absl::MakeSpan(data));
+        if (input_is_zero) {
+          // Sweeps are skipped: clear values left by the previous transform.
+          absl::c_fill(data, complex128(0.0, 0.0));
+        } else {  // Make 1D sweeps along each transform axis.
+          Sweep(fft_rank, fft_type, fft_lengths, fft_strides,
+                absl::MakeSpan(data), absl::MakeSpan(buffer));
+        }
+        CopyDataToOutput(absl::MakeSpan(data), output_index, fft_rank, fft_type,
+                         fft_lengths, fft_strides, output_lengths,
+                         output_strides, &output_literal);
+        return true;
+      }
+      return false;
+    };
+    GenerateIndices(output_lengths, output_strides, input_lengths,
+                    input_strides, input_shape.rank(), 0, 0, base_case);
+  }
+
+  evaluated_[fft] = std::move(output_literal);
+  return Status::OK();
+}
+
 // Returns an ShapeUtil::IndexIterationSpace that iterates over the output batch
 // dimensions while keeping the rest of the output dimensions clamped to 0.
 ShapeUtil::IndexIterationSpace IterationSpaceForOutputBatchIndices(
@@ -1700,7 +2240,8 @@
                       ShapeInference::InferReduceShape(
                           operand_shapes, dimensions_to_reduce,
                           /*to_apply=*/function->ComputeProgramShape()));
-  TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape))
+  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(reduce->shape(),
+                                                        inferred_return_shape))
       << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape())
       << " but is inferred to be: "
       << ShapeUtil::HumanString(inferred_return_shape);
@@ -1717,11 +2258,11 @@
 
   // All args and results have the same dimensions, so pick an arbitrary one.
   const Shape& arg_shape = input_args[0]->shape();
-  const Shape& out_shape = reduce->shape();
+  const Shape& out_shape = inferred_return_shape;
   bool is_tuple = out_shape.IsTuple();
-  const Shape& output_shape = reduce->shape().IsTuple()
-                                  ? reduce->shape().tuple_shapes(0)
-                                  : reduce->shape();
+  const Shape& output_shape = inferred_return_shape.IsTuple()
+                                  ? inferred_return_shape.tuple_shapes(0)
+                                  : inferred_return_shape;
 
   absl::Span<const int64> arg_dimensions = AsInt64Slice(arg_shape.dimensions());
 
@@ -1764,7 +2305,7 @@
       }));
 
   if (is_tuple) {
-    Literal tuple_result(reduce->shape());
+    Literal tuple_result(inferred_return_shape);
     for (int64 i = 0; i < num_args; ++i) {
       TF_CHECK_OK(tuple_result.MoveFrom(std::move(results[i]), {i}));
     }
@@ -1773,6 +2314,10 @@
     CHECK_EQ(results.size(), 1);
     evaluated_[reduce] = std::move(results[0]);
   }
+  if (!ShapeUtil::Compatible(reduce->shape(), inferred_return_shape)) {
+    TF_ASSIGN_OR_RETURN(evaluated_[reduce],
+                        evaluated_[reduce].ConvertToShape(reduce->shape()));
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 357975a..45b6a27 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -204,6 +204,8 @@
 
   Status HandleTuple(HloInstruction* tuple) override;
 
+  Status HandleFft(HloInstruction* fft) override;
+
   Status HandleGather(HloInstruction* gather) override;
 
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 81a2a96..68221c0 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -56,7 +56,7 @@
 // In bf16 mode, all f32 shapes are converted to bf16 before running.
 class HloEvaluatorTest : public HloTestBase {
  public:
-  HloEvaluatorTest() : use_bfloat16_(false) {}
+  HloEvaluatorTest() : use_bfloat16_(false) { InitializeFftData(); }
 
   StatusOr<Literal> Evaluate(
       absl::Span<const Literal* const> arg_literals = {}) {
@@ -130,11 +130,24 @@
   }
 
  protected:
-  explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {}
+  explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {
+    InitializeFftData();
+  }
+
+  // Initializes data sets used in FFT tests below.
+  void InitializeFftData();
+
   HloEvaluator evaluator_;
 
   const bool use_bfloat16_;
   std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
+
+  // Data sets used in FFT tests below.
+  ErrorSpec fft_error_ = ErrorSpec(1e-4, 1e-5);
+  Literal fft_c64x2x4x8_;
+  Literal fft_c64x2x4x8_1d_;
+  Literal fft_c64x2x4x8_2d_;
+  Literal fft_c64x2x4x8_3d_;
 };
 
 // Lets you write TEST_Ps that run twice, once with and once without bf16.
@@ -339,6 +352,13 @@
   auto expected = LiteralUtil::CreateR1<float>({});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
+
+TEST_F(HloEvaluatorTest, DoesAbsC128) {
+  auto x = LiteralUtil::CreateR0<complex128>({1, 2});
+  auto expected_real = LiteralUtil::CreateR0<double>(2.23607);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected_real), std::move(x), 3e-06);
+}
+
 TEST_F(HloEvaluatorTest, DoesNegateR2) {
   auto operand = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
@@ -1423,6 +1443,1015 @@
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+// Initialization of data sets for FFT tests:
+
+void HloEvaluatorTest::InitializeFftData() {
+  // clang-format off
+  fft_c64x2x4x8_ = LiteralUtil::CreateR3<complex64>({
+    {{{0.0, 0.0}, {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0},
+      {4.0, 0.0}, {5.0, 0.0}, {6.0, 0.0}, {7.0, 0.0}},
+     {{0.0, 0.0}, {0.0, 1.0}, {0.0, 2.0}, {0.0, 3.0},
+      {0.0, 4.0}, {0.0, 5.0}, {0.0, 6.0}, {0.0, 7.0}},
+     {{0.0, 7.0}, {1.0, 6.0}, {2.0, 5.0}, {3.0, 4.0},
+      {4.0, 3.0}, {5.0, 2.0}, {6.0, 1.0}, {7.0, 0.0}},
+     {{7.0, 0.0}, {6.0, 1.0}, {5.0, 2.0}, {4.0, 3.0},
+      {3.0, 4.0}, {2.0, 5.0}, {1.0, 6.0}, {0.0, 7.0}}},
+    {{{-4.0, 0.0}, {-3.0, 0.0}, {-2.0, 0.0}, {-1.0, 0.0},
+      {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}},
+     {{0.0, -4.0}, {0.0, -3.0}, {0.0, -2.0}, {0.0, -1.0},
+      {0.0, 1.0}, {0.0, 2.0}, {0.0, 3.0}, {0.0, 4.0}},
+     {{3.5, 3.5}, {-1.707107, -0.707107}, {-1.0, -0.0}, {-0.707107, 0.292893},
+      {-0.5, 0.5}, {-0.292893, 0.707107}, {0.0, 1.0}, {0.707107, 1.707107}},
+     {{3.5, 3.5}, {1.707107, 0.707107}, {1.0, 0.0}, {0.707107, -0.292893},
+      {0.5, -0.5}, {0.292893, -0.707107}, {-0.0, -1.0}, {-0.707107, -1.707107}}}
+  });
+  fft_c64x2x4x8_1d_ = LiteralUtil::CreateR3<complex64>({
+    {{{28.0, 0.0}, {-4.0, 9.656854}, {-4.0, 4.0}, {-4.0, 1.656854},
+      {-4.0, 0.0}, {-4.0, -1.656854}, {-4.0, -4.0}, {-4.0, -9.656854}},
+     {{0.0, 28.0}, {-9.656854, -4.0}, {-4.0, -4.0}, {-1.656854, -4.0},
+      {0.0, -4.0}, {1.656854, -4.0}, {4.0, -4.0}, {9.656854, -4.0}},
+     {{28.0, 28.0}, {5.656854, 13.656854}, {0.0, 8.0}, {-2.343146, 5.656854},
+      {-4.0, 4.0}, {-5.656854, 2.343146}, {-8.0, -0.0}, {-13.656854, -5.656854}},  // NOLINT
+     {{28.0, 28.0}, {-5.656854, -13.656854}, {-0.0, -8.0}, {2.343146, -5.656854},  // NOLINT
+      {4.0, -4.0}, {5.656854, -2.343146}, {8.0, 0.0}, {13.656854, 5.656854}}},
+    {{{0.0, 0.0}, {-5.0, 12.071068}, {-4.0, 4.0}, {-5.0, 2.071068},
+      {-4.0, 0.0}, {-5.0, -2.071068}, {-4.0, -4.0}, {-5.0, -12.071068}},
+     {{0.0, 0.0}, {-12.071068, -5.0}, {-4.0, -4.0}, {-2.071068, -5.0},
+      {0.0, -4.0}, {2.071068, -5.0}, {4.0, -4.0}, {12.071068, -5.0}},
+     {{0.0, 7.0}, {1.0, 6.0}, {2.0, 5.0}, {3.0, 4.0},
+      {4.0, 3.0}, {5.0, 2.0}, {6.0, 1.0}, {7.0, 0.0}},
+     {{7.0, 0.0}, {6.0, 1.0}, {5.0, 2.0}, {4.0, 3.0},
+      {3.0, 4.0}, {2.0, 5.0}, {1.0, 6.0}, {0.0, 7.0}}}
+  });
+  fft_c64x2x4x8_2d_ = LiteralUtil::CreateR3<complex64>({
+    {{{84.0, 84.0}, {-13.656854, 5.656854}, {-8.0, 0.0}, {-5.656854, -2.343146},
+      {-4.0, -4.0}, {-2.343146, -5.656854}, {0.0, -8.0}, {5.656854, -13.656854}},  // NOLINT
+     {{0.0, 0.0}, {0.0, -0.0}, {0.0, 0.0}, {0.0, 0.0},
+      {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+     {{28.0, -28.0}, {16.970562, 40.970562}, {0.0, 24.0}, {-7.029438, 16.970562},      // NOLINT
+      {-12.0, 12.0}, {-16.970562, 7.029438}, {-24.0, 0.0}, {-40.970562, -16.970562}},  // NOLINT
+     {{0.0, -56.0}, {-19.313708, -8.0}, {-8.0, -8.0}, {-3.313708, -8.0},
+      {0.0, -8.0}, {3.313708, -8.0}, {8.0, -8.0}, {19.313708, -8.0}}},
+    {{{7.0, 7.0}, {-10.071068, 14.071068}, {-1.0, 7.0}, {-0.071068, 4.071068},
+      {3.0, 3.0}, {4.071068, -0.071068}, {7.0, -1.0}, {14.071068, -10.071068}},
+     {{0.0, 0.0}, {-12.0, 24.142136}, {-12.0, 8.0}, {-16.0, 4.142136},
+      {-16.0, 0.0}, {-20.0, -4.142136}, {-20.0, -8.0}, {-24.0, -24.142136}},
+     {{-7.0, 7.0}, {2.071068, 22.071068}, {-3.0, 11.0}, {-3.928932, 8.071068},
+      {-3.0, 3.0}, {-4.071068, -0.071068}, {-3.0, -5.0}, {-10.071068, -14.071068}},  // NOLINT
+     {{0.0, -14.0}, {0.0, -12.0}, {0.0, -10.0}, {0.0, -8.0},
+      {0.0, -6.0}, {0.0, -4.0}, {0.0, -2.0}, {0.0, 0.0}}}
+  });
+  fft_c64x2x4x8_3d_ = LiteralUtil::CreateR3<complex64>({
+    {{{91.0, 91.0}, {-23.727922, 19.727922}, {-9.0, 7.0}, {-5.727922, 1.727922},
+      {-1.0, -1.0}, {1.727922, -5.727922}, {7.0, -9}, {19.727922, -23.727922}},
+     {{0.0, 0.0}, {-12.0, 24.142136}, {-12.0, 8.0}, {-16.0, 4.142136},
+      {-16.0, 0.0}, {-20.0, -4.142136}, {-20.0, -8.0}, {-24.0, -24.142136}},
+     {{21.0, -21.0}, {19.041630, 63.041630}, {-3.0, 35.0}, {-10.958370, 25.041630},     // NOLINT
+      {-15.0, 15.0}, {-21.041630, 6.958370}, {-27.0, -5.0}, {-51.041630, -31.041630}},  // NOLINT
+     {{0.0, -70.0}, {-19.313708, -20.0}, {-8.0, -18.0}, {-3.313708, -16.0},
+      {0.0, -14.0}, {3.313708, -12.0}, {8.0, -10.0}, {19.313708, -8.0}}},
+    {{{77.0, 77.0}, {-3.585786, -8.414214}, {-7.0, -7.0}, {-5.585786, -6.414214},   // NOLINT
+      {-7.0, -7.0}, {-6.414214, -5.585786}, {-7.0, -7.0}, {-8.414214, -3.585786}},  // NOLINT
+     {{0.0, 0.0}, {12.0, -24.142136}, {12.0, -8.0}, {16.0, -4.142136},
+      {16.0, 0.0}, {20.0, 4.142136}, {20.0, 8.0}, {24.0, 24.142136}},
+     {{35.0, -35.0}, {14.899494, 18.899494}, {3.0, 13.0}, {-3.100506, 8.899494},
+      {-9.0, 9.0}, {-12.899494, 7.100506}, {-21.0, 5.0}, {-30.899494, -2.899494}},  // NOLINT
+     {{0.0, -42.0}, {-19.313708, 4.0}, {-8.0, 2.0}, {-3.313708, 0.0},
+      {0.0, -2.0}, {3.313708, -4.0}, {8.0, -6.0}, {19.313708, -8.0}}}
+  });
+  // clang-format on
+}
+
+// Simple FFT tests:
+
+TEST_F(HloEvaluatorTest, 1D_FFT_4_on_c64x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[4] parameter(0)
+  ROOT fft = c64[4] fft(operand), fft_type=FFT, fft_length={4}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>(
+      {{1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}});
+  auto expected = LiteralUtil::CreateR1<complex64>(
+      {{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}, {-2.0, -2.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_IFFT_4_on_c64x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[4] parameter(0)
+  ROOT ifft = c64[4] fft(operand), fft_type=IFFT, fft_length={4}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>(
+      {{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}, {-2.0, -2.0}});
+  auto expected = LiteralUtil::CreateR1<complex64>(
+      {{1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_RFFT_4_on_f32x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[4] parameter(0)
+  ROOT rfft = c64[3] fft(operand), fft_type=RFFT, fft_length={4}
+}
+)";
+  auto input = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
+  auto expected =
+      LiteralUtil::CreateR1<complex64>({{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_IRFFT_4_on_c64x3) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3] parameter(0)
+  ROOT irfft = f32[4] fft(operand), fft_type=IRFFT, fft_length={4}
+}
+)";
+  auto input =
+      LiteralUtil::CreateR1<complex64>({{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}});
+  auto expected = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// 1D FFT tests:
+
+TEST_F(HloEvaluatorTest, 1D_FFT_8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_1d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_1d_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_IFFT_8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_1d_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_RFFT_8_on_f32x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[8] parameter(0)
+  ROOT rfft = c64[5] fft(operand), fft_type=RFFT, fft_length={8}
+}
+)";
+  auto input =
+      LiteralUtil::CreateR1<float>({1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1});
+  auto expected = LiteralUtil::CreateR1<complex64>({{39.6, 0.0},
+                                                    {-3.6, 8.691169},
+                                                    {-3.6, 3.6},
+                                                    {-3.6, 1.491169},
+                                                    {-3.6, 0.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_IRFFT_8_on_c64x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[5] parameter(0)
+  ROOT irfft = f32[8] fft(operand), fft_type=IRFFT, fft_length={8}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>({{39.6, 0.0},
+                                                 {-3.6, 8.691169},
+                                                 {-3.6, 3.6},
+                                                 {-3.6, 1.491169},
+                                                 {-3.6, 0.0}});
+  auto expected =
+      LiteralUtil::CreateR1<float>({1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_RFFT_9_on_f32x9) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[9] parameter(0)
+  ROOT rfft = c64[5] fft(operand), fft_type=RFFT, fft_length={9}
+}
+)";
+  auto input = LiteralUtil::CreateR1<float>(
+      {1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.9});
+  auto expected = LiteralUtil::CreateR1<complex64>({{49.5, 0.0},
+                                                    {-3.360560, 11.705792},
+                                                    {-3.893717, 5.712929},
+                                                    {-4.5, 3.117691},
+                                                    {-4.895723, 1.021942}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 1D_IRFFT_9_on_c64x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[5] parameter(0)
+  ROOT irfft = f32[9] fft(operand), fft_type=IRFFT, fft_length={9}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>({{49.5, 0.0},
+                                                 {-3.360560, 11.705792},
+                                                 {-3.893717, 5.712929},
+                                                 {-4.5, 3.117691},
+                                                 {-4.895723, 1.021942}});
+  auto expected = LiteralUtil::CreateR1<float>(
+      {1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.9});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// 2D FFT tests:
+
+TEST_F(HloEvaluatorTest, 2D_FFT_4x8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={4, 8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_2d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_2d_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_IFFT_4x8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={4, 8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_2d_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_RFFT_3x8_on_f32x3x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[3, 8] parameter(0)
+  ROOT rfft = c64[3, 5] fft(operand), fft_type=RFFT, fft_length={3, 8}
+}
+)";
+  auto input =
+      LiteralUtil::CreateR2<float>({{1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1},
+                                    {8.1, 7.2, 6.3, 5.4, 4.5, 3.6, 2.7, 1.8},
+                                    {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}});
+  auto expected = LiteralUtil::CreateR2<complex64>({{{118.8, 0.0},
+                                                     {-4.4, 10.622540},
+                                                     {-4.4, 4.4},
+                                                     {-4.4, 1.822540},
+                                                     {-4.4, 0.0}},
+                                                    {{0.0, 0.0},
+                                                     {-19.926162, 0.797280},
+                                                     {-10.128203, -3.728203},
+                                                     {-6.069756, -5.602720},
+                                                     {-3.2, -6.928203}},
+                                                    {{0.0, 0.0},
+                                                     {13.526162, 14.653687},
+                                                     {3.728203, 10.128203},
+                                                     {-0.330244, 8.253687},
+                                                     {-3.2, 6.928203}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_IRFFT_3x8_on_c64x3x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 5] parameter(0)
+  ROOT irfft = f32[3, 8] fft(operand), fft_type=IRFFT, fft_length={3, 8}
+}
+)";
+  auto input = LiteralUtil::CreateR2<complex64>({{{118.8, 0.0},
+                                                  {-4.4, 10.622540},
+                                                  {-4.4, 4.4},
+                                                  {-4.4, 1.822540},
+                                                  {-4.4, 0.0}},
+                                                 {{0.0, 0.0},
+                                                  {-19.926162, 0.797280},
+                                                  {-10.128203, -3.728203},
+                                                  {-6.069756, -5.602720},
+                                                  {-3.2, -6.928203}},
+                                                 {{0.0, 0.0},
+                                                  {13.526162, 14.653687},
+                                                  {3.728203, 10.128203},
+                                                  {-0.330244, 8.253687},
+                                                  {-3.2, 6.928203}}});
+  auto expected =
+      LiteralUtil::CreateR2<float>({{1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1},
+                                    {8.1, 7.2, 6.3, 5.4, 4.5, 3.6, 2.7, 1.8},
+                                    {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_RFFT_3x9_on_f32x3x9) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[3, 9] parameter(0)
+  ROOT rfft = c64[3, 5] fft(operand), fft_type=RFFT, fft_length={3, 9}
+}
+)";
+  auto input = LiteralUtil::CreateR2<float>(
+      {{1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1},
+       {9.1, 8.2, 7.3, 6.4, 5.5, 4.6, 3.7, 2.8, 1.9},
+       {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9}});
+  auto expected = LiteralUtil::CreateR2<complex64>({{{148.5, 0.0},
+                                                     {-4.95, 13.600013},
+                                                     {-4.95, 5.899180},
+                                                     {-4.95, 2.857884},
+                                                     {-4.95, 0.872819}},
+                                                    {{0.0, 0.0},
+                                                     {-25.014467, 2.096690},
+                                                     {-12.888800, -3.503916},
+                                                     {-8.1, -5.715768},
+                                                     {-4.974333, -7.159452}},
+                                                    {{0.0, 0.0},
+                                                     {17.814467, 17.685147},
+                                                     {5.688800, 12.084542},
+                                                     {0.9, 9.872690},
+                                                     {-2.225667, 8.429006}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_IRFFT_3x9_on_c64x3x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 5] parameter(0)
+  ROOT irfft = f32[3, 9] fft(operand), fft_type=IRFFT, fft_length={3, 9}
+}
+)";
+  auto input = LiteralUtil::CreateR2<complex64>({{{148.5, 0.0},
+                                                  {-4.95, 13.600013},
+                                                  {-4.95, 5.899180},
+                                                  {-4.95, 2.857884},
+                                                  {-4.95, 0.872819}},
+                                                 {{0.0, 0.0},
+                                                  {-25.014467, 2.096690},
+                                                  {-12.888800, -3.503916},
+                                                  {-8.1, -5.715768},
+                                                  {-4.974333, -7.159452}},
+                                                 {{0.0, 0.0},
+                                                  {17.814467, 17.685147},
+                                                  {5.688800, 12.084542},
+                                                  {0.9, 9.872690},
+                                                  {-2.225667, 8.429006}}});
+  auto expected = LiteralUtil::CreateR2<float>(
+      {{1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1},
+       {9.1, 8.2, 7.3, 6.4, 5.5, 4.6, 3.7, 2.8, 1.9},
+       {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// 3D FFT tests:
+
+TEST_F(HloEvaluatorTest, 3D_FFT_2x4x8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={2, 4, 8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_3d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_3d_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_IFFT_2x4x8_on_c64x2x4x8) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8] parameter(0)
+  ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={2, 4, 8}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_3d_}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x4_on_f32x3x3x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[3, 3, 4] parameter(0)
+  ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<float>(
+      {{{1.8, 2.7, 3.6, 4.5}, {8.1, 7.2, 6.3, 5.4}, {1.1, 2.2, 3.3, 4.4}},
+       {{5.4, 6.3, 7.2, 8.1}, {4.5, 3.6, 2.7, 1.8}, {5.5, 6.6, 7.7, 8.8}},
+       {{-1.8, -2.7, -3.6, -4.5},
+        {-5.4, -6.3, -7.2, -8.1},
+        {1.9, 2.9, 3.9, 4.9}}});
+  auto expected = LiteralUtil::CreateR3<complex64>(
+      {{{{92.8, 0.0}, {-2.8, 2.8}, {-2.8, 0.0}},
+        {{-5.9, 35.160631}, {-11.519100, -8.919100}, {-1.3, -10.219100}},
+        {{-5.9, -35.160631}, {8.919100, 11.519100}, {-1.3, 10.219100}}},
+       {{{29.5, -81.579593}, {1.390897, 5.190897}, {-1.9, 3.290897}},
+        {{-25.1, -49.017038}, {1.044486, 4.844486}, {-1.9, 2.944486}},
+        {{11.8, 27.712813}, {1.517691, 4.717691}, {-1.6, 3.117691}}},
+       {{{29.5, 81.579593}, {-5.190897, -1.390897}, {-1.9, -3.290897}},
+        {{11.8, -27.712813}, {-4.717691, -1.517691}, {-1.6, -3.117691}},
+        {{-25.1, 49.017038}, {-4.844486, -1.044486}, {-1.9, -2.944486}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x4_on_c64x3x3x3) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3, 3] parameter(0)
+  ROOT irfft = f32[3, 3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<complex64>(
+      {{{{92.8, 0.0}, {-2.8, 2.8}, {-2.8, 0.0}},
+        {{-5.9, 35.160631}, {-11.519100, -8.919100}, {-1.3, -10.219100}},
+        {{-5.9, -35.160631}, {8.919100, 11.519100}, {-1.3, 10.219100}}},
+       {{{29.5, -81.579593}, {1.390897, 5.190897}, {-1.9, 3.290897}},
+        {{-25.1, -49.017038}, {1.044486, 4.844486}, {-1.9, 2.944486}},
+        {{11.8, 27.712813}, {1.517691, 4.717691}, {-1.6, 3.117691}}},
+       {{{29.5, 81.579593}, {-5.190897, -1.390897}, {-1.9, -3.290897}},
+        {{11.8, -27.712813}, {-4.717691, -1.517691}, {-1.6, -3.117691}},
+        {{-25.1, 49.017038}, {-4.844486, -1.044486}, {-1.9, -2.944486}}}});
+  auto expected = LiteralUtil::CreateR3<float>(
+      {{{1.8, 2.7, 3.6, 4.5}, {8.1, 7.2, 6.3, 5.4}, {1.1, 2.2, 3.3, 4.4}},
+       {{5.4, 6.3, 7.2, 8.1}, {4.5, 3.6, 2.7, 1.8}, {5.5, 6.6, 7.7, 8.8}},
+       {{-1.8, -2.7, -3.6, -4.5},
+        {-5.4, -6.3, -7.2, -8.1},
+        {1.9, 2.9, 3.9, 4.9}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x5_on_f32x3x3x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[3, 3, 5] parameter(0)
+  ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 5}
+}
+)";
+  auto input = LiteralUtil::CreateR3<float>({{{1.8, 2.7, 3.6, 4.5, 5.4},
+                                              {8.1, 7.2, 6.3, 5.4, 4.5},
+                                              {1.1, 2.2, 3.3, 4.4, 5.5}},
+                                             {{5.4, 6.3, 7.2, 8.1, 9.0},
+                                              {4.5, 3.6, 2.7, 1.8, 0.9},
+                                              {5.5, 6.6, 7.7, 8.8, 9.9}},
+                                             {{-1.8, -2.7, -3.6, -4.5, -5.4},
+                                              {-5.4, -6.3, -7.2, -8.1, -9.0},
+                                              {1.9, 2.9, 3.9, 4.9, 5.9}}});
+  auto expected = LiteralUtil::CreateR3<complex64>(
+      {{{{119.5, 0.0}, {-3.5, 4.817337}, {-3.5, 1.137219}},
+        {{-5.75, 56.724664}, {-19.206730, -10.537254}, {-5.775483, -12.245880}},
+        {{-5.75, -56.724664}, {15.956730, 15.010495}, {2.525483, 13.301869}}},
+       {{{39.25, -106.088112}, {3.286913, 7.382528}, {-1.038404, 4.885305}},
+        {{-29.0, -64.951905}, {2.690922, 6.949515}, {-1.179098, 4.452292}},
+        {{16.75, 30.743902}, {3.363918, 6.649878}, {-0.733751, 4.546954}}},
+       {{{39.25, 106.088112}, {-8.036913, -0.844714}, {-3.711596, -3.341936}},
+        {{16.75, -30.743902}, {-7.363918, -1.144350}, {-3.266249, -3.247275}},
+        {{-29.0, 64.951905}, {-7.440922, -0.411701}, {-3.570902, -2.908924}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x5_on_c64x3x3x3) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3, 3] parameter(0)
+  ROOT irfft = f32[3, 3, 5] fft(operand), fft_type=IRFFT, fft_length={3, 3, 5}
+}
+)";
+  auto input = LiteralUtil::CreateR3<complex64>(
+      {{{{119.5, 0.0}, {-3.5, 4.817337}, {-3.5, 1.137219}},
+        {{-5.75, 56.724664}, {-19.206730, -10.537254}, {-5.775483, -12.245880}},
+        {{-5.75, -56.724664}, {15.956730, 15.010495}, {2.525483, 13.301869}}},
+       {{{39.25, -106.088112}, {3.286913, 7.382528}, {-1.038404, 4.885305}},
+        {{-29.0, -64.951905}, {2.690922, 6.949515}, {-1.179098, 4.452292}},
+        {{16.75, 30.743902}, {3.363918, 6.649878}, {-0.733751, 4.546954}}},
+       {{{39.25, 106.088112}, {-8.036913, -0.844714}, {-3.711596, -3.341936}},
+        {{16.75, -30.743902}, {-7.363918, -1.144350}, {-3.266249, -3.247275}},
+        {{-29.0, 64.951905}, {-7.440922, -0.411701}, {-3.570902, -2.908924}}}});
+  auto expected = LiteralUtil::CreateR3<float>({{{1.8, 2.7, 3.6, 4.5, 5.4},
+                                                 {8.1, 7.2, 6.3, 5.4, 4.5},
+                                                 {1.1, 2.2, 3.3, 4.4, 5.5}},
+                                                {{5.4, 6.3, 7.2, 8.1, 9.0},
+                                                 {4.5, 3.6, 2.7, 1.8, 0.9},
+                                                 {5.5, 6.6, 7.7, 8.8, 9.9}},
+                                                {{-1.8, -2.7, -3.6, -4.5, -5.4},
+                                                 {-5.4, -6.3, -7.2, -8.1, -9.0},
+                                                 {1.9, 2.9, 3.9, 4.9, 5.9}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// FFT tests with non-default data layout:
+
+TEST_F(HloEvaluatorTest, 1D_FFT_8_on_c64x2x4x8_with_layout) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8]{0, 2, 1} parameter(0)
+  ROOT fft = c64[2, 4, 8]{1, 2, 0} fft(operand), fft_type=FFT, fft_length={8}
+}
+)";
+  auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({0, 2, 1}));
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_1d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_1d_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 2D_FFT_4x8_on_c64x2x4x8_with_layout) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8]{2, 0, 1} parameter(0)
+  ROOT fft = c64[2, 4, 8]{1, 0, 2} fft(operand), fft_type=FFT, fft_length={4, 8}
+}
+)";
+  auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({2, 0, 1}));
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_2d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_2d_, result, fft_error_));
+}
+
+TEST_F(HloEvaluatorTest, 3D_FFT_2x4x8_on_c64x2x4x8_with_layout) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[2, 4, 8]{1, 2, 0} parameter(0)
+  ROOT fft =
+    c64[2, 4, 8]{0, 2, 1} fft(operand), fft_type=FFT, fft_length={2, 4, 8}
+}
+)";
+  auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({1, 2, 0}));
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_3d_.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_3d_, result, fft_error_));
+}
+
+// FFT tests with unusual parameters:
+
+// Zero-length transform: fft_length={0}; the expected result is all zeros.
+TEST_F(HloEvaluatorTest, 1D_FFT_0_on_c64x1x1x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 1, 1, 1] parameter(0)
+  ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={0}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}});
+  auto expected = LiteralUtil::CreateR4<complex64>({{{{{0.0, 0.0}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// Zero-length axis: the operand's last dimension has size 0.
+TEST_F(HloEvaluatorTest, 1D_FFT_1_on_c64x1x1x1x0) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 1, 1, 0] parameter(0)
+  ROOT fft = c64[1, 1, 1, 0] fft(operand), fft_type=FFT, fft_length={1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto input,
+      LiteralUtil::CreateR4<complex64>({{{{}}}}).Reshape({1, 1, 1, 0}));
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All dimensions have length 1; the expected output equals the input.
+TEST_F(HloEvaluatorTest, 1D_FFT_1_on_c64x1x1x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 1, 1, 1] parameter(0)
+  ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// Zero-length transform: fft_length={1, 0, 1}; expected result is all zeros.
+TEST_F(HloEvaluatorTest, 3D_FFT_1x0x1_on_c64x1x1x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 1, 1, 1] parameter(0)
+  ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1, 0, 1}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}});
+  auto expected = LiteralUtil::CreateR4<complex64>({{{{{0.0, 0.0}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// Zero-length axes: operand dimensions 0 and 2 have size 0.
+TEST_F(HloEvaluatorTest, 3D_FFT_1x1x1_on_c64x0x1x0x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[0, 1, 0, 1] parameter(0)
+  ROOT fft = c64[0, 1, 0, 1] fft(operand), fft_type=FFT, fft_length={1, 1, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto input,
+      LiteralUtil::CreateR4<complex64>({{{{}}}}).Reshape({0, 1, 0, 1}));
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All dimensions have length 1; the expected output equals the input.
+TEST_F(HloEvaluatorTest, 3D_FFT_1x1x1_on_c64x1x1x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 1, 1, 1] parameter(0)
+  ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1, 1, 1}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All dimensions except one (of size 3) have length 1.
+TEST_F(HloEvaluatorTest, 3D_FFT_3x1x1_on_c64x1x3x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 3, 1, 1] parameter(0)
+  ROOT fft = c64[1, 3, 1, 1] fft(operand), fft_type=FFT, fft_length={3, 1, 1}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>(
+      {{{{{42.24, 24.42}}}, {{{-42.24, 24.42}}}, {{{42.24, -24.42}}}}});
+  auto expected =
+      LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}},
+                                         {{{84.5367, 97.5818}}},
+                                         {{{-0.0566792, -48.7418}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// All dimensions except one (of size 3) have length 1.
+TEST_F(HloEvaluatorTest, 3D_IFFT_3x1x1_on_c64x1x3x1x1) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[1, 3, 1, 1] parameter(0)
+  ROOT ifft = c64[1, 3, 1, 1] fft(operand), fft_type=IFFT, fft_length={3, 1, 1}
+}
+)";
+  auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}},
+                                                  {{{84.5367, 97.5818}}},
+                                                  {{{-0.0566792, -48.7418}}}}});
+  auto expected = LiteralUtil::CreateR4<complex64>(
+      {{{{{42.24, 24.42}}}, {{{-42.24, 24.42}}}, {{{42.24, -24.42}}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// Odd transform length.
+TEST_F(HloEvaluatorTest, 1D_FFT_5_on_c64x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[5] parameter(0)
+  ROOT fft = c64[5] fft(operand), fft_type=FFT, fft_length={5}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>(
+      {{1.0, 5.0}, {2.0, 4.0}, {3.0, 3.0}, {4.0, 2.0}, {5.0, 1.0}});
+  auto expected = LiteralUtil::CreateR1<complex64>({{15.0, 15.0},
+                                                    {0.940955, 5.94095},
+                                                    {-1.6877, 3.3123},
+                                                    {-3.3123, 1.6877},
+                                                    {-5.94095, -0.940955}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// Odd transform length.
+TEST_F(HloEvaluatorTest, 1D_IFFT_5_on_c64x5) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[5] parameter(0)
+  ROOT ifft = c64[5] fft(operand), fft_type=IFFT, fft_length={5}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>({{15.0, 15.0},
+                                                 {0.940955, 5.94095},
+                                                 {-1.6877, 3.3123},
+                                                 {-3.3123, 1.6877},
+                                                 {-5.94095, -0.940955}});
+  auto expected = LiteralUtil::CreateR1<complex64>(
+      {{1.0, 5.0}, {2.0, 4.0}, {3.0, 3.0}, {4.0, 2.0}, {5.0, 1.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// All input values are zero.
+TEST_F(HloEvaluatorTest, 1D_FFT_4_on_zero_c64x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[4] parameter(0)
+  ROOT fft = c64[4] fft(operand), fft_type=FFT, fft_length={4}
+}
+)";
+  auto input = LiteralUtil::CreateR1<complex64>(
+      {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All input values are zero.
+TEST_F(HloEvaluatorTest, 3D_FFT_3x3x4_on_zero_c64x3x3x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3, 4] parameter(0)
+  ROOT fft = c64[3, 3, 4] fft(operand), fft_type=FFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<complex64>(
+      {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All input values are zero.
+TEST_F(HloEvaluatorTest, 3D_IFFT_3x3x4_on_zero_c64x3x3x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3, 4] parameter(0)
+  ROOT ifft = c64[3, 3, 4] fft(operand), fft_type=IFFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<complex64>(
+      {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_));
+}
+
+// All input values are zero.
+TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x4_on_zero_f32x3x3x4) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = f32[3, 3, 4] parameter(0)
+  ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<float>(
+      {{{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}},
+       {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}},
+       {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}});
+  auto expected = LiteralUtil::CreateR3<complex64>(
+      {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// All input values are zero.
+TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x4_on_zero_c64x3x3x3) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3, 3] parameter(0)
+  ROOT irfft = f32[3, 3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 3, 4}
+}
+)";
+  auto input = LiteralUtil::CreateR3<complex64>(
+      {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}},
+       {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}},
+        {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}});
+  auto expected = LiteralUtil::CreateR3<float>(
+      {{{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}},
+       {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}},
+       {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
+// Input values for which IRFFT discards non-zero imaginary parts.
+TEST_F(HloEvaluatorTest, 2D_IRFFT_3x4_on_c64x3x3) {
+  const char* hlo_text = R"(
+HloModule Fft
+
+ENTRY main {
+  operand = c64[3, 3] parameter(0)
+  ROOT irfft = f32[3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 4}
+}
+)";
+  auto input =
+      LiteralUtil::CreateR2<complex64>({{{0.0, 0.0}, {1.0, 0.0}, {2.0, 0.0}},
+                                        {{3.0, 0.0}, {4.0, 0.0}, {5.0, 0.0}},
+                                        {{6.0, 0.0}, {7.0, 0.0}, {8.0, 0.0}}});
+  auto expected =
+      LiteralUtil::CreateR2<float>({{4.0, -0.5, 0.0, -0.5},
+                                    {-1.5, 0.433013, 0.0, -0.433013},
+                                    {-1.5, -0.433013, 0.0, 0.433013}});
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input}));
+  EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape()));
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_));
+}
+
 class HloEvaluatorPreciseReduceTest : public HloTestBase {};
 
 // Tests that Reduce doesn't lose precision when adding many numbers (because
@@ -2910,6 +3939,30 @@
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+TEST_F(HloEvaluatorTest, MixedPrecisionReduction) {
+  const string hlo_text = R"(
+HloModule MixedPrecisionReduction
+
+add_f32 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY main {
+  arg0 = f32[4]{0} parameter(0)
+  init = f32[] constant(0)
+  ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_f32
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+
+  Literal arg = LiteralUtil::CreateR1<float>({1.0f, 3.0f, -2.0f, 42.0f});
+  Literal expected = LiteralUtil::CreateR0<bfloat16>(bfloat16(44.0f));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&arg}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
 TEST_F(HloEvaluatorTest, DontFailOnCallUnimplementedOps) {
   // Infeed triggers unimplemented error within HandleCall, and we verify that
   // the Evaluator does fail in such case.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index ab27ac8..a2afb0c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -68,8 +68,8 @@
 // Templated DfsHloVisitor for use by HloEvaluator.
 //
 // Typically ReturnT here indicates the resulting literal type of each evaluated
-// Handle* method of a TypedVisitor.  There are however a few notable exceptions
-// to this rule, notably:
+// Handle* method of a TypedVisitor.  There are however a few exceptions to this
+// rule, notably:
 // - HandleCompare and HandleIsFinite: where the resulting literal type is
 //   always boolean.
 // - HandleImag and HandleReal: where the resulting literal type is always float
@@ -81,7 +81,7 @@
 //   - ReturnT: The type of input and output of each operation.
 //   - ElementwiseT: The type in which internal computation are done.
 //
-// This a logically a private part of HloEvaluator.  It lives in this header
+// This is logically a private part of HloEvaluator.  It lives in this header
 // file rather than in hlo_evaluator.cc because we use extern templates and a
 // bunch of independent cc files to speed up compiling the many instantiations
 // of this class.
@@ -180,7 +180,8 @@
         parent_->GetEvaluatedLiteralFor(abs->operand(0));
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[abs],
-        (HloEvaluator::ElementWiseUnaryOpImpl<float, NativeT>(
+        (HloEvaluator::ElementWiseUnaryOpImpl<typename NativeT::value_type,
+                                              NativeT>(
             abs, [](NativeT elem_operand) { return std::abs(elem_operand); },
             operand_literal)));
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 70ba0b1..3a1ba77 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -61,9 +61,6 @@
 using absl::StrCat;
 using absl::StrFormat;
 using absl::StrJoin;
-using tensorflow::Env;
-using tensorflow::WriteStringToFile;
-using tensorflow::io::JoinPath;
 
 // Used to indicate how we should treat a given HLOInstruction in the graph.
 // should we treat it like normal, hide it, and so on?
@@ -120,7 +117,7 @@
 // We arbitrarily set this as the boundary between "large" and "small"
 // instructions.
 bool IsSmall(const HloInstruction* instr) {
-  if (ShapeUtil::HasPrimitiveType(instr->shape(), OPAQUE) ||
+  if (ShapeUtil::HasPrimitiveType(instr->shape(), OPAQUE_TYPE) ||
       ShapeUtil::HasPrimitiveType(instr->shape(), TOKEN)) {
     return true;
   }
@@ -1043,6 +1040,7 @@
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
+    case HloOpcode::kPartitionId:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kSend:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index bbda80c..21a3aa9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -416,6 +416,10 @@
       instruction = CreateReplicaId();
       break;
     }
+    case HloOpcode::kPartitionId: {
+      instruction = CreatePartitionId();
+      break;
+    }
     case HloOpcode::kConvolution: {
       TF_RET_CHECK(proto.has_window());
       TF_RET_CHECK(proto.has_convolution_dimension_numbers());
@@ -456,11 +460,11 @@
         }
         instruction =
             CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
-                             operand_shapes, proto.custom_call_opaque());
+                             operand_shapes, proto.backend_config());
       } else {
         instruction =
             CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
-                             proto.custom_call_opaque());
+                             proto.backend_config());
       }
       if (proto.has_window()) {
         static_cast<HloCustomCallInstruction*>(instruction.get())
@@ -869,6 +873,12 @@
       new HloInstruction(HloOpcode::kReplicaId, ShapeUtil::MakeShape(U32, {})));
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreatePartitionId() {
+  return absl::WrapUnique(new HloInstruction(HloOpcode::kPartitionId,
+                                             ShapeUtil::MakeShape(U32, {})));
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
     const Shape& infeed_shape, HloInstruction* token_operand,
     const string& config) {
@@ -1506,6 +1516,10 @@
       CHECK_EQ(new_operands.size(), 0);
       clone = CreateReplicaId();
       break;
+    case HloOpcode::kPartitionId:
+      CHECK_EQ(new_operands.size(), 0);
+      clone = CreatePartitionId();
+      break;
   }
   // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
@@ -1765,6 +1779,7 @@
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNegate:
+    case HloOpcode::kPartitionId:
     case HloOpcode::kPopulationCount:
     case HloOpcode::kPower:
     case HloOpcode::kReal:
@@ -1919,6 +1934,7 @@
   std::replace(user->operands_.begin(), user->operands_.end(), this,
                new_producer);
   new_producer->AddUser(user);
+  // Custom fusions may not be able to handle deduplicated operands.
   if (user->opcode() == HloOpcode::kFusion) {
     TF_RETURN_IF_ERROR(
         Cast<HloFusionInstruction>(user)->DeduplicateFusionOperands());
@@ -2593,6 +2609,8 @@
       return visitor->HandleCollectivePermute(this);
     case HloOpcode::kReplicaId:
       return visitor->HandleReplicaId(this);
+    case HloOpcode::kPartitionId:
+      return visitor->HandlePartitionId(this);
     case HloOpcode::kTuple:
       return visitor->HandleTuple(this);
     case HloOpcode::kMap:
@@ -2754,7 +2772,7 @@
   // Calculating the instruction count within a module can be expensive on large
   // models so only do it if the visit state is empty. This will help when the
   // same visitor is reused across many computations of a single module.
-  if (visitor->VisitStateSize() == 0) {
+  if (visitor->VisitStateCapacity() == 0) {
     visitor->ReserveVisitStates(root->GetModule()->instruction_count());
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 8757da5..23b5566 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -529,6 +529,9 @@
   // Creates an instruction that returns a U32 replica ID.
   static std::unique_ptr<HloInstruction> CreateReplicaId();
 
+  // Creates an instruction that returns a U32 partition ID.
+  static std::unique_ptr<HloInstruction> CreatePartitionId();
+
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 9f9dde9..9fb9b55 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -1580,6 +1580,9 @@
 }
 
 Status HloFusionInstruction::DeduplicateFusionOperands() {
+  if (IsCustomFusion()) {
+    return Status::OK();
+  }
   absl::flat_hash_map<const HloInstruction*, int> operand_indices;
   std::vector<int> operands_to_remove;
   for (int i = 0; i < operand_count(); ++i) {
@@ -2085,7 +2088,7 @@
         *convolution_dimension_numbers_;
   }
   proto.set_custom_call_target(custom_call_target_);
-  proto.set_custom_call_opaque(opaque_);
+  proto.set_backend_config(opaque_);
   proto.set_feature_group_count(feature_group_count_);
   proto.set_batch_group_count(batch_group_count_);
   if (layout_constrained()) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 0adfef7..cf0f4bc 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -220,6 +220,7 @@
 HLO_MATCHER(Negate);
 HLO_MATCHER(Outfeed);
 HLO_MATCHER(Pad);
+HLO_MATCHER(PartitionId);
 HLO_MATCHER(Power);
 HLO_MATCHER(Recv);
 HLO_MATCHER(RecvDone);
@@ -227,6 +228,7 @@
 HLO_MATCHER(ReducePrecision);
 HLO_MATCHER(ReduceWindow);
 HLO_MATCHER(Remainder);
+HLO_MATCHER(ReplicaId);
 HLO_MATCHER(Reshape);
 HLO_MATCHER(Reverse);
 HLO_MATCHER(Rng);
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 82b131a..ba3c069 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -22,6 +22,7 @@
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -126,6 +127,7 @@
 
     // Create map containing the number of unscheduled uses (hlo instructions)
     // of each logical buffer.
+    unscheduled_use_count_.reserve(points_to_analysis.num_logical_buffers());
     for (auto* instruction : computation->instructions()) {
       for (auto* buffer :
            points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
@@ -460,6 +462,7 @@
     sequence.push_back(hlo);
     return Status::OK();
   });
+  visitor.ReserveVisitStates(computation->instruction_count());
   TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder(
       &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
                                              const HloInstruction* b) {
@@ -612,11 +615,13 @@
     if (!computation->IsFusionComputation()) {
       HloInstructionSequence& computation_sequence =
           schedule.GetOrCreateSequence(computation);
-      TF_RETURN_IF_ERROR(computation->Accept(
+      FunctionVisitor visitor(
           [&computation_sequence](HloInstruction* instruction) {
             computation_sequence.push_back(instruction);
             return Status::OK();
-          }));
+          });
+      visitor.ReserveVisitStates(computation->instruction_count());
+      TF_RETURN_IF_ERROR(computation->Accept(&visitor));
     }
   }
   TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 6d3a498..ecd4eb3 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -108,6 +108,7 @@
   V(kOutfeed, "outfeed", 2)                                            \
   V(kPad, "pad", 2)                                                    \
   V(kParameter, "parameter", 0)                                        \
+  V(kPartitionId, "partition-id", 0)                                   \
   V(kPopulationCount, "popcnt", 1)                                     \
   V(kPower, "power", 2)                                                \
   V(kReal, "real", 1)                                                  \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 831771f..a4804a8 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -190,17 +190,30 @@
   }
 
   // The use at a while is an input to a phi, and logically occurs before values
-  // are defined in the body or condition computations.
+  // are defined in the body. Note that the use is *not* before the value if the
+  // value is defined in the condition and is not the condition parameter, since
+  // the input of a while's live range is only ended at the start of the body.
   if (use.instruction->opcode() == HloOpcode::kWhile) {
     const HloInstruction* xla_while = use.instruction;
     if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
-                                           xla_while->while_body()) ||
-        call_graph_->InstructionIsNestedIn(value.defining_instruction(),
-                                           xla_while->while_condition())) {
+                                           xla_while->while_body())) {
       VLOG(4) << "  use is while " << use.instruction->name()
-              << " and def is in condition or body";
+              << " and def is in body";
       return true;
     }
+    if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
+                                           xla_while->while_condition())) {
+      if (value.defining_instruction() !=
+          xla_while->while_condition()->parameter_instruction(0)) {
+        VLOG(4) << "  use is while " << use.instruction->name()
+                << " and def is in condition and is not the parameter";
+        return false;
+      } else {
+        VLOG(4) << "  use is while " << use.instruction->name()
+                << " and def is in condition and is the parameter";
+        return true;
+      }
+    }
   }
 
   // Similarly if the value is defined at a while, it logically occurs after any
@@ -263,10 +276,23 @@
   }
 
   if (a.live_out_of_module()) {
-    VLOG(4) << a << " is live out of module and defined before " << b;
+    VLOG(4) << a << " is live out of module and not defined before " << b;
     return false;
   }
 
+  // If the root instruction aliases the buffer 'a', the live range of 'a' is
+  // until the end of the computation and can never be strictly before another
+  // buffer nested in the same computation. This is needed to prevent the root
+  // instruction's buffers from being reused by later instructions even when
+  // the root is not the last instruction in the schedule.
+  for (const HloPosition& pos : a.positions()) {
+    if (pos.instruction->parent()->root_instruction() == pos.instruction &&
+        call_graph().InstructionIsNestedIn(b.instruction(),
+                                           pos.instruction->parent())) {
+      return false;
+    }
+  }
+
   // All uses of 'a' must be before 'b' is defined.
   for (const HloUse& use : a.uses()) {
     if (dataflow.DoesNotUseOperandBuffer(a.instruction(), a.index(),
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 8e8b9d6..1140811 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -247,6 +247,11 @@
   EXPECT_FALSE(ordering.LiveRangeStrictlyBefore(
       dataflow->GetValueDefinedAt(constant),
       dataflow->GetValueDefinedAt(xla_while), *dataflow));
+  // Value defined as init of while interferes with instructions in the
+  // condition other than the parameter.
+  EXPECT_FALSE(ordering.LiveRangeStrictlyBefore(
+      dataflow->GetValueDefinedAt(constant),
+      dataflow->GetValueDefinedAt(convert), *dataflow));
   EXPECT_TRUE(ordering.MayInterfere(dataflow->GetValueDefinedAt(constant),
                                     dataflow->GetValueDefinedAt(xla_while),
                                     *dataflow));
@@ -261,8 +266,10 @@
   EXPECT_FALSE(ordering.MayInterfere(dataflow->GetValueDefinedAt(negate),
                                      dataflow->GetValueDefinedAt(xla_while),
                                      *dataflow));
-
-  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(convert),
+  EXPECT_TRUE(ordering.MayInterfere(dataflow->GetValueDefinedAt(constant),
+                                    dataflow->GetValueDefinedAt(xla_while),
+                                    *dataflow));
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(constant),
                                        dataflow->GetValueDefinedAt(xla_while)));
   EXPECT_TRUE(ordering.LiveRangeStrictlyBefore(
       dataflow->GetValueDefinedAt(convert),
@@ -496,5 +503,36 @@
                                     *dataflow));
 }
 
+TEST_F(HloOrderingTest, InterferenceWithOuterRoot) {
+  absl::string_view hlo_string = R"(
+HloModule InterferenceWithOuterRoot, is_scheduled=true
+
+Emmbedded (embedded_param: f32[42]) -> f32[42] {
+  embedded_param = f32[42]{0} parameter(0)
+  multiply = f32[42]{0} multiply(embedded_param, embedded_param)
+  ROOT log = f32[42]{0} log(multiply)
+}
+
+ENTRY InterferenceWithOuterRoot {
+  param = f32[4096,4096]{1,0} parameter(0)
+  ROOT add = f32[4096,4096]{1,0} add(param, param)
+  call = f32[42]{0} call(param), to_apply=Emmbedded
+}
+
+)";
+  HloModuleConfig hlo_config;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string, hlo_config));
+  TF_ASSERT_OK_AND_ASSIGN(auto dataflow,
+                          HloDataflowAnalysis::Run(*module, /*ssa_form=*/true));
+  DependencyHloOrdering ordering(module.get());
+  auto multiply = FindInstruction(module.get(), "multiply");
+  auto add = FindInstruction(module.get(), "add");
+
+  EXPECT_TRUE(ordering.MayInterfere(dataflow->GetValueDefinedAt(multiply),
+                                    dataflow->GetValueDefinedAt(add),
+                                    *dataflow));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index d158faf..3667fc3 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -891,6 +891,15 @@
       instruction = builder->AddInstruction(HloInstruction::CreateReplicaId());
       break;
     }
+    case HloOpcode::kPartitionId: {
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreatePartitionId());
+      break;
+    }
     case HloOpcode::kReshape: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -1459,6 +1468,9 @@
       if (!ParseOperands(&operands)) {
         return false;
       }
+      if (!ShapeUtil::IsScalar(operands[0]->shape())) {
+        return Error(lexer_.GetLoc(), "The first operand must be a scalar");
+      }
       const bool branch_index_is_bool =
           operands[0]->shape().element_type() == PRED;
       if (branch_index_is_bool) {
@@ -1467,6 +1479,10 @@
         attrs["false_computation"] = {
             /*required=*/true, AttrTy::kHloComputation, &false_computation};
       } else {
+        if (operands[0]->shape().element_type() != S32) {
+          return Error(lexer_.GetLoc(),
+                       "The first operand must be a scalar of PRED or S32");
+        }
         attrs["branch_computations"] = {/*required=*/true,
                                         AttrTy::kBracedHloComputationList,
                                         &branch_computations};
@@ -4137,6 +4153,14 @@
     }
   }
 
+  if (lexer_.GetKind() != TokKind::kEof) {
+    Error(
+        lexer_.GetLoc(),
+        "Syntax error:\nExpected eof after parsing single instruction.  Did "
+        "you mean to write an HLO module and forget the \"HloModule\" header?");
+    return false;
+  }
+
   module->AddEntryComputation(builder.Build());
   for (auto& comp : computations_) {
     module->AddEmbeddedComputation(std::move(comp));
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index de23c32..011850e 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1438,6 +1438,17 @@
 
 )"
 },
+// partition-id
+{
+"PartitionId",
+R"(HloModule partition-id
+
+ENTRY PartitionId {
+  ROOT id = u32[] partition-id()
+}
+
+)"
+},
 // Iota
 {
 "Iota",
@@ -1536,6 +1547,19 @@
 
 )"
 },
+
+// Bitcast-convert usage
+{
+"BitcastConvert",
+R"(HloModule BitcastConvert
+
+ENTRY BitcastConvertUsage {
+  p = f32[100]{0} parameter(0)
+  ROOT out = u32[100]{0} bitcast-convert(p)
+}
+
+)"
+}
 });
   // clang-format on
 }
@@ -2478,6 +2502,16 @@
   EXPECT_EQ(convolution->feature_group_count(), 1);
 }
 
+TEST(HloParserSingleOpTest, MultipleOpsProducesError) {
+  const string text = R"(
+    param = f32[2,5,1,3] parameter(0)
+    transpose = f32[1,5,2,3] transpose(param), dimensions={2,1,0,3}
+  )";
+  auto status = ParseHloString(text).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), ::testing::HasSubstr("Expected eof"));
+}
+
 TEST_F(HloParserTest, IsScheduledIsFalse) {
   const string text = R"(
 HloModule axpy_module, is_scheduled=false
@@ -2840,5 +2874,89 @@
                                    "parameter_replication has 3 elements"));
 }
 
+TEST_F(HloParserTest, CheckIndexedConditionalDimension) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  branch0 {
+    tparam = f32[4] parameter(0)
+    ROOT tgte1 = f32[4] ceil(tparam)
+  }
+
+  branch1 {
+    fparam = f32[4] parameter(0)
+    ROOT fgte1 = f32[4] floor(fparam)
+  }
+
+  ENTRY entry {
+    p0 = f32[4] parameter(0)
+    b0 = s32[2] parameter(1)
+    ROOT conditional = f32[4] conditional(b0, p0, p0),
+      branch_computations={branch0, branch1}
+  }
+  )";
+  auto result = ParseHloString(hlo_string);
+  EXPECT_NE(Status::OK(), result.status());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("The first operand must be a scalar"));
+}
+
+TEST_F(HloParserTest, CheckIndexedConditionalElementType) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  branch0 {
+    tparam = f32[4] parameter(0)
+    ROOT tgte1 = f32[4] ceil(tparam)
+  }
+
+  branch1 {
+    fparam = f32[4] parameter(0)
+    ROOT fgte1 = f32[4] floor(fparam)
+  }
+
+  ENTRY entry {
+    p0 = f32[4] parameter(0)
+    b0 = f32[] parameter(1)
+    ROOT conditional = f32[4] conditional(b0, p0, p0),
+      branch_computations={branch0, branch1}
+  }
+  )";
+  auto result = ParseHloString(hlo_string);
+  EXPECT_NE(Status::OK(), result.status());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr(
+                  "The first operand must be a scalar of PRED or S32"));
+}
+
+TEST_F(HloParserTest,
+       CheckPredicatedConditionalRequiresTrueAndFalseComputation) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  branch0 {
+    tparam = f32[4] parameter(0)
+    ROOT tgte1 = f32[4] ceil(tparam)
+  }
+
+  branch1 {
+    fparam = f32[4] parameter(0)
+    ROOT fgte1 = f32[4] floor(fparam)
+  }
+
+  ENTRY entry {
+    p0 = f32[4] parameter(0)
+    b0 = pred[] parameter(1)
+    ROOT conditional = f32[4] conditional(b0, p0, p0),
+      branch_computations={branch0, branch1}
+  }
+  )";
+  auto result = ParseHloString(hlo_string);
+  EXPECT_NE(Status::OK(), result.status());
+  EXPECT_THAT(
+      result.status().error_message(),
+      ::testing::HasSubstr("unexpected attribute \"branch_computations\""));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc
index 316a867..ea1f5b2 100644
--- a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc
@@ -359,7 +359,10 @@
   get-tuple-element.5 = (f32[], f32[]) get-tuple-element(param), index=1
   get-tuple-element.6 = (f32[], f32[]) get-tuple-element(param), index=2
   replica-id = u32[] replica-id()
-  ROOT conditional = (f32[], f32[]) conditional(replica-id, get-tuple-element.4, get-tuple-element.5, get-tuple-element.6), branch_computations={Negate, Identity, Floor}
+  id = s32[] bitcast-convert(replica-id)
+  ROOT conditional = (f32[], f32[]) conditional(id, get-tuple-element.4,
+    get-tuple-element.5, get-tuple-element.6),
+    branch_computations={Negate, Identity, Floor}
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 9416324..5ba390a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -109,7 +109,7 @@
 }
 
 StatusOr<std::vector<ScopedShapedBuffer>> HloRunner::TransferLiteralsToDevice(
-    const absl::Span<const Literal* const> literals) {
+    absl::Span<const Literal* const> literals) {
   std::vector<ScopedShapedBuffer> buffers;
   for (const Literal* literal : literals) {
     CHECK(literal != nullptr);
@@ -121,7 +121,7 @@
 }
 
 StatusOr<std::vector<ScopedShapedBuffer>> HloRunner::TransferLiteralsToDevice(
-    const absl::Span<const Literal> literals) {
+    absl::Span<const Literal> literals) {
   std::vector<const Literal*> literal_pointers;
   literal_pointers.reserve(literals.size());
   for (const auto& literal : literals) {
@@ -138,10 +138,10 @@
                                                                  buffer);
 }
 
-StatusOr<Literal> HloRunner::Execute(
-    std::unique_ptr<HloModule> module,
-    const absl::Span<const Literal* const> arguments, bool run_hlo_passes,
-    ExecutionProfile* profile) {
+StatusOr<Literal> HloRunner::Execute(std::unique_ptr<HloModule> module,
+                                     absl::Span<const Literal* const> arguments,
+                                     bool run_hlo_passes,
+                                     ExecutionProfile* profile) {
   TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
                       TransferLiteralsToDevice(arguments));
   TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
@@ -154,7 +154,7 @@
 }
 
 StatusOr<Literal> HloRunner::Execute(std::unique_ptr<HloModule> module,
-                                     const absl::Span<const Literal> arguments,
+                                     absl::Span<const Literal> arguments,
                                      bool run_hlo_passes,
                                      ExecutionProfile* profile) {
   // Construct a vector of plain pointers for the arguments.
@@ -170,10 +170,9 @@
       /*profile=*/profile);
 }
 
-StatusOr<Literal> HloRunner::Execute(
-    std::unique_ptr<Executable> executable,
-    const absl::Span<const Literal* const> arguments,
-    ExecutionProfile* profile) {
+StatusOr<Literal> HloRunner::Execute(std::unique_ptr<Executable> executable,
+                                     absl::Span<const Literal* const> arguments,
+                                     ExecutionProfile* profile) {
   TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
                       TransferLiteralsToDevice(arguments));
   TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
@@ -185,7 +184,7 @@
 }
 
 StatusOr<Literal> HloRunner::Execute(std::unique_ptr<Executable> executable,
-                                     const absl::Span<const Literal> arguments,
+                                     absl::Span<const Literal> arguments,
                                      ExecutionProfile* profile) {
   // Construct a vector of plain pointers for the arguments.
   std::vector<const Literal*> argument_pointers;
@@ -201,7 +200,7 @@
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
     std::unique_ptr<HloModule> module,
-    const absl::Span<const ShapedBuffer* const> arguments, bool run_hlo_passes,
+    absl::Span<const ShapedBuffer* const> arguments, bool run_hlo_passes,
     ExecutionProfile* profile) {
   // Get service run options.
   se::Stream stream(backend().default_stream_executor());
@@ -222,7 +221,7 @@
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
     std::unique_ptr<HloModule> module,
-    const absl::Span<const ScopedShapedBuffer> arguments, bool run_hlo_passes,
+    absl::Span<const ScopedShapedBuffer> arguments, bool run_hlo_passes,
     ExecutionProfile* profile) {
   std::vector<const ShapedBuffer*> argument_pointers;
   argument_pointers.reserve(arguments.size());
@@ -237,8 +236,7 @@
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    Executable* executable,
-    const absl::Span<const ShapedBuffer* const> arguments,
+    Executable* executable, absl::Span<const ShapedBuffer* const> arguments,
     ExecutionProfile* profile) {
   // Get service run options.
   se::Stream stream(backend().default_stream_executor());
@@ -256,8 +254,7 @@
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    Executable* executable,
-    const absl::Span<const ScopedShapedBuffer> arguments,
+    Executable* executable, absl::Span<const ScopedShapedBuffer> arguments,
     ExecutionProfile* profile) {
   std::vector<const ShapedBuffer*> argument_pointers;
   argument_pointers.reserve(arguments.size());
@@ -272,10 +269,16 @@
 
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
-    DeviceAssignment* device_assignment, bool use_threads) {
+    DeviceAssignment* device_assignment) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
+  return ExecuteReplicated(executable.get(), options, device_assignment);
+}
+
+StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
+    Executable* executable, const ReplicatedExecuteOptions& options,
+    DeviceAssignment* device_assignment, ExecutionProfile* profile) {
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
 
@@ -369,7 +372,7 @@
 
   LOG(INFO) << "Replicated execution started";
   std::vector<ScopedShapedBuffer> results;
-  if (!use_threads) {
+  if (!options.use_threads) {
     TF_ASSIGN_OR_RETURN(results,
                         executable->ExecuteOnStreams(service_run_options,
                                                      argument_buffer_slices));
@@ -415,13 +418,12 @@
 }
 
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
-    std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
-    bool use_threads) {
+    std::unique_ptr<HloModule> module,
+    const ReplicatedExecuteOptions& options) {
   TF_ASSIGN_OR_RETURN(
       DeviceAssignment device_assignment,
       backend().computation_placer()->AssignDevices(options.num_replicas, 1));
-  return ExecuteReplicated(std::move(module), options, &device_assignment,
-                           use_threads);
+  return ExecuteReplicated(std::move(module), options, &device_assignment);
 }
 
 StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index e782786..7e666a8 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -78,6 +78,10 @@
     // saved modules are coming from after the HLO pass pipeline, so triggering
     // another run will likely cause errors.
     bool run_hlo_passes = false;
+
+    // If true, executes on multiple threads using se::Stream::ExecuteOnStream.
+    // Otherwise, executes using xla::Executable::ExecuteOnStreams.
+    bool use_threads = false;
   };
 
   // intra_op_parallelism_threads: For the CPU backend only. It is the thread
@@ -110,9 +114,9 @@
   // Transfers data between the host and device.
   StatusOr<ScopedShapedBuffer> TransferLiteralToDevice(const Literal& literal);
   StatusOr<std::vector<ScopedShapedBuffer>> TransferLiteralsToDevice(
-      const absl::Span<const Literal* const> literals);
+      absl::Span<const Literal* const> literals);
   StatusOr<std::vector<ScopedShapedBuffer>> TransferLiteralsToDevice(
-      const absl::Span<const Literal> literals);
+      absl::Span<const Literal> literals);
   StatusOr<Literal> TransferLiteralFromDevice(const ShapedBuffer& buffer);
 
   // Executes the given module with given literals as input and returns the
@@ -121,46 +125,44 @@
   // If run_hlo_passes is false, the module will be executed without Hlo
   // optimization.
   StatusOr<Literal> Execute(std::unique_ptr<HloModule> module,
-                            const absl::Span<const Literal* const> arguments,
+                            absl::Span<const Literal* const> arguments,
                             bool run_hlo_passes = true,
                             ExecutionProfile* profile = nullptr);
 
   StatusOr<Literal> Execute(std::unique_ptr<HloModule> module,
-                            const absl::Span<const Literal> arguments,
+                            absl::Span<const Literal> arguments,
                             bool run_hlo_passes = true,
                             ExecutionProfile* profile = nullptr);
 
   StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
-                            const absl::Span<const Literal* const> arguments,
+                            absl::Span<const Literal* const> arguments,
                             ExecutionProfile* profile = nullptr);
 
   StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
-                            const absl::Span<const Literal> arguments,
+                            absl::Span<const Literal> arguments,
                             ExecutionProfile* profile = nullptr);
 
   // As Execute(), but accepts and returns device buffers instead of host
   // buffers.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
       std::unique_ptr<HloModule> module,
-      const absl::Span<const ShapedBuffer* const> arguments,
+      absl::Span<const ShapedBuffer* const> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
       std::unique_ptr<HloModule> module,
-      const absl::Span<const ScopedShapedBuffer> arguments,
+      absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
   // In the following two calls, "executable" is not a unique_ptr to allow
   // reuse of the Executable.  This call may update the profile information in
   // *executable.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      Executable* executable,
-      const absl::Span<const ShapedBuffer* const> arguments,
+      Executable* executable, absl::Span<const ShapedBuffer* const> arguments,
       ExecutionProfile* profile = nullptr);
 
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      Executable* executable,
-      const absl::Span<const ScopedShapedBuffer> arguments,
+      Executable* executable, absl::Span<const ScopedShapedBuffer> arguments,
       ExecutionProfile* profile = nullptr);
 
   // Creates an executable object given an HLO module. If run_hlo_passes is
@@ -171,19 +173,24 @@
   // Executes a given HLO module into a set of replicas, and returns a map
   // with the replica number as key, and the corresponding returned literal as
   // value.
-  //
-  // use_threads indicates whether this replicated computation will be executed
-  // with a thread-per-replica, vs using an implicitly async call such as
-  // Executable::ExecuteOnStreams.
   StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
-      const ReplicatedExecuteOptions& options, bool use_threads = false);
+      const ReplicatedExecuteOptions& options);
 
   // Same as above, but with specified device assignment.
   StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
       const ReplicatedExecuteOptions& options,
-      DeviceAssignment* device_assignment, bool use_threads = false);
+      DeviceAssignment* device_assignment);
+
+  // Same as above, but with a reusable Executable.  This may update the profile
+  // information in *executable.
+  //
+  // Note that this call ignores ReplicatedExecuteOptions::run_hlo_passes,
+  // since we've already compiled the Executable.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      Executable* executable, const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment, ExecutionProfile* profile = nullptr);
 
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index 218b33b..ba856fc 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -177,12 +177,16 @@
   // Build vector of HloUses for the value.
   for (const HloPosition& position : positions_) {
     for (HloInstruction* user : position.instruction->users()) {
-      for (int64 operand_number : user->OperandIndices(position.instruction)) {
+      for (int64 i = 0; i < user->operand_count(); ++i) {
+        if (user->operand(i) != position.instruction) {
+          continue;
+        }
+
         // Root instructions of computations are considered to be uses whether
         // or not the root instruction itself actually uses the value.
-        if (MayUseOperandValue(operand_number, position.index, user) ||
+        if (MayUseOperandValue(i, position.index, user) ||
             ContainsKey(root_positions, user)) {
-          HloUse new_use{user, operand_number, position.index};
+          HloUse new_use{user, i, position.index};
 
           // The new use must not already exist in uses_.
           for (const HloUse& use : uses_) {
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 2add3d3..6cbfb78 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -13,6 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+
 #include <set>
 
 #include "absl/container/flat_hash_map.h"
@@ -21,7 +23,6 @@
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -201,6 +202,10 @@
                     ShapeInference::InferAllToAllTupleShape(operand_shapes));
 }
 
+Status ShapeVerifier::HandlePartitionId(HloInstruction* hlo) {
+  return CheckShape(hlo, ShapeUtil::MakeShape(U32, {}));
+}
+
 Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) {
   return CheckShape(hlo, ShapeUtil::MakeShape(U32, {}));
 }
@@ -670,10 +675,23 @@
 }
 
 Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
+  if (!ShapeUtil::IsScalar(conditional->operand(0)->shape())) {
+    return InvalidArgument(
+        "The first operand of conditional must be a scalar. Got %s",
+        conditional->operand(0)->shape().DebugString());
+  }
   const int num_branches = conditional->branch_count();
-  if (conditional->operand(0)->shape().element_type() == PRED) {
+  PrimitiveType operand0_type = conditional->operand(0)->shape().element_type();
+  if (operand0_type == PRED) {
     TF_RET_CHECK(num_branches == 2);
   } else {
+    if (operand0_type != S32) {
+      return InvalidArgument(
+          "The first operand of indexed conditional must be a scalar of S32. "
+          "Got"
+          " type %s.",
+          PrimitiveType_Name(operand0_type));
+    }
     TF_RET_CHECK(num_branches >= 1);
   }
   TF_RETURN_IF_ERROR(CheckOperandCount(conditional, num_branches + 1));
@@ -969,7 +987,7 @@
   if (computation->num_parameters() != layout.parameter_count()) {
     return InternalError(
         "Number of parameters in entry computation layout (%d) must be same "
-        "as number of parameters of entry computation computation (%d)",
+        "as number of parameters of entry computation (%d)",
         layout.parameter_count(), computation->num_parameters());
   }
 
@@ -1486,11 +1504,9 @@
 
   std::unique_ptr<ShapeVerifier> shape_verifier =
       target_metadata_->GetVerifier();
+  InstructionVerifier instruction_verifier(instruction_can_change_layout_func_);
   for (auto* computation : module->computations()) {
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
-
-    InstructionVerifier instruction_verifier(
-        instruction_can_change_layout_func_);
     TF_RETURN_IF_ERROR(computation->Accept(&instruction_verifier));
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index a38ec5a0..45e472b 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -57,6 +57,7 @@
   Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAllToAll(HloInstruction* hlo) override;
   Status HandleCollectivePermute(HloInstruction* hlo) override;
+  Status HandlePartitionId(HloInstruction* hlo) override;
   Status HandleReplicaId(HloInstruction* hlo) override;
   Status HandleReducePrecision(HloInstruction* reduce_precision) override;
   Status HandleInfeed(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 8245310..201fc65 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -155,17 +155,17 @@
 
 TEST_F(HloVerifierTest, CheckCallOperandParameterShapesMismatch) {
   const char* const hlo_string = R"(
-HloModule Module
+  HloModule Module
 
-callme {
-  ROOT param = (s32[], f32[4]) parameter(0)
-}
+  callme {
+    ROOT param = (s32[], f32[4]) parameter(0)
+  }
 
-ENTRY entry {
-  p0 = (f32[4], s32[]) parameter(0)
-  ROOT mycall = (s32[], f32[4]) call(p0), to_apply=callme
-}
-)";
+  ENTRY entry {
+    p0 = (f32[4], s32[]) parameter(0)
+    ROOT mycall = (s32[], f32[4]) call(p0), to_apply=callme
+  }
+  )";
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
 
   auto status = verifier().Run(module.get()).status();
@@ -176,25 +176,25 @@
 
 TEST_F(HloVerifierTest, CheckConditionalOperandParameterShapesMismatch) {
   const char* const hlo_string = R"(
-HloModule Module
+  HloModule Module
 
-true_branch {
-  tparam = (s32[], f32[4]) parameter(0)
-  ROOT tgte1 = f32[4] get-tuple-element(tparam), index=1
-}
+  true_branch {
+    tparam = (s32[], f32[4]) parameter(0)
+    ROOT tgte1 = f32[4] get-tuple-element(tparam), index=1
+  }
 
-false_branch {
-  fparam = (s32[], f32[4]) parameter(0)
-  ROOT fgte1 = f32[4] get-tuple-element(fparam), index=1
-}
+  false_branch {
+    fparam = (s32[], f32[4]) parameter(0)
+    ROOT fgte1 = f32[4] get-tuple-element(fparam), index=1
+  }
 
-ENTRY entry {
-  p0 = (f32[4], s32[]) parameter(0)
-  constant = pred[] constant(true)
-  ROOT conditional = f32[4] conditional(constant, p0, p0),
-    true_computation=true_branch, false_computation=false_branch
-}
-)";
+  ENTRY entry {
+    p0 = (f32[4], s32[]) parameter(0)
+    constant = pred[] constant(true)
+    ROOT conditional = f32[4] conditional(constant, p0, p0),
+      true_computation=true_branch, false_computation=false_branch
+  }
+  )";
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
 
   auto status = verifier().Run(module.get()).status();
@@ -203,6 +203,51 @@
               HasSubstr("shape does not match parameter"));
 }
 
+TEST_F(HloVerifierTest, CheckConditionalBranchIndexOperandShape) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  branch0 {
+    tparam = f32[4] parameter(0)
+    ROOT tgte1 = f32[4] ceil(tparam)
+  }
+
+  branch1 {
+    fparam = f32[4] parameter(0)
+    ROOT fgte1 = f32[4] floor(fparam)
+  }
+
+  branch2 {
+    sparam = f32[4] parameter(0)
+    ROOT sgte1 = f32[4] ceil(sparam)
+  }
+
+  ENTRY entry {
+    p0 = f32[4] parameter(0)
+    b0 = s32[] parameter(1)
+    ROOT conditional = f32[4] conditional(b0, p0, p0, p0),
+      branch_computations={branch0, branch1, branch2}
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+  auto status = verifier().Run(module.get()).status();
+
+  HloInstruction* condition = FindInstruction(module.get(), "b0");
+  *condition->mutable_shape() = ShapeUtil::MakeShape(F32, {});
+  status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(
+      status.error_message(),
+      HasSubstr(
+          "first operand of indexed conditional must be a scalar of S32"));
+
+  *condition->mutable_shape() = ShapeUtil::MakeShape(S32, {4});
+  status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("first operand of conditional must be a scalar"));
+}
+
 TEST_F(HloVerifierTest, RngOpnd0NotScalar) {
   const char* const hlo_string = R"(
   HloModule Module
@@ -504,7 +549,7 @@
   HloModule Module
 
   ENTRY SelectMixedPrecisionNotAllowed {
-   p0 = pred[] parameter(0)
+   p0 = pred[32] parameter(0)
    p1 = f32[32] parameter(1)
    p2 = bf16[32] parameter(2)
    ROOT select = f32[32] select(p0, p1, p2)
@@ -523,7 +568,7 @@
   HloModule Module
 
   ENTRY SelectMixedPrecisionAllowed {
-   p0 = pred[] parameter(0)
+   p0 = pred[32] parameter(0)
    p1 = f32[32] parameter(1)
    p2 = bf16[32] parameter(2)
    ROOT select = f32[32] select(p0, p1, p2)
@@ -551,7 +596,7 @@
   auto status = verifier().Run(module.get()).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(),
-              HasSubstr("Select operation is not supported for tuples"));
+              HasSubstr("Expected array argument for select"));
 }
 
 TEST_F(HloVerifierTest, IotaNonArrayResult) {
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 404ca34..f12b725 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -88,6 +88,7 @@
     case HloOpcode::kXor:
     case HloOpcode::kOutfeed:
     case HloOpcode::kPad:
+    case HloOpcode::kPartitionId:
     case HloOpcode::kPopulationCount:
     case HloOpcode::kReal:
     case HloOpcode::kReducePrecision:
@@ -250,67 +251,63 @@
   HloInstructionSet do_not_duplicate;
   absl::flat_hash_map<std::pair<HloInstruction*, HloInstruction*>, bool>
       can_fuse_on_all_paths_result_cache;
-  for (HloInstruction* consumer : post_order) {
-    for (HloInstruction* producer : consumer->operands()) {
-      if (do_not_duplicate.count(producer) > 0) {
-        continue;
-      }
-
-      // If the producer is effectively not more than unary, duplicating it
-      // will not increase the number of relevant inputs read, as the fusion
-      // node will only need to read at most 1 relevant input (the input of
-      // the producer). In that case, we do not forbid fusion of the operation
-      // here.
-      if (EffectivelyAtMostUnary(producer)) {
-        continue;
-      }
-
-      // If the total size of the inputs is less than or equal to the total size
-      // of the outputs for the producer then duplicating it won't increase the
-      // memory traffic. In that case, we do not forbid fusion of the operation
-      // here.
-      auto total_size = [](const Shape& shape) {
-        int64 size = 0;
-        ShapeUtil::ForEachSubshape(
-            shape,
-            [&size](const Shape& subshape, const ShapeIndex& shape_index) {
-              if (subshape.IsArray()) {
-                size += ShapeUtil::ElementsIn(subshape);
-              }
-            });
-        return size;
-      };
-      int64 operands_size = 0;
-      for (const HloInstruction* op : producer->operands()) {
-        operands_size += total_size(op->shape());
-      }
-      if (operands_size <= total_size(producer->shape())) {
-        continue;
-      }
-
-      // Otherwise we will forbid fusing the op unless we can fuse it into
-      // all of its consumers on all paths.
-      //
-      // That means, that for:
-      // A --> B (fusible)
-      //   \-> C (non-fusible)
-      // A will be not allowed to be fused into B, as it cannot be fused into C.
-      //
-      // Similarly, for:
-      // A -------------> B
-      //   \-> C -> D -/
-      // If:
-      // - A is fusible into B and C, and D is fusible into B
-      // - C is *not* fusible into D
-      // A will be not allowed to be fused into B, as it cannot be fused via
-      // all paths.
-      if (producer->IsFusible() &&
-          CanFuseOnAllPaths(producer, consumer, do_not_duplicate,
-                            &can_fuse_on_all_paths_result_cache)) {
-        continue;
-      }
-      do_not_duplicate.insert(producer);
+  for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) {
+    HloInstruction* producer = *it;
+    // If the producer is effectively not more than unary, duplicating it
+    // will not increase the number of relevant inputs read, as the fusion
+    // node will only need to read at most 1 relevant input (the input of
+    // the producer). In that case, we do not forbid fusion of the operation
+    // here.
+    if (EffectivelyAtMostUnary(producer)) {
+      continue;
     }
+
+    // If the total size of the inputs is less than or equal to the total size
+    // of the outputs for the producer then duplicating it won't increase the
+    // memory traffic. In that case, we do not forbid fusion of the operation
+    // here.
+    auto total_size = [](const Shape& shape) {
+      int64 size = 0;
+      ShapeUtil::ForEachSubshape(
+          shape, [&size](const Shape& subshape, const ShapeIndex& shape_index) {
+            if (subshape.IsArray()) {
+              size += ShapeUtil::ElementsIn(subshape);
+            }
+          });
+      return size;
+    };
+    int64 operands_size = 0;
+    for (const HloInstruction* op : producer->operands()) {
+      operands_size += total_size(op->shape());
+    }
+    if (operands_size <= total_size(producer->shape())) {
+      continue;
+    }
+
+    // Otherwise we will forbid fusing the op unless we can fuse it into
+    // all of its consumers on all paths.
+    //
+    // That means, that for:
+    // A --> B (fusible)
+    //   \-> C (non-fusible)
+    // A will not be allowed to be fused into B, as it cannot be fused into C.
+    //
+    // Similarly, for:
+    // A -------------> B
+    //   \-> C -> D -/
+    // If:
+    // - A is fusible into B and C, and D is fusible into B
+    // - C is *not* fusible into D
+    // A will not be allowed to be fused into B, as it cannot be fused via
+    // all paths.
+    if (producer->IsFusible() &&
+        absl::c_all_of(producer->users(), [&](HloInstruction* consumer) {
+          return CanFuseOnAllPaths(producer, consumer, do_not_duplicate,
+                                   &can_fuse_on_all_paths_result_cache);
+        })) {
+      continue;
+    }
+    do_not_duplicate.insert(producer);
   }
 
   return do_not_duplicate;
@@ -409,13 +406,11 @@
       }
       sorted_operand_numbers.push_back(i);
     }
-    absl::c_sort(
-        sorted_operand_numbers, [&](int64 i, int64 j) {
-          // Instructions with higher priority in the queue come first.
-          return (
-              FindOrDie(post_order_index_, instruction->mutable_operand(i)) >
+    absl::c_sort(sorted_operand_numbers, [&](int64 i, int64 j) {
+      // Instructions with higher priority in the queue come first.
+      return (FindOrDie(post_order_index_, instruction->mutable_operand(i)) >
               FindOrDie(post_order_index_, instruction->mutable_operand(j)));
-        });
+    });
     return std::make_pair(instruction, sorted_operand_numbers);
   }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 599489b..7f0c1cc 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -35,6 +35,7 @@
         "//tensorflow/compiler/xla/service:cholesky_expander",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
@@ -53,7 +54,6 @@
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index a8f8ab4..80a3ebc 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -22,7 +22,7 @@
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/cholesky_expander.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
@@ -52,8 +52,8 @@
 StatusOr<Literal> HandleEvaluatorCustomCall(
     HloInstruction* custom_call, absl::Span<const Literal*> operands) {
   // Find the target C function in the global registry.
-  auto* registry = xla::cpu::CustomCallTargetRegistry::Global();
-  void* target_fn = registry->Lookup(custom_call->custom_call_target());
+  auto* registry = CustomCallTargetRegistry::Global();
+  void* target_fn = registry->Lookup(custom_call->custom_call_target(), "Host");
   if (!target_fn) {
     return NotFound("Custom call target '%s' was not registered",
                     custom_call->custom_call_target());
@@ -96,7 +96,7 @@
 
 StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
   TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
   return std::move(hlo_module);
@@ -105,13 +105,13 @@
 Status InterpreterCompiler::RunHloPassesOnModuleGroup(
     HloModuleGroup* module_group,
     absl::Span<se::StreamExecutor* const> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented("Module group compilation not supported on Interpreter");
 }
 
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Run backend " << hlo_module->name();
@@ -137,7 +137,7 @@
 InterpreterCompiler::RunBackendOnModuleGroup(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Module group compilation is not supported on Interpreter.");
 }
@@ -145,7 +145,7 @@
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   if (module_group->empty()) {
     return std::vector<std::unique_ptr<Executable>>();
   }
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index 5912729..dc83295 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -45,24 +45,24 @@
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
   Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
   StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 071abeb..b1a26b3 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -110,7 +110,8 @@
   return port::Status::OK();
 }
 
-DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+XlaInterpreterExecutor::CreateDeviceDescription(int device_ordinal) {
   internal::DeviceDescriptionBuilder builder;
 
   builder.set_device_address_bits(64);
@@ -119,7 +120,7 @@
   builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
   builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
 
-  return builder.Build().release();
+  return builder.Build();
 }
 
 }  // namespace interpreter
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index 47f8ded..6d33768 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -69,7 +69,7 @@
   }
 
   void *Allocate(uint64 size) override;
-  void *GetSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
+  void *GetSubBuffer(DeviceMemoryBase *parent, uint64 offset_bytes,
                      uint64 size_bytes) override;
   void Deallocate(DeviceMemoryBase *mem) override;
 
@@ -80,9 +80,9 @@
   bool HostMemoryRegister(void *mem, uint64 size) override { return true; }
   bool HostMemoryUnregister(void *mem) override { return true; }
 
-  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src,
+  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &dev_src,
               uint64 size) override;
-  bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src,
+  bool Memcpy(Stream *stream, DeviceMemoryBase *dev_dst, const void *host_src,
               uint64 size) override;
   bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst,
                             const DeviceMemoryBase &host_src,
@@ -114,10 +114,10 @@
     return false;
   }
 
-  port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst,
+  port::Status SynchronousMemcpy(DeviceMemoryBase *dev_dst,
                                  const void *host_src, uint64 size) override;
   port::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &pop_src,
+                                 const DeviceMemoryBase &dev_src,
                                  uint64 size) override;
   port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst,
                                                const DeviceMemoryBase &pop_src,
@@ -165,7 +165,13 @@
     return false;
   }
 
-  DeviceDescription *PopulateDeviceDescription() const override;
+  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+      const override {
+    return CreateDeviceDescription(0);
+  }
+
+  static port::StatusOr<std::unique_ptr<DeviceDescription>>
+  CreateDeviceDescription(int device_ordinal);
 
   port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
     return port::Status::OK();
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index b0fc1af..aa17c20 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -22,7 +22,6 @@
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
 #include "tensorflow/stream_executor/device_options.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
@@ -43,6 +42,11 @@
 
 const string& XlaInterpreterPlatform::Name() const { return name_; }
 
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+XlaInterpreterPlatform::DescriptionForDevice(int ordinal) const {
+  return XlaInterpreterExecutor::CreateDeviceDescription(ordinal);
+}
+
 port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::ExecutorForDevice(
     int ordinal) {
   StreamExecutorConfig config;
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index 0187f6d..ff9c5d0 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -29,8 +29,9 @@
 
 class XlaInterpreterPlatform : public Platform {
  public:
-  XlaInterpreterPlatform(const string& name = "Interpreter",
-                         const Platform::Id& id = kXlaInterpreterPlatformId);
+  XlaInterpreterPlatform()
+      : XlaInterpreterPlatform("Interpreter", kXlaInterpreterPlatformId) {}
+  XlaInterpreterPlatform(const string& name, const Platform::Id& id);
   ~XlaInterpreterPlatform() override;
 
   Platform::Id id() const override;
@@ -39,6 +40,9 @@
 
   const string& Name() const override;
 
+  port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+
   port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
 
   port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index b0abce5..b1303f1 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -1226,9 +1226,14 @@
 // unassigned layouts in the graph.
 bool InstructionShouldPropagateDepthFirst(const HloInstruction& hlo) {
   switch (hlo.opcode()) {
+    case HloOpcode::kFusion:
+      return hlo.IsCustomFusion();
+    case HloOpcode::kGather:
+      return true;
     case HloOpcode::kReshape:
       return hlo.operand(0)->shape().rank() == 1 ||
              std::get<0>(hlo.ReshapeMerelyInsertsOrDeletes1SizedDimensions());
+    case HloOpcode::kScatter:
     case HloOpcode::kTranspose:
       return true;
     default:
@@ -2100,6 +2105,7 @@
     case HloOpcode::kIota:
     case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
+    case HloOpcode::kPartitionId:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kReduce:
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index fc6d43f..6b6b366 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -359,7 +359,7 @@
   // the cost of `instruction`. `output_layout` is the layout of `instruction`.
   // Returns null if it can't decide the best layout.
   // Precondition: `instruction` and the operand are array-shaped.
-  std::unique_ptr<Layout> ChooseOperandLayoutFromOutputLayout(
+  virtual std::unique_ptr<Layout> ChooseOperandLayoutFromOutputLayout(
       const Layout& output_layout, const HloInstruction* instruction,
       int64 operand_no);
   // Given the layout of `user`'s `operand_no`-th operand, chooses a layout of
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 382b575..82e955c 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -24,7 +24,7 @@
 Status LLVMCompiler::RunHloPassesOnModuleGroup(
     HloModuleGroup* module_group,
     absl::Span<se::StreamExecutor* const> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Model partitioning not implemented for the CPU/GPU compilers!");
 }
@@ -33,7 +33,7 @@
 LLVMCompiler::RunBackendOnModuleGroup(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Model partitioning not implemented for the CPU/GPU compilers!");
 }
@@ -41,7 +41,7 @@
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_execs,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
   //
   //  - Denormals are zero (DAZ): roughly, operations treat denormal floats as
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index afd9f37..888815b 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -61,28 +61,28 @@
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
   //       se::StreamExecutor* stream_exec,
-  //       DeviceMemoryAllocator* device_allocator)
+  //       se::DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
   //       se::StreamExecutor* stream_exec,
-  //       DeviceMemoryAllocator* device_allocator)
+  //       se::DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_execs,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
  protected:
   ModuleHook user_pre_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index ca85dd7..e1303f6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -49,8 +49,8 @@
     srcs = ["alias_analysis_test.cc"],
     deps = [
         ":alias_analysis",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test",
         "//tensorflow/compiler/xla/tests:filecheck",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
index db90085..db60e08 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
@@ -13,12 +13,13 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -29,7 +30,7 @@
 
 void FakeCustomCallTarget(float* out, float** in) {}
 
-REGISTER_CUSTOM_CALL_TARGET(FakeCustomCallTarget);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(FakeCustomCallTarget);
 
 TEST_F(AliasAnalysisTest, EmbeddedComputationParamsMayAliasTemps) {
   const char* hlo_string = R"(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
index 1ea5a42..f96c985 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -23,7 +23,7 @@
   CHECK(allocation.is_constant());
   HloInstruction* const_instr = nullptr;
   for (const auto& buffer_offset_pair : allocation.assigned_buffers()) {
-    const LogicalBuffer* buffer = buffer_offset_pair.first;
+    const BufferValue* buffer = buffer_offset_pair.first;
     // BufferAssignment may have assigned non-constant instructions to this
     // allocation too so we can't CHECK this condition.  E.g. for
     //
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 5b5f7dc..b043f95 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -43,12 +43,9 @@
 // are supported.
 class IrArray {
  public:
-  // A multidimensional index into an IrArray. The index for dimension zero is
-  // first in the vector. This is the reverse order of the notation used for
-  // describing the dimensions of an array. That is, for a [4 x 3 x 2] array
-  // dimension zero has size 2, dimension one has size 3, and dimension two has
-  // size 4. Thus the index {1, 2, 3} indexes the last element of this [4 x 3 x
-  // 2] array.
+  // A multidimensional index into an IrArray. All the runtime indices
+  // (multidim) and dimensions (Shape::dimensions(), absl::Span<const int64>)
+  // are major-first.
   //
   // This may also keep a linear index and the layout and dimensions it was
   // emitted for; if the shape where this `Index` is used matches, the linear
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 66219c1..8155989 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -194,7 +194,7 @@
     }  // A Tuple contains an array of pointers. Use i8*.
     case TUPLE:
     // An Opaque is like a void*, use i8*.
-    case OPAQUE:
+    case OPAQUE_TYPE:
       return llvm::Type::getInt8PtrTy(module->getContext());
     case TOKEN:
       // Tokens do not have a physical representation, but the compiler needs
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index f56ba32..170d226 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -23,13 +23,13 @@
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
index 8269842..1642c50 100644
--- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
+++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
@@ -17,25 +17,29 @@
 #include "absl/types/variant.h"
 namespace xla {
 
-se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
+tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
   if (HasOwnership()) {
-    return absl::get<OwningDeviceMemory>(mem_).AsDeviceMemoryBase();
+    return absl::get<tensorflow::se::OwningDeviceMemory>(mem_)
+        .AsDeviceMemoryBase();
   } else {
-    return absl::get<se::DeviceMemoryBase>(mem_);
+    return absl::get<tensorflow::se::DeviceMemoryBase>(mem_);
   }
 }
 
 bool MaybeOwningDeviceMemory::HasOwnership() const {
-  return absl::holds_alternative<OwningDeviceMemory>(mem_);
+  return absl::holds_alternative<tensorflow::se::OwningDeviceMemory>(mem_);
 }
 
-absl::optional<OwningDeviceMemory> MaybeOwningDeviceMemory::Release() {
+absl::optional<tensorflow::se::OwningDeviceMemory>
+MaybeOwningDeviceMemory::Release() {
   if (!HasOwnership()) {
     return {};
   }
-  OwningDeviceMemory result = std::move(absl::get<OwningDeviceMemory>(mem_));
+  tensorflow::se::OwningDeviceMemory result =
+      std::move(absl::get<tensorflow::se::OwningDeviceMemory>(mem_));
   mem_ = result.AsDeviceMemoryBase();
-  return absl::make_optional<OwningDeviceMemory>(std::move(result));
+  return absl::make_optional<tensorflow::se::OwningDeviceMemory>(
+      std::move(result));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
index 82e7f11..e4c3196 100644
--- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
+++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
@@ -18,30 +18,30 @@
 
 #include "absl/types/optional.h"
 #include "absl/types/variant.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
 // MaybeOwningDeviceMemory represents either an owned or unowned device memory.
-// Like std::variant<OwningDeviceMemory, DeviceMemory>. When the object goes
+// Like std::variant<se::OwningDeviceMemory, DeviceMemory>. When the object goes
 // output of scope, it will free the underlying memory if it owns it.
 class MaybeOwningDeviceMemory {
  public:
   MaybeOwningDeviceMemory() = default;
-  explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned)
+  explicit MaybeOwningDeviceMemory(tensorflow::se::OwningDeviceMemory owned)
       : mem_(std::move(owned)) {}
-  explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned)
+  explicit MaybeOwningDeviceMemory(tensorflow::se::DeviceMemoryBase unowned)
       : mem_(unowned) {}
   MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default;
   ~MaybeOwningDeviceMemory() = default;
 
-  MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) {
+  MaybeOwningDeviceMemory& operator=(tensorflow::se::DeviceMemoryBase unowned) {
     mem_ = unowned;
     return *this;
   }
 
-  MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) {
+  MaybeOwningDeviceMemory& operator=(tensorflow::se::OwningDeviceMemory owned) {
     mem_ = std::move(owned);
     return *this;
   }
@@ -50,19 +50,21 @@
 
   // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The
   // caller of this function is *not* responsible for freeing the memory.
-  se::DeviceMemoryBase AsDeviceMemoryBase();
+  tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase();
 
-  // Release the OwningDeviceMemory without freeing it, and moves the ownership
-  // of the memory buffer from the object to the caller.
+  // Releases the tensorflow::se::OwningDeviceMemory without freeing it, and
+  // moves the ownership of the memory buffer from the object to the caller.
   //
   // A nullopt is returned if the HasOwnership() == false;
-  absl::optional<OwningDeviceMemory> Release();
+  absl::optional<tensorflow::se::OwningDeviceMemory> Release();
 
   // Returns true if the device_memory has ownership over underlying memory.
   bool HasOwnership() const;
 
  private:
-  absl::variant<OwningDeviceMemory, se::DeviceMemoryBase> mem_;
+  absl::variant<tensorflow::se::OwningDeviceMemory,
+                tensorflow::se::DeviceMemoryBase>
+      mem_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 49c346d..42b9e56 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -29,7 +29,6 @@
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -58,6 +57,7 @@
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -347,7 +347,7 @@
     const std::vector<const HloModuleProto*>& module_protos,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << StrFormat("BuildExecutable on service %p", this);
 
   // Dump computation proto state if flag is set.
@@ -783,7 +783,7 @@
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const HloModuleProto& module_proto,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
+    se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << StrFormat(
       "BuildExecutable on service %p with serialized module proto: %s", this,
       module_proto.name());
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index f127e34..ba51e45 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -29,7 +29,6 @@
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
 #include "tensorflow/compiler/xla/service/compilation_cache.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -43,6 +42,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -234,7 +234,7 @@
       const HloModuleProto& module_proto,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
       se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator = nullptr);
+      se::DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
@@ -242,7 +242,7 @@
       const std::vector<const HloModuleProto*>& module_protos,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-      DeviceMemoryAllocator* device_allocator);
+      se::DeviceMemoryAllocator* device_allocator);
 
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 6bee671..7fc6631 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -43,7 +43,9 @@
 
   // Delegate to `ExecutableRunOptions` member.
   se::Stream* stream() const { return run_options_.stream(); }
-  DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
+  se::DeviceMemoryAllocator* allocator() const {
+    return run_options_.allocator();
+  }
   int device_ordinal() const { return run_options_.device_ordinal(); }
 
   // Borrows a stream and returns a smart pointer which returns the stream on
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 0d8c4d9..d533290 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -634,10 +634,6 @@
     return fail("Element types do not match.");
   }
 
-  if ((lhs.rank() < 1) || (rhs.rank() < 1)) {
-    return fail("Dot only supports rank 1 or above.");
-  }
-
   // Validate basic properties of dot dimension numbers.
   TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
 
@@ -2744,45 +2740,27 @@
   return ShapeUtil::PermuteDimensions(InversePermutation(dimensions), operand);
 }
 
-// TODO(b/36794510): Make broadcast semantics more consistent, by supporting
-// "degenerate" cases, as with binary elementwise ops.
 /* static */ StatusOr<Shape> ShapeInference::InferClampShape(
     const Shape& min, const Shape& operand, const Shape& max) {
   TF_RETURN_IF_ERROR(ExpectArray(min, "clamp min"));
   TF_RETURN_IF_ERROR(ExpectArray(operand, "clamp operand"));
   TF_RETURN_IF_ERROR(ExpectArray(max, "clamp max"));
-  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(min, operand) ||
-      !ShapeUtil::SameElementTypeIgnoringFpPrecision(max, operand)) {
-    return InvalidArgument("Clamp with different operand types: %s, %s, %s.",
-                           ShapeUtil::HumanString(min),
-                           ShapeUtil::HumanString(operand),
-                           ShapeUtil::HumanString(max));
+
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(min, operand) ||
+      !ShapeUtil::CompatibleIgnoringFpPrecision(max, operand)) {
+    return InvalidArgument(
+        "Clamp with different shapes: %s, %s, %s.", ShapeUtil::HumanString(min),
+        ShapeUtil::HumanString(operand), ShapeUtil::HumanString(max));
   }
-  if (((ShapeUtil::CompatibleIgnoringFpPrecision(min, operand) ||
-        ShapeUtil::IsScalar(min)) &&
-       (ShapeUtil::CompatibleIgnoringFpPrecision(max, operand) ||
-        ShapeUtil::IsScalar(max)))) {
-    return operand;
-  }
-  if (ShapeUtil::IsScalar(operand)) {
-    if (ShapeUtil::CompatibleIgnoringFpPrecision(min, max)) {
-      return ShapeUtil::ChangeElementType(min, operand.element_type());
-    } else if (ShapeUtil::IsScalar(min)) {
-      return ShapeUtil::ChangeElementType(max, operand.element_type());
-    } else if (ShapeUtil::IsScalar(max)) {
-      return ShapeUtil::ChangeElementType(min, operand.element_type());
-    }
-  }
-  return Unimplemented("%s, %s <clamp> %s is not implemented.",
-                       min.ShortDebugString(), max.ShortDebugString(),
-                       operand.ShortDebugString());
+  return operand;
 }
 
-// TODO(b/36794510): Make broadcast semantics more consistent, by supporting
-// "degenerate" cases, as with binary elementwise ops, as well as scalar
-// broadcast from all operands, not just the predicate.
 /* static */ StatusOr<Shape> ShapeInference::InferSelectShape(
     const Shape& pred, const Shape& on_true, const Shape& on_false) {
+  TF_RETURN_IF_ERROR(ExpectArray(pred, "select pred"));
+  TF_RETURN_IF_ERROR(ExpectArray(on_true, "select on-true"));
+  TF_RETURN_IF_ERROR(ExpectArray(on_false, "select on-false"));
+
   if (!ShapeUtil::CompatibleIgnoringFpPrecision(on_true, on_false)) {
     return InvalidArgument(
         "Operands to select must be the same shape; got %s and %s.",
@@ -2793,38 +2771,18 @@
         "Select's pred operand must have PRED element type; got %s.",
         ShapeUtil::HumanString(pred));
   }
-  if (Shape::Equal()
-          .IgnoreElementType()
-          .IgnoreLayout()
-          .IgnoreDynamicDimension()(pred, on_true) ||
-      ShapeUtil::IsScalar(pred)) {
-    // By this stage we know that pred's element type is PRED. Therefore, this
-    // check restricts pred to be a PRED scalar, or a PRED array with the same
-    // dimensions as on_true and on_false.
-    Shape inferred_shape = ShapeUtil::ChangeElementType(
-        on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
-
-    // Propagate dynamic dimensions if pred is not a scalar.
-    if (!ShapeUtil::IsScalar(pred)) {
-      for (int i = 0; i < inferred_shape.rank(); i++) {
-        if (pred.is_dynamic_dimension(i)) {
-          inferred_shape.set_dynamic_dimension(i, true);
-        }
-      }
-    }
-
-    if (inferred_shape.IsTuple()) {
-      return InvalidArgument(
-          "Select operation is not supported for tuples: %s."
-          " Use tuple-select instead.",
-          ShapeUtil::HumanString(inferred_shape));
-    }
-    return inferred_shape;
+  if (!Shape::Equal()
+           .IgnoreElementType()
+           .IgnoreLayout()
+           .IgnoreDynamicDimension()(pred, on_true)) {
+    return InvalidArgument(
+        "Operands to select and predicate must be the same shape; got %s and "
+        "%s.",
+        ShapeUtil::HumanString(on_true), ShapeUtil::HumanString(pred));
   }
-  return InvalidArgument(
-      "Select operation with non-scalar predicate with dimensionality "
-      "different from the other operands: %s.",
-      ShapeUtil::HumanString(pred));
+
+  return ShapeUtil::ChangeElementType(
+      pred, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTupleSelectShape(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 2aa3b12..3bfa971 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -115,14 +115,16 @@
       HloOpcode::kSelect, pred_, tuple, tuple);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Use tuple-select"));
+              HasSubstr("Expected array argument for select"));
 }
 
 TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kSelect, pred_, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(
+      inferred_status.status().error_message(),
+      HasSubstr("Operands to select and predicate must be the same shape"));
 }
 
 TEST_F(ShapeInferenceTest, SelectArrayPredBetweenArrays) {
@@ -150,8 +152,9 @@
       HloOpcode::kSelect, ShapeUtil::MakeShape(PRED, {64}), matrix_64_48_,
       matrix_64_48_);
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().error_message(),
-              HasSubstr("with non-scalar predicate with dimensionality"));
+  ASSERT_THAT(
+      inferred_status_error3.status().error_message(),
+      HasSubstr("Operands to select and predicate must be the same shape"));
 
   // Tuples have a TUPLE element type and cannot be the pred of a select.
   auto inferred_status_error4 = ShapeInference::InferTernaryOpShape(
@@ -160,7 +163,7 @@
       ShapeUtil::MakeTupleShape({f32_, f32_}));
   ASSERT_FALSE(inferred_status_error4.ok());
   ASSERT_THAT(inferred_status_error4.status().error_message(),
-              HasSubstr("pred operand must have PRED element type"));
+              HasSubstr("Expected array argument for select pred"));
 }
 
 TEST_F(ShapeInferenceTest, ClampAllMatrix) {
@@ -180,43 +183,49 @@
 TEST_F(ShapeInferenceTest, ClampMinScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, f32_, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, matrix_64_48_, f32_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampMinMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, matrix_64_48_, f32_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, f32_, f32_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
       HloOpcode::kClamp, f32_, matrix_64_48_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("Clamp with different shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampBadShapes) {
@@ -1104,16 +1113,13 @@
   }
 }
 
-// scalar <dot> vector: error
+// scalar <dot> vector: ok
 TEST_F(ShapeInferenceTest, ScalarDotVector) {
   DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
       ShapeInference::InferDotOpShape(f32_, vector_32_, dot_dnums);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Dot only supports rank"));
+  EXPECT_TRUE(inferred_status.ok());
+  EXPECT_EQ(inferred_status.ValueOrDie(), vector_32_);
 }
 
 // 3D <dot> 2D: error
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index d90dde3..9b0ec31 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -67,6 +67,20 @@
 
 ShapedBuffer::~ShapedBuffer() {}
 
+StatusOr<ShapedBuffer> ShapedBuffer::SubShapedBuffer(
+    const ShapeIndex& index) const {
+  TF_ASSIGN_OR_RETURN(const Shape* host_sub_shape,
+                      ShapeUtil::TryGetSubshape(on_host_shape(), index));
+  TF_ASSIGN_OR_RETURN(const Shape* device_sub_shape,
+                      ShapeUtil::TryGetSubshape(on_device_shape(), index));
+  ShapedBuffer sub_shaped_buffer(*host_sub_shape, *device_sub_shape, platform_,
+                                 device_ordinal_);
+  TF_ASSIGN_OR_RETURN(ShapeTree<se::DeviceMemoryBase> sub_buffers,
+                      buffers_.SubShapeTree(index));
+  sub_shaped_buffer.set_buffers(std::move(sub_buffers));
+  return std::move(sub_shaped_buffer);
+}
+
 void ShapedBuffer::clear() {
   for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
@@ -105,14 +119,14 @@
 
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
-                                       DeviceMemoryAllocator* allocator,
+                                       se::DeviceMemoryAllocator* allocator,
                                        int device_ordinal)
     : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(),
                    device_ordinal),
       allocator_(allocator) {}
 
 ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                                       DeviceMemoryAllocator* allocator)
+                                       se::DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
 ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index f5210c9..3934654 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -21,12 +21,12 @@
 #include <string>
 
 #include "absl/types/span.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -90,6 +90,7 @@
   void set_buffers(ShapeTree<se::DeviceMemoryBase> buffers) {
     CHECK(ShapeUtil::Equal(buffers.shape(), on_device_shape_));
     buffers_ = std::move(buffers);
+    buffers_.replace_shape_ptr(&on_device_shape_);
   }
 
   // Returns the underlying ShapeTree containing all the device addresses in the
@@ -97,6 +98,8 @@
   const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
   ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }
 
+  StatusOr<ShapedBuffer> SubShapedBuffer(const ShapeIndex& index) const;
+
   // Set all device memory pointers in the object to null.
   void clear();
 
@@ -135,13 +138,13 @@
   // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
   explicit ScopedShapedBuffer(const Shape& on_host_shape,
                               const Shape& on_device_shape,
-                              DeviceMemoryAllocator* allocator,
+                              se::DeviceMemoryAllocator* allocator,
                               int device_ordinal);
 
   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
   explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                              DeviceMemoryAllocator* allocator);
+                              se::DeviceMemoryAllocator* allocator);
 
   // Movable, but not copyable.
   ScopedShapedBuffer(ScopedShapedBuffer&& s);
@@ -154,13 +157,13 @@
 
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
-  DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
+  se::DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
   // Sets the device memory buffer at the given index.
   //
   // If the given buffer's device memory is non-null, its device_ordinal and
   // allocator must match those in `this`.
-  void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) {
+  void set_buffer(se::OwningDeviceMemory buffer, const ShapeIndex& index) {
     if (!buffer.is_null()) {
       CHECK_EQ(buffer.device_ordinal(), device_ordinal());
       CHECK_EQ(buffer.allocator(), allocator_);
@@ -184,7 +187,7 @@
  protected:
   void Deallocate();
 
-  DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
index ca64bd3..3885c5f 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -16,13 +16,13 @@
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -34,7 +34,7 @@
   auto* platform = platforms[0];
   TF_ASSERT_OK_AND_ASSIGN(auto executors,
                           xla::PlatformUtil::GetStreamExecutors(platform));
-  xla::StreamExecutorMemoryAllocator allocator(platform, executors);
+  xla::se::StreamExecutorMemoryAllocator allocator(platform, executors);
   const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
   const int kDeviceOrdinal = 0;
   auto scoped_buffer = absl::make_unique<xla::ScopedShapedBuffer>(
@@ -43,11 +43,11 @@
   buffer = nullptr;
 }
 
-class TestAllocator : public DeviceMemoryAllocator {
+class TestAllocator : public se::DeviceMemoryAllocator {
  public:
   TestAllocator()
-      : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) {
-  }
+      : se::DeviceMemoryAllocator(
+            PlatformUtil::GetDefaultPlatform().ValueOrDie()) {}
 
   ~TestAllocator() override {
     if (!allocations_.empty()) {
@@ -56,18 +56,18 @@
   }
 
   // Pull in two-arg overload of Allocate.
-  using DeviceMemoryAllocator::Allocate;
+  using se::DeviceMemoryAllocator::Allocate;
 
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                        bool /*retry_on_failure*/) override {
+  StatusOr<se::OwningDeviceMemory> Allocate(
+      int device_ordinal, uint64 size, bool /*retry_on_failure*/) override {
     // By contract, we must return null if size == 0.
     if (size == 0) {
-      return OwningDeviceMemory();
+      return se::OwningDeviceMemory();
     }
     void* buf = malloc(size);
     allocations_.insert({device_ordinal, buf});
-    return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal,
-                              this);
+    return se::OwningDeviceMemory(se::DeviceMemoryBase(buf, size),
+                                  device_ordinal, this);
   }
 
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
@@ -120,7 +120,7 @@
   sb.buffers().ForEachMutableElement(
       [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
         TF_ASSERT_OK_AND_ASSIGN(
-            OwningDeviceMemory m,
+            se::OwningDeviceMemory m,
             allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
         *buffer = m.Forget();
       });
@@ -148,6 +148,27 @@
       });
 }
 
+TEST(ScopedShapedBufferTest, TestSubShapeTree) {
+  // A two-element tuple of f32[1]; element {1} is extracted further down.
+  const Shape leaf_shape = ShapeUtil::MakeShape(F32, {1});
+  const Shape pair_shape = ShapeUtil::MakeTupleShape({leaf_shape, leaf_shape});
+  TestAllocator allocator;
+  ScopedShapedBuffer scoped(pair_shape, pair_shape, &allocator,
+                            /*device_ordinal=*/0);
+  scoped.buffers().ForEachMutableElement(
+      [&](const ShapeIndex& /*index*/, se::DeviceMemoryBase* slot) {
+        TF_ASSERT_OK_AND_ASSIGN(
+            se::OwningDeviceMemory memory,
+            allocator.Allocate(/*device_ordinal=*/0, /*size=*/32));
+        *slot = memory.Forget();
+      });
+  StatusOr<ShapedBuffer> sub_or = scoped.SubShapedBuffer({1});
+  ASSERT_TRUE(sub_or.ok());
+  const ShapedBuffer sub = sub_or.ConsumeValueOrDie();
+  EXPECT_EQ(sub.on_host_shape(), leaf_shape);
+  EXPECT_EQ(sub.on_device_shape(), leaf_shape);
+}
+
 // Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
 // (cardinality of each non-leaf node's children).
 void BM_TakeSubTree(int iters, int depth, int fan_out) {
diff --git a/tensorflow/compiler/xla/service/slice_sinker.cc b/tensorflow/compiler/xla/service/slice_sinker.cc
new file mode 100644
index 0000000..a8e681d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/slice_sinker.cc
@@ -0,0 +1,278 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/slice_sinker.h"
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+namespace {
+
+// Returns true when both slices use identical starts, limits and strides. Both
+// arguments must be kSlice instructions whose sources have equal dimensions.
+bool SameSliceConfiguration(const HloInstruction* slice_1,
+                            const HloInstruction* slice_2) {
+  CHECK_EQ(slice_1->opcode(), HloOpcode::kSlice);
+  CHECK_EQ(slice_2->opcode(), HloOpcode::kSlice);
+  CHECK(slice_1->operand(0)->shape().dimensions() ==
+        slice_2->operand(0)->shape().dimensions());
+  if (slice_1->slice_starts() != slice_2->slice_starts()) return false;
+  if (slice_1->slice_limits() != slice_2->slice_limits()) return false;
+  return slice_1->slice_strides() == slice_2->slice_strides();
+}
+
+// Returns true if every operand of the elementwise instruction `inst` is a
+// slice and all of them read the same index ranges of compatible sources.
+bool IsElementwiseOperationOnSimilarSlices(const HloInstruction* inst) {
+  CHECK(inst->IsElementwise());
+  // First pass: bail out as soon as a non-slice operand is seen.
+  for (const HloInstruction* operand : inst->operands()) {
+    if (operand->opcode() != HloOpcode::kSlice) {
+      return false;
+    }
+  }
+
+  // Second pass: compare every later slice against operand 0.
+  const HloInstruction* first_slice = inst->operand(0);
+  for (int64 i = 1; i < inst->operand_count(); ++i) {
+    const HloInstruction* other = inst->operand(i);
+    if (!ShapeUtil::CompatibleIgnoringElementType(
+            first_slice->operand(0)->shape(), other->operand(0)->shape()) ||
+        !SameSliceConfiguration(first_slice, other)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Given an elementwise operation with all slice operands, operation_on_slices,
+// checks whether another operation, candidate, is an operation that hasn't been
+// transformed and is similar to operation_on_slices as defined by the following
+// criteria:
+// (1) candidate has operation_on_slices' opcode and result element type.
+// (2) The ith operand of candidate is a slice from the same slice source of
+//     the ith operand in operation_on_slices.
+// (3) All operands of candidate are slices taken from the same indices.
+bool IsSimilarOperationOnSlices(const HloInstruction* operation_on_slices,
+                                const HloInstruction* candidate) {
+  // Instructions that have already been transformed have user_count 0; skip.
+  if (candidate->user_count() == 0) {
+    return false;
+  }
+  // The merged operation is cloned from one member of the group, so members
+  // with another result element type (e.g. converts) must not join it.
+  if (candidate->opcode() != operation_on_slices->opcode() ||
+      candidate->shape().element_type() !=
+          operation_on_slices->shape().element_type()) {
+    return false;
+  }
+  const HloInstruction* operand_slice0 = candidate->operand(0);
+  for (int64 i = 0; i < candidate->operand_count(); ++i) {
+    const HloInstruction* operand_slice = candidate->operand(i);
+    if (operand_slice->opcode() != HloOpcode::kSlice ||
+        operand_slice->operand(0) !=
+            operation_on_slices->operand(i)->operand(0) ||
+        !SameSliceConfiguration(operand_slice0, operand_slice)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Cost heuristic for a group of elementwise operations on slices that could be
+// replaced by one elementwise operation on the slice sources. Returns true
+// when the group's total number of output elements is at least the number of
+// elements of the first slice source, i.e. when the single operation on the
+// sources is not more expensive than the individual operations.
+//
+// Currently we don't support the following transformation that produces a new
+// elementwise operation on bigger slices of the slice sources. This is because
+// we don't have such a use case yet:
+// Transform
+//   p = f32[20] parameter(0)
+//   a = f32[8] slice(p), slice=[0:8]
+//   aa = add(a, a)
+//   b = f32[7] slice(p), slice=[2:9]
+//   bb = add(b, b)
+// to
+//   p = f32[20] parameter(0)
+//   x = f32[9] slice(p), slice=[0:9]
+//   xx = add(x,x)
+//   aa = f32[8] slice(xx), slice=[0:8]
+//   bb = f32[7] slice(xx), slice=[2:9]
+bool ShouldTransform(const std::vector<HloInstruction*>& operations_on_slices) {
+  const Shape& source_shape =
+      operations_on_slices[0]->operand(0)->operand(0)->shape();
+  int64 sliced_elements = 0;
+  for (const HloInstruction* operation : operations_on_slices) {
+    sliced_elements += ShapeUtil::ElementsIn(operation->shape());
+  }
+  return sliced_elements >= ShapeUtil::ElementsIn(source_shape);
+}
+
+// Returns the group of elementwise operations on slices that are similar to the
+// given operation_on_slices (see IsSimilarOperationOnSlices), or nullopt when
+// ShouldTransform rejects the group. Each operation appears at most once.
+absl::optional<std::vector<HloInstruction*>> FindElementwiseOperationGroup(
+    const HloInstruction* operation_on_slices) {
+  std::vector<HloInstruction*> operations;
+  const HloInstruction* slice_source0 =
+      operation_on_slices->operand(0)->operand(0);
+  // Traverse the slices taken from the first slice source.
+  for (const HloInstruction* operand_slice0 : slice_source0->users()) {
+    if (operand_slice0->opcode() != HloOpcode::kSlice) {
+      continue;
+    }
+    for (HloInstruction* user : operand_slice0->users()) {
+      // A user reachable through several slices of the source must be counted
+      // once, otherwise ShouldTransform overestimates the group's size.
+      if (!absl::c_linear_search(operations, user) &&
+          IsSimilarOperationOnSlices(operation_on_slices, user)) {
+        operations.push_back(user);
+      }
+    }
+  }
+  return ShouldTransform(operations) ? absl::make_optional(operations)
+                                     : absl::nullopt;
+}
+
+// Builds one elementwise operation whose operands are the slice_sources, then
+// redirects the users of every operation in operation_on_slices to a slice of
+// that new operation.
+Status SinkSlices(const std::vector<HloInstruction*>& slice_sources,
+                  const std::vector<HloInstruction*>& operation_on_slices) {
+  HloInstruction* representative = operation_on_slices[0];
+  HloComputation* computation = representative->parent();
+
+  // Shape of slice_sources[0] with the first operation's element type.
+  Shape merged_shape = ShapeUtil::ChangeElementType(
+      slice_sources[0]->shape(), representative->shape().element_type());
+  HloInstruction* merged = computation->AddInstruction(
+      representative->CloneWithNewOperands(merged_shape, slice_sources));
+  VLOG(10) << "Adding operation_on_slice_sources: "
+           << merged->ToString();
+
+  // Each original operation becomes a slice of `merged`, cloned from its
+  // operand 0 so that the same starts, limits and strides are reused.
+  for (HloInstruction* operation : operation_on_slices) {
+    HloInstruction* replacement = computation->AddInstruction(
+        operation->operand(0)->CloneWithNewOperands(operation->shape(),
+                                                    {merged}));
+    VLOG(10) << "Adding new slice: " << replacement->ToString()
+             << " to replace: " << operation->ToString();
+    TF_RETURN_IF_ERROR(operation->ReplaceAllUsesWith(replacement));
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+// There are two purposes of this pass.
+//
+// 1. Eliminates redundant work that occurs when two slices overlap. For
+// example:
+//   p = f32[10] parameter(0)
+//   a = f32[9] slice(p), slice=[0:9]
+//   aa = add(a, a)
+//   b = f32[8] slice(p), slice=[2:10]
+//   bb = add(b, b)
+//   ...
+// Here we do 17 scalar add operations, while we actually only need to do 10 if
+// we can transform the code to the following:
+//   p = f32[10] parameter(0)
+//   add = add(p, p)
+//   aa = f32[9] slice(add), slice=[0:9]
+//   bb = f32[8] slice(add), slice=[2:10]
+//   ...
+//
+// 2. Merges elementwise operations when two slices are "adjacent".
+//   p = f32[10] parameter(0)
+//   a = f32[6] slice(p), slice=[0:6]
+//   aa = add(a, a)
+//   b = f32[4] slice(p), slice=[6:10]
+//   bb = add(b, b)
+//   ...
+// Here we're not doing any redundant work, but transforming this graph to the
+// following graph allows us to run fewer kernels:
+//   p = f32[10] parameter(0)
+//   add = add(p, p)
+//   aa = f32[6] slice(add), slice=[0:6]
+//   bb = f32[4] slice(add), slice=[6:10]
+//
+// As can be seen from the examples, the group of elementwise operations being
+// transformed must meet the following requirements:
+// (1) The operands of each operation are slices taken from the same indices of
+//     bigger tensors with the same dimensions.
+// (2) All operations have the same opcode.
+// (3) The corresponding operands of all operations are slices taken
+//     from the same bigger tensors.
+// (4) The accumulated size of the group of operations is not less than the size
+//     of such a bigger tensor. This is a heuristic to ensure that the
+//     transformation never causes us to do more elementwise operations.
+//
+// This pass currently doesn't transform non-elementwise instructions. We may
+// extend this pass to transform non-elementwise instructions, such as dot,
+// broadcast and reduce in the future.
+StatusOr<bool> SliceSinker::Run(HloModule* module) {
+  bool changed = false;
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      // Transforming instruction A may also transform an instruction B that
+      // comes later in the post order. Transformed instructions are left with
+      // user_count 0, which is how they are recognized and skipped here.
+      const bool is_candidate = instruction->IsElementwise() &&
+                                instruction->operand_count() != 0 &&
+                                instruction->user_count() != 0;
+      if (!is_candidate) {
+        continue;
+      }
+      VLOG(10) << "Processing instruction : " << instruction->ToString();
+
+      // Condition (1): every operand is a slice with the same configuration.
+      if (!IsElementwiseOperationOnSimilarSlices(instruction)) {
+        continue;
+      }
+
+      // Conditions (2)-(4): collect the operations similar to `instruction`
+      // and keep the group only if merging it passes the cost heuristic.
+      absl::optional<std::vector<HloInstruction*>> group =
+          FindElementwiseOperationGroup(instruction);
+      if (!group) {
+        continue;
+      }
+
+      std::vector<HloInstruction*> slice_sources;
+      for (HloInstruction* slice : instruction->operands()) {
+        slice_sources.push_back(slice->mutable_operand(0));
+      }
+      TF_RETURN_IF_ERROR(SinkSlices(slice_sources, *group));
+      changed = true;
+    }
+  }
+
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/slice_sinker.h b/tensorflow/compiler/xla/service/slice_sinker.h
new file mode 100644
index 0000000..4615b5f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/slice_sinker.h
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SLICE_SINKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SLICE_SINKER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// An HLO pass that replaces a group of elementwise operations applied to
+// slices with one operation on the slice sources followed by the slices.
+class SliceSinker : public HloModulePass {
+ public:
+  tensorflow::StringPiece name() const override { return "slice-sinker"; }
+  // Runs the pass on `module`; the bool reports whether anything was rewritten.
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SLICE_SINKER_H_
diff --git a/tensorflow/compiler/xla/service/slice_sinker_test.cc b/tensorflow/compiler/xla/service/slice_sinker_test.cc
new file mode 100644
index 0000000..f09a7a8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/slice_sinker_test.cc
@@ -0,0 +1,498 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/slice_sinker.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+using ::testing::ElementsAre;
+
+class SliceSinkerTest : public HloTestBase {};
+
+TEST_F(SliceSinkerTest, TernaryOperation) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = pred[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      p2 = f32[8,9] parameter(2)
+      s00 = pred[2,9] slice(pred[8,9] p0), slice={[0:2], [0:9]}
+      s01 = pred[6,9] slice(pred[8,9] p0), slice={[2:8], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[6,9] slice(f32[8,9] p1), slice={[2:8], [0:9]}
+      s20 = f32[2,9] slice(f32[8,9] p2), slice={[0:2], [0:9]}
+      s21 = f32[6,9] slice(f32[8,9] p2), slice={[2:8], [0:9]}
+      sel0 = f32[2,9] select(pred[2,9] s00, f32[2,9] s10, f32[2,9] s20)
+      sel1 = f32[6,9] select(pred[6,9] s01, f32[6,9] s11, f32[6,9] s21)
+      ROOT tuple = (f32[2,9], f32[6,9]) tuple(sel0, sel1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0 = nullptr;
+  const HloInstruction* slice1 = nullptr;
+  ASSERT_THAT(inst,  // Fatal: slice0 and slice1 are dereferenced below.
+              GmockMatch(m::Tuple(
+                  m::Slice(&slice0, m::Select(m::Parameter(0), m::Parameter(1),
+                                              m::Parameter(2))),
+                  m::Slice(&slice1, m::Select(m::Parameter(0), m::Parameter(1),
+                                              m::Parameter(2))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+}
+
+TEST_F(SliceSinkerTest, OverlappingPartialSlicesBeneficial) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[5,9] slice(f32[8,9] p0), slice={[3:8], [0:9]}
+      s02 = f32[8,4] slice(f32[8,9] p0), slice={[0:8], [0:4]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[5,9] slice(f32[8,9] p1), slice={[3:8], [0:9]}
+      s12 = f32[8,4] slice(f32[8,9] p1), slice={[0:8], [0:4]}
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] s10)
+      add1 = f32[5,9] add(f32[5,9] s01, f32[5,9] s11)
+      add2 = f32[8,4] add(f32[8,4] s02, f32[8,4] s12)
+      ROOT tuple = (f32[2,9], f32[5,9], f32[8,4]) tuple(add0, add1, add2)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0 = nullptr;
+  const HloInstruction* slice1 = nullptr;
+  const HloInstruction* slice2 = nullptr;
+  ASSERT_THAT(  // Fatal: the captured slices are dereferenced below.
+      inst, GmockMatch(m::Tuple(
+                m::Slice(&slice0, m::Add(m::Parameter(0), m::Parameter(1))),
+                m::Slice(&slice1, m::Add(m::Parameter(0), m::Parameter(1))),
+                m::Slice(&slice2, m::Add(m::Parameter(0), m::Parameter(1))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(3, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice2->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice2->slice_limits(), ElementsAre(8, 4));
+  EXPECT_THAT(slice2->slice_strides(), ElementsAre(1, 1));
+}
+
+TEST_F(SliceSinkerTest, SameSliceSourcesTwoPeerGroups) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[6,9] slice(f32[8,9] p0), slice={[2:8], [0:9]}
+      s02 = f32[8,2] slice(f32[8,9] p0), slice={[0:8], [0:2]}
+      s03 = f32[8,7] slice(f32[8,9] p0), slice={[0:8], [2:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[6,9] slice(f32[8,9] p1), slice={[2:8], [0:9]}
+      s12 = f32[8,2] slice(f32[8,9] p1), slice={[0:8], [0:2]}
+      s13 = f32[8,7] slice(f32[8,9] p1), slice={[0:8], [2:9]}
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] s10)
+      add1 = f32[6,9] add(f32[6,9] s01, f32[6,9] s11)
+      mul0 = f32[8,2] multiply(f32[8,2] s02, f32[8,2] s12)
+      mul1 = f32[8,7] multiply(f32[8,7] s03, f32[8,7] s13)
+      ROOT tuple = (f32[2,9], f32[6,9], f32[8,2], f32[8,7]) tuple(add0, add1, mul0, mul1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0 = nullptr;
+  const HloInstruction* slice1 = nullptr;
+  const HloInstruction* slice2 = nullptr;
+  const HloInstruction* slice3 = nullptr;
+  ASSERT_THAT(  // Fatal: the captured slices are dereferenced below.
+      inst,
+      GmockMatch(m::Tuple(
+          m::Slice(&slice0, m::Add(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice1, m::Add(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice2, m::Multiply(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice3, m::Multiply(m::Parameter(0), m::Parameter(1))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice2->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice2->slice_limits(), ElementsAre(8, 2));
+  EXPECT_THAT(slice2->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice3->slice_starts(), ElementsAre(0, 2));
+  EXPECT_THAT(slice3->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice3->slice_strides(), ElementsAre(1, 1));
+}
+
+TEST_F(SliceSinkerTest, OverlappingMultipleSlices) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[5,9] slice(f32[8,9] p0), slice={[3:8], [0:9]}
+      s02 = f32[3,9] slice(f32[8,9] p0), slice={[2:5], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[5,9] slice(f32[8,9] p1), slice={[3:8], [0:9]}
+      s12 = f32[3,9] slice(f32[8,9] p1), slice={[2:5], [0:9]}
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] s10)
+      add1 = f32[5,9] add(f32[5,9] s01, f32[5,9] s11)
+      add2 = f32[3,9] add(f32[3,9] s02, f32[3,9] s12)
+      ROOT tuple = (f32[2,9], f32[5,9], f32[3,9]) tuple(add0, add1, add2)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0 = nullptr;
+  const HloInstruction* slice1 = nullptr;
+  const HloInstruction* slice2 = nullptr;
+  ASSERT_THAT(  // Fatal: the captured slices are dereferenced below.
+      inst, GmockMatch(m::Tuple(
+                m::Slice(&slice0, m::Add(m::Parameter(0), m::Parameter(1))),
+                m::Slice(&slice1, m::Add(m::Parameter(0), m::Parameter(1))),
+                m::Slice(&slice2, m::Add(m::Parameter(0), m::Parameter(1))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(3, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice2->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice2->slice_limits(), ElementsAre(5, 9));
+  EXPECT_THAT(slice2->slice_strides(), ElementsAre(1, 1));
+}
+
+TEST_F(SliceSinkerTest, DisjointedPartialSlices) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[5,9] slice(f32[8,9] p0), slice={[2:7], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[5,9] slice(f32[8,9] p1), slice={[2:7], [0:9]}
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] s10)
+      add1 = f32[5,9] add(f32[5,9] s01, f32[5,9] s11)
+      ROOT tuple = (f32[2,9], f32[5,9]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, OverlappingPartialSlicesNotBeneficial) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,7] slice(f32[8,9] p0), slice={[0:2], [0:7]}
+      s01 = f32[6,7] slice(f32[8,9] p0), slice={[2:8], [0:7]}
+      s10 = f32[2,7] slice(f32[8,9] p1), slice={[0:2], [0:7]}
+      s11 = f32[6,7] slice(f32[8,9] p1), slice={[2:8], [0:7]}
+      add0 = f32[2,7] add(f32[2,7] s00, f32[2,7] s10)
+      add1 = f32[6,7] add(f32[6,7] s01, f32[6,7] s11)
+      ROOT tuple = (f32[2,7], f32[6,7]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, DifferentOrderingOfSliceSources) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,7] parameter(0)
+      p1 = f32[8,7] parameter(1)
+      s00 = f32[2,7] slice(f32[8,7] p0), slice={[0:2], [0:7]}
+      s01 = f32[6,7] slice(f32[8,7] p0), slice={[2:8], [0:7]}
+      s10 = f32[2,7] slice(f32[8,7] p1), slice={[0:2], [0:7]}
+      s11 = f32[6,7] slice(f32[8,7] p1), slice={[2:8], [0:7]}
+      add0 = f32[2,7] add(f32[2,7] s00, f32[2,7] s10)
+      add1 = f32[6,7] add(f32[6,7] s11, f32[6,7] s01)
+      ROOT tuple = (f32[2,7], f32[6,7]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, SlicesFromDifferentIndices) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[4,9] slice(f32[8,9] p0), slice={[0:4], [0:9]}
+      s01 = f32[4,9] slice(f32[8,9] p0), slice={[4:8], [0:9]}
+      s10 = f32[4,9] slice(f32[8,9] p1), slice={[0:4], [0:9]}
+      s11 = f32[4,9] slice(f32[8,9] p1), slice={[4:8], [0:9]}
+      add0 = f32[4,9] add(f32[4,9] s01, f32[4,9] s10)
+      add1 = f32[4,9] add(f32[4,9] s00, f32[4,9] s11)
+      ROOT tuple = (f32[4,9], f32[4,9]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, DifferentOperator) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[6,9] slice(f32[8,9] p0), slice={[2:8], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[6,9] slice(f32[8,9] p1), slice={[2:8], [0:9]}
+      mul = f32[2,9] multiply(f32[2,9] s00, f32[2,9] s10)
+      add = f32[6,9] add(f32[6,9] s01, f32[6,9] s11)
+      ROOT tuple = (f32[2,9], f32[6,9]) tuple(mul, add)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, SlicesWithMultiUsers) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[6,9] slice(f32[8,9] p0), slice={[2:8], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[6,9] slice(f32[8,9] p1), slice={[2:8], [0:9]}
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] s10)
+      add1 = f32[6,9] add(f32[6,9] s01, f32[6,9] s11)
+      mul0 = f32[2,9] multiply(f32[2,9] s00, f32[2,9] s10)
+      mul1 = f32[6,9] multiply(f32[6,9] s01, f32[6,9] s11)
+      ROOT tuple = (f32[2,9], f32[6,9], f32[2,9], f32[6,9]) tuple(add0, add1, mul0, mul1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0;  // Slice over the add, rows [0:2).
+  const HloInstruction* slice1;  // Slice over the add, rows [2:8).
+  const HloInstruction* slice2;  // Slice over the multiply, rows [0:2).
+  const HloInstruction* slice3;  // Slice over the multiply, rows [2:8).
+  EXPECT_THAT(
+      inst,
+      GmockMatch(m::Tuple(
+          m::Slice(&slice0, m::Add(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice1, m::Add(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice2, m::Multiply(m::Parameter(0), m::Parameter(1))),
+          m::Slice(&slice3, m::Multiply(m::Parameter(0), m::Parameter(1))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice2->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice2->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice2->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice3->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice3->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice3->slice_strides(), ElementsAre(1, 1));
+}
+
+TEST_F(SliceSinkerTest, NonElementWise) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8] parameter(0)
+      s00 = f32[2] slice(f32[8] p0), slice={[0:2]}
+      s01 = f32[6] slice(f32[8] p0), slice={[2:8]}
+      bc0 = f32[2,9] broadcast(f32[2] s00), dimensions={0}
+      bc1 = f32[6,9] broadcast(f32[6] s01), dimensions={0}
+      ROOT tuple = (f32[2,9], f32[6,9]) tuple(bc0, bc1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, SlicesWithNontrivialStrides) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[4,9] slice(f32[8,9] p0), slice={[0:7:2], [0:9]}
+      s01 = f32[4,9] slice(f32[8,9] p0), slice={[1:8:2], [0:9]}
+      s10 = f32[4,9] slice(f32[8,9] p1), slice={[0:7:2], [0:9]}
+      s11 = f32[4,9] slice(f32[8,9] p1), slice={[1:8:2], [0:9]}
+      add0 = f32[4,9] add(f32[4,9] s00, f32[4,9] s10)
+      add1 = f32[4,9] add(f32[4,9] s01, f32[4,9] s11)
+      ROOT tuple = (f32[4,9], f32[4,9]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0;
+  const HloInstruction* slice1;
+  EXPECT_THAT(
+      inst, GmockMatch(m::Tuple(
+                m::Slice(&slice0, m::Add(m::Parameter(0), m::Parameter(1))),
+                m::Slice(&slice1, m::Add(m::Parameter(0), m::Parameter(1))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(7, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(2, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(1, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(2, 1));
+}
+
+TEST_F(SliceSinkerTest, NotAllSliceOperand) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[2,9] parameter(1)
+      p2 = f32[6,9] parameter(2)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[6,9] slice(f32[8,9] p0), slice={[2:8], [0:9]}
+      abs0 = f32[2,9] abs(f32[2,9] p1)
+      abs1 = f32[6,9] abs(f32[6,9] p2)
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] abs0)
+      add1 = f32[6,9] add(f32[6,9] s01, f32[6,9] abs1)
+      ROOT tuple = (f32[2,9], f32[6,9]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(SliceSinkerTest, Cascade) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[8,9] parameter(0)
+      p1 = f32[8,9] parameter(1)
+      s00 = f32[2,9] slice(f32[8,9] p0), slice={[0:2], [0:9]}
+      s01 = f32[6,9] slice(f32[8,9] p0), slice={[2:8], [0:9]}
+      s10 = f32[2,9] slice(f32[8,9] p1), slice={[0:2], [0:9]}
+      s11 = f32[6,9] slice(f32[8,9] p1), slice={[2:8], [0:9]}
+      abs0 = f32[2,9] abs(f32[2,9] s10)
+      abs1 = f32[6,9] abs(f32[6,9] s11)
+      add0 = f32[2,9] add(f32[2,9] s00, f32[2,9] abs0)
+      add1 = f32[6,9] add(f32[6,9] s01, f32[6,9] abs1)
+      ROOT tuple = (f32[2,9], f32[6,9]) tuple(add0, add1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  SliceSinker slice_sinker;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&slice_sinker, module.get()));
+  EXPECT_TRUE(result);
+  HloInstruction* inst = module->entry_computation()->root_instruction();
+  const HloInstruction* slice0;
+  const HloInstruction* slice1;
+  EXPECT_THAT(
+      inst,
+      GmockMatch(m::Tuple(
+          m::Slice(&slice0, m::Add(m::Parameter(0), m::Abs(m::Parameter(1)))),
+          m::Slice(&slice1,
+                   m::Add(m::Parameter(0), m::Abs(m::Parameter(1)))))));
+  EXPECT_THAT(slice0->slice_starts(), ElementsAre(0, 0));
+  EXPECT_THAT(slice0->slice_limits(), ElementsAre(2, 9));
+  EXPECT_THAT(slice0->slice_strides(), ElementsAre(1, 1));
+  EXPECT_THAT(slice1->slice_starts(), ElementsAre(2, 0));
+  EXPECT_THAT(slice1->slice_limits(), ElementsAre(8, 9));
+  EXPECT_THAT(slice1->slice_strides(), ElementsAre(1, 1));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index dd26a21..6474edf 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -178,7 +178,8 @@
     se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source,
     const MutableBorrowingLiteral& literal, std::function<void(Status)> done,
     const TransferMetadata* transfer_metadata) {
-  if (!ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) {
+  if (!Shape::Equal().MinorToMajorOnlyInLayout()(HostShapeToDeviceShape(shape),
+                                                 shape)) {
     auto error = StrCat("Shape ", ShapeUtil::HumanString(shape),
                         " has a differently shaped representation on-device: ",
                         ShapeUtil::HumanString(HostShapeToDeviceShape(shape)));
@@ -307,7 +308,7 @@
 }
 
 StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
-    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
     return InvalidArgument("Shape must have a layout: %s",
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 17a0a3c..f08862b 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -229,7 +229,7 @@
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
   StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
       int device_ordinal);
 
   // The given ShapedBuffer holds a handle to allocated memory, but it is not
diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
index 790074a..57efee7 100644
--- a/tensorflow/compiler/xla/service/triangular_solve_expander.cc
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
@@ -317,13 +317,9 @@
         auto a_row =
             MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
         if (left_side) {
-          remainder =
-              b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x,
-                               precision);
+          remainder = b_row - BatchDot(a_row, transpose_a, x, false, precision);
         } else {
-          remainder =
-              b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a),
-                               precision);
+          remainder = b_row - BatchDot(x, false, a_row, transpose_a, precision);
         }
       }
 
@@ -332,12 +328,11 @@
       auto start_index = ConstantR0WithType(builder, S32, j * block_size);
       std::vector<XlaOp> update_starts = {start_index, zero};
       if (left_side) {
-        x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a),
-                            remainder, precision);
+        x_update =
+            BatchDot(inv_block, transpose_a, remainder, false, precision);
       } else {
-        x_update = BatchDot(remainder,
-                            MaybeTransposeInMinorDims(inv_block, transpose_a),
-                            precision);
+        x_update =
+            BatchDot(remainder, false, inv_block, transpose_a, precision);
         std::swap(update_starts[0], update_starts[1]);
       }
       x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index ca173fc..638c3b4 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -19,6 +19,7 @@
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -754,6 +755,14 @@
       // index 'other_add_operand_index').
       return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
                                           other_add_operand_index);
+    } else if (user->IsCustomFusion()) {
+      std::vector<int64> operand_indices = user->OperandIndices(operand);
+      return operand_indices.size() == 1 && operand_indices[0] == 0 &&
+             absl::c_any_of(
+                 user->fused_instructions_computation()->instructions(),
+                 [](const HloInstruction* hlo) {
+                   return hlo->opcode() == HloOpcode::kScatter;
+                 });
     }
   }
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index a31bf0f..b6e1cb6 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -56,7 +56,7 @@
   bool IsArray() const { return primitive_util::IsArrayType(element_type()); }
   bool IsTuple() const { return element_type() == TUPLE; }
   bool IsToken() const { return element_type() == TOKEN; }
-  bool IsOpaque() const { return element_type() == OPAQUE; }
+  bool IsOpaque() const { return element_type() == OPAQUE_TYPE; }
 
   // Returns true if no array dimension in the shape is dynamically sized. Tuple
   // shapes are traversed recursively.
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 0891201..75eb34f 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -122,15 +122,16 @@
   // Return the shape represented with this ShapeTree.
   const Shape& shape() const { return *shape_; }
 
-  // Replaces *only* the underlying shape of this ShapeTree. The caller must own
-  // the Shape object and hence shape_storage_ is not updated.
-  //
-  // Only safe to use this if the ShapeTree was constructed with 'explicit
-  // ShapeTree(const Shape* shape)' or is moved from one such ShapeTree. The
-  // caller must ensure that the input shape is consistent with the underlying
-  // tree.
+  // A ShapeTree object can own the underlying Shape pointer (via the
+  // shape_storage_ member), or can point to a Shape object owned by the caller.
+  // This API replaces the underlying Shape object with the one supplied by the
+  // caller, who must ensure the object remains valid for the whole lifetime of
+  // this ShapeTree object, and also that the Shape is consistent with it.
   void replace_shape_ptr(const Shape* shape) {
-    CHECK(shape_storage_.get() == nullptr);
+    if (shape_storage_ != nullptr) {
+      CHECK_EQ(*shape, *shape_storage_);
+      shape_storage_ = nullptr;
+    }
     shape_ = shape;
   }
 
@@ -290,6 +291,8 @@
                        const ShapeIndex& source_base_index,
                        const ShapeIndex& target_base_index);
 
+  StatusOr<ShapeTree<T>> SubShapeTree(const ShapeIndex& index) const;
+
   bool operator==(const ShapeTree<T>& other) const;
   bool operator!=(const ShapeTree<T>& other) const { return !(*this == other); }
 
@@ -665,6 +668,16 @@
 }
 
 template <typename T>
+StatusOr<ShapeTree<T>> ShapeTree<T>::SubShapeTree(
+    const ShapeIndex& index) const {
+  TF_ASSIGN_OR_RETURN(const Shape* sub_shape,
+                      ShapeUtil::TryGetSubshape(shape(), index));
+  ShapeTree<T> sub_shape_tree(*sub_shape);
+  sub_shape_tree.CopySubtreeFrom(*this, index, {});
+  return std::move(sub_shape_tree);
+}
+
+template <typename T>
 bool ShapeTree<T>::operator==(const ShapeTree<T>& other) const {
   bool equal = true;
   ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) {
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 340f679..eee18c9 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -96,7 +96,7 @@
     return InvalidArgument("Dimensions size is %ld, but layout size is %ld.",
                            dimensions.size(), minor_to_major.size());
   }
-  if (element_type == OPAQUE || element_type == TUPLE) {
+  if (element_type == OPAQUE_TYPE || element_type == TUPLE) {
     return InvalidArgument("Unsupported element type: %s",
                            PrimitiveType_Name(element_type));
   }
@@ -258,7 +258,7 @@
 
 /* static */ Shape ShapeUtil::MakeOpaqueShape() {
   Shape result;
-  result.set_element_type(OPAQUE);
+  result.set_element_type(OPAQUE_TYPE);
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(result));
   return result;
 }
@@ -319,7 +319,7 @@
     case C64:
     case C128:
     case TUPLE:
-    case OPAQUE:
+    case OPAQUE_TYPE:
     case TOKEN:
       return false;
 
@@ -570,7 +570,7 @@
       // Tokens require no space.
       return 0;
     case TUPLE:
-    case OPAQUE:
+    case OPAQUE_TYPE:
       LOG(FATAL) << PrimitiveType_Name(primitive_type)
                  << " primitive type has no definitive size";
     default:
@@ -591,7 +591,7 @@
     return byte_size;
   } else if (shape.element_type() == TOKEN) {
     return 0;
-  } else if (shape.element_type() == OPAQUE) {
+  } else if (shape.element_type() == OPAQUE_TYPE) {
     CHECK_GT(pointer_size, 0);
     return pointer_size;
   }
@@ -653,7 +653,7 @@
   }
 
   // Tokens and opaques can should not have layout or dimensions.
-  if (shape.element_type() == TOKEN || shape.element_type() == OPAQUE) {
+  if (shape.element_type() == TOKEN || shape.element_type() == OPAQUE_TYPE) {
     if (shape.dimensions_size() != 0) {
       return InvalidArgument(
           "shape has %s element type, but has dimensions field: %s",
diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h
index 3ede5e6..a2b76fa 100644
--- a/tensorflow/compiler/xla/test_helpers.h
+++ b/tensorflow/compiler/xla/test_helpers.h
@@ -28,9 +28,6 @@
 
 // This module contains a minimal subset of gmock functionality just
 // sufficient to execute the currently existing tests.
-namespace util {
-class Status;
-}  // namespace util
 
 namespace xla {
 template <typename T>
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index d5bcdc2..cff87c5 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -55,6 +55,7 @@
     deps = [
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -259,7 +260,6 @@
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -268,6 +268,7 @@
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//third_party/eigen3",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
@@ -1172,7 +1173,6 @@
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -1183,6 +1183,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -1418,8 +1419,8 @@
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1711,6 +1712,10 @@
     srcs = ["multi_device_all_reduce_test.cc"],
     backends = ["gpu"],
     tags = [
+        # This test is tagged "manual" because it requires multiple GPUs, and
+        # Forge only supports single-GPU tests.  Guitar skips "manual" tests
+        # unless they're also tagged "guitar".
+        "noguitar",  # TODO(b/131524578): Re-enable this.
         "manual",
         "multi_gpu",
         "no_oss",
@@ -2074,7 +2079,6 @@
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -2086,6 +2090,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -2202,13 +2207,13 @@
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 08cb39d..48719c6 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -265,6 +265,8 @@
                 "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest,
             ],
             deps = [
+                "@com_google_absl//absl/container:flat_hash_map",
+                "@com_google_absl//absl/strings",
                 "//tensorflow/compiler/xla:types",
                 "//tensorflow/core:lib",
                 "//tensorflow/core:regexp_internal",
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index cfee9c0..0ab765a 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -1801,7 +1801,8 @@
                       Convolve1DTestParam{24, 1, 1, 10, 5},
                       Convolve1DTestParam{160, 1, 1, 10, 1},
                       Convolve1DTestParam{255, 1, 1, 3, 1},
-                      Convolve1DTestParam{130, 1, 1, 1, 3},
+                      Convolve1DTestParam{130, 1, 1, 1, 2},
+                      Convolve1DTestParam{136, 1, 1, 1, 2},
                       Convolve1DTestParam{64, 1, 1, 1, 1},
                       Convolve1DTestParam{128, 1, 1, 1, 1},
                       Convolve1DTestParam{139, 1, 1, 128, 1},
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 4687ed6..63c3b4b 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -19,7 +19,7 @@
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -64,10 +64,10 @@
 
 }  // namespace
 
-REGISTER_CUSTOM_CALL_TARGET(R0F32Add2);
-REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum);
-REGISTER_CUSTOM_CALL_TARGET(Add1ToValues);
-REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(R0F32Add2);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(Add1ToValues);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap);
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 72e1cf708..59c3d4f 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -31,6 +31,7 @@
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1200,6 +1201,7 @@
       p{v{16, 34}, v{16, 34}, "ab,ab->ab"},
       p{v{16, 3, 34}, v{3, 16, 34}, "abc,bac->abc"},
       p{v{5, 19}, v{}, "ab,->ab"},
+      p{v{8, 1, 16, 64}, v{8, 12, 16, 64}, "bqhf,bkhf->bhqk"},
   };
   return test_cases;
 }
@@ -1402,5 +1404,183 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
 }
 
+XLA_TEST_F(DotOperationTest, ReorderContractingDimsConstLHS_RL) {
+  Array3D<float> input_arr(2, 3, 2);
+  Array2D<float> const_arr(2, 6);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+
+  XlaBuilder builder(TestName());
+  auto t0 =
+      AddParam(LiteralUtil::CreateR3FromArray3D<float>(input_arr), &builder);
+  auto t1 = Transpose(t0, {1, 0, 2});
+  auto rhs = Reshape(t1, {6, 2});
+  auto lhs = ConstantR2FromArray2D(&builder, const_arr);
+  Dot(lhs, rhs);
+
+  ComputeAndCompare(&builder, {}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, ReorderContractingDimsConstRHS_LR) {
+  Array3D<float> input_arr(2, 3, 2);
+  Array2D<float> const_arr(2, 6);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+
+  XlaBuilder builder(TestName());
+  auto t0 =
+      AddParam(LiteralUtil::CreateR3FromArray3D<float>(input_arr), &builder);
+  auto t1 = Transpose(t0, {1, 0, 2});
+  auto lhs = Reshape(t1, {6, 2});
+  auto rhs = ConstantR2FromArray2D(&builder, const_arr);
+
+  DotDimensionNumbers dims;
+  dims.add_lhs_contracting_dimensions(0);
+  dims.add_rhs_contracting_dimensions(1);
+  DotGeneral(lhs, rhs, dims);
+
+  ComputeAndCompare(&builder, {}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, ReorderContractingDimsConstRHS_RL) {
+  Array4D<float> input_arr(2, 2, 3, 4);
+  Array2D<float> const_arr(24, 2);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+
+  XlaBuilder builder(TestName());
+  auto t0 =
+      AddParam(LiteralUtil::CreateR4FromArray4D<float>(input_arr), &builder);
+  auto t1 = Transpose(t0, {0, 2, 3, 1});
+  auto lhs = Reshape(t1, {2, 24});
+  auto rhs = ConstantR2FromArray2D(&builder, const_arr);
+  Dot(lhs, rhs);
+
+  ComputeAndCompare(&builder, {}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, ReorderContractingDimsConstRHS_MM) {
+  Array3D<float> input_arr(2, 6, 2);
+  Array3D<float> const_arr(2, 6, 3);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+
+  XlaBuilder builder(TestName());
+  auto t0 =
+      AddParam(LiteralUtil::CreateR3FromArray3D<float>(input_arr), &builder);
+  auto t1 = Reshape(t0, {2, 2, 3, 2});
+  auto t2 = Transpose(t1, {0, 2, 1, 3});
+  auto lhs = Reshape(t2, {2, 6, 2});
+  auto rhs = ConstantR3FromArray3D(&builder, const_arr);
+
+  DotDimensionNumbers dims;
+  dims.add_lhs_contracting_dimensions(1);
+  dims.add_rhs_contracting_dimensions(1);
+  dims.add_lhs_batch_dimensions(0);
+  dims.add_rhs_batch_dimensions(0);
+  DotGeneral(lhs, rhs, dims);
+
+  ComputeAndCompare(&builder, {}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, ReorderContractingDims_Multipass) {
+  Array4D<float> input_arr(2, 2, 3, 5);
+  Array2D<float> const_arr(2, 30);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+
+  XlaBuilder builder(TestName());
+  auto t0 =
+      AddParam(LiteralUtil::CreateR4FromArray4D<float>(input_arr), &builder);
+  auto t1 = Transpose(t0, {0, 2, 1, 3});
+  auto t2 = Reshape(t1, {2, 6, 5});
+  auto t3 = Transpose(t2, {0, 2, 1});
+  auto lhs = Reshape(t3, {2, 30});
+  auto rhs = ConstantR2FromArray2D(&builder, const_arr);
+
+  DotDimensionNumbers dims;
+  dims.add_lhs_contracting_dimensions(1);
+  dims.add_rhs_contracting_dimensions(1);
+  DotGeneral(lhs, rhs, dims);
+
+  // Constant folding is disabled by default in unit tests. The algsimp
+  // optimization can be applied multiple times if we fold the transpose
+  // and reshape that are moved to the constant side of the dot.
+  mutable_debug_options()->clear_xla_disable_hlo_passes();
+  ComputeAndCompare(&builder, {}, error_spec_);
+}
+
+// This benchmark is to show the performance impact of the following
+// transformation:
+//   dot(reshape(transpose(A)), Const) ==>
+//   dot(reshape(A), reshape(transpose(reshape(Const)))),
+// and then fold the reshape and transpose on the Const side.
+// We can compare performance with and without algsimp pass to see the impact.
+void DOT_ReorderContracting(int num_iters) {
+  tensorflow::testing::StopTiming();
+
+  se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
+  auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+
+  xla::LocalClientOptions client_options;
+  client_options.set_platform(platform);
+  auto client =
+      ClientLibrary::GetOrCreateLocalClient(client_options).ValueOrDie();
+
+  int device_ordinal = client->default_device_ordinal();
+
+  const int64 d0 = 128;
+  const int64 d1 = 128;
+  const int64 d2 = 128;
+  const int64 d3 = 128;
+
+  Array3D<float> input_arr(d0, d1, d2);
+  Array2D<float> const_arr(d1 * d2, d3);
+  input_arr.FillIota(0);
+  const_arr.FillIota(0);
+  XlaBuilder builder("ReorderContracting");
+  auto t0 =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {d0, d1, d2}), "param0");
+  auto t1 = Transpose(t0, {0, 2, 1});
+  auto lhs = Reshape(t1, {d0, d2 * d1});
+  auto rhs = ConstantR2FromArray2D(&builder, const_arr);
+  Dot(lhs, rhs);
+  auto computation = builder.Build().ConsumeValueOrDie();
+
+  auto input_literal = LiteralUtil::CreateR3FromArray3D<float>(input_arr);
+  ScopedShapedBuffer buffer0 =
+      client->LiteralToShapedBuffer(input_literal, device_ordinal)
+          .ConsumeValueOrDie();
+
+  std::unique_ptr<LocalExecutable> executable =
+      client
+          ->Compile(computation, {&buffer0.on_host_shape()},
+                    ExecutableBuildOptions())
+          .ConsumeValueOrDie();
+
+  se::Stream stream(executors[device_ordinal]);
+  stream.Init();
+
+  ExecutableRunOptions options;
+  options.set_allocator(&allocator);
+
+  const int kWarmups = 2;
+  for (int i = 0; i < kWarmups; ++i) {
+    ASSERT_IS_OK(executable->Run({&buffer0}, options));
+  }
+
+  const int64 total_bytes = d0 * d1 * d2 + d1 * d2 * d3 + d0 * d3;
+  tensorflow::testing::BytesProcessed(static_cast<int64>(num_iters) *
+                                      total_bytes * sizeof(float));
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    ASSERT_IS_OK(executable->Run({&buffer0}, options));
+  }
+}
+
+BENCHMARK(DOT_ReorderContracting);
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 82e2db3..1ea72af 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -21,7 +21,6 @@
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -34,6 +33,7 @@
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -736,7 +736,7 @@
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
   auto* transfer_manager =
diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc
index 58bb9a2..adb1d39 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc
@@ -215,7 +215,7 @@
         RunImpl<half, uint16>(enqueue_op, evaluate_op);
         break;
       case BF16:
-        SetDefaultErrSpec(0.001, 0.01);
+        SetDefaultErrSpec(0.002, 0.02);
         RunImpl<bfloat16, uint16>(enqueue_op, evaluate_op);
         break;
       default:
@@ -245,14 +245,6 @@
     int64 begin, end;
     std::tie(begin, end) = test_range;
 
-    if (begin >= known_incorrect_begin_ && end <= known_incorrect_end_) {
-      LOG(INFO) << absl::StreamFormat(
-          "Skipping this shard, as the range under test, [%d, %d), falls "
-          "entirely within the known-incorrect range [%d, %d).",
-          begin, end, known_incorrect_begin_, known_incorrect_end_);
-      return;
-    }
-
     LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
 
     int64 input_size = end - begin;
@@ -262,8 +254,7 @@
       IntegralT input_val = i + begin;
       // If the operation is known to be buggy on a specific input clamp that
       // input to 0 under the assumption that the op is at least correct on 0.
-      if (input_val >= known_incorrect_begin_ &&
-          input_val < known_incorrect_end_) {
+      if (known_incorrect_fn_ && known_incorrect_fn_(input_val)) {
         input_arr[i] = T{0};
       } else {
         input_arr[i] = absl::bit_cast<T>(input_val);
@@ -434,11 +425,14 @@
       LOG(ERROR) << err_generator();
     } else if (*mismatches == kMaxMismatchesLoggedToErr) {
       LOG(ERROR) << "Not printing any more mismatches; pass "
-                    "--vmodule=exhaustive_f32__op_test=2 to see "
+                    "--vmodule=exhaustive_op_test=2 to see "
                     "all of them.";
     }
   }
 
+  // Sets error parameters appropriately for testing sin/cos/tan.
+  void SetParamsForSinCosTan();
+
   // The following members are set during construction so testcases can read
   // these values and use them e.g. to influence the values given to the mutable
   // members below.
@@ -452,10 +446,9 @@
   // Tests can set the following variables for control over execution.  This is
   // safe because each XLA_TEST_P instantiates a new instance of this class.
 
-  // Testing will ignore the given range (encoded as bitwise representations of
-  // the type under test zero-extended to int64).
-  int64 known_incorrect_begin_ = 0;
-  int64 known_incorrect_end_ = 0;
+  // Testing will ignore inputs for which known_incorrect_fn_ returns true.
+  // (Its argument is the type under test, e.g. f32, zero-extended to int64).
+  std::function<bool(int64)> known_incorrect_fn_;
 
   // If unset, reasonable defaults will be used depending on the type under
   // test.
@@ -496,40 +489,39 @@
 }
 
 XLA_TEST_P(ExhaustiveOpTest, Exp) {
-  if (platform_ == "Host" && ty_ == F32) {
-    // TODO(b/73142289): The vectorized Exp implementation gives results outside
-    // our error spec in this range.
-    known_incorrect_begin_ = 1107296256 + 11583654;
-    known_incorrect_end_ = 1107296256 + 11629080;
-  } else if (platform_ == "Host" && ty_ == BF16) {
-    // TODO(jlebar): Is this a rounding error?  Why doesn't it occur on XLA:GPU?
-    //
-    // Mismatch on 88.5 (0x42b1).
-    //   Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80).
-    known_incorrect_begin_ = 0x42b1;
-    known_incorrect_end_ = 0x42b2;
+  // Our CPU implementation of exp returns one incorrect value: says
+  // exp(88.7228394) = max-float, but the correct answer is inf.  We deem this
+  // acceptable and check for it explicitly so that we can be aware if anything
+  // changes.
+  if (platform_ == "Host") {
+    auto host_exp_with_overflow = +[](float f) {
+      if (f == 88.7228394f) {
+        return 3.40282347e+38f;
+      }
+      return std::exp(f);
+    };
+    Run(Exp, host_exp_with_overflow);
+  } else {
+    Run(Exp, std::exp);
   }
-
-  Run(Exp, std::exp);
 }
 
 XLA_TEST_P(ExhaustiveOpTest, Expm1) {
-  // Expm1 has the same erroneous behavior on CPU as Exp.
-  if (platform_ == "Host" && ty_ == F32) {
-    // TODO(b/73142289): The vectorized Exp implementation gives results outside
-    // our error spec in this range.
-    known_incorrect_begin_ = 1107296256 + 11583654;
-    known_incorrect_end_ = 1107296256 + 11629080;
-  } else if (platform_ == "Host" && ty_ == BF16) {
-    // TODO(jlebar): Is this a rounding error?  Why doesn't it occur on XLA:GPU?
-    //
-    // Mismatch on 88.5 (0x42b1).
-    //   Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80).
-    known_incorrect_begin_ = 0x42b1;
-    known_incorrect_end_ = 0x42b2;
+  // Our CPU implementation of expm1 returns one incorrect value: says
+  // expm1(88.7228394) = max-float, but the correct answer is inf.  We deem
+  // this acceptable and check for it explicitly so that we can be aware if
+  // anything changes.
+  if (platform_ == "Host") {
+    auto host_expm1_with_overflow = +[](float f) {
+      if (f == 88.7228394f) {
+        return 3.40282347e+38f;
+      }
+      return std::expm1(f);
+    };
+    Run(Expm1, host_expm1_with_overflow);
+  } else {
+    Run(Expm1, std::expm1);
   }
-
-  Run(Expm1, std::expm1);
 }
 
 // It feels a little overkill to exhaustively test sqrt and pow(x, 0.5), but
@@ -553,10 +545,111 @@
   Run(Sqrt, std::sqrt);
 }
 
-// TODO(jlebar): Add remaining trig functions.  Don't forget Atan2!
 // TODO(jlebar): Test trig functions over complex inputs.
+
+XLA_TEST_P(ExhaustiveOpTest, Acosh) {
+  // Error inherited from Log, which our implementation of Acosh uses.
+  if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) {
+    abs_err_ = 0.001;
+    rel_err_ = 0.001;
+  }
+  Run(Acosh, std::acosh);
+}
+XLA_TEST_P(ExhaustiveOpTest, Asinh) {
+  // Error inherited from Log, which our implementation of Asinh uses.
+  if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) {
+    abs_err_ = 0.001;
+    rel_err_ = 0.001;
+  }
+  Run(Asinh, std::asinh);
+}
+XLA_TEST_P(ExhaustiveOpTest, Atanh) { Run(Atanh, std::atanh); }
+XLA_TEST_P(ExhaustiveOpTest, Acos) { Run(Acos, std::acos); }
+XLA_TEST_P(ExhaustiveOpTest, Asin) { Run(Asin, std::asin); }
+
+XLA_TEST_P(ExhaustiveOpTest, Cosh) {
+  // Our cosh implementation incorrectly overflows to inf for +/-89.4159851.
+  // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to
+  // max-float, so we deem this acceptable.
+  //
+  // This does not occur on CPU because we have an offsetting error in our
+  // implementation of exp.
+  float (*host_cosh)(float);
+  if (platform_ == "Host") {
+    host_cosh = &std::cosh;
+  } else {
+    host_cosh = +[](float x) {
+      if (std::abs(x) == 89.4159851f) {
+        return std::numeric_limits<float>::infinity();
+      }
+      return std::cosh(x);
+    };
+  }
+  Run(Cosh, host_cosh);
+}
+XLA_TEST_P(ExhaustiveOpTest, Sinh) {
+  // Our sinh implementation incorrectly overflows to +/-inf for +/-89.4159851.
+  // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to
+  // max-float, so we deem this acceptable.
+  //
+  // This does not occur on CPU because we have an offsetting error in our
+  // implementation of exp.
+  float (*host_sinh)(float);
+  if (platform_ == "Host") {
+    host_sinh = &std::sinh;
+  } else {
+    host_sinh = +[](float x) {
+      if (std::abs(x) == 89.4159851f) {
+        return std::copysign(std::numeric_limits<float>::infinity(), x);
+      }
+      return std::sinh(x);
+    };
+  }
+  Run(Sinh, host_sinh);
+}
 XLA_TEST_P(ExhaustiveOpTest, Tanh) { Run(Tanh, std::tanh); }
 
+void ExhaustiveOpTest::SetParamsForSinCosTan() {
+  if (platform_ == "Host" || platform_ == "CUDA") {
+    return;
+  }
+
+  // Non CPU/GPU targets may have used the Cody-Waite range reduction technique
+  // and will not provide meaningful results for sin/cos/tan if magnitudes
+  // exceed 2**p (the checks below use p = 13).
+  if (ty_ == F32) {
+    rel_err_ = 0.001;
+    abs_err_ = 0.001;
+    known_incorrect_fn_ = [](int64 v) {
+      float f = absl::bit_cast<float>(static_cast<uint32>(v));
+      return std::abs(f) > (1 << 13);
+    };
+  } else if (ty_ == BF16) {
+    known_incorrect_fn_ = [](int64 v) {
+      float f =
+          static_cast<float>(absl::bit_cast<bfloat16>(static_cast<uint16>(v)));
+      return std::abs(f) > (1 << 13);
+    };
+  }
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Cos) {
+  SetParamsForSinCosTan();
+  Run(Cos, std::cos);
+}
+XLA_TEST_P(ExhaustiveOpTest, Sin) {
+  SetParamsForSinCosTan();
+  Run(Sin, std::sin);
+}
+XLA_TEST_P(ExhaustiveOpTest, Tan) {
+  SetParamsForSinCosTan();
+  Run(Tan, std::tan);
+}
+
+// TODO(jlebar): Enable these.
+// XLA_TEST_P(ExhaustiveOpTest, Atan) { Run(Atan, std::atan); }
+// XLA_TEST_P(ExhaustiveOpTest, Atan2) { Run(Atan2, std::atan2); }
+
 XLA_TEST_P(ExhaustiveOpTest, Erf) { Run(Erf, std::erf); }
 XLA_TEST_P(ExhaustiveOpTest, Erfc) { Run(Erfc, std::erfc); }
 XLA_TEST_P(ExhaustiveOpTest, ErfInv) { Run(ErfInv, HostErfInv); }
@@ -595,19 +688,24 @@
   if (platform_ == "CUDA" && (ty_ == F32 || ty_ == F16)) {
     rel_err_ = 0.001;
   }
+  float (*host_lgamma)(float) = std::lgamma;
   if (platform_ != "Host" && platform_ != "CUDA") {
     // TODO(b/123956399): This is a fairly high error, significantly higher than
     // we see on CPU/GPU.
     rel_err_ = 0.01;
     abs_err_ = 0.01;
 
-    // Overflows for to inf for input 4.08500343e+36 (0x7c44af8e).
+    // Overflows to inf for input 4.08500343e+36 (0x7c44af8e).
     if (ty_ == F32) {
-      known_incorrect_begin_ = 0x7c44af8e;
-      known_incorrect_end_ = 0x7c44af8e + 1;
+      host_lgamma = +[](float v) {
+        if (absl::bit_cast<uint32>(v) == 0x7c44af8e) {
+          return std::numeric_limits<float>::infinity();
+        }
+        return std::lgamma(v);
+      };
     }
   }
-  Run(Lgamma, std::lgamma);
+  Run(Lgamma, host_lgamma);
 }
 
 XLA_TEST_P(ExhaustiveOpTest, Round) { Run(Round, std::round); }
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index f4a7309..2d0805c 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -829,7 +829,7 @@
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
 
   const int64 intra_op_parallelism_threads = 24;
   xla::LocalClientOptions client_options;
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index a12fa04..f946ecb 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -210,11 +210,11 @@
     int64 num_replicas, bool use_threads) {
   HloRunner::ReplicatedExecuteOptions options;
   options.num_replicas = num_replicas;
+  options.use_threads = use_threads;
   for (auto argument : arguments) {
     options.arguments.push_back(argument);
   }
-  return test_runner_.ExecuteReplicated(std::move(module), options,
-                                        use_threads);
+  return test_runner_.ExecuteReplicated(std::move(module), options);
 }
 
 StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
@@ -224,11 +224,12 @@
   HloRunner::ReplicatedExecuteOptions options;
   options.num_replicas = num_replicas;
   options.run_hlo_passes = run_hlo_passes;
+  options.use_threads = use_threads;
   for (auto argument : arguments) {
     options.arguments.push_back(argument);
   }
   return test_runner_.ExecuteReplicated(std::move(module), options,
-                                        device_assignment, use_threads);
+                                        device_assignment);
 }
 
 StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 9652788..67a1aba 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -22,7 +22,6 @@
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -41,6 +40,7 @@
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -130,14 +130,14 @@
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(LiteralUtil::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(),
-                                LayoutUtil::MakeLayout({0, 1})));
+  EXPECT_TRUE(Layout::Equal().MinorToMajorOnly()(
+      x_array.on_device_shape().layout(), LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(LiteralUtil::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(),
-                                LayoutUtil::MakeLayout({1, 0})));
+  EXPECT_TRUE(Layout::Equal().MinorToMajorOnly()(
+      y_array.on_device_shape().layout(), LayoutUtil::MakeLayout({1, 0})));
 
   ScopedShapedBuffer result_colmaj =
       ExecuteLocallyOrDie(computation, {&x_array, &y_array});
@@ -171,8 +171,9 @@
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(),
-                                LayoutUtil::MakeLayout({0, 1})));
+  EXPECT_TRUE(Layout::Equal().MinorToMajorOnly()(
+      result_colmaj.on_device_shape().layout(),
+      LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
@@ -183,8 +184,9 @@
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(),
-                                LayoutUtil::MakeLayout({1, 0})));
+  EXPECT_TRUE(Layout::Equal().MinorToMajorOnly()(
+      result_rowmaj.on_device_shape().layout(),
+      LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        ShapedBufferToLiteral(result_rowmaj),
                                        error_spec_);
@@ -900,7 +902,7 @@
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
   auto* transfer_manager =
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 710d8ae..7eaa279 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -35,17 +35,16 @@
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<OwningDeviceMemory> TestAllocator::Allocate(int device_ordinal,
-                                                     uint64 size,
-                                                     bool retry_on_failure) {
+StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
-                                                 retry_on_failure);
+  return se::StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+                                                     retry_on_failure);
 }
 
 Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
@@ -55,7 +54,7 @@
     deallocation_count_++;
     device_deallocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
+  return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
 }
 
 int64 TestAllocator::allocation_count() const {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 4027c7b..292baac 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -24,7 +24,6 @@
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -36,18 +35,19 @@
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
-class TestAllocator : public StreamExecutorMemoryAllocator {
+class TestAllocator : public se::StreamExecutorMemoryAllocator {
  public:
   explicit TestAllocator(se::Platform* platform)
-      : StreamExecutorMemoryAllocator(
+      : se::StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) override;
+  StatusOr<se::OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                            bool retry_on_failure) override;
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
   // Return the number of allocations that have been performed.
diff --git a/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc
index 1513d89..7895895 100644
--- a/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc
@@ -14,35 +14,86 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+// Tests cross-GPU all-reduce operations.
+//
+// This test requires multiple GPUs.  For instructions on running this within
+// Google, see go/multi-gpu-unit-test.
 
 namespace xla {
 namespace {
 
-class MultiDeviceAllReduceTest : public HloTestBase {};
+using ::testing::IsEmpty;
+using ::testing::UnorderedElementsAre;
+
+class MultiDeviceAllReduceTest : public HloTestBase {
+ protected:
+  std::unique_ptr<HloModule> MakeCrsModule(int64 num_elems,
+                                           const HloModuleConfig& config) {
+    const char* kTemplate = R"(
+      HloModule test
+
+      add {
+        x = f32[] parameter(0)
+        y = f32[] parameter(1)
+        add = f32[] add(x, y)
+      }
+
+      ENTRY test_computation {
+        p = f32[NUM_ELEMS] parameter(0)
+        ROOT crs = f32[NUM_ELEMS] all-reduce(p), to_apply=add
+      }
+    )";
+    return ParseHloString(
+               absl::StrReplaceAll(kTemplate,
+                                   {{"NUM_ELEMS", absl::StrCat(num_elems)}}),
+               config)
+        .ValueOrDie();
+  }
+};
+
+// Returns the non-empty subsets of {0, 1, ..., n-1}.  For example,
+// PowerSetOfIota(3) = {{0}, {1}, {2}, {0,1}, {0,2}, {1,2}, {0,1,2}}.
+std::vector<std::vector<int64>> PowerSetOfIota(int64 n) {
+  std::vector<std::vector<int64>> power_set;
+  for (int64 i = 1; i < (1 << n); ++i) {
+    power_set.emplace_back();
+    for (int64 j = 0; j < n; ++j) {
+      if (i & (1 << j)) {
+        power_set.back().push_back(j);
+      }
+    }
+  }
+  return power_set;
+}
+
+// Makes a DeviceAssignment assigning replica-id i to devices[i].
+DeviceAssignment MakeDeviceAssn(std::vector<int64> devices) {
+  DeviceAssignment assn(/*replica_count=*/devices.size(),
+                        /*computation_count=*/1);
+  for (int64 i = 0; i < devices.size(); ++i) {
+    assn(i, 0) = devices[i];
+  }
+  return assn;
+}
+
+// Shorter alias for this function.
+absl::flat_hash_set<int> OpenNcclChannels() {
+  return gpu::NcclAllReduceThunk::DevicesWithOpenNcclChannels();
+}
 
 XLA_TEST_F(MultiDeviceAllReduceTest, TwoReplicasOneOperand) {
-  const char* module_str = R"(
-  HloModule test
-
-  add {
-    x = f32[] parameter(0)
-    y = f32[] parameter(1)
-    add = f32[] add(x, y)
-  }
-
-  ENTRY test_computation {
-    p = f32[3] parameter(0)
-    ROOT crs = f32[3] all-reduce(p), to_apply=add
-  })";
   auto config = GetModuleConfigForTest();
   config.set_replica_count(2);
-  auto module = ParseHloString(module_str, config).ValueOrDie();
+  auto module = MakeCrsModule(/*num_elems=*/3, config);
   auto literal = LiteralUtil::CreateR1<float>({1, 2, 3});
   auto expected = LiteralUtil::CreateR1<float>({2, 4, 6});
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> results,
@@ -52,5 +103,112 @@
   EXPECT_EQ(expected, results[1]);
 }
 
+// Tries all-reduce operations across all 2^kNumDevices - 1 combinations of
+// devices in sequence.
+XLA_TEST_F(MultiDeviceAllReduceTest, AllCombinations) {
+  const int64 kNumDevices = 4;
+  const int64 kNumElems = 1024;
+
+  for (std::vector<int64> devices : PowerSetOfIota(kNumDevices)) {
+    SCOPED_TRACE(absl::StrFormat("Running on devices {%s}",
+                                 absl::StrJoin(devices, ", ")));
+
+    DeviceAssignment device_assn = MakeDeviceAssn(devices);
+
+    auto config = GetModuleConfigForTest();
+    config.set_replica_count(devices.size());
+    config.set_static_device_assignment(device_assn);
+
+    auto module = MakeCrsModule(kNumElems, config);
+
+    std::vector<float> input_vec(kNumElems);
+    absl::c_iota(input_vec, 0);
+    auto input_literal = LiteralUtil::CreateR1<float>(input_vec);
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::vector<Literal> results,
+        ExecuteReplicated(std::move(module), {&input_literal},
+                          /*num_replicas=*/devices.size(), &device_assn,
+                          /*run_hlo_passes=*/true, /*use_threads=*/true));
+  }
+}
+
+// Check that the NCCL data structures in our all-reduce implementation are
+// cached as we expect.
+XLA_TEST_F(MultiDeviceAllReduceTest, NcclChannelCaching) {
+  const int64 kNumElems = 1024;
+
+  std::vector<float> input_vec(kNumElems);
+  absl::c_iota(input_vec, 0);
+  auto input_literal = LiteralUtil::CreateR1<float>(input_vec);
+
+  // Initially no NCCL channels should be open.
+  EXPECT_THAT(OpenNcclChannels(), IsEmpty());
+
+  // Create three Executables, touching devices {0,1}, {1,2}, and {0,1,2}.
+  struct ExecutableInfo {
+    std::unique_ptr<Executable> executable;
+    DeviceAssignment device_assn;
+    HloRunner::ReplicatedExecuteOptions opts;
+  };
+  std::vector<ExecutableInfo> executables;
+  for (const auto& devices :
+       std::vector<std::vector<int64>>{{0, 1}, {1, 2}, {0, 1, 2}}) {
+    executables.emplace_back();
+    auto& e = executables.back();
+
+    e.device_assn = MakeDeviceAssn(devices);
+
+    auto config = GetModuleConfigForTest();
+    config.set_replica_count(devices.size());
+    config.set_static_device_assignment(e.device_assn);
+    auto module = MakeCrsModule(kNumElems, config);
+    e.executable =
+        test_runner_
+            .CreateExecutable(std::move(module), /*run_hlo_passes=*/true)
+            .ValueOrDie();
+
+    e.opts.num_replicas = devices.size();
+    e.opts.use_threads = true;
+    e.opts.arguments.push_back(&input_literal);
+  }
+
+  auto run_executable = [&](int64 i) {
+    auto& e = executables[i];
+    TF_ASSERT_OK(
+        test_runner_
+            .ExecuteReplicated(e.executable.get(), e.opts, &e.device_assn)
+            .status());
+  };
+
+  // Compiling executables above shouldn't cause us to open any channels.
+  EXPECT_THAT(OpenNcclChannels(), IsEmpty());
+
+  // Run the executables and check that channels are opened as we expect.
+  run_executable(0);
+  EXPECT_THAT(OpenNcclChannels(), UnorderedElementsAre(0, 1));
+
+  run_executable(2);
+  EXPECT_THAT(OpenNcclChannels(), UnorderedElementsAre(0, 1, 2));
+
+  run_executable(1);
+  EXPECT_THAT(OpenNcclChannels(), UnorderedElementsAre(0, 1, 2));
+
+  // Tear down the executables and check that channels are closed as we expect.
+  // Note that after we tear down an executable *all* the nccl channels may go
+  // away, so we rerun all of the executables that haven't been torn down.
+  executables[2].executable.reset();
+  run_executable(0);
+  run_executable(1);
+  EXPECT_THAT(OpenNcclChannels(), UnorderedElementsAre(0, 1, 2));
+
+  executables[0].executable.reset();
+  run_executable(1);
+  EXPECT_THAT(OpenNcclChannels(), UnorderedElementsAre(1, 2));
+
+  executables[1].executable.reset();
+  EXPECT_THAT(OpenNcclChannels(), IsEmpty());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 1fcb212..5b3f30a 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -1159,5 +1159,37 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-5, 1e-5}));
 }
 
+XLA_TEST_F(VariadicReduceTest, ReduceMultiOutputVariadicAnd) {
+  absl::string_view hlo_string = R"(
+    HloModule VariadicReduceMultiOutput
+
+    VariadicAnd {
+      value = pred[] parameter(0)
+      value_idx = u32[] parameter(1)
+      current_value = pred[] parameter(2)
+      current_value_idx = u32[] parameter(3)
+      ROOT out = (pred[], u32[]) tuple(value, value_idx)
+    }
+
+    ENTRY CheckBuffer {
+      test_value = f32[] parameter(0)
+      buffer = f32[100] parameter(1)
+      value_broadcast = f32[100] broadcast(test_value), dimensions={}
+      comparison_result = pred[100] compare(buffer, value_broadcast), direction=EQ
+      true_constant = pred[] constant(true)
+
+      zero_idx = u32[] constant(0)
+      idxs = u32[100]{0} iota(), iota_dimension=0
+      out = (pred[], u32[]) reduce(
+         comparison_result, idxs, true_constant, zero_idx
+      ), dimensions={0}, to_apply=VariadicAnd
+
+      ROOT returned = u32[] get-tuple-element(out), index=1
+    }
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-5, 1e-5}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 0dcb1c4..4b3283b 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -84,7 +84,7 @@
                    GetParam().window_strides, GetParam().padding_type, source,
                    ConstantR0<float>(&builder_, 0.0f), add_f32_);
 
-  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-4));
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -199,7 +199,10 @@
         SelectAndScatterTestParam{
             {1, 5, 5}, {1, 5, 5}, Padding::kSame, {3, 1, 1}, {3, 1, 1}},
         SelectAndScatterTestParam{
-            {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}}));
+            {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{{1104}, {551}, Padding::kValid, {3}, {2}},
+        SelectAndScatterTestParam{
+            {1300}, {1171}, Padding::kValid, {130}, {1}}));
 
 // Test for F32 1D array, with a zero-element input.
 XLA_TEST_F(SelectAndScatterTest, R1S0F32) {
diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc
index a9874a9..4241d81 100644
--- a/tensorflow/compiler/xla/tests/test_macros.cc
+++ b/tensorflow/compiler/xla/tests/test_macros.cc
@@ -18,9 +18,8 @@
 #include <fstream>
 #include <streambuf>
 #include <string>
-#include <unordered_map>
 
-#include "absl/strings/ascii.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/logging.h"
@@ -31,7 +30,7 @@
 
 // Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is
 // disabled - a sequence of regexps.
-using ManifestT = std::unordered_map<string, std::vector<string>>;
+using ManifestT = absl::flat_hash_map<string, std::vector<string>>;
 
 ManifestT ReadManifest() {
   ManifestT manifest;
@@ -68,10 +67,21 @@
 
 }  // namespace
 
-string PrependDisabledIfIndicated(const string& test_case_name,
-                                  const string& test_name) {
+std::string PrependDisabledIfIndicated(absl::string_view test_case_name,
+                                       absl::string_view test_name) {
   ManifestT manifest = ReadManifest();
 
+  // If the test name ends with a slash followed by one or more digits, strip
+  // that off; this is just a shard number, and matching on this would be
+  // unstable even if someone wanted to do it.
+  static auto* shard_num_pattern = new RE2(R"(/\d+$)");
+  tensorflow::RegexpStringPiece suffix;
+  if (RE2::PartialMatch(
+          tensorflow::RegexpStringPiece(test_name.data(), test_name.size()),
+          *shard_num_pattern, &suffix)) {
+    test_name.remove_suffix(suffix.size());
+  }
+
   // First try full match: test_case_name.test_name
   // If that fails, try to find just the test_case_name; this would disable all
   // tests in the test case.
@@ -79,7 +89,7 @@
   if (it == manifest.end()) {
     it = manifest.find(test_case_name);
     if (it == manifest.end()) {
-      return test_name;
+      return std::string(test_name);
     }
   }
 
@@ -88,12 +98,12 @@
   string platform_string = XLA_PLATFORM;
   for (const auto& s : disabled_platforms) {
     if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) {
-      return "DISABLED_" + test_name;
+      return absl::StrCat("DISABLED_", test_name);
     }
   }
 
   // We didn't hit in the disabled manifest entries, so don't disable it.
-  return test_name;
+  return std::string(test_name);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 80a6868..9636df2 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -30,6 +30,7 @@
 
 #include <string>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -68,8 +69,8 @@
 // disabled on a particular platform. For a test that should be disabled,
 // returns DISABLED_ prepended to its name; otherwise returns the test name
 // unmodified.
-string PrependDisabledIfIndicated(const string& test_case_name,
-                                  const string& test_name);
+std::string PrependDisabledIfIndicated(absl::string_view test_case_name,
+                                       absl::string_view test_name);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index d6641d2..00b72ce 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -19,7 +19,6 @@
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
@@ -34,6 +33,7 @@
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -117,6 +117,26 @@
   LiteralTestUtil::ExpectR1Equal<float>(test_vector, result);
 }
 
+XLA_TEST_F(TransferManagerTest, TransferR1LargeUnalignedF32) {
+  std::vector<float> test_vector(1025);
+  std::iota(test_vector.begin(), test_vector.end(), 0);
+  Shape shape = ShapeUtil::MakeShape(F32, {1024});
+  BorrowingLiteral literal(reinterpret_cast<const char*>(&test_vector[1]),
+                           shape);
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
+                                                          device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result,
+      transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer));
+
+  std::vector<float> expected_output(1024);
+  std::iota(expected_output.begin(), expected_output.end(), 1);
+  LiteralTestUtil::ExpectR1Equal<float>(expected_output, result);
+}
+
 XLA_TEST_F(TransferManagerTest, TransferR1U8) {
   const char* test_string = "0123456789abcdef";
   Literal literal = LiteralUtil::CreateR1U8(test_string);
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 85212fa..4d80a57 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -1265,7 +1265,7 @@
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 7b7b8f5..b36fc41 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -135,7 +135,7 @@
   LocalService* service = ClientLibrary::GetXlaService(client->platform());
   Backend* backend = service->mutable_backend();
   se::StreamExecutor* executor = backend->default_stream_executor();
-  DeviceMemoryAllocator* allocator = backend->memory_allocator();
+  se::DeviceMemoryAllocator* allocator = backend->memory_allocator();
   auto* transfer_manager = backend->transfer_manager();
   TF_ASSERT_OK_AND_ASSIGN(
       StreamPool::Ptr stream_ptr,
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index d665613..3d443be 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -271,7 +271,7 @@
   // Run the computation num_runs times, and return the result from the last
   // execution.
   const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
-  StreamExecutorMemoryAllocator allocator(
+  se::StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
   absl::optional<ScopedShapedBuffer> final_result;
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index bb8bbf5..732b7f2 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -16,8 +16,10 @@
 #include "tensorflow/compiler/xla/util.h"
 
 #include <stdarg.h>
+
 #include <numeric>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -39,23 +41,41 @@
   return status;
 }
 
-ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
-    : enabled(enabled), label(label) {
+ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
+                                       TimerStats* timer_stats)
+    : enabled(enabled), label(label), timer_stats(timer_stats) {
   if (enabled) {
     start_micros = tensorflow::Env::Default()->NowMicros();
   }
 }
 
-ScopedLoggingTimer::~ScopedLoggingTimer() {
+void ScopedLoggingTimer::StopAndLog() {
   if (enabled) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
     double secs = (end_micros - start_micros) / 1000000.0;
 
+    TimerStats& stats = *timer_stats;
+    tensorflow::mutex_lock lock(stats.stats_mutex);
+    stats.cumulative_secs += secs;
+    if (secs > stats.max_secs) {
+      stats.max_secs = secs;
+    }
+    stats.times_called++;
+
     LOG(INFO) << label << " time: "
-              << tensorflow::strings::HumanReadableElapsedTime(secs);
+              << tensorflow::strings::HumanReadableElapsedTime(secs)
+              << " (cumulative: "
+              << tensorflow::strings::HumanReadableElapsedTime(
+                     stats.cumulative_secs)
+              << ", max: "
+              << tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
+              << ", #called: " << stats.times_called << ")";
+    enabled = false;
   }
 }
 
+ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }
+
 Status AddStatus(Status prior, absl::string_view context) {
   CHECK(!prior.ok());
   return Status{prior.code(),
@@ -91,7 +111,7 @@
   DCHECK(IsPermutation(input_permutation, input_permutation.size()));
   std::vector<int64> output_permutation(input_permutation.size(), -1);
   for (size_t i = 0; i < input_permutation.size(); ++i) {
-    output_permutation[input_permutation[i]] = i;
+    output_permutation.at(input_permutation.at(i)) = i;
   }
   return output_permutation;
 }
@@ -101,7 +121,7 @@
   CHECK_EQ(p1.size(), p2.size());
   std::vector<int64> output;
   for (size_t i = 0; i < p1.size(); ++i) {
-    output.push_back(p1[p2[i]]);
+    output.push_back(p1.at(p2.at(i)));
   }
   return output;
 }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 1754ae0..55b092c 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -40,6 +40,7 @@
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -63,6 +64,8 @@
 // readable form. This differs from base's ElapsedTimer primarily in that it
 // spits out the human-readable duration form.
 //
+// Keeps track of global maximum and cumulative times across all invocations.
+//
 // By default, the timing traces are only printed at VLOG(1) and above:
 //
 //   XLA_SCOPED_LOGGING_TIMER("fooing bar");  // nop if !VLOG_IS_ON(1).
@@ -83,9 +86,17 @@
   XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)
 
 // Helper for macros above.  Don't use directly.
-#define XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)      \
-  ::xla::ScopedLoggingTimer XLA_ScopedLoggingTimerInstance##counter( \
-      label, VLOG_IS_ON(level))
+#define XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)         \
+  static ::xla::TimerStats XLA_TimerStats##counter;                     \
+  ::xla::ScopedLoggingTimer XLA_ScopedLoggingTimerInstance##counter(    \
+      label, /*enabled=*/VLOG_IS_ON(level), &XLA_TimerStats##counter);
+
+struct TimerStats {
+  tensorflow::mutex stats_mutex;
+  double cumulative_secs GUARDED_BY(stats_mutex) = 0;
+  double max_secs GUARDED_BY(stats_mutex) = 0;
+  uint64 times_called GUARDED_BY(stats_mutex) = 0;
+};
 
 // RAII timer for XLA_SCOPED_LOGGING_TIMER and XLA_SCOPED_LOGGING_TIMER_LEVEL
 // macros above.  Recommended usage is via the macros so you don't have to give
@@ -93,12 +104,22 @@
 struct ScopedLoggingTimer {
   // The timer does nothing if enabled is false.  This lets you pass in your
   // file's VLOG_IS_ON value.
-  ScopedLoggingTimer(const string& label, bool enabled);
+  //
+  // timer_stats is an unowned, non-null pointer which is used to populate the
+  // global timer statistics.
+  ScopedLoggingTimer(const std::string& label, bool enabled,
+                     TimerStats* timer_stats);
+
+  // Stop the timer and log the tracked time. Timer is disabled after this
+  // function is called.
+  void StopAndLog();
+
   ~ScopedLoggingTimer();
 
   bool enabled;
   string label;
   uint64 start_micros;
+  TimerStats* timer_stats;
 };
 
 // Given a vector<T>, returns a Span<char> that points at its
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 7dfaa10..4366675 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -215,6 +215,9 @@
     // Generate a step marker at each iteration of the top level while loop,
     // which is assumed to be a training loop.
     STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
+    // Generate a step marker at each iteration of the second level while loops,
+    // which are assumed to be training or eval loops.
+    STEP_MARK_AT_SECOND_LEVEL_WHILE_LOOP = 3;
     // No step marker generated.
     STEP_MARK_NONE = 2;
   }
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 6e5772a..67f76d0 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -71,7 +71,10 @@
   // An opaque type used for passing context-specific data to a custom
   // operation. Shapes of this primitive type will have empty dimensions and
   // tuple_shapes fields.
-  OPAQUE = 14;
+  //
+  // (OPAQUE would be a better name for this identifier, but that conflicts with
+  // a macro defined in windows.h.)
+  OPAQUE_TYPE = 14;
 
   // A token type threaded between side-effecting operations. Shapes of this
   // primitive type will have empty dimensions and tuple_shapes fields.
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index b2718c5..acd984f 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -54,6 +54,7 @@
         "xrt_util.h",
     ],
     deps = [
+        ":xrt_proto",
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:debug_options_flags",
@@ -66,13 +67,13 @@
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:backend",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
diff --git a/tensorflow/compiler/xrt/client/xrt_client.cc b/tensorflow/compiler/xrt/client/xrt_client.cc
index 3c71019..c1f06e9 100644
--- a/tensorflow/compiler/xrt/client/xrt_client.cc
+++ b/tensorflow/compiler/xrt/client/xrt_client.cc
@@ -103,6 +103,50 @@
   return std::make_shared<XrtBuffer>(std::move(buffer_handle), literal.shape());
 }
 
+/*static*/ xla::StatusOr<std::shared_ptr<XrtBuffer>> XrtBuffer::MakeTuple(
+    const std::shared_ptr<XrtContext>& context,
+    const std::vector<std::shared_ptr<XrtBuffer>>& elements) {
+  if (elements.empty()) {
+    return errors::Unimplemented(
+        "The arity zero case of MakeTuple is not implemented.");
+  }
+  int tf_device_id = elements[0]->handle().device_id();
+  xrt::XLATupleNode tuple_description;
+  std::vector<xla::Shape> element_shapes;
+  element_shapes.reserve(elements.size());
+  for (int index = 0; index < elements.size(); ++index) {
+    xrt::XLATupleNode* node = tuple_description.add_tuples();
+    node->set_input_index(index);
+    element_shapes.push_back(elements[index]->shape());
+    if (elements[index]->handle().device_id() != tf_device_id) {
+      return errors::InvalidArgument(
+          "All elements of tuple must be on the same device ( ",
+          elements[index]->handle().device_id(), " vs. ", tf_device_id, ")");
+    }
+  }
+  auto proto = absl::make_unique<TensorProto>();
+  proto->set_dtype(DT_STRING);
+  tuple_description.SerializeToString(proto->add_string_val());
+
+  XrtTensorHandle description_handle =
+      context->tf_context()->SendTensor(std::move(proto), tf_device_id,
+                                        /*host_memory=*/true);
+
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["Ninputs"] = MakeAttrValue(elements.size());
+
+  std::vector<const XrtTensorHandle*> args;
+  args.reserve(elements.size() + 1);
+  args.push_back(&description_handle);
+  for (const auto& element : elements) {
+    args.push_back(&element->handle());
+  }
+  XrtTensorHandle buffer_handle = std::move(context->tf_context()->EnqueueOp(
+      "XRTMakeTuple", args, /*output_arity=*/1, attrs, tf_device_id)[0]);
+  return std::make_shared<XrtBuffer>(
+      std::move(buffer_handle), xla::ShapeUtil::MakeTupleShape(element_shapes));
+}
+
 xla::StatusOr<xla::Literal> XrtBuffer::ToLiteral() const {
   TF_RET_CHECK(handle_.valid());
   XrtTensorHandle literal_handle = std::move(handle_.context()->EnqueueOp(
diff --git a/tensorflow/compiler/xrt/client/xrt_client.h b/tensorflow/compiler/xrt/client/xrt_client.h
index d8db230..c54f156 100644
--- a/tensorflow/compiler/xrt/client/xrt_client.h
+++ b/tensorflow/compiler/xrt/client/xrt_client.h
@@ -52,6 +52,11 @@
       const std::shared_ptr<XrtContext>& context, int xrt_device_ordinal,
       const xla::LiteralSlice& literal);
 
+  // Builds a new XrtBuffer tuple from its constituent parts.
+  static xla::StatusOr<std::shared_ptr<XrtBuffer>> MakeTuple(
+      const std::shared_ptr<XrtContext>& context,
+      const std::vector<std::shared_ptr<XrtBuffer>>& elements);
+
   // Converts an XrtBuffer to an XLA literal, copying the buffer from the remote
   // host. Blocks until the buffer is available.
   xla::StatusOr<xla::Literal> ToLiteral() const;
@@ -62,8 +67,6 @@
   // Destructures a tuple-shaped buffer into its constituent pieces.
   xla::StatusOr<std::vector<std::shared_ptr<XrtBuffer>>> DestructureTuple();
 
-  // TODO(phawkins): add a static method for building tuples of buffers.
-
   // TODO(phawkins): add a mechanism for converting XrtBuffers into remote
   // tensors and vice-versa for TF interoperability.
 
@@ -78,6 +81,7 @@
   XrtBuffer& operator=(XrtBuffer&&) = default;
 
   const XrtTensorHandle& handle() const { return handle_; }
+  const xla::Shape& shape() const { return shape_; }
 
  private:
   // Tensor that contains the XRT allocation ID.
diff --git a/tensorflow/compiler/xrt/client/xrt_client_test.cc b/tensorflow/compiler/xrt/client/xrt_client_test.cc
index 66cda9f..e64c986 100644
--- a/tensorflow/compiler/xrt/client/xrt_client_test.cc
+++ b/tensorflow/compiler/xrt/client/xrt_client_test.cc
@@ -292,6 +292,50 @@
   // constituents.
   buffer->Delete();
 
+  ASSERT_EQ(pieces.size(), 2);
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal a_out, pieces[0]->ToLiteral());
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal b_out, pieces[1]->ToLiteral());
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(a, a_out));
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(b, b_out));
+
+  // Explicitly delete one of the pieces, use RAII to delete the other.
+  pieces[1]->Delete();
+}
+
+TEST_F(XrtClientTest, TupleConstructionAndDestructuring) {
+  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<XrtContext> context, MakeContext());
+
+  // Tests sending a literal to and from the device.
+  xla::Shape a_shape = xla::ShapeUtil::MakeShape(xla::F32, {3, 4, 5});
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal a,
+                          xla::LiteralUtil::CreateRandomLiteral<xla::F32>(
+                              a_shape,
+                              /*mean=*/7.0, /*stddev=*/13.5));
+  TF_ASSERT_OK_AND_ASSIGN(auto a_buffer, XrtBuffer::FromLiteral(context, 0, a));
+
+  xla::Shape b_shape = xla::ShapeUtil::MakeShape(xla::F64, {2, 7});
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal b,
+                          xla::LiteralUtil::CreateRandomLiteral<xla::F64>(
+                              b_shape,
+                              /*mean=*/3.15, /*stddev=*/-2.1));
+  TF_ASSERT_OK_AND_ASSIGN(auto b_buffer, XrtBuffer::FromLiteral(context, 0, b));
+
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal a_in, a_buffer->ToLiteral());
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal b_in, b_buffer->ToLiteral());
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(a, a_in));
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(b, b_in));
+
+  std::vector<std::shared_ptr<XrtBuffer>> elems = {a_buffer, b_buffer};
+  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<XrtBuffer> buffer,
+                          XrtBuffer::MakeTuple(context, elems));
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::shared_ptr<XrtBuffer>> pieces,
+                          buffer->DestructureTuple());
+
+  // Explicitly delete the tuple, which should have no effect on its
+  // constituents.
+  buffer->Delete();
+
+  ASSERT_EQ(pieces.size(), 2);
   TF_ASSERT_OK_AND_ASSIGN(xla::Literal a_out, pieces[0]->ToLiteral());
   TF_ASSERT_OK_AND_ASSIGN(xla::Literal b_out, pieces[1]->ToLiteral());
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(a, a_out));
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 42ef881..d89dc46 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -27,6 +27,7 @@
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
 #include "tensorflow/compiler/xrt/xrt_state.h"
+#include "tensorflow/compiler/xrt/xrt_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -41,6 +42,12 @@
 
 namespace {
 
+struct InputBuffers {
+  std::vector<RefPtr<XRTTupleAllocation>> input_tuples;
+  std::vector<xla::ShapedBuffer> input_allocations;
+  std::vector<xla::ShapedBuffer*> input_pointers;
+};
+
 uint32 InitialRandomSeed() {
   // Support plumbing the TF seed through to XLA is being worked on.
   // If a user wants deterministic behavior, their best option
@@ -64,52 +71,134 @@
   return counter.fetch_add(2);
 }
 
-// Populates `inputs` with the input tensors to the computation.
-Status GetComputationInputs(OpKernelContext* context, ResourceMgr* rm,
-                            bool release_inputs,
-                            std::vector<XRTTupleAllocation*>* input_tuples,
-                            std::vector<xla::ShapedBuffer>* input_allocations,
-                            std::vector<xla::ShapedBuffer*>* input_pointers) {
-  std::vector<int64> input_uids;
-  OpInputList arg_list;
-  TF_RETURN_IF_ERROR(context->input_list("input_handles", &arg_list));
-
-  // Concatenate all input uids from list of scalars-or-vectors carrying them.
-  for (int i = 0; i < arg_list.size(); ++i) {
-    const Tensor& arg = arg_list[i];
-    if (TensorShapeUtils::IsScalar(arg.shape())) {
-      input_uids.push_back(arg.scalar<int64>()());
-    } else {
-      TF_RET_CHECK(TensorShapeUtils::IsVector(arg.shape()));
-      auto arg_vec = arg.vec<int64>();
-      const int64 num_elts = arg.shape().dim_size(0);
-      for (int i = 0; i < num_elts; ++i) {
-        input_uids.push_back(arg_vec(i));
-      }
-    }
-  }
-
-  // Retrieve allocations for the uids.
-  input_tuples->resize(input_uids.size());
-  input_pointers->resize(input_uids.size());
-  for (int i = 0; i < input_uids.size(); ++i) {
-    const int64 input_uid = input_uids[i];
+xla::StatusOr<InputBuffers> GetInputBuffers(
+    ResourceMgr* rm, const std::vector<InputCoords>& input_coords,
+    bool release_inputs) {
+  InputBuffers input_buffers;
+  input_buffers.input_tuples.reserve(input_coords.size());
+  input_buffers.input_allocations.reserve(input_coords.size());
+  input_buffers.input_pointers.reserve(input_coords.size());
+  for (size_t i = 0; i < input_coords.size(); ++i) {
+    XRTTupleAllocation* tuple;
     TF_RETURN_IF_ERROR(
-        XRTTupleAllocation::Lookup(rm, input_uid, &(*input_tuples)[i]));
+        XRTTupleAllocation::Lookup(rm, input_coords[i].handle, &tuple));
+    input_buffers.input_tuples.emplace_back(tuple);
     if (release_inputs) {
       // We are holding a reference to the tuple, so we can safely delete it
       // from the resource manager here.
-      TF_RETURN_IF_ERROR(
-          XRTTupleAllocation::DeleteFromResourceManager(rm, input_uid));
-      VLOG(2) << "Released allocation handle " << input_uid;
+      TF_RETURN_IF_ERROR(XRTTupleAllocation::DeleteFromResourceManager(
+          rm, input_coords[i].handle));
+      VLOG(2) << "Released allocation handle " << input_coords[i].handle;
     }
-    XRTTupleAllocation* tuple = (*input_tuples)[i];
-    input_allocations->emplace_back(tuple->ToShapedBuffer());
+    if (input_coords[i].index.empty()) {
+      input_buffers.input_allocations.emplace_back(tuple->ToShapedBuffer());
+    } else {
+      xla::ShapedBuffer shaped_buffer = tuple->ToShapedBuffer();
+      TF_ASSIGN_OR_RETURN(xla::ShapedBuffer sub_shaped_buffer,
+                          shaped_buffer.SubShapedBuffer(input_coords[i].index));
+      input_buffers.input_allocations.emplace_back(
+          std::move(sub_shaped_buffer));
+    }
   }
-  for (int i = 0; i < input_uids.size(); ++i) {
-    (*input_pointers)[i] = &(*input_allocations)[i];
+  for (size_t i = 0; i < input_buffers.input_allocations.size(); ++i) {
+    input_buffers.input_pointers.push_back(&input_buffers.input_allocations[i]);
   }
-  return Status::OK();
+  return std::move(input_buffers);
+}
+
+// Collects the inputs of chained op `op` from the outputs of earlier ops.
+xla::StatusOr<InputBuffers> GetChainedOpInputs(
+    const xrt::XRTChainedExecuteOp& op, int current_index,
+    absl::Span<const RefPtr<XRTTupleAllocation>> ops_outputs) {
+  InputBuffers input_buffers;
+  input_buffers.input_tuples.reserve(op.inputs_size());
+  input_buffers.input_allocations.reserve(op.inputs_size());
+  input_buffers.input_pointers.reserve(op.inputs_size());
+  for (auto& input : op.inputs()) {
+    if (input.op_index() < 0 || input.op_index() >= current_index) {
+      return errors::InvalidArgument("Input index ", input.op_index(),
+                                     " is not in [0, ", current_index, ")");
+    }
+    input_buffers.input_tuples.emplace_back(ops_outputs[input.op_index()]);
+    // proto3 has no way to query whether a field was explicitly set, so the
+    // default output_index (zero) means "no sub-index". As a consequence, the
+    // real sub-index is output_index - 1.
+    if (input.output_index() == 0) {
+      input_buffers.input_allocations.emplace_back(
+          input_buffers.input_tuples.back()->ToShapedBuffer());
+    } else {
+      xla::ShapedBuffer shaped_buffer =
+          input_buffers.input_tuples.back()->ToShapedBuffer();
+      TF_ASSIGN_OR_RETURN(
+          xla::ShapedBuffer sub_shaped_buffer,
+          shaped_buffer.SubShapedBuffer({input.output_index() - 1}));
+      input_buffers.input_allocations.emplace_back(
+          std::move(sub_shaped_buffer));
+    }
+  }
+  for (xla::ShapedBuffer& allocation : input_buffers.input_allocations) {
+    input_buffers.input_pointers.push_back(&allocation);
+  }
+  return std::move(input_buffers);
+}
+
+xla::StatusOr<RefPtr<XRTTupleAllocation>> ExecuteComputation(
+    OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref,
+    xla::LocalExecutable* executable, const InputBuffers& input_buffers,
+    se::Stream* stream, int rng_seed) {
+  VLOG(2) << "Executing computation.";
+  xla::ExecutableRunOptions options;
+  options.set_stream(stream);
+  options.set_allocator(device_ref->backend()->memory_allocator());
+  options.set_intra_op_thread_pool(&context->eigen_cpu_device());
+  options.set_rng_seed(rng_seed);
+
+  // Time the run so the elapsed wall time can be logged at VLOG(2).
+  Env* const env = Env::Default();
+  const auto run_begin_us = env->NowMicros();
+  TF_ASSIGN_OR_RETURN(xla::ScopedShapedBuffer run_result,
+                      executable->Run(input_buffers.input_pointers, options));
+  const auto elapsed_us = env->NowMicros() - run_begin_us;
+  VLOG(2) << "Elapsed time: " << elapsed_us << "us";
+
+  // Hand the released result buffers to a new tuple allocation.
+  auto result_buffer = run_result.release();
+  XRTTupleAllocation* output_tuple;
+  TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
+      result_buffer, device_ref->backend(), device_ref->device_ordinal(),
+      &output_tuple));
+  RefPtr<XRTTupleAllocation> output_tuple_ptr(output_tuple);
+
+  // With input/output aliasing, the buffer returned by Run() may have holes
+  // that must be filled from the aliased input tuples' buffers.
+  const auto& alias_config =
+      executable->executable()->module().input_output_alias_config();
+  auto fill_alias =
+      [&](const xla::ShapeIndex& output_index,
+          const xla::HloInputOutputAliasConfig::Alias& alias) -> Status {
+    TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size());
+    if (alias.kind != xla::HloInputOutputAliasConfig::AliasKind::kUserAlias) {
+      return Status::OK();
+    }
+    return output_tuple->AliasBufferFrom(
+        *input_buffers.input_tuples[alias.parameter_number],
+        alias.parameter_index, output_index);
+  };
+  TF_RETURN_IF_ERROR(alias_config.ForEachAliasWithStatus(fill_alias));
+
+  return std::move(output_tuple_ptr);
+}
+
+xla::StatusOr<RefPtr<XRTTupleAllocation>> ExecuteComputation(
+    OpKernelContext* context, ResourceMgr* rm,
+    XRTGenericDeviceAccessor::ScopedRef* device_ref,
+    xla::LocalExecutable* executable,
+    const std::vector<InputCoords>& input_coords, bool release_inputs,
+    se::Stream* stream, int rng_seed) {
+  TF_ASSIGN_OR_RETURN(InputBuffers input_buffers,
+                      GetInputBuffers(rm, input_coords, release_inputs));
+  return ExecuteComputation(context, device_ref, executable, input_buffers,
+                            stream, rng_seed);
 }
 
 // XRTExecuteOp
@@ -162,31 +251,6 @@
       rm->default_container(), kXRTCompilationCacheResourceName, &cache));
   core::ScopedUnref cache_unref(cache);
 
-  std::unique_ptr<XRTCompilationCacheEntryRef> entry;
-  TF_RETURN_IF_ERROR(cache->Lookup(compilation_handle, &entry));
-
-  if (release_compilation) {
-    // Process-wide cache of XLA executables.
-    TF_RETURN_IF_ERROR(cache->Release(compilation_handle));
-    VLOG(2) << "Released compilation handle " << compilation_handle;
-  }
-
-  std::vector<XRTTupleAllocation*> input_tuples;
-  // Make a cleanup method so that we can safely return in error conditions
-  // without leaking references to allocations.
-  auto buffer_releaser = gtl::MakeCleanup([&input_tuples]() {
-    for (auto tuple : input_tuples) {
-      if (tuple != nullptr) {
-        tuple->Unref();
-      }
-    }
-  });
-  std::vector<xla::ShapedBuffer> input_allocations;
-  std::vector<xla::ShapedBuffer*> input_pointers;
-  TF_RETURN_IF_ERROR(GetComputationInputs(context, rm, release_inputs,
-                                          &input_tuples, &input_allocations,
-                                          &input_pointers));
-
   // We are guaranteed that the underlying device object won't be deleted out
   // from under us, while the ScopedRef is live.
   class XRTGenericDeviceAccessor::ScopedRef device_ref;
@@ -201,86 +265,107 @@
   se::Stream* stream = context->op_device_context()
                            ? context->op_device_context()->stream()
                            : nullptr;
+  TF_ASSIGN_OR_RETURN(std::vector<InputCoords> input_coords,
+                      GetComputationInputs(context, rm, "input_handles"));
 
-  // Execute the computation.
-  VLOG(2) << "Executing computation.";
-  xla::ExecutableRunOptions run_options;
-  run_options.set_stream(stream);
-  run_options.set_allocator(device_ref.backend()->memory_allocator());
-  run_options.set_intra_op_thread_pool(&context->eigen_cpu_device());
-  run_options.set_rng_seed(rng_seed);
-
-  Env* env = Env::Default();
-  auto start_time = env->NowMicros();
-
+  std::unique_ptr<XRTCompilationCacheEntryRef> entry;
+  TF_RETURN_IF_ERROR(cache->Lookup(compilation_handle, &entry));
   xla::LocalExecutable* executable = entry->get().get_executable();
-  auto run_result = executable->Run(input_pointers, run_options);
-  if (!run_result.ok()) {
-    return run_result.status();
+  if (release_compilation) {
+    // Process-wide cache of XLA executables.
+    TF_RETURN_IF_ERROR(cache->Release(compilation_handle));
+    VLOG(2) << "Released compilation handle " << compilation_handle;
   }
 
-  auto elapsed = env->NowMicros() - start_time;
-  VLOG(2) << "Elapsed time: " << elapsed << "us";
+  TF_ASSIGN_OR_RETURN(
+      RefPtr<XRTTupleAllocation> output_tuple,
+      ExecuteComputation(context, rm, &device_ref, executable, input_coords,
+                         release_inputs, stream, rng_seed));
 
-  auto scoped_buffer = run_result.ConsumeValueOrDie();
-  auto shaped_buffer = scoped_buffer.release();
-  XRTTupleAllocation* output_tuple;
-  TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
-      shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
-      &output_tuple));
-
-  // The ScopedShapedBuffer returned by the executable Run() API, in case of
-  // input/output buffer aliasing, might have holes in it, which need to be
-  // filled using the proper input tuples buffers which are the source of
-  // aliasing.
-  const xla::HloInputOutputAliasConfig& input_output_alias =
-      executable->executable()->module().input_output_alias_config();
-  auto alias_function =
-      [&](const xla::ShapeIndex& output_index,
-          const xla::HloInputOutputAliasConfig::Alias& alias) -> Status {
-    TF_RET_CHECK(alias.parameter_number < input_tuples.size());
-    return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias
-               ? output_tuple->AliasBufferFrom(
-                     *input_tuples[alias.parameter_number],
-                     alias.parameter_index, output_index)
-               : Status::OK();
-  };
-  TF_RETURN_IF_ERROR(input_output_alias.ForEachAliasWithStatus(alias_function));
-
-  if (config_proto.return_exploded_tuple() &&
-      output_tuple->on_device_shape().IsTuple()) {
-    int64 tuple_element_count =
-        xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
-    Tensor* output_tensor;
-    TF_RETURN_IF_ERROR(context->allocate_output(
-        0, TensorShape({tuple_element_count}), &output_tensor));
-
-    for (int64 i = 0; i < tuple_element_count; ++i) {
-      xla::ShapeIndex shape_index;
-      shape_index.push_back(i);
-
-      XRTTupleAllocation* suballocation;
-      TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
-          output_tuple, shape_index, &suballocation,
-          /*alias_parent_allocation=*/false));
-      int64 key;
-      TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key));
-      output_tensor->vec<int64>()(i) = key;
-    }
-    output_tuple->Unref();
-  } else {
-    Tensor* output_tensor;
-    TF_RETURN_IF_ERROR(
-        context->allocate_output(0, TensorShape({}), &output_tensor));
-    int64 key;
-    TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
-    output_tensor->scalar<int64>()() = key;
-  }
-  return Status::OK();
+  return CreateExecuteOutput(context, rm, std::move(output_tuple),
+                             config_proto.return_exploded_tuple());
 }
 
 XRTExecuteOp::~XRTExecuteOp() = default;
 
+class XRTExecuteChainedOp : public AsyncOpKernel {
+ public:
+  explicit XRTExecuteChainedOp(OpKernelConstruction* context);
+  ~XRTExecuteChainedOp() override;
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override;
+
+ private:
+  Status DoWork(OpKernelContext* context);
+};
+
+XRTExecuteChainedOp::XRTExecuteChainedOp(OpKernelConstruction* context)
+    : AsyncOpKernel(context) {}
+
+void XRTExecuteChainedOp::ComputeAsync(OpKernelContext* context,
+                                       DoneCallback done) {
+  // Schedule onto the default queue, for unbounded concurrency. See b/73520706
+  Env::Default()->SchedClosure([this, context, done]() {
+    OP_REQUIRES_OK_ASYNC(context, DoWork(context), done);
+    done();
+  });
+}
+
+Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) {
+  VLOG(1) << "XRTExecuteChainedOp::Compute";
+  ResourceMgr* rm;
+  TF_RETURN_IF_ERROR(
+      XRTGenericDeviceAccessor::GetResourceManager(context, &rm));
+
+  const Tensor& execution_plan = context->input(0);
+  TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_plan.shape()));
+  xrt::XRTChainedExecutePlan plan;
+  TF_RET_CHECK(plan.ParseFromString(execution_plan.scalar<string>()()));
+
+  const Tensor& execution_config = context->input(1);
+  TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape()));
+  xrt::XRTChainedExecuteConfig config;
+  TF_RET_CHECK(config.ParseFromString(execution_config.scalar<string>()()));
+
+  XRTCompilationCache* cache;
+  TF_RETURN_IF_ERROR(rm->Lookup<XRTCompilationCache>(
+      rm->default_container(), kXRTCompilationCacheResourceName, &cache));
+  core::ScopedUnref cache_unref(cache);
+
+  // We are guaranteed that the underlying device object won't be deleted out
+  // from under us, while the ScopedRef is live.
+  class XRTGenericDeviceAccessor::ScopedRef device_ref;
+  TF_RETURN_IF_ERROR(
+      XRTGenericDeviceAccessor::InitScopedRef(context, 0, &device_ref));
+
+  int rng_seed = config.rng_seed();
+  if (rng_seed == 0) {
+    rng_seed = GetXLARandomSeed();
+  }
+
+  se::Stream* stream = context->op_device_context()
+                           ? context->op_device_context()->stream()
+                           : nullptr;
+  auto execute_op =
+      [&](const xrt::XRTChainedExecuteOp& op, int current_index,
+          absl::Span<const RefPtr<XRTTupleAllocation>> ops_outputs)
+      -> xla::StatusOr<RefPtr<XRTTupleAllocation>> {
+    TF_ASSIGN_OR_RETURN(InputBuffers input_buffers,
+                        GetChainedOpInputs(op, current_index, ops_outputs));
+
+    std::unique_ptr<XRTCompilationCacheEntryRef> entry;
+    TF_RETURN_IF_ERROR(cache->Lookup(op.computation_handle(), &entry));
+    xla::LocalExecutable* executable = entry->get().get_executable();
+
+    return ExecuteComputation(context, &device_ref, executable, input_buffers,
+                              stream, rng_seed);
+  };
+
+  return ExecuteChained(context, rm, plan, config, execute_op);
+}
+
+XRTExecuteChainedOp::~XRTExecuteChainedOp() = default;
+
 }  // namespace
 
 REGISTER_KERNEL_BUILDER(Name("XRTExecute")
@@ -299,4 +384,18 @@
                             .HostMemory("output_handle"),
                         XRTExecuteOp);
 
+REGISTER_KERNEL_BUILDER(Name("XRTExecuteChained")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("execution_plan")
+                            .HostMemory("execution_config")
+                            .HostMemory("output_handle"),
+                        XRTExecuteChainedOp);
+
+REGISTER_KERNEL_BUILDER(Name("XRTExecuteChained")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("execution_plan")
+                            .HostMemory("execution_config")
+                            .HostMemory("output_handle"),
+                        XRTExecuteChainedOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
index 4f59fcc..a52b2a7 100644
--- a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
@@ -50,4 +50,22 @@
 'Ninputs' is the number of input handles.
 )");
 
+REGISTER_OP("XRTExecuteChained")
+    .Input("execution_plan: string")
+    .Input("execution_config: string")
+    .Output("output_handle: int64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return tensorflow::shape_inference::UnknownShape(c);
+    })
+    .Doc(
+        R"(
+Runs a sequence of previously-compiled computations on a core.
+The 'execution_plan' input is a serialized xrt::XRTChainedExecutePlan proto
+describing the post-order of the chained execution.
+The 'execution_config' input is a serialized xrt::XRTChainedExecuteConfig
+proto describing the configuration for the chained execution operation.
+Returns one or more int64 handles to the XRT device data generated by the
+chained execution.
+)");
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index f9e21d4..bc07ae3 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -167,6 +167,18 @@
   return builder.Build().ValueOrDie();
 }
 
+// Builds the computation (P0 - P1) * 11 over two F32[2] parameters.
+xla::XlaComputation SubAndScale() {
+  xla::XlaBuilder builder("SubAndScale");
+  const xla::Shape operand_shape = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  auto lhs = xla::Parameter(&builder, 0, operand_shape, "P0");
+  auto rhs = xla::Parameter(&builder, 1, operand_shape, "P1");
+  auto difference = xla::Sub(lhs, rhs);
+  auto scale = xla::ConstantR0<float>(&builder, 11.0f);
+  xla::Mul(difference, scale);
+  return builder.Build().ValueOrDie();
+}
+
 xla::XlaComputation Dot() {
   xla::XlaBuilder builder("Dot");
   auto p0 = xla::Parameter(
@@ -378,7 +390,6 @@
   xla::LiteralProto response;
   EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
   EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response));
-  outputs.clear();
 
   xla::LiteralProto new_literal =
       xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto();
@@ -390,7 +401,6 @@
   TF_EXPECT_OK(session.Run({write_op}, &outputs));
   EXPECT_EQ(outputs.size(), 1);
   EXPECT_EQ(allocation_handle, outputs[0].scalar<int64>()());
-  outputs.clear();
 
   auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle));
   TF_EXPECT_OK(session.Run({read_after_write}, &outputs));
@@ -438,7 +448,6 @@
   release_tensor.flat<int64>()(1) = allocation_handle2;
 
   auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
-  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
                            &outputs));
 }
@@ -467,13 +476,7 @@
           .ToProto();
   StoreComputationSnapshot(AddAndTuple(), c2.mutable_hlo_snapshot());
 
-  xrt::XRTExecutionConfig e;
-  e.set_release_input_handles(true);
-  e.set_release_compilation_handle(false);
-
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
-  auto e_config =
-      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
   auto computation1 =
       ops::Const(root.WithDevice("/device:CPU:0"), c1.SerializeAsString());
   auto c_handle1 = ops::XRTCompile(root, computation1);
@@ -495,7 +498,6 @@
   release_tensor.flat<int64>()(1) = compilation_handle2;
 
   auto release = ops::XRTReleaseCompilationHandle(root, release_tensor);
-  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
                            &outputs));
 }
@@ -520,7 +522,6 @@
 
   auto clear_all = ops::XRTReleaseAllAllocations(root);
 
-  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {},
                            {clear_all}, &outputs));
   EXPECT_EQ(outputs.size(), 0);
@@ -686,6 +687,196 @@
   EXPECT_TRUE(CompareLiteralProtos(response_1, expected_1));
 }
 
+// Op-by-op baseline: feeds two XRTExecute results into a third XRTExecute.
+TEST(RawApiTest, ExecuteChainedOpByOp) {
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+
+  auto make_computation = [](const std::function<xla::XlaComputation()>& fn) {
+    xrt::XLAComputation c;
+    auto config = c.mutable_config();
+    auto shapes = config->mutable_program_shape();
+    *shapes->add_parameters() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    *shapes->add_parameters() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    *shapes->mutable_result() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    StoreComputationSnapshot(fn(), c.mutable_hlo_snapshot());
+    return c.SerializeAsString();
+  };
+
+  auto c_add_scale = make_computation(AddAndScale);
+  auto c_sub_scale = make_computation(SubAndScale);
+
+  auto c_add_scale_op = ops::XRTCompile(
+      root, ops::Const(root.WithDevice("/device:CPU:0"), c_add_scale));
+  auto c_sub_scale_op = ops::XRTCompile(
+      root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale));
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(
+      session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs));
+  ASSERT_EQ(outputs.size(), 2);
+
+  int64 c_add_scale_handle = outputs[0].scalar<int64>()();
+  int64 c_sub_scale_handle = outputs[1].scalar<int64>()();
+
+  xrt::XLAAllocation p0;
+  *p0.mutable_value() = FloatVector({1.0f, 2.0f});
+  xrt::XLAAllocation p1;
+  *p1.mutable_value() = FloatVector({8.0f, 5.0f});
+
+  auto p0_handle = ops::XRTAllocate(
+      root,
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()));
+  auto p1_handle = ops::XRTAllocate(
+      root,
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()));
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(false);
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto result0 = ops::XRTExecute(root, Input(c_add_scale_handle), e_config,
+                                 {Output(p0_handle), Output(p1_handle)});
+  auto result1 = ops::XRTExecute(root, Input(c_sub_scale_handle), e_config,
+                                 {Output(p0_handle), Output(p1_handle)});
+  auto result = ops::XRTExecute(root, Input(c_add_scale_handle), e_config,
+                                {result0.output_handle, result1.output_handle});
+  auto read_back = ops::XRTReadLiteralAndRelease(root, result);
+  TF_ASSERT_OK(root.status());
+
+  TF_ASSERT_OK(session.Run({read_back}, &outputs));
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected = xla::LiteralUtil::CreateR1<float>({-150.0f, -36.0f});
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+}
+
+// Same dataflow as ExecuteChainedOpByOp, issued as one XRTExecuteChained plan.
+TEST(RawApiTest, ExecuteChained) {
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+
+  auto make_computation = [](const std::function<xla::XlaComputation()>& fn) {
+    xrt::XLAComputation c;
+    auto config = c.mutable_config();
+    auto shapes = config->mutable_program_shape();
+    *shapes->add_parameters() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    *shapes->add_parameters() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    *shapes->mutable_result() =
+        xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+    StoreComputationSnapshot(fn(), c.mutable_hlo_snapshot());
+    return c.SerializeAsString();
+  };
+
+  auto c_add_scale = make_computation(AddAndScale);
+  auto c_sub_scale = make_computation(SubAndScale);
+
+  auto c_add_scale_op = ops::XRTCompile(
+      root, ops::Const(root.WithDevice("/device:CPU:0"), c_add_scale));
+  auto c_sub_scale_op = ops::XRTCompile(
+      root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale));
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(
+      session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs));
+  ASSERT_EQ(outputs.size(), 2);
+
+  int64 c_add_scale_handle = outputs[0].scalar<int64>()();
+  int64 c_sub_scale_handle = outputs[1].scalar<int64>()();
+
+  xrt::XLAAllocation p0;
+  *p0.mutable_value() = FloatVector({1.0f, 2.0f});
+  xrt::XLAAllocation p1;
+  *p1.mutable_value() = FloatVector({8.0f, 5.0f});
+
+  auto p0_handle_op = ops::XRTAllocate(
+      root,
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()));
+  auto p1_handle_op = ops::XRTAllocate(
+      root,
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()));
+
+  TF_ASSERT_OK(session.Run({p0_handle_op, p1_handle_op}, &outputs));
+  ASSERT_EQ(outputs.size(), 2);
+
+  int64 p0_handle = outputs[0].scalar<int64>()();
+  int64 p1_handle = outputs[1].scalar<int64>()();
+
+  xrt::XRTChainedExecuteConfig config;
+  auto config_const =
+      ops::Const(root.WithDevice("/device:CPU:0"), config.SerializeAsString());
+
+  xrt::XRTChainedExecutePlan plan;
+  xrt::XRTChainedExecuteOp* op;
+  xrt::XRTChainedExecuteOp::Input* input;
+  xrt::XRTChainedExecuteOp::Output* output;
+
+  // Index 0
+  op = plan.add_ops();
+  op->set_data_handle(p0_handle);
+
+  // Index 1
+  op = plan.add_ops();
+  op->set_data_handle(p1_handle);
+
+  // Index 2
+  op = plan.add_ops();
+  op->set_computation_handle(c_add_scale_handle);
+  input = op->add_inputs();
+  input->set_op_index(0);
+  input = op->add_inputs();
+  input->set_op_index(1);
+
+  // Index 3
+  op = plan.add_ops();
+  op->set_computation_handle(c_sub_scale_handle);
+  input = op->add_inputs();
+  input->set_op_index(0);
+  input = op->add_inputs();
+  input->set_op_index(1);
+
+  // Index 4
+  op = plan.add_ops();
+  op->set_computation_handle(c_add_scale_handle);
+  input = op->add_inputs();
+  input->set_op_index(2);
+  input = op->add_inputs();
+  input->set_op_index(3);
+  output = op->add_outputs();
+  output->set_result_index(0);
+
+  auto plan_const =
+      ops::Const(root.WithDevice("/device:CPU:0"), plan.SerializeAsString());
+  auto result = ops::XRTExecuteChained(root, plan_const, config_const);
+  TF_ASSERT_OK(root.status());
+
+  TF_ASSERT_OK(session.Run({result}, &outputs));
+  ASSERT_EQ(outputs.size(), 1);
+  auto handles_vec = outputs[0].vec<int64>();
+  ASSERT_EQ(handles_vec.size(), 1);
+
+  auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(0)));
+  TF_ASSERT_OK(root.status());
+
+  TF_ASSERT_OK(session.Run({read_back}, &outputs));
+  ASSERT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected = xla::LiteralUtil::CreateR1<float>({-150.0f, -36.0f});
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+}
+
 TEST(RawApiTest, CompileAndExecute) {
   xrt::XLAAllocation p0;
   *p0.mutable_value() = FloatVector({1.0f, 2.0f});
@@ -1146,7 +1337,6 @@
       root.WithControlDependencies(read_back), result);
   TF_ASSERT_OK(root.status());
 
-  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_back},
                            {release}, &outputs));
 
@@ -1165,7 +1355,6 @@
       root.WithControlDependencies(read_handle), Input(alloc_handle));
   TF_ASSERT_OK(root.status());
 
-  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_handle},
                            {release_handle}, &outputs));
 
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index 84adee7..a598b80 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -3,9 +3,9 @@
 package xrt;
 
 import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
+import "tensorflow/compiler/xla/service/hlo.proto";
 import "tensorflow/compiler/xla/xla.proto";
 import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
 
 message DeviceAssignment {
   message ComputationDevice {
@@ -106,3 +106,61 @@
   // allocations, one for each of the first-level elements of the result tuple.
   bool return_exploded_tuple = 7;
 }
+
+message XRTChainedExecuteConfig {
+  // If non-zero, rng_seed to reset the core with.
+  uint32 rng_seed = 1;
+  // Which model-parallel computation to run from the compiled bundle.
+  int32 core_index_in_replica = 2;
+  // Optional key to disambiguate between executions. This is only needed if
+  // multiple host send/recvs may be outstanding concurrently with executions.
+  string execution_instance_key = 3;
+}
+
+// A single chained execute operation. An operation can either be a device data
+// load, or an existing (as in, previously compiled and accessible via its int64
+// handle) XLA computation execution.
+message XRTChainedExecuteOp {
+  // Represents an input for this operation.
+  message Input {
+    // The index within the XRTChainedExecutePlan.ops post-order of the source
+    // operation for this input.
+    int64 op_index = 1;
+    // The output index of the value generated by the operation at op_index.
+    // Zero (default value) means no index ({}) while if an indexing is
+    // required, output_index needs to be set to index+1.
+    // Thanks proto3!
+    int64 output_index = 2;
+  }
+  // Represents an output of the XRTChainedExecute operation, which should
+  // originate from the output of this operation.
+  message Output {
+    // The index in the value generated by this operation, which should be
+    // forwarded as XRTChainedExecute output. If output_index is zero (default
+    // value) the whole output will be used as result. This means that if the
+    // output shape is a tuple, the result will be the full tuple. Otherwise the
+    // real sub-tuple index will be output_index - 1.
+    int64 output_index = 1;
+    // The index in the vector of the results returned by the XRTChainedExecute
+    // operation, where this output should be forwarded.
+    int64 result_index = 2;
+  }
+
+  oneof op_oneof {
+    // The handle to an existing XRT device data.
+    int64 data_handle = 1;
+    // The handle to an existing XRT compiled computation.
+    int64 computation_handle = 2;
+  }
+  // The outputs of this XRTChainedExecuteOp operation.
+  repeated Output outputs = 3;
+  // The inputs of this XRTChainedExecuteOp operation. If data_handle is set,
+  // there are no inputs.
+  repeated Input inputs = 4;
+}
+
+// Execution plan for the XRTChainedExecute operation.
+message XRTChainedExecutePlan {
+  // The post order with the XRT computations to be executed.
+  repeated XRTChainedExecuteOp ops = 1;
+}
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 1b3bcbe..07abd60 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/compiler/xrt/xrt_state.h"
 
 #include <stdint.h>
+
 #include <map>
 #include <memory>
 #include <string>
@@ -73,8 +74,11 @@
 const char* kTupleContainer = "tuples";
 
 int64 get_uid() {
-  uint64 unsigned_rand = random::New64() & INT64_MAX;
-  return static_cast<int64>(unsigned_rand);
+  int64 uid;
+  do {
+    uid = random::New64() & INT64_MAX;
+  } while (uid == XRTTupleAllocation::InvalidKey());
+  return uid;
 }
 
 BufferAllocStats* GetAllocStats() {
@@ -113,7 +117,7 @@
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
     TF_ASSIGN_OR_RETURN(
-        xla::OwningDeviceMemory buffer,
+        se::OwningDeviceMemory buffer,
         allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
     index_to_buffer.second = buffer.Forget();
@@ -131,7 +135,7 @@
 
 XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          int device_ordinal,
-                                         xla::DeviceMemoryAllocator* allocator)
+                                         se::DeviceMemoryAllocator* allocator)
     : size_(allocation.size()),
       allocation_(allocation),
       device_ordinal_(device_ordinal),
@@ -165,7 +169,7 @@
 }
 
 XRTTupleAllocation::XRTTupleAllocation(int device_ordinal,
-                                       xla::DeviceMemoryAllocator* allocator,
+                                       se::DeviceMemoryAllocator* allocator,
                                        const xla::Shape& on_host_shape,
                                        const xla::Shape& on_device_shape)
     : device_ordinal_(device_ordinal),
@@ -338,7 +342,7 @@
 
 /* static */ Status XRTTupleAllocation::ExpandTreeOfTuples(
     const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-    xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+    se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
     xla::Shape* device_shape) {
   // Initialize both host and device shape to be the 'spine' of the new tuple
   // shape, given by the shape of the tree of tuples.
@@ -411,7 +415,7 @@
           xla::Shape subshape =
               xla::ShapeUtil::GetSubshape(device_shape, index);
           uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
-          TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+          TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                               allocator->Allocate(device_ordinal, size,
                                                   /*retry_on_failure=*/false));
           VLOG(2) << "Allocated buffer at " << buffer.opaque() << " index "
@@ -498,7 +502,7 @@
 
 void XRTTupleAllocation::InitializeFromShapedBuffer(
     const xla::ShapedBuffer& shaped_buffer,
-    xla::DeviceMemoryAllocator* allocator, int device_ordinal) {
+    se::DeviceMemoryAllocator* allocator, int device_ordinal) {
   for (auto& buffer : buffers_) {
     // Make a reference-counted version of the allocated buffer.
     buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first),
@@ -545,7 +549,7 @@
     if (!release_checker(buffer.first)) {
       *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
     } else {
-      *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
+      *shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory(
           buffer.second->allocation(), device_ordinal_, allocator_);
       DiscardAllocation(buffer.first);
     }
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 6519da3..0cc0d3d 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -25,7 +25,6 @@
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -34,6 +33,7 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace tensorflow {
@@ -45,8 +45,7 @@
 class XRTBufferAllocation : public core::RefCounted {
  public:
   XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
-                      int device_ordinal,
-                      xla::DeviceMemoryAllocator* allocator);
+                      int device_ordinal, se::DeviceMemoryAllocator* allocator);
   ~XRTBufferAllocation() override;
 
   // The region of device memory being wrapped.
@@ -69,7 +68,7 @@
   uint64 size_ = 0;
   se::DeviceMemoryBase allocation_;
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 };
 
 // Entry in the resource manager corresponding to an allocation handle returned
@@ -141,6 +140,10 @@
   // manager.
   static Status ReleaseAllAllocations(ResourceMgr* rm);
 
+  // Returns the invalid key value, which is never generated by the Intern()
+  // API.
+  static int64 InvalidKey() { return 0; }
+
   // Adds the allocation to a ResourceMgr and returns the key that will be used
   // to retrieve it. Transfers a reference on *this to rm.
   Status Intern(ResourceMgr* rm, int64* key);
@@ -193,14 +196,14 @@
 
  private:
   // Creates a new handle with (tuple) shape.
-  XRTTupleAllocation(int device_ordinal, xla::DeviceMemoryAllocator* allocator,
+  XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator,
                      const xla::Shape& on_host_shape,
                      const xla::Shape& on_device_shape);
 
   // Inherits the allocations represented in buffer, which must have the same
   // shape as buffers_.
   void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer,
-                                  xla::DeviceMemoryAllocator* allocator,
+                                  se::DeviceMemoryAllocator* allocator,
                                   int device_ordinal);
 
   // Takes a tree 'elements' where each leaf is an allocation, validates that
@@ -210,12 +213,12 @@
   // grafted on.
   static Status ExpandTreeOfTuples(
       const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-      xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+      se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
       xla::Shape* device_shape);
 
   // Location of the memory that is being managed.
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 
   // The shape that the caller thinks the tuple has.
   const xla::Shape on_host_shape_;
diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc
index 8b7749b..518c993 100644
--- a/tensorflow/compiler/xrt/xrt_util.cc
+++ b/tensorflow/compiler/xrt/xrt_util.cc
@@ -47,6 +47,20 @@
   return string();
 }
 
+// Sets *result to the whole output (index == 0) or to its element index - 1.
+Status MakeOutput(const RefPtr<XRTTupleAllocation>& output, int64 index,
+                  RefPtr<XRTTupleAllocation>* result) {
+  if (index == 0) {
+    *result = output;
+    return Status::OK();
+  }
+  XRTTupleAllocation* sub_tuple;
+  TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+      output.get(), {index - 1}, &sub_tuple, /*alias_parent_allocation=*/true));
+  result->reset(sub_tuple);
+  return Status::OK();
+}
+
 }  // namespace
 
 xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) {
@@ -66,4 +80,122 @@
   return options;
 }
 
+xla::StatusOr<std::vector<InputCoords>> GetComputationInputs(
+    OpKernelContext* context, ResourceMgr* rm, const char* input_name) {
+  OpInputList arg_list;
+  TF_RETURN_IF_ERROR(context->input_list(input_name, &arg_list));
+  // Concatenate all input uids from list of scalars-or-vectors carrying them.
+  std::vector<InputCoords> input_coords;
+  for (int i = 0; i < arg_list.size(); ++i) {
+    const Tensor& arg = arg_list[i];
+    if (TensorShapeUtils::IsScalar(arg.shape())) {
+      input_coords.emplace_back(arg.scalar<int64>()());
+    } else {
+      TF_RET_CHECK(TensorShapeUtils::IsVector(arg.shape()));
+      auto arg_vec = arg.vec<int64>();
+      // 64-bit index matching dim_size(), and not shadowing the outer `i`.
+      for (int64 j = 0, n = arg.shape().dim_size(0); j < n; ++j) {
+        input_coords.emplace_back(arg_vec(j));
+      }
+    }
+  }
+  return std::move(input_coords);
+}
+
+Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm,
+                           RefPtr<XRTTupleAllocation> output_tuple,
+                           bool return_exploded_tuple) {
+  Tensor* handles;
+  if (!return_exploded_tuple || !output_tuple->on_host_shape().IsTuple()) {
+    // Single result: emit one scalar handle for the whole allocation.
+    TF_RETURN_IF_ERROR(context->allocate_output(0, TensorShape({}), &handles));
+    int64 handle;
+    TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &handle));
+    // Intern() transferred our reference to rm; do not Unref it here.
+    output_tuple.release();
+    handles->scalar<int64>()() = handle;
+    return Status::OK();
+  }
+  // Exploded tuple: emit a vector with one handle per top-level element, each
+  // interned from its own sub-buffer of output_tuple.
+  const int64 num_elements =
+      xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
+  TF_RETURN_IF_ERROR(
+      context->allocate_output(0, TensorShape({num_elements}), &handles));
+  for (int64 i = 0; i < num_elements; ++i) {
+    XRTTupleAllocation* element;
+    TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+        output_tuple.get(), {i}, &element, /*alias_parent_allocation=*/false));
+    int64 handle;
+    TF_RETURN_IF_ERROR(element->Intern(rm, &handle));
+    handles->vec<int64>()(i) = handle;
+  }
+  return Status::OK();
+}
+
+Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm,
+                      const xrt::XRTChainedExecutePlan& plan,
+                      const xrt::XRTChainedExecuteConfig& config,
+                      const ChainedExecuteFn& execute_op) {
+  // Count the uses of each op output so intermediates can be dropped early.
+  // The plan is caller supplied, so op indices are validated before use.
+  std::vector<int64> uses(plan.ops_size(), 0);
+  for (auto& op : plan.ops()) {
+    for (auto& input : op.inputs()) {
+      TF_RET_CHECK(input.op_index() >= 0 && input.op_index() < plan.ops_size());
+      uses[input.op_index()] += 1;
+    }
+  }
+  std::vector<RefPtr<XRTTupleAllocation>> ops_outputs(plan.ops_size());
+  std::vector<RefPtr<XRTTupleAllocation>> results;
+  for (int i = 0; i < plan.ops_size(); ++i) {
+    auto& op = plan.ops(i);
+    if (op.op_oneof_case() == xrt::XRTChainedExecuteOp::kDataHandle) {
+      // This operation is a device data load. Fetch the XRTTupleAllocation
+      // behind the user handle and use it as the op output at this position.
+      XRTTupleAllocation* tuple;
+      TF_RETURN_IF_ERROR(
+          XRTTupleAllocation::Lookup(rm, op.data_handle(), &tuple));
+      ops_outputs[i].reset(tuple);
+    } else if (op.op_oneof_case() ==
+               xrt::XRTChainedExecuteOp::kComputationHandle) {
+      // An XRT execute operation: forward to the device specific handler.
+      TF_ASSIGN_OR_RETURN(ops_outputs[i], execute_op(op, i, ops_outputs));
+    } else {
+      return errors::InvalidArgument(
+          "Undefined operation kind at post-order position ", i);
+    }
+    // Forward the requested outputs of this op to their slots in the results
+    // vector; a negative slot would index out of bounds, so reject it.
+    for (auto& output : op.outputs()) {
+      TF_RET_CHECK(output.result_index() >= 0);
+      if (output.result_index() >= results.size()) {
+        results.resize(output.result_index() + 1);
+      }
+      TF_RETURN_IF_ERROR(MakeOutput(ops_outputs[i], output.output_index(),
+                                    &results[output.result_index()]));
+    }
+    // Drop intermediate results which have no more users.
+    for (auto& input : op.inputs()) {
+      uses[input.op_index()] -= 1;
+      if (uses[input.op_index()] == 0) {
+        ops_outputs[input.op_index()].reset();
+      }
+    }
+  }
+
+  Tensor* output_tensor;
+  TF_RETURN_IF_ERROR(context->allocate_output(
+      0, TensorShape({static_cast<int64>(results.size())}), &output_tensor));
+  for (size_t i = 0; i < results.size(); ++i) {
+    int64 key = XRTTupleAllocation::InvalidKey();
+    if (results[i] != nullptr) {
+      TF_RETURN_IF_ERROR(results[i]->Intern(rm, &key));
+      results[i].release();
+    }
+    output_tensor->vec<int64>()(i) = key;
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h
index d9c05a7..07159dd 100644
--- a/tensorflow/compiler/xrt/xrt_util.h
+++ b/tensorflow/compiler/xrt/xrt_util.h
@@ -18,10 +18,106 @@
 #ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
 #define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
 
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/compiler/xrt/xrt_state.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
+// Reference counted smart pointer for XRT objects providing the standard
+// Ref()/Unref() APIs. Moves are noexcept so containers relocate by moving.
+template <typename T>
+class RefPtr {
+ public:
+  RefPtr() = default;
+  // Creates a RefPtr from a pointer. This is an ownership transfer operation,
+  // and the caller has to own a valid reference to ptr (unless ptr is nullptr).
+  RefPtr(T* ptr) : ptr_(ptr) {}
+  RefPtr(const RefPtr& other) : ptr_(other.ptr_) { Acquire(ptr_); }
+  RefPtr(RefPtr&& other) noexcept : ptr_(other.ptr_) { other.ptr_ = nullptr; }
+
+  ~RefPtr() { Release(ptr_); }
+
+  RefPtr& operator=(const RefPtr& other) {
+    if (this != &other) {
+      Acquire(other.ptr_);
+      Release(ptr_);
+      ptr_ = other.ptr_;
+    }
+    return *this;
+  }
+
+  RefPtr& operator=(RefPtr&& other) noexcept {
+    if (this != &other) {
+      Release(ptr_);
+      ptr_ = other.ptr_;
+      other.ptr_ = nullptr;
+    }
+    return *this;
+  }
+
+  operator bool() const { return ptr_ != nullptr; }
+  bool operator==(const RefPtr& rhs) const { return ptr_ == rhs.ptr_; }
+  bool operator!=(const RefPtr& rhs) const { return ptr_ != rhs.ptr_; }
+  bool operator==(const T* ptr) const { return ptr_ == ptr; }
+  bool operator!=(const T* ptr) const { return ptr_ != ptr; }
+  bool operator==(std::nullptr_t ptr) const { return ptr_ == ptr; }
+  bool operator!=(std::nullptr_t ptr) const { return ptr_ != ptr; }
+
+  T* get() const { return ptr_; }
+
+  T* operator->() const {
+    CHECK(ptr_ != nullptr);  // Crash OK
+    return ptr_;
+  }
+
+  T& operator*() const {
+    CHECK(ptr_ != nullptr);  // Crash OK
+    return *ptr_;
+  }
+
+  T* release() {
+    T* ptr = ptr_;
+    ptr_ = nullptr;
+    return ptr;
+  }
+
+  // Resets the RefPtr from a pointer. This is an ownership transfer operation,
+  // and the caller has to own a valid reference to ptr (unless ptr is nullptr).
+  void reset(T* ptr = nullptr) {
+    Release(ptr_);
+    ptr_ = ptr;
+  }
+
+ private:
+  static void Release(T* ptr) {
+    if (ptr != nullptr) {
+      ptr->Unref();
+    }
+  }
+
+  static void Acquire(T* ptr) {
+    if (ptr != nullptr) {
+      ptr->Ref();
+    }
+  }
+
+  T* ptr_ = nullptr;
+};
+
+// Pairs an input handle with an xla::ShapeIndex.
+struct InputCoords {
+  explicit InputCoords(int64 handle) : handle(handle) {}
+  InputCoords(int64 handle, xla::ShapeIndex index)
+      : handle(handle), index(std::move(index)) {}
+  int64 handle = 0;
+  xla::ShapeIndex index;
+};
+
 // Filters the debug options provided as argument according to the value of the
 // TF_XLA_DEBUG_OPTIONS_PASSTHROUGH environment variable. If such variable is
 // set to "1" or "true", the debug options will be returned as is. Otherwise
@@ -29,6 +125,29 @@
 // contained in it, will be limited to gs:// and bigstore:// ones.
 xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options);
 
+// Returns the list of input coordinates read from the input_name op argument,
+// which is a list of scalar or vector tensors of handles.
+xla::StatusOr<std::vector<InputCoords>> GetComputationInputs(
+    OpKernelContext* context, ResourceMgr* rm, const char* input_name);
+
+// Creates the XRT execute output tensor given the computation result
+// (output_tuple). The return_exploded_tuple flag tells whether a tuple result
+// should be returned as a vector of handles, one for each tuple child.
+Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm,
+                           RefPtr<XRTTupleAllocation> output_tuple,
+                           bool return_exploded_tuple);
+
+// ExecuteChained() drives the execution of an XRT chained computation plan,
+// using the supplied ChainedExecuteFn as the core execute function.
+using ChainedExecuteFn =
+    std::function<xla::StatusOr<RefPtr<XRTTupleAllocation>>(
+        const xrt::XRTChainedExecuteOp&, int,
+        absl::Span<const RefPtr<XRTTupleAllocation>>)>;
+Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm,
+                      const xrt::XRTChainedExecutePlan& plan,
+                      const xrt::XRTChainedExecuteConfig& config,
+                      const ChainedExecuteFn& execute_op);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index aaeb151..6760ef2 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -8,7 +8,6 @@
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
-load("//tensorflow:tensorflow.bzl", "if_not_windows_cuda")
 
 py_library(
     name = "contrib_py",
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
index 6d2d70c..651b108 100644
--- a/tensorflow/contrib/autograph/examples/benchmarks/BUILD
+++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
@@ -17,6 +17,7 @@
     name = "cartpole_benchmark",
     size = "enormous",
     srcs = ["cartpole_benchmark.py"],
+    python_version = "PY2",
     tags = [
         "local",
         "manual",
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 0c91ee3..002d681 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -14,7 +14,6 @@
 ==============================================================================*/
 
 #include "tensorflow/contrib/bigtable/kernels/bigtable_lib.h"
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
@@ -262,16 +261,16 @@
           }
           components.clear();
         }
-        grpc::Status mutation_status;
+        ::google::cloud::Status mutation_status;
         std::vector<::google::cloud::bigtable::FailedMutation> failures =
-            resource->table().BulkApply(mutation, mutation_status);
-        if (!mutation_status.ok()) {
-          LOG(ERROR) << "Failure applying mutation: "
-                     << mutation_status.error_code() << " - "
-                     << mutation_status.error_message() << " ("
-                     << mutation_status.error_details() << ").";
-        }
+            resource->table().BulkApply(mutation);
         if (!failures.empty()) {
+          mutation_status = failures.front().status();
+          if (!mutation_status.ok()) {
+            LOG(ERROR) << "Failure applying mutation: "
+                       << mutation_status.code() << " - "
+                       << mutation_status.message() << ".";
+          }
           ::google::bigtable::v2::MutateRowsRequest request;
           mutation.MoveTo(&request);
           for (const auto& failure : failures) {
@@ -282,12 +281,11 @@
           }
         }
         OP_REQUIRES_ASYNC(
-            ctx, failures.empty() && mutation_status.ok(),
+            ctx, failures.empty(),
             errors::Unknown("Failure while writing to Cloud Bigtable: ",
-                            mutation_status.error_code(), " - ",
-                            mutation_status.error_message(), " (",
-                            mutation_status.error_details(),
-                            "), # of mutation failures: ", failures.size(),
+                            mutation_status.code(), " - ",
+                            mutation_status.message(),
+                            "; # of mutation failures: ", failures.size(),
                             ". See the log for the specific error details."),
             done);
       } while (!end_of_sequence);
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
index 3a46e6e..0bdaf3a 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
@@ -16,22 +16,6 @@
 #include "tensorflow/contrib/bigtable/kernels/bigtable_lib.h"
 
 namespace tensorflow {
-
-Status GrpcStatusToTfStatus(const ::grpc::Status& status) {
-  if (status.ok()) {
-    return Status::OK();
-  }
-  auto grpc_code = status.error_code();
-  if (status.error_code() == ::grpc::StatusCode::ABORTED ||
-      status.error_code() == ::grpc::StatusCode::UNAVAILABLE ||
-      status.error_code() == ::grpc::StatusCode::OUT_OF_RANGE) {
-    grpc_code = ::grpc::StatusCode::INTERNAL;
-  }
-  return Status(static_cast<::tensorflow::error::Code>(grpc_code),
-                strings::StrCat("Error reading from Cloud Bigtable: ",
-                                status.error_message()));
-}
-
 namespace {
 ::tensorflow::error::Code GcpErrorCodeToTfErrorCode(
     ::google::cloud::StatusCode code) {
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
index f6aa67f..1325560 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
@@ -16,16 +16,13 @@
 #ifndef TENSORFLOW_CONTRIB_BIGTABLE_KERNELS_BIGTABLE_LIB_H_
 #define TENSORFLOW_CONTRIB_BIGTABLE_KERNELS_BIGTABLE_LIB_H_
 
-// Note: we use bigtable/client/internal/table.h as this is the no-exception API
-
 #include "google/cloud/bigtable/data_client.h"
-#include "google/cloud/bigtable/internal/table.h"
+#include "google/cloud/bigtable/table.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 
 namespace tensorflow {
 
-Status GrpcStatusToTfStatus(const ::grpc::Status& status);
 Status GcpStatusToTfStatus(const ::google::cloud::Status& status);
 
 string RegexFromStringSet(const std::vector<string>& strs);
@@ -66,7 +63,7 @@
 
   ~BigtableTableResource() override { client_->Unref(); }
 
-  ::google::cloud::bigtable::noex::Table& table() { return table_; }
+  ::google::cloud::bigtable::Table& table() { return table_; }
 
   string DebugString() const override {
     return strings::StrCat(
@@ -77,7 +74,7 @@
  private:
   BigtableClientResource* client_;  // Ownes one ref.
   const string table_name_;
-  ::google::cloud::bigtable::noex::Table table_;
+  ::google::cloud::bigtable::Table table_;
 };
 
 namespace data {
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
index 22b711a..98ec991 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
@@ -152,18 +152,19 @@
         }
         if (input_tensors[0].NumElements() == 1) {
           // Single key lookup.
-          ::google::cloud::Status status;
-          auto pair = dataset()->table_->table().ReadRow(
-              input_tensors[0].scalar<string>()(), dataset()->filter_, status);
-          if (!status.ok()) {
-            return GcpStatusToTfStatus(status);
+          ::google::cloud::StatusOr<
+              std::pair<bool, ::google::cloud::bigtable::Row>>
+              row = dataset()->table_->table().ReadRow(
+                  input_tensors[0].scalar<string>()(), dataset()->filter_);
+          if (!row.ok()) {
+            return GcpStatusToTfStatus(row.status());
           }
-          if (!pair.first) {
+          if (!row->first) {
             return errors::DataLoss("Row key '",
                                     input_tensors[0].scalar<string>()(),
                                     "' not found.");
           }
-          TF_RETURN_IF_ERROR(ParseRow(ctx, pair.second, out_tensors));
+          TF_RETURN_IF_ERROR(ParseRow(ctx, row->second, out_tensors));
         } else {
           // Batched get.
           return errors::Unimplemented(
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
index f0c3ef4..88284c5 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
@@ -125,15 +125,15 @@
       // ensure we don't accidentally miss any subsets of the requested range by
       // including `begin_key()` and `end_key()` as appropriate.
       Status Initialize(IteratorContext* ctx) override {
-        grpc::Status status;
-        std::vector<google::cloud::bigtable::RowKeySample> row_keys =
-            dataset()->table().table().SampleRows(status);
-        if (!status.ok()) {
-          return GrpcStatusToTfStatus(status);
+        ::google::cloud::StatusOr<
+            std::vector<::google::cloud::bigtable::RowKeySample>>
+            row_key_samples = dataset()->table().table().SampleRows();
+        if (!row_key_samples.ok()) {
+          return GcpStatusToTfStatus(row_key_samples.status());
         }
 
-        for (size_t i = 0; i < row_keys.size(); ++i) {
-          string row_key(row_keys[i].row_key);
+        for (const auto& row_key_sample : *row_key_samples) {
+          string row_key(row_key_sample.row_key);
           if (dataset()->key_range_.contains_key(row_key)) {
             // First key: check to see if we need to add the begin_key.
             if (keys_.empty() && dataset()->key_range_.begin_key() != row_key) {
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
index 9b60e0a..119da35 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
@@ -80,12 +80,14 @@
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        ::grpc::Status status;
-        row_keys_ = dataset()->table()->table().SampleRows(status);
-        if (!status.ok()) {
+        ::google::cloud::StatusOr<
+            std::vector<::google::cloud::bigtable::RowKeySample>>
+            sampled_rows = dataset()->table()->table().SampleRows();
+        if (!sampled_rows.ok()) {
           row_keys_.clear();
-          return GrpcStatusToTfStatus(status);
+          return GcpStatusToTfStatus(sampled_rows.status());
         }
+        row_keys_ = std::move(*sampled_rows);
         return Status::OK();
       }
 
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index d9fce6e..4b688f2 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -432,6 +432,17 @@
   return nullptr;
 }
 
+std::unique_ptr<
+    grpc::ClientAsyncReaderInterface<google::bigtable::v2::MutateRowsResponse>>
+BigtableTestClient::PrepareAsyncMutateRows(
+    grpc::ClientContext* context,
+    const google::bigtable::v2::MutateRowsRequest& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;  // Stub: only logs a warning; no reader is created.
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index 63d59b3..299494b 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -100,6 +100,12 @@
                 const google::bigtable::v2::ReadRowsRequest& request,
                 grpc::CompletionQueue* cq, void* tag) override;
 
+  std::unique_ptr<grpc::ClientAsyncReaderInterface<
+      google::bigtable::v2::MutateRowsResponse>>
+  PrepareAsyncMutateRows(grpc::ClientContext* context,
+                         const google::bigtable::v2::MutateRowsRequest& request,
+                         grpc::CompletionQueue* cq) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index f7f15a3..34d759b 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -71,6 +71,7 @@
     name = "losses_test",
     size = "small",
     srcs = ["python/utils/losses_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":losses",
@@ -121,6 +122,7 @@
     name = "gbdt_batch_test",
     size = "medium",
     srcs = ["python/training/functions/gbdt_batch_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "notsan",  # b/62863147
@@ -150,6 +152,7 @@
     name = "model_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/model_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":model_ops_py",
@@ -170,6 +173,7 @@
     name = "prediction_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/prediction_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":model_ops_py",
@@ -187,6 +191,7 @@
     name = "quantile_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/quantile_ops_test.py"],
+    python_version = "PY2",
     shard_count = 3,
     srcs_version = "PY2AND3",
     deps = [
@@ -209,6 +214,7 @@
     name = "split_handler_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/split_handler_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":split_handler_ops_py",
@@ -225,6 +231,7 @@
     name = "stats_accumulator_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/stats_accumulator_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":stats_accumulator_ops_py",
@@ -239,6 +246,7 @@
     name = "training_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/training_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":model_ops_py",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 64e4c45..aa0cddf 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -69,6 +69,7 @@
     name = "trainer_hooks_test",
     size = "small",
     srcs = ["trainer_hooks_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":trainer_hooks",
@@ -118,6 +119,7 @@
     name = "custom_export_strategy_test",
     size = "small",
     srcs = ["custom_export_strategy_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":custom_export_strategy",
@@ -176,6 +178,7 @@
     size = "medium",
     timeout = "long",
     srcs = ["dnn_tree_combined_estimator_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
@@ -195,6 +198,7 @@
     name = "estimator_test",
     size = "medium",
     srcs = ["estimator_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index fd832de..5ac3cc5 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -284,6 +284,7 @@
 py_test(
     name = "categorical_split_handler_test",
     srcs = ["learner/batch/categorical_split_handler_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":categorical_split_handler",
@@ -324,6 +325,7 @@
 py_test(
     name = "ordinal_split_handler_test",
     srcs = ["learner/batch/ordinal_split_handler_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":ordinal_split_handler",
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 61441b2..728b764 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -1149,9 +1149,9 @@
       expected_leaf_1 = [-3.4480, -3.4429, 13.8490, -3.45, -3.4508]
       expected_leaf_2 = [-1.2547, -1.3145, 1.52, 2.3875, -1.3264]
       self.assertArrayNear(expected_leaf_1,
-                           output.trees[0].nodes[1].leaf.vector.value, 3e-3)
+                           output.trees[0].nodes[1].leaf.vector.value, 7e-3)
       self.assertArrayNear(expected_leaf_2,
-                           output.trees[0].nodes[2].leaf.vector.value, 3e-3)
+                           output.trees[0].nodes[2].leaf.vector.value, 7e-3)
 
   def testTrainFnMulticlassDiagonalHessian(self):
     """Tests the GBDT train for multiclass diagonal hessian."""
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
index 81b0c19..bd81e36 100644
--- a/tensorflow/contrib/constrained_optimization/BUILD
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -41,6 +41,7 @@
 py_test(
     name = "candidates_test",
     srcs = ["python/candidates_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         # TODO(b/129496144): Re-enable MSAN test.
@@ -69,6 +70,7 @@
 py_test(
     name = "external_regret_optimizer_test",
     srcs = ["python/external_regret_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":constrained_optimization",
@@ -83,6 +85,7 @@
 py_test(
     name = "swap_regret_optimizer_test",
     srcs = ["python/swap_regret_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":constrained_optimization",
diff --git a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD
index fa44c4d..6273bcf 100644
--- a/tensorflow/contrib/copy_graph/BUILD
+++ b/tensorflow/contrib/copy_graph/BUILD
@@ -28,6 +28,7 @@
 py_test(
     name = "copy_test",
     srcs = glob(["python/util/copy_test.py"]),
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":copy_graph_py",
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 42f538b..10475cf 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -10,6 +10,7 @@
 py_test(
     name = "assert_element_shape_test",
     srcs = ["assert_element_shape_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/data/python/ops:batching",
@@ -32,6 +33,7 @@
     size = "medium",
     srcs = ["lmdb_dataset_op_test.py"],
     data = ["//tensorflow/core:lmdb_testdata"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
@@ -57,6 +59,7 @@
     name = "reduce_dataset_test",
     size = "small",
     srcs = ["reduce_dataset_test.py"],
+    python_version = "PY2",
     deps = [
         "//tensorflow/contrib/data/python/ops:get_single_element",
         "//tensorflow/contrib/data/python/ops:grouping",
@@ -73,6 +76,7 @@
     name = "slide_dataset_op_test",
     size = "small",
     srcs = ["slide_dataset_op_test.py"],
+    python_version = "PY2",
     deps = [
         "//tensorflow/contrib/data/python/ops:sliding",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index f8bb942..6a88cc6 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -43,7 +43,8 @@
   # contents of a dataset.
   a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
 
-  a.apply(tf.contrib.data.dense_to_sparse_batch(batch_size=2, row_shape=[6])) ==
+  a.apply(tf.data.experimental.dense_to_sparse_batch(
+      batch_size=2, row_shape=[6])) ==
   {
       ([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],  # indices
        ['a', 'b', 'c', 'a', 'b'],                 # values
@@ -55,14 +56,13 @@
   ```
 
   Args:
-    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the
-      number of consecutive elements of this dataset to combine in a
-      single batch.
-    row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like
-      object representing the equivalent dense shape of a row in the
-      resulting `tf.SparseTensor`. Each element of this dataset must
-      have the same rank as `row_shape`, and must have size less
-      than or equal to `row_shape` in each dimension.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like object
+      representing the equivalent dense shape of a row in the resulting
+      `tf.SparseTensor`. Each element of this dataset must have the same rank as
+      `row_shape`, and must have size less than or equal to `row_shape` in each
+      dimension.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -85,7 +85,7 @@
   # of a dataset.
   a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
 
-  a.apply(tf.contrib.data.unbatch()) == {
+  a.apply(tf.data.experimental.unbatch()) == {
       'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
   ```
 
@@ -111,7 +111,8 @@
 
   ```python
   dataset = tf.data.Dataset.range(200)
-  batched = dataset.apply(tf.contrib.data.batch_and_drop_remainder(128))
+  batched = dataset.apply(
+      tf.contrib.data.batch_and_drop_remainder(128))
   print(batched.output_shapes)  # ==> "(128,)" (the batch dimension is known)
   ```
 
@@ -121,7 +122,7 @@
 
   Args:
     batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        consecutive elements of this dataset to combine in a single batch.
+      consecutive elements of this dataset to combine in a single batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -152,11 +153,10 @@
   Args:
     batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements of this dataset to combine in a single batch.
-    padded_shapes: A nested structure of `tf.TensorShape` or
-      `tf.int64` vector tensor-like objects. See
-      `tf.data.Dataset.padded_batch` for details.
-    padding_values: (Optional.) A nested structure of scalar-shaped
-      `tf.Tensor`. See `tf.data.Dataset.padded_batch` for details.
+    padded_shapes: A nested structure of `tf.TensorShape` or `tf.int64` vector
+      tensor-like objects. See `tf.data.Dataset.padded_batch` for details.
+    padding_values: (Optional.) A nested structure of scalar-shaped `tf.Tensor`.
+      See `tf.data.Dataset.padded_batch` for details.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -179,7 +179,7 @@
 
   ```python
   shapes = [tf.TensorShape([16, 256]), tf.TensorShape([None, 2])]
-  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  result = dataset.apply(tf.data.experimental.assert_element_shape(shapes))
   print(result.output_shapes)  # ==> "((16, 256), (<unknown>, 2))"
   ```
 
@@ -245,8 +245,8 @@
   deprecated.
 
   Args:
-    map_func: A function mapping a nested structure of tensors to another
-      nested structure of tensors.
+    map_func: A function mapping a nested structure of tensors to another nested
+      structure of tensors.
     batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements of this dataset to combine in a single batch.
     num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
@@ -257,9 +257,9 @@
       whether the last batch should be dropped in case its size is smaller than
       desired; the default behavior is not to drop the smaller batch.
     num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number of elements to process in parallel. If not
-        specified, `batch_size * num_parallel_batches` elements will be
-        processed in parallel.
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index 0559a2e..b22e11a 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -32,12 +32,14 @@
   ```python
   dataset = tf.data.Dataset.from_tensor_slices([1., 2., 0., 4.])
 
-  # Computing `tf.check_numerics(1. / 0.)` will raise an InvalidArgumentError.
-  dataset = dataset.map(lambda x: tf.check_numerics(1. / x, "error"))
+  # Computing `tf.debugging.check_numerics(1. / 0.)` will raise an
+  # InvalidArgumentError.
+  dataset = dataset.map(lambda x: tf.debugging.check_numerics(1. / x, "error"))
 
   # Using `ignore_errors()` will drop the element that causes an error.
   dataset =
-      dataset.apply(tf.contrib.data.ignore_errors())  # ==> { 1., 0.5, 0.2 }
+      dataset.apply(tf.data.experimental.ignore_errors())
+      # ==> { 1., 0.5, 0.25 }
   ```
 
   Returns:
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
index 58ad9ee..9df55fa 100644
--- a/tensorflow/contrib/data/python/ops/get_single_element.py
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -30,13 +30,14 @@
   """Returns the single element in `dataset` as a nested structure of tensors.
 
   This function enables you to use a `tf.data.Dataset` in a stateless
-  "tensor-in tensor-out" expression, without creating a `tf.data.Iterator`.
+  "tensor-in tensor-out" expression, without creating a
+  `tf.compat.v1.data.Iterator`.
   This can be useful when your preprocessing transformations are expressed
   as a `Dataset`, and you want to use the transformation at serving time.
   For example:
 
   ```python
-  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
+  input_batch = tf.compat.v1.placeholder(tf.string, shape=[BATCH_SIZE])
 
   def preprocessing_fn(input_str):
     # ...
@@ -46,7 +47,7 @@
              .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
              .batch(BATCH_SIZE))
 
-  image_batch, label_batch = tf.contrib.data.get_single_element(dataset)
+  image_batch, label_batch = tf.data.experimental.get_single_element(dataset)
   ```
 
   Args:
@@ -70,7 +71,8 @@
 
   Args:
     dataset: A `tf.data.Dataset` object.
-    reducer: A `tf.contrib.data.Reducer` object representing the reduce logic.
+    reducer: A `tf.data.experimental.Reducer` object representing the reduce
+      logic.
 
   Returns:
     A nested structure of `tf.Tensor` objects, corresponding to the result
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index f50da4d..4543bd2 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -46,7 +46,7 @@
   # Preprocess 4 files concurrently.
   filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
   dataset = filenames.apply(
-      tf.contrib.data.parallel_interleave(
+      tf.data.experimental.parallel_interleave(
           lambda filename: tf.data.TFRecordDataset(filename),
           cycle_length=4))
   ```
@@ -146,7 +146,7 @@
       `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A dataset that interleaves elements from `datasets` at random, according to
@@ -175,7 +175,7 @@
   # Define a dataset containing `[0, 1, 2, 0, 1, 2, 0, 1, 2]`.
   choice_dataset = tf.data.Dataset.range(3).repeat(3)
 
-  result = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
+  result = tf.data.experimental.choose_from_datasets(datasets, choice_dataset)
   ```
 
   The elements of `result` will be:
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py
index 48c325c..013fedb 100644
--- a/tensorflow/contrib/data/python/ops/iterator_ops.py
+++ b/tensorflow/contrib/data/python/ops/iterator_ops.py
@@ -36,11 +36,11 @@
     ds = tf.data.Dataset.range(10)
     iterator = ds.make_initializable_iterator()
     # Build the iterator SaveableObject.
-    saveable_obj = tf.contrib.data.make_saveable_from_iterator(iterator)
+    saveable_obj = tf.data.experimental.make_saveable_from_iterator(iterator)
     # Add the SaveableObject to the SAVEABLE_OBJECTS collection so
     # it can be automatically saved using Saver.
-    tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
-    saver = tf.train.Saver()
+    tf.compat.v1.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
+    saver = tf.compat.v1.train.Saver()
 
     while continue_training:
       ... Perform training ...
@@ -82,7 +82,7 @@
   while True:
     est.train(
         train_input_fn,
-        hooks=[tf.contrib.data.CheckpointInputPipelineHook(est)],
+        hooks=[tf.data.experimental.CheckpointInputPipelineHook(est)],
         steps=train_steps_per_eval)
     # Note: We do not pass the hook here.
     metrics = est.evaluate(eval_input_fn)
@@ -99,7 +99,7 @@
      pipeline.
 
   For saving the input pipeline checkpoint alongside the model weights use
-  `tf.contrib.data.make_saveable_from_iterator` directly to create a
+  `tf.data.experimental.make_saveable_from_iterator` directly to create a
   `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however,
   that you will need to be careful not to restore the training iterator during
   eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS
diff --git a/tensorflow/contrib/data/python/ops/parsing_ops.py b/tensorflow/contrib/data/python/ops/parsing_ops.py
index 3aeee9d..7bc4f0a 100644
--- a/tensorflow/contrib/data/python/ops/parsing_ops.py
+++ b/tensorflow/contrib/data/python/ops/parsing_ops.py
@@ -34,7 +34,7 @@
   and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
   `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
   and `SparseFeature` is mapped to a `SparseTensor`, and each
-  `FixedLenFeature` is mapped to a `Tensor`. See `tf.parse_example` for more
+  `FixedLenFeature` is mapped to a `Tensor`. See `tf.io.parse_example` for more
   details about feature dictionaries.
 
   Args:
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index c6bf521..70fbff9 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -60,7 +60,7 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing CSV
-      records. See `tf.gfile.Glob` for pattern rules.
+      records. See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     column_names: An optional list of strings that corresponds to the CSV
@@ -225,11 +225,11 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing
-      `Example` records. See `tf.gfile.Glob` for pattern rules.
+      `Example` records. See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
-      `VarLenFeature` values. See `tf.parse_example`.
+      `VarLenFeature` values. See `tf.io.parse_example`.
     reader: A function or class that can be
       called with a `filenames` tensor and (optional) `reader_args` and returns
       a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
@@ -328,11 +328,11 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing
-      `Example` records. See `tf.gfile.Glob` for pattern rules.
+      `Example` records. See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
-      `VarLenFeature` values. See `tf.parse_example`.
+      `VarLenFeature` values. See `tf.io.parse_example`.
     reader: A function or class that can be
       called with a `filenames` tensor and (optional) `reader_args` and returns
       a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
@@ -378,7 +378,7 @@
     (key value) pairs sequentially.
     For example:
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
 
     dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb")
 
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
index 329b34f..ef9944e 100644
--- a/tensorflow/contrib/data/python/ops/shuffle_ops.py
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -26,7 +26,7 @@
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
   """Shuffles and repeats a Dataset returning a new permutation for each epoch.
 
-  `dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count))`
+  `dataset.apply(tf.data.experimental.shuffle_and_repeat(buffer_size, count))`
 
   is equivalent to
 
@@ -45,7 +45,7 @@
       indefinitely.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
index 909d06c..9129599 100644
--- a/tensorflow/contrib/data/python/ops/unique.py
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -32,7 +32,7 @@
   dataset = tf.data.Dataset.from_tensor_slices([1, 37, 2, 37, 2, 1])
 
   # Using `unique()` will drop the duplicate elements.
-  dataset = dataset.apply(tf.contrib.data.unique())  # ==> { 1, 37, 2 }
+  dataset = dataset.apply(tf.data.experimental.unique())  # ==> { 1, 37, 2 }
   ```
 
   Returns:
diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD
index 401527f..035d8cf 100644
--- a/tensorflow/contrib/deprecated/BUILD
+++ b/tensorflow/contrib/deprecated/BUILD
@@ -22,6 +22,7 @@
 py_test(
     name = "summaries_test",
     srcs = ["summaries_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 833b600..a50d409 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -20,6 +20,7 @@
     deps = [
         ":keras_multi_worker_test_base",
         "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:model_combinations",
         "//tensorflow/python/distribute:multi_worker_test_base",
         "//tensorflow/python/distribute:single_loss_example",
         "//tensorflow/python/distribute:strategy_combinations",
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index b029bc4..d6eff47 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -312,6 +312,7 @@
                               input_fn,
                               expected_values,
                               test_reinitialize=True,
+                              ignore_order=False,
                               use_core_strategy=False):
     distribution, master_target, config = self._get_test_object(
         task_type, task_id, num_gpus)
@@ -327,7 +328,10 @@
         next_element = iterator.get_next()
         computed_value = sess.run([values.select_replica(r, next_element)
                                    for r in range(len(devices))])
-        self.assertEqual(expected_value, computed_value)
+        if ignore_order:
+          self.assertCountEqual(expected_value, computed_value)
+        else:
+          self.assertEqual(expected_value, computed_value)
 
       with self.assertRaises(errors.OutOfRangeError):
         next_element = iterator.get_next()
@@ -342,7 +346,10 @@
           next_element = iterator.get_next()
           computed_value = sess.run([values.select_replica(r, next_element)
                                      for r in range(len(devices))])
-          self.assertEqual(expected_value, computed_value)
+          if ignore_order:
+            self.assertCountEqual(expected_value, computed_value)
+          else:
+            self.assertEqual(expected_value, computed_value)
 
 
 class DistributedCollectiveAllReduceStrategyTest(
@@ -413,7 +420,6 @@
         num_gpus=num_gpus,
         use_core_strategy=use_core_strategy)
 
-  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(yuefengz): Update how we use num_gpus and required_gpus
   @combinations.generate(
       combinations.combine(
@@ -422,8 +428,7 @@
           required_gpus=1,
           use_dataset=[True, False],
           use_core_strategy=[True, False]))
-  def DISABLED_testMakeInputFnIterator(self, num_gpus, use_dataset,
-                                       use_core_strategy):
+  def testMakeInputFnIterator(self, num_gpus, use_dataset, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if use_dataset:
@@ -450,6 +455,7 @@
         input_fn,
         expected_values,
         test_reinitialize=use_dataset,
+        ignore_order=not use_dataset,
         use_core_strategy=use_core_strategy)
 
   @combinations.generate(
@@ -576,7 +582,7 @@
           required_gpus=2,
           use_dataset=[True, False],
           use_core_strategy=[True, False]))
-  def DISABLED_testMakeInputFnIterator(self, use_dataset, use_core_strategy):
+  def testMakeInputFnIterator(self, use_dataset, use_core_strategy):
     num_gpus = 2
     if use_dataset:
       fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
@@ -599,6 +605,7 @@
         input_fn,
         expected_values,
         test_reinitialize=use_dataset,
+        ignore_order=not use_dataset,
         use_core_strategy=use_core_strategy)
 
   @combinations.generate(
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
index 58bede8..75fbc3b 100644
--- a/tensorflow/contrib/distribute/python/examples/BUILD
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -13,6 +13,7 @@
 py_binary(
     name = "simple_estimator_example",
     srcs = ["simple_estimator_example.py"],
+    python_version = "PY2",
     deps = [
         "//tensorflow:tensorflow_py",
     ],
@@ -23,6 +24,7 @@
     srcs = [
         "keras_model_with_estimator.py",
     ],
+    python_version = "PY2",
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
@@ -32,6 +34,7 @@
 py_binary(
     name = "keras_mnist",
     srcs = ["keras_mnist.py"],
+    python_version = "PY2",
     deps = [":keras_mnist_lib"],
 )
 
@@ -51,6 +54,7 @@
     srcs = [
         "mnist_eager_multigpu.py",
     ],
+    python_version = "PY2",
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
@@ -62,6 +66,7 @@
     srcs = [
         "mnist_tf1_tpu.py",
     ],
+    python_version = "PY2",
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
index 9edb087..a134b12 100644
--- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
+++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
@@ -673,26 +673,6 @@
 
   @combinations.generate(
       combinations.combine(
-          distribution=[strategy_combinations.tpu_strategy_one_step],
-          mode=['graph']))
-  def test_dataset_input_shape_fully_defined(self, distribution):
-    with self.cached_session():
-      model = get_model()
-
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
-
-      dataset = get_dataset(distribution)
-      # Input shapes are not fully known. Batch dimension is unknown as we are
-      # not using the drop_remainder argument.
-      dataset = dataset.repeat(100).batch(10)
-
-      with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
-
-  @combinations.generate(
-      combinations.combine(
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
               strategy_combinations.mirrored_strategy_with_two_gpus,
diff --git a/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py b/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py
index 889976d..190aaf8 100644
--- a/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py
@@ -33,6 +33,7 @@
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.platform import test
 
 
@@ -132,18 +133,19 @@
   return model, IMAGE_INPUTS, IMAGE_TARGETS
 
 
-# TODO(b/130243026): Re-enable this test.
 def make_lstm_model(initial_weights=None):
   inputs = keras.layers.Input(shape=(10, 20))
-  rnn1_out = keras.layers.LSTM(20, return_sequences=True)(inputs)
-  rnn2_out = keras.layers.LSTM(10)(rnn1_out)
-  outputs = keras.layers.Dense(1)(rnn2_out)
+  rnn_out = keras.layers.LSTM(4)(inputs)
+  outputs = keras.layers.Dense(1)(rnn_out)
   model = keras.Model(inputs, outputs)
 
   if initial_weights:
     model.set_weights(initial_weights)
 
-  model.compile('adam', 'binary_crossentropy', metrics=['mse'])
+  model.compile(
+      gradient_descent.SGD(0.1),
+      'sparse_categorical_crossentropy',
+      metrics=['sparse_categorical_crossentropy'])
 
   return model, LSTM_INPUTS, LSTM_TARGETS
 
@@ -177,7 +179,7 @@
           strategy_cls=[
               collective_strategy.CollectiveAllReduceStrategy,
           ],
-          make_model=[make_image_model, make_embedding_model],
+          make_model=[make_image_model, make_lstm_model, make_embedding_model],
           required_gpus=[0, 1]))
   def test_correctness(self, strategy_cls, make_model):
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 5bfd732..3856935 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -61,8 +61,8 @@
   GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
   caution needs to be taken:
 
-  1) Always use `tf.get_variable` instead of `tf.Variable` which is not able
-  to refer to the same variable on different replicas.
+  1) Always use `tf.compat.v1.get_variable` instead of `tf.Variable` which
+  is not able to refer to the same variable on different replicas.
 
   2) It is generally not recommended to open a device scope under the strategy's
   scope. A device scope (i.e. calling `tf.device`) will be merged with or
@@ -70,9 +70,9 @@
   variables.
 
   3) It is also not recommended to open a colocation scope (i.e. calling
-  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
-  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
-  possibly create conflicts of device assignment.
+  `tf.compat.v1.colocate_with`) under the strategy's scope. For colocating
+  variables, use `strategy.extended.colocate_vars_with` instead. Colocation of
+  ops will possibly create conflicts of device assignment.
   """
 
   def __init__(self, num_gpus_per_worker=0):
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 2561fcd..da3cd48 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -565,6 +565,7 @@
                               input_fn,
                               expected_values,
                               test_reinitialize=True,
+                              ignore_order=False,
                               use_core_strategy=False):
     distribution, master_target, config = self._get_test_objects(
         task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
@@ -580,7 +581,10 @@
         next_element = iterator.get_next()
         computed_value = sess.run([values.select_replica(r, next_element)
                                    for r in range(len(devices))])
-        self.assertEqual(expected_value, computed_value)
+        if ignore_order:
+          self.assertCountEqual(expected_value, computed_value)
+        else:
+          self.assertEqual(expected_value, computed_value)
 
       with self.assertRaises(errors.OutOfRangeError):
         next_element = iterator.get_next()
@@ -595,7 +599,10 @@
           next_element = iterator.get_next()
           computed_value = sess.run([values.select_replica(r, next_element)
                                      for r in range(len(devices))])
-          self.assertEqual(expected_value, computed_value)
+          if ignore_order:
+            self.assertCountEqual(expected_value, computed_value)
+          else:
+            self.assertEqual(expected_value, computed_value)
 
 
 class ParameterServerStrategyTest(
@@ -689,7 +696,6 @@
   def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
     self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
-  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
       combinations.combine(
@@ -698,7 +704,7 @@
           required_gpus=1,
           use_core_strategy=[True, False],
           use_dataset=[True, False]))
-  def DISABLED_testMakeInputFnIteratorDistributed(
+  def testMakeInputFnIteratorDistributed(
       self, num_gpus, use_core_strategy, use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
@@ -724,9 +730,9 @@
         input_fn,
         expected_values,
         test_reinitialize=use_dataset,
+        ignore_order=not use_dataset,
         use_core_strategy=use_core_strategy)
 
-  # TODO(b/124344198): Re-enable after fixing this flaky test.
   @combinations.generate(
       combinations.combine(
           mode=['graph'],
@@ -734,8 +740,8 @@
           required_gpus=1,
           use_core_strategy=[True, False],
           use_dataset=[True, False]))
-  def DISABLED_testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
-                                            use_dataset):
+  def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
+                                   use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if use_dataset:
@@ -760,6 +766,7 @@
         input_fn,
         expected_values,
         test_reinitialize=use_dataset,
+        ignore_order=not use_dataset,
         use_core_strategy=use_core_strategy)
 
   @combinations.generate(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
index d5b3367..1b88c1d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
@@ -41,10 +41,10 @@
                             diag_shift=shift)
 
     y_ = self.evaluate(b.forward(x))
-    self.assertAllClose(y, y_)
+    self.assertAllClose(y, y_, rtol=1e-4)
 
     x_ = self.evaluate(b.inverse(y))
-    self.assertAllClose(x, x_)
+    self.assertAllClose(x, x_, rtol=1e-4)
 
   @test_util.run_in_graph_and_eager_modes
   def testInvertible(self):
@@ -52,18 +52,18 @@
     # Generate random inputs from an unconstrained space, with
     # event size 6 to specify 3x3 triangular matrices.
     batch_shape = [2, 1]
-    x = np.float32(np.random.randn(*(batch_shape + [6])))
+    x = np.float32(self._rng.randn(*(batch_shape + [6])))
     b = bijectors.ScaleTriL(diag_bijector=bijectors.Softplus(),
                             diag_shift=3.14159)
     y = self.evaluate(b.forward(x))
     self.assertAllEqual(y.shape, batch_shape + [3, 3])
 
     x_ = self.evaluate(b.inverse(y))
-    self.assertAllClose(x, x_)
+    self.assertAllClose(x, x_, rtol=1e-4)
 
     fldj = self.evaluate(b.forward_log_det_jacobian(x, event_ndims=1))
     ildj = self.evaluate(b.inverse_log_det_jacobian(y, event_ndims=2))
-    self.assertAllClose(fldj, -ildj)
+    self.assertAllClose(fldj, -ildj, rtol=1e-4)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index d441e47..a500f9f 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -151,6 +151,7 @@
 py_test(
     name = "metrics_test",
     srcs = ["metrics_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":metrics",
@@ -188,6 +189,7 @@
 py_test(
     name = "evaluator_test",
     srcs = ["evaluator_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":evaluator",
@@ -220,6 +222,7 @@
 py_test(
     name = "network_test",
     srcs = ["network_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":network",
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 51443d2..fa46d73 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -165,8 +165,15 @@
         self.__call__(example, *args, **kwargs)
       return self.all_metric_results(summary_logdir)
     # Graph construction
-    call_op = self.__call__(
-        dataset_ops.make_one_shot_iterator(dataset).get_next(), *args, **kwargs)
+    next_value = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    # Function inlining destroys strict input semantics (the body may start
+    # executing before all inputs are ready). When the iterator is exhausted and
+    # raises an out-of-range error, the function body may be partially executed.
+    # To prevent this, we add an explicit control dependency on 'get_next'.
+    with ops.control_dependencies([next_value]):
+      has_next_value = control_flow_ops.no_op(name="iterator_has_next")
+    with ops.control_dependencies([has_next_value]):
+      call_op = self.__call__(next_value, *args, **kwargs)
     init_op = self.init_variables()
     results_op = self.all_metric_results(summary_logdir)
     return (init_op, call_op, results_op)
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index fbb5daf..a001d42 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -8,6 +8,7 @@
 py_binary(
     name = "densenet",
     srcs = ["densenet.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":densenet_lib"],
 )
diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD
index d99a519..be561a1 100644
--- a/tensorflow/contrib/eager/python/examples/gan/BUILD
+++ b/tensorflow/contrib/eager/python/examples/gan/BUILD
@@ -8,6 +8,7 @@
 py_binary(
     name = "mnist",
     srcs = ["mnist.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":mnist_lib"],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index 30afef8..8536fdb 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -8,6 +8,7 @@
 py_binary(
     name = "linear_regression",
     srcs = ["linear_regression.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":linear_regression_lib"],
 )
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index f2851d9..a80f3d2 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -37,9 +37,6 @@
     ],
     shard_count = 4,
     tags = [
-        "noasan",  # Fix b/118130911
-        "nomsan",  # Fix b/118130911
-        "notsan",  # Fix b/118130911
         "optonly",
         "oss_serial",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 1090756..a48d08b 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -86,6 +86,7 @@
     additional_deps = [":blocks_test_main_lib"],
     shard_count = 4,
     tags = [
+        "no_oss",  # TODO(b/132387200): Segfaulting
         "optonly",
     ],
 )
@@ -134,6 +135,7 @@
 py_binary(
     name = "cifar_tfrecords",
     srcs = ["cifar_tfrecords.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
@@ -143,6 +145,7 @@
 py_binary(
     name = "main",
     srcs = ["main.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":main_lib"],
 )
@@ -162,6 +165,7 @@
 py_binary(
     name = "main_estimator",
     srcs = ["main_estimator.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
index f4dbe7a..aca0b2f 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
@@ -8,6 +8,7 @@
 py_binary(
     name = "rnn_colorbot",
     srcs = ["rnn_colorbot.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":rnn_colorbot_lib"],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
index 3301363..ef683ce 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
@@ -8,6 +8,7 @@
 py_binary(
     name = "rnn_ptb",
     srcs = ["rnn_ptb.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":rnn_ptb_lib"],
 )
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index 9b0fbaa..72f1829 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -17,6 +17,7 @@
     name = "data_test",
     size = "small",
     srcs = ["data_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":data",
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index da2479a..ab510b8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -201,6 +201,7 @@
     name = "kmeans_test",
     size = "medium",
     srcs = ["python/ops/kmeans_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 8fd2b5f..91e2954 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -145,6 +145,7 @@
     name = "arg_scope_test",
     size = "small",
     srcs = ["python/ops/arg_scope_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -156,6 +157,7 @@
     name = "checkpoint_utils_test",
     size = "small",
     srcs = ["python/framework/checkpoint_utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],  # http://b/30468735
     deps = [
@@ -175,6 +177,7 @@
     name = "ops_test",
     size = "small",
     srcs = ["python/ops/ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -187,6 +190,7 @@
     name = "prettyprint_ops_test",
     size = "small",
     srcs = ["python/ops/prettyprint_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -203,6 +207,7 @@
 py_test(
     name = "experimental_test",
     srcs = ["python/framework/experimental_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -214,6 +219,7 @@
 py_test(
     name = "graph_util_test",
     srcs = ["python/framework/graph_util_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -225,6 +231,7 @@
 py_test(
     name = "tensor_util_test",
     srcs = ["python/framework/tensor_util_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_py",
@@ -242,6 +249,7 @@
     name = "variables_test",
     size = "small",
     srcs = ["python/ops/variables_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -280,6 +288,7 @@
     size = "medium",
     srcs = ["python/ops/checkpoint_ops_test.py"],
     data = [":checkpoint_ops_testdata"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 739eb36..8e13f19 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -90,6 +90,9 @@
 
 template <typename BiasType, typename ScaleType>
 class LaunchFusedConv2DBiasActivationOp<CPUDevice, qint8, BiasType, ScaleType> {
+  using T = qint8;       // conv_input and filter type
+  using TempT = qint32;  // temporary accumulator type for tensor contraction
+
  public:
   void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
               const Tensor& conv_input, ScaleType conv_input_scale,
@@ -105,9 +108,6 @@
     // contraction using 32-bit accumulation (QInt32).
     Tensor temp_output(DT_QINT32, output->shape());
 
-    using T = qint8;
-    using TempT = qint32;
-
     constexpr int32 row_dilation = 1;
     constexpr int32 col_dilation = 1;
 
@@ -116,6 +116,10 @@
     // CPU convolution works with input in NHWC and filter in HWIO data formats.
     // NOTE: This code is mostly shared with 'Conv2D' and 'FusedConv2D'.
 
+    BiasActivationOutputKernel output_kernel(conv_input_scale, side_input,
+                                             side_input_scale, bias,
+                                             activation_mode, output);
+
     if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
         col_stride == 1) {
       int conv_width =  // Width for the convolution step.
@@ -128,7 +132,7 @@
       auto in0 = conv_input.shaped<T, 2>({conv_width, filter.dim_size(2)});
       auto in1 = filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)});
 
-      out.device(device) = in0.contract(in1, dim_pair /*, output_kernel*/);
+      out.device(device) = in0.contract(in1, dim_pair, output_kernel);
 
     } else if (filter.dim_size(0) == conv_input.dim_size(1) &&
                filter.dim_size(1) == conv_input.dim_size(2) &&
@@ -147,7 +151,7 @@
       auto in0 = conv_input.shaped<T, 2>({conv_input.dim_size(0), k});
       auto in1 = filter.shaped<T, 2>({k, filter.dim_size(3)});
 
-      out.device(device) = in0.contract(in1, dim_pair /*, output_kernel*/);
+      out.device(device) = in0.contract(in1, dim_pair, output_kernel);
 
     } else {
       auto out = temp_output.tensor<TempT, 4>();
@@ -155,57 +159,95 @@
       auto in1 = filter.tensor<T, 4>();
 
       // Need to swap row/col when calling Eigen.
-      out.device(device) = Eigen::SpatialConvolution(
-          in0, in1, col_stride, row_stride, padding, col_dilation,
-          row_dilation /*, output_kernel*/);
-    }
-
-    constexpr int8 max_range = 127;
-    constexpr int8 min_range = -128;
-
-#if defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::IndexList<int, int, int, Eigen::type2index<1>> broadcast_bias;
-    broadcast_bias.set(0, temp_output.dim_size(0));
-    broadcast_bias.set(1, temp_output.dim_size(1));
-    broadcast_bias.set(2, temp_output.dim_size(2));
-#else
-    const Eigen::array<int, 4> broadcast_bias(temp_output.dim_size(0),
-                                              temp_output.dim_size(1),
-                                              temp_output.dim_size(2), 1);
-#endif
-
-    // TODO(ezhulenev): Bias and SideInput could be added to the result of
-    // convolution using Eigen output kernels.
-
-    auto temp_t = temp_output.tensor<TempT, 4>();
-    auto bias_t = bias.shaped<BiasType, 4>({1, 1, 1, temp_output.dim_size(3)});
-    auto side_input_t = side_input.tensor<T, 4>();
-
-    auto conv_output_scaled = temp_t.cast<ScaleType>() * conv_input_scale;
-    auto broadcasted_bias = bias_t.broadcast(broadcast_bias);
-    auto side_input_scaled = side_input_t.cast<ScaleType>() * side_input_scale;
-
-    // This expression corresponds to cuDNN implementation of INT8
-    // cudnnConvolutionBiasActivationForward.
-    // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#scaling-parameters__fig-conv-bias-activation-forward
-
-    if (activation_mode == ActivationMode::NONE) {
-      output->tensor<T, 4>().device(device) =
-          (conv_output_scaled + broadcasted_bias + side_input_scaled)
-              .round()
-              .clip(static_cast<ScaleType>(min_range),
-                    static_cast<ScaleType>(max_range))
-              .template cast<T>();
-    } else if (activation_mode == ActivationMode::RELU) {
-      output->tensor<T, 4>().device(device) =
-          (conv_output_scaled + broadcasted_bias + side_input_scaled)
-              .round()
-              .clip(0, static_cast<ScaleType>(max_range))
-              .template cast<T>();
-    } else {
-      OP_REQUIRES(ctx, false, errors::Internal("Unsupported activation mode"));
+      out.device(device) =
+          Eigen::SpatialConvolution(in0, in1, col_stride, row_stride, padding,
+                                    col_dilation, row_dilation, output_kernel);
     }
   }
+
+ private:
+  // Contraction output mapper for temporary QInt32 tensor.
+  using ContractionOutputMapper =
+      Eigen::internal::blas_data_mapper<TempT, Eigen::Index, Eigen::ColMajor>;
+
+  // This output kernel computes an expression corresponding to the cuDNN
+  // implementation of INT8 cudnnConvolutionBiasActivationForward:
+  // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#scaling-parameters__fig-conv-bias-activation-forward
+  struct BiasActivationOutputKernel {
+    static constexpr int8 kMaxRange = 127;
+    static constexpr int8 kMinRange = -128;
+
+    explicit BiasActivationOutputKernel(ScaleType conv_input_scale,
+                                        const Tensor& side_input,
+                                        ScaleType side_input_scale,
+                                        const Tensor& bias,
+                                        ActivationMode activation_mode,
+                                        Tensor* output)
+        : activation_mode(activation_mode),
+          conv_input_scale(conv_input_scale),
+          bias_data(bias.flat<BiasType>().data()),
+          side_input_data(side_input.flat<T>().data()),
+          side_input_scale(side_input_scale),
+          output_data(const_cast<T*>(output->flat<T>().data())) {}
+
+    EIGEN_ALWAYS_INLINE void operator()(
+        const ContractionOutputMapper& conv_output_mapper,
+        const Eigen::TensorContractionParams& params, Eigen::Index i,
+        Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const {
+      DCHECK(params.swapped_arguments);
+
+      const auto stride = conv_output_mapper.stride();
+
+      const BiasType* bias_base = bias_data + i;
+      typename TTypes<BiasType>::UnalignedConstTensor bias(bias_base, num_rows);
+
+      const T* side_input_base = side_input_data + i + j * stride;
+      T* output_base = output_data + i + j * stride;
+
+      for (int col = 0; col < num_cols; ++col) {
+        // A column of an output tensor after QInt8xQInt8 -> QInt32 contraction.
+        // This is a temporary tensor, that we will scale, add bias with
+        // side_input, and quantize before writing to final output tensor.
+        typename TTypes<TempT>::UnalignedConstTensor conv_output(
+            &conv_output_mapper(0, col), num_rows);
+
+        // A column of side input tensor corresponding to conv output row.
+        typename TTypes<T>::UnalignedConstTensor side_input(
+            side_input_base + col * stride, num_rows);
+
+        // A column of output quantized tensor corresponding to conv output row.
+        typename TTypes<T>::UnalignedTensor output(output_base + col * stride,
+                                                   num_rows);
+
+        auto conv_output_scaled =
+            conv_output.cast<ScaleType>() * conv_input_scale;
+        ScaleType lower_bound = (activation_mode == ActivationMode::NONE
+                                     ? static_cast<ScaleType>(kMinRange)
+                                     : 0);
+        if (side_input_scale == 0.0f) {
+          output = (conv_output_scaled + bias)
+                       .round()
+                       .clip(lower_bound, static_cast<ScaleType>(kMaxRange))
+                       .template cast<T>();
+        } else {
+          auto side_input_scaled =
+              side_input.cast<ScaleType>() * side_input_scale;
+          output = (conv_output_scaled + bias + side_input_scaled)
+                       .round()
+                       .clip(lower_bound, static_cast<ScaleType>(kMaxRange))
+                       .template cast<T>();
+        }
+      }
+    }
+
+   private:
+    ActivationMode activation_mode;
+    ScaleType conv_input_scale;
+    const BiasType* bias_data;
+    const T* side_input_data;
+    ScaleType side_input_scale;
+    T* output_data;
+  };
 };
 #endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 
@@ -476,9 +518,8 @@
   return cc;
 }
 
-void LogFusedConvAutotuneResults(
-    se::dnn::ConvolutionKind kind, se::dnn::DataType element_type,
-    const se::dnn::BatchDescriptor& input_desc,
+void LogFusedConvForwardAutotuneResults(
+    se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc,
     const se::dnn::FilterDescriptor& filter_desc,
     const se::dnn::BatchDescriptor& output_desc,
     const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
@@ -487,7 +528,7 @@
   AutotuningLog log;
   {
     ConvolutionProto instr;
-    instr.set_kind(kind);
+    instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION);
     *instr.mutable_input() = input_desc.ToProto(element_type);
     *instr.mutable_filter() = filter_desc.ToProto(element_type);
     *instr.mutable_output() = output_desc.ToProto(element_type);
@@ -894,8 +935,7 @@
             absl::Milliseconds(profile_result.elapsed_time_in_ms()));
       }
     }
-    internal::LogFusedConvAutotuneResults(
-        se::dnn::ConvolutionKind::FORWARD,
+    internal::LogFusedConvForwardAutotuneResults(
         se::dnn::ToDataType<typename RawType<T>::type>::value, conv_input_desc,
         filter_desc, output_desc, conv_desc, conv_input_scale, side_input_scale,
         dnn_activation_mode, stream->parent(), results);
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py
index b4a2faa..0d79696 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py
@@ -950,7 +950,7 @@
           padding=padding_type,
           conv_input_scale=conv_input_scale,
           side_input_scale=side_input_scale,
-          side_input=side_input,
+          side_input=(None if side_input_scale == 0.0 else side_input),
           activation_mode="Relu" if apply_relu else "None",
           data_format="NHWC",
           filter_format="HWIO")
@@ -1045,7 +1045,8 @@
             padding=padding_type,
             conv_input_scale=conv_input_scale,
             side_input_scale=side_input_scale,
-            side_input=_Int8Roundtrip(_NchwVectCToNhwc, side_input),
+            side_input=(None if side_input_scale == 0.0 else _Int8Roundtrip(
+                _NchwVectCToNhwc, side_input)),
             activation_mode="Relu" if apply_relu else "None",
             data_format="NHWC",
             filter_format="HWIO")
@@ -1060,7 +1061,7 @@
             padding=padding_type,
             conv_input_scale=conv_input_scale,
             side_input_scale=side_input_scale,
-            side_input=side_input,
+            side_input=(None if side_input_scale == 0.0 else side_input),
             activation_mode="Relu" if apply_relu else "None",
             data_format="NCHW_VECT_C",
             filter_format="OIHW_VECT_I")
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 386e4cf..97de8a0 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -58,6 +58,7 @@
 py_test(
     name = "train_test",
     srcs = ["python/train_test.py"],
+    python_version = "PY2",
     shard_count = 50,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -161,6 +162,7 @@
 py_test(
     name = "losses_impl_test",
     srcs = ["python/losses/python/losses_impl_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":losses_impl",
@@ -198,6 +200,7 @@
 py_test(
     name = "tuple_losses_test",
     srcs = ["python/losses/python/tuple_losses_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":losses_impl",
@@ -236,6 +239,7 @@
 py_test(
     name = "conditioning_utils_test",
     srcs = ["python/features/python/conditioning_utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":conditioning_utils",
@@ -266,6 +270,7 @@
 py_test(
     name = "random_tensor_pool_test",
     srcs = ["python/features/python/random_tensor_pool_test.py"],
+    python_version = "PY2",
     shard_count = 6,
     srcs_version = "PY2AND3",
     deps = [
@@ -303,6 +308,7 @@
 py_test(
     name = "virtual_batchnorm_test",
     srcs = ["python/features/python/virtual_batchnorm_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":virtual_batchnorm",
@@ -338,6 +344,7 @@
 py_test(
     name = "clip_weights_test",
     srcs = ["python/features/python/clip_weights_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":clip_weights",
@@ -376,6 +383,7 @@
 py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
@@ -411,6 +419,7 @@
 py_test(
     name = "eval_utils_test",
     srcs = ["python/eval/python/eval_utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":eval_utils",
@@ -443,6 +452,7 @@
 py_test(
     name = "summaries_test",
     srcs = ["python/eval/python/summaries_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":namedtuples",
@@ -475,6 +485,7 @@
 py_test(
     name = "head_test",
     srcs = ["python/estimator/python/head_test.py"],
+    python_version = "PY2",
     shard_count = 1,
     srcs_version = "PY2AND3",
     deps = [
@@ -512,6 +523,7 @@
 py_test(
     name = "gan_estimator_test",
     srcs = ["python/estimator/python/gan_estimator_test.py"],
+    python_version = "PY2",
     shard_count = 1,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -567,6 +579,7 @@
 py_test(
     name = "stargan_estimator_test",
     srcs = ["python/estimator/python/stargan_estimator_test.py"],
+    python_version = "PY2",
     shard_count = 1,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -617,6 +630,7 @@
 py_test(
     name = "tpu_gan_estimator_test",
     srcs = ["python/estimator/python/tpu_gan_estimator_test.py"],
+    python_version = "PY2",
     shard_count = 11,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -670,6 +684,7 @@
     srcs = [
         "python/estimator/python/latent_gan_estimator_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":latent_gan_estimator",
@@ -705,6 +720,7 @@
 py_test(
     name = "sliced_wasserstein_test",
     srcs = ["python/eval/python/sliced_wasserstein_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":sliced_wasserstein",
@@ -738,6 +754,7 @@
 py_test(
     name = "spectral_normalization_test",
     srcs = ["python/features/python/spectral_normalization_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":spectral_normalization",
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index efbdb11..2c30126 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -189,7 +189,7 @@
 
 def get_graph_def_from_disk(filename):
   """Get a GraphDef proto from a disk location."""
-  with gfile.FastGFile(filename, 'rb') as f:
+  with gfile.GFile(filename, 'rb') as f:
     return graph_pb2.GraphDef.FromString(f.read())
 
 
diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD
index 1711100..35b6e63 100644
--- a/tensorflow/contrib/graph_editor/BUILD
+++ b/tensorflow/contrib/graph_editor/BUILD
@@ -53,6 +53,7 @@
 py_test(
     name = "util_test",
     srcs = ["tests/util_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
@@ -65,6 +66,7 @@
 py_test(
     name = "select_test",
     srcs = ["tests/select_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
@@ -77,6 +79,7 @@
 py_test(
     name = "match_test",
     srcs = ["tests/match_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":match",
@@ -89,6 +92,7 @@
 py_test(
     name = "subgraph_test",
     srcs = ["tests/subgraph_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
@@ -101,6 +105,7 @@
 py_test(
     name = "reroute_test",
     srcs = ["tests/reroute_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
@@ -114,6 +119,7 @@
 py_test(
     name = "edit_test",
     srcs = ["tests/edit_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
@@ -127,6 +133,7 @@
 py_test(
     name = "transform_test",
     srcs = ["tests/transform_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_editor_py",
diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD
index 5d9b35f..cf786c0 100644
--- a/tensorflow/contrib/input_pipeline/BUILD
+++ b/tensorflow/contrib/input_pipeline/BUILD
@@ -79,6 +79,7 @@
     name = "input_pipeline_ops_test",
     size = "small",
     srcs = ["python/ops/input_pipeline_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":input_pipeline_py",
diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD
index 0b7d64f..9a2c944 100644
--- a/tensorflow/contrib/integrate/BUILD
+++ b/tensorflow/contrib/integrate/BUILD
@@ -31,6 +31,7 @@
 py_test(
     name = "odes_test",
     srcs = ["python/ops/odes_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":integrate_py",
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index 87c2dcd..833771e 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -47,6 +47,7 @@
 py_test(
     name = "random_fourier_features_test",
     srcs = ["python/mappers/random_fourier_features_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":dense_kernel_mapper_py",
@@ -63,6 +64,7 @@
 py_test(
     name = "kernel_estimators_test",
     srcs = ["python/kernel_estimators_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -80,6 +82,7 @@
 py_test(
     name = "losses_test",
     srcs = ["python/losses_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":kernel_methods",
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 7e19ae7..fb28d66 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -69,6 +69,7 @@
     srcs = [
         "python/ops/core_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
@@ -106,6 +107,7 @@
     srcs = [
         "python/ops/io_ops_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":core",
@@ -136,6 +138,7 @@
     srcs = [
         "python/ops/nn_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":core",
@@ -171,6 +174,7 @@
     srcs = [
         "python/ops/ops_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":core",
@@ -205,6 +209,7 @@
     srcs = [
         "python/ops/sugar_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":core",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 69d5496..c6f6e72 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -162,6 +162,7 @@
     name = "regularizers_test",
     size = "small",
     srcs = ["python/layers/regularizers_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -178,6 +179,7 @@
     name = "initializers_test",
     size = "small",
     srcs = ["python/layers/initializers_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -194,6 +196,7 @@
     name = "normalization_test",
     size = "medium",
     srcs = ["python/layers/normalization_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
@@ -211,6 +214,7 @@
 py_test(
     name = "optimizers_test",
     srcs = ["python/layers/optimizers_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -232,6 +236,7 @@
     name = "summaries_test",
     size = "small",
     srcs = ["python/layers/summaries_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -247,6 +252,7 @@
     name = "feature_column_test",
     size = "small",
     srcs = ["python/layers/feature_column_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -268,6 +274,7 @@
     name = "feature_column_ops_test",
     size = "medium",
     srcs = ["python/layers/feature_column_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -292,6 +299,7 @@
     name = "target_column_test",
     size = "small",
     srcs = ["python/layers/target_column_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -306,6 +314,7 @@
     name = "sparse_feature_cross_op_test",
     size = "medium",
     srcs = ["python/kernel_tests/sparse_feature_cross_op_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -323,6 +332,7 @@
     size = "small",
     timeout = "moderate",
     srcs = ["python/layers/embedding_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -345,6 +355,7 @@
     name = "utils_test",
     size = "small",
     srcs = ["python/layers/utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -360,6 +371,7 @@
     name = "sparse_ops_test",
     size = "small",
     srcs = ["python/ops/sparse_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
@@ -376,6 +388,7 @@
     name = "encoders_test",
     size = "small",
     srcs = ["python/layers/encoders_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
@@ -390,6 +403,7 @@
     name = "rev_block_lib_test",
     size = "medium",
     srcs = ["python/layers/rev_block_lib_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 3f0a91c..1d0cac3 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -261,6 +261,9 @@
     name = "tensor_signature_test",
     srcs = ["python/learn/estimators/tensor_signature_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",  # b/130760310
+    ],
     deps = [
         ":learn",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD b/tensorflow/contrib/learn/python/learn/datasets/BUILD
index 2c7215b..d6a43ee 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/BUILD
+++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD
@@ -37,6 +37,7 @@
 py_binary(
     name = "produce_small_datasets",
     srcs = ["produce_small_datasets.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
@@ -48,6 +49,7 @@
     name = "base_test",
     size = "small",
     srcs = ["base_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
@@ -59,6 +61,7 @@
     name = "load_csv_test",
     size = "small",
     srcs = ["load_csv_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
@@ -70,6 +73,7 @@
     name = "synthetic_test",
     size = "small",
     srcs = ["synthetic_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
diff --git a/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
index b302250..937a493 100644
--- a/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
+++ b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
@@ -22,12 +22,17 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.platform import resource_loader
+from tensorflow.python.util.deprecation import deprecated
 
 
 _libsvm_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_libsvm_ops.so"))
 
 
+@deprecated(None,
+            'tf.contrib.libsvm will be removed in 2.0, the support for libsvm '
+            'format will continue to be provided in tensorflow-io: '
+            'https://github.com/tensorflow/io')
 def decode_libsvm(content, num_features, dtype=None, label_dtype=None):
   """Convert Libsvm records to a tensor of label and a tensor of feature.
 
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index 7534b50..ec0cbf9 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -40,6 +40,7 @@
     name = "sdca_ops_test",
     size = "medium",
     srcs = ["python/kernel_tests/sdca_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
@@ -80,6 +81,7 @@
     name = "sharded_mutable_dense_hashtable_test",
     size = "small",
     srcs = ["python/ops/sharded_mutable_dense_hashtable_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":sharded_mutable_dense_hashtable_py",
@@ -100,6 +102,7 @@
     name = "sparse_feature_column_test",
     size = "small",
     srcs = ["python/ops/sparse_feature_column_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":sparse_feature_column_py",
@@ -130,6 +133,7 @@
 py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index f4ebbde..c51b651 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -39,6 +39,7 @@
 py_test(
     name = "loss_ops_test",
     srcs = glob(["python/losses/loss_ops_test.py"]),
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":losses_py",
@@ -86,6 +87,7 @@
     srcs = [
         "python/metric_learning/metric_loss_ops_test.py",
     ],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 9d3aaa5..13f8431 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -651,7 +651,9 @@
 $(wildcard tensorflow/core/util/*/*.cc) \
 $(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \
 tensorflow/core/profiler/internal/profiler_interface.cc \
+tensorflow/core/profiler/internal/traceme_recorder.cc \
 tensorflow/core/profiler/lib/profiler_session.cc \
+tensorflow/core/profiler/lib/traceme.cc \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index c472b27..ac54c0c 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -8,7 +8,6 @@
 tensorflow/core/kernels/aggregate_ops.cc
 tensorflow/core/kernels/argmax_op.cc
 tensorflow/core/kernels/avgpooling_op.cc
-tensorflow/core/kernels/batch_matmul_op_common.cc
 tensorflow/core/kernels/batch_matmul_op_real.cc
 tensorflow/core/kernels/batch_norm_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
diff --git a/tensorflow/contrib/meta_graph_transform/BUILD b/tensorflow/contrib/meta_graph_transform/BUILD
index 2440078..d667b8e 100644
--- a/tensorflow/contrib/meta_graph_transform/BUILD
+++ b/tensorflow/contrib/meta_graph_transform/BUILD
@@ -36,6 +36,7 @@
     name = "meta_graph_transform_test",
     size = "small",
     srcs = ["meta_graph_transform_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:private"],
     deps = [
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index 21cd34f..aee6817 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -51,6 +51,7 @@
 py_test(
     name = "classification_test",
     srcs = ["python/metrics/classification_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":metrics_py",
@@ -64,6 +65,7 @@
     name = "histogram_ops_test",
     size = "medium",
     srcs = ["python/kernel_tests/histogram_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":metrics_py",
@@ -78,6 +80,7 @@
 py_test(
     name = "metric_ops_test",
     srcs = ["python/ops/metric_ops_test.py"],
+    python_version = "PY2",
     shard_count = 30,
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
@@ -103,6 +106,7 @@
     name = "metric_ops_large_test",
     size = "large",
     srcs = ["python/ops/metric_ops_large_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
     deps = [
diff --git a/tensorflow/contrib/mixed_precision/python/BUILD b/tensorflow/contrib/mixed_precision/python/BUILD
index 1d769e1..3982139 100644
--- a/tensorflow/contrib/mixed_precision/python/BUILD
+++ b/tensorflow/contrib/mixed_precision/python/BUILD
@@ -28,6 +28,7 @@
     name = "loss_scale_manager_test",
     size = "small",
     srcs = ["loss_scale_manager_test.py"],
+    python_version = "PY2",
     deps = [
         ":loss_scale_manager",
         "//tensorflow/python:client_testlib",
@@ -62,6 +63,7 @@
     name = "loss_scale_optimizer_test",
     size = "small",
     srcs = ["loss_scale_optimizer_test.py"],
+    python_version = "PY2",
     deps = [
         ":loss_scale_optimizer",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index 3cffd76..fa7eb9f 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -46,6 +46,7 @@
     name = "layers_test",
     size = "small",
     srcs = ["python/layers/layers_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers",
@@ -115,6 +116,7 @@
     name = "pruning_utils_test",
     size = "medium",
     srcs = ["python/pruning_utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pruning_utils",
@@ -127,6 +129,7 @@
     name = "pruning_test",
     size = "small",
     srcs = ["python/pruning_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pruning",
@@ -138,6 +141,7 @@
     name = "rnn_cells_test",
     size = "small",
     srcs = ["python/layers/rnn_cells_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pruning",
@@ -150,6 +154,7 @@
     name = "strip_pruning_vars_test",
     size = "small",
     srcs = ["python/strip_pruning_vars_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":layers",
@@ -163,6 +168,7 @@
 py_binary(
     name = "strip_pruning_vars",
     srcs = ["python/strip_pruning_vars.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
index 30ea912..805a6ea 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
@@ -48,6 +48,7 @@
     srcs = [
         "cifar10_eval.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":cifar10_pruning",
@@ -61,6 +62,7 @@
     srcs = [
         "cifar10_train.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":cifar10_pruning",
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index ef7ab22..e3e36c4 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -44,6 +44,7 @@
     name = "alpha_dropout_test",
     size = "small",
     srcs = ["python/ops/alpha_dropout_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":nn_py",
@@ -61,6 +62,7 @@
     name = "fwd_gradients_test",
     size = "small",
     srcs = ["python/ops/fwd_gradients_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":nn_py",
@@ -74,6 +76,7 @@
     name = "sampling_ops_test",
     size = "small",
     srcs = ["python/ops/sampling_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":nn_py",
@@ -89,6 +92,7 @@
     name = "scaled_softplus_test",
     size = "small",
     srcs = ["python/ops/scaled_softplus_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":nn_py",
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index f30643c..6c85533 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -65,6 +65,7 @@
 py_test(
     name = "adam_gs_optimizer_test",
     srcs = ["python/training/adam_gs_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -80,6 +81,7 @@
 py_test(
     name = "adamax_test",
     srcs = ["python/training/adamax_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -95,6 +97,7 @@
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no-internal-py3",
@@ -115,6 +118,7 @@
 py_test(
     name = "moving_average_optimizer_test",
     srcs = ["python/training/moving_average_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "notsan",  # b/31055119
@@ -151,6 +155,7 @@
 py_test(
     name = "multitask_optimizer_wrapper_test",
     srcs = ["python/training/multitask_optimizer_wrapper_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -168,6 +173,7 @@
 py_test(
     name = "lazy_adam_gs_optimizer_test",
     srcs = ["python/training/lazy_adam_gs_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -187,6 +193,7 @@
 py_test(
     name = "lazy_adam_optimizer_test",
     srcs = ["python/training/lazy_adam_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -206,6 +213,7 @@
 py_test(
     name = "reg_adagrad_optimizer_test",
     srcs = ["python/training/reg_adagrad_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -225,6 +233,7 @@
 py_test(
     name = "nadam_optimizer_test",
     srcs = ["python/training/nadam_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -244,6 +253,7 @@
 py_test(
     name = "weight_decay_optimizers_test",
     srcs = ["python/training/weight_decay_optimizers_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -348,6 +358,7 @@
 py_test(
     name = "sign_decay_test",
     srcs = ["python/training/sign_decay_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -358,6 +369,7 @@
 py_test(
     name = "addsign_test",
     srcs = ["python/training/addsign_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -377,6 +389,7 @@
 py_test(
     name = "powersign_test",
     srcs = ["python/training/powersign_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -396,6 +409,7 @@
 py_test(
     name = "ggt_test",
     srcs = ["python/training/ggt_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -415,6 +429,7 @@
     name = "shampoo_test",
     size = "medium",
     srcs = ["python/training/shampoo_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
@@ -435,6 +450,7 @@
 py_test(
     name = "lars_optimizer_test",
     srcs = ["python/training/lars_optimizer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
@@ -450,6 +466,7 @@
 py_test(
     name = "matrix_functions_test",
     srcs = ["python/training/matrix_functions_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index b469ebf..1c8cdc5 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -143,11 +143,6 @@
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
-    # The optimizer and Dense layers also save get_config() JSON
-    expected_checkpoint_names.extend([
-        "model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
-        "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"
-    ])
     named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index bb9743d..7bcf07f 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -188,7 +188,7 @@
     return True
   # Don't need to do anything special in graph mode, since dynamic values
   # will propagate correctly automatically.
-  # TODO(josh11b): Add per-device caching across steps using variables for
+  # TODO(josh11b): Add per-replica caching across steps using variables for
   # truly static values once we add distributed support.
   if context.executing_eagerly() and isinstance(
       value, resource_variable_ops.ResourceVariable):
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index db2b114..3767407 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -77,6 +77,7 @@
 py_test(
     name = "periodic_resample_op_test",
     srcs = ["python/kernel_tests/periodic_resample_op_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "notap",
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index 53a3bc6..3189bb9 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -110,6 +110,7 @@
     name = "saved_model_predictor_test",
     srcs = ["saved_model_predictor_test.py"],
     data = [":test_export_dir"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -126,6 +127,7 @@
     name = "predictor_factories_test",
     srcs = ["predictor_factories_test.py"],
     data = [":test_export_dir"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -137,6 +139,7 @@
 py_test(
     name = "core_estimator_predictor_test",
     srcs = ["core_estimator_predictor_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -150,6 +153,7 @@
 py_test(
     name = "contrib_estimator_predictor_test",
     srcs = ["contrib_estimator_predictor_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index ba69515..598f6d1 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -17,6 +17,7 @@
     name = "common_test",
     size = "small",
     srcs = ["python/common_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":common",
@@ -46,6 +47,7 @@
     name = "graph_matcher_test",
     size = "small",
     srcs = ["python/graph_matcher_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_matcher",
@@ -75,6 +77,7 @@
     name = "input_to_ops_test",
     size = "small",
     srcs = ["python/input_to_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":input_to_ops",
@@ -112,6 +115,7 @@
 py_test(
     name = "fold_batch_norms_test",
     srcs = ["python/fold_batch_norms_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":fold_batch_norms",
@@ -152,6 +156,7 @@
     name = "quant_ops_test",
     size = "small",
     srcs = ["python/quant_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":quant_ops",
@@ -185,6 +190,7 @@
     name = "quantize_test",
     size = "small",
     srcs = ["python/quantize_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":quantize",
@@ -204,6 +210,7 @@
     name = "quantize_parameterized_test",
     size = "medium",
     srcs = ["python/quantize_parameterized_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     # TODO(b/118839526): Re-enable msan test.
@@ -243,6 +250,7 @@
     name = "quantize_graph_test",
     size = "small",
     srcs = ["python/quantize_graph_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":quantize_graph",
diff --git a/tensorflow/contrib/quantize/python/common.py b/tensorflow/contrib/quantize/python/common.py
index e6c04bc..3c553d0 100644
--- a/tensorflow/contrib/quantize/python/common.py
+++ b/tensorflow/contrib/quantize/python/common.py
@@ -115,7 +115,8 @@
           dtype=dtypes.int64,
           initializer=init_ops.zeros_initializer(),
           trainable=False,
-          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+          aggregation=variable_scope.VariableAggregation.ONLY_FIRST_REPLICA)
       with g.name_scope(quantization_step_tensor.op.name + '/'):
         # We return the incremented variable tensor. Since this is used in conds
         # for quant_delay and freeze_bn_delay, it will run once per graph
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 39082ca..ecee254 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -55,7 +55,8 @@
       shape=shape,
       initializer=initializer,
       collections=collections,
-      trainable=trainable)
+      trainable=trainable,
+      aggregation=variable_scope.VariableAggregation.MEAN)
 
 
 def LastValueQuantize(inputs,
@@ -161,12 +162,12 @@
         # than the positive range.
         min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)
 
-      # TFLite requires that 0.0 if always in the [min; max] range. Because
+      # TFLite requires that 0.0 is always in the [min; max] range. Because
       # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
       range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
       range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
     else:
-      # TFLite requires that 0.0 if always in the [min; max] range.
+      # TFLite requires that 0.0 is always in the [min; max] range.
       range_min = math_ops.minimum(batch_min, 0.0)
       range_max = math_ops.maximum(batch_max, 0.0)
 
@@ -286,12 +287,12 @@
         # than the positive range.
         min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)
 
-      # TFLite requires that 0.0 if always in the [min; max] range. Because
+      # TFLite requires that 0.0 is always in the [min; max] range. Because
       # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
       range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
       range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
     else:
-      # TFLite requires that 0.0 if always in the [min; max] range.
+      # TFLite requires that 0.0 is always in the [min; max] range.
       range_min = math_ops.minimum(batch_min, 0.0)
       range_max = math_ops.maximum(batch_max, 0.0)
 
diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD
index 76db9ae..4a60b47 100644
--- a/tensorflow/contrib/rate/BUILD
+++ b/tensorflow/contrib/rate/BUILD
@@ -34,6 +34,7 @@
     name = "rate_test",
     size = "small",
     srcs = ["rate_test.py"],
+    python_version = "PY2",
     tags = [
         "manual",  # TODO(b/120555555)
         "no_oss",  # TODO(b/120555555)
diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD
index 9325a14..18ef020 100644
--- a/tensorflow/contrib/receptive_field/BUILD
+++ b/tensorflow/contrib/receptive_field/BUILD
@@ -62,6 +62,7 @@
 py_test(
     name = "graph_compute_order_test",
     srcs = ["python/util/graph_compute_order_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_compute_order_py",
@@ -78,6 +79,7 @@
 py_test(
     name = "parse_layer_parameters_test",
     srcs = ["python/util/parse_layer_parameters_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":graph_compute_order_py",
@@ -94,6 +96,7 @@
 py_test(
     name = "receptive_field_test",
     srcs = ["python/util/receptive_field_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":receptive_field_py",
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 3aa8a14..274bdbe 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -37,6 +37,7 @@
     name = "remote_fused_graph_ops_test",
     size = "small",
     srcs = ["python/ops/remote_fused_graph_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":remote_fused_graph_ops_py",
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 83a7525..66fadcc 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -99,6 +99,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = ["optonly"],
     xla_enabled = True,
 )
 
@@ -345,6 +346,7 @@
 py_binary(
     name = "checkpoint_convert",
     srcs = ["python/tools/checkpoint_convert.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":checkpoint_convert_lib"],
 )
@@ -368,6 +370,7 @@
     name = "checkpoint_convert_test",
     size = "small",
     srcs = ["python/tools/checkpoint_convert_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index f0242a3..969ff19 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -69,6 +69,7 @@
     name = "reader_test",
     size = "small",
     srcs = ["python/saved_model/reader_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 35da873..d92a5ec 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -90,6 +90,7 @@
                memory_layer=None,
                check_inner_dims_defined=True,
                score_mask_value=None,
+               custom_key_value_fn=None,
                name=None):
     """Construct base AttentionMechanism class.
 
@@ -114,6 +115,8 @@
       score_mask_value: (optional): The mask value for score before passing into
         `probability_fn`. The default is -inf. Only used if
         `memory_sequence_length` is not None.
+      custom_key_value_fn: (optional): Callable taking the computed
+        `(keys, values)` and returning the replacement `(keys, values)`.
       name: Name to use when creating ops.
     """
     if (query_layer is not None and
@@ -148,6 +151,8 @@
       self._keys = (
           self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
           else self._values)
+      if custom_key_value_fn is not None:
+        self._keys, self._values = custom_key_value_fn(self._keys, self._values)
       self._batch_size = (
           tensor_shape.dimension_value(self._keys.shape[0]) or
           array_ops.shape(self._keys)[0])
@@ -672,6 +677,7 @@
                probability_fn=None,
                score_mask_value=None,
                dtype=None,
+               custom_key_value_fn=None,
                name="LuongAttention"):
     """Construct the AttentionMechanism mechanism.
 
@@ -691,6 +697,8 @@
         `probability_fn`. The default is -inf. Only used if
         `memory_sequence_length` is not None.
       dtype: The data type for the memory layer of the attention mechanism.
+      custom_key_value_fn: (optional): Callable invoked as `fn(keys, values)`;
+        must return the `(keys, values)` pair the mechanism should use.
       name: Name to use when creating ops.
     """
     # For LuongAttention, we only transform the memory layer; thus
@@ -708,6 +716,7 @@
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
         score_mask_value=score_mask_value,
+        custom_key_value_fn=custom_key_value_fn,
         name=name)
     self._num_units = num_units
     self._scale = scale
@@ -930,6 +939,7 @@
                probability_fn=None,
                score_mask_value=None,
                dtype=None,
+               custom_key_value_fn=None,
                name="BahdanauAttention"):
     """Construct the Attention mechanism.
 
@@ -937,7 +947,7 @@
       num_units: The depth of the query mechanism.
       memory: The memory to query; usually the output of an RNN encoder.  This
         tensor should be shaped `[batch_size, max_time, ...]`.
-      memory_sequence_length (optional): Sequence lengths for the batch entries
+      memory_sequence_length: (optional) Sequence lengths for the batch entries
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
@@ -950,6 +960,8 @@
         `memory_sequence_length` is not None.
       dtype: The data type for the query and memory layers of the attention
         mechanism.
+      custom_key_value_fn: (optional): Callable invoked as `fn(keys, values)`;
+        must return the `(keys, values)` pair the mechanism should use.
       name: Name to use when creating ops.
     """
     if probability_fn is None:
@@ -964,6 +976,7 @@
             num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
         probability_fn=wrapped_probability_fn,
+        custom_key_value_fn=custom_key_value_fn,
         memory_sequence_length=memory_sequence_length,
         score_mask_value=score_mask_value,
         name=name)
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 9c08859..abeaed1 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -65,6 +65,7 @@
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     ],
     main = "bundle_shim_test.py",
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -114,6 +115,7 @@
     name = "exporter_test",
     size = "small",
     srcs = ["exporter_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:private"],
     deps = [
@@ -150,6 +152,7 @@
 py_test(
     name = "gc_test",
     srcs = ["gc_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
@@ -266,6 +269,7 @@
     srcs = ["session_bundle_test.py"],
     data = [":session_bundle_half_plus_two"],
     main = "session_bundle_test.py",
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
diff --git a/tensorflow/contrib/session_bundle/example/BUILD b/tensorflow/contrib/session_bundle/example/BUILD
index 9a56eab..18a0759 100644
--- a/tensorflow/contrib/session_bundle/example/BUILD
+++ b/tensorflow/contrib/session_bundle/example/BUILD
@@ -15,6 +15,7 @@
     srcs = [
         "export_half_plus_two.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/contrib/session_bundle/exporter_test.py b/tensorflow/contrib/session_bundle/exporter_test.py
index 68419ff..33f10a4 100644
--- a/tensorflow/contrib/session_bundle/exporter_test.py
+++ b/tensorflow/contrib/session_bundle/exporter_test.py
@@ -88,12 +88,12 @@
       asset_file = constant_op.constant(asset_filepath_orig, name="filename42")
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, asset_file)
 
-      with gfile.FastGFile(asset_filepath_orig, "w") as f:
+      with gfile.GFile(asset_filepath_orig, "w") as f:
         f.write("your data here")
       assets_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
 
       ignored_asset = os.path.join(test.get_temp_dir(), "ignored.txt")
-      with gfile.FastGFile(ignored_asset, "w") as f:
+      with gfile.GFile(ignored_asset, "w") as f:
         f.write("additional data here")
 
       variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index 516e3ea..96e2dce 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -23,6 +23,7 @@
 py_test(
     name = "evaluation_test",
     srcs = ["python/slim/evaluation_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":evaluation",
@@ -70,6 +71,7 @@
 py_test(
     name = "learning_test",
     srcs = ["python/slim/learning_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -168,6 +170,7 @@
 py_test(
     name = "summaries_test",
     srcs = ["python/slim/summaries_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":summaries",
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index fb96d93..2f6b006 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -346,7 +346,7 @@
   with slim.arg_scope([slim.conv2d], padding='SAME',
                       weights_initializer=tf.truncated_normal_initializer(stddev=0.01)
                       weights_regularizer=slim.l2_regularizer(0.0005)):
-    net = slim.conv2d(inputs, 64, [11, 11], scope='conv1')
+    net = slim.conv2d(inputs, 64, [11, 11], 4, scope='conv1')
     net = slim.conv2d(net, 128, [11, 11], padding='VALID', scope='conv2')
     net = slim.conv2d(net, 256, [11, 11], scope='conv3')
 ```
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index eef043e..f1b5736 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -60,6 +60,7 @@
 py_test(
     name = "dataset_data_provider_test",
     srcs = ["dataset_data_provider_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
@@ -98,6 +99,7 @@
     name = "parallel_reader_test",
     size = "small",
     srcs = ["parallel_reader_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":parallel_reader",
@@ -130,6 +132,7 @@
     name = "prefetch_queue_test",
     size = "small",
     srcs = ["prefetch_queue_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":prefetch_queue",
@@ -179,6 +182,7 @@
 py_test(
     name = "tfexample_decoder_test",
     srcs = ["tfexample_decoder_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":tfexample_decoder",
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index e9595d1..f19177b 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -45,6 +45,7 @@
     name = "alexnet_test",
     size = "medium",
     srcs = ["alexnet_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":alexnet",
@@ -117,6 +118,7 @@
     name = "inception_v1_test",
     size = "medium",
     srcs = ["inception_v1_test.py"],
+    python_version = "PY2",
     shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
@@ -137,6 +139,7 @@
     name = "inception_v2_test",
     size = "medium",
     srcs = ["inception_v2_test.py"],
+    python_version = "PY2",
     shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
@@ -157,6 +160,7 @@
     name = "inception_v3_test",
     size = "medium",
     srcs = ["inception_v3_test.py"],
+    python_version = "PY2",
     shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
@@ -191,6 +195,7 @@
     name = "overfeat_test",
     size = "medium",
     srcs = ["overfeat_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":overfeat",
@@ -235,6 +240,7 @@
     name = "resnet_v1_test",
     size = "medium",
     srcs = ["resnet_v1_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
@@ -271,6 +277,7 @@
     name = "resnet_v2_test",
     size = "medium",
     srcs = ["resnet_v2_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
@@ -307,6 +314,7 @@
     name = "vgg_test",
     size = "medium",
     srcs = ["vgg_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":vgg",
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index 7dd52df..4085801 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -9,6 +9,7 @@
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":summary",
@@ -29,6 +30,7 @@
 py_test(
     name = "summary_ops_graph_test",
     srcs = ["summary_ops_graph_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":summary",
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 42898e7..e0159a8 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -64,7 +64,7 @@
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.python.ops.summary_ops_v2 import all_summary_ops
+from tensorflow.python.ops.summary_ops_v2 import all_v2_summary_ops as all_summary_ops
 from tensorflow.python.ops.summary_ops_v2 import always_record_summaries
 from tensorflow.python.ops.summary_ops_v2 import audio
 from tensorflow.python.ops.summary_ops_v2 import create_db_writer
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 583bbf9..a7f8819 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -436,6 +436,7 @@
     name = "eval_metrics_test",
     size = "small",
     srcs = ["client/eval_metrics_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":eval_metrics",
@@ -461,6 +462,7 @@
     name = "scatter_add_ndim_op_test",
     size = "small",
     srcs = ["python/kernel_tests/scatter_add_ndim_op_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
@@ -502,6 +504,7 @@
     name = "tensor_forest_test",
     size = "small",
     srcs = ["python/tensor_forest_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":tensor_forest_py",
@@ -539,6 +542,7 @@
     name = "random_forest_test",
     size = "medium",
     srcs = ["client/random_forest_test.py"],
+    python_version = "PY2",
     shard_count = 6,
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index b7185e0..64176a0 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -122,6 +122,7 @@
     name = "hybrid_layer_test",
     size = "small",
     srcs = ["python/hybrid_layer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":fully_connected_layer",
@@ -163,6 +164,7 @@
     name = "routing_function_op_test",
     size = "small",
     srcs = ["python/kernel_tests/routing_function_op_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -177,6 +179,7 @@
     name = "k_feature_routing_function_op_test",
     size = "small",
     srcs = ["python/kernel_tests/k_feature_routing_function_op_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -206,6 +209,7 @@
 py_test(
     name = "decisions_to_data_test",
     srcs = ["python/layers/decisions_to_data_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
@@ -247,6 +251,7 @@
     name = "decisions_to_data_then_nn_test",
     size = "small",
     srcs = ["python/models/decisions_to_data_then_nn_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_then_nn",
@@ -274,6 +279,7 @@
     name = "k_feature_decisions_to_data_then_nn_test",
     size = "small",
     srcs = ["python/models/k_feature_decisions_to_data_then_nn_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":k_feature_decisions_to_data_then_nn",
@@ -301,6 +307,7 @@
     name = "forest_to_data_then_nn_test",
     size = "small",
     srcs = ["python/models/forest_to_data_then_nn_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":forest_to_data_then_nn",
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 7f0b325..85070cf 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -50,6 +50,7 @@
     name = "projector_api_test",
     size = "small",
     srcs = ["plugins/projector/projector_api_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":projector",
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index a434c12..9f9e19a 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -97,6 +97,7 @@
     name = "skip_gram_ops_test",
     size = "medium",
     srcs = ["python/ops/skip_gram_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":text_py",
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 70c3a07..235f3ad 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -17,6 +17,7 @@
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [":predict_main_lib"],
@@ -42,6 +43,7 @@
     timeout = "long",  # Moderate but for asan
     srcs = ["predict_test.py"],
     data = ["data/period_trend.csv"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
@@ -56,6 +58,7 @@
 py_binary(
     name = "known_anomaly",
     srcs = ["known_anomaly.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [":known_anomaly_main_lib"],
@@ -80,6 +83,7 @@
     name = "known_anomaly_test",
     timeout = "long",  # Moderate but for asan
     srcs = ["known_anomaly_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":known_anomaly_main_lib",
@@ -90,6 +94,7 @@
 py_binary(
     name = "multivariate",
     srcs = ["multivariate.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [":multivariate_main_lib"],
@@ -116,6 +121,7 @@
     srcs = [
         "multivariate_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":multivariate_main_lib",
@@ -126,6 +132,7 @@
 py_binary(
     name = "lstm",
     srcs = ["lstm.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     visibility = ["//visibility:public"],
@@ -155,6 +162,7 @@
     name = "lstm_test",
     timeout = "long",  # Moderate but for asan
     srcs = ["lstm_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 449ec8b..ae2c4a5 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -104,6 +104,7 @@
     srcs = [
         "estimators_test.py",
     ],
+    python_version = "PY2",
     shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
@@ -159,6 +160,7 @@
     srcs = [
         "head_test.py",
     ],
+    python_version = "PY2",
     shard_count = 10,
     srcs_version = "PY2AND3",
     tags = [
@@ -214,6 +216,7 @@
     srcs = [
         "model_utils_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
@@ -249,6 +252,7 @@
     srcs = [
         "state_management_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -314,6 +318,7 @@
     srcs = [
         "input_pipeline_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",  # b/63709811
@@ -390,6 +395,7 @@
     srcs = [
         "ar_model_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -443,6 +449,7 @@
     srcs = [
         "math_utils_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index cf5e749..08eafec 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -39,6 +39,7 @@
     name = "state_space_model_test",
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "no_mac",
@@ -180,6 +181,7 @@
     name = "structural_ensemble_test",
     timeout = "long",  # Moderate but for asan/tsan/msan timeouts
     srcs = ["structural_ensemble_test.py"],
+    python_version = "PY2",
     shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
index 41aa4d2..d85aae6 100644
--- a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
@@ -19,5 +19,5 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.tpu._tpu_estimator_embedding import *
+from tensorflow_estimator.python.estimator.tpu._tpu_estimator_embedding import *
 # pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
index 1b1328b..9cbb508 100644
--- a/tensorflow/contrib/tpu/python/tpu/error_handling.py
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -19,5 +19,5 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.tpu.error_handling import *
+from tensorflow_estimator.python.estimator.tpu.error_handling import *
 # pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 50faac3..01b1b4a 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -1641,7 +1641,7 @@
         validation_split=validation_split)
 
     # Prepare validation data
-    val_x, val_y, val_sample_weights = self._prepare_validation_data(
+    x, y, val_x, val_y, val_sample_weights = self._prepare_validation_data(
         validation_data, validation_split, validation_steps, x, y,
         sample_weights, batch_size)
     return self._pipeline_fit_loop(
@@ -1934,7 +1934,7 @@
       batch_size: The training batch size (if provided)
 
     Returns:
-      A 3-tuple of (val_x, val_y, val_sample_weights).
+      A 5-tuple of (x, y, val_x, val_y, val_sample_weights).
 
     Raises:
       ValueError: If the provided arguments are not compatible with
@@ -1991,7 +1991,7 @@
       val_y = None
       val_sample_weights = None
 
-    return val_x, val_y, val_sample_weights
+    return x, y, val_x, val_y, val_sample_weights
 
   def predict(self,
               x,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index c36aaa3..2c9bce0 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -19,5 +19,5 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.tpu.tpu_config import *
+from tensorflow_estimator.python.estimator.tpu.tpu_config import *
 # pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index b77b010..573f49b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -19,5 +19,5 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.tpu.tpu_context import *
+from tensorflow_estimator.python.estimator.tpu.tpu_context import *
 # pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 8931184..0ee4906 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -19,15 +19,15 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import,redefined-builtin
-from tensorflow.python.tpu.tpu_estimator import *
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import *
 # used by tests
-from tensorflow.python.tpu.tpu_estimator import _clone_export_output_with_tensors
-from tensorflow.python.tpu.tpu_estimator import _create_global_step
-from tensorflow.python.tpu.tpu_estimator import _export_output_to_tensors
-from tensorflow.python.tpu.tpu_estimator import _get_scaffold
-from tensorflow.python.tpu.tpu_estimator import _Inputs
-from tensorflow.python.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
-from tensorflow.python.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
-from tensorflow.python.tpu.tpu_estimator import _TPU_ESTIMATOR
-from tensorflow.python.tpu.tpu_estimator import _TPU_TRAIN_OP
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _clone_export_output_with_tensors
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _create_global_step
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _export_output_to_tensors
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _get_scaffold
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _Inputs
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ESTIMATOR
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_TRAIN_OP
 # pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py
index 8d9b70d..6e0da24 100644
--- a/tensorflow/contrib/tpu/python/tpu/util.py
+++ b/tensorflow/contrib/tpu/python/tpu/util.py
@@ -19,5 +19,5 @@
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.tpu.util import *
+from tensorflow_estimator.python.estimator.tpu.util import *
 # pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 5bc4c3b..8f1d5ce 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -70,6 +70,7 @@
     name = "device_setter_test",
     size = "small",
     srcs = ["python/training/device_setter_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":training_py",
@@ -85,6 +86,7 @@
     name = "sequence_queueing_state_saver_test",
     size = "medium",
     srcs = ["python/training/sequence_queueing_state_saver_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":training_py",
@@ -103,6 +105,7 @@
     name = "batch_sequences_with_states_test",
     size = "medium",
     srcs = ["python/training/batch_sequences_with_states_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -126,6 +129,7 @@
     name = "feeding_queue_runner_test",
     size = "medium",
     srcs = ["python/training/feeding_queue_runner_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:client_testlib",
@@ -141,6 +145,7 @@
     name = "hparam_test",
     size = "small",
     srcs = ["python/training/hparam_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":training_py",
@@ -152,6 +157,7 @@
     name = "resample_test",
     size = "small",
     srcs = ["python/training/resample_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":training_py",
@@ -169,6 +175,7 @@
     name = "sampling_ops_test",
     size = "small",
     srcs = ["python/training/sampling_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":training_py",
@@ -192,6 +199,7 @@
     name = "sampling_ops_threading_test",
     size = "small",
     srcs = ["python/training/sampling_ops_threading_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "manual",
@@ -214,6 +222,7 @@
     name = "bucket_ops_test",
     size = "medium",
     srcs = ["python/training/bucket_ops_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["manual"],
     deps = [
@@ -234,6 +243,7 @@
     name = "evaluation_test",
     size = "small",
     srcs = ["python/training/evaluation_test.py"],
+    python_version = "PY2",
     shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
@@ -266,6 +276,7 @@
     name = "training_test",
     size = "medium",
     srcs = ["python/training/training_test.py"],
+    python_version = "PY2",
     shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
index af29abd..0f92ed3 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -15,11 +15,8 @@
 
 #ifdef TENSORFLOW_USE_VERBS
 
-#include "grpcpp/alarm.h"
-#include "grpcpp/grpcpp.h"
-#include "grpcpp/server_builder.h"
-
 #include "tensorflow/contrib/verbs/grpc_verbs_service.h"
+
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h
index e616778..97da84e 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.h
@@ -18,6 +18,9 @@
 
 #ifdef TENSORFLOW_USE_VERBS
 
+#include "grpcpp/alarm.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/server_builder.h"
 #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
 #include "tensorflow/contrib/verbs/rdma_mgr.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index af8425a..681f2ab 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -136,6 +136,8 @@
     "tf_additional_libdevice_deps",
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
+    "tf_additional_monitoring_hdrs",
+    "tf_additional_monitoring_srcs",
     "tf_additional_mpi_lib_defines",
     "tf_additional_numa_copts",
     "tf_additional_numa_deps",
@@ -244,6 +246,7 @@
     "example/example_parser_configuration.proto",
     "protobuf/trackable_object_graph.proto",
     "protobuf/control_flow.proto",
+    "protobuf/data/experimental/snapshot.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
     "protobuf/meta_graph.proto",
@@ -587,6 +590,7 @@
         "platform/cpu_feature_guard.h",
         "platform/error.h",
         "platform/fingerprint.h",
+        "platform/monitoring.h",
         "platform/net.h",
         "platform/notification.h",
         "platform/prefetch.h",
@@ -598,7 +602,7 @@
         "platform/stacktrace_handler.h",
         "platform/strong_hash.h",
         "platform/subprocess.h",
-    ],
+    ] + tf_additional_monitoring_hdrs(),
     visibility = ["//visibility:private"],
 )
 
@@ -907,6 +911,7 @@
         "framework/kernel_def_builder.h",
         "framework/kernel_def_util.h",
         "framework/log_memory.h",
+        "framework/logging.h",
         "framework/lookup_interface.h",
         "framework/memory_types.h",
         "framework/node_def_builder.h",
@@ -939,12 +944,13 @@
         "framework/tracking_allocator.h",
         "framework/type_index.h",
         "framework/type_traits.h",
+        "framework/typed_allocator.h",
         "framework/types.h",
         "public/version.h",
         "util/activation_mode.h",
         "util/batch_util.h",
         "util/bcast.h",
-        "util/cuda_kernel_helper.h",
+        "util/matmul_bcast.h",
         "util/device_name_utils.h",
         "util/dump_graph.h",
         "util/events_writer.h",
@@ -1172,7 +1178,10 @@
     op_lib_names = [
         "array_ops",
     ],
-    deps = [":protos_all_cc"],
+    deps = [
+        ":lib",
+        ":protos_all_cc",
+    ],
 )
 
 tf_gen_op_libs(
@@ -1400,6 +1409,7 @@
         ":framework",
         ":lib",
         ":math_ops_op_lib",
+        ":protos_all_cc",
     ],
     alwayslink = 1,
 )
@@ -1635,9 +1645,14 @@
     ] + if_dynamic_kernels(
         [],
         otherwise = [
+            "//tensorflow/core/kernels:aggregate_ops",
+            "//tensorflow/core/kernels:bcast_ops",
             "//tensorflow/core/kernels:cast_op",
             "//tensorflow/core/kernels:constant_op",
+            "//tensorflow/core/kernels:identity_op",
             "//tensorflow/core/kernels:random_ops",
+            "//tensorflow/core/kernels:reduction_ops",
+            "//tensorflow/core/kernels:reshape_op",
         ],
     ),
 )
@@ -1668,6 +1683,13 @@
 )
 
 # -----------------------------------------------------------------------------
+# MKL targets
+cc_library(
+    name = "mkl_graph_util",
+    hdrs = ["graph/mkl_graph_util.h"],
+)
+
+# -----------------------------------------------------------------------------
 # Public Android targets
 
 # Android-specific BUILD targets
@@ -1724,6 +1746,10 @@
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
             "platform/**/logger.cc",
+            # Exclude env_time and logging to avoid collisions with
+            # :platform_base, a common dependency for downstream targets.
+            "platform/**/env_time.cc",
+            "platform/**/logging.cc",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
             "platform/rocm.h",
@@ -1750,10 +1776,12 @@
 filegroup(
     name = "mobile_srcs_only_runtime",
     srcs = [
+        "//tensorflow/core/common_runtime/eager:srcs",
         "//tensorflow/core/kernels:android_srcs",
         "//tensorflow/core/util/ctc:android_srcs",
         "//tensorflow/core/util/tensor_bundle:android_srcs",
         "//tensorflow/c:srcs",
+        "//tensorflow/c/eager:srcs",
     ] + glob(
         [
             "common_runtime/**/*.h",
@@ -1770,7 +1798,6 @@
             "**/*testlib*",
             "**/*main.cc",
             "common_runtime/gpu/**/*",
-            "common_runtime/eager/*",
             "common_runtime/gpu_device_factory.*",
             "graph/dot.*",
         ],
@@ -1819,6 +1846,7 @@
         ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
+        "@farmhash_archive//:farmhash",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1843,6 +1871,7 @@
         ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
+        "@farmhash_archive//:farmhash",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1852,6 +1881,7 @@
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        ":platform_base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -1863,7 +1893,6 @@
     srcs = if_emscripten([":mobile_srcs_no_runtime"]),
     copts = ["-DSUPPORT_SELECTIVE_REGISTRATION"] + tf_opts_nortti_if_emscripten(),
     defines = ["TENSORFLOW_LITE_PROTOS"],
-    linkopts = ["-lz"],
     tags = [
         "manual",
         "notap",
@@ -1875,6 +1904,7 @@
         ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
+        "@farmhash_archive//:farmhash",
         "@nsync//:nsync_cpp",
         "@zlib_archive//:zlib",
     ],
@@ -1917,6 +1947,7 @@
         ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
+        "@farmhash_archive//:farmhash",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -2186,6 +2217,13 @@
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/variable_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/variable.proto",
+    visibility = ["//visibility:public"],
+)
+
 # -----------------------------------------------------------------------------
 # Internal targets
 
@@ -2323,6 +2361,7 @@
     "platform/denormal.h",
     "platform/host_info.h",
     "platform/platform.h",
+    "platform/monitoring.h",
     "platform/protobuf_internal.h",
     "platform/setround.h",
     "platform/snappy.h",
@@ -2383,6 +2422,7 @@
             "lib/jpeg/**/*",
             "lib/png/**/*",
             "platform/**/env_time.cc",
+            "platform/**/monitoring.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
             "platform/**/logger.cc",
@@ -2397,6 +2437,7 @@
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/rocm.h",
+            "platform/**/monitoring.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
@@ -2408,7 +2449,7 @@
         # Protobuf deps already included through the ":lib_proto_parsing"
         # dependency.
         tf_additional_proto_srcs(),
-    ),
+    ) + tf_additional_monitoring_srcs(),
     hdrs = LIB_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
     defines = LIB_INTERNAL_DEFINES,
@@ -2851,6 +2892,7 @@
         "@com_google_absl//absl/time",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/kernels:bounds_check",
+        "//tensorflow/core/profiler/lib:traceme",
         "//third_party/eigen3",
     ] + if_static(
         extra_deps = ["@protobuf_archive//:protobuf"],
@@ -2918,7 +2960,6 @@
 tf_cuda_library(
     name = "cuda_device_functions",
     hdrs = [
-        "util/cuda_device_functions.h",
         "util/gpu_device_functions.h",
     ],
     visibility = ["//visibility:public"],
@@ -3058,12 +3099,14 @@
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
+    "common_runtime/placer_inspection_required_ops_utils.h",
     "common_runtime/debugger_state_interface.h",
     "common_runtime/device_resolver_local.h",
     "common_runtime/dma_helper.h",
     "common_runtime/executor.h",
     "common_runtime/executor_factory.h",
     "common_runtime/graph_optimizer.h",
+    "common_runtime/isolate_placer_inspection_required_ops_pass.h",
     "common_runtime/local_device.h",
     "common_runtime/lower_function_call_op.h",
     "common_runtime/lower_if_op.h",
@@ -3125,6 +3168,9 @@
         "common_runtime/graph_optimizer.cc",
         "common_runtime/graph_runner.cc",
         "common_runtime/hierarchical_tree_broadcaster.cc",
+        "common_runtime/inspecting_placer.cc",
+        "common_runtime/inspecting_placer.h",
+        "common_runtime/isolate_placer_inspection_required_ops_pass.cc",
         "common_runtime/local_device.cc",
         "common_runtime/lower_function_call_op.cc",
         "common_runtime/lower_functional_ops.cc",
@@ -3137,6 +3183,8 @@
         "common_runtime/parallel_concat_optimizer.cc",
         "common_runtime/partitioning_utils.cc",
         "common_runtime/placer.cc",
+        "common_runtime/placer_inspection_required_ops_utils.cc",
+        "common_runtime/placer_inspection_required_ops_utils.h",
         "common_runtime/pool_allocator.cc",
         "common_runtime/process_function_library_runtime.cc",
         "common_runtime/process_state.cc",
@@ -3177,8 +3225,11 @@
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "//third_party/eigen3",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
     ] + mkl_deps(),
     alwayslink = 1,
 )
@@ -3208,6 +3259,7 @@
         ":lib",
         ":proto_text",
         ":protos_all_cc",
+        "@com_google_absl//absl/strings",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
@@ -3257,6 +3309,7 @@
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/profiler/lib:profiler_graph_lib",
         "//tensorflow/core/profiler/lib:profiler_session",
+        "//tensorflow/core/profiler/lib:traceme",
     ],
     alwayslink = 1,
 )
@@ -3284,7 +3337,7 @@
     name = "device_tracer",
     srcs = tf_additional_device_tracer_srcs(),
     copts = tf_copts(),
-    cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()),
+    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(),
     visibility = [
         "//tensorflow:internal",
     ],
@@ -3293,7 +3346,6 @@
         ":lib",
         ":protos_all_cc",
         "//tensorflow/core/profiler/internal:profiler_interface",
-        # "//tensorflow/stream_executor/lib",
     ] + tf_additional_device_tracer_deps(),
     alwayslink = True,
 )
@@ -3381,6 +3433,7 @@
         ":lib_internal",
         ":protos_all_cc",
         ":stream_executor",
+        "//tensorflow/core/profiler/lib:traceme",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -3899,8 +3952,10 @@
         "common_runtime/collective_rma_local_test.cc",
         "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
+        "common_runtime/isolate_placer_inspection_required_ops_pass_test.cc",
         "common_runtime/optimization_registry_test.cc",
         "common_runtime/pending_counts_test.cc",
+        "common_runtime/placer_inspection_required_ops_utils_test.cc",
         "common_runtime/placer_test.cc",
         "common_runtime/session_test.cc",
         "common_runtime/threadpool_device_test.cc",
@@ -3960,6 +4015,7 @@
         "util/events_writer_test.cc",
         "util/example_proto_fast_parsing_test.cc",
         "util/example_proto_helper_test.cc",
+        "util/matmul_bcast_test.cc",
         "util/memmapped_file_system_test.cc",
         "util/presized_cuckoo_map_test.cc",
         "util/reffed_status_callback_test.cc",
@@ -4005,6 +4061,8 @@
         "//tensorflow/core/kernels:ops_util",
         "//third_party/eigen3",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -4825,7 +4883,7 @@
     name = "gpu_debug_allocator_test",
     size = "medium",
     srcs = ["common_runtime/gpu/gpu_debug_allocator_test.cc"],
-    args = ["\"--gtest_death_test_style=threadsafe\""],
+    args = ["--gtest_death_test_style=threadsafe"],
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags(),
     deps = [
diff --git a/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt
index 6631f4e..1726963 100644
--- a/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt
@@ -42,10 +42,10 @@
 This op expects to receive audio data as an input, stored as floats in the range
 -1 to 1, together with a window width in samples, and a stride specifying how
 far to move the window between slices. From this it generates a three
-dimensional output. The lowest dimension has an amplitude value for each
-frequency during that time slice. The next dimension is time, with successive
-frequency slices. The final dimension is for the channels in the input, so a
-stereo audio input would have two here for example.
+dimensional output. The first dimension is for the channels in the input, so a
+stereo audio input would have two here for example. The second dimension is time,
+with successive frequency slices. The third dimension has an amplitude value for
+each frequency during that time slice.
 
 This means the layout when converted and saved as an image is rotated 90 degrees
 clockwise from a typical spectrogram. Time is descending down the Y axis, and
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
index 7637601..669223d 100644
--- a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
@@ -27,7 +27,8 @@
 and works its way forward.
 
 For example,
-```
+
+```python
 >>> x = tf.constant([1, 2, 3])
 >>> y = tf.broadcast_to(x, [3, 3])
 >>> sess.run(y)
@@ -35,6 +36,7 @@
        [1, 2, 3],
        [1, 2, 3]], dtype=int32)
 ```
+
 In the above example, the input Tensor with the shape of `[1, 3]`
 is broadcasted to output Tensor with shape of `[3, 3]`.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodePaddedRaw.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodePaddedRaw.pbtxt
new file mode 100644
index 0000000..1f15678
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodePaddedRaw.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "DecodePaddedRaw"
+  in_arg {
+    name: "input_bytes"
+    description: "Tensor of string to be decoded."
+  }
+  in_arg {
+    name: "fixed_length"
+    description: <<END
+Length in bytes for each element of the decoded output. Must be a multiple
+of the size of the output type.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor with one more dimension than `input_bytes`. The added dimension
+will have size equal to the length of the elements of `input_bytes` divided by
+the number of bytes needed to represent `out_type`.
+END
+  }
+  attr {
+    name: "little_endian"
+    description: <<END
+Whether the input `input_bytes` is in little-endian order. Ignored for
+`out_type` values that are stored in a single byte, like `uint8`.
+END
+  }
+  summary: "Reinterpret the bytes of a string as a vector of numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt
new file mode 100644
index 0000000..bf56a01
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "Fingerprint"
+  endpoint {
+    name: "Fingerprint"
+  }
+  in_arg {
+    name: "data"
+    description: <<END
+Must have rank 1 or higher.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+Fingerprint method used by this op. Currently available method is
+`farmhash::fingerprint64`.
+END
+  }
+  out_arg {
+    name: "fingerprint"
+    description: <<END
+A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals
+`data`'s first dimension, and the second dimension size depends on the
+fingerprint algorithm.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+This can be a POD-type or string type.
+END
+  }
+  summary: "Generates fingerprint values."
+  description: <<END
+Generates fingerprint values of `data`.
+
+Fingerprint op considers the first dimension of `data` as the batch dimension,
+and `output[i]` contains the fingerprint value generated from contents in
+`data[i, ...]` for all `i`.
+
+Fingerprint op writes fingerprint values as byte arrays. For example, the
+default method `farmhash64` generates a 64-bit fingerprint value at a time.
+This 8-byte value is written out as an `uint8` array of size 8, in little-endian
+order.
+
+For example, suppose that `data` has data type `DT_INT32` and shape (2, 3, 4),
+and that the fingerprint method is `farmhash64`. In this case, the output shape
+is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the size of
+each fingerprint value in bytes. `output[0, :]` is generated from 12 integers in
+`data[0, :, :]` and similarly `output[1, :]` is generated from other 12 integers
+in `data[1, :, :]`.
+
+Note that this op fingerprints the raw underlying buffer, and it does not
+fingerprint Tensor's metadata such as data type and/or shape. For example, the
+fingerprint values are invariant under reshapes and bitcasts as long as the
+batch dimension remains the same:
+
+```
+Fingerprint(data) == Fingerprint(Reshape(data, ...))
+Fingerprint(data) == Fingerprint(Bitcast(data, ...))
+```
+
+For string data, one should expect `Fingerprint(data) !=
+Fingerprint(ReduceJoin(data))` in general.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
index e90de74..1cf4b49 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
@@ -36,9 +36,8 @@
 @end_compatibility
 END
   }
-  summary: "Solves systems of linear equations with upper or lower triangular matrices by"
+  summary: "Solves systems of linear equations with upper or lower triangular matrices by backsubstitution."
   description: <<END
-backsubstitution.
 
 `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
 square matrices. If `lower` is `True` then the strictly upper triangular part
@@ -53,5 +52,35 @@
 If `adjoint` is `False` then the strictly then the  innermost matrices in
 `output` satisfy matrix equations
 `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+
+Example:
+```python
+
+a = tf.constant([[3,  0,  0,  0],
+                 [2,  1,  0,  0],
+                 [1,  0,  1,  0],
+                 [1,  1,  1,  1]], dtype=tf.float32)
+
+b = tf.constant([[4],
+                 [2],
+                 [4],
+                 [2]], dtype=tf.float32)
+
+x = tf.linalg.triangular_solve(a, b, lower=True)
+x
+# <tf.Tensor: id=257, shape=(4, 1), dtype=float32, numpy=
+# array([[ 1.3333334 ],
+#        [-0.66666675],
+#        [ 2.6666665 ],
+#        [-1.3333331 ]], dtype=float32)>
+
+# in python3 one can use `a@x`
+tf.matmul(a, x)
+# <tf.Tensor: id=263, shape=(4, 1), dtype=float32, numpy=
+# array([[4.       ],
+#        [2.       ],
+#        [4.       ],
+#        [1.9999999]], dtype=float32)>
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedTensorFromVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedTensorFromVariant.pbtxt
new file mode 100644
index 0000000..89cec1c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedTensorFromVariant.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "RaggedTensorFromVariant"
+  visibility: HIDDEN
+  in_arg {
+    name: "encoded_ragged"
+    description: <<END
+A `variant` Tensor containing encoded `RaggedTensor`s.
+END
+  }
+  out_arg {
+    name: "output_nested_splits"
+    description: <<END
+A list of one or more Tensors representing the splits of the output
+`RaggedTensor`.
+END
+  }
+  out_arg {
+    name: "output_dense_values"
+    description: <<END
+A Tensor representing the values of the output `RaggedTensor`.
+END
+  }
+  attr {
+    name: "input_ragged_rank"
+    description: <<END
+The ragged rank of each encoded `RaggedTensor` component in the input. If set to
+-1, this is inferred as `output_ragged_rank` - `rank(encoded_ragged)`.
+END
+  }
+  attr {
+    name: "output_ragged_rank"
+    description: <<END
+The expected ragged rank of the output `RaggedTensor`. The following must hold:
+`output_ragged_rank = rank(encoded_ragged) + input_ragged_rank`.
+END
+  }
+  summary: <<END
+Decodes a `variant` Tensor into a `RaggedTensor`.
+END
+  description: <<END
+Decodes the given `variant` Tensor and returns a `RaggedTensor`. The input
+could be a scalar, meaning it encodes a single `RaggedTensor` with ragged_rank
+`output_ragged_rank`. It could also have an arbitrary rank, in which case each
+element is decoded into a `RaggedTensor` with ragged_rank `input_ragged_rank`
+and these are then stacked according to the input shape to output a single
+`RaggedTensor` with ragged_rank `output_ragged_rank`. Each `variant` element in
+the input Tensor is decoded by retrieving from the element a 1-D `variant`
+Tensor with `input_ragged_rank + 1` Tensors, corresponding to the splits and
+values of the decoded `RaggedTensor`. If `input_ragged_rank` is -1, then it is
+inferred as `output_ragged_rank` - `rank(encoded_ragged)`. See
+`RaggedTensorToVariant` for the corresponding encoding logic.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToVariant.pbtxt
new file mode 100644
index 0000000..8bcd136
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToVariant.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "RaggedTensorToVariant"
+  visibility: HIDDEN
+  in_arg {
+    name: "rt_nested_splits"
+    description: <<END
+A list of one or more Tensors representing the splits of the input
+`RaggedTensor`.
+END
+  }
+  in_arg {
+    name: "rt_dense_values"
+    description: <<END
+A Tensor representing the values of the input `RaggedTensor`.
+END
+  }
+  out_arg {
+    name: "encoded_ragged"
+    description: <<END
+A `variant` Tensor containing the encoded `RaggedTensor`.
+END
+  }
+  attr {
+    name: "batched_input"
+    description: <<END
+A `bool` denoting whether the input is a batched `RaggedTensor`.
+END
+  }
+  summary: <<END
+Encodes a `RaggedTensor` into a `variant` Tensor.
+END
+  description: <<END
+
+Encodes the given `RaggedTensor` and returns a `variant` Tensor. If
+`batched_input` is True, then the input `RaggedTensor` is unbatched along the
+zero-th dimension, each component `RaggedTensor` is encoded into a scalar
+`variant` Tensor, and these are stacked to return a 1-D `variant` Tensor.
+If `batched_input` is False, then the input `RaggedTensor` is encoded as is and
+a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first
+creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the
+splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor
+is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the
+corresponding decoding logic.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt
index f75272a..b52e7f6 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt
@@ -66,7 +66,7 @@
   description: <<END
 That is for rows we have grad for, we update var, accum and linear as follows:
 accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
 quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
 var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
 accum = accum_new
diff --git a/tensorflow/core/api_def/base_api/api_def_RngSkip.pbtxt b/tensorflow/core/api_def/base_api/api_def_RngSkip.pbtxt
new file mode 100644
index 0000000..b85bc26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RngSkip.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RngSkip"
+  visibility: HIDDEN
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+The amount of advancement.
+END
+  }
+  summary: "Advance the counter of a counter-based RNG."
+  description: <<END
+The state of the RNG after
+`rng_skip(n)` will be the same as that after `stateful_uniform([n])`
+(or any other distribution). The actual increment added to the
+counter is an unspecified implementation detail.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SnapshotDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SnapshotDataset.pbtxt
new file mode 100644
index 0000000..d4e71ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SnapshotDataset.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "SnapshotDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+  name: "path"
+  description: <<END
+The path we should write snapshots to / read snapshots from.
+END
+  }
+  summary: "Creates a dataset that will write to / read from a snapshot."
+  description: <<END
+This dataset attempts to determine whether a valid snapshot exists at
+`path`, and if so reads from the snapshot in lieu of using `input_dataset`.
+If not, it will run the preprocessing pipeline as usual, and write out a
+snapshot of the data processed for future use.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt
new file mode 100644
index 0000000..118bb66
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StringLower"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt
index b63fbd1..e9764e9 100644
--- a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt
@@ -21,7 +21,7 @@
   attr {
     name: "key"
     description: <<END
-The key for the keyed hash function passed as a list of two uint64
+The key used to seed the hash function, passed as a list of two uint64
 elements.
 END
   }
@@ -34,8 +34,11 @@
 A strong hash is important when inputs may be malicious, e.g. URLs with
 additional components. Adversaries could try to make their inputs hash to the
 same bucket for a denial-of-service attack or to skew the results. A strong
-hash prevents this by making it difficult, if not infeasible, to compute inputs
-that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+hash can be used to make it difficult to find inputs with a skewed hash value
+distribution over buckets. This requires that the hash function is
+seeded by a high-entropy (random) "key" unknown to the adversary.
+
+The additional robustness comes at a cost of roughly 4x higher compute
 time than `tf.string_to_hash_bucket_fast`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt
new file mode 100644
index 0000000..40cd7a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StringUpper"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TridiagonalMatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_TridiagonalMatMul.pbtxt
new file mode 100644
index 0000000..d256e1b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TridiagonalMatMul.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "TridiagonalMatMul"
+  visibility: HIDDEN
+  in_arg {
+    name: "superdiag"
+    description: <<END
+Tensor of shape `[..., 1, M]`, representing superdiagonals of
+tri-diagonal matrices to the left of multiplication. Last element is ignored.
+END
+  }
+  in_arg {
+    name: "maindiag"
+    description: <<END
+Tensor of shape `[..., 1, M]`, representing main diagonals of tri-diagonal
+matrices to the left of multiplication.
+END
+  }
+  in_arg {
+    name: "subdiag"
+    description: <<END
+Tensor of shape `[..., 1, M]`, representing subdiagonals of tri-diagonal
+matrices to the left of multiplication. First element is ignored.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Tensor of shape `[..., M, N]`, representing MxN matrices to the right of
+multiplication.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Tensor of shape `[..., M, N]` containing the product.
+END
+  }
+  summary: "Calculate product with tridiagonal matrix."
+  description: <<END
+Calculates product of two matrices, where left matrix is a tridiagonal matrix.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
index 80f3675..1eb88c8 100644
--- a/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
@@ -4,30 +4,40 @@
   in_arg {
     name: "diagonals"
     description: <<END
-Shape is `[..., 3, M]`.
+Tensor of shape `[..., 3, M]` whose innermost 2 dimensions represent the
+tridiagonal matrices with three rows being the superdiagonal, diagonals, and
+subdiagonals, in order. The last element of the superdiagonal and the first
+element of the subdiagonal are ignored.
 END
   }
   in_arg {
     name: "rhs"
     description: <<END
-Shape is `[..., M, K]`.
+Tensor of shape `[..., M, K]`, representing K right-hand sides per each
+left-hand side.
+END
+  }
+  attr {
+    name: "partial_pivoting"
+    description: <<END
+Whether to apply partial pivoting. Partial pivoting makes the procedure more
+stable, but slower.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-Shape is `[..., M, K]`.
+Tensor of shape `[..., M, K]` containing the solutions.
 END
   }
 
   summary: "Solves tridiagonal systems of equations."
   description: <<END
-`diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
-represent matrices with three rows being the superdiagonal, diagonals, and
-subdiagonals, in order. The last element of the superdiagonal and the first
-element of the subdiagonal is ignored.
-`rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
-each left-hand side.
-The output is a tensor of shape `[..., M, K]` containing the solutions.
+  Solves tridiagonal systems of equations.
+  Supports batch dimensions and multiple right-hand sides per each left-hand
+  side.
+  On CPU, solution is computed via Gaussian elimination with or without partial
+  pivoting, depending on `partial_pivoting` attribute. On GPU, Nvidia's cuSPARSE
+  library is used: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv
 END
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodePaddedRaw.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodePaddedRaw.pbtxt
new file mode 100644
index 0000000..672890e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodePaddedRaw.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DecodePaddedRaw"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
index dab7a5e..6e50b5b 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "DecodeRaw"
-  endpoint {
-    name: "io.decode_raw"
-  }
-  endpoint {
-    name: "decode_raw"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt b/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt
new file mode 100644
index 0000000..07f66fc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Fingerprint"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
index efd42b8..5d6a1d5 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
@@ -2,5 +2,6 @@
   graph_op_name: "FloorDiv"
   endpoint {
     name: "floor_div"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
index e5db6d4..5882217 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
@@ -1,9 +1,17 @@
 op {
   graph_op_name: "FloorMod"
   endpoint {
+    name: "math.floormod"
+  }
+  endpoint {
     name: "floormod"
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "math.mod"
   }
   endpoint {
     name: "mod"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt
new file mode 100644
index 0000000..27d6783
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLower"
+  endpoint {
+    name: "strings.lower"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt
new file mode 100644
index 0000000..3905018
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringUpper"
+  endpoint {
+    name: "strings.upper"
+  }
+}
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 8ce06bc..0734e53 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -37,6 +37,7 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 #define VALUE_IN_DEBUG_STRING false
 
@@ -164,8 +165,8 @@
     return t;
   }
 
-  Tensor Scalar(Allocator* a) const override {
-    Tensor t(a, dt_, TensorShape({}));
+  Tensor Scalar(Allocator* a, const AllocationAttributes& attr) const override {
+    Tensor t(a, dt_, TensorShape({}), attr);
     return t;
   }
 
@@ -266,9 +267,13 @@
   // Op off into its own thread, consider queuing them on a
   // fixed-size thread-pool dedicated to running CollectiveOps.
   SchedClosure([col_impl, col_ctx, done_safe, ctx]() {
-    tracing::ScopedActivity activity(
-        ctx->op_kernel().name(), strings::StrCat(ctx->op_kernel().type_string(),
-                                                 "#id=", ctx->step_id(), "#"));
+    profiler::TraceMe activity(
+        [&] {
+          return strings::StrCat(ctx->op_kernel().name(), ":",
+                                 ctx->op_kernel().type_string(),
+                                 "#id=", ctx->step_id(), "#");
+        },
+        profiler::TraceMeLevel::kInfo);
     col_impl->Run([col_impl, col_ctx, done_safe](const Status& s) {
       done_safe(s);
       delete col_ctx;
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index bc85b5a..6ecfca2 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -58,7 +58,8 @@
 
   // Generate a scalar tensor of same DataType and on the same device
   // as the backing tensor.
-  virtual Tensor Scalar(Allocator* a) const = 0;
+  virtual Tensor Scalar(Allocator* a,
+                        const AllocationAttributes& attr) const = 0;
 
   // Debugging string describing buffer location
   virtual string TBounds(const Tensor& t) const = 0;
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index add079b..ec64984 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -13,10 +13,10 @@
 limitations under the License.
 ==============================================================================*/
 
-#include <atomic>
-
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 
+#include <atomic>
+
 #include "tensorflow/core/common_runtime/allocator_retry.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/lib/core/bits.h"
@@ -159,7 +159,7 @@
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
-  c->freed_count = 0;
+  c->freed_at_count = 0;
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -184,6 +184,8 @@
 
 void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
+  c->allocation_id = -1;
+  c->bin_num = kInvalidBinNum;
   c->next = free_chunks_list_;
   free_chunks_list_ = h;
 }
@@ -194,7 +196,7 @@
   // Fast path: Try once to allocate without getting the retry_helper_ involved
   uint64 freed_by_count = 0;
   if (allocation_attr.freed_by_func != nullptr) {
-    freed_by_count = allocation_attr.freed_by_func();
+    freed_by_count = (*allocation_attr.freed_by_func)();
   }
   void* r =
       AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
@@ -206,7 +208,7 @@
         [this, &allocation_attr](size_t a, size_t nb, bool v) {
           uint64 freed_by_count = 0;
           if (allocation_attr.freed_by_func != nullptr) {
-            freed_by_count = allocation_attr.freed_by_func();
+            freed_by_count = (*allocation_attr.freed_by_func)();
           }
           return AllocateRawInternal(a, nb, v, freed_by_count);
         },
@@ -224,7 +226,7 @@
     bool dump_log_on_failure = VLOG_IS_ON(2);
     uint64 freed_by_count = 0;
     if (allocation_attr.freed_by_func != nullptr) {
-      freed_by_count = allocation_attr.freed_by_func();
+      freed_by_count = (*allocation_attr.freed_by_func)();
     }
     void* result = AllocateRawInternal(unused_alignment, num_bytes,
                                        dump_log_on_failure, freed_by_count);
@@ -236,6 +238,8 @@
         LOG(WARNING)
             << "Allocator (" << Name() << ") ran out of memory trying "
             << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
+            << " with freed_by_count=" << freed_by_count
+
             << ". The caller indicates that this is not a failure, but"
             << " may mean that there could be performance gains if more"
             << " memory were available.";
@@ -274,6 +278,10 @@
   BinNum bin_num = BinNumForSize(rounded_bytes);
 
   mutex_lock l(lock_);
+  if (!timestamped_chunks_.empty()) {
+    // Merge timestamped chunks whose counts have become safe for general use.
+    MergeTimestampedChunks(0);
+  }
   void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
   if (ptr != nullptr) {
     return ptr;
@@ -287,13 +295,27 @@
     }
   }
 
+  if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
+    // We're unable to satisfy an allocation request without a specific
+    // timestamp requirement.  Rather than fail, try merging any held-out
+    // timestamped chunks more aggressively until a free chunk of the necessary
+    // size is formed.
+    if (MergeTimestampedChunks(rounded_bytes)) {
+      ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
+      if (ptr != nullptr) {
+        return ptr;
+      }
+    }
+  }
+
   // We searched all bins for an existing free chunk to use and
   // couldn't find one.  This means we must have run out of memory,
   // Dump the memory log for analysis.
   if (dump_log_on_failure) {
     LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying "
                  << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
-                 << ".  Current allocation summary follows.";
+                 << " (rounded to " << rounded_bytes
+                 << ").  Current allocation summary follows.";
     DumpMemoryLog(rounded_bytes);
     LOG(WARNING) << RenderOccupancy();
   }
@@ -312,7 +334,7 @@
       const BFCAllocator::ChunkHandle h = (*citer);
       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
       DCHECK(!chunk->in_use());
-      if (freed_before > 0 && freed_before < chunk->freed_count) {
+      if (freed_before > 0 && freed_before < chunk->freed_at_count) {
         continue;
       }
       if (chunk->size >= rounded_bytes) {
@@ -378,7 +400,7 @@
   new_chunk->allocation_id = -1;
 
   // It inherits the freed time.
-  new_chunk->freed_count = c->freed_count;
+  new_chunk->freed_at_count = c->freed_at_count;
 
   // Maintain the pointers.
   // c <-> c_neighbor becomes
@@ -414,8 +436,15 @@
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle);
 
+  MarkFree(h);
+
   // Consider coalescing it.
-  FreeAndMaybeCoalesce(h);
+  if (timing_counter_) {
+    InsertFreeChunkIntoBin(h);
+    timestamped_chunks_.push_back(h);
+  } else {
+    InsertFreeChunkIntoBin(TryToCoalesce(h, false));
+  }
 
   if (VLOG_IS_ON(4)) {
     LOG(INFO) << "F: " << RenderOccupancy();
@@ -451,7 +480,7 @@
   c1->size += c2->size;
 
   // Pick latest free time.
-  c1->freed_count = std::max(c1->freed_count, c2->freed_count);
+  c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
 
   DeleteChunk(h2);
 }
@@ -491,7 +520,7 @@
   c->bin_num = kInvalidBinNum;
 }
 
-void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
+void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
   CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
 
@@ -500,33 +529,128 @@
 
   // Optionally record the free time.
   if (timing_counter_) {
-    c->freed_count = timing_counter_->next();
+    c->freed_at_count = timing_counter_->next();
   }
 
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
+}
 
+BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
+                                                      bool ignore_freed_at) {
+  Chunk* c = ChunkFromHandle(h);
+  if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
   ChunkHandle coalesced_chunk = h;
 
   // If the next chunk is free, merge it into c and delete it.
   if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
-    // VLOG(8) << "Merging c->next " << ChunkFromHandle(c->next)->ptr
-    //         << " with c " << c->ptr;
-    RemoveFreeChunkFromBin(c->next);
-    Merge(h, c->next);
+    Chunk* n = ChunkFromHandle(c->next);
+    if ((n->freed_at_count == 0) || ignore_freed_at) {
+      VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
+      RemoveFreeChunkFromBin(c->next);
+      Merge(h, c->next);
+    }
   }
 
   // If the previous chunk is free, merge c into it and delete c.
   if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
-    // VLOG(8) << "Merging c " << c->ptr << " into c->prev "
-    //         << ChunkFromHandle(c->prev)->ptr;
-
-    coalesced_chunk = c->prev;
-    RemoveFreeChunkFromBin(c->prev);
-    Merge(c->prev, h);
+    Chunk* n = ChunkFromHandle(c->prev);
+    if ((n->freed_at_count == 0) || ignore_freed_at) {
+      VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
+      coalesced_chunk = c->prev;
+      RemoveFreeChunkFromBin(c->prev);
+      Merge(c->prev, h);
+    }
   }
 
-  InsertFreeChunkIntoBin(coalesced_chunk);
+  return coalesced_chunk;
+}
+
+void BFCAllocator::SetSafeFrontier(uint64 count) {
+  // Advance safe_frontier_ to `count`; it is never moved backwards.
+  uint64 current = safe_frontier_.load(std::memory_order_relaxed);
+  while (count > current) {
+    // A failed compare_exchange_strong stores the observed value in `current`.
+    if (safe_frontier_.compare_exchange_strong(current, count)) {
+      retry_helper_.NotifyDealloc();
+      return;
+    }
+  }
+}
+
+// Returns true iff required_bytes == 0 or a chunk >= required_bytes was formed.
+bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
+  VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
+          << " required_bytes=" << required_bytes;
+  bool satisfied = (required_bytes == 0);
+  std::vector<void*> to_merge;
+  std::deque<ChunkHandle> new_ts_queue;
+  while (!timestamped_chunks_.empty()) {
+    ChunkHandle h = timestamped_chunks_.front();
+    timestamped_chunks_.pop_front();
+    DCHECK_NE(h, kInvalidChunkHandle);
+    Chunk* c = ChunkFromHandle(h);
+    // It's possible this chunk has already been merged so refetch and retest
+    // the handle.
+    h = region_manager_.get_handle(c->ptr);
+    if (h == kInvalidChunkHandle) {
+      continue;
+    }
+    if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
+      // This chunk has already been reallocated.
+      continue;
+    }
+    if (c->freed_at_count == 0) {
+      to_merge.push_back(c->ptr);
+      continue;
+    }
+    // Chunk should be free and assigned to a bin.
+    DCHECK_NE(c->bin_num, kInvalidBinNum);
+    if (c->freed_at_count < safe_frontier_) {
+      c->freed_at_count = 0;
+      to_merge.push_back(c->ptr);
+    } else if (required_bytes > 0) {
+      to_merge.push_back(c->ptr);
+    } else {
+      new_ts_queue.push_back(h);
+    }
+  }
+  DCHECK(timestamped_chunks_.empty());
+  std::swap(timestamped_chunks_, new_ts_queue);
+
+  // At this point all candidate chunks have been moved from timestamped_chunks_
+  // to to_merge.  If this is a standard merge (required_bytes == 0) then
+  // merge them all, otherwise merge just until a Chunk of the required size
+  // is produced.
+  for (void* ptr : to_merge) {
+    // It's possible that the Chunk associated with this memory location got
+    // merged and deallocated in a prior iteration so refetch the handle and
+    // retest.
+    ChunkHandle h = region_manager_.get_handle(ptr);
+    if (h == kInvalidChunkHandle) continue;
+    if (required_bytes == 0 || !satisfied) {
+      Chunk* c = ChunkFromHandle(h);
+      DCHECK_NE(c->bin_num, kInvalidBinNum);
+      DCHECK(!c->in_use());
+      RemoveFreeChunkFromBin(h);
+      ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
+      InsertFreeChunkIntoBin(new_h);
+      if (required_bytes > 0) {
+        c = ChunkFromHandle(new_h);
+        if (new_h != h && c->freed_at_count > 0) {
+          timestamped_chunks_.push_back(new_h);
+        }
+        if (c->size >= required_bytes) {
+          satisfied = true;
+        }
+      }
+    } else {
+      // We were force merging Chunks with unsafe timestamps, but managed
+      // to create a satisfying Chunk so just requeue the rest.
+      timestamped_chunks_.push_back(h);
+    }
+  }
+  return satisfied;
+}
 
 bool BFCAllocator::TracksAllocationSizes() const { return true; }
@@ -667,16 +791,17 @@
   // number by size.
   std::map<size_t, int> in_use_by_size;
   for (const auto& region : region_manager_.regions()) {
+    LOG(INFO) << "Next region of size " << region.memory_size();
     ChunkHandle h = region_manager_.get_handle(region.ptr());
     while (h != kInvalidChunkHandle) {
       const Chunk* c = ChunkFromHandle(h);
       if (c->in_use()) {
         in_use_by_size[c->size]++;
       }
-      LOG(INFO) << (c->in_use() ? "Chunk" : "Free ") << " at " << c->ptr
-                << " of size " << c->size
+      LOG(INFO) << (c->in_use() ? "InUse" : "Free ") << " at " << c->ptr
+                << " next " << c->next << " of size " << c->size
                 << (timing_counter_
-                        ? strings::StrCat(" freed_count ", c->freed_count)
+                        ? strings::StrCat(" freed_at_count ", c->freed_at_count)
                         : "");
       h = c->next;
     }
@@ -691,6 +816,12 @@
   }
   LOG(INFO) << "Sum Total of in-use chunks: "
             << strings::HumanReadableNumBytes(total_bytes);
+  LOG(INFO) << "total_region_allocated_bytes_: "
+            << total_region_allocated_bytes_
+            << " memory_limit_: " << memory_limit_ << " available bytes: "
+            << (memory_limit_ - total_region_allocated_bytes_)
+            << " curr_region_allocation_bytes_: "
+            << curr_region_allocation_bytes_;
   LOG(INFO) << "Stats: \n" << stats_.DebugString();
 }
 
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index d4fea6f..ea385e6 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -17,6 +17,7 @@
 #define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
 
 #include <array>
+#include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -75,6 +76,8 @@
 
   void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }
 
+  void SetSafeFrontier(uint64 count) override;
+
  private:
   struct Bin;
 
@@ -88,6 +91,23 @@
 
   void DeallocateRawInternal(void* ptr);
 
+  // Chunks whose freed_at_count is later than the safe frontier value are kept
+  // on a special list and not subject to merging immediately upon being freed.
+  //
+  // This function sweeps that list looking for Chunks whose timestamp is now
+  // safe. When found their freed_at_count is set to 0 and we attempt to merge
+  // them with their neighbors.
+  //
+  // If required_bytes > 0 then this function is being called in the context of
+  // a need for this many bytes that could not be satisfied without merging
+  // unsafe chunks, so we go ahead and merge the unsafe chunks too, just up to
+  // the point that a free chunk of required_bytes is produced.  Note that
+  // unsafe merged chunks adopt the most conservative timestamp from their
+  // constituents so they're only useful for allocations not requiring a
+  // particular timestamp.
+  bool MergeTimestampedChunks(size_t required_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
   // kInvalidChunkHandle means an invalid chunk
   typedef size_t ChunkHandle;
@@ -95,6 +115,7 @@
 
   typedef int BinNum;
   static const int kInvalidBinNum = -1;
+  // The following means that the largest bin'd chunk size is 256 << 21 = 512MB.
   static const int kNumBins = 21;
 
   // A Chunk points to a piece of memory that's either entirely free or entirely
@@ -141,7 +162,7 @@
     BinNum bin_num = kInvalidBinNum;
 
     // Optional count when this chunk was most recently made free.
-    uint64 freed_count = 0;
+    uint64 freed_at_count = 0;
 
     bool in_use() const { return allocation_id != -1; }
 
@@ -151,7 +172,7 @@
       strings::StrAppend(
           &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
           " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
-          " | in_use: ", in_use());
+          " | in_use: ", in_use(), " | bin_num: ", bin_num);
       if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
         Chunk* p = a->ChunkFromHandle(prev);
         strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
@@ -165,6 +186,7 @@
   };
 
   // A Bin is a collection of similar-sized free chunks.
+  // Allocated chunks are never in a Bin.
   struct Bin {
     // All chunks in this bin have >= bin_size memory.
     size_t bin_size = 0;
@@ -201,10 +223,13 @@
 
   // BFCAllocator allocates memory into a collection of disjoint
   // AllocationRegions.  Each AllocationRegion corresponds to one call to
-  // SubAllocator::Alloc().
+  // SubAllocator::Alloc().  (Actually, if a subsequent call to
+  // SubAllocator::Alloc() returns another region immediately adjacent to the
+  // last, it will be used to extend the first AllocationRegion, not create a
+  // separate one.)
   //
   // An AllocationRegion contains one or more Chunks, covering all of its
-  // memory.  Its primary job is to map a pointers to ChunkHandles.
+  // memory.  Its primary job is to map pointers to ChunkHandles.
   //
   // This class is thread-compatible.
   class AllocationRegion {
@@ -358,6 +383,8 @@
 
   // Removes a free chunk from the bin.
   void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void MaybeRemoveFreeChunkFromBin(ChunkHandle h)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Removes the chunk metadata represented by 'h'.
   void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
@@ -372,6 +399,11 @@
   const Chunk* ChunkFromHandle(ChunkHandle h) const
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
+  void MarkFree(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
   // Information about a Bin that is useful for debugging.
   struct BinDebugInfo {
     size_t total_bytes_in_use = 0;
@@ -441,6 +473,9 @@
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
   SharedCounter* timing_counter_ = nullptr;
+  std::deque<ChunkHandle> timestamped_chunks_;
+
+  std::atomic<uint64> safe_frontier_ = {0};
 
   // Structures mutable after construction
   mutable mutex lock_;
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.cc b/tensorflow/core/common_runtime/buf_rendezvous.cc
index b57eb29..4d55818 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous.cc
+++ b/tensorflow/core/common_runtime/buf_rendezvous.cc
@@ -34,7 +34,11 @@
   HookTable dummy_table;
   {
     mutex_lock l(mu_);
-    status_.Update(s);
+    // Use a "derived" status as the status for the rendezvous. Derived
+    // status messages are ignored when aggregating errors across devices: this
+    // allows us to prefer our original status message over any cancellation
+    // related errors.
+    status_.Update(StatusGroup::MakeDerived(s));
     hook_table_.swap(dummy_table);
   }
   PurgeTable(s, &dummy_table);
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
index 7621787..dbf395f 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous_test.cc
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -188,9 +188,11 @@
   prod_note.WaitForNotification();
   cons_note.WaitForNotification();
   EXPECT_FALSE(prod_status.ok());
-  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_NE(prod_status.error_message().find("Falling sky detected"),
+            string::npos);
   EXPECT_FALSE(cons_status.ok());
-  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+  EXPECT_NE(cons_status.error_message().find("Falling sky detected"),
+            string::npos);
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/colocation_graph.cc b/tensorflow/core/common_runtime/colocation_graph.cc
index 501c447..b9fae0a 100644
--- a/tensorflow/core/common_runtime/colocation_graph.cc
+++ b/tensorflow/core/common_runtime/colocation_graph.cc
@@ -17,12 +17,17 @@
 
 #include <memory>
 #include <set>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "absl/strings/str_join.h"
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/inspecting_placer.h"
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/function.h"
@@ -120,14 +125,18 @@
   return v;
 }
 
+bool IsRefOrResource(DataType data_type) {
+  return IsRefType(data_type) || data_type == DT_RESOURCE;
+}
+
 // While Placer can override requested device on ops processing
 // resources, i.e. node that take (and potentially return) a resource,
 // it must not override requested device on ops generating a resource,
 // e.g. VarHandleOp, _Arg. Such ops are currently no-input, single resource/ref
 // output nodes.
-bool IsResourceGeneratorNode(const Node& node) {
+bool IsRefOrResourceGeneratorNode(const Node& node) {
   return node.num_inputs() == 0 && node.num_outputs() == 1 &&
-         (IsRefType(node.output_type(0)) || node.output_type(0) == DT_RESOURCE);
+         IsRefOrResource(node.output_type(0));
 }
 
 bool IsExemptFromResourceInputColocation(const Node* node) {
@@ -137,7 +146,7 @@
   // dereferencing them.
   const string& op_type = node->op_def().name();
   return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall" ||
-         op_type == "ReduceDataset";
+         op_type == "ReduceDataset" || op_type == "ExperimentalScanDataset";
 }
 
 bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
@@ -229,6 +238,20 @@
   return Status::OK();
 }
 
+Status Member::FillPossibleDevices(PossibleDevices* possible_device) const {
+  if (DeviceNameUtils::HasSomeDetails(assigned_device_name_)) {
+    return errors::Internal(
+        "Cannot fill PossibleDevices from a member that has non-empty assigned "
+        "device. Did we start assigning devices to functions called by deep "
+        "ops? ",
+        DebugString());
+  }
+  possible_device->requested_device_name = requested_device_name_;
+  possible_device->resource_device_name = resource_device_name_;
+  possible_device->device_types = supported_device_types_;
+  return Status::OK();
+}
+
 Status Member::EnsureCompatibilityAcrossResourceEdge(
     const Node& src, const Member& src_root,
     const Node& dst, /*dst_root is this*/
@@ -330,18 +353,26 @@
 // tree is non-const because we can change some `parent` pointers in some
 // members for more efficient future lookups. The vector itself is not
 // changed.
-int Member::FindRoot(std::vector<Member>* tree, int node_id) {
+int Member::FindAndUpdateRoot(std::vector<Member>* tree, int node_id) {
   Member& member = (*tree)[node_id];
   if (member.parent_ == node_id) {
     // member.parent is the root of this disjoint tree.  Do nothing.
   } else {
-    member.parent_ = FindRoot(tree, member.parent_);
+    member.parent_ = FindAndUpdateRoot(tree, member.parent_);
   }
   // Now it is guaranteed that member.parent is the root of this disjoint
   // tree.
   return member.parent_;
 }
 
+int Member::FindRoot(const std::vector<Member>& tree, int node_id) {
+  // Follow parent links, leaving the tree unmodified, until reaching a root.
+  while (tree[node_id].parent_ != node_id) {
+    node_id = tree[node_id].parent_;
+  }
+  return node_id;
+}
+
 Status Member::MergeDeviceNames(const Member& other,
                                 bool allow_soft_placement) {
   // Assuming the "requested is a specialization of assigned and resource
@@ -376,6 +407,11 @@
 // Updates this to contain the intersection of the device types in
 // this and "other".
 bool Member::MergeSupportedDevices(const Member& other) {
+  return MergeSupportedDevices(other.supported_device_types_);
+}
+
+bool Member::MergeSupportedDevices(
+    const PrioritizedDeviceTypeVector& other_devices) {
   // Generate intersection with priorities.
   // Each vector contains the same device types but with different priorities.
   // The priorities are taken from the corresponding source vector.
@@ -383,8 +419,7 @@
   PrioritizedDeviceTypeVector other_intersection;
   for (const auto& prioritized_device_type : supported_device_types_) {
     bool found = false;
-    for (const auto& other_prioritized_device_type :
-         other.supported_device_types_) {
+    for (const auto& other_prioritized_device_type : other_devices) {
       if (prioritized_device_type.first ==
           other_prioritized_device_type.first) {
         found = true;
@@ -495,6 +530,17 @@
   return Status::OK();
 }
 
+Status Member::LimitToPossibleDevices(const PossibleDevices& devices,
+                                      bool allow_soft_placement) {
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &requested_device_name_, devices.requested_device_name,
+      allow_soft_placement));
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &resource_device_name_, devices.resource_device_name));
+  MergeSupportedDevices(devices.device_types);
+  return Status::OK();
+}
+
 string Member::DebugString() const {
   return absl::StrCat(
       "Member(assigned_device_name_index_=", assigned_device_name_index_,
@@ -535,14 +581,18 @@
   return soft_device_name;
 }
 
-ColocationGraph::ColocationGraph(const Graph* graph,
+ColocationGraph::ColocationGraph(const Graph* graph, const FunctionStack& stack,
                                  const FunctionLibraryDefinition* flib_def,
                                  const DeviceSet* device_set,
                                  const Device* default_device,
                                  bool allow_soft_placement,
                                  bool log_device_placement)
     : graph_(*graph),
+      stack_(stack),
       flib_def_(*flib_def),
+      inspecting_placer_(graph, stack, flib_def, device_set, default_device,
+                         allow_soft_placement, log_device_placement),
+      inspection_required_checker_(graph, flib_def),
       device_set_(*device_set),
       device_types_(device_set->PrioritizedDeviceTypeList()),
       default_device_(default_device),
@@ -612,11 +662,12 @@
   return Status::OK();
 }
 
-Status ColocationGraph::ColocateResourceOrRefEdge(Node* src, Node* dst) {
+Status ColocationGraph::ColocateResourceOrRefEdge(const Node* src,
+                                                  const Node* dst) {
   // Colocate `src` and `dst` to maintain the invariant that nodes
   // connected by reference edges are colocated.
-  int src_root_id = FindRoot(src->id());
-  int dst_root_id = FindRoot(dst->id());
+  int src_root_id = FindAndUpdateRoot(src->id());
+  int dst_root_id = FindAndUpdateRoot(dst->id());
   auto& src_root = members_[src_root_id];
   auto& dst_root = members_[dst_root_id];
 
@@ -635,9 +686,8 @@
   return Status::OK();
 }
 
-Status ColocationGraph::ColocateResourceAndRefEdges() {
-  // Enumerate the constraint edges, and use them to update the disjoint
-  // node set.
+Status ColocationGraph::ColocateResourceAndRefEdges(
+    std::unordered_set<Node*>* inspection_required) {
   // If `node` has an input edge with reference type, add an edge from the
   // source of that edge to `node`.
   for (const Edge* edge : graph_.edges()) {
@@ -646,19 +696,172 @@
     }
     Node* src = edge->src();
     Node* dst = edge->dst();
+    bool needs_inspection;
+    TF_RETURN_IF_ERROR(inspection_required_checker_.IsPlacerInspectionRequired(
+        *src, &needs_inspection));
+    if (needs_inspection) {
+      inspection_required->insert(src);
+      continue;
+    }
+    TF_RETURN_IF_ERROR(inspection_required_checker_.IsPlacerInspectionRequired(
+        *dst, &needs_inspection));
+    if (needs_inspection) {
+      inspection_required->insert(dst);
+      continue;
+    }
+
     DataType input_type = dst->input_type(edge->dst_input());
+    // Even though we can look inside function calling ops, we make an exception
+    // here mostly for performance reasons. Looking inside function calling ops
+    // is extra overhead. It is only necessary when they return resources. When
+    // they don't, we don't look inside them and make this exception here.
+    // Looking inside could potentially enable us to make better placement
+    // decisions. It might be worth doing at some point.
     if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
         !IsExemptFromResourceInputColocation(dst)) {
       TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
     }
   }
+
+  return Status::OK();
+}
+
+Status ColocationGraph::AddInspectionConstraints(
+    const std::unordered_set<Node*>& inspection_required) {
+  for (Node* node : inspection_required) {  // Returns on the first error.
+    IOColocationGroups groups;  // Filled in by the inspecting placer below.
+    TF_RETURN_IF_ERROR(
+        inspecting_placer_.ComputeIOColocationGroups(*node, &groups));
+    VLOG(2) << "Computed IOColocationGroups for node " << node->name()
+            << ":\n\t" << groups.DebugString();
+    TF_RETURN_IF_ERROR(ApplyIOColocationGroups(groups, *node));
+  }
   return Status::OK();
 }
 
 Status ColocationGraph::Initialize() {
   TF_RETURN_IF_ERROR(InitializeMembers());
-  TF_RETURN_IF_ERROR(ColocateResourceAndRefEdges());
-  TF_RETURN_IF_ERROR(ColocateAllNodes());
+
+  std::unordered_set<Node*> inspection_required;
+  TF_RETURN_IF_ERROR(ColocateResourceAndRefEdges(&inspection_required));
+  TF_RETURN_IF_ERROR(AddInspectionConstraints(inspection_required));
+
+  return ColocateAllNodes();
+}
+
+// pair containing a node and whether this node has a resource input
+// from the node requiring placer inspection.
+using NodeAndBool = std::pair<const Node*, bool>;
+
+namespace {
+
+// Collects, in order, the name of the node held by each entry of `nodes`.
+std::vector<string> NodeAndBoolToString(const std::vector<NodeAndBool>& nodes) {
+  std::vector<string> names;
+  names.reserve(nodes.size());
+  for (const auto& entry : nodes) {
+    names.emplace_back(entry.first->name());
+  }
+  return names;
+}
+
+// Given a node requiring placer inspection and its IOColocationGroups,
+// fills `group_nodes` (resized here to one entry per colocation group).
+// group_nodes[i] contains the nodes that are members of colocation
+// group i. These nodes are inputs or outputs of `node`.
+// group_nodes[i][j] is a pair containing a node and whether this node
+// has a resource input from `node`.
+// Note:
+// The same node can be added multiple times to the same group.
+// The same node can be added to multiple groups.
+Status GetGroupNodes(const IOColocationGroups& groups, const Node& node,
+                     std::vector<std::vector<NodeAndBool>>* group_nodes) {
+  group_nodes->resize(groups.group_devices.size());
+  for (int arg_idx = 0; arg_idx < groups.input_groups.size(); ++arg_idx) {
+    const Node* src;
+    TF_RETURN_IF_ERROR(node.input_node(arg_idx, &src));
+    int group_id = groups.input_groups[arg_idx];
+    (*group_nodes)[group_id].emplace_back(src, false);
+  }
+
+  for (const Edge* edge : node.out_edges()) {
+    if (edge->IsControlEdge()) {
+      continue;
+    }
+
+    int group_id = groups.output_groups[edge->src_output()];
+    (*group_nodes)[group_id].emplace_back(
+        edge->dst(), edge->dst()->input_type(edge->dst_input()) == DT_RESOURCE);
+  }
+
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "Colocated inputs/outputs of node: " << node.DebugString();
+    for (const std::vector<NodeAndBool>& nodes : *group_nodes) {
+      VLOG(2) << "\t[" << absl::StrJoin(NodeAndBoolToString(nodes), "\t\n")
+              << "]";
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status ColocationGraph::ApplyIOColocationGroups(
+    const IOColocationGroups& groups, const Node& node) {
+  if (groups.input_groups.size() != node.num_inputs()) {
+    return errors::Internal(
+        "Cannot apply input/output device constraints to node ",
+        node.DebugString(), " because input_groups.size() (",
+        groups.input_groups.size(),
+        ") is different from number of inputs into the op node (",
+        node.num_inputs(), ")");
+  }
+  if (groups.output_groups.size() != node.num_outputs()) {
+    return errors::Internal(
+        "Cannot apply input/output device constraints to node ",
+        node.DebugString(), " because output_groups.size() (",
+        groups.output_groups.size(),
+        ") is different from number of outputs into the op node (",
+        node.num_outputs(), ")");
+  }
+
+  // group_nodes[i] contains the nodes that are members of colocation
+  // group i. These nodes are inputs or outputs of `node`.
+  // group_nodes[i][j] is a pair containing the node and whether this node
+  // has a resource input from `node`.
+  // The same node can be added multiple times to the same group.
+  // The same node can be added to multiple groups.
+  // NOTE: group ids are guaranteed to be [0, 1, ..., num_groups - 1].
+  std::vector<std::vector<NodeAndBool>> group_nodes(
+      groups.group_devices.size());
+  TF_RETURN_IF_ERROR(GetGroupNodes(groups, node, &group_nodes));
+
+  // Colocate every node of a group with that group's first node.
+  for (const std::vector<NodeAndBool>& nodes : group_nodes) {
+    for (int i = 1; i < nodes.size(); ++i) {
+      VLOG(2) << "Colocating \"" << nodes[0].first->name() << "\" and \""
+              << nodes[i].first->name() << "\"";
+      if (nodes[i].second) {
+        TF_RETURN_IF_ERROR(
+            ColocateResourceOrRefEdge(nodes[0].first, nodes[i].first));
+      } else {
+        TF_RETURN_IF_ERROR(ColocateNodes(*nodes[0].first, *nodes[i].first));
+      }
+    }
+  }
+
+  // Limit devices in each group, addressing the group by its first node.
+  for (int group_id = 0; group_id < groups.group_devices.size(); ++group_id) {
+    // Nothing to do for empty groups. Groups can be empty if some output
+    // of an op is not used.
+    if (group_nodes[group_id].empty()) {
+      continue;
+    }
+    const Node* group_node = group_nodes[group_id][0].first;
+    const PossibleDevices& possible_devices = groups.group_devices[group_id];
+    TF_RETURN_IF_ERROR(LimitToPossibleDevices(*group_node, possible_devices));
+  }
+
   return Status::OK();
 }
 
@@ -698,8 +901,8 @@
 // NOTE: If this method returns an error, *this is left in an undefined
 // state.
 Status ColocationGraph::ColocateNodes(const Node& x, const Node& y) {
-  int x_root = FindRoot(x.id());
-  int y_root = FindRoot(y.id());
+  int x_root = FindAndUpdateRoot(x.id());
+  int y_root = FindAndUpdateRoot(y.id());
   return ColocateNodes(x, x_root, y, y_root);
 }
 
@@ -760,7 +963,7 @@
         "got: ",
         node.DebugString());
   }
-  int root = FindRoot(node.id());
+  int root = FindAndUpdateRoot(node.id());
   Member& root_member = members_[root];
   return root_member.AssignDevice(node, allow_soft_placement_);
 }
@@ -785,6 +988,14 @@
     return;
   }
 
+  // TODO(iga): Disallow changing resource devices when this ColocationGraph
+  // is for :
+  // - a function called by an op requiring deep inspection, or
+  // - a graph containing ops requiring inspection.
+  // It is fairly tricky to make changing resource devices in presence of
+  // ops requiring inspection work correctly. One thing it would require is to
+  // communicate these "resource movement" decisions across Placer instances.
+
   // Failed to find supported devices that don't violate resource devices.
   // Try finding some devices that violated resource devices.
   // If we succceed, we will log a warning below.
@@ -808,17 +1019,17 @@
   }
 }
 
-// For the given node, subject to the constraints previously given
-// to this ColocationGraph, set its assigned_device_name. Returns OK
-// if a satisfying device can be found, otherwise an error.
-//
-// Note: This method returns a pointer to a field within members_.
-// The caller must not use the returned pointer after there is any possibility
-// that the members_[i].possible_devices field has been modified.
+Status ColocationGraph::LimitToPossibleDevices(const Node& node,
+                                               const PossibleDevices& devices) {
+  // Apply the restriction to the root member of `node`'s colocation group.
+  const int group_root_id = FindAndUpdateRoot(node.id());
+  return members_[group_root_id].LimitToPossibleDevices(devices, allow_soft_placement_);
+}
+
 Status ColocationGraph::GetDevicesForNode(
     Node* node, const std::vector<Device*>** possible_devices) {
   *possible_devices = nullptr;
-  const int node_root = FindRoot(node->id());
+  const int node_root = FindAndUpdateRoot(node->id());
   if (!members_[node_root].possible_devices().empty()) {
     *possible_devices = &members_[node_root].possible_devices();
     return Status::OK();
@@ -959,7 +1170,7 @@
   return Status::OK();
 }
 
-string ColocationGraph::DebugString() {
+string ColocationGraph::DebugString() const {
   std::unordered_set<int> roots;
   std::vector<string> root_strings;
   for (const Node* node : graph_.nodes()) {
@@ -976,7 +1187,7 @@
 }
 
 // Returns debugging info for the node referred to by 'node_root'.
-string ColocationGraph::DebugInfo(const int node_root) {
+string ColocationGraph::DebugInfo(const int node_root) const {
   string text(
       "\nColocation Debug Info:\n"
       "Colocation group had the following types and supported devices: ");
@@ -1111,7 +1322,7 @@
     // If the NodeDef contains a device, then we interpret it as a
     // (partial) device specification.
     if (!node.requested_device().empty()) {
-      if (IsResourceGeneratorNode(node)) {
+      if (IsRefOrResourceGeneratorNode(node)) {
         // Treat requested device on resource generating nodes as assigned
         // device so that we don't override it.
         TF_RETURN_IF_ERROR(member->SetResourceDeviceName(node));
diff --git a/tensorflow/core/common_runtime/colocation_graph.h b/tensorflow/core/common_runtime/colocation_graph.h
index 0c6a41b..03140da 100644
--- a/tensorflow/core/common_runtime/colocation_graph.h
+++ b/tensorflow/core/common_runtime/colocation_graph.h
@@ -21,6 +21,8 @@
 
 #include "absl/strings/str_join.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/inspecting_placer.h"
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -46,6 +48,8 @@
   Status SetResourceDeviceName(const Node& node);
   Status SetRequestedDeviceName(const Node& node);
 
+  Status FillPossibleDevices(PossibleDevices* possible_device) const;
+
   Status EnsureCompatibilityAcrossResourceEdge(
       const Node& src, const Member& src_root,
       const Node& dst, /*dst_root is this*/
@@ -60,10 +64,14 @@
   static void Merge(std::vector<Member>* tree, int x_root, int y_root,
                     Member** new_root, Member** old_root, bool dry_run);
 
-  // tree is non-const because we can change some `parent` pointers in some
-  // members for more efficient future lookups. The vector itself is not
-  // changed.
-  static int FindRoot(std::vector<Member>* tree, int node_id);
+  // Returns the root node of the disjoint tree to which the node with the
+  // given id is connected.
+  // FindRoot should be called only for debugging or after the members have
+  // been updated with direct root pointers because it does not update
+  // root pointers and can traverse many links. It exists to have
+  // a const version of FindAndUpdateRoot
+  static int FindRoot(const std::vector<Member>& tree, int node_id);
+  static int FindAndUpdateRoot(std::vector<Member>* tree, int node_id);
 
   Status MergeDeviceNames(const Member& other, bool allow_soft_placement);
 
@@ -74,6 +82,11 @@
 
   Status AssignDevice(const Node& node, bool allow_soft_placement);
 
+  // Limit the possible devices of this (should be a root) to the device
+  // specifications in `devices`.
+  Status LimitToPossibleDevices(const PossibleDevices& devices,
+                                bool allow_soft_placement);
+
   void set_possible_devices(std::vector<Device*>&& devices) {
     possible_devices_ = devices;
   }
@@ -92,6 +105,10 @@
   string DebugString() const;
 
  private:
+  // Updates this to contain the intersection of the device types in
+  // this and `other_devices`.
+  bool MergeSupportedDevices(const PrioritizedDeviceTypeVector& other_devices);
+
   // The id of the node that is the parent of this one, or its own
   // id if it is a root. parent <= 0 indicates that this member is invalid.
   int parent_ = -1;
@@ -115,7 +132,7 @@
 
   // The merged form of the device requested for this node, with those of all of
   // its children. requested_device_name_ is always kept a specialization (i.e.
-  // DeviceNameUtils::IsSpecialization) of assigned_device_name_. When no device
+  // DeviceNameUtils::IsSpecification) of assigned_device_name_. When no device
   // is requested, this field is set to assigned_device_name_.  As a
   // specialization of assigned_device_name_, requested_device_name_ represents
   // the most specific form of all assigned and requested devices of this node
@@ -124,7 +141,7 @@
   // to resource colocation constraints but not assigned devices (unless soft
   // placement is on).
   // INVARIANT: requested_device_name_ is always kept a
-  // DeviceNameUtils::IsSpecialization of assigned_device_name_ and
+  // DeviceNameUtils::IsSpecification of assigned_device_name_ and
   // resource_device_name_. This makes requested_device_name_ the "accumulation
   // of all wishes" about the device.
   DeviceNameUtils::ParsedName requested_device_name_;
@@ -185,12 +202,53 @@
 // device is ignored.
 class ColocationGraph {
  public:
-  // graph, flib_def, and device_set must not be null and must outlive this
-  // ColocationGraph. default_device can be null. If not, must outlive this.
-  ColocationGraph(const Graph* graph, const FunctionLibraryDefinition* flib_def,
+  // graph, flib_def, and device_set must not be null and must outlive
+  // this ColocationGraph. default_device can be null. If not, must outlive
+  // this.
+  ColocationGraph(const Graph* graph, const FunctionStack& stack,
+                  const FunctionLibraryDefinition* flib_def,
                   const DeviceSet* device_set, const Device* default_device,
                   bool allow_soft_placement, bool log_device_placement);
 
+  Status Initialize();
+
+  const std::vector<Member>& members() const { return members_; }
+
+  // Limit the group containing `node` to the device specifications in
+  // `devices`.
+  Status LimitToPossibleDevices(const Node& node,
+                                const PossibleDevices& devices);
+
+  // Limits the possible devices of `node`'s colocation group to the device
+  // to which `node` is assigned. This makes sure that all nodes in this
+  // colocation group will be assigned to the same device. Without this
+  // explicit restriction, heuristics can choose a different possible device
+  // for other nodes in the group.
+  Status LimitToAssignedDevice(const Node& node);
+
+  // Returns the id of the root node of the disjoint tree to which the node
+  // with the given id is connected.
+  // Updates the internal pointers so that future calls will return faster.
+  int FindAndUpdateRoot(int node_id) {
+    return Member::FindAndUpdateRoot(&members_, node_id);
+  }
+
+  // For the given node, subject to the constraints previously given
+  // to this ColocationGraph, set its assigned_device_name. Returns OK
+  // if a satisfying device can be found, otherwise an error.
+  //
+  // Note: This method returns a pointer to a field within members_.
+  // The caller must not use the returned pointer after there is any possibility
+  // that the members_[i].possible_devices field has been modified.
+  Status GetDevicesForNode(Node* node,
+                           const std::vector<Device*>** possible_devices);
+
+  // Returns debugging info for the node referred to by 'node_root'.
+  string DebugInfo(const int node_root) const;
+
+  string DebugString() const;
+
+ private:
   // Adds each node of the Graph to this ColocationGraph as a singleton.
   //
   // NOTE: The implementation assumes that the ids of nodes passed to
@@ -200,11 +258,46 @@
   // state.
   Status ColocateAllNodes();
 
-  Status ColocateResourceOrRefEdge(Node* src, Node* dst);
+  Status ColocateResourceOrRefEdge(const Node* src, const Node* dst);
 
-  Status ColocateResourceAndRefEdges();
+  // Updates this ColocationGraph by making sure that all nodes
+  // touching resource and/or ref tensors are colocated.
+  // As it iterates over the edges, fills the `inspection_required` set with
+  // the nodes that
+  // PlacerInspectionRequiredOpChecker::IsPlacerInspectionRequired
+  // deems as requiring deep inspection by placer. This is an optimization.
+  Status ColocateResourceAndRefEdges(
+      std::unordered_set<Node*>* inspection_required);
 
-  Status Initialize();
+  Status AddInspectionConstraints(
+      const std::unordered_set<Node*>& inspection_required);
+
+  // Applies colocation groups for `node`'s inputs and outputs to this
+  // ColocationGraph.
+  // `groups` are the colocation groups to which `node`'s inputs and outputs
+  // belong.
+  // `node` is a node requiring deep inspection (e.g. a node calling
+  // a function)
+  //
+  // For example, consider a `node` taking two inputs and producing one output
+  //    a  b
+  //    |  |
+  //    v  v
+  //    node
+  //     |
+  //     v
+  //     c
+  //
+  // `groups` can tell us that `a` and `c` must be colocated and their device
+  // must be a GPU. `b` might be in a group by itself without any device
+  // restrictions.
+  //
+  // ApplyIOColocationGroups will have an effect of calling
+  // ColocateNodes(a, c) and LimitToPossibleDevices(`a`, "GPU"). The colocation
+  // group of the `node` itself is not directly impacted.
+  //
+  Status ApplyIOColocationGroups(const IOColocationGroups& groups,
+                                 const Node& node);
 
   Status ColocateNodeToGroup(
       std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
@@ -224,34 +317,12 @@
   // If this method returns an error, *this is unchanged.
   Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root);
 
-  // Limits the possible devices of `node`'s colocation group to the device
-  // to which `node` is assigned. This makes sure that all nodes in this
-  // colocation group will be assigned to the same device. Without this
-  // explicit restriction, heuristics can choose a different possible device
-  // for other nodes in the group.
-  Status LimitToAssignedDevice(const Node& node);
-
-  // For the given node, subject to the constraints previously given
-  // to this ColocationGraph, set its assigned_device_name. Returns OK
-  // if a satisfying device can be found, otherwise an error.
-  //
-  // Note: This method returns a pointer to a field within members_.
-  // The caller must not use the returned pointer after there is any possibility
-  // that the members_[i].possible_devices field has been modified.
-  Status GetDevicesForNode(Node* node,
-                           const std::vector<Device*>** possible_devices);
-
   void GetSoftDeviceCandidates(const Node& node, const Member& root_member,
                                int root_id,
                                std::vector<Device*>* possible_devices);
 
   Status InitializeMembers();
 
-  string DebugString();
-
-  // Returns debugging info for the node referred to by 'node_root'.
-  string DebugInfo(const int node_root);
-
   Status InitializeMemberWithAssignedDevice(const string& assigned_device_name,
                                             const string& node_type,
                                             Member* member);
@@ -260,11 +331,20 @@
 
   // Returns the root node of the disjoint tree to which the node with the
   // given id is connected.
-  int FindRoot(int node_id) { return Member::FindRoot(&members_, node_id); }
+  // FindRoot should be called only for debugging or after the members have
+  // been updated with direct root pointers because it does not update
+  // root pointers and can traverse many links. It exists to have
+  // a const version of FindAndUpdateRoot
+  int FindRoot(int node_id) const {
+    return Member::FindRoot(members_, node_id);
+  }
 
   const Graph& graph_;
+  const FunctionStack stack_;
   const FunctionLibraryDefinition& flib_def_;
   std::vector<Member> members_;
+  InspectingPlacer inspecting_placer_;
+  PlacerInspectionRequiredOpChecker inspection_required_checker_;
   const DeviceSet& device_set_;
   const std::vector<DeviceType> device_types_;
   const Device* default_device_;
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index 6e2eb66..3482429 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -50,7 +50,8 @@
 void CopyHostToDevice(const Tensor* input, Allocator* cpu_allocator,
                       Allocator* out_allocator, StringPiece edge_name,
                       Device* dst, Tensor* output,
-                      DeviceContext* recv_dev_context, StatusCallback done) {
+                      DeviceContext* recv_dev_context, StatusCallback done,
+                      bool sync_dst_compute) {
   if (input->dtype() == DT_VARIANT) {
     Tensor copy(cpu_allocator, DT_VARIANT, input->shape());
     auto* status_cb = new ReffedStatusCallback(std::move(done));
@@ -62,13 +63,14 @@
     };
     auto copier = std::bind(
         [dst, recv_dev_context, out_allocator, status_cb, cpu_allocator,
-         edge_name](StatusCallback wrapped_done_,
-                    // Begin unbound arguments
-                    const Tensor& from, Tensor* to) {
+         edge_name, sync_dst_compute](StatusCallback wrapped_done_,
+                                      // Begin unbound arguments
+                                      const Tensor& from, Tensor* to) {
           if (from.dtype() == DT_VARIANT) {
             status_cb->Ref();
             CopyHostToDevice(&from, cpu_allocator, out_allocator, edge_name,
-                             dst, to, recv_dev_context, wrapped_done_);
+                             dst, to, recv_dev_context, wrapped_done_,
+                             sync_dst_compute);
             return Status::OK();
           } else {
             if (!DMAHelper::CanUseDMA(&from)) {
@@ -82,8 +84,8 @@
             if (status_cb->ok()) {
               status_cb->Ref();
               *to = Tensor(out_allocator, from.dtype(), from.shape());
-              recv_dev_context->CopyCPUTensorToDevice(&from, dst, to,
-                                                      wrapped_done_);
+              recv_dev_context->CopyCPUTensorToDevice(
+                  &from, dst, to, wrapped_done_, sync_dst_compute);
               return Status::OK();
             } else {
               return status_cb->status();
@@ -107,8 +109,8 @@
       *output = std::move(copy);
     }
   } else {
-    recv_dev_context->CopyCPUTensorToDevice(input, dst, output,
-                                            std::move(done));
+    recv_dev_context->CopyCPUTensorToDevice(input, dst, output, std::move(done),
+                                            sync_dst_compute);
   }
 }
 
@@ -251,7 +253,8 @@
                         Device* dst, const AllocatorAttributes src_alloc_attr,
                         const AllocatorAttributes dst_alloc_attr,
                         const Tensor* input, Tensor* output,
-                        int dev_to_dev_stream_index, StatusCallback done) {
+                        int dev_to_dev_stream_index, StatusCallback done,
+                        bool sync_dst_compute) {
   tracing::ScopedAnnotation annotation(edge_name);
   VLOG(1) << "Copy " << edge_name;
 
@@ -304,16 +307,17 @@
         std::move(done), std::placeholders::_1);
     std::function<void(const Status&)> then_copy_to_other_device = std::bind(
         [delete_and_done, recv_dev_context, cpu_tensor, cpu_allocator,
-         out_allocator, edge_name, dst, output](StatusCallback delete_and_done_,
-                                                // Begin unbound arguments.
-                                                Status status) {
+         out_allocator, edge_name, dst, output,
+         sync_dst_compute](StatusCallback delete_and_done_,
+                           // Begin unbound arguments.
+                           Status status) {
           if (!status.ok()) {
             delete_and_done_(status);
             return;
           }
           CopyHostToDevice(cpu_tensor, cpu_allocator, out_allocator, edge_name,
                            dst, output, recv_dev_context,
-                           std::move(delete_and_done_));
+                           std::move(delete_and_done_), sync_dst_compute);
         },
         std::move(delete_and_done), std::placeholders::_1);
     CopyDeviceToHost(input, cpu_allocator, out_allocator, edge_name, src,
@@ -334,7 +338,8 @@
   if (!non_cpu_src && non_cpu_dst) {
     // Host to Device copy.
     CopyHostToDevice(input, cpu_allocator, out_allocator, edge_name, dst,
-                     output, recv_dev_context, std::move(done));
+                     output, recv_dev_context, std::move(done),
+                     sync_dst_compute);
     return;
   }
 
diff --git a/tensorflow/core/common_runtime/copy_tensor.h b/tensorflow/core/common_runtime/copy_tensor.h
index 9cd5ac2..5879442 100644
--- a/tensorflow/core/common_runtime/copy_tensor.h
+++ b/tensorflow/core/common_runtime/copy_tensor.h
@@ -45,7 +45,8 @@
                      const AllocatorAttributes src_alloc_attr,
                      const AllocatorAttributes dst_alloc_attr,
                      const Tensor* input, Tensor* output,
-                     int dev_to_dev_stream_index, StatusCallback done);
+                     int dev_to_dev_stream_index, StatusCallback done,
+                     bool sync_dst_compute = true);
 
   // Object used to call Register() at static-initialization time.
   // Note: This should only ever be used as a global-static object; no stack
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 7a54adc..9361521 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -33,6 +33,7 @@
 #include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
@@ -68,6 +69,7 @@
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
@@ -265,6 +267,14 @@
                                true /* owned */);
   } else {
     thread_pools_.emplace_back(GlobalThreadPool(options), false /* owned */);
+    // Run in the caller's thread if config.inter_op_parallelism_threads is
+    // negative, or if it is 0 (unset) and TF_NUM_INTEROP_THREADS is negative.
+    static const int env_num_threads = NumInterOpThreadsFromEnvironment();
+    if (options_.config.inter_op_parallelism_threads() < 0 ||
+        (options_.config.inter_op_parallelism_threads() == 0 &&
+         env_num_threads < 0)) {
+      run_in_caller_thread_ = true;
+    }
   }
   // The default value of sync_on_finish will be flipped soon and this
   // environment variable will be removed as well.
@@ -434,8 +444,9 @@
                                   ExecutorsAndKeys* executors_and_keys,
                                   RunMetadata* run_metadata) {
   const uint64 start_time_usecs = options_.env->NowMicros();
-  string session_id_meta = strings::StrCat("SessionRun #id=", step_id, "#");
-  tracing::ScopedActivity activity(session_id_meta);
+  profiler::TraceMe activity(
+      [&] { return strings::StrCat("SessionRun #id=", step_id, "#"); },
+      profiler::TraceMeLevel::kInfo);
 
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
@@ -564,6 +575,9 @@
       run_options.inter_op_thread_pool() >= 0
           ? thread_pools_[run_options.inter_op_thread_pool()].first
           : nullptr;
+  if (run_in_caller_thread_) {
+    pool = nullptr;
+  }
 
   if (pool == nullptr) {
     // We allow using the caller thread only when having a single executor
@@ -1248,6 +1262,11 @@
       if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string()))
         delete kernel;
     };
+    params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr,
+                                   Rendezvous** r) {
+      *r = new IntraProcessRendezvous(device_mgr);
+      return Status::OK();
+    };
 
     optimizer.Optimize(lib, options_.env, device, &partition_graph,
                        /*shape_map=*/nullptr);
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 619d69c..7e94e98 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -401,6 +401,18 @@
   mutex collective_graph_key_lock_;
   int64 collective_graph_key_ GUARDED_BY(collective_graph_key_lock_) = -1;
 
+  // Run in the caller's thread if RunOptions.inter_op_thread_pool is negative
+  // or all of the following conditions are met:
+  // 1. This session doesn't own any thread pool.
+  // 2. RunOptions.inter_op_thread_pool is unspecified or 0.
+  // 3. This session has a single executor.
+  // 4. config.inter_op_parallelism_threads is set to a negative value, either
+  //    explicitly or through the environment variable TF_NUM_INTEROP_THREADS.
+  // 5. RunOptions.experimental.use_run_handler_pool is unspecified or false.
+  // Otherwise run in the global thread pool, a session-owned thread pool or the
+  // handler pool, according to the other settings in RunOptions and ConfigProto.
+  bool run_in_caller_thread_ = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(DirectSession);
 
   // EXPERIMENTAL: debugger (tfdbg) related
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index a6440c5..68e035f 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -109,7 +109,7 @@
     z_ = z->name();
     z->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
 
-    test::graph::ToGraphDef(&graph, &def_);
+    graph.ToGraphDef(&def_);
   }
 
   string a_;
@@ -540,7 +540,7 @@
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:2");
 
-  test::graph::ToGraphDef(&graph, &def);
+  graph.ToGraphDef(&def);
 
   SessionOptions options;
   (*options.config.mutable_device_count())["CPU"] = 2;
@@ -552,7 +552,7 @@
   // Fix placement and run again
   def.Clear();
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
-  test::graph::ToGraphDef(&graph, &def);
+  graph.ToGraphDef(&def);
   session.reset(NewSession(options));
   TF_ASSERT_OK(session->Create(def));
   std::vector<Tensor> outputs;
@@ -671,7 +671,7 @@
   Node* init = test::graph::Assign(&g, var, twenty_node);
   init->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -706,7 +706,7 @@
   Node* second_const = test::graph::Constant(&g, second_value);
   Node* second_identity = test::graph::Identity(&g, second_const);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -779,7 +779,7 @@
   Node* second_const = test::graph::Constant(&g, second_value);
   Node* second_identity = test::graph::Identity(&g, second_const);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -865,7 +865,7 @@
   Node* y = test::graph::Add(&graph, left, right);
 
   GraphDef def;
-  test::graph::ToGraphDef(&graph, &def);
+  graph.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -904,7 +904,7 @@
   Node* seven_node = test::graph::Constant(&g, seven_tensor);
 
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -941,7 +941,7 @@
   Node* second_const = test::graph::Constant(&g, second_value);
   Node* second_identity = test::graph::Identity(&g, second_const);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1031,7 +1031,7 @@
   Node* x = test::graph::Constant(&g, vx);
   Node* y = test::graph::Unary(&g, "ThreadID", x);
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
   auto sess = CreateSession();
   TF_ASSERT_OK(sess->Create(def));
   std::vector<Tensor> outputs;
@@ -1044,6 +1044,27 @@
             static_cast<int64>(outputs[0].scalar<int64>()()));
 }
 
+TEST(DirectSessionTest, SyncSession) {
+  Graph g(OpRegistry::Global());
+  Tensor vx(DT_INT64, TensorShape({}));
+  vx.scalar<int64>()() = 17;
+  Node* x = test::graph::Constant(&g, vx);
+  Node* y = test::graph::Unary(&g, "ThreadID", x);
+  GraphDef def;
+  g.ToGraphDef(&def);
+  SessionOptions options;
+  options.config.set_inter_op_parallelism_threads(-1);  // run in caller thread
+  std::unique_ptr<Session> sess(NewSession(options));
+  TF_ASSERT_OK(sess->Create(def));
+  std::vector<Tensor> outputs;
+  RunOptions run_opts;
+  auto s = sess->Run(run_opts, {}, {y->name() + ":0"}, {}, &outputs, nullptr);
+  TF_ASSERT_OK(s);  // Abort on failure; `outputs[0]` below needs a result.
+  std::hash<std::thread::id> hasher;
+  EXPECT_EQ(static_cast<int64>(hasher(std::this_thread::get_id())),
+            static_cast<int64>(outputs[0].scalar<int64>()()));
+}
+
 REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
 Darth promises one return value.
 
@@ -1066,7 +1087,7 @@
   Node* x = test::graph::Constant(&g, vx);
   Node* y = test::graph::Unary(&g, "Darth", x);
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
   auto sess = CreateSession();
   TF_ASSERT_OK(sess->Create(def));
   std::vector<Tensor> outputs;
@@ -1084,7 +1105,7 @@
     Node* y = test::graph::Unary(&g, "Darth", x);
     y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     GraphDef def;
-    test::graph::ToGraphDef(&g, &def);
+    g.ToGraphDef(&def);
 
     // By default, we place the entire graph, so we should fail the
     // call to Create.
@@ -1102,7 +1123,7 @@
     Node* y = test::graph::Unary(&g, "Darth", x);
     y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     GraphDef def;
-    test::graph::ToGraphDef(&g, &def);
+    g.ToGraphDef(&def);
 
     SessionOptions options;
     // Set the option to place pruned graphs, we should expect this
@@ -1133,7 +1154,7 @@
   Node* third = test::graph::Add(&g, first_identity, second_identity);
   Node* third_identity = test::graph::Identity(&g, third);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1189,7 +1210,7 @@
   Node* third = test::graph::Add(&g, first_identity, second_identity);
   Node* third_identity = test::graph::Identity(&g, third);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1222,7 +1243,7 @@
   Node* switch_node = test::graph::Switch(&g, bool_const, bool_const);
   Node* fourth_identity = test::graph::Identity(&g, switch_node, 1);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1271,7 +1292,7 @@
 
   Node* node7 = test::graph::Unary(&g, "DeleteSessionTensor", const2);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1324,7 +1345,7 @@
 
   Node* node7 = test::graph::Unary(&g, "DeleteSessionTensor", const2);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1526,7 +1547,7 @@
     y = test::graph::Unary(&g, "BlockingOp", x);
   }
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
   *def.mutable_library() = library_graph_def;
 
   // Create session with two inter-op thread pools.
@@ -1677,7 +1698,7 @@
   t.scalar<float>()() = {1.2f};
   Node* x = test::graph::Constant(&g, t);
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   SessionOptions options;
   options.config.mutable_graph_options()
@@ -1736,7 +1757,7 @@
   Node* var = test::graph::Var(&g, DT_FLOAT, {});
   Node* var_assign = test::graph::Assign(&g, var, var_val);
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   SessionOptions options;
   (*options.config.mutable_device_count())["CPU"] = 2;
@@ -1790,7 +1811,7 @@
   Node* third = test::graph::Add(&g, first_identity, second_identity);
   Node* third_identity = test::graph::Identity(&g, third);
 
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
@@ -1829,7 +1850,7 @@
   Node* var = test::graph::Var(&g, DT_FLOAT, {});
   Node* var_assign = test::graph::Assign(&g, var, var_val);
   GraphDef def;
-  test::graph::ToGraphDef(&g, &def);
+  g.ToGraphDef(&def);
 
   SessionOptions options;
   (*options.config.mutable_device_count())["CPU"] = 2;
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 9f5aa1d..d8ab46e 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -167,6 +167,7 @@
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core/profiler/lib:traceme",
             "//tensorflow/core/grappler/optimizers:meta_optimizer",
         ],
     }),
@@ -277,3 +278,14 @@
         "//tensorflow/core:test_main",
     ],
 )
+
+filegroup(
+    name = "srcs",
+    srcs = glob(
+        [
+            "*.cc",
+            "*.h",
+        ],
+        exclude = ["*_test.cc"],  # Test sources are left out of this group.
+    ),
+)
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index bde58a0..e259115 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,19 +15,25 @@
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 #include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/monitoring.h"
 #include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
@@ -72,6 +78,10 @@
       use_send_tensor_rpc_(false),
       pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
           "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", false)) {
+  // Starts exporting metrics through a platform-specific monitoring API (if
+  // provided). For builds using "tensorflow/core/platform/default", this is
+  // currently a no-op.
+  monitoring::StartExporter();
   if (device_mgr_owned) {
     local_device_manager_.reset(device_mgr);
     local_unowned_device_manager_ = nullptr;
@@ -140,15 +150,13 @@
   return Status::OK();
 }
 
-Status EagerContext::ClearCaches() {
+void EagerContext::ClearCaches() {
   // The executor stores pointers to kernels, so we need to make sure that no
   // async eager ops are still executing. We lock the cache during this time as
   // well.
   mutex_lock ml(cache_mu_);
-  TF_RETURN_IF_ERROR(executor_.WaitForAllPendingNodes());
+  executor_.WaitForAllPendingNodes().IgnoreError();  // Still clear on error.
   gtl::STLDeleteValues(&kernel_cache_);
-
-  return Status::OK();
 }
 
 void EagerContext::SetThreadLocalDevicePlacementPolicy(
@@ -166,7 +174,7 @@
   return policy_;
 }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 void EagerContext::CloseRemoteContexts() {
   // Close all remote contexts.
   std::vector<eager::CloseContextRequest> requests(remote_contexts_.size());
@@ -196,10 +204,12 @@
 
   counter.Wait();
 }
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
 EagerContext::~EagerContext() {
-#ifndef __ANDROID__
+  ClearCaches();  // Also needed on mobile: drains ops, frees cached kernels.
+
+#if !defined(IS_MOBILE_PLATFORM)
   if (server_) {
     // TODO(nareshmodi): Fix this.
     LOG(WARNING) << "Unable to destroy server_ object, so releasing instead. "
@@ -215,10 +225,8 @@
   keep_alive_thread_.reset();
 
   CloseRemoteContexts();
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
-  executor_.WaitForAllPendingNodes().IgnoreError();
-  ClearCaches().IgnoreError();
   rendezvous_->Unref();
 
   for (auto& thread : child_threads_) {
@@ -309,7 +317,7 @@
 
 Status EagerContext::MaybeRegisterFunctionRemotely(const FunctionDef& fdef) {
   if (remote_device_manager_ == nullptr) return Status::OK();
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
   BlockingCounter blocking_counter(static_cast<int>(remote_contexts_.size()));
 
   std::vector<eager::RegisterFunctionRequest> requests(remote_contexts_.size());
@@ -340,7 +348,7 @@
   for (int i = 0; i < remote_contexts_.size(); i++) {
     TF_RETURN_IF_ERROR(statuses[i]);
   }
-#endif
+#endif  // !IS_MOBILE_PLATFORM
   return Status::OK();
 }
 
@@ -399,7 +407,7 @@
 }
 }  // namespace
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 Status EagerContext::GetClientAndContextID(Device* device,
                                            eager::EagerClient** client,
                                            uint64* context_id) {
@@ -444,7 +452,7 @@
   devices_map_.clear();
 
   InitDeviceMapAndAsync();
-  TF_RETURN_IF_ERROR(ClearCaches());
+  ClearCaches();
 
   pflr_.reset(new ProcessFunctionLibraryRuntime(
       local_unowned_device_manager_, env_, TF_GRAPH_DEF_VERSION, &func_lib_def_,
@@ -509,12 +517,11 @@
 
   InitDeviceMapAndAsync();
 
-  TF_RETURN_IF_ERROR(ClearCaches());
+  ClearCaches();
+  executor_.ClearError();
 
   keep_alive_secs_ = keep_alive_secs;
-
   sleep_for_secs_ = std::max(1, keep_alive_secs_ / 2);
-
   // Only schedule a single closure.
   if (keep_alive_thread_ == nullptr) {
     keep_alive_thread_.reset(
@@ -574,6 +581,6 @@
   }
   return Status::OK();
 }
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 8b994d2..e360f30 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -23,6 +23,11 @@
 #include <string>
 #include <vector>
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
@@ -31,11 +36,11 @@
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/platform/env.h"
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -73,7 +78,7 @@
   virtual void BeforeClearRunMetadata() = 0;
 };
 
-class EagerContext {
+class EagerContext : public core::RefCounted {
  public:
   // TODO: remove this constructor once we migrate all callers to the next one.
   EagerContext(const SessionOptions& opts,
@@ -118,7 +123,7 @@
   }
 
   // Clears the kernel caches.
-  Status ClearCaches();
+  void ClearCaches();
 
   // Sets the device placement policy for the current thread.
   void SetThreadLocalDevicePlacementPolicy(ContextDevicePlacementPolicy policy);
@@ -200,7 +205,7 @@
 
   FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
   Status GetClientAndContextID(Device* device, eager::EagerClient** client,
                                uint64* context_id);
 
@@ -231,7 +236,7 @@
   Status StoreCollectiveOpsServer(
       std::unique_ptr<ServerInterface> server, DeviceMgr* device_mgr,
       CollectiveExecutorMgrInterface* rpc_collective_executor_mgr);
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
   // If true, then tensors should be shipped across processes via the
   // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
@@ -314,7 +319,7 @@
   std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr_;
   CollectiveExecutorMgrInterface* unowned_collective_executor_mgr_ = nullptr;
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
   void CloseRemoteContexts();
 
   // The server_ is not const since we release it when the context is destroyed.
@@ -337,7 +342,7 @@
   mutex keep_alive_thread_shutdown_mu_;
   condition_variable keep_alive_thread_cv_;
   bool shutting_down_ GUARDED_BY(keep_alive_thread_shutdown_mu_) = false;
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
   bool use_send_tensor_rpc_;
   const bool pin_small_ops_to_cpu_;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 145d981..0a46843 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -17,6 +17,11 @@
 
 #include <vector>
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 #include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
@@ -26,17 +31,19 @@
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/logging.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h"
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -560,10 +567,12 @@
     }
     const string& device_name =
         device == nullptr ? unspecified_device_name : device->name();
-    if (ctx->LogDevicePlacement()) {
-      LOG(INFO) << "Executing op " << ndef.op() << " in device " << device_name;
-    } else {
-      VLOG(1) << "Executing op " << ndef.op() << " in device " << device_name;
+    if (ctx->LogDevicePlacement() || VLOG_IS_ON(1)) {
+      string msg = strings::StrCat("Executing op ", ndef.op(), " in device ",
+                                   device_name);
+      if (!logging::LogToListeners(msg)) {
+        LOG(INFO) << msg;
+      }
     }
 
     FunctionLibraryRuntime* flr =
@@ -676,11 +685,13 @@
   return status;
 }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 std::function<void()> GetRemoteTensorDestructor(
     EagerContext* ctx, eager::EagerClient* eager_client, uint64 context_id,
     uint64 op_id, int output_num) {
+  ctx->Ref();
   return [ctx, eager_client, context_id, op_id, output_num]() {
+    auto cleanup = gtl::MakeCleanup([ctx]() { ctx->Unref(); });
     if (!ctx->HasActiveRemoteContext(context_id)) {
       // This means that this tensor was pointing to a remote device, which
       // has been changed out from under us. Simply return since there is
@@ -714,7 +725,7 @@
     return tensorflow::Status::OK();
   };
 }
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
 // When !ctx->UseSendTensorRPC(), then tensors are shipped between remote
 // devices by the receiver invoking the WorkerService.RecvTensor RPC *on the
@@ -726,10 +737,10 @@
 // *on the receiver*.
 Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
                              Device* recv_device, TensorHandle** result) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM)
   return errors::Unimplemented(
-      "Eager's remote execution is not available on Android devices.");
-#else
+      "Eager's remote execution is not available on mobile devices.");
+#else  // !IS_MOBILE_PLATFORM
   eager::EagerClient* eager_client;
   uint64 context_id;
   TF_RETURN_IF_ERROR(
@@ -786,15 +797,15 @@
   actual_handle->Unref();
 
   return Status::OK();
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 }
 
 Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
                           int* num_retvals) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM)
   return errors::Unimplemented(
-      "Eager's remote execution is not available on Android devices.");
-#else
+      "Eager's remote execution is not available on mobile devices.");
+#else  // !IS_MOBILE_PLATFORM
   EagerContext* ctx = op->EagerContext();
 
   eager::EagerClient* eager_client;
@@ -929,7 +940,7 @@
   }
 
   return Status::OK();
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 }
 
 // These ops are not pinnable since they generate data. It can be slower to
@@ -1053,9 +1064,12 @@
     return EagerLocalExecute(op, retvals, num_retvals);
   }
 
-  if (op->EagerContext()->LogDevicePlacement()) {
-    LOG(INFO) << "Executing op " << op->Name() << " in device "
-              << op->Device()->name();
+  if (op->EagerContext()->LogDevicePlacement() || VLOG_IS_ON(1)) {
+    string msg = strings::StrCat("Executing op ", op->Name(), " in device ",
+                                 op->Device()->name());
+    if (!logging::LogToListeners(msg)) {
+      LOG(INFO) << msg;
+    }
   }
 
   return EagerRemoteExecute(op, retvals->data(), num_retvals);
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index f6fa297..c5ffde1 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -35,11 +35,12 @@
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#endif
+#endif  // !IS_MOBILE_PLATFORM
 
 namespace tensorflow {
 
@@ -113,7 +114,7 @@
   if (it != ndef.attr().end()) {
     options.executor_type = it->second.s();
   }
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM)
   // Android tf library does not include grappler.
   const auto& config_it = ndef.attr().find("config_proto");
   if (it != ndef.attr().end()) {
@@ -141,7 +142,7 @@
         options.config_proto, function_def->signature().name(),
         optimization_options, std::placeholders::_6);
   }
-#endif
+#endif  // !IS_MOBILE_PLATFORM
   options.graph_collector = graph_collector;
 
   // In Eager mode we always inline all functions into the top-level
@@ -289,16 +290,20 @@
     done.WaitForNotification();
   } else {
     const string& op_name = kernel_->name();
-    // If tracing if off, the overheads of ScopedAnnotation and ScopedActivity
+    // If tracing is off, the overheads of ScopedAnnotation and TraceMe
     // are negligible.
     if (device_->TraceUsingAnnotations()) {
       // 'ScopedActivity' will trace the OpKernel scheduling time on host.
-      tracing::ScopedActivity activity(op_name, kernel_->type_string());
+      profiler::TraceMe activity(
+          [&] { return strings::StrCat(op_name, ":", kernel_->type_string()); },
+          profiler::TraceMeLevel::kInfo);
       // 'ScopedAnnotation' will trace the OpKernel execution time on device.
       tracing::ScopedAnnotation annotation(op_name, kernel_->type_string());
       device_->Compute(kernel_.get(), &context);
     } else {
-      tracing::ScopedActivity activity(op_name, kernel_->type_string());
+      profiler::TraceMe activity(
+          [&] { return strings::StrCat(op_name, ":", kernel_->type_string()); },
+          profiler::TraceMeLevel::kInfo);
       device_->Compute(kernel_.get(), &context);
     }
   }
@@ -322,18 +327,13 @@
   FunctionLibraryRuntime::Options opts;
   // We don't pass rendezvous from eager context because we can get tensor
   // name collisions in send/recv ops when running multiple instances
-  // of the same multi-device function concurrently. Instead, we ask the
-  // function library runtime to create a new for this call. We could have
-  // created one here but it requires more state to be kept in
-  // KernelAndDeviceFunc.
+  // of the same multi-device function concurrently.
   Rendezvous* rendezvous = new IntraProcessRendezvous(pflr_->device_mgr());
   opts.rendezvous = rendezvous;
   opts.create_rendezvous = false;
 
   opts.cancellation_manager = &cm_;
   cm_.Reset();
-  // eager runtime does not yet support collective ops.
-  opts.collective_executor = nullptr;
   opts.allow_dead_tensors = true;
   opts.step_container = step_container;
   opts.collective_executor =
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 5832bf4..d2e599b 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -22,6 +22,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/common_runtime/executor_factory.h"
 #include "tensorflow/core/common_runtime/pending_counts.h"
@@ -45,6 +47,7 @@
 #include "tensorflow/core/graph/edgeset.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -64,6 +67,8 @@
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
 namespace tensorflow {
@@ -153,14 +158,14 @@
   // The kernel for this node.
   OpKernel* kernel = nullptr;
 
-  bool kernel_is_async : 1;      // True iff kernel->AsAsync() != nullptr
-  bool is_merge : 1;             // True iff IsMerge(node)
-  bool is_enter : 1;             // True iff IsEnter(node)
-  bool is_constant_enter : 1;    // True iff IsEnter(node) and
-                                 // node->GetAttr("is_constant") == true.
-  bool is_exit : 1;              // True iff IsExit(node)
-  bool is_control_trigger : 1;   // True iff IsControlTrigger(node)
-  bool is_sink : 1;              // True iff IsSink(node)
+  bool kernel_is_async : 1;     // True iff kernel->AsAsync() != nullptr
+  bool is_merge : 1;            // True iff IsMerge(node)
+  bool is_enter : 1;            // True iff IsEnter(node)
+  bool is_constant_enter : 1;   // True iff IsEnter(node) and
+                                // node->GetAttr("is_constant") == true.
+  bool is_exit : 1;             // True iff IsExit(node)
+  bool is_control_trigger : 1;  // True iff IsControlTrigger(node)
+  bool is_sink : 1;             // True iff IsSink(node)
   // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node)
   bool is_enter_exit_or_next_iter : 1;
 
@@ -1244,6 +1249,7 @@
   int64 step_id_;
   // Not owned.
   Rendezvous* rendezvous_;
+  Executor::RendezvousFactory* create_rendezvous_ = nullptr;
   CollectiveExecutor* collective_executor_ = nullptr;
   SessionState* session_state_;
   string session_handle_;
@@ -1378,6 +1384,7 @@
       log_memory_(LogMemory::IsEnabled()),
       step_id_(args.step_id),
       rendezvous_(args.rendezvous),
+      create_rendezvous_(&impl->params_.rendezvous_factory),
       collective_executor_(args.collective_executor),
       session_state_(args.session_state),
       session_handle_(args.session_handle),
@@ -1586,11 +1593,11 @@
                 bool using_annotations) {
   // Tracing will only be enabled if either `event_collector` is non null,
   // or `trace_collector` is non-null and enabled for this particular kernel.
-  // Although `tracing::ScopedActivity`,
-  // `tracing::ScopedAnnotation`, and `tracing::ScopedRegion` check subsets of
-  // these properties internally in their constructors, the cost of passing the
-  // necessary arguments to them can be significant, so we avoid constructing
-  // them in the common case (when we know they will not be used).
+  // Although `profiler::TraceMe`, `tracing::ScopedAnnotation`, and
+  // `tracing::ScopedRegion` check subsets of these properties internally in
+  // their constructors, the cost of passing the necessary arguments to them can
+  // be significant, so we avoid constructing them in the common case (when we
+  // know they will not be used).
   if (event_collector != nullptr) {
     return true;
   }
@@ -1598,12 +1605,10 @@
   if (trace_collector) {
     if (using_annotations) {
       return trace_collector->IsEnabledForAnnotations();
-    } else {
-      return trace_collector->IsEnabledForActivities(
-          item.kernel->IsExpensive());
     }
   }
-  return false;
+  return profiler::TraceMeRecorder::Active(
+      profiler::GetTFTraceMeLevel(item.kernel->IsExpensive()));
 }
 
 void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
@@ -1624,6 +1629,7 @@
   params.log_memory = log_memory_;
   params.record_tensor_accesses = impl_->device_record_tensor_accesses_;
   params.rendezvous = rendezvous_;
+  params.create_rendezvous = create_rendezvous_;
   params.collective_executor = collective_executor_;
   params.session_state = session_state_;
   params.session_handle = session_handle_;
@@ -1807,16 +1813,18 @@
           tracing::ScopedRegion region(tracing::EventCategory::kCompute,
                                        op_name);
           if (trace_using_annotations_) {
-            // 'ScopedActivity' will trace the OpKernel scheduling time.
-            tracing::ScopedActivity activity(kernel_label);
+            // 'TraceMe' will trace the OpKernel scheduling time.
+            profiler::TraceMe activity(absl::string_view(kernel_label),
+                                       profiler::TraceMeLevel::kInfo);
             // 'ScopedAnnotation' will trace the OpKernel execution time.
             tracing::ScopedAnnotation annotation(kernel_label);
             device->Compute(op_kernel, &ctx);
           } else {
-            // Use the cheaper `ScopedActivity` to trace just the OpKernel
+            // Use the cheaper `TraceMe` to trace just the OpKernel
             // execution.
-            tracing::ScopedActivity activity(kernel_label,
-                                             op_kernel->IsExpensive());
+            profiler::TraceMe activity(
+                absl::string_view(kernel_label),
+                profiler::GetTFTraceMeLevel(op_kernel->IsExpensive()));
             device->Compute(op_kernel, &ctx);
           }
         } else {
@@ -2085,23 +2093,12 @@
 void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
                                      const NodeItem* item, EntryVector* outputs,
                                      TaggedNodeSeq* ready) {
-  auto activity_handle =
-      [&]() -> std::unique_ptr<tracing::TraceCollector::Handle> {
-    auto* trace_collector = tracing::GetTraceCollector();
-    if (TF_PREDICT_FALSE(trace_collector != nullptr &&
-                         trace_collector->IsEnabledForActivities(
-                             false /* is_expensive */))) {
-      const string& op_name = item->kernel->name();
-      // Intentionally using ExecutorPropagateOutputs as the first key so that
-      // users are aware that it's not the op invocation.
-      return trace_collector->CreateActivityHandle(
-          "ExecutorPropagateOutputs",
-          strings::StrCat(op_name, "#id=", step_id_, "#"),
-          false /* is_expensive */);
-    } else {
-      return nullptr;
-    }
-  }();
+  auto activity_handle = absl::make_unique<profiler::TraceMe>(
+      [&]() {
+        return strings::StrCat("ExecutorPropagateOutputs:",
+                               item->kernel->name(), "#id=", step_id_, "#");
+      },
+      profiler::GetTFTraceMeLevel(/*is_expensive=*/false));
 
   const Node* node = tagged_node.node;
   FrameState* input_frame = tagged_node.input_frame;
@@ -2221,11 +2218,28 @@
     mutex_lock l(mu_);
     if (status_.ok()) {
       abort_run = true;
-      status_ = s;
+
+      // If execution has been cancelled, mark any new errors as being derived.
+      // This ensures any errors triggered by cancellation are marked as
+      // derived.
+      if (cancellation_manager_ && cancellation_manager_->IsCancelled()) {
+        status_ = StatusGroup::MakeDerived(s);
+      } else {
+        status_ = s;
+      }
     }
   }
   if (abort_run) {
     TRACEPRINTF("StartAbort: %s", s.ToString().c_str());
+    if (cancellation_manager_) {
+      // Only log when the abort happens during the actual run time.
+      auto device_name = impl_->params_.device->name();
+      // Use VLOG instead of LOG(warning) because error status is expected when
+      // the executor is run under the grappler optimization phase or when
+      // iterating through a tf.data input pipeline.
+      VLOG(1) << "[" << device_name << "] Executor start aborting: " << s;
+    }
+
     if (rendezvous_) {
       rendezvous_->StartAbort(s);
     }
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 4be60c6..ff64201 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -17,6 +17,7 @@
 #define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -81,6 +82,9 @@
   //
   // RunAsync() dispatches closures to "runner". Typically, "runner"
   // is backed up by a bounded threadpool.
+  typedef std::function<Status(const int64, const DeviceMgr*, Rendezvous** r)>
+      RendezvousFactory;
+
   struct Args {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
@@ -135,6 +139,8 @@
   // when the executor is deleted.
   std::function<Status(const NodeDef&, OpKernel**)> create_kernel;
   std::function<void(OpKernel*)> delete_kernel;
+
+  Executor::RendezvousFactory rendezvous_factory;
 };
 ::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params,
                                       std::unique_ptr<const Graph> graph,
@@ -198,7 +204,7 @@
       if (--pending_ == 0) {
         CHECK(done_cb_ != nullptr);
         std::swap(done, done_cb_);
-        status = status_group_.as_status();
+        status = status_group_.as_summary_status();
       }
     }
 
@@ -210,6 +216,9 @@
 
     if (done != nullptr) {
       delete this;
+      if (!status.ok()) {
+        VLOG(1) << "ExecutorBarrier finished with bad status: " << status;
+      }
       done(status);
     }
   }
diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index c311b25..57019b0 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -13,11 +13,12 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/common_runtime/executor.h"
+
 #include <algorithm>
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
-#include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
@@ -68,10 +69,16 @@
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
     };
+    rendez_ = NewLocalRendezvous();
+    params.rendezvous_factory = [this](const int64, const DeviceMgr*,
+                                       Rendezvous** r) {
+      *r = rendez_;
+      rendez_->Ref();
+      return Status::OK();
+    };
     delete exec_;
     TF_CHECK_OK(NewLocalExecutor(params, std::move(graph), &exec_));
     runner_ = [this](std::function<void()> fn) { thread_pool_->Schedule(fn); };
-    rendez_ = NewLocalRendezvous();
   }
 
   Status Run(Rendezvous* rendez) {
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 1ab2441..822dad7 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -401,6 +401,7 @@
     Executor* exec = nullptr;
     FunctionLibraryRuntimeOverlay* overlay_flr = nullptr;
     string executor_type;
+    Executor::RendezvousFactory rendezvous_factory = nullptr;
 
     ~Item() {
       delete this->func_graph;
@@ -498,7 +499,6 @@
                       errors::Internal("No function library is provided."),
                       done);
     FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
     opts.rendezvous = ctx->rendezvous();
     opts.cancellation_manager = ctx->cancellation_manager();
     opts.step_container = ctx->step_container();
@@ -773,6 +773,11 @@
         item->overlay_flr =
             new FunctionLibraryRuntimeOverlay(this, options.lib_def);
       }
+      item->rendezvous_factory = [](const int64, const DeviceMgr* device_mgr,
+                                    Rendezvous** r) {
+        *r = new IntraProcessRendezvous(device_mgr);
+        return Status::OK();
+      };
       local_handle = next_handle_++;
       items_.emplace(local_handle, std::unique_ptr<Item>(item));
     }
@@ -924,6 +929,7 @@
   params.delete_kernel = [](OpKernel* kernel) {
     DeleteNonCachedKernel(kernel);
   };
+  params.rendezvous_factory = (*item)->rendezvous_factory;
   Graph* graph = g.get();
   std::unique_ptr<Executor> exec;
   TF_RETURN_IF_ERROR(NewExecutor(executor_type, params, std::move(g), &exec));
@@ -1501,6 +1507,7 @@
       "disable_inlining=", true_false(disable_inlining),
       ", ignore_noinline=", true_false(ignore_noinline),
       ", override_device=", true_false(ignore_noinline),
+      ", initialize_empty_device=", true_false(initialize_empty_device),
       ", keep_caller_node=", keep_caller_node_str(), ", output_control_src=",
       output_control_src == OutputControlSrc::kDataOutputs ? "DataOutputs"
                                                            : "ControlOutputs");
@@ -1698,16 +1705,21 @@
   std::vector<Node*> node_map(fbody->graph->num_node_ids());
   for (Node* n : fbody->graph->op_nodes()) {
     NodeDef ndef = n->def();
-    ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
-    if (options.override_device || ndef.device().empty()) {
+
+    if (options.override_device) {
       ndef.set_device(caller->def().device());
     }
-    for (auto& attr : *ndef.mutable_attr()) {
-      if (attr.first == "_class") {
-        attr.second.set_s(
-            strings::StrCat(caller->name(), "/", attr.second.s()));
-      }
+    if (options.initialize_empty_device && ndef.device().empty()) {
+      ndef.set_device(caller->def().device());
     }
+
+    // Add the function node name as a prefix:
+    //  1) to node name to avoid collisions
+    //  2) to frame name to avoid multiple LoopCond nodes in one frame
+    //  3) to colocation attribute
+    const string prefix = strings::StrCat(caller->name(), "/");
+    TF_RETURN_IF_ERROR(AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &ndef));
+
     Status added_node;
     Node* clone = g->AddNode(ndef, &added_node);
     if (options.override_device && !caller->assigned_device_name().empty()) {
@@ -2094,6 +2106,10 @@
 
   std::vector<Node*> node_map(src.num_node_ids());
 
+  // Copy just the fdef attributes (copy '_noinline' and other similar flags to
+  // the gradient function body).
+  *(gbody->fdef.mutable_attr()) = fbody_->fdef.attr();
+
   // Copy the nodes.
   node_map[src.source_node()->id()] = dst->source_node();
   node_map[src.sink_node()->id()] = dst->sink_node();
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index 450c974..3d071db 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -201,6 +201,13 @@
   // If 'true' function inlining will override explicitly specified devices
   // inside function body with the caller node device.
   bool override_device = false;
+  // If 'true' function inlining will fill an empty device annotation inside
+  // function body with the caller node device.
+  // TODO(ezhulenev): Remove this flag. This is mostly legacy-compatibility
+  // mode. We should never explicitly define devices when we inline multi-device
+  // functions. However we do that in 'lower_function_call_op.cc' and
+  // 'function_optimizer' for now.
+  bool initialize_empty_device = false;
   // Controls if we want to keep a node with the name as the function call node
   // in a graph after function inlining.
   KeepCallerNode keep_caller_node = KeepCallerNode::kDoNotKeep;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 2986152..d6e3b67 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -98,6 +98,11 @@
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
     };
+    params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr,
+                                   Rendezvous** r) {
+      *r = new IntraProcessRendezvous(device_mgr);
+      return Status::OK();
+    };
     Executor* exec;
     TF_CHECK_OK(NewLocalExecutor(params, std::move(g), &exec));
     exec_.reset(exec);
@@ -1439,6 +1444,58 @@
   }
 }
 
+TEST_F(FunctionLibraryRuntimeTest, Gradient_Select) {
+  FunctionDef my_select = FunctionDefHelper::Create(
+      "MySelect",
+      // Args: forwarded unchanged to both Select nodes below.
+      {"condition: bool", "t: float32", "e: float32"},
+      // Return values
+      {"z: float32"},
+      // Attrs: none.
+      {},
+      // Nodes: two identical Select nodes whose outputs feed one Add.
+      {
+          {{"select0"}, "Select", {"condition", "t", "e"}, {{"T", DT_FLOAT}}},
+          {{"select1"}, "Select", {"condition", "t", "e"}, {{"T", DT_FLOAT}}},
+          {{"add"},
+           "Add",
+           {"select0:output", "select1:output"},
+           {{"T", DT_FLOAT}}},
+      },
+      // Output mapping: z is the Add node's output.
+      {{"z", "add:z"}});
+  FunctionDef select_grad = FunctionDefHelper::Create(
+      "MySelectGrad",
+      // Args: MySelect's three inputs followed by dz.
+      {"condition: bool", "t:float32", "e: float32", "dz: float32"},
+      // Return values
+      {"dt: float32"},
+      // Attrs: none.
+      {},
+      // Nodes: a single SymbolicGradient node that differentiates MySelect.
+      {{
+          {"grad"},
+          "SymbolicGradient",
+          {"condition", "t", "e", "dz"},
+          {
+              {"f", FunctionDefHelper::FunctionRef("MySelect")},
+              {"Tin", DataTypeSlice({DT_BOOL, DT_FLOAT, DT_FLOAT, DT_FLOAT})},
+              {"Tout", DataTypeSlice({DT_BOOL, DT_FLOAT, DT_FLOAT})},
+          },
+      }},
+      // Output mapping: dt is output index 1 of the SymbolicGradient node.
+      {{"dt", "grad:output:1"}});
+  Init({my_select, select_grad});
+
+  auto condition = test::AsTensor<bool>({false});
+  auto t = test::AsTensor<float>({13.0});
+  auto e = test::AsTensor<float>({15.0});
+  auto dz = test::AsTensor<float>({1.0});
+  Tensor y;
+  TF_EXPECT_OK(InstantiateAndRun(flr0_, "MySelectGrad", {},
+                                 {condition, t, e, dz}, {&y}));
+}
+
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) {
   Init({});
   auto T = DT_FLOAT;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 2e44a37..75d21d8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 
@@ -23,6 +24,7 @@
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
@@ -152,18 +154,18 @@
   GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
-  float* first_ptr = a.Allocate<float>(1024);
+  float* first_ptr = TypedAllocator::Allocate<float>(&a, 1024, {});
   a.DeallocateRaw(first_ptr);
   CheckStats(&a, 1, 0, 4096, 4096);
   for (int i = 0; i < 1024; ++i) {
     // Allocate several buffers of different sizes, and then clean them
     // all up.  We should be able to repeat this endlessly without
     // causing fragmentation and growth.
-    float* t1 = a.Allocate<float>(1024);
+    float* t1 = TypedAllocator::Allocate<float>(&a, 1024, {});
 
-    int64* t2 = a.Allocate<int64>(1048576);
-    double* t3 = a.Allocate<double>(2048);
-    float* t4 = a.Allocate<float>(10485760);
+    int64* t2 = TypedAllocator::Allocate<int64>(&a, 1048576, {});
+    double* t3 = TypedAllocator::Allocate<double>(&a, 2048, {});
+    float* t4 = TypedAllocator::Allocate<float>(&a, 10485760, {});
 
     a.DeallocateRaw(t1);
     a.DeallocateRaw(t2);
@@ -178,7 +180,7 @@
   // At the end, we should have coalesced all memory into one region
   // starting at the beginning, so validate that allocating a pointer
   // starts from this region.
-  float* first_ptr_after = a.Allocate<float>(1024);
+  float* first_ptr_after = TypedAllocator::Allocate<float>(&a, 1024, {});
   EXPECT_EQ(first_ptr, first_ptr_after);
   a.DeallocateRaw(first_ptr_after);
 }
@@ -189,7 +191,7 @@
       GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
       platform_gpu_id, false /*use_unified_memory*/, {}, {});
   GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
-  float* ptr = a.Allocate<float>(0);
+  float* ptr = TypedAllocator::Allocate<float>(&a, 0, {});
   EXPECT_EQ(nullptr, ptr);
 }
 
@@ -208,7 +210,7 @@
       GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
       platform_gpu_id, false /*use_unified_memory*/, {}, {});
   GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
-  float* t1 = a.Allocate<float>(1);
+  float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
@@ -222,8 +224,8 @@
   // Configure a 1MiB byte limit
   GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
 
-  float* first_ptr = a.Allocate<float>(1 << 6);
-  float* second_ptr = a.Allocate<float>(1 << 20);
+  float* first_ptr = TypedAllocator::Allocate<float>(&a, 1 << 6, {});
+  float* second_ptr = TypedAllocator::Allocate<float>(&a, 1 << 20, {});
 
   EXPECT_NE(nullptr, first_ptr);
   EXPECT_EQ(nullptr, second_ptr);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index a0728c0..28a2465 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 
@@ -24,6 +25,7 @@
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -46,7 +48,8 @@
   for (int s : {8}) {
     std::vector<int64> cpu_array(s);
     memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
-    int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+    int64* gpu_array =
+        TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {});
     se::DeviceMemory<int64> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
     ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
                                                s * sizeof(int64)));
@@ -73,7 +76,8 @@
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
-          int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+          int64* gpu_array =
+              TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {});
 
           se::DeviceMemory<int64> gpu_array_ptr{
               se::DeviceMemoryBase{gpu_array}};
@@ -109,7 +113,8 @@
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
-          int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+          int64* gpu_array =
+              TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {});
 
           se::DeviceMemory<int64> gpu_array_ptr{
               se::DeviceMemoryBase{gpu_array}};
@@ -144,7 +149,7 @@
   std::vector<float> cpu_array_result(1024);
 
   // Allocate 1024 floats
-  float* gpu_array = a.Allocate<float>(cpu_array.size());
+  float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {});
   se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
@@ -191,7 +196,7 @@
   std::vector<float> cpu_array_result(1024);
 
   // Allocate 1024 floats
-  float* gpu_array = a.Allocate<float>(cpu_array.size());
+  float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {});
   se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
@@ -240,7 +245,7 @@
       new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             platform_gpu_id),
       platform_gpu_id);
-  float* t1 = a.Allocate<float>(1);
+  float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index c721675..a54dfad 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -15,7 +15,8 @@
 
 // TODO(opensource): Use a more generic sounding preprocessor name than
 // GOOGLE_CUDA
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #if TENSORFLOW_USE_ROCM
 #include "rocm/include/hip/hip_runtime.h"
@@ -23,10 +24,9 @@
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
-
 #include <stdlib.h>
 #include <string.h>
+
 #include <algorithm>
 #include <list>
 #include <map>
@@ -35,6 +35,7 @@
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
@@ -69,6 +70,7 @@
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
@@ -311,24 +313,6 @@
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   GPUProcessState::singleton()->EnableGPUDevice();
-  pending_cap_ = options.config.gpu_options().experimental().pending_cap();
-  timestamped_allocator_ =
-      options.config.gpu_options().experimental().timestamped_allocator();
-  if (timestamped_allocator_ || pending_cap_ > 0) {
-    SharedCounter* timing_counter = nullptr;
-    if (timestamped_allocator_) {
-      // In this case the SharedCounter was already created and set in the
-      // associated Allocator, with ownership by GPUProcessState.
-      // The GPUKernelTracker will use this SharedCounter, instead of
-      // owning its own.
-      timing_counter =
-          GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
-      DCHECK(timing_counter);
-    } else {
-      DCHECK_GT(pending_cap_, 0);
-    }
-    kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
-  }
 }
 
 BaseGPUDevice::~BaseGPUDevice() {
@@ -377,7 +361,6 @@
   }
 
   executor_ = executor_status.ValueOrDie();
-  em_.reset(new EventMgr(executor_, options.config.gpu_options()));
 
   if (max_streams_ < 1) {
     return errors::InvalidArgument("Invalid value for max_streams.");
@@ -391,10 +374,44 @@
         i, streams_.back()->compute, streams_.back()->host_to_device,
         streams_.back()->device_to_host, streams_.back()->device_to_device));
   }
+
+  em_ = EventMgrFactory::Singleton()->GetEventMgr(executor_,
+                                                  options.config.gpu_options());
+
+  GPUKernelTracker::Params tracker_params(
+      options.config.gpu_options().experimental().kernel_tracker_max_interval(),
+      options.config.gpu_options().experimental().kernel_tracker_max_bytes(),
+      options.config.gpu_options().experimental().kernel_tracker_max_pending());
+  timestamped_allocator_ =
+      options.config.gpu_options().experimental().timestamped_allocator();
+  pending_cap_ = tracker_params.max_pending;
+  if (timestamped_allocator_ ||
+      (tracker_params.max_interval > 0 || tracker_params.max_bytes > 0 ||
+       tracker_params.max_pending > 0)) {
+    if (max_streams_ > 1) {
+      LOG(FATAL) << "max_streams > 1 was specified together with "
+                    "timestamped_allocator and/or kernel tracking.  This is an "
+                    "unsupported combination.";
+    }
+    SharedCounter* timing_counter = nullptr;
+    if (timestamped_allocator_) {
+      // In this case the SharedCounter was already created and set in the
+      // associated Allocator, with ownership by GPUProcessState.
+      // The GPUKernelTracker will use this SharedCounter, instead of
+      // owning its own.
+      timing_counter =
+          GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id_);
+      DCHECK(timing_counter);
+    }
+    kernel_tracker_.reset(new GPUKernelTracker(
+        tracker_params, Env::Default(), streams_[0]->compute, timing_counter,
+        timestamped_allocator_ ? gpu_allocator_ : nullptr, em_));
+  }
+
   gpu_device_info_ = new GpuDeviceInfo;
   gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
-  gpu_device_info_->event_mgr = em_.get();
+  gpu_device_info_->event_mgr = em_;
   PlatformGpuId platform_gpu_id;
   TF_RETURN_IF_ERROR(
       GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
@@ -567,9 +584,11 @@
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
-  if (pending_cap_ > 0) {
-    DCHECK(kernel_tracker_);
-    kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
+  if (kernel_tracker_.get()) {
+    context->set_record_memory_consumption(true);
+    if (pending_cap_ > 0) {
+      kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
+    }
   }
   ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
@@ -591,10 +610,12 @@
     if (kernel_tracker_) {
       GPUKernelTracker* tracker = kernel_tracker_.get();
       DCHECK(tracker);
-      uint64 queued_count = tracker->RecordQueued();
-      em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
-        tracker->RecordTerminated(queued_count);
-      });
+      uint64 queued_count = tracker->MaybeQueue(context);
+      if (queued_count > 0) {
+        em_->ThenExecute(stream, [tracker, queued_count]() {
+          tracker->RecordTerminated(queued_count);
+        });
+      }
     }
   } else {
     if (vlog_1) {
@@ -635,8 +656,12 @@
 
   // When Xprof profiling is off (which is the default), constructing the
   // activity is simple enough that its overhead is negligible.
-  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
-                                   op_kernel->IsExpensive());
+  profiler::TraceMe activity(
+      [&] {
+        return strings::StrCat(op_kernel->name(), ":",
+                               op_kernel->type_string());
+      },
+      profiler::GetTFTraceMeLevel(op_kernel->IsExpensive()));
   ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->ComputeAsync(context, done);
 }
@@ -655,8 +680,17 @@
       done(err);
       return err;
     }
-    auto* copy =
-        new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());
+    AllocationAttributes allocation_attr;
+    uint64 safe_alloc_frontier = 0;
+    std::function<uint64()> freed_by_func = [this, &safe_alloc_frontier]() {
+      safe_alloc_frontier = SafeAllocFrontier(safe_alloc_frontier);
+      return safe_alloc_frontier;
+    };
+    if (timestamped_allocator_) {
+      allocation_attr.freed_by_func = &freed_by_func;
+    }
+    auto* copy = new Tensor(GetAllocator(alloc_attrs), from.dtype(),
+                            from.shape(), allocation_attr);
 
     // If the tensor is not initialized, we likely ran out of memory.
     if (!copy->IsInitialized()) {
@@ -681,8 +715,9 @@
         std::move(done), std::placeholders::_1);
 
     tracing::ScopedAnnotation annotation("MakeTensorFromProto");
-    device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
-                                               std::move(wrapped_done));
+    device_contexts_[0]->CopyCPUTensorToDevice(
+        &from, this, copy, std::move(wrapped_done),
+        !timestamped_allocator_ /*sync_dst_compute*/);
     return Status::OK();
   }
 }
@@ -888,8 +923,8 @@
 #endif
 
 #if defined(ANDROID_TEGRA)
-  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
-  // and Video RAM
+  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for
+  // RAM and Video RAM
   min_system_memory = 1 << 30;
 #endif
   return min_system_memory;
@@ -1042,11 +1077,30 @@
   std::vector<PlatformGpuId> valid_platform_gpu_ids;
   // If we aren't going to use any GPUs, don't initialize them.
   // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
-  // because it treats an empty gpu_options.visible_device_list as 'all GPUs are
-  // visible'.
+  // because it treats an empty gpu_options.visible_device_list as 'all GPUs
+  // are visible'.
   if (num_gpus_to_use > 0) {
     TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
                                               &visible_gpu_order));
+    bool new_gpu_found = false;
+    for (int i = 0; i < visible_gpu_order.size(); ++i) {
+      int visible_gpu_id = visible_gpu_order[i].value();
+
+      // Only perform this once per visible gpu id.
+      if (visible_gpu_initialized_[visible_gpu_id]) {
+        continue;
+      }
+
+      visible_gpu_initialized_[visible_gpu_id] = true;
+      new_gpu_found = true;
+    }
+
+    // Check peering and show the matrix if more than one GPU is found.
+    if (new_gpu_found && visible_gpu_order.size() > 1) {
+      // Enable peer access
+      TF_RETURN_IF_ERROR(EnablePeerAccess(visible_gpu_order));
+    }
+
     TF_RETURN_IF_ERROR(
         GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
   }
@@ -1212,8 +1266,9 @@
     cc_minor = 0;
   }
   // LINT.IfChange
-  return strings::StrCat("device: ", platform_gpu_id.value(), ", name: ",
-                         desc.name(), ", pci bus id: ", desc.pci_bus_id(),
+  return strings::StrCat("device: ", platform_gpu_id.value(),
+                         ", name: ", desc.name(),
+                         ", pci bus id: ", desc.pci_bus_id(),
                          ", compute capability: ", cc_major, ".", cc_minor);
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
 #elif TENSORFLOW_USE_ROCM
@@ -1236,9 +1291,12 @@
       GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
   int numa_node = dev_locality.numa_node();
 
-  se::StreamExecutor* se =
-      GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
-  const se::DeviceDescription& desc = se->GetDeviceDescription();
+  se::Platform* gpu_manager = GPUMachineManager();
+  auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
+  if (!desc_status.ok()) {
+    return desc_status.status();
+  }
+  auto desc = desc_status.ConsumeValueOrDie();
   GPUProcessState* process_state = GPUProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
       options.config.gpu_options(), tf_gpu_id, memory_limit);
@@ -1251,21 +1309,22 @@
   if (!stats) {
     return errors::Internal("No allocator statistics");
   }
-  // 'memory_limit' is the required memory size, but if the allocator with given
-  // tf_gpu_id was created before, we'll use it instead of creating a new one
-  // (as TF gpu device is a shared resource), in which case the actual memory
-  // limit represented by 'stats.bytes_limit' used by that allocator may be
-  // different (which should be an error).
+  // 'memory_limit' is the required memory size, but if the allocator with
+  // given tf_gpu_id was created before, we'll use it instead of creating a
+  // new one (as TF gpu device is a shared resource), in which case the actual
+  // memory limit represented by 'stats.bytes_limit' used by that allocator
+  // may be different (which should be an error).
   //
-  // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
+  // TODO(laigd): report error if memory_limit doesn't match
+  // stats->bytes_limit.
   int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
       options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
-      tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
+      tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, *desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
             << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
-            << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
+            << GetShortDeviceDescription(platform_gpu_id, *desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(std::move(gpu_device));
 
@@ -1331,10 +1390,14 @@
     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
-    se::StreamExecutor* se =
-        GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
-    const se::DeviceDescription& desc = se->GetDeviceDescription();
-    int numa_node = desc.numa_node();
+    se::Platform* gpu_manager = GPUMachineManager();
+    auto desc_status =
+        gpu_manager->DescriptionForDevice(platform_gpu_id.value());
+    if (!desc_status.ok()) {
+      return desc_status.status();
+    }
+    auto desc = desc_status.ConsumeValueOrDie();
+    int numa_node = desc->numa_node();
     if (numa_node < 0) {
       // For some reason the StreamExecutor couldn't get the NUMA
       // affinity of the GPU.  If this is not a multi-socket mobo with
@@ -1387,7 +1450,7 @@
     (*localities)[tf_gpu_id] = dev_locality;
     VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
             << tf_gpu_id << " on bus " << dev_locality.bus_id()
-            << " numa: " << numa_node << " pci: " << desc.pci_bus_id()
+            << " numa: " << numa_node << " pci: " << desc->pci_bus_id()
             << " DeviceLocality: " << dev_locality.DebugString();
   }
   return Status::OK();
@@ -1401,15 +1464,14 @@
   // Find the highest multi-processor count across all visible GPUs.
   int max_count = -1;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    auto exec_status =
-        GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_order[i]);
-    if (!exec_status.ok()) {
+    int visible_gpu_id = visible_gpu_order[i].value();
+    auto description_status = gpu_manager->DescriptionForDevice(visible_gpu_id);
+    if (!description_status.ok()) {
       continue;
     }
 
-    se::StreamExecutor* se = exec_status.ValueOrDie();
-    const se::DeviceDescription& desc = se->GetDeviceDescription();
-    max_count = std::max(max_count, desc.core_count());
+    auto description = description_status.ConsumeValueOrDie();
+    max_count = std::max(max_count, description->core_count());
   }
 
   if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
@@ -1508,8 +1570,11 @@
 }
 #endif  // TENSORFLOW_USE_ROCM
 
-Status EnablePeerAccess(se::Platform* platform,
-                        const std::vector<PlatformGpuId>& visible_gpu_order) {
+}  // namespace
+
+Status BaseGPUDeviceFactory::EnablePeerAccess(
+    const std::vector<PlatformGpuId>& visible_gpu_order) {
+  se::Platform* gpu_manager = GPUMachineManager();
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
@@ -1518,10 +1583,10 @@
       const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
       // We have already validated that ExecutorForDevice() calls return OK.
       se::StreamExecutor* from =
-          GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i)
+          GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, platform_gpu_i)
               .ValueOrDie();
       se::StreamExecutor* to =
-          GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j)
+          GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, platform_gpu_j)
               .ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
@@ -1551,75 +1616,44 @@
   return Status::OK();
 }
 
-}  // namespace
-
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
     const std::vector<PlatformGpuId>& visible_gpu_order,
     std::vector<PlatformGpuId>* ids) {
   se::Platform* gpu_manager = GPUMachineManager();
-  bool new_gpu_found = false;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
-
-    // Only perform this once per visible platform gpu id.
-    if (visible_gpu_initialized_[visible_gpu_id.value()]) {
-      continue;
+    int visible_gpu_id = visible_gpu_order[i].value();
+    auto description_status = gpu_manager->DescriptionForDevice(visible_gpu_id);
+    if (!description_status.ok()) {
+      return description_status.status();
     }
 
-    visible_gpu_initialized_[visible_gpu_id.value()] = true;
-    new_gpu_found = true;
-
-    auto executor =
-        GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
-    if (!executor.ok()) {
-      return executor.status();
-    }
-
-    auto stream_exec = executor.ValueOrDie();
-    int64 free_bytes;
-    int64 total_bytes;
-    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
-      // Logs internally on failure.
-      free_bytes = 0;
-      total_bytes = 0;
-    }
-    const auto& description = stream_exec->GetDeviceDescription();
+    auto description = description_status.ConsumeValueOrDie();
 #if GOOGLE_CUDA
     int cc_major;
     int cc_minor;
-    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
+    if (!description->cuda_compute_capability(&cc_major, &cc_minor)) {
       // Logs internally on failure.
       cc_major = 0;
       cc_minor = 0;
     }
     LOG(INFO) << "Found device " << i << " with properties: "
-              << "\nname: " << description.name() << " major: " << cc_major
+              << "\nname: " << description->name() << " major: " << cc_major
               << " minor: " << cc_minor
-              << " memoryClockRate(GHz): " << description.clock_rate_ghz()
-              << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
-              << strings::HumanReadableNumBytes(total_bytes)
-              << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
+              << " memoryClockRate(GHz): " << description->clock_rate_ghz()
+              << "\npciBusID: " << description->pci_bus_id();
 #elif TENSORFLOW_USE_ROCM
     int isa_version;
-    if (!description.rocm_amdgpu_isa_version(&isa_version)) {
+    if (!description->rocm_amdgpu_isa_version(&isa_version)) {
       // Logs internally on failure.
       isa_version = 0;
     }
     LOG(INFO) << "Found device " << i << " with properties: "
-              << "\nname: " << description.name() << "\nAMDGPU ISA: gfx"
+              << "\nname: " << description->name() << "\nAMDGPU ISA: gfx"
               << isa_version << "\nmemoryClockRate (GHz) "
-              << description.clock_rate_ghz() << "\npciBusID "
-              << description.pci_bus_id() << "\nTotal memory: "
-              << strings::HumanReadableNumBytes(total_bytes)
-              << "\nFree memory: "
-              << strings::HumanReadableNumBytes(free_bytes);
+              << description->clock_rate_ghz() << "\npciBusID "
+              << description->pci_bus_id();
 #endif
   }
-  // Checking peering and shows matrix if more than one gpu found.
-  if (new_gpu_found && visible_gpu_order.size() > 1) {
-    // Enable peer access
-    TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
-  }
 
 #if GOOGLE_CUDA
   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
@@ -1645,23 +1679,23 @@
   // Filter out devices that don't have the right capability or power.
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
     const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
-    auto exec_status =
-        GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
-    if (!exec_status.ok()) {
+    auto description_status =
+        gpu_manager->DescriptionForDevice(visible_gpu_id.value());
+    if (!description_status.ok()) {
       LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
                 << " whose executor is in invalid state: "
-                << exec_status.status().ToString();
+                << description_status.status().ToString();
       continue;
     }
-    se::StreamExecutor* se = exec_status.ValueOrDie();
-    const se::DeviceDescription& desc = se->GetDeviceDescription();
+
+    auto desc = description_status.ConsumeValueOrDie();
 
 #if GOOGLE_CUDA
     CudaVersion device_capability;
-    if (!desc.cuda_compute_capability(&device_capability.major_part,
-                                      &device_capability.minor_part)) {
+    if (!desc->cuda_compute_capability(&device_capability.major_part,
+                                       &device_capability.minor_part)) {
       LOG(INFO) << "Ignoring visible gpu device "
-                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << "(" << GetShortDeviceDescription(visible_gpu_id, *desc)
                 << ") "
                 << "whose CUDA compute capability is not available.";
       continue;
@@ -1670,7 +1704,7 @@
     // accepted.
     if (device_capability < min_supported_capability) {
       LOG(INFO) << "Ignoring visible gpu device "
-                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << "(" << GetShortDeviceDescription(visible_gpu_id, *desc)
                 << ") "
                 << "with Cuda compute capability " << device_capability
                 << ". The minimum required Cuda capability is "
@@ -1679,14 +1713,14 @@
     }
 #elif TENSORFLOW_USE_ROCM
     int device_isa;
-    if (!desc.rocm_amdgpu_isa_version(&device_isa)) {
+    if (!desc->rocm_amdgpu_isa_version(&device_isa)) {
       continue;
     }
     // Only GPUs with no less than the minimum supported compute capability is
     // accepted.
     if (device_isa < min_supported_isa) {
       LOG(INFO) << "Ignoring visible gpu device "
-                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << "(" << GetShortDeviceDescription(visible_gpu_id, *desc)
                 << ") "
                 << "with AMDGPU ISA gfx" << device_isa
                 << ". The minimum required AMDGPU ISA is gfx"
@@ -1699,11 +1733,11 @@
     // count than the fastest GPU are filtered out, unless they have 8 or more
     // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
     // variable is set, its value will be used to filter out GPUs.
-    if (desc.core_count() < min_gpu_core_count) {
+    if (desc->core_count() < min_gpu_core_count) {
       LOG(INFO) << "Ignoring visible gpu device "
-                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << "(" << GetShortDeviceDescription(visible_gpu_id, *desc)
                 << ") "
-                << "with core count: " << desc.core_count()
+                << "with core count: " << desc->core_count()
                 << ". The minimum required count is " << min_gpu_core_count
                 << ". You can adjust this requirement with the env var "
                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
@@ -1722,9 +1756,9 @@
   return Status::OK();
 }
 
-uint64 BaseGPUDevice::SafeAllocFrontier() {
+uint64 BaseGPUDevice::SafeAllocFrontier(uint64 old_value) {
   if (timestamped_allocator_) {
-    return kernel_tracker_->LastTerminatedCount();
+    return kernel_tracker_->LastTerminatedCount(old_value);
   } else {
     return 0;
   }
@@ -1737,19 +1771,50 @@
   return 0;
 }
 
-uint64 GPUKernelTracker::RecordQueued() {
+uint64 GPUKernelTracker::MaybeQueue(OpKernelContext* ctx) {
   mutex_lock l(mu_);
+  ++ops_since_last_;
+  int64 mem_used =
+      ctx->persistent_memory_allocated() + ctx->temp_memory_allocated();
+  VLOG(2) << "kernel: " << ctx->op_kernel().name() << " mem_used: " << mem_used;
+  mem_since_last_ += mem_used;
+  int weight = 1;
+  // Note that if all {max_bytes, max_interval, max_pending} are zero then
+  // we track every single kernel with no pending cap.  This can happen
+  // if timestamped_allocator alone was specified.
+  if ((mem_since_last_ < params_.max_bytes) &&
+      (ops_since_last_ < params_.max_interval)) {
+    return 0;
+  } else {
+    weight = std::min(
+        params_.max_pending,
+        std::max(1, mem_since_last_ / std::max(16386, params_.max_bytes)));
+    mem_since_last_ = 0;
+    ops_since_last_ = 0;
+  }
   uint64 queued_count = timing_counter_->next();
+  RecordQueued(queued_count, weight);
+  return queued_count;
+}
+
+void GPUKernelTracker::RecordQueued(uint64 queued_count, int weight) {
   VLOG(2) << "RecordQueued queued_count=" << queued_count
           << " first_available_=" << first_available_
           << " last_completed_=" << last_completed_
           << " num_pending_=" << num_pending_;
   pending_kernels_[first_available_].queued_count = queued_count;
+  pending_kernels_[first_available_].weight = weight;
   pending_kernels_[first_available_].terminated = false;
   ++first_available_;
-  ++num_pending_;
+  num_pending_ += weight;
   if (first_available_ >= pending_kernels_.size()) {
-    first_available_ = 0;
+    if (last_completed_ >= 0) {
+      // wrap
+      first_available_ = 0;
+    } else {
+      // enlarge the ring buffer
+      pending_kernels_.resize(2 * pending_kernels_.size());
+    }
   }
   if (first_available_ == last_completed_) {
     // Ring buffer is full: double it.  All of the same valid PendingKernel
@@ -1768,12 +1833,30 @@
             << " num_pending_=" << num_pending_;
   }
   DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
-  return queued_count;
+}
+
+// Called by LastTerminatedCount() when new_value is equal to old_value.  This
+// case can occur where an allocation failed and waited for memory to be freed,
+// then when it retried the safe allocation frontier had not advanced because no
+// tracking event had matured.  Maybe GPU progress has stalled waiting on an i/o
+// event, or maybe we're tracking at too infrequent an interval.  In any case if
+// the GPU compute queue is actually empty it's safe to advance the safe
+// frontier so that this request can allocate from unrestricted (and better
+// compacted) memory.  So queue an event on the compute stream to ensure the
+// frontier does advance.
+void GPUKernelTracker::MaybeQueueProgressEvent() {
+  mutex_lock l(mu_);
+  if (num_pending_ == 0) {
+    uint64 new_count = timing_counter_->next();
+    RecordQueued(new_count, 1);
+    em_->ThenExecute(stream_,
+                     [this, new_count]() { RecordTerminated(new_count); });
+  }
 }
 
 void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
   mutex_lock l(mu_);
-  VLOG(2) << "RecordTerminated queued_count=" << queued_count
+  VLOG(2) << this << " RecordTerminated queued_count=" << queued_count
           << " first_available_=" << first_available_
           << " last_completed_=" << last_completed_
           << " num_pending_=" << num_pending_ << " LC="
@@ -1785,26 +1868,31 @@
   // Starting just past the last completed entry, find the entry with
   // this queued_count and mark it done.
   int index = (last_completed_ + 1) % pending_kernels_.size();
+  int weight = 1;
   while (true) {
     if (index == first_available_) {
       // This should never happen.
       LOG(FATAL) << "Failed to find " << queued_count  // Crash OK
-                 << " in queue";
+                 << " in queue, last_completed_=" << last_completed_
+                 << " index=" << index
+                 << " first_available_=" << first_available_
+                 << " pending_kernels_.size()=" << pending_kernels_.size();
     }
     if (pending_kernels_[index].queued_count == queued_count) {
       pending_kernels_[index].terminated = true;
+      weight = pending_kernels_[index].weight;
       break;
     }
     index = (index + 1) % pending_kernels_.size();
   }
   // Next move last_completed_ forward past all completed kernels.  In theory
   // kernels should always complete in queued order so we should be able to
-  // advance the completed frontier to the last queued PendingKernel.  In
-  // practice we occassionally see the termination callbacks arrive out of order
-  // probably because of thread scheduling.  Eventually we may support out-of-
-  // order completion involving multple compute streams so here we follow a
-  // conservative approach and wait for every single callback to arrive before
-  // advancing the frontier.
+  // advance the completed frontier to the just-completed PendingKernel.  In
+  // practice we occasionally see the termination callbacks arrive out of
+  // order probably because of thread scheduling.  Eventually we may support
+  // out-of-order completion involving multiple compute streams so here we
+  // follow a conservative approach and wait for every single callback to
+  // arrive before advancing the frontier.
   while (true) {
     int next_index = (last_completed_ + 1) % pending_kernels_.size();
     if (next_index == first_available_) break;
@@ -1814,21 +1902,16 @@
       break;
     }
   }
-  // Last decrease num_pending before maybe waking a waiter.
-  --num_pending_;
-  pending_decreased_.notify_one();
-}
-
-uint64 GPUKernelTracker::LastTerminatedCount() {
-  mutex_lock l(mu_);
-  if (last_completed_ < 0) {
-    // This is an edge case that can be encountered only at the beginning of
-    // execution.  There's not yet a safe threshold count. We don't want to
-    // return 0 since that bypasses the count mechanism in BFCAllocator, so
-    // return the least non-zero value.
-    return 1;
+  if (last_completed_ >= 0) {
+    int64 v = pending_kernels_[last_completed_].queued_count;
+    last_terminated_count_ = v;
+    if (allocator_) {
+      allocator_->SetSafeFrontier(v);
+    }
   }
-  return pending_kernels_[last_completed_].queued_count;
+  // Last decrease num_pending before maybe waking a waiter.
+  num_pending_ -= weight;
+  pending_decreased_.notify_all();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b210613..4a62512 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -125,7 +125,7 @@
 
   // If returned value is > 0 then GPU Memory chunks freed before this count
   // are guaranteed not to be in use by any kernel pending on this device.
-  uint64 SafeAllocFrontier() override;
+  uint64 SafeAllocFrontier(uint64 old_value) override;
 
   // Returns the number of kernels that have been queued for execution on
   // the compute stream and are not yet known to have completed.
@@ -157,10 +157,10 @@
   TfGpuId tf_gpu_id_;
   const bool sync_every_op_ = false;
   const int32 max_streams_;
-  std::unique_ptr<EventMgr> em_;
+  EventMgr* em_ = nullptr;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
   std::unique_ptr<GPUKernelTracker> kernel_tracker_;
-  int pending_cap_ = 0;
+  int32 pending_cap_ = 0;
   bool timestamped_allocator_ = false;
 
   // Initialize scractch buffers used by Eigen.
@@ -185,15 +185,43 @@
 };
 
 // A per-compute-stream utility that keeps track of kernels that have been
-// queued for execution but may not yet have terminated, and also the queued
+// queued for execution but may not yet have terminated and also the queued
 // time of the most recently terminated kernel.
 class GPUKernelTracker {
  public:
+  // Controls the strategy for inserting tracking events after GPU kernels.
+  //   If max_interval >= 0, then insert an event after this many kernels
+  //     if an event has not been inserted for another reason.
+  //   If max_bytes > 0, then insert an event after kernels allocating this
+  //     many bytes have been queued since the last event.
+  //   If max_pending > 0, then track up to this many events at once.  If
+  //     this limit is reached the GPU::Compute() method will delay starting
+  //     additional ops until some event completes.  If 0 and one of the other
+  //     fields is non-zero, then a reasonable default will be selected.
+  struct Params {
+    int max_interval = 0;
+    int max_bytes = 0;
+    int max_pending = 0;
+    Params(int mi, int mb, int mp)
+        : max_interval(mi), max_bytes(mb), max_pending(mp) {}
+  };
+
   // If we're going to share a SharedCounter with an allocator, it's owned
   // by the allocator because allocators are initialized once per process.
   // Devices are per-session.
-  explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
-      : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
+  explicit GPUKernelTracker(const Params& params, Env* env,
+                            se::Stream* compute_stream,
+                            SharedCounter* timing_counter, Allocator* allocator,
+                            EventMgr* event_manager)
+      : params_(params),
+        env_(env),
+        stream_(compute_stream),
+        timing_counter_(timing_counter),
+        allocator_(allocator),
+        em_(event_manager),
+        pending_kernels_(
+            params.max_pending > 0 ? std::max(8, 2 * params.max_pending) : 64) {
+    mem_since_last_ = 0;
     if (!timing_counter_) {
       // There's not a preexisting counter owned by GPUProcessState, i.e.
       // pending_cap > 0 but timestamped_allocator == false.
@@ -202,19 +230,33 @@
     }
   }
 
+  // Determine whether a GPU kernel should have a recording event queued
+  // immediately afterwards.  If so, advance the counter and return the new
+  // counter value after enqueuing.
+  uint64 MaybeQueue(OpKernelContext* ctx);
+
   // Record that a GPU kernel has just been enqueued on the compute stream.
-  // Inserts a new timing counter value in a new PendingKernel record appended
+  // Inserts the supplied counter value in a new PendingKernel record appended
   // to the end of the ring buffer then returns that same count.
-  uint64 RecordQueued();
+  // Caller is responsible for ensuring that RecordTerminated() is eventually
+  // called with the same counter value.
+  void RecordQueued(uint64 queued_count, int weight)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Takes a count value returned by RecordQueued and finds the corresponding
   // PendingKernel record in the ring buffer.  Marks the kernel as completed and
   // advances the completion frontier accordingly.
-  void RecordTerminated(uint64 at_count);
+  void RecordTerminated(uint64 queued_count);
 
   // Returns the largest timing count such that all kernels queued no
   // later than that count are known to have terminated.
-  uint64 LastTerminatedCount();
+  inline uint64 LastTerminatedCount(uint64 old_value) {
+    uint64 new_value = last_terminated_count_.load(std::memory_order_relaxed);
+    if (new_value == old_value) {
+      MaybeQueueProgressEvent();
+    }
+    return new_value;
+  }
 
   // Returns the number of kernels enqueued that are not yet known to
   // have terminated.
@@ -225,28 +267,42 @@
 
   // Yield current thread until number of pending kernels no longer
   // exceeds the cap.
-  void PauseWhilePendingExceeds(int cap) {
+  void PauseWhilePendingExceeds(int cap) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
     while (num_pending_ > cap) {
+      VLOG(1) << "num_pending_=" << num_pending_ << " cap=" << cap;
       pending_decreased_.wait(l);
     }
   }
 
  private:
+  friend class GPUKernelTrackerTest;
+  Params params_;
   Env* env_;
+  se::Stream* stream_;
   SharedCounter* timing_counter_;
   std::unique_ptr<SharedCounter> owned_counter_;
+  Allocator* allocator_ = nullptr;
+  EventMgr* em_ = nullptr;
+  std::atomic<uint64> last_terminated_count_ = {1};
+
+  void MaybeQueueProgressEvent();
 
   // Records when a kernel was queued for execution.  Kernel launches are
   // identified by a unique count value from a per-GPU device timing counter.
   struct PendingKernel {
     uint64 queued_count;
+    int weight;
     bool terminated;
     PendingKernel(const PendingKernel& pk)
-        : queued_count(pk.queued_count), terminated(pk.terminated) {}
-    PendingKernel() : queued_count(0), terminated(false) {}
+        : queued_count(pk.queued_count),
+          weight(pk.weight),
+          terminated(pk.terminated) {}
+    PendingKernel() : queued_count(0), weight(0), terminated(false) {}
   };
   mutex mu_;
+  int32 mem_since_last_ GUARDED_BY(mu_);
+  int32 ops_since_last_ GUARDED_BY(mu_) = 0;  // Ops since last event.
   // Ring buffer of PendingKernel records.
   std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
   // Next unused slot in pending_kernels_.
@@ -254,9 +310,9 @@
   // Last completed PendingKernel such that all prior PendingKernels are
   // also completed.  With out-of-order completion there may be a mixture
   // of completed and uncompleted entries between last_completed_ and
-  // first_available_, hence num_pending_ is not guaranteed equal to
-  // their differerence.
+  // first_available_.
   int last_completed_ GUARDED_BY(mu_) = -1;
+  // Sum of weights of the outstanding events marking tracked kernels.
   int num_pending_ GUARDED_BY(mu_) = 0;
   condition_variable pending_decreased_ GUARDED_BY(mu_);
 };
@@ -314,6 +370,8 @@
       const string& physical_device_desc, Allocator* gpu_allocator,
       Allocator* cpu_allocator) = 0;
 
+  Status EnablePeerAccess(const std::vector<PlatformGpuId>& visible_gpu_order);
+
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
   // based upon 'visible_gpu_order' which was generated by parsing
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 72ff4cc..e6b2520 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 5e184c8..3741f78 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -348,27 +348,36 @@
 
 class GPUKernelTrackerTest : public ::testing::Test {
  protected:
-  void SetUp() {
+  void Init(const GPUKernelTracker::Params& params) {
     timing_counter_.reset(new SharedCounter);
-    kernel_tracker_.reset(
-        new GPUKernelTracker(Env::Default(), timing_counter_.get()));
+    kernel_tracker_.reset(new GPUKernelTracker(params, Env::Default(), nullptr,
+                                               timing_counter_.get(), nullptr,
+                                               nullptr));
+  }
+
+  void RecordQueued(uint64 v) {
+    mutex_lock l(kernel_tracker_->mu_);
+    kernel_tracker_->RecordQueued(v, 1);
   }
 
   std::unique_ptr<GPUKernelTracker> kernel_tracker_;
   std::unique_ptr<SharedCounter> timing_counter_;
 };
 
-TEST_F(GPUKernelTrackerTest, basic) {
+TEST_F(GPUKernelTrackerTest, CappingOnly) {
+  Init({0 /*max_interval*/, 0 /*max_bytes*/, 32 /*max_pending*/});
   EXPECT_EQ(0, kernel_tracker_->NumPending());
   // 1 is the expected value when no kernels have yet terminated.
-  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount(0));
 
   std::deque<int64> queued_counts;
   for (int i = 0; i < 32; ++i) {
-    queued_counts.push_back(kernel_tracker_->RecordQueued());
+    uint64 queued_count = timing_counter_->next();
+    queued_counts.push_back(queued_count);
+    RecordQueued(queued_count);
   }
   EXPECT_EQ(32, kernel_tracker_->NumPending());
-  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount(0));
 
   // Mature the kernels in order until empty.
   while (!queued_counts.empty()) {
@@ -376,23 +385,25 @@
     queued_counts.pop_front();
     kernel_tracker_->RecordTerminated(x);
     EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
-    EXPECT_EQ(x, kernel_tracker_->LastTerminatedCount());
+    EXPECT_EQ(x, kernel_tracker_->LastTerminatedCount(0));
   }
-  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount(0));
 
   // Next inject so many kernel events that the ring buffer needs
   // to grow a couple of times, while maturing a few in random order
   // to introduce gaps between last_completed_ and first_available_.
   int64 lower_bound = timing_counter_->get();
   for (int i = 0; i < 1111; ++i) {
-    queued_counts.push_back(kernel_tracker_->RecordQueued());
+    uint64 queued_count = timing_counter_->next();
+    queued_counts.push_back(queued_count);
+    RecordQueued(queued_count);
     int64 upper_bound = timing_counter_->get();
     if (0 == (i % 16)) {
       size_t index = (random::New64() % queued_counts.size());
       kernel_tracker_->RecordTerminated(queued_counts[index]);
       queued_counts.erase(queued_counts.begin() + index);
-      EXPECT_LE(lower_bound, kernel_tracker_->LastTerminatedCount());
-      EXPECT_GE(upper_bound, kernel_tracker_->LastTerminatedCount());
+      EXPECT_LE(lower_bound, kernel_tracker_->LastTerminatedCount(0));
+      EXPECT_GE(upper_bound, kernel_tracker_->LastTerminatedCount(0));
     }
   }
 
@@ -405,9 +416,9 @@
     // There may be a gap here where we find a kernel that got terminated
     // out of order, earlier, so the LastTerminatedCount can actually
     // jump past x.
-    EXPECT_LE(x, kernel_tracker_->LastTerminatedCount());
+    EXPECT_LE(x, kernel_tracker_->LastTerminatedCount(0));
   }
-  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount(0));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 6531d6d..27cfe9b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -295,4 +295,26 @@
   }
 }
 
+EventMgrFactory* EventMgrFactory::Singleton() {
+  static EventMgrFactory* instance = new EventMgrFactory;
+  return instance;
+}
+
+EventMgr* EventMgrFactory::GetEventMgr(se::StreamExecutor* se,
+                                       const GPUOptions& gpu_options) {
+  mutex_lock l(mu_);
+  // TODO(laigd): consider making gpu_options part of the key. It's not
+  // currently since EventMgr relies only on the fields deferred_deletion_bytes
+  // and polling_active_delay_usecs from gpu_options which are not used or
+  // rarely used.
+  auto itr = event_mgr_map_.find(se);
+  if (itr == event_mgr_map_.end()) {
+    auto event_mgr = new EventMgr(se, gpu_options);
+    event_mgr_map_[se] = event_mgr;
+    return event_mgr;
+  } else {
+    return itr->second;
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index 2d406b6..169a86a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -63,9 +63,7 @@
 // Events are recorded.
 class EventMgr {
  public:
-  EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
-
-  ~EventMgr();
+  virtual ~EventMgr();
 
   // Releases the references on the elements of "tensors" as soon as
   // all events currently enqueued on "stream" have completed.
@@ -107,7 +105,9 @@
   }
 
  private:
+  friend class TEST_EventMgr;
   friend class TEST_EventMgrHelper;
+  friend class EventMgrFactory;
   se::StreamExecutor* const exec_;
   const int64 deferred_bytes_threshold_;
   const int32 polling_active_delay_usecs_;
@@ -125,6 +125,8 @@
 
   typedef gtl::InlinedVector<InUse, 4> ToFreeVector;
 
+  EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
+
   void FreeMemory(const ToFreeVector& to_free) {
     for (const auto& iu : to_free) {
       if (iu.mem != nullptr) {
@@ -202,5 +204,20 @@
   thread::ThreadPool threadpool_;
 };
 
+// Hands out EventMgr instances, creating at most one per StreamExecutor.
+class EventMgrFactory {
+ public:
+  static EventMgrFactory* Singleton();  // Process-wide factory instance.
+  // Returns the EventMgr for `se`; `gpu_options` only matters on creation.
+  EventMgr* GetEventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
+
+ private:
+  mutex mu_;
+
+  // Maintain one EventMgr per physical device (StreamExecutor is
+  // per-physical-device).
+  std::map<se::StreamExecutor*, EventMgr*> event_mgr_map_ GUARDED_BY(mu_);
+};
+
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 3a0eb0b..43ac015 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -37,6 +37,13 @@
 
 namespace tensorflow {
 
+// Test-only subclass: as a friend of EventMgr it can call the private ctor.
+class TEST_EventMgr : public EventMgr {
+ public:
+  TEST_EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
+      : EventMgr(se, gpu_options) {}
+};
+
 class TEST_EventMgrHelper {
  public:
   explicit TEST_EventMgrHelper(EventMgr* em) : em_(em) {
@@ -62,7 +69,7 @@
     em_->QueueTensors(stream, tensors);
   }
 
-  void PollEvents(bool is_dedicated_poller) {
+  void PollEvents() {
     while (queue_size() > 0) {
       // For ordinary tensor frees, this function
       // should synchronously harvest all complete
@@ -70,15 +77,15 @@
       EventMgr::ToFreeVector to_free;
       {
         mutex_lock l(em_->mu_);
-        em_->PollEvents(is_dedicated_poller, &to_free);
+        em_->PollEvents(true, &to_free);
       }
       em_->FreeMemory(to_free);
     }
   }
 
-  void StopPollingLoop() { em_->StopPollingLoop(); }
+  void StopPollingLoop() { return em_->StopPollingLoop(); }
 
-  void StartPollingLoop() { em_->StartPollingLoop(); }
+  void StartPollingLoop() { return em_->StartPollingLoop(); }
 
  private:
   EventMgr* em_;
@@ -109,7 +116,7 @@
 
 TEST(EventMgr, Empty) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
@@ -126,7 +133,7 @@
 // the max simultaneously pending, we should not allocate any more.
 TEST(EventMgr, DelayedPolling) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   TensorReferenceVector* v = nullptr;
@@ -140,7 +147,7 @@
     EXPECT_EQ(i + 1, th.queue_size());
     EXPECT_EQ(0, th.free_size());
   }
-  th.PollEvents(false);
+  th.PollEvents();
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(5, th.free_size());
   for (int j = 0; j < 2; ++j) {
@@ -151,7 +158,7 @@
       EXPECT_EQ(i + 1, th.queue_size());
       EXPECT_EQ(4 - i, th.free_size());
     }
-    th.PollEvents(false);
+    th.PollEvents();
     EXPECT_EQ(0, th.queue_size());
     EXPECT_EQ(5, th.free_size());
   }
@@ -159,7 +166,7 @@
 
 TEST(EventMgr, FlushLargeTensorImmediately) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
@@ -169,14 +176,14 @@
     TensorReferenceVector v;
     AddTensorReference(&v, 100 * 1048576);
     em.ThenDeleteTensors(stream.get(), v);
-    th.PollEvents(false);  // Ensure things get registered to be freed by Poll
+    th.PollEvents();  // Ensure things get registered to be freed by Poll
     EXPECT_EQ(0, live_tensor_bytes);
   }
 }
 
 TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
@@ -188,14 +195,14 @@
       AddTensorReference(&v, 100 * 1024);
     }
     em.ThenDeleteTensors(stream.get(), v);
-    th.PollEvents(false);  // Harvest the tensors ready to be freed.
+    th.PollEvents();  // Harvest the tensors ready to be freed.
     EXPECT_EQ(0, live_tensor_bytes);
   }
 }
 
 TEST(EventMgr, StreamSwitchingFlushesImmediately) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream1(new se::Stream(stream_exec));
@@ -210,14 +217,14 @@
   AddTensorReference(&v2, 1024);
   int64 initial_live_bytes = live_tensor_bytes;
   em.ThenDeleteTensors(stream2.get(), v2);
-  th.PollEvents(false);  // Ensure things get registered to be freed by Poll
+  th.PollEvents();  // Ensure things get registered to be freed by Poll
   // Different stream should cause first tensor to get deleted
   EXPECT_GT(initial_live_bytes, live_tensor_bytes);
 }
 
 TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
@@ -229,7 +236,7 @@
       AddTensorReference(&v, 100 * 1024);
       em.ThenDeleteTensors(stream.get(), v);
     }
-    th.PollEvents(false);  // Ensure things get registered to be freed by Poll
+    th.PollEvents();  // Ensure things get registered to be freed by Poll
     // Some of the tensors at least should be flushed
     EXPECT_GT(1000 * 100 * 1024, live_tensor_bytes);
   }
@@ -239,7 +246,7 @@
 // down gracefully.
 TEST(EventMgr, NonEmptyShutdown) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
@@ -258,12 +265,13 @@
 // Tests that WarnIfInCallback() triggers correctly.
 TEST(EventMgr, WarnIfInCallback) {
   auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
-  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream);
   stream->Init();
   bool hit = false;
+  th.StartPollingLoop();
   gpu_event_mgr::WarnIfInCallback([&hit] { hit = true; });
   EXPECT_FALSE(hit);
   Notification note;
@@ -281,7 +289,7 @@
 // Provides access to private resources of BaseGPUDevice.
 class GPUDeviceTestHelper {
  public:
-  GPUDeviceTestHelper(size_t memory_limit) {
+  GPUDeviceTestHelper(size_t memory_limit, int pending_cap) {
     SessionOptions sops;
     device_ =
         DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
@@ -298,7 +306,8 @@
   se::Stream* h2d_stream() { return gpu_->streams_[0]->host_to_device; }
   se::Stream* d2h_stream() { return gpu_->streams_[0]->device_to_host; }
   se::Stream* d2d_stream() { return gpu_->streams_[0]->device_to_device[0]; }
-  EventMgr* event_mgr() { return gpu_->em_.get(); }
+  EventMgr* event_mgr() { return gpu_->em_; }
+  int pending_cap() { return gpu_->pending_cap_; }
 
  private:
   std::unique_ptr<Device> device_;
@@ -340,23 +349,23 @@
 
   EMBenchmarkHelper(GPUDeviceTestHelper* h) : gpu_helper_(h) {}
 
-  void ReInit(int num_ops) {
+  void ReInit(int num_ops, int tensor_size) {
     gpu_inputs_.clear();
     while (gpu_inputs_.size() < 2) {
       gpu_inputs_.push_back(Tensor(gpu_helper_->gpu_allocator(), DT_FLOAT,
-                                   {kTDim}, AllocationAttributes()));
+                                   {tensor_size}, AllocationAttributes()));
     }
     gpu_outputs_.clear();
     while (gpu_outputs_.size() < 1) {
       gpu_outputs_.push_back(Tensor(gpu_helper_->gpu_allocator(), DT_FLOAT,
-                                    {kTDim}, AllocationAttributes()));
+                                    {tensor_size}, AllocationAttributes()));
     }
     host_inputs_.clear();
     while (host_inputs_.size() < 2) {
       int instance_index = host_inputs_.size();
       host_inputs_.push_back(Tensor(gpu_helper_->host_allocator(), DT_FLOAT,
-                                    {kTDim}, AllocationAttributes()));
-      for (int i = 0; i < kTDim; ++i) {
+                                    {tensor_size}, AllocationAttributes()));
+      for (int i = 0; i < tensor_size; ++i) {
         host_inputs_.back().flat<float>()(i) =
             i * (1.0 + (0.5 * instance_index));
       }
@@ -364,8 +373,8 @@
     host_outputs_.clear();
     while (host_outputs_.size() < 1) {
       host_outputs_.push_back(Tensor(gpu_helper_->host_allocator(), DT_FLOAT,
-                                     {kTDim}, AllocationAttributes()));
-      for (int i = 0; i < kTDim; ++i) {
+                                     {tensor_size}, AllocationAttributes()));
+      for (int i = 0; i < tensor_size; ++i) {
         host_outputs_.back().flat<float>()(i) = -1;
       }
     }
@@ -583,7 +592,7 @@
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream);
   stream->Init();
-  EventMgr em(stream_exec, GPUOptions());  //, stream.get());
+  TEST_EventMgr em(stream_exec, GPUOptions());
   testing::StartTiming();
   std::atomic<int> counter;
   counter.store(0, std::memory_order_seq_cst);
@@ -615,10 +624,11 @@
 mutex helper_mu;
 
 #ifdef PLATFORM_GOOGLE
-static void BM_chain_ops(int iters, int adds_per_round, bool event_after_add) {
+static void BM_chain_ops(int iters, int tensor_size, int adds_per_round,
+                         bool event_after_add, int pending_cap) {
 #else
-static void BM_chain_ops(int iters, int adds_per_round, bool event_after_add,
-                         int threads) {
+static void BM_chain_ops(int iters, int tensor_size, int adds_per_round,
+                         bool event_after_add, int pending_cap, int threads) {
 #endif
   testing::StopTiming();
 #ifdef PLATFORM_GOOGLE
@@ -628,12 +638,19 @@
 #endif  // PLATFORM_GOOGLE
   {
     mutex_lock l(helper_mu);
+    if (gpu_helper && gpu_helper->pending_cap() != pending_cap) {
+      delete bm_helper;
+      bm_helper = nullptr;
+      delete gpu_helper;
+      gpu_helper = nullptr;
+    }
     if (!gpu_helper) {
-      gpu_helper = new GPUDeviceTestHelper(1 << 20);
+      gpu_helper = new GPUDeviceTestHelper(1 << 24, pending_cap);
       bm_helper = new EMBenchmarkHelper(gpu_helper);
     }
-    if (bm_helper->num_ops() != adds_per_round) {
-      bm_helper->ReInit(adds_per_round);
+    if (bm_helper->num_ops() != adds_per_round ||
+        bm_helper->tensor_size() != tensor_size) {
+      bm_helper->ReInit(adds_per_round, tensor_size);
     }
   }
   std::vector<EMBenchmarkHelper::TimeSet> times;
@@ -648,7 +665,7 @@
   // First iter is always slow, so do one prior to the timed loop.
   int expected = 1 + (event_after_add ? adds_per_round : 0);
   bm_helper->DoAddChain(adds_per_round, 1, event_after_add, callback, nullptr);
-  while (counter < 1) {
+  while (counter < expected) {
     Env::Default()->SleepForMicroseconds(1);
   }
   counter = 0;
@@ -677,71 +694,169 @@
 }
 
 #ifdef PLATFORM_GOOGLE
-static void BM_chain_1_false(int iters) { BM_chain_ops(iters, 1, false); }
+static void BM_chain_1024_1_false(int iters) {
+  BM_chain_ops(iters, 1024, 1, false, 0);
+}
 
-static void BM_chain_1_true(int iters) { BM_chain_ops(iters, 1, true); }
+static void BM_chain_1024_1_true(int iters) {
+  BM_chain_ops(iters, 1024, 1, true, 0);
+}
 
-static void BM_chain_10_false(int iters) { BM_chain_ops(iters, 10, false); }
+static void BM_chain_1024_10_false(int iters) {
+  BM_chain_ops(iters, 1024, 10, false, 0);
+}
 
-static void BM_chain_10_true(int iters) { BM_chain_ops(iters, 10, true); }
+static void BM_chain_1024_10_true(int iters) {
+  BM_chain_ops(iters, 1024, 10, true, 0);
+}
 
-static void BM_chain_100_false(int iters) { BM_chain_ops(iters, 100, false); }
+static void BM_chain_1024_100_false(int iters) {
+  BM_chain_ops(iters, 1024, 100, false, 0);
+}
 
-static void BM_chain_100_true(int iters) { BM_chain_ops(iters, 100, true); }
+static void BM_chain_1024_100_true(int iters) {
+  BM_chain_ops(iters, 1024, 100, true, 0);
+}
 
-BENCHMARK(BM_chain_1_false)->Threads(1);
-BENCHMARK(BM_chain_1_true)->Threads(1);
-BENCHMARK(BM_chain_1_false)->Threads(2);
-BENCHMARK(BM_chain_1_true)->Threads(2);
-BENCHMARK(BM_chain_1_false)->Threads(8);
-BENCHMARK(BM_chain_1_true)->Threads(8);
-BENCHMARK(BM_chain_10_false)->Threads(1);
-BENCHMARK(BM_chain_10_true)->Threads(1);
-BENCHMARK(BM_chain_10_false)->Threads(8);
-BENCHMARK(BM_chain_10_true)->Threads(8);
-BENCHMARK(BM_chain_100_false)->Threads(1);
-BENCHMARK(BM_chain_100_true)->Threads(1);
-BENCHMARK(BM_chain_100_false)->Threads(8);
-BENCHMARK(BM_chain_100_true)->Threads(8);
+static void BM_chain_1M_1_false(int iters) {
+  BM_chain_ops(iters, 1 << 20, 1, false, 0);
+}
+
+static void BM_chain_1M_1_true(int iters) {
+  BM_chain_ops(iters, 1 << 20, 1, true, 0);
+}
+
+static void BM_chain_1M_10_false(int iters) {
+  BM_chain_ops(iters, 1 << 20, 10, false, 0);
+}
+
+static void BM_chain_1M_10_true(int iters) {
+  BM_chain_ops(iters, 1 << 20, 10, true, 0);
+}
+
+static void BM_chain_1M_100_false(int iters) {
+  BM_chain_ops(iters, 1 << 20, 100, false, 0);
+}
+
+static void BM_chain_1M_100_true(int iters) {
+  BM_chain_ops(iters, 1 << 20, 100, true, 0);
+}
+
+BENCHMARK(BM_chain_1024_1_false)->Threads(1);
+BENCHMARK(BM_chain_1024_1_true)->Threads(1);
+BENCHMARK(BM_chain_1024_1_false)->Threads(2);
+BENCHMARK(BM_chain_1024_1_true)->Threads(2);
+BENCHMARK(BM_chain_1024_1_false)->Threads(8);
+BENCHMARK(BM_chain_1024_1_true)->Threads(8);
+BENCHMARK(BM_chain_1024_10_false)->Threads(1);
+BENCHMARK(BM_chain_1024_10_true)->Threads(1);
+BENCHMARK(BM_chain_1024_10_false)->Threads(8);
+BENCHMARK(BM_chain_1024_10_true)->Threads(8);
+BENCHMARK(BM_chain_1024_100_false)->Threads(1);
+BENCHMARK(BM_chain_1024_100_true)->Threads(1);
+BENCHMARK(BM_chain_1024_100_false)->Threads(2);
+BENCHMARK(BM_chain_1024_100_true)->Threads(2);
+BENCHMARK(BM_chain_1024_100_false)->Threads(8);
+BENCHMARK(BM_chain_1024_100_true)->Threads(8);
+
+BENCHMARK(BM_chain_1M_1_false)->Threads(1);
+BENCHMARK(BM_chain_1M_1_true)->Threads(1);
+BENCHMARK(BM_chain_1M_1_false)->Threads(2);
+BENCHMARK(BM_chain_1M_1_true)->Threads(2);
+BENCHMARK(BM_chain_1M_1_false)->Threads(8);
+BENCHMARK(BM_chain_1M_1_true)->Threads(8);
+BENCHMARK(BM_chain_1M_10_false)->Threads(1);
+BENCHMARK(BM_chain_1M_10_true)->Threads(1);
+BENCHMARK(BM_chain_1M_10_false)->Threads(8);
+BENCHMARK(BM_chain_1M_10_true)->Threads(8);
+BENCHMARK(BM_chain_1M_100_false)->Threads(1);
+BENCHMARK(BM_chain_1M_100_true)->Threads(1);
+BENCHMARK(BM_chain_1M_100_false)->Threads(2);
+BENCHMARK(BM_chain_1M_100_true)->Threads(2);
+BENCHMARK(BM_chain_1M_100_false)->Threads(8);
+BENCHMARK(BM_chain_1M_100_true)->Threads(8);
 #else
-static void BM_chain_1_false(int iters, int threads) {
-  BM_chain_ops(iters, 1, false, threads);
+static void BM_chain_1024_1_false(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 1, false, 0, threads);
 }
 
-static void BM_chain_1_true(int iters, int threads) {
-  BM_chain_ops(iters, 1, true, threads);
+static void BM_chain_1024_1_true(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 1, true, 0, threads);
 }
 
-static void BM_chain_10_false(int iters, int threads) {
-  BM_chain_ops(iters, 10, false, threads);
+static void BM_chain_1024_10_false(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 10, false, 0, threads);
 }
 
-static void BM_chain_10_true(int iters, int threads) {
-  BM_chain_ops(iters, 10, true, threads);
+static void BM_chain_1024_10_true(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 10, true, 0, threads);
 }
 
-static void BM_chain_100_false(int iters, int threads) {
-  BM_chain_ops(iters, 100, false, threads);
+static void BM_chain_1024_100_false(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 100, false, 0, threads);
 }
 
-static void BM_chain_100_true(int iters, int threads) {
-  BM_chain_ops(iters, 100, true, threads);
+static void BM_chain_1024_100_true(int iters, int threads) {
+  BM_chain_ops(iters, 1024, 100, true, 0, threads);
 }
 
-BENCHMARK(BM_chain_1_false)->Arg(1);
-BENCHMARK(BM_chain_1_true)->Arg(1);
-BENCHMARK(BM_chain_1_false)->Arg(2);
-BENCHMARK(BM_chain_1_true)->Arg(2);
-BENCHMARK(BM_chain_1_false)->Arg(8);
-BENCHMARK(BM_chain_1_true)->Arg(8);
-BENCHMARK(BM_chain_10_false)->Arg(1);
-BENCHMARK(BM_chain_10_true)->Arg(1);
-BENCHMARK(BM_chain_10_false)->Arg(8);
-BENCHMARK(BM_chain_10_true)->Arg(8);
-BENCHMARK(BM_chain_100_false)->Arg(1);
-BENCHMARK(BM_chain_100_true)->Arg(1);
-BENCHMARK(BM_chain_100_false)->Arg(8);
-BENCHMARK(BM_chain_100_true)->Arg(8);
+static void BM_chain_1M_1_false(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 1, false, 0, threads);
+}
+
+static void BM_chain_1M_1_true(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 1, true, 0, threads);
+}
+
+static void BM_chain_1M_10_false(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 10, false, 0, threads);
+}
+
+static void BM_chain_1M_10_true(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 10, true, 0, threads);
+}
+
+static void BM_chain_1M_100_false(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 100, false, 0, threads);
+}
+
+static void BM_chain_1M_100_true(int iters, int threads) {
+  BM_chain_ops(iters, 1 << 20, 100, true, 0, threads);
+}
+
+BENCHMARK(BM_chain_1024_1_false)->Arg(1);
+BENCHMARK(BM_chain_1024_1_true)->Arg(1);
+BENCHMARK(BM_chain_1024_1_false)->Arg(2);
+BENCHMARK(BM_chain_1024_1_true)->Arg(2);
+BENCHMARK(BM_chain_1024_1_false)->Arg(8);
+BENCHMARK(BM_chain_1024_1_true)->Arg(8);
+BENCHMARK(BM_chain_1024_10_false)->Arg(1);
+BENCHMARK(BM_chain_1024_10_true)->Arg(1);
+BENCHMARK(BM_chain_1024_10_false)->Arg(8);
+BENCHMARK(BM_chain_1024_10_true)->Arg(8);
+BENCHMARK(BM_chain_1024_100_false)->Arg(1);
+BENCHMARK(BM_chain_1024_100_true)->Arg(1);
+BENCHMARK(BM_chain_1024_100_false)->Arg(2);
+BENCHMARK(BM_chain_1024_100_true)->Arg(2);
+BENCHMARK(BM_chain_1024_100_false)->Arg(8);
+BENCHMARK(BM_chain_1024_100_true)->Arg(8);
+
+BENCHMARK(BM_chain_1M_1_false)->Arg(1);
+BENCHMARK(BM_chain_1M_1_true)->Arg(1);
+BENCHMARK(BM_chain_1M_1_false)->Arg(2);
+BENCHMARK(BM_chain_1M_1_true)->Arg(2);
+BENCHMARK(BM_chain_1M_1_false)->Arg(8);
+BENCHMARK(BM_chain_1M_1_true)->Arg(8);
+BENCHMARK(BM_chain_1M_10_false)->Arg(1);
+BENCHMARK(BM_chain_1M_10_true)->Arg(1);
+BENCHMARK(BM_chain_1M_10_false)->Arg(8);
+BENCHMARK(BM_chain_1M_10_true)->Arg(8);
+BENCHMARK(BM_chain_1M_100_false)->Arg(1);
+BENCHMARK(BM_chain_1M_100_true)->Arg(1);
+BENCHMARK(BM_chain_1M_100_false)->Arg(2);
+BENCHMARK(BM_chain_1M_100_true)->Arg(2);
+BENCHMARK(BM_chain_1M_100_false)->Arg(8);
+BENCHMARK(BM_chain_1M_100_true)->Arg(8);
 #endif
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index ea45bfa..3dae01b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -81,7 +81,8 @@
                                             TfGpuId tf_gpu_id,
                                             size_t total_bytes) {
   CHECK(process_state_);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   const string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
@@ -146,7 +147,7 @@
     }
     allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
                        std::unique_ptr<SharedCounter>(timing_counter),
-                       sub_allocator,
+                       gpu_bfc_allocator, sub_allocator,
                        std::unique_ptr<Allocator>(recording_allocator)};
   }
   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
@@ -163,14 +164,22 @@
 
 SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
   DCHECK(process_state_);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
   mutex_lock l(mu_);
   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
+               << " but only have " << gpu_allocators_.size();
     return nullptr;
   }
 
   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  if (allocator_parts.counter.get() == nullptr) {
+    SharedCounter* timing_counter = new SharedCounter;
+    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
+    allocator_parts.counter.reset(timing_counter);
+  }
   return allocator_parts.counter.get();
 #else
   return nullptr;
@@ -240,6 +249,7 @@
       LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
     }
     int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);
+
     Allocator* allocator =
         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
                          true /*allow_growth*/, "gpu_host_bfc" /*name*/);
@@ -251,7 +261,7 @@
     }
     gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                     std::unique_ptr<SharedCounter>(nullptr),
-                                    sub_allocator,
+                                    nullptr, sub_allocator,
                                     std::unique_ptr<Allocator>(nullptr)});
     AllocatorParts& allocator_parts = gpu_host_allocators_.back();
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
@@ -275,7 +285,8 @@
 
 void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                          const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   mutex_lock lock(mu_);
   CHECK(gpu_allocators_.empty())  // Crash OK
       << "AddGPUAllocVisitor must be called before "
@@ -290,7 +301,8 @@
 
 void GPUProcessState::AddGpuHostAllocVisitor(
     int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   mutex_lock lock(mu_);
   CHECK(gpu_host_allocators_.empty())  // Crash OK
       << "AddGpuHostAllocVisitor must be called before "
@@ -304,7 +316,8 @@
 
 void GPUProcessState::AddGpuHostFreeVisitor(
     int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   mutex_lock lock(mu_);
   CHECK(gpu_host_allocators_.empty())  // Crash OK
       << "AddGpuHostFreeVisitor must be called before "
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 09e5575..601ccb2 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -33,6 +33,7 @@
 namespace tensorflow {
 
 class Allocator;
+class GPUBFCAllocator;
 class PoolAllocator;
 class SharedCounter;
 
@@ -137,6 +138,7 @@
   struct AllocatorParts {
     std::unique_ptr<Allocator> allocator;
     std::unique_ptr<SharedCounter> counter;
+    GPUBFCAllocator* bfc_allocator;  // nullptr in gpu_host_allocators_ entries
     SubAllocator* sub_allocator;  // owned by allocator
     std::unique_ptr<Allocator> recording_allocator;
   };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 56f68c8..c0e8ac4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -151,7 +151,8 @@
   if (total_bytes > 0) {
     tracing::ScopedAnnotation annotation("SetProtoFromGPU");
     alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0);
-    buf = alloc->Allocate<char>(total_bytes);
+    buf = static_cast<char*>(
+        alloc->AllocateRaw(Allocator::kAllocatorAlignment, total_bytes));
     if (LogMemory::IsEnabled()) {
       LogMemory::RecordRawAllocation("SetProtoFromGPU",
                                      LogMemory::PROTO_BUFFER_STEP_ID,
@@ -178,7 +179,7 @@
                                              LogMemory::PROTO_BUFFER_STEP_ID,
                                              buf, alloc, false);
           }
-          alloc->Deallocate<char>(buf, total_bytes);
+          alloc->DeallocateRaw(buf);
         }
         done(Status::OK());
       });
@@ -300,7 +301,7 @@
 void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  const DeviceContext* device_context,
                                  Device* gpu_device, Tensor* gpu_tensor,
-                                 StatusCallback done) {
+                                 StatusCallback done, bool sync_dst_compute) {
   VLOG(1) << "CopyCPUTensorToGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
   se::Stream* recv_stream = nullptr;
@@ -319,7 +320,9 @@
     return;
   }
   // Wait for the recv-stream to make sure the buffer is truly available.
-  recv_host_to_device_stream->ThenWaitFor(recv_stream);
+  if (sync_dst_compute) {
+    recv_host_to_device_stream->ThenWaitFor(recv_stream);
+  }
 
   const int64 total_bytes = cpu_tensor->TotalBytes();
   // Note that 0-size tensors have no backing buffer.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 8ac3feb..b3614e1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -88,7 +88,7 @@
   static void CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  const DeviceContext* device_context,
                                  Device* gpu_device, Tensor* gpu_tensor,
-                                 StatusCallback done);
+                                 StatusCallback done, bool sync_dst_compute);
 
   static void DeviceToDeviceCopy(
       DeviceContext* send_dev_context, DeviceContext* recv_dev_context,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
index 0ef39fb..b58250c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
@@ -26,8 +26,10 @@
 void GPUDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
                                              Tensor* device_tensor,
-                                             StatusCallback done) const {
-  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, this, device, device_tensor, done);
+                                             StatusCallback done,
+                                             bool sync_dst_compute) const {
+  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, this, device, device_tensor, done,
+                              sync_dst_compute);
 }
 
 void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index f513526..eab46b7 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -50,8 +50,8 @@
   int stream_id() const { return stream_id_; }
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
-                             Tensor* device_tensor,
-                             StatusCallback done) const override;
+                             Tensor* device_tensor, StatusCallback done,
+                             bool sync_dst_compute) const override;
 
   void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name,
                              Device* device, Tensor* cpu_tensor,
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 0faf5bc..5290332 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -22,6 +22,7 @@
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
@@ -571,7 +572,7 @@
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
-  Placer placer(new_graph.get(), flib_def_.get(), device_set_,
+  Placer placer(new_graph.get(), "", flib_def_.get(), device_set_,
                 /* default_device= */ nullptr,
                 session_options_ == nullptr ||
                     session_options_->config.allow_soft_placement(),
@@ -607,10 +608,12 @@
     graph_->ToGraphDef(&item.graph);
 
     // It's ok to skip invalid device annotations in Grappler.
-    Status inferred_devices = item.InferDevicesFromGraph();
-    if (!inferred_devices.ok()) {
-      VLOG(3) << inferred_devices.error_message();
+    for (const Device* d : device_set_->devices()) {
+      Status added_device = item.AddDevice(d->name());
+      if (!added_device.ok()) VLOG(3) << added_device.error_message();
     }
+    VLOG(3) << "Grappler available devices: "
+            << absl::StrJoin(item.devices(), ", ");
 
     // TODO(b/114748242): Add a unit test to test this bug fix.
     if (flib_def_) {
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 13f4784..88cb238 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -164,6 +164,11 @@
                                  kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) { delete kernel; };
+  params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr,
+                                 Rendezvous** r) {
+    *r = new IntraProcessRendezvous(device_mgr);
+    return Status::OK();
+  };
 
   Executor* executor;
   TF_RETURN_IF_ERROR(
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
index 4f9fc2b..da081b82 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
@@ -33,6 +33,7 @@
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 // Set true for greater intelligibility of debug mode log messages.
 #define READABLE_KEYS false
@@ -316,7 +317,9 @@
 
     if (my_rank >= 0 && my_rank != source_rank) {
       // Begin by receiving the value.
-      tracing::ScopedActivity activity("ReceiveValue", std::to_string(si));
+      profiler::TraceMe activity(
+          [&] { return strings::StrCat("ReceiveValue:", si); },
+          profiler::TraceMeLevel::kInfo);
       int recv_from_rank = TreeRecvFrom(*col_params_, si);
       Notification note;
       DispatchRecv(si, recv_from_rank, my_rank, col_ctx_->output,
@@ -330,7 +333,9 @@
 
     // Then forward value to all descendent devices.
     {
-      tracing::ScopedActivity activity("ForwardValue", std::to_string(si));
+      profiler::TraceMe activity(
+          [&] { return strings::StrCat("ForwardValue:", si); },
+          profiler::TraceMeLevel::kInfo);
       if (my_rank >= 0 && status_.ok()) {
         std::vector<int> send_to_ranks;
         TreeSendTo(*col_params_, si, &send_to_ranks);
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index d17e4b0..a300ae0 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -455,8 +455,9 @@
     for (int di = 0; di < instances_.size(); ++di) {
       if (!instances_[di]->status_.ok()) {
         ASSERT_GT(fail_after, 0);
-        ASSERT_EQ(instances_[di]->status_.error_message(),
-                  "Deliberate failure");
+        ASSERT_NE(
+            instances_[di]->status_.error_message().find("Deliberate failure"),
+            string::npos);
         mutex_lock l(mu_);
         ++failure_count_;
         continue;
diff --git a/tensorflow/core/common_runtime/inspecting_placer.cc b/tensorflow/core/common_runtime/inspecting_placer.cc
new file mode 100644
index 0000000..19cc784
--- /dev/null
+++ b/tensorflow/core/common_runtime/inspecting_placer.cc
@@ -0,0 +1,160 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/inspecting_placer.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/colocation_graph.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+string IOColocationGroups::DebugString() const {
+  std::unordered_map<int, std::vector<string>> group_members;
+  for (int arg_index = 0; arg_index < input_groups.size(); ++arg_index) {
+    int group_id = input_groups[arg_index];
+    group_members[group_id].push_back(strings::StrCat("i:", arg_index));
+  }
+  for (int ret_index = 0; ret_index < output_groups.size(); ++ret_index) {
+    int group_id = output_groups[ret_index];
+    group_members[group_id].push_back(strings::StrCat("o:", ret_index));
+  }
+
+  std::vector<string> group_strings;
+  for (const auto& it : group_members) {
+    int group_id = it.first;
+    const std::vector<string>& members = it.second;
+    const PossibleDevices& devices = group_devices[group_id];
+    group_strings.push_back(strings::StrCat(
+        "Group(", group_id, " members = [", absl::StrJoin(members, ", "),
+        "] requested_device_name = \"",
+        DeviceNameUtils::ParsedNameToString(devices.requested_device_name),
+        "\" resource_device_name = \"",
+        DeviceNameUtils::ParsedNameToString(devices.resource_device_name),
+        "\" device_types = [",
+        absl::StrJoin(
+            devices.device_types, ", ",
+            [](string* out, const std::pair<DeviceType, int32>& type_and_pref) {
+              out->append(DeviceTypeString(type_and_pref.first));
+            }),
+        "])"));
+  }
+
+  return absl::StrJoin(group_strings, "\n\t");
+}
+
+// Utility class for constructing IOColocationGroups from a ColocationGraph.
+class ColocationGraphToIOColocationGroups {
+ public:
+  // colocation_graph is mutable because finding root nodes can update
+  // parent pointers. It is not modified otherwise.
+  explicit ColocationGraphToIOColocationGroups(
+      ColocationGraph* colocation_graph)
+      : colocation_graph_(colocation_graph), next_group_id_(0) {}
+
+  void AssignGroups(const gtl::InlinedVector<Node*, 4>& nodes,
+                    std::vector<int>* groups) {
+    for (int i = 0; i < nodes.size(); ++i) {
+      int root_id = colocation_graph_->FindAndUpdateRoot(nodes[i]->id());
+      const auto& it = group_ids_.find(root_id);
+      int assigned_group_id;
+      if (it == group_ids_.end()) {
+        group_ids_[root_id] = next_group_id_;
+        assigned_group_id = next_group_id_;
+        ++next_group_id_;
+      } else {
+        assigned_group_id = it->second;
+      }
+      groups->push_back(assigned_group_id);
+    }
+  }
+
+  Status FillGroups(std::vector<PossibleDevices>* group_devices) {
+    group_devices->resize(group_ids_.size());
+    for (const auto& it : group_ids_) {
+      int assigned_group_id = it.second;
+      PossibleDevices& possible_devices = (*group_devices)[assigned_group_id];
+      const Member& member = colocation_graph_->members()[it.first];
+      TF_RETURN_IF_ERROR(member.FillPossibleDevices(&possible_devices));
+    }
+    return Status::OK();
+  }
+
+ private:
+  ColocationGraph* colocation_graph_;
+  // Allocated group ids: colocation_graph root id -> allocated group id.
+  std::unordered_map<int, int> group_ids_;
+  int next_group_id_;
+};
+
+InspectingPlacer::InspectingPlacer(const Graph* graph,
+                                   const FunctionStack& stack,
+                                   const FunctionLibraryDefinition* flib_def,
+                                   const DeviceSet* device_set,
+                                   const Device* default_device,
+                                   bool allow_soft_placement,
+                                   bool log_device_placement)
+    : graph_(*graph),
+      stack_(stack),
+      flib_def_(*flib_def),
+      device_set_(*device_set),
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {}
+
+Status InspectingPlacer::ComputeIOColocationGroups(const Node& node,
+                                                   IOColocationGroups* groups) {
+  const FunctionDef* fdef;
+  NameAttrList func;
+  TF_RETURN_IF_ERROR(GetFunctionDefAndAttrs(flib_def_, node, &fdef, &func));
+  std::unique_ptr<FunctionBody> fbody;
+
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, AttrSlice(&func.attr()),
+                                             &flib_def_, &fbody));
+
+  TF_RETURN_IF_ERROR(
+      IsolatePlacerInspectionRequiredOps(flib_def_, fbody->graph));
+  if (stack_.HasFunction(func.name())) {
+    return errors::Unimplemented(
+        "Recursive function calls are not supported. Node ",
+        FormatNodeForError(node), " inside the body of ",
+        errors::FormatFunctionForError(stack_.current_function_name()),
+        " calls function ", errors::FormatFunctionForError(func.name()),
+        " which is already present in the call stack:\n  ",
+        stack_.FormatForError());
+  }
+
+  ColocationGraph colocation_graph(
+      fbody->graph, stack_.Push(&node, func.name()), &flib_def_, &device_set_,
+      default_device_, allow_soft_placement_, log_device_placement_);
+  TF_RETURN_IF_ERROR(colocation_graph.Initialize());
+
+  ColocationGraphToIOColocationGroups converter(&colocation_graph);
+  converter.AssignGroups(fbody->arg_nodes, &groups->input_groups);
+  converter.AssignGroups(fbody->ret_nodes, &groups->output_groups);
+  TF_RETURN_IF_ERROR(converter.FillGroups(&groups->group_devices));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/inspecting_placer.h b/tensorflow/core/common_runtime/inspecting_placer.h
new file mode 100644
index 0000000..6cba364
--- /dev/null
+++ b/tensorflow/core/common_runtime/inspecting_placer.h
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_
+
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+// TODO(iga): Convert this struct into a class to ensure invariants between
+// device names, i.e.
+//  DeviceNameUtils::IsSpecification(resource_device_name,
+//                                   requested_device_name)
+// PossibleDevices does not contain assigned_device_name because we don't
+// assign devices to nested functions.
+struct PossibleDevices {
+  // The same as Member::requested_device_name_ in colocation_graph.cc.
+  DeviceNameUtils::ParsedName requested_device_name;
+
+  // The same as Member::resource_device_name_ in colocation_graph.cc.
+  DeviceNameUtils::ParsedName resource_device_name;
+
+  // A device type outside of this set will not be supported by some
+  // internal op.
+  PrioritizedDeviceTypeVector device_types;
+};
+
+// A struct for communicating constraints on devices that can
+// be chosen for inputs and outputs of an op requiring deep placer inspection.
+struct IOColocationGroups {
+  // input_groups[i] contains the group id that i'th input belongs to.
+  // List inputs are not supported.
+  std::vector<int> input_groups;
+  // output_groups[i] contains the group id that i'th output belongs to.
+  // List outputs are not supported.
+  std::vector<int> output_groups;
+  // group_devices[i] contains possible devices for group with id i.
+  std::vector<PossibleDevices> group_devices;
+
+  string DebugString() const;
+};
+
+class InspectingPlacer {
+ public:
+  // graph, flib_def, and device_set must not be null and must outlive this
+  // InspectingPlacer. default_device can be null. If not, must outlive this.
+  // `stack` is the chain of function calls leading to `graph`. It is used to
+  // detect recursive function calls (reported as an Unimplemented error) and
+  // to improve error messages.
+  InspectingPlacer(const Graph* graph, const FunctionStack& stack,
+                   const FunctionLibraryDefinition* flib_def,
+                   const DeviceSet* device_set, const Device* default_device,
+                   bool allow_soft_placement, bool log_device_placement);
+
+  // `node` must be
+  // PlacerInspectionRequiredOpsChecker::IsPlacerInspectionRequired.
+  Status ComputeIOColocationGroups(const Node& node,
+                                   IOColocationGroups* groups);
+
+ private:
+  const Graph& graph_;
+  const FunctionStack stack_;
+  const FunctionLibraryDefinition& flib_def_;
+  const DeviceSet& device_set_;
+  const Device* default_device_;
+  const bool allow_soft_placement_;
+  const bool log_device_placement_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(InspectingPlacer);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_
diff --git a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.cc b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.cc
new file mode 100644
index 0000000..bc19153
--- /dev/null
+++ b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.cc
@@ -0,0 +1,54 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h"
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/dump_graph.h"
+
+namespace tensorflow {
+
+Status IsolatePlacerInspectionRequiredOpsPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr) {
+    VLOG(1) << "Not running IsolatePlacerInspectionRequiredOpsPass because no "
+               "graph is provided";
+    return Status::OK();
+  }
+
+  VLOG(1) << "IsolatePlacerInspectionRequiredOpsPass::Run";
+
+  Graph* graph = options.graph->get();
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("isolate_deep_ops_before", *graph, nullptr, "/tmp");
+  }
+
+  const FunctionLibraryDefinition* flib_def =
+      options.flib_def == nullptr ? &graph->flib_def() : options.flib_def;
+  Status status = IsolatePlacerInspectionRequiredOps(*flib_def, graph);
+
+  if (VLOG_IS_ON(3) && status.ok()) {
+    DumpGraphToFile("isolate_deep_ops_after", *graph, nullptr, "/tmp");
+  }
+  return status;
+}
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 25,
+                      IsolatePlacerInspectionRequiredOpsPass);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h
new file mode 100644
index 0000000..3d86c45
--- /dev/null
+++ b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h
@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+namespace tensorflow {
+// Adds Identities for each input/output of function-calling ops.
+//
+// For example, the following graph calling a function on inputs `a` and `b`
+// and producing output `y` will be rewritten to include identities on all
+// edges:
+//
+//      a             b
+//      |             |
+//      v             v
+//    f (PartitionedCallOp)
+//         |
+//         v
+//         y
+//
+// is transformed to
+//
+//      a             b
+//      |             |
+//  a_f (Identity)   b_f (Identity)
+//      |             |
+//      v             v
+//    f (PartitionedCallOp)
+//         |
+//      f_y (Identity)
+//         |
+//         v
+//         y
+//
+// This pass is currently needed to simplify correctly placing the nodes
+// producing inputs for as well as consuming output from function-calling ops.
+//
+// This pass should also help to implement replacing PartitionedCallOp with
+// component function calls (to avoid copying input/output tensors), if we get
+// to it.
+class IsolatePlacerInspectionRequiredOpsPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_
diff --git a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc
new file mode 100644
index 0000000..6fb01c3
--- /dev/null
+++ b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc
@@ -0,0 +1,437 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h"
+
+#include <map>
+#include <unordered_map>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/equal_graph_def.h"
+
+namespace tensorflow {
+
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+using FDH = ::tensorflow::FunctionDefHelper;
+
+// Returns void so that we can call TF_ASSERT_OK inside it.
+void RunPass(const GraphDef& original, GraphDef* rewritten,
+             FunctionLibraryDefinition* flib_def = nullptr) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  GraphConstructorOptions opts;
+  TF_ASSERT_OK(ConvertGraphDefToGraph(opts, original, graph.get()));
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  options.flib_def = flib_def;
+  IsolatePlacerInspectionRequiredOpsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  graph->ToGraphDef(rewritten);
+}
+
+void RunPassAndCompare(const GraphDef& original, const GraphDef& expected) {
+  GraphDef rewritten;
+  RunPass(original, &rewritten);
+  TF_EXPECT_GRAPH_EQ(expected, rewritten);
+}
+
+void RunPassAndCompare(const GraphDef& original,
+                       const std::vector<GraphDef>& expected_alternatives) {
+  GraphDef rewritten;
+  RunPass(original, &rewritten);
+
+  std::vector<string> errors;
+  errors.push_back(absl::StrCat("Graphs did not match.\n  Rewritten graph:\n",
+                                SummarizeGraphDef(rewritten)));
+  for (const GraphDef& alternative : expected_alternatives) {
+    string diff;
+    bool graphs_equal = EqualGraphDef(rewritten, alternative, &diff);
+    if (graphs_equal) {
+      return;
+    }
+    errors.push_back(absl::StrCat("  Expected alternative:\n",
+                                  SummarizeGraphDef(alternative)));
+  }
+  EXPECT_TRUE(false) << absl::StrJoin(errors, "\n");
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, Basic) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (PartitionedCallOp: ResourceIdentity)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef original = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"x"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  GraphDef expected = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("x_f", "Identity", {"x"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"x_f"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("f_y", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("y", "_Retval", {"f_y:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, expected);
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, FunctionDefinitionNotInGraph) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (PartitionedCallOp: ResourceIdentity)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef original = GDef({
+      NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+      NDef("f", "PartitionedCall", {"x"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+      NDef("y", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+  });
+
+  GraphDef expected = GDef({
+      NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+      NDef("x_f", "Identity", {"x"}, {{"T", DT_RESOURCE}}),
+      NDef("f", "PartitionedCall", {"x_f"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+      NDef("f_y", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+      NDef("y", "_Retval", {"f_y:0"}, {{"T", DT_RESOURCE}}),
+  });
+
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), {});
+  TF_ASSERT_OK(flib_def.AddFunctionDef(func));
+  GraphDef rewritten;
+  RunPass(original, &rewritten, &flib_def);
+  TF_EXPECT_GRAPH_EQ(expected, rewritten);
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, MultipleInputsAndOutputs) {
+  /*
+   *                a (_Arg, DT_RESOURCE)
+   *                   |   b (_Arg, DT_RESOURCE)
+   *                   |      |
+   *                   v      v
+   *                f (PartitionedCallOp: Swap)
+   *                   |      |
+   *                   |      v
+   *                   v    r2 (_Retval, DT_RESOURCE)
+   *                r1 (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef original = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("r1", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "_Retval", {"f:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  GraphDef expected = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b_f", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a_f", "b_f"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("f_r1", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r1", "_Retval", {"f_r1"}, {{"T", DT_RESOURCE}}),
+          NDef("f_r2", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "_Retval", {"f_r2"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, expected);
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, UnusedOutput) {
+  /*
+   *                a (_Arg, DT_RESOURCE)
+   *                   |   b (_Arg, DT_RESOURCE)
+   *                   |      |
+   *                   v      v
+   *                f (PartitionedCallOp: Swap)
+   *                   |      |
+   *                   |      v
+   *                   v    <unused>
+   *                r1 (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef original = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("r1", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  GraphDef expected = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b_f", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a_f", "b_f"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("f_r1", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r1", "_Retval", {"f_r1"}, {{"T", DT_RESOURCE}}),
+          // Identity is created for output that was not used.
+          NDef("f_0", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, expected);
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, OutputsConsumedBySameOp) {
+  /*
+   *                a (_Arg, DT_RESOURCE)
+   *                   |   b (_Arg, DT_RESOURCE)
+   *                   |      |
+   *                   v      v
+   *                f (PartitionedCallOp: Swap)
+   *                   |     |
+   *                   |     |
+   *                   v     v
+   *                add (Add, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef original = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("add", "Add", {"f:0", "f:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  // There are two possible namings for outputs depending on map
+  // iteration order.
+  GraphDef expected1 = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b_f", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a_f", "b_f"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("f_add", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("f_add_0", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+          NDef("add", "Add", {"f_add", "f_add_0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  GraphDef expected2 = GDef(
+      {
+          // Same as above
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b_f", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a_f", "b_f"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          // Different from above
+          NDef("f_add", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+          NDef("f_add_0", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("add", "Add", {"f_add_0", "f_add"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, {expected1, expected2});
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, IdenticalInputs) {
+  /*
+   *                a (_Arg, DT_RESOURCE)
+   *                   |      |
+   *                   |      |
+   *                   v      v
+   *                f (PartitionedCallOp: Swap)
+   *                   |      |
+   *                   |      v
+   *                   v    r2 (_Retval, DT_RESOURCE)
+   *                r1 (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef original = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a", "a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("r1", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "_Retval", {"f:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  // There are two possible namings for inputs depending on map
+  // iteration order.
+  GraphDef expected1 = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("a_f_0", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"a_f", "a_f_0"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("f_r1", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r1", "_Retval", {"f_r1"}, {{"T", DT_RESOURCE}}),
+          NDef("f_r2", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "_Retval", {"f_r2"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  GraphDef expected2 = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("a_f", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("a_f_0", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall",
+               {"a_f_0", "a_f"},  // the only different line from above
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("f_r1", "Identity", {"f:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r1", "_Retval", {"f_r1"}, {{"T", DT_RESOURCE}}),
+          NDef("f_r2", "Identity", {"f:1"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "_Retval", {"f_r2"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, {expected1, expected2});
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest, DirectCallsAreNotIsolated) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (direct function call to ResourceIdentity)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef original = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "ResourceIdentity", {"x"}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, original);
+}
+
+TEST(IsolatePlacerInspectionRequiredOpsPassTest,
+     FunctionsNotReturningResourcesAreNotIsolated) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (PartitionedCallOp, ReadResourceVariable)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_FLOAT)
+   */
+  FunctionDef func = test::function::ReadResourceVariable();
+  GraphDef original = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"x"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("ReadResourceVariable", {})}}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_FLOAT}}),
+      },
+      // FunctionLib
+      {func});
+
+  RunPassAndCompare(original, original);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_function_call_op.cc b/tensorflow/core/common_runtime/lower_function_call_op.cc
index aaa1755..4df335a 100644
--- a/tensorflow/core/common_runtime/lower_function_call_op.cc
+++ b/tensorflow/core/common_runtime/lower_function_call_op.cc
@@ -60,6 +60,7 @@
     // Tensorflow 2.0 Eager mode, and it has control outputs to represent
     // side-effects that must always execute (see `control_ret` in FunctionDef).
     inline_options.override_device = false;
+    inline_options.initialize_empty_device = true;
     inline_options.output_control_src = OutputControlSrc::kControlOutputs;
   } else {
     // Native function call (node.type_string() is the function name). These
diff --git a/tensorflow/core/common_runtime/lower_functional_ops.h b/tensorflow/core/common_runtime/lower_functional_ops.h
index 297f585..84d15a1 100644
--- a/tensorflow/core/common_runtime/lower_functional_ops.h
+++ b/tensorflow/core/common_runtime/lower_functional_ops.h
@@ -32,10 +32,8 @@
 class LowerFunctionalOpsPass : public GraphOptimizationPass {
  public:
   LowerFunctionalOpsPass() = default;
-  LowerFunctionalOpsPass(bool lower_function_calls,
-                         bool keep_lowered_nodes_fetchable)
-      : lower_function_calls_(lower_function_calls),
-        keep_lowered_nodes_fetchable_(keep_lowered_nodes_fetchable) {}
+  explicit LowerFunctionalOpsPass(bool keep_lowered_nodes_fetchable)
+      : keep_lowered_nodes_fetchable_(keep_lowered_nodes_fetchable) {}
 
   Status Run(const GraphOptimizationPassOptions& options) override;
 
@@ -45,10 +43,6 @@
       "_lower_as_multi_device_function";
 
  private:
-  // TODO(ezhulenev): This is only required until Grappler function optimizer is
-  // not migrated to use function inlining from common_runtime.
-  bool lower_function_calls_ = true;
-
   // If defined use the value to control if functional ops must be fetchable
   // after lowering (we add IdentityN in place of all lowered nodes). If not
   // defined, this option will be inferred automatically from the graph (in
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
index d700040..0fcf70d 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -70,7 +70,7 @@
 }
 
 Status UpdateArgAndRetvalMetadata(
-    Graph* subgraph, std::vector<int>* arg_indices,
+    Graph* subgraph, const string& device_type, std::vector<int>* arg_indices,
     std::vector<int>* ret_indices,
     std::vector<AllocatorAttributes>* arg_alloc_attrs,
     std::vector<AllocatorAttributes>* ret_alloc_attrs) {
@@ -101,7 +101,9 @@
     TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
     AllocatorAttributes alloc_attr;
     DataType type = attr_value->type();
-    if (MTypeFromDType(type) == HOST_MEMORY) {
+    MemoryType mtype = (device_type == "TPU") ? MTypeFromDTypeIntsOnDevice(type)
+                                              : MTypeFromDType(type);
+    if (mtype == HOST_MEMORY) {
       alloc_attr.set_on_host(true);
     }
     arg_alloc_attrs->push_back(alloc_attr);
@@ -112,7 +114,9 @@
     TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
     AllocatorAttributes alloc_attr;
     DataType type = attr_value->type();
-    if (MTypeFromDType(type) == HOST_MEMORY) {
+    MemoryType mtype = (device_type == "TPU") ? MTypeFromDTypeIntsOnDevice(type)
+                                              : MTypeFromDType(type);
+    if (mtype == HOST_MEMORY) {
       alloc_attr.set_on_host(true);
     }
     ret_alloc_attrs->push_back(alloc_attr);
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
index c282647..1a6551a 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.h
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -57,7 +57,7 @@
 //  (3) records which `Arg` and `Retval` nodes live in host memory in
 //      `*_alloc_attrs`.
 Status UpdateArgAndRetvalMetadata(
-    Graph* subgraph, std::vector<int>* arg_indices,
+    Graph* subgraph, const string& device_type, std::vector<int>* arg_indices,
     std::vector<int>* ret_indices,
     std::vector<AllocatorAttributes>* arg_alloc_attrs,
     std::vector<AllocatorAttributes>* ret_alloc_attrs);
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
index 705b52a..9a3f3a6 100644
--- a/tensorflow/core/common_runtime/partitioning_utils_test.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -68,7 +68,7 @@
     TF_ASSERT_OK(s.ToGraph(graph));
 
     if (assign_device) {
-      Placer placer(graph, &device_set_, device0_);
+      Placer placer(graph, "", &device_set_, device0_);
       TF_ASSERT_OK(placer.Run());
     }
   }
@@ -84,7 +84,7 @@
     auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
     auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
     TF_ASSERT_OK(s.ToGraph(graph));
-    Placer placer(graph, &device_set_, device0_);
+    Placer placer(graph, "", &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
@@ -98,7 +98,7 @@
     auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
     auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
     TF_ASSERT_OK(s.ToGraph(subgraph));
-    Placer placer(subgraph, &device_set_, device0_);
+    Placer placer(subgraph, "", &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
@@ -183,9 +183,11 @@
   std::vector<AllocatorAttributes> arg_alloc_attrs;
   std::vector<AllocatorAttributes> ret_alloc_attrs;
 
-  Status status =
-      UpdateArgAndRetvalMetadata(graph.get(), &arg_indices, &ret_indices,
-                                 &arg_alloc_attrs, &ret_alloc_attrs);
+  string device_type = "CPU";
+
+  Status status = UpdateArgAndRetvalMetadata(
+      graph.get(), device_type, &arg_indices, &ret_indices, &arg_alloc_attrs,
+      &ret_alloc_attrs);
   ASSERT_TRUE(status.ok()) << status.ToString();
 
   CheckIndices({3}, arg_indices);
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index c4cac35..d158cdf 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -71,22 +71,27 @@
 
 }  // namespace
 
-Placer::Placer(Graph* graph, const FunctionLibraryDefinition* flib_def,
+Placer::Placer(Graph* graph, const string& function_name,
+               const FunctionLibraryDefinition* flib_def,
                const DeviceSet* devices, const Device* default_device,
                bool allow_soft_placement, bool log_device_placement)
     : graph_(graph),
+      function_name_(function_name),
       flib_def_(flib_def),
       devices_(devices),
       default_device_(default_device),
       allow_soft_placement_(allow_soft_placement),
       log_device_placement_(log_device_placement) {}
 
-Placer::Placer(Graph* graph, const DeviceSet* devices,
-               const Device* default_device)
-    : Placer(graph, &graph->flib_def(), devices, default_device, true, false) {}
+Placer::Placer(Graph* graph, const string& function_name,
+               const DeviceSet* devices, const Device* default_device)
+    : Placer(graph, function_name, &graph->flib_def(), devices, default_device,
+             true, false) {}
 
-Placer::Placer(Graph* graph, const DeviceSet* devices)
-    : Placer(graph, &graph->flib_def(), devices, nullptr, true, false) {}
+Placer::Placer(Graph* graph, const string& function_name,
+               const DeviceSet* devices)
+    : Placer(graph, function_name, &graph->flib_def(), devices, nullptr, true,
+             false) {}
 
 Placer::~Placer() {}
 
@@ -106,8 +111,9 @@
     }
   }
 
-  ColocationGraph colocation_graph(graph_, flib_def_, devices_, default_device_,
-                                   allow_soft_placement_,
+  FunctionStack stack(function_name_);
+  ColocationGraph colocation_graph(graph_, stack, flib_def_, devices_,
+                                   default_device_, allow_soft_placement_,
                                    log_device_placement_);
 
   TF_RETURN_IF_ERROR(colocation_graph.Initialize());
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index 46f0584..592f08f 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -57,6 +57,9 @@
   // Creates an instance of the Placer algorithm for the given
   // Graph "graph" (nodes in which may or may not be assigned) on the
   // given DeviceSet "devices".
+  // "function_name" should be set to the name of the function whose body is
+  // represented by "graph". If "graph" is not representing a function body,
+  // "function_name" should be empty.
   //
   // If non-null, default_device is used where possible as a placement for nodes
   // which do not have a device specified, ahead of other devices which would
@@ -64,13 +67,15 @@
   //
   // The "graph", "devices", and "default_device" pointer arguments are borrowed
   // by this Placer, and must outlive it.
-  Placer(Graph* graph, const FunctionLibraryDefinition* flib_def,
-         const DeviceSet* devices, const Device* default_device,
-         bool allow_soft_placement, bool log_device_placement);
+  Placer(Graph* graph, const string& function_name,
+         const FunctionLibraryDefinition* flib_def, const DeviceSet* devices,
+         const Device* default_device, bool allow_soft_placement,
+         bool log_device_placement);
 
-  Placer(Graph* graph, const DeviceSet* devices, const Device* default_device);
+  Placer(Graph* graph, const string& function_name, const DeviceSet* devices,
+         const Device* default_device);
 
-  Placer(Graph* graph, const DeviceSet* devices);
+  Placer(Graph* graph, const string& function_name, const DeviceSet* devices);
 
   ~Placer();
 
@@ -87,7 +92,8 @@
   bool CanAssignToDevice(const string& candidate_device_name,
                          const std::vector<Device*>& devices) const;
 
-  Graph* const graph_;                               // Not owned.
+  Graph* const graph_;  // Not owned.
+  const string function_name_;
   const FunctionLibraryDefinition* const flib_def_;  // Not owned.
   const DeviceSet* const devices_;                   // Not owned.
   const Device* default_device_;                     // Not owned.
diff --git a/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.cc b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.cc
new file mode 100644
index 0000000..d483558
--- /dev/null
+++ b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.cc
@@ -0,0 +1,312 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace {
+
+// Returns true iff `node` is a PartitionedCall or StatefulPartitionedCall op.
+bool IsFunctionCall(const Node& node) {
+  // TODO(iga): Handle non-PCO functions when we add multi-device support to
+  // regular function calls. GetFunctionDefAndAttrs reads the function name
+  // from the `f` attribute of the node, so it will need to change as well.
+  const string& op_name = node.op_def().name();
+  return op_name == "StatefulPartitionedCall" || op_name == "PartitionedCall";
+}
+
+// Writes `value` to `*is_deep` and to the `cache` entry indexed by node id.
+Status Set(const Node& node, bool value, bool* is_deep,
+           std::vector<absl::optional<bool>>* cache) {
+  (*cache)[node.id()] = value;
+  *is_deep = value;
+  return Status::OK();
+}
+
+}  // namespace
+
+PlacerInspectionRequiredOpChecker::PlacerInspectionRequiredOpChecker(
+    const Graph* graph)
+    : PlacerInspectionRequiredOpChecker(graph, &graph->flib_def()) {}
+
+// Sizes the cache up front: one empty slot per node id currently in `graph`.
+// Neither `graph` nor `flib_def` may be null; both are stored as references.
+PlacerInspectionRequiredOpChecker::PlacerInspectionRequiredOpChecker(
+    const Graph* graph, const FunctionLibraryDefinition* flib_def)
+    : graph_(*graph), flib_def_(*flib_def), cache_(graph->num_node_ids()) {}
+
+Status PlacerInspectionRequiredOpChecker::IsPlacerInspectionRequired(
+    const Node& node, bool* is_deep) {
+  // Serve repeated queries for the same node from the cache.
+  const absl::optional<bool>& cached = cache_[node.id()];
+  if (cached.has_value()) {
+    *is_deep = *cached;
+    return Status::OK();
+  }
+  if (!IsFunctionCall(node)) {
+    return Set(node, false, is_deep, &cache_);
+  }
+
+  // Resolve the callee, then check whether any of its outputs is a resource.
+  const FunctionDef* fdef;
+  NameAttrList func;
+  TF_RETURN_IF_ERROR(GetFunctionDefAndAttrs(flib_def_, node, &fdef, &func));
+  DataTypeVector types;
+  TF_RETURN_IF_ERROR(
+      OutputTypesForNode(AttrSlice(&func.attr()), fdef->signature(), &types));
+  bool returns_resource = false;
+  for (DataType type : types) returns_resource |= (type == DT_RESOURCE);
+  return Set(node, returns_resource, is_deep, &cache_);
+}
+
+// Reads the "f" attribute of `node` into `*func` and resolves the function it
+// names in `flib_def`, storing the definition in `*fdef`.
+Status GetFunctionDefAndAttrs(const FunctionLibraryDefinition& flib_def,
+                              const Node& node, const FunctionDef** fdef,
+                              NameAttrList* func) {
+  TF_RETURN_IF_ERROR(GetNodeAttr(node.def(), "f", func));
+  *fdef = flib_def.Find(func->name());
+  if (*fdef != nullptr) return Status::OK();
+  // Unknown function: the error carries a dump of the whole library.
+  return errors::InvalidArgument(
+      "Failed to find function \"", func->name(),
+      "\" in function library: ", flib_def.ToProto().DebugString());
+}
+
+FunctionStack::FunctionStack(const string& function_name)
+    : current_function_name_(function_name) {}
+
+FunctionStack FunctionStack::Push(const Node* node_in_current_function,
+                                  const string& new_current_function) const {
+  // The caller (this stack's current function) becomes the innermost frame.
+  FunctionStack next(new_current_function);
+  next.frames_ = frames_;
+  next.frames_.emplace_back(current_function_name_, node_in_current_function);
+  return next;
+}
+
+// A function is on the stack if it is the one currently being inspected or
+// if it appears in any of the caller frames recorded by Push().
+bool FunctionStack::HasFunction(const string& function_name) const {
+  // The function currently being inspected counts as being on the stack.
+  bool found = (current_function_name_ == function_name);
+  // Otherwise scan the caller frames, stopping at the first match.
+  for (size_t i = 0; !found && i < frames_.size(); ++i) {
+    found = (frames_[i].function_name == function_name);
+  }
+  return found;
+}
+
+string FunctionStack::FormatForError() const {
+  std::vector<string> msgs;
+  // size_t index avoids a signed/unsigned comparison with frames_.size().
+  for (size_t i = 0; i < frames_.size(); ++i) {
+    const Frame& frame = frames_[i];
+    const string node_str = FormatNodeForError(*frame.node);
+    if (frame.function_name.empty()) {
+      // An empty name should only occur for the top-level frame (i == 0).
+      msgs.push_back(absl::StrCat("Graph contains node ", node_str));
+    } else {
+      msgs.push_back(absl::StrCat(
+          "Function ", errors::FormatFunctionForError(frame.function_name),
+          " contains node ", node_str));
+    }
+    // Callee: the next frame's function, or the current one for the last.
+    const string& callee = (i + 1 < frames_.size())
+                               ? frames_[i + 1].function_name
+                               : current_function_name_;
+    msgs.push_back(absl::StrCat("Node ", node_str, " calls function ",
+                                errors::FormatFunctionForError(callee)));
+  }
+  return absl::StrJoin(msgs, "\n  ");
+}
+
+namespace {
+
+using OutputEdgeMap = std::vector<std::vector<const Edge*>>;
+
+constexpr char kIdentityOp[] = "Identity";
+
+// Returns the first of `name`, `name`_0, `name`_1, ... that is not yet in
+// `node_names` (where `name` is `candidate_name`), and records it there.
+string Uniquify(const string& candidate_name,
+                std::unordered_set<string>* node_names) {
+  // insert().second is true only for new names: one lookup per candidate.
+  if (node_names->insert(candidate_name).second) {
+    return candidate_name;
+  }
+  for (int counter = 0;; ++counter) {
+    string candidate = absl::StrCat(candidate_name, "_", counter);
+    if (node_names->insert(candidate).second) {
+      return candidate;
+    }
+  }
+}
+
+Status AddInputIdentity(Node* node, int input_idx, Graph* graph,
+                        std::unordered_set<string>* node_names) {
+  const Edge* edge;
+  TF_RETURN_IF_ERROR(node->input_edge(input_idx, &edge));
+  // Name the identity "<src name>_<node name>", uniquified via `node_names`.
+  string identity_name = Uniquify(
+      absl::StrCat(edge->src()->name(), "_", node->name()), node_names);
+  // Build an Identity NodeDef that reads the source output feeding this input.
+  NodeDefBuilder builder(identity_name, kIdentityOp);
+  builder.Attr("T", node->input_type(input_idx));
+  NodeDefBuilder::NodeOut input(edge->src()->name(), edge->src_output(),
+                                node->input_type(input_idx));
+  builder.Input(input);
+  NodeDef identity_def;
+  TF_RETURN_IF_ERROR(builder.Finalize(&identity_def));
+  MergeDebugInfo(NodeDebugInfo(*node), &identity_def);
+
+  VLOG(6) << "Adding identity into " << edge->src()->name() << ":"
+          << edge->src_output() << " -> " << edge->dst()->name() << ":"
+          << input_idx << " :" << identity_def.DebugString();
+  // Add the identity to the graph and feed it from the original source.
+  Status status;
+  Node* identity_node = graph->AddNode(identity_def, &status);
+  if (!status.ok()) {
+    return status;
+  }
+  graph->AddEdge(edge->src(), edge->src_output(), identity_node, 0);
+
+  // Replace node's `input_idx` input with the new identity's 0'th output.
+  TF_RETURN_IF_ERROR(graph->UpdateEdge(identity_node, 0, node, input_idx));
+
+  VLOG(6) << "Successfully inserted identity. Modified node: "
+          << node->DebugString();
+  return Status::OK();
+}
+
+Status AddOutputIdentities(Node* node, Graph* graph,
+                           std::unordered_set<string>* node_names) {
+  auto add_identity = [&](int src_output, const string& identity_name,
+                          Node** identity_node) {
+    NodeDefBuilder builder(identity_name, kIdentityOp);
+    builder.Attr("T", node->output_type(src_output));
+    NodeDefBuilder::NodeOut input(node->name(), src_output,
+                                  node->output_type(src_output));
+    builder.Input(input);
+    NodeDef identity_def;
+    TF_RETURN_IF_ERROR(builder.Finalize(&identity_def));
+    MergeDebugInfo(NodeDebugInfo(*node), &identity_def);
+    // Create the Identity node and connect `node`:src_output to its input 0.
+    Status status;
+    *identity_node = graph->AddNode(identity_def, &status);
+    if (!status.ok()) {
+      return status;
+    }
+    graph->AddEdge(node, src_output, *identity_node, 0);
+    return Status::OK();
+  };
+
+  // output_used[i] == true iff `node`'s i'th output is consumed by at least
+  // one data (non-control) edge in this graph.
+  std::vector<bool> output_used(node->num_outputs(), false);
+  // Copy the set of edges since EdgeSet does not allow modifications
+  // to graph edges during iteration.
+  const EdgeSet& out_edges = node->out_edges();
+  std::unordered_set<const Edge*> edge_set(out_edges.begin(), out_edges.end());
+  for (const Edge* edge : edge_set) {
+    if (edge->IsControlEdge()) {
+      continue;
+    }
+    output_used[edge->src_output()] = true;
+    // NOTE(review): edge_set order is unspecified; name suffixes may vary.
+    Node* dst = edge->dst();
+    int dst_input = edge->dst_input();
+    int src_output = edge->src_output();
+    string identity_name =
+        Uniquify(absl::StrCat(node->name(), "_", dst->name()), node_names);
+    Node* identity_node;
+    TF_RETURN_IF_ERROR(add_identity(src_output, identity_name, &identity_node));
+    VLOG(6) << "Adding identity into " << node->name() << ":" << src_output
+            << " -> " << dst->name() << ":" << dst_input << ": "
+            << identity_node->DebugString();
+
+    // Make original dst node consume the new identity's output instead of
+    // `node`'s output.
+    TF_RETURN_IF_ERROR(graph->UpdateEdge(identity_node, 0, dst, dst_input));
+  }
+
+  for (int output_idx = 0; output_idx < node->num_outputs(); ++output_idx) {
+    if (output_used[output_idx]) {
+      continue;
+    }
+    // No data edge consumes this output. Add an identity consuming it so that
+    // every output of `node` is followed by an Identity.
+    string identity_name = Uniquify(node->name(), node_names);
+    Node* identity_node;
+    TF_RETURN_IF_ERROR(add_identity(output_idx, identity_name, &identity_node));
+    VLOG(6) << "Added identity into " << node->name() << ":" << output_idx
+            << " -> <no consumer>: " << identity_node->DebugString();
+  }
+  return Status::OK();
+}
+
+// Inserts an Identity on every input and output of `node`. `node_names` must
+// hold the name of every node in `graph`; it is updated with the new names.
+Status IsolateNode(Node* node, Graph* graph,
+                   std::unordered_set<string>* node_names) {
+  for (int i = 0; i < node->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(AddInputIdentity(node, i, graph, node_names));
+  }
+  return AddOutputIdentities(node, graph, node_names);
+}
+
+}  // namespace
+
+Status IsolatePlacerInspectionRequiredOps(
+    const FunctionLibraryDefinition& flib_def, Graph* graph) {
+  PlacerInspectionRequiredOpChecker checker(graph, &flib_def);
+  // `node_names` lets us pick unique names. We don't use graph->NewName()
+  // because it produces verbose names and does not actually ensure that they
+  // are unique (it assumes all names are generated using it, which is not true
+  // today). The set is filled lazily, on the first node that needs isolating,
+  // and then shared by later nodes instead of being rebuilt for each one.
+  std::unordered_set<string> node_names;
+  // It is OK to add nodes to the graph during iteration: new nodes get ids
+  // above the current ones, and op_nodes() iterates by node id over current
+  // nodes only. For the same reason the checker's cache keeps working as new
+  // nodes are added.
+  for (Node* node : graph->op_nodes()) {
+    bool should_be_isolated = false;
+    TF_RETURN_IF_ERROR(
+        checker.IsPlacerInspectionRequired(*node, &should_be_isolated));
+    if (!should_be_isolated) {
+      continue;
+    }
+    if (node_names.empty()) {
+      node_names.reserve(graph->num_nodes());
+      for (Node* n : graph->nodes()) node_names.insert(n->name());
+    }
+    TF_RETURN_IF_ERROR(IsolateNode(node, graph, &node_names));
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h
new file mode 100644
index 0000000..68f9543
--- /dev/null
+++ b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h
@@ -0,0 +1,158 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_
+
+// Operations calling functions are becoming ubiquitous in TF 2.0.
+// Examples include PartitionedCallOp, functional If/While, and Dataset ops.
+// Such operations might require deep inspection - looking at the body of the
+// called function - to place them and surrounding ops correctly.
+
+// This file contains some utilities for placer to correctly place such ops
+// including:
+// - PlacerInspectionRequiredOpChecker: A simple class with a single
+// IsPlacerInspectionRequired method.
+// - IsolatePlacerInspectionRequiredOps: This function adds Identity ops for
+// each input/output of ops requiring placer inspection. It greatly simplifies
+// the implementation of placing such ops.
+
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// PlacerInspectionRequiredOpChecker allows one to check if Placer needs to
+// look deeply into the op to place ops consuming the outputs correctly.
+//
+// It is a class instead of a standalone method because checking whether
+// a function returns a resource takes non-trivial time and we cache the
+// results.
+class PlacerInspectionRequiredOpChecker {
+ public:
+  // Calls the constructor below with flib_def = &graph->flib_def().
+  explicit PlacerInspectionRequiredOpChecker(const Graph* graph);
+  // Constructs a PlacerInspectionRequiredOpChecker for nodes of `graph`.
+  // The functions referenced by nodes in `graph` will be looked up in
+  // `flib_def`. Both are kept by reference and must outlive this checker.
+  PlacerInspectionRequiredOpChecker(const Graph* graph,
+                                    const FunctionLibraryDefinition* flib_def);
+
+  // Sets `*is_deep` to true if `node` is considered a deep op and to false
+  // otherwise, then returns Status::OK(). If an error occurs, returns that
+  // error, and the value of `*is_deep` is undefined.
+  // Currently, an op is considered deep if it is calling a function
+  // returning a resource. This definition is driven by Placer's need to
+  // look inside the op.
+  // REQUIRES: `node` is part of `graph` passed into constructor.
+  Status IsPlacerInspectionRequired(const Node& node, bool* is_deep);
+
+ private:
+  const Graph& graph_;
+  const FunctionLibraryDefinition& flib_def_;
+  // Indexed by the node id.
+  // If cache_[node_id] is empty, the deepness of the node with id `node_id` has
+  // not been computed yet. Else, it contains the value already computed.
+  std::vector<absl::optional<bool>> cache_;
+};
+
+// Extracts `fdef` and `func` from `flib_def` for the function identified
+// in "f" attribute of `node`.
+Status GetFunctionDefAndAttrs(const FunctionLibraryDefinition& flib_def,
+                              const Node& node, const FunctionDef** fdef,
+                              NameAttrList* func);
+
+// The "call" stack of functions.
+// Useful for better error messages as well as for detecting recursion.
+// Stores references to graph nodes. These references must outlive this.
+class FunctionStack {
+ public:
+  explicit FunctionStack(const string& function_name);
+  // Returns a copy of this stack with `new_current_function` pushed on top.
+  // `node_in_current_function` must outlive this.
+  FunctionStack Push(const Node* node_in_current_function,
+                     const string& new_current_function) const;
+
+  // Returns true iff this stack already includes `function_name`.
+  bool HasFunction(const string& function_name) const;
+
+  const string& current_function_name() const { return current_function_name_; }
+
+  // Formats this stack in a form suitable for error interpolation that
+  // retrieves Python files and line numbers.
+  string FormatForError() const;
+
+ private:
+  struct Frame {
+    Frame(const string& function, const Node* node)
+        : function_name(function), node(node) {}
+    // `node` is the calling node inside `function_name`. Not owned.
+    string function_name;
+    const Node* node;
+  };
+
+  // The function at the top of the stack. In other words, the function
+  // that is currently being inspected for placement.
+  string current_function_name_;
+
+  // The stack of frames that got the placement to the current_function_name_.
+  // frames_[0].function_name is the top function that Placer was constructed
+  // with. frames_[0].function_name can be empty if placer was constructed with
+  // a nameless graph, not a function. frames_[0].node is the node in
+  // frames_[0].function_name that required deep inspection (e.g. a
+  // PartitionedCallOp). The function that this node invoked is
+  // frames_[1].function_name, if frames_.size() > 1. Else, the function that
+  // this node invoked is current_function_name_.
+  std::vector<Frame> frames_;
+};
+
+// Adds Identities for each input and output of the ops in `graph` that require
+// placer inspection (function-calling ops that return a resource).
+// For example, the following graph calling a function on inputs `a` and `b`
+// and producing output `y` will be rewritten to include identities on all
+// edges:
+//
+//      a             b
+//      |             |
+//      v             v
+//    f (PartitionedCallOp)
+//         |
+//         v
+//         y
+//
+// is transformed to
+//
+//      a             b
+//      |             |
+//  a_f (Identity)   b_f (Identity)
+//      |             |
+//      v             v
+//    f (PartitionedCallOp)
+//         |
+//      f_y (Identity)
+//         |
+//         v
+//         y
+//
+Status IsolatePlacerInspectionRequiredOps(
+    const FunctionLibraryDefinition& flib_def, Graph* graph);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_
diff --git a/tensorflow/core/common_runtime/placer_inspection_required_ops_utils_test.cc b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils_test.cc
new file mode 100644
index 0000000..b7d2971
--- /dev/null
+++ b/tensorflow/core/common_runtime/placer_inspection_required_ops_utils_test.cc
@@ -0,0 +1,137 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
+
+#include <map>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+using FDH = ::tensorflow::FunctionDefHelper;
+
+// Verifies `deep_nodes` (name -> is_deep); void so TF_ASSERT_OK can be used.
+void VerifyPlacerInspectionRequiredOps(const GraphDef& graph_def,
+                                       std::map<string, bool> deep_nodes) {
+  Graph graph(OpRegistry::Global());
+  GraphConstructorOptions opts;
+  TF_ASSERT_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
+  PlacerInspectionRequiredOpChecker checker(&graph);
+  std::unordered_map<string, Node*> node_map = graph.BuildNodeNameIndex();
+  for (const auto& entry : deep_nodes) {
+    const Node* node = node_map[entry.first];
+    ASSERT_NE(node, nullptr) << "Failed to find node " << entry.first
+                             << " in the graph " << graph_def.DebugString();
+    const bool expected_is_deep = entry.second;
+    bool actual_is_deep = false;  // Defined even if the call below fails.
+    TF_EXPECT_OK(checker.IsPlacerInspectionRequired(*node, &actual_is_deep));
+    EXPECT_EQ(expected_is_deep, actual_is_deep)
+        << " Expected is_deep to be " << expected_is_deep << " for node "
+        << entry.first;
+  }
+}
+
+TEST(PlacerInspectionRequiredOpCheckerTest, Basic) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (PartitionedCallOp: ResourceIdentity)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph_def = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"x"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  VerifyPlacerInspectionRequiredOps(graph_def,
+                                    {{"x", false}, {"f", true}, {"y", false}});
+}
+
+TEST(PlacerInspectionRequiredOpCheckerTest, DirectCallsAreNotDeep) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (direct function call to ResourceIdentity)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_RESOURCE)
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph_def = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "ResourceIdentity", {"x"}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  VerifyPlacerInspectionRequiredOps(graph_def,
+                                    {{"x", false}, {"f", false}, {"y", false}});
+}
+
+TEST(PlacerInspectionRequiredOpCheckerTest,
+     FunctionsNotReturningResourcesAreNotDeep) {
+  /*
+   *                x (_Arg, DT_RESOURCE)
+   *                   |
+   *                   v
+   *                f (PartitionedCallOp: ReadResourceVariable)
+   *                   |
+   *                   v
+   *                y (_Retval, DT_FLOAT)
+   */
+  FunctionDef func = test::function::ReadResourceVariable();
+  GraphDef graph_def = GDef(
+      {
+          NDef("x", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("f", "PartitionedCall", {"x"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("ReadResourceVariable", {})}}),
+          NDef("y", "_Retval", {"f:0"}, {{"T", DT_FLOAT}}),
+      },
+      // FunctionLib
+      {func});
+
+  VerifyPlacerInspectionRequiredOps(graph_def,
+                                    {{"x", false}, {"f", false}, {"y", false}});
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index bee80c9..0a4312f 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -24,6 +24,7 @@
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -42,6 +43,8 @@
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 
@@ -201,18 +204,20 @@
 ////////////////////////////////////////////////////////////////////////////////
 class PlacerTest : public ::testing::Test {
  protected:
-  PlacerTest() {
-    // Build a set of 10 GPU and 10 CPU devices.
+  PlacerTest() : PlacerTest(10) {}
+
+  explicit PlacerTest(int num_devices) {
+    // Build a set of num_devices GPU and num_devices CPU devices.
     // NOTE: this->local_devices_ owns the device objects;
     // this->devices_ contains borrowed pointers to the device
     // objects.
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < num_devices; ++i) {
       local_devices_.emplace_back(FakeDevice::MakeCPU(
           strings::StrCat("/job:a/replica:0/task:0/device:FakeCPU:", i)));
       devices_.AddDevice(local_devices_.back().get());
       // Insert the GPUs in reverse order.
-      local_devices_.emplace_back(FakeDevice::MakeGPU(
-          strings::StrCat("/job:a/replica:0/task:0/device:FakeGPU:", 9 - i)));
+      local_devices_.emplace_back(FakeDevice::MakeGPU(strings::StrCat(
+          "/job:a/replica:0/task:0/device:FakeGPU:", num_devices - 1 - i)));
       devices_.AddDevice(local_devices_.back().get());
     }
   }
@@ -238,7 +243,44 @@
   // REQUIRES: "*graph" was produced by the most recent call to BuildGraph.
   Status Place(Graph* graph, DeviceSet* devices, bool allow_soft_placement,
                bool log_device_placement) {
-    Placer placer(graph, &graph->flib_def(), devices, nullptr,
+    Placer placer(graph, "", &graph->flib_def(), devices, nullptr,
+                  allow_soft_placement, log_device_placement);
+    return placer.Run();
+  }
+
+  Status CallOptPassesAndPlace(Graph* graph, DeviceSet* devices,
+                               bool allow_soft_placement,
+                               bool log_device_placement) {
+    // Disable all real optimizations (i.e. Grappler and GraphOptimizer)
+    // to make sure functions are not inlined and not constant folded.
+    SessionOptions session_options;
+    GraphOptions* graph_opts = session_options.config.mutable_graph_options();
+    OptimizerOptions* optimizer_opts = graph_opts->mutable_optimizer_options();
+    optimizer_opts->set_opt_level(OptimizerOptions::L0);
+    optimizer_opts->set_global_jit_level(OptimizerOptions::OFF);
+    RewriterConfig* rewriter_config = graph_opts->mutable_rewrite_options();
+    rewriter_config->set_disable_meta_optimizer(true);
+
+    // Placing nested functions requires running the PRE_PLACEMENT passes
+    // first. Currently, that is just the IsolateDeepOpsPass.
+    GraphOptimizationPassOptions optimization_options;
+    std::unique_ptr<Graph> graph_ptr(graph);  // Not owned; released below.
+    optimization_options.graph = &graph_ptr;
+    FunctionLibraryDefinition flib_def(graph->flib_def());
+    optimization_options.flib_def = &flib_def;
+    optimization_options.device_set = devices;
+    optimization_options.session_options = &session_options;
+    Status s = OptimizationPassRegistry::Global()->RunGrouping(
+        OptimizationPassRegistry::PRE_PLACEMENT, optimization_options);
+    if (!s.ok()) {
+      graph_ptr.release();
+      return s;
+    }
+    graph = graph_ptr.release();
+
+    RebuildNodeNameMap(*graph);
+
+    Placer placer(graph, "", &graph->flib_def(), devices, nullptr,
                   allow_soft_placement, log_device_placement);
     return placer.Run();
   }
@@ -254,6 +296,16 @@
 
   Status Place(Graph* graph) { return Place(graph, &devices_, true, false); }
 
+  Status CallOptPassesAndPlace(Graph* graph, bool allow_soft_placement,
+                               bool log_device_placement) {
+    return CallOptPassesAndPlace(graph, &devices_, allow_soft_placement,
+                                 log_device_placement);
+  }
+
+  Status CallOptPassesAndPlace(Graph* graph) {
+    return CallOptPassesAndPlace(graph, &devices_, true, false);
+  }
+
   // Returns the node in "graph" with the given name.
   //
   // REQUIRES: "graph" was produced by the most recent call to BuildGraph.
@@ -326,7 +378,7 @@
                 .device_type())
 
 #define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \
-  EXPECT_TRUE(::tensorflow::str_util::StrContains(     \
+  EXPECT_TRUE(absl::StrContains(                       \
       GetNodeByName((g), (name))->assigned_device_name(), device_substr))
 
 // Test that a graph with no constraints will successfully assign nodes to the
@@ -813,7 +865,7 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code()) << s.ToString();
-  EXPECT_TRUE(str_util::StrContains(
+  EXPECT_TRUE(absl::StrContains(
       s.error_message(),
       "Assigned device '/job:a/replica:0/task:0/device:FakeGPU:0' "
       "does not have registered OpKernel support for TestInput"))
@@ -866,14 +918,14 @@
   {
     Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", "FakeGPU"));
   {
     Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", "FakeGPU"));
@@ -969,7 +1021,7 @@
     Status s = Place(&g, allow_soft_placement, true);
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
     if (set_assigned) {
-      EXPECT_TRUE(str_util::StrContains(
+      EXPECT_TRUE(absl::StrContains(
           s.error_message(),
           "Cannot place the graph because a reference or resource edge "
           "connects "
@@ -978,7 +1030,7 @@
           "/job:a/replica:0/task:0/device:FakeCPU:0"))
           << s.ToString();
     } else {
-      EXPECT_TRUE(str_util::StrContains(
+      EXPECT_TRUE(absl::StrContains(
           s.error_message(),
           "Cannot place the graph because a reference or resource edge "
           "connects "
@@ -1152,7 +1204,7 @@
     EXPECT_DEVICE_TYPE(g, "colocated_1", "FakeCPU");
     EXPECT_DEVICE_TYPE(g, "foo", "FakeGPU");
   } else {
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(),
         "Cannot colocate nodes {{colocation_node foo}} and "
         "{{colocation_node in}} because no device type supports both of those "
@@ -1225,7 +1277,7 @@
     EXPECT_EQ(error::OK, s.code()) << s.ToString();
   } else {
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(),
         "Cannot colocate nodes {{colocation_node assign3}} and "
         "{{colocation_node var2}} because no device type supports both of "
@@ -1291,7 +1343,7 @@
 
   Status s = Place(&g, &empty);
   EXPECT_TRUE(
-      str_util::StrContains(s.error_message(), "No devices are registered"));
+      absl::StrContains(s.error_message(), "No devices are registered"));
 }
 
 // Test that placement fails when the requested device forces an
@@ -1316,17 +1368,16 @@
   heterogeneous.AddDevice(cpu.get());
   Status s = Place(&g, &heterogeneous);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      str_util::StrContains(s.error_message(),
-                            "colocated with a group of nodes that required "
-                            "incompatible device"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "colocated with a group of nodes that required "
+                                "incompatible device"));
 
   // The error message should contain information that indicates which
   // op types have which registered device types.
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
       << s;
   EXPECT_TRUE(
-      str_util::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
+      absl::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
       << s;
 }
 
@@ -1341,7 +1392,7 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -1356,7 +1407,7 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -1373,7 +1424,7 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(str_util::StrContains(
+  EXPECT_TRUE(absl::StrContains(
       s.error_message(),
       "Assigned device '/job:foo' does not match any device"));
 }
@@ -1390,12 +1441,10 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      str_util::StrContains(s.error_message(),
-                            "No OpKernel was registered to support Op "
-                            "'VariableNoKernels' used by {{node var}}"));
-  EXPECT_TRUE(
-      str_util::StrContains(s.error_message(), "<no registered kernels>"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "No OpKernel was registered to support Op "
+                                "'VariableNoKernels' used by {{node var}}"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "<no registered kernels>"));
 }
 
 // Test that placement fails when a kernel is registered but no known
@@ -1415,10 +1464,10 @@
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(),
-                                    "No OpKernel was registered to support Op "
-                                    "'VariableGPU' used by {{node var}}"));
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "No OpKernel was registered to support Op "
+                                "'VariableGPU' used by {{node var}}"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
 // Test that placement fails when a requested device is malformed.
@@ -1432,8 +1481,8 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(), "Malformed device specification '/foo:bar'"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "Malformed device specification '/foo:bar'"));
 }
 
 // Test that placement fails when a previously-assigned device is malformed.
@@ -1449,8 +1498,8 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(),
-                                    "Malformed assigned device '/foo:bar'"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "Malformed assigned device '/foo:bar'"));
 }
 
 // Test that placement fails when a device was previously assigned to
@@ -1467,7 +1516,7 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(str_util::StrContains(
+  EXPECT_TRUE(absl::StrContains(
       s.error_message(), "Assigned device '/job:a' does not match any device"));
 }
 
@@ -1499,7 +1548,7 @@
 
   Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:FakeGPU:11"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "/device:FakeGPU:11"));
 }
 
 // Test that the "Cannot assign a device" error message contains a format tag
@@ -1516,9 +1565,9 @@
   Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(str_util::StrContains(s.error_message(),
-                                    "Cannot assign a device for operation in"));
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "{{node in}}"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "Cannot assign a device for operation in"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "{{node in}}"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1534,11 +1583,11 @@
 
   Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
-  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:FakeCPU:0"))
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "/device:FakeCPU:0"))
       << s.ToString();
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "no supported kernel for FakeCPU devices is available"))
+  EXPECT_TRUE(
+      absl::StrContains(s.error_message(),
+                        "no supported kernel for FakeCPU devices is available"))
       << s.ToString();
 }
 
@@ -1556,10 +1605,9 @@
   Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(str_util::StrContains(
+  EXPECT_TRUE(absl::StrContains(
       s.error_message(), "was explicitly assigned to /job:foo/replica:17"));
-  EXPECT_TRUE(
-      str_util::StrContains(s.error_message(), "but available devices"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(), "but available devices"));
 }
 
 #if !GOOGLE_CUDA
@@ -1577,7 +1625,7 @@
   Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(str_util::StrContains(
+  EXPECT_TRUE(absl::StrContains(
       s.error_message(),
       "The requested device appears to be a GPU, but CUDA is not enabled."));
 }
@@ -1642,9 +1690,9 @@
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(s.error_message(),
-                                    "Cannot colocate nodes {{colocation_node "
-                                    "var}} and {{colocation_node assign}}"));
+  EXPECT_TRUE(absl::StrContains(s.error_message(),
+                                "Cannot colocate nodes {{colocation_node "
+                                "var}} and {{colocation_node assign}}"));
 }
 
 // Test that a generator node follows its consumers (where there are several
@@ -1769,7 +1817,7 @@
     EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
   } else {
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(),
         "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
         "id1}}: Cannot merge devices with incompatible types: "
@@ -1833,10 +1881,10 @@
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
   EXPECT_TRUE(
-      str_util::StrContains(s.error_message(),
-                            "{{colocation_node iter}} was colocated with a "
-                            "group of nodes that required incompatible device "
-                            "'/job:a/replica:0/task:0/device:FakeCPU:0'"))
+      absl::StrContains(s.error_message(),
+                        "{{colocation_node iter}} was colocated with a "
+                        "group of nodes that required incompatible device "
+                        "'/job:a/replica:0/task:0/device:FakeCPU:0'"))
       << s.ToString();
 }
 
@@ -1884,7 +1932,7 @@
     EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
   } else {
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(str_util::StrContains(
+    EXPECT_TRUE(absl::StrContains(
         s.error_message(),
         "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
         "id1}}: Cannot merge devices with incompatible types: "
@@ -1894,5 +1942,1003 @@
   }
 }
 
+// Fixture for tests that place graphs containing function calls,
+// particularly the case where the called functions return resources.
+class NestedPlacerTest : public PlacerTest {
+ public:
+  // Create one FakeCPU and one FakeGPU. These tests don't need multiple devices
+  // of the same type.
+  NestedPlacerTest() : PlacerTest(1) {}
+};
+
+TEST_F(NestedPlacerTest, OutputOneResource) {
+  /*
+   *                a:FLOAT:GPU
+   *                 |  b:RESOURCE:CPU
+   *                 |   |
+   *                 v   v
+   *                  PCO
+   *                 |   \
+   *                 |   v
+   *                 v   r2:FLOAT
+   *                 r1:RESOURCE
+   *
+   * PartitionedCallOp (PCO) should be placed on GPU even though it
+   * takes a CPU resource as input. The resource output should be placed
+   * on CPU since it is the same resource as the input one.
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_FLOAT}}, kGPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+                {"f", FDH::FunctionRef("ResourceOutput", {})}}),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_FLOAT}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, OutputOneResource_ExtraIdentities) {
+  /*
+   *                a:FLOAT
+   *                 |  b:RESOURCE
+   *                 |   |
+   *              ai:GPU |
+   *                 |  bi:CPU
+   *                 |   |
+   *                 v   v
+   *                  PCO
+   *                 |   \
+   *                 |   v
+   *                 v   r2:FLOAT
+   *                 r1:RESOURCE
+   *
+   * Same as above except that devices are requested on identities, not on
+   * resource generating ops.
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_FLOAT}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("ai", "Identity", {"a"}, {{"T", DT_FLOAT}}, kGPU),
+          NDef("bi", "Identity", {"b"}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"ai", "bi"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+                {"f", FDH::FunctionRef("ResourceOutput", {})}}),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_FLOAT}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "ai", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "bi", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, OutputOneResource_OverrideOutputResourceDevice) {
+  /*
+   *                a:FLOAT:GPU
+   *                 |  b:RESOURCE:CPU
+   *                 |   |
+   *                 v   v
+   *                  PCO
+   *                 |   \
+   *                 |   v
+   *                 v   r2:FLOAT
+   *                 r1:RESOURCE:GPU
+   *
+   * Same as OutputOneResource except r1 is wrongly assigned on GPU. Placer
+   * will override this device assignment.
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_FLOAT}}, kGPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+                {"f", FDH::FunctionRef("ResourceOutput", {})}}),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_FLOAT}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, OutputTwoResources) {
+  /*
+   *                a:RESOURCE:CPU
+   *                 |  b:RESOURCE:GPU
+   *                 |   |
+   *                 v   v
+   *                  PCO (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE
+   *                 r1:RESOURCE
+   *
+   * Ops consuming output resources should be placed on correct devices.
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeCPU");
+}
+
+TEST_F(NestedPlacerTest, OutputTwoResources_PCOOnCPU) {
+  /*
+   *                a:RESOURCE:CPU
+   *                 |  b:RESOURCE:GPU
+   *                 |   |
+   *                 v   v
+   *                  PCO:CPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE
+   *                 r1:RESOURCE
+   *
+   * Ops consuming output resources should be placed on correct devices, even
+   * when PCO is explicitly placed.
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kCPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "y", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeCPU");
+}
+
+TEST_F(NestedPlacerTest, OutputTwoResources_UnassignedResource) {
+  /*
+   *                a:RESOURCE
+   *                 |  b:RESOURCE:GPU
+   *                 |   |
+   *                 v   v
+   *                  PCO:CPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE
+   *                 r1:RESOURCE
+   *
+   * Resource input `a` is not explicitly assigned. Placer leaves `a` and `b` to
+   * the "second pass" as they are "sources". It assigns `r1` to GPU because it
+   * is in the same group as `b`. It assigns `r2` to GPU because GPU has a
+   * higher device preference. Finally, `a` is assigned to GPU because `r2` is
+   * on GPU - this tests that the "second pass" heuristics respect colocation
+   * groups (even when the consumer of the source, i.e. PCO is on a different
+   * device).
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kCPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "b", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, OutputTwoResources_UnassignedResource_CPU) {
+  /*
+   *                a:RESOURCE
+   *                 |  b:RESOURCE:CPU
+   *                 |   |
+   *                 v   v
+   *                  PCO:CPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE
+   *                 r1:RESOURCE
+   *
+   * Same as above except `b` is on CPU.
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kCPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, OutputResourceConsumedByMultipleOps) {
+  /*
+   *                a:RESOURCE
+   *                 |  b:RESOURCE:CPU
+   *                 |   |
+   *                 v   v
+   *                  PCO (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 |  r3:RESOURCE:GPU
+   *                 |
+   *              ---+---
+   *             |       |
+   *             |   r2:RESOURCE
+   *         r1:RESOURCE
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}}),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r3", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}, kGPU),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r3", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, DuplicateInputResource) {
+  /*
+   *                a:RESOURCE
+   *                  / \
+   *                 |   |
+   *                 v   v
+   *                  PCO:GPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE:CPU
+   *                 r1:RESOURCE
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("y", "PartitionedCall", {"a", "a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kGPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}, kCPU),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeCPU");
+}
+
+TEST_F(NestedPlacerTest, DuplicateInputs_OutputResourceConsumedByMultipleOps) {
+  /*
+   *                a:RESOURCE
+   *                  /  \
+   *                 |   |
+   *                 v   v
+   *                  PCO:GPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 |  r3:RESOURCE
+   *                 |
+   *              ---+---
+   *             |       |
+   *             |   r2:RESOURCE:CPU
+   *         r1:RESOURCE
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("y", "PartitionedCall", {"a", "a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kGPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+          NDef("r2", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("r3", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r2", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r3", "FakeCPU");
+}
+
+TEST_F(NestedPlacerTest, DuplicateInputResource_Conflict) {
+  /*
+   *                a:RESOURCE
+   *                  / \
+   *                 |   |
+   *                 v   v
+   *                  PCO:GPU (simple swap)
+   *                 |   \
+   *                 |   v
+   *                 v   r2:RESOURCE:CPU
+   *                 r1:RESOURCE:GPU
+   *
+   * There is a conflict, but Placer always overrides requested devices
+   * when they result in a conflict due to resource edges. Which device
+   * is picked for a/r1/r2 is nondeterministic.
+   */
+  FunctionDef func = test::function::Swap();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("y", "PartitionedCall", {"a", "a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_RESOURCE}},
+                {"f", FDH::FunctionRef("Swap", {{"T", DT_RESOURCE}})}},
+               kGPU),
+          NDef("r1", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("r2", "Identity", {"y:1"}, {{"T", DT_RESOURCE}}, kCPU),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(CallOptPassesAndPlace(&g, false, true));
+
+  EXPECT_SAME_TYPE(g, "a", "r1");
+  EXPECT_SAME_TYPE(g, "a", "r2");
+}
+
+TEST_F(NestedPlacerTest, TestDstDeviceIsIgnoredWhenConstrainedByResourceEdge) {
+  /*
+   *                a:RESOURCE:CPU
+   *                   |
+   *                   |
+   *                   v
+   *                  PCO (identity)
+   *                   |
+   *                   |
+   *                   v
+   *                r1:RESOURCE:GPU
+   *
+   * r1's requested device will be overridden.
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("r1", "_Retval", {"y:0"}, {{"T", DT_RESOURCE}},
+               kGPU  // This device specification will be overridden
+               ),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_EXPECT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "a", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+}
+
+TEST_F(
+    NestedPlacerTest,
+    TestDstDeviceIsIgnoredWhenConstrainedByResourceEdge_EvenWhenPCOIsPlaced) {
+  /*
+   *                a:RESOURCE:CPU
+   *                   |
+   *                   |
+   *                   v
+   *                  PCO:GPU (identity)
+   *                   |
+   *                   |
+   *                   v
+   *                r1:RESOURCE:GPU
+   *
+   * r1's requested device will be overridden.
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}},
+               kGPU),
+          NDef("r1", "_Retval", {"y:0"}, {{"T", DT_RESOURCE}},
+               kGPU  // This device specification will be overridden
+               ),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  TF_EXPECT_OK(CallOptPassesAndPlace(&g));
+
+  EXPECT_DEVICE_TYPE(g, "r1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "y", "FakeGPU");
+}
+
+TEST_F(NestedPlacerTest, ResourceConflictInvolvingPCO) {
+  /*
+   *                a:RESOURCE:CPU
+   *                   |
+   *                   |
+   *                   v
+   *                  PCO (identity)
+   *                   |
+   *                   |   b:RESOURCE:GPU
+   *                   |    |
+   *                   v    v
+   *                Add:RESOURCE
+   *
+   * Add op cannot be placed because the requested devices are on
+   * resource generating ops and they conflict.
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("add", "Add", {"y:0", "b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Cannot place the graph because a reference or resource edge connects "
+      "colocation groups with incompatible resource devices: /device:FakeCPU:0 "
+      "vs /device:FakeGPU:0"))
+      << s.ToString();
+}
+
+TEST_F(NestedPlacerTest, ResourceConflictInvolvingTwoPCOs) {
+  /*
+   *            a:RESOURCE:CPU
+   *               |
+   *               |          b:RESOURCE:GPU
+   *               |              |
+   *               v              |
+   *            y:PCO (identity)  |
+   *               |              v
+   *                \          z:PCO (identity)
+   *                 \           /
+   *                  \         /
+   *                   v       v
+   *                 Add:RESOURCE
+   *
+   * Add op cannot be placed.
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("z", "PartitionedCall", {"b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("add", "Add", {"y:0", "z:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Cannot place the graph because a reference or resource edge connects "
+      "colocation groups with incompatible resource devices: /device:FakeCPU:0 "
+      "vs /device:FakeGPU:0"))
+      << s.ToString();
+}
+
+// Function that returns a resource that can be produced on CPU only.
+FunctionDef CPUResourceOutput() {
+  return FDH::Create(
+      // Name
+      "CPUResourceOutput",
+      // Args
+      {"x: float"},
+      // Return values
+      {"ds: resource", "x_out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"make_ds"}, "CreateDatasetCPU", {}},
+      },
+      {{"ds", "make_ds:o:0"}, {"x_out", "x"}});
+}
+
+TEST_F(NestedPlacerTest, DeepDeviceConstraintsPropagated) {
+  /*
+   *            a:FLOAT
+   *               |
+   *               v
+   *          PCO (CPUResourceOutput)
+   *               |    |
+   *               |    v
+   *               |  (ignored)
+   *               |
+   *               v
+   *          id:Identity:GPU (assigned)
+   *
+   * The graph cannot be placed because the PCO can produce the resource
+   * on CPU only.
+   */
+  FunctionDef func = CPUResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_FLOAT}}),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_FLOAT}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+                {"f", FDH::FunctionRef("CPUResourceOutput", {})}}),
+          NDef("id", "Identity", {"y:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  GetNodeByName(g, "id")->set_assigned_device_name(kFullGPU);
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  // TODO(b/129057603): When better error messages are implemented, this should
+  // change.
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(), "Could not satisfy explicit device specification"))
+      << s.ToString();
+}
+
+FunctionDef NestedCPUResourceOutput() {
+  return FDH::Create(
+      // Name
+      "NestedCPUResourceOutput",
+      // Args
+      {"x: float"},
+      // Return values
+      {"ds: resource", "x_out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"y"},
+           "PartitionedCall",
+           {"x"},
+           {{"Tin", DataTypeSlice{DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+            {"f", FDH::FunctionRef("CPUResourceOutput", {})}}},
+      },
+      {{"ds", "y:output:0"}, {"x_out", "y:output:1"}});
+}
+
+TEST_F(NestedPlacerTest, NestedDeepDeviceConstraintsPropagated) {
+  /*
+   *            a:FLOAT
+   *               |
+   *               v
+   *          PCO (NestedCPUResourceOutput)
+   *               |    |
+   *               |    v
+   *               |  (ignored)
+   *               |
+   *               v
+   *          id:_Retval:GPU (assigned)
+   *
+   * The graph cannot be placed because the PCO can produce the resource
+   * on CPU only.
+   */
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_FLOAT}}),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_FLOAT}},
+                {"Tout", DataTypeSlice{DT_RESOURCE, DT_FLOAT}},
+                {"f", FDH::FunctionRef("NestedCPUResourceOutput", {})}}),
+          NDef("id", "_Retval", {"y:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {CPUResourceOutput(), NestedCPUResourceOutput()});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+  GetNodeByName(g, "id")->set_assigned_device_name(kFullGPU);
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  // TODO(b/129057603): When better error messages are implemented, this should
+  // change.
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(), "Could not satisfy explicit device specification"))
+      << s.ToString();
+}
+
+TEST_F(NestedPlacerTest, TwoFunctionsBackToBack) {
+  /*
+   *            a:RESOURCE:CPU
+   *               |
+   *               |          b:RESOURCE:GPU
+   *               v              |
+   *            y:PCO (identity)  |
+   *               |              |
+   *            w:PCO (identity)  |
+   *               |              v
+   *                \          z:PCO (identity)
+   *                 \           /
+   *                  \         /
+   *                   v       v
+   *                 Add:RESOURCE
+   *
+   * Add op cannot be placed.
+   * Two PCOs back to back is a challenging case that required adding
+   * IsolateDeepOpsPass.
+   */
+  FunctionDef func = test::function::ResourceIdentity();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("w", "PartitionedCall", {"y:0"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("z", "PartitionedCall", {"b"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("ResourceIdentity", {})}}),
+          NDef("add", "Add", {"w:0", "z:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Cannot place the graph because a reference or resource edge connects "
+      "colocation groups with incompatible resource devices: /device:FakeCPU:0 "
+      "vs /device:FakeGPU:0"))
+      << s.ToString();
+}
+
+FunctionDef NestedCallFunctionsBackToBack() {
+  return FDH::Create(
+      // Name
+      "NestedCallFunctionsBackToBack",
+      // Args
+      {},
+      // Return values
+      {"output: resource"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"cpu_ds"}, "CreateDatasetCPU", {}},
+          {{"y"},
+           "PartitionedCall",
+           {"cpu_ds:o:0"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("ResourceIdentity", {})}}},
+          {{"w"},
+           "PartitionedCall",
+           {"y:output:0"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("ResourceIdentity", {})}}},
+          {{"gpu_ds"}, "CreateDatasetGPU", {}},
+          {{"z"},
+           "PartitionedCall",
+           {"gpu_ds:o:0"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("ResourceIdentity", {})}}},
+          {{"add"}, "Add", {"w:output:0", "z:output:0"}, {{"T", DT_RESOURCE}}},
+      },
+      {{"output", "add:z:0"}});
+}
+
+TEST_F(NestedPlacerTest, NestedTwoFunctionsBackToBack) {
+  /*
+   * Same as TwoFunctionsBackToBack above but the functions are invoked in
+   * another function instead of the top level graph. This tests that Placer
+   * isolates deep ops in nested function bodies.
+   */
+  FunctionDef func = NestedCallFunctionsBackToBack();
+  GraphDef graph = GDef(
+      {
+          NDef("y", "PartitionedCall", {},
+               {{"Tin", {}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("NestedCallFunctionsBackToBack", {})}}),
+      },
+      // FunctionLib
+      {NestedCallFunctionsBackToBack(), test::function::ResourceIdentity()});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Nodes were connected by a reference connection (requiring them to be on "
+      "the same device), but the two nodes were assigned two different "
+      "devices"))
+      << s.ToString();
+}
+
+FunctionDef RecursiveResourceIdentity() {
+  return FDH::Create(
+      // Name
+      "RecursiveResourceIdentity",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: resource"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"out"},
+           "PartitionedCall",
+           {"x"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("RecursiveResourceIdentity", {})}}},
+      },
+      // Output mapping
+      {{"y", "out:output:0"}});
+}
+
+TEST_F(NestedPlacerTest, DirectRecursion) {
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("RecursiveResourceIdentity", {})}}),
+          NDef("r1", "_Retval", {"y:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {RecursiveResourceIdentity()});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::UNIMPLEMENTED, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Recursive function calls are not supported. Node {{node out}} inside "
+      "the body of {{function_node RecursiveResourceIdentity}} calls function "
+      "{{function_node RecursiveResourceIdentity}}"))
+      << s.ToString();
+}
+
+FunctionDef RecursiveF1() {
+  return FDH::Create(
+      // Name
+      "RecursiveF1",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: resource"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"out"},
+           "PartitionedCall",
+           {"x"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("RecursiveF2", {})}}},
+      },
+      // Output mapping
+      {{"y", "out:output:0"}});
+}
+
+FunctionDef RecursiveF2() {
+  return FDH::Create(
+      // Name
+      "RecursiveF2",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: resource"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"out"},
+           "PartitionedCall",
+           {"x"},
+           {{"Tin", DataTypeSlice{DT_RESOURCE}},
+            {"Tout", DataTypeSlice{DT_RESOURCE}},
+            {"f", FDH::FunctionRef("RecursiveF1", {})}}},
+      },
+      // Output mapping
+      {{"y", "out:output:0"}});
+}
+
+TEST_F(NestedPlacerTest, IndirectRecursion) {
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("y", "PartitionedCall", {"a"},
+               {{"Tin", DataTypeSlice{DT_RESOURCE}},
+                {"Tout", DataTypeSlice{DT_RESOURCE}},
+                {"f", FDH::FunctionRef("RecursiveF1", {})}}),
+          NDef("r1", "_Retval", {"y:0"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {RecursiveF1(), RecursiveF2()});
+
+  Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(BuildGraph(graph, &g));
+
+  Status s = CallOptPassesAndPlace(&g);
+  EXPECT_EQ(error::UNIMPLEMENTED, s.code()) << s.ToString();
+  EXPECT_TRUE(absl::StrContains(
+      s.error_message(),
+      "Recursive function calls are not supported. Node {{node out}} inside "
+      "the body of {{function_node RecursiveF2}} calls function "
+      "{{function_node RecursiveF1}} which is already present in the call "
+      "stack"))
+      << s.ToString();
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 96b59d0..ccec6c0 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -83,6 +83,15 @@
         device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, custom_kernel_creator, this);
   }
+
+  DeviceMgr const* all_devices = device_mgr_;
+  if (parent_ != nullptr && parent_->remote_device_mgr() != nullptr) {
+    all_devices = parent_->remote_device_mgr();
+  }
+
+  for (auto d : all_devices->ListDevices()) {
+    device_set_.AddDevice(d);
+  }
 }
 
 /* static */
@@ -371,7 +380,11 @@
                   << " colo group: " << colocation_group;
           while (src_device->empty() && colocation_group.empty() &&
                  src_node->IsIdentity()) {
-            src_node = *src_node->in_nodes().begin();
+            // Only follows the real data input of Identity, not control edges.
+            Node* input_node;
+            TF_RETURN_IF_ERROR(src_node->input_node(0, &input_node));
+            src_node = input_node;
+
             src_device = AssignedOrRequestedDeviceName(*src_node);
             GetColocationGroup(src_node, &colocation_group);
             VLOG(3) << "Considering src: " << src_node->name()
@@ -588,16 +601,11 @@
     options.graph_collector->CollectRawGraph(def);
   }
 
-  DeviceSet device_set;
-  for (auto d : device_mgr_->ListDevices()) {
-    device_set.AddDevice(d);
-  }
-
   TF_RETURN_IF_ERROR(SetArgShape(options.input_tensor_shapes,
                                  options.input_resource_dtypes_and_shapes,
                                  arg_nodes));
   TF_RETURN_IF_ERROR(PinArgsAndRets(options.input_devices,
-                                    options.output_devices, device_set,
+                                    options.output_devices, device_set_,
                                     arg_nodes, ret_nodes));
 
   std::unique_ptr<MultiDeviceFunctionData> data =
@@ -613,7 +621,7 @@
   optimization_options.session_options = &session_options;
   optimization_options.graph = &graph;
   optimization_options.flib_def = &data->lib_def_;
-  optimization_options.device_set = &device_set;
+  optimization_options.device_set = &device_set_;
 
   DumpGraph("Before running PRE_PLACEMENT passes", graph.get());
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
@@ -637,8 +645,9 @@
 
   // TODO(b/124993244): Smartly merge options in nested defuns, and raise
   // exceptions/warnings in case where nested function call options are ignored.
-  Placer placer(graph.get(), optimization_options.flib_def, &device_set,
-                default_device, options.config_proto.allow_soft_placement(),
+  Placer placer(graph.get(), function_name, optimization_options.flib_def,
+                &device_set_, default_device,
+                options.config_proto.allow_soft_placement(),
                 options.config_proto.log_device_placement());
   TF_RETURN_IF_ERROR(placer.Run());
 
@@ -653,7 +662,7 @@
     DumpGraph("Before running graph optimization fn", graph.get());
     Status status = options.optimize_graph_fn(
         std::move(ret_node_names), std::move(control_ret_node_names),
-        &data->lib_def_, device_set, cpu_device, &graph);
+        &data->lib_def_, device_set_, cpu_device, &graph);
     if (!status.ok()) {
       LOG(WARNING) << "Ignoring multi-device function optimization failure: "
                    << status.ToString();
@@ -674,7 +683,7 @@
 
   std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
   TF_RETURN_IF_ERROR(
-      PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs));
 
   for (const auto& pair : subgraphs) {
     DumpGraph(strings::StrCat("Before running POST_PARTITIONING passes (",
@@ -731,20 +740,20 @@
   FunctionNameGenerator name_generator(&data->lib_def_, function_name);
   for (const auto& pair : subgraphs) {
     i += 1;
-    // TODO(iga): Fail gracefully if the set of devices corresponds
-    // to more than one address space.
     const string& target = pair.first;
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    const string& device_type = target_flr->device()->device_type();
     Graph* subgraph = pair.second.get();
 
     ComponentFunctionData* comp_data = &data->glue_[target];
     TF_RETURN_IF_ERROR(UpdateArgAndRetvalMetadata(
-        subgraph, &comp_data->arg_indices_, &comp_data->ret_indices_,
-        &comp_data->arg_alloc_attrs_, &comp_data->ret_alloc_attrs_));
+        subgraph, device_type, &comp_data->arg_indices_,
+        &comp_data->ret_indices_, &comp_data->arg_alloc_attrs_,
+        &comp_data->ret_alloc_attrs_));
     FunctionDef shard;
     string unique_name = name_generator.GetName();
     TF_RETURN_IF_ERROR(
         GraphToFunctionDef(*subgraph, unique_name, control_ret, &shard));
-    FunctionLibraryRuntime* target_flr = GetFLR(target);
     TF_RETURN_IF_ERROR(data->lib_def_.AddFunctionDef(shard));
     FunctionLibraryRuntime::InstantiateOptions opts;
     opts.executor_type = options.executor_type;
@@ -754,8 +763,8 @@
     opts.state_handle = options.state_handle;
     FunctionLibraryRuntime::Handle component_handle;
 
-    TF_RETURN_IF_ERROR(target_flr->Instantiate(
-        unique_name, AttrSlice(&shard.attr()), opts, &component_handle));
+    TF_RETURN_IF_ERROR(Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
+                                   &component_handle));
     VLOG(1) << "Instantiated component function " << unique_name
             << " on device " << target << " with component handle "
             << component_handle;
@@ -852,34 +861,57 @@
     opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs_;
     opts_copy.remote_execution = false;
 
-    FunctionLibraryRuntime* flr = GetFLR(target);
-    // When target device has private thread pool, use the target device runner
-    thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool();
-    opts_copy.runner = (pool == nullptr) ? opts_copy.runner : flr->runner();
-
     std::vector<Tensor> comp_args =
         GetArgsForIndices(comp_data.arg_indices_, args);
     std::vector<Tensor>* comp_rets = new std::vector<Tensor>;
     rets->resize(data->num_outputs_);
 
-    VLOG(1) << "Running component function on device " << target
-            << " with handle " << handle;
-    VLOG(4) << "    with " << opts_copy.DebugString();
-    flr->Run(
-        opts_copy, handle, comp_args, comp_rets,
-        [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
-          if (!status.ok()) {
-            LOG(ERROR) << "Component function execution failed: " << status;
-            refcounted_done->UpdateStatus(status);
-          } else {
-            for (int i = 0; i < comp_rets->size(); ++i) {
-              (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+    FunctionLibraryRuntime* flr = GetFLR(target);
+    if (flr != nullptr) {
+      // When target device has private thread pool, use the target device
+      // runner
+      thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool();
+      opts_copy.runner = (pool == nullptr) ? opts_copy.runner : flr->runner();
+
+      VLOG(1) << "Running component function on device " << target
+              << " with handle " << handle;
+      VLOG(4) << "    with " << opts_copy.DebugString();
+      flr->Run(
+          opts_copy, handle, comp_args, comp_rets,
+          [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
+            if (!status.ok()) {
+              VLOG(2) << "Component function execution failed: " << status;
+              refcounted_done->UpdateStatus(status);
+            } else {
+              for (int i = 0; i < comp_rets->size(); ++i) {
+                (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+              }
             }
-          }
-          delete comp_rets;
-          // refcounted_done is thread-safe
-          refcounted_done->Unref();
-        });
+            delete comp_rets;
+            // refcounted_done is thread-safe
+            refcounted_done->Unref();
+          });
+    } else {
+      opts_copy.remote_execution = true;
+
+      VLOG(1) << "Running component function on device " << target
+              << " with handle " << handle;
+      VLOG(4) << "    with " << opts_copy.DebugString();
+      Run(opts_copy, handle, comp_args, comp_rets,
+          [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
+            if (!status.ok()) {
+              VLOG(2) << "Component function execution failed: " << status;
+              refcounted_done->UpdateStatus(status);
+            } else {
+              for (int i = 0; i < comp_rets->size(); ++i) {
+                (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+              }
+            }
+            delete comp_rets;
+            // refcounted_done is thread-safe
+            refcounted_done->Unref();
+          });
+    }
   }
   refcounted_done->Unref();
 }
@@ -916,8 +948,10 @@
     f = function_data_[h].get();
     *handle = h;
   }
-  TF_RETURN_IF_ERROR(
-      f->DistributedInit(parent_, function_name, *lib_def_, attrs, options));
+  TF_RETURN_IF_ERROR(f->DistributedInit(
+      parent_, function_name,
+      options.lib_def == nullptr ? *lib_def_ : *options.lib_def, attrs,
+      options));
   VLOG(1) << "ProcessFLR Instantiate [success]: " << function_name
           << " on: " << options.target << " with handle: " << *handle
           << " (this: " << this << ")";
@@ -955,6 +989,13 @@
     FunctionLibraryRuntime::Handle flr_handle = it.second.handle_;
     FunctionLibraryRuntime* flr = GetFLR(device);
     if (flr == nullptr) {
+      // TODO(nareshmodi): Implement DeregisterGraph call to remote device if
+      // parent is not null.
+      if (parent_ != nullptr) {
+        return errors::Unimplemented(
+            "Releasing a multi-device component handle on a remote device is "
+            "not yet implemented.");
+      }
       return errors::InvalidArgument(
           "Failed to find FunctionLibraryRuntime for device ", device,
           " when releasing multi-device function handle ", handle);
@@ -999,7 +1040,7 @@
     multi_device = mdevice_data_.find(handle) != mdevice_data_.end();
   }
   if (multi_device) {
-    return RunMultiDevice(opts, handle, args, rets, done);
+    return RunMultiDevice(opts, handle, args, rets, std::move(done));
   }
 
   FunctionLibraryRuntime* flr = nullptr;
@@ -1105,6 +1146,12 @@
                         // Begin unbound arguments.
                         const Status& status) {
             std::unique_ptr<std::vector<Tensor>> rets_releaser(rets);
+
+            if (!status.ok()) {
+              done(status);
+              return;
+            }
+
             if (rets->size() != frame->num_retvals()) {
               done(errors::Internal(
                   "Number of return values from function (", rets->size(),
@@ -1117,6 +1164,7 @@
               Status s = frame->SetRetval(i, (*rets)[i]);
               if (!s.ok()) {
                 done(s);
+                return;
               }
             }
             done(Status::OK());
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 6e842dd..ec0f9db 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -307,6 +307,7 @@
 
   Env* const env_;
   const DeviceMgr* const device_mgr_;
+  DeviceSet device_set_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index a73cb5a..6af3efe 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -43,7 +43,7 @@
 
 class TestClusterFLR : public DistributedFunctionLibraryRuntime {
  public:
-  TestClusterFLR() {}
+  explicit TestClusterFLR(DeviceMgr* device_mgr) : device_mgr_(device_mgr) {}
 
   Status Instantiate(const string& function_name,
                      const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
@@ -60,9 +60,12 @@
            gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
            FunctionLibraryRuntime::DoneCallback done) override {}
 
+  DeviceMgr* remote_device_mgr() const override { return device_mgr_; }
+
  private:
   mutex mu_;
   int next_handle_ GUARDED_BY(mu_) = 0;
+  DeviceMgr* device_mgr_;
 };
 
 // TODO(b/128707168): Tests requiring a GPU device are currently always skipped
@@ -101,7 +104,7 @@
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    cluster_flr_.reset(new TestClusterFLR());
+    cluster_flr_.reset(new TestClusterFLR(device_mgr_.get()));
     proc_flr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, nullptr, cluster_flr_.get()));
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index baf80cb..0122aab 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -42,8 +42,8 @@
     return env_num_threads;
   }
 
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
+  // Default to the maximum parallelism for the current process.
+  return port::MaxParallelism();
 #else
   // Historically, -D__ANDROID__ resulted in the inter-op threadpool not being
   // used (regardless of what was chosen here); instead, all work was done on
@@ -99,7 +99,7 @@
 
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
-  if (inter_op != 0) return inter_op;
+  if (inter_op > 0) return inter_op;
 #ifdef INTEL_MKL
   if (!DisableMKL()) {
     // MKL library executes ops in parallel using OMP threads
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index 7ad658b..4d8ac4a 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -37,11 +37,11 @@
 int32 NumIntraOpThreadsFromEnvironment();
 
 // Returns the number of inter op threads specified in `options` or a default.
-// If no value is specified in the provided options, then the function returns
-// the value defined in the TF_NUM_INTEROP_THREADS environment variable.
-// If neither a value is specified in the options or in the environment,
-// this function will return a reasonable default value based on the number
-// of schedulable CPUs, and any MKL and OpenMP configurations.
+// If no value or a negative value is specified in the provided options, then
+// the function returns the value defined in the TF_NUM_INTEROP_THREADS
+// environment variable. If a value is specified neither in the options nor in
+// the environment, this function will return a reasonable default value based
+// on the number of schedulable CPUs, and any MKL and OpenMP configurations.
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
 
 // Creates a thread pool with number of inter op threads.
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 6d24797..89dba7b 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -101,16 +101,30 @@
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
   Allocator* out_allocator = dst_device->GetAllocator(attr);
+  bool sync_dst_compute = true;
   if (in.dtype() != DT_VARIANT) {
     // Variants are handled by CopyTensor::ViaDMA.
-    Tensor copy(out_allocator, in.dtype(), in.shape());
+    AllocationAttributes aa;
+    uint64 safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
+    std::function<uint64()> freed_by_func = [dst_device,
+                                             &safe_alloc_frontier]() {
+      safe_alloc_frontier = dst_device->SafeAllocFrontier(safe_alloc_frontier);
+      return safe_alloc_frontier;
+    };
+    if (parsed.dst.type == "GPU" && safe_alloc_frontier > 0) {
+      // There's a timestamped allocator at work, so use it instead
+      // of sync_dst_compute.
+      aa.freed_by_func = &freed_by_func;
+      sync_dst_compute = false;
+    }
+    Tensor copy(out_allocator, in.dtype(), in.shape(), aa);
     *out = copy;
   }
 
-  CopyTensor::ViaDMA(parsed.edge_name, send_args.device_context,
-                     recv_args.device_context, src_device, dst_device,
-                     send_args.alloc_attrs, recv_args.alloc_attrs, &in, out,
-                     0 /*dev_to_dev_stream_index*/, std::move(done));
+  CopyTensor::ViaDMA(
+      parsed.edge_name, send_args.device_context, recv_args.device_context,
+      src_device, dst_device, send_args.alloc_attrs, recv_args.alloc_attrs, &in,
+      out, 0 /*dev_to_dev_stream_index*/, std::move(done), sync_dst_compute);
 }
 
 void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
diff --git a/tensorflow/core/common_runtime/ring_gatherer.cc b/tensorflow/core/common_runtime/ring_gatherer.cc
index 4fdac90..f13f617 100644
--- a/tensorflow/core/common_runtime/ring_gatherer.cc
+++ b/tensorflow/core/common_runtime/ring_gatherer.cc
@@ -15,6 +15,7 @@
 #include "tensorflow/core/common_runtime/ring_gatherer.h"
 
 #include <stdlib.h>
+
 #include <atomic>
 #include <functional>
 #include <utility>
@@ -39,6 +40,7 @@
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace tensorflow {
 Status RingGatherer::InitializeCollectiveParams(CollectiveParams* col_params) {
@@ -98,7 +100,7 @@
   // We are running in a blockable thread and the callback can't block so
   // just wait here on the copy.
   {
-    tracing::ScopedActivity activity("MemCpyAsync");
+    profiler::TraceMe activity("MemCpyAsync", profiler::TraceMeLevel::kInfo);
     Notification note;
     Status status;
     Tensor alias_chunk(ca_->ChunkAlias(col_params_->subdiv_rank[0]));
@@ -143,7 +145,8 @@
     // complete before proceeding.  The previous InitRingField calls allocated
     // temp memory buffers that are not guaranteed to be valid (e.g. for RDMA
     // write) unless we do.
-    tracing::ScopedActivity activity("WaitForQueuedEvents");
+    profiler::TraceMe activity("WaitForQueuedEvents",
+                               profiler::TraceMeLevel::kInfo);
     Notification note;
     Status s = gpu_info->default_context->ThenExecute(
         col_ctx_->device, gpu_info->stream, [&note]() { note.Notify(); });
@@ -164,7 +167,7 @@
 
   // Loop until all RingFields have advanced to completion.
   {
-    tracing::ScopedActivity activity("Loop");
+    profiler::TraceMe activity("Loop", profiler::TraceMeLevel::kInfo);
     while (field_done_count < rfv_.size()) {
       VLOG(4) << FieldState();
       // Wait for a RingField to appear in the ready_queue.
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
index 97ff7b5..f0f2998 100644
--- a/tensorflow/core/common_runtime/ring_gatherer_test.cc
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -288,8 +288,9 @@
     if (fail_after > 0) {
       // Confirm that every device terminated with the expected error status.
       for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
-        EXPECT_EQ("Deliberate failure",
-                  instances_[di]->status_.error_message());
+        EXPECT_NE(
+            instances_[di]->status_.error_message().find("Deliberate failure"),
+            string::npos);
       }
     } else {
       // Confirm that every device accumulated the same set of correct
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 659ebfd..57cd14d 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -15,6 +15,7 @@
 #include "tensorflow/core/common_runtime/ring_reducer.h"
 
 #include <stdlib.h>
+
 #include <atomic>
 #include <functional>
 #include <utility>
@@ -39,6 +40,7 @@
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace tensorflow {
 
@@ -88,7 +90,7 @@
     // just wait here on the copy.
     Notification note;
     Status status;
-    tracing::ScopedActivity activity("MemCpyAsync");
+    profiler::TraceMe activity("MemCpyAsync", profiler::TraceMeLevel::kInfo);
     CollectiveRemoteAccessLocal::MemCpyAsync(
         col_ctx_->op_ctx->input_device_context(0),
         col_ctx_->op_ctx->op_device_context(), col_ctx_->device,
@@ -122,17 +124,29 @@
     // can be provided to the kernel in host memory?
     Tensor group_size_val = ca_->Scalar(group_size_);
     if (col_params_->group.device_type != "CPU") {
-      group_size_tensor_ = ca_->Scalar(col_ctx_->device->GetAllocator(
-          col_ctx_->op_ctx->input_alloc_attr(0)));
+      uint64 safe_alloc_frontier = col_ctx_->device->SafeAllocFrontier(0);
+      AllocationAttributes aa;
+      std::function<uint64()> freed_by_func = [this, &safe_alloc_frontier]() {
+        safe_alloc_frontier =
+            col_ctx_->device->SafeAllocFrontier(safe_alloc_frontier);
+        return safe_alloc_frontier;
+      };
+      if (safe_alloc_frontier > 0) {
+        aa.freed_by_func = &freed_by_func;
+      }
+      group_size_tensor_ = ca_->Scalar(
+          col_ctx_->device->GetAllocator(col_ctx_->op_ctx->input_alloc_attr(0)),
+          aa);
       DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context();
-      op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, col_ctx_->device,
-                                        &group_size_tensor_,
-                                        [this](const Status& s) {
-                                          if (!s.ok()) {
-                                            StartAbort(s);
-                                          }
-                                          group_size_tensor_ready_.Notify();
-                                        });
+      op_dev_ctx->CopyCPUTensorToDevice(
+          &group_size_val, col_ctx_->device, &group_size_tensor_,
+          [this](const Status& s) {
+            if (!s.ok()) {
+              StartAbort(s);
+            }
+            group_size_tensor_ready_.Notify();
+          },
+          (safe_alloc_frontier == 0));
     } else {
       group_size_tensor_ = group_size_val;
       group_size_tensor_ready_.Notify();
@@ -177,7 +191,8 @@
     // complete before proceeding.  The previous InitRingField calls allocated
     // temp memory buffers that are not guaranteed to be valid (e.g. for RDMA
     // write) unless we do.
-    tracing::ScopedActivity activity("WaitForQueuedEvents");
+    profiler::TraceMe activity("WaitForQueuedEvents",
+                               profiler::TraceMeLevel::kInfo);
     Notification note;
     Status s = gpu_info->default_context->ThenExecute(
         col_ctx_->device, gpu_info->stream, [&note]() { note.Notify(); });
@@ -197,7 +212,7 @@
   std::atomic<bool> aborted(false);
 
   {
-    tracing::ScopedActivity activity("Loop");
+    profiler::TraceMe activity("Loop", profiler::TraceMeLevel::kInfo);
     // Loop until all RingFields have advanced to completion.
     while (field_done_count < rfv_.size()) {
       VLOG(4) << FieldState();
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 7f18cdb..16dbabd 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -306,8 +306,9 @@
     if (fail_after > 0) {
       // Confirm that every device terminated with the expected error status.
       for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
-        EXPECT_EQ("Deliberate failure",
-                  instances_[di]->status_.error_message());
+        EXPECT_NE(
+            instances_[di]->status_.error_message().find("Deliberate failure"),
+            string::npos);
       }
     } else {
       // Confirm that every device computed the same correct reduction value.
diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h
index 64e3373..683bbc7 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.h
+++ b/tensorflow/core/common_runtime/scoped_allocator.h
@@ -106,7 +106,6 @@
   }
   void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_) override;
   bool TracksAllocationSizes() const override { return false; }
-  bool ShouldAllocateEmptyTensors() const override { return false; }
   size_t RequestedSize(const void* ptr) const override { return 0; }
   size_t AllocatedSize(const void* ptr) const override { return 0; }
   int64 AllocationId(const void* ptr) const override { return 0; }
diff --git a/tensorflow/core/common_runtime/session.cc b/tensorflow/core/common_runtime/session.cc
index 8c30bee..be640f9 100644
--- a/tensorflow/core/common_runtime/session.cc
+++ b/tensorflow/core/common_runtime/session.cc
@@ -13,12 +13,14 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/public/session.h"
+
 #include <string>
 
 #include "tensorflow/core/common_runtime/session_factory.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/monitoring.h"
 
 namespace tensorflow {
 
@@ -58,6 +60,10 @@
     LOG(ERROR) << s;
     return nullptr;
   }
+  // Starts exporting metrics through a platform-specific monitoring API (if
+  // provided). For builds using "tensorflow/core/platform/default", this is
+  // currently a no-op.
+  monitoring::StartExporter();
   Session* out_session;
   s = NewSession(options, &out_session);
   if (!s.ok()) {
@@ -75,6 +81,10 @@
     LOG(ERROR) << s;
     return s;
   }
+  // Starts exporting metrics through a platform-specific monitoring API (if
+  // provided). For builds using "tensorflow/core/platform/default", this is
+  // currently a no-op.
+  monitoring::StartExporter();
   s = factory->NewSession(options, out_session);
   if (!s.ok()) {
     *out_session = nullptr;
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index f6554b5..7b25460 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -98,8 +98,10 @@
         ":worker_interface",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
     ],
 )
 
@@ -127,6 +129,7 @@
     hdrs = ["session_mgr.h"],
     deps = [
         ":graph_mgr",
+        ":remote_device",
         ":worker_cache_wrapper",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
@@ -197,6 +200,7 @@
     deps = [
         ":graph_mgr",
         ":partial_run_mgr",
+        ":recent_request_ids",
         ":rendezvous_mgr_interface",
         ":session_mgr",
         ":tensor_coding",
@@ -204,7 +208,6 @@
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/distributed_runtime:recent_request_ids",
         "//tensorflow/core/profiler/lib:profiler_session",
     ],
 )
@@ -431,6 +434,7 @@
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
+        "//tensorflow/core/profiler/lib:traceme",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index a642313..a5ec95f 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -27,6 +27,7 @@
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -269,19 +270,28 @@
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
   Allocator* out_allocator = dst_device->GetAllocator(attr);
-
+  AllocationAttributes allocation_attr;
+  uint64 safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
+  bool sync_dst_compute = (safe_alloc_frontier == 0);
+  std::function<uint64()> freed_by_func = [dst_device, &safe_alloc_frontier]() {
+    safe_alloc_frontier = dst_device->SafeAllocFrontier(safe_alloc_frontier);
+    return safe_alloc_frontier;
+  };
+  if (!sync_dst_compute) {
+    allocation_attr.freed_by_func = &freed_by_func;
+  }
   if (in.dtype() != DT_VARIANT) {
     // Variants are handled by CopyTensor::ViaDMA.
-    Tensor copy(out_allocator, in.dtype(), in.shape());
+    Tensor copy(out_allocator, in.dtype(), in.shape(), allocation_attr);
     *out = copy;
   }
 
   // The following function takes care of cpu->gpu, gpu->cpu, gpu->gpu copies,
   // etc.
-  CopyTensor::ViaDMA(parsed.edge_name, send_args.device_context,
-                     recv_args.device_context, src_device, dst_device,
-                     send_args.alloc_attrs, recv_args.alloc_attrs, &in, out,
-                     0 /*dev_to_dev_stream_index*/, std::move(done));
+  CopyTensor::ViaDMA(
+      parsed.edge_name, send_args.device_context, recv_args.device_context,
+      src_device, dst_device, send_args.alloc_attrs, recv_args.alloc_attrs, &in,
+      out, 0 /*dev_to_dev_stream_index*/, std::move(done), sync_dst_compute);
 }
 
 bool BaseRemoteRendezvous::IsSameWorker(DeviceNameUtils::ParsedName src,
@@ -363,14 +373,20 @@
 
 void BaseRemoteRendezvous::StartAbort(const Status& s) {
   CHECK(!s.ok());
-  local_->StartAbort(s);
+  // Use a "derived" status as the status for the rendezvous. Derived
+  // status messages are ignored when aggregating errors across devices: this
+  // allows us to prefer our original status message over any cancellation
+  // related errors.
+  Status derived_status = StatusGroup::MakeDerived(s);
+
+  local_->StartAbort(derived_status);
   {
     // Aborts all active RecvTensor calls.
     mutex_lock l(mu_);
     if (status_.ok()) {
-      status_ = s;
+      status_ = derived_status;
       for (BaseRecvTensorCall* call : active_) {
-        call->StartAbort(s);
+        call->StartAbort(derived_status);
       }
       active_.clear();
     }
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 6edc2ec..3a60ff0 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -17,10 +17,13 @@
 #include <map>
 
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/protobuf/named_tensor.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -135,13 +138,31 @@
   }
 
   // Make RPC and obtain a graph handle.
-  const FunctionDef* fdef = lib_def.Find(function_name);
-  const OpDef& sig = fdef->signature();
   GraphDef gdef;
   std::vector<string> send_keys, recv_keys;
-  TF_RETURN_IF_ERROR(ConstructFunctionGraph(sig, attrs, options, &gdef,
-                                            &send_keys, &recv_keys));
-  *gdef.mutable_library() = lib_def.ToProto();
+  // Builds `gdef` (and the matching send/recv keys) for `function_name` using
+  // the given function library, and ships that library along with the graph.
+  // Returns NotFound instead of dereferencing a null FunctionDef when the
+  // library does not define `function_name`.
+  auto construct_graph_fn = [&](const FunctionLibraryDefinition* lib_def) {
+    const FunctionDef* fdef = lib_def->Find(function_name);
+    if (fdef == nullptr) {
+      return errors::NotFound("Function ", function_name,
+                              " is not defined in the function library.");
+    }
+    const OpDef& sig = fdef->signature();
+    TF_RETURN_IF_ERROR(ConstructFunctionGraph(sig, attrs, options, &gdef,
+                                              &send_keys, &recv_keys));
+    *gdef.mutable_library() = lib_def->ToProto();
+    return Status::OK();
+  };
+  // A library supplied through the instantiation options takes precedence
+  // over the one passed in by the caller.
+  if (options.lib_def) {
+    TF_RETURN_IF_ERROR(construct_graph_fn(options.lib_def));
+  } else {
+    TF_RETURN_IF_ERROR(construct_graph_fn(&lib_def));
+  }
 
   RegisterGraphRequest req;
   req.set_session_handle(worker_session_->session_name);
@@ -185,9 +196,7 @@
   req->set_session_handle(worker_session_->session_name);
   req->set_create_worker_session_called(create_worker_session_called_);
   req->set_graph_handle(function_data->graph_handle);
-  // Borrowed from master_session.cc
-  const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
-  req->set_step_id(step_id);
+  req->set_step_id(opts.step_id);
   int i = 0;
   for (const auto& send_key : function_data->send_keys) {
     NamedTensorProto* send = req->add_send();
@@ -200,16 +209,35 @@
     req->add_recv_key(recv_key);
   }
 
+  CleanupGraphRequest* cleanup_req = new CleanupGraphRequest;
+  cleanup_req->set_step_id(opts.step_id);
+
   RunGraphResponse* resp = new RunGraphResponse();
+  CleanupGraphResponse* cleanup_resp = new CleanupGraphResponse;
   CallOptions* call_options = new CallOptions();
   wi->RunGraphAsync(
       call_options, req, resp,
-      [call_options, req, resp, rets, recv_keys, done](const Status& status) {
-        if (!status.ok()) {
-          done(status);
-          delete call_options;
-          delete req;
-          delete resp;
+      [wi, call_options, req, resp, rets, recv_keys, cleanup_req, cleanup_resp,
+       done](const Status& status) {
+        Status* local_status = new Status(status);
+        auto cleanup =
+            gtl::MakeCleanup([wi, call_options, req, resp, cleanup_req,
+                              cleanup_resp, local_status, done] {
+              wi->CleanupGraphAsync(
+                  cleanup_req, cleanup_resp,
+                  [call_options, req, resp, cleanup_req, cleanup_resp,
+                   local_status, done](const Status& cleanup_status) {
+                    local_status->Update(cleanup_status);
+                    done(*local_status);
+                    delete local_status;
+                    delete call_options;
+                    delete req;
+                    delete resp;
+                    delete cleanup_req;
+                    delete cleanup_resp;
+                  });
+            });
+        if (!local_status->ok()) {
           return;
         }
         std::map<string, TensorProto*> mapped_recvs;
@@ -220,28 +248,19 @@
         for (const auto& recv_key : recv_keys) {
           TensorProto* tp = mapped_recvs[recv_key];
           if (tp == nullptr) {
-            done(errors::Internal("Could not find key: ", recv_key));
-            delete call_options;
-            delete req;
-            delete resp;
+            local_status->Update(
+                errors::Internal("Could not find key: ", recv_key));
             return;
           }
           Tensor t;
           if (t.FromProto(*tp)) {
             rets->push_back(t);
           } else {
-            done(errors::Internal("Could not convert tensor proto: ",
-                                  tp->DebugString()));
-            delete call_options;
-            delete req;
-            delete resp;
+            local_status->Update(errors::Internal(
+                "Could not convert tensor proto: ", tp->DebugString()));
             return;
           }
         }
-        done(status);
-        delete call_options;
-        delete req;
-        delete resp;
       });
 }
 
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index 1ea0a3a..28128f4 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -28,9 +28,11 @@
 class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
  public:
   ClusterFunctionLibraryRuntime(WorkerSession* worker_session,
-                                bool create_worker_session_called)
+                                bool create_worker_session_called,
+                                DeviceMgr* remote_device_mgr)
       : worker_session_(worker_session),
-        create_worker_session_called_(create_worker_session_called) {}
+        create_worker_session_called_(create_worker_session_called),
+        remote_device_mgr_(remote_device_mgr) {}
 
   ~ClusterFunctionLibraryRuntime() override;
 
@@ -44,6 +46,8 @@
            gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
            FunctionLibraryRuntime::DoneCallback done) override;
 
+  DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; }
+
  private:
   static Status ConstructFunctionGraph(
       const OpDef& sig, AttrSlice attrs,
@@ -55,6 +59,8 @@
   WorkerSession* const worker_session_ = nullptr;  // not owned.
   const bool create_worker_session_called_;
 
+  DeviceMgr* remote_device_mgr_;  // not owned.
+
   struct FunctionData {
     const string graph_handle;
     const string target;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index cd6e135..45a11fe 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -42,10 +42,10 @@
     worker_session_.reset(new WorkerSession(
         "cluster_test_session", "/job:localhost/replica:0/task:0",
         std::move(worker_cache), std::unique_ptr<DeviceMgr>(),
-        std::unique_ptr<GraphMgr>()));
+        std::unique_ptr<GraphMgr>(), nullptr));
 
-    cluster_flr_.reset(
-        new ClusterFunctionLibraryRuntime(worker_session_.get(), true));
+    cluster_flr_.reset(new ClusterFunctionLibraryRuntime(worker_session_.get(),
+                                                         true, nullptr));
   }
 
   Status ConstructFunctionGraphHelper(
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 2b27ac8..448319f 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -104,10 +104,10 @@
   // Initialize remote tensor communication based on worker session.
   TF_RETURN_IF_ERROR(r->Initialize(worker_session.get()));
 
-  std::unique_ptr<tensorflow::EagerContext> ctx(new tensorflow::EagerContext(
+  tensorflow::EagerContext* ctx = new tensorflow::EagerContext(
       SessionOptions(),
       tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT,
-      request->async(), device_mgr, false, r, nullptr));
+      request->async(), device_mgr, false, r, nullptr);
 
   std::vector<DeviceAttributes> device_attributes;
   device_mgr->ListDeviceAttributes(&device_attributes);
@@ -122,9 +122,8 @@
     do {
       context_id = random::New64();
     } while (contexts_.find(context_id) != contexts_.end());
-    contexts_.emplace(
-        context_id,
-        new ServerContext(std::move(ctx), request->keep_alive_secs(), env_));
+    contexts_.emplace(context_id,
+                      new ServerContext(ctx, request->keep_alive_secs(), env_));
   }
   response->set_context_id(context_id);
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index 2784c5d..a33c678 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -104,9 +104,9 @@
   // and the EagerContext).
   class ServerContext : public core::RefCounted {
    public:
-    explicit ServerContext(std::unique_ptr<tensorflow::EagerContext> ctx,
+    explicit ServerContext(tensorflow::EagerContext* ctx,
                            int64 destroy_after_secs, const WorkerEnv* env)
-        : ctx_(std::move(ctx)), env_(env) {
+        : ctx_(ctx), env_(env) {
       destroy_after_micros_ =
           destroy_after_secs * tensorflow::EnvTime::kSecondsToMicros;
       RecordAccess();
@@ -115,9 +115,11 @@
       for (const auto& entry : tensors_) {
         entry.second->Unref();
       }
+
+      ctx_->Unref();
     }
 
-    tensorflow::EagerContext* Context() const { return ctx_.get(); }
+    tensorflow::EagerContext* Context() const { return ctx_; }
 
     void AddOperationOutputs(
         const gtl::ArraySlice<tensorflow::TensorHandle*>& handles,
@@ -179,7 +181,7 @@
                      RemoteTensorHandleInternalEquals>;
 
     // The context for this execution.
-    std::unique_ptr<tensorflow::EagerContext> ctx_;
+    tensorflow::EagerContext* ctx_;
 
     // The state related to the context for this execution.
     mutex tensors_mu_;
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index a619c24..81d6412 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -49,6 +49,7 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 #include "tensorflow/core/util/env_var.h"
 
@@ -120,13 +121,14 @@
 //
 // "executors" are filled with one executor per device if success and
 // the caller takes the ownership of returned executors.
-Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
+Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef,
+                          WorkerSession* session,
                           const GraphOptions& graph_options,
                           const DebugOptions& debug_options,
                           int64 collective_graph_key,
                           DistributedFunctionLibraryRuntime* cluster_flr,
                           Item* item) {
-  item->session = session;
+  item->session = handle;
   item->collective_graph_key = collective_graph_key;
   item->lib_def.reset(
       new FunctionLibraryDefinition(OpRegistry::Global(), gdef.library()));
@@ -222,7 +224,7 @@
     // kernels. Therefore, as long as the executor is alive, we need
     // to ensure the kernels cached for the session are alive.
     auto opseg = unit->device->op_segment();
-    opseg->AddHold(session);
+    opseg->AddHold(handle);
 
     // Function library runtime.
     FunctionLibraryRuntime* lib = item->proc_flr->GetFLR(unit->device->name());
@@ -234,8 +236,8 @@
     // Construct the root executor for the subgraph.
     params.device = unit->device;
     params.function_library = lib;
-    params.create_kernel = [session, lib, opseg](const NodeDef& ndef,
-                                                 OpKernel** kernel) {
+    params.create_kernel = [handle, lib, opseg](const NodeDef& ndef,
+                                                OpKernel** kernel) {
       // NOTE(mrry): We must not share function kernels (implemented
       // using `CallOp`) between subgraphs, because `CallOp::handle_`
       // is tied to a particular subgraph. Even if the function itself
@@ -249,13 +251,21 @@
       // Kernels created for subgraph nodes need to be cached.  On
       // cache miss, create_fn() is invoked to create a kernel based
       // on the function library here + global op registry.
-      return opseg->FindOrCreate(session, ndef.name(), kernel, create_fn);
+      return opseg->FindOrCreate(handle, ndef.name(), kernel, create_fn);
     };
     params.delete_kernel = [lib](OpKernel* kernel) {
       if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string())) {
         delete kernel;
       }
     };
+    params.rendezvous_factory = [this, session](const int64 step_id,
+                                                const DeviceMgr*,
+                                                Rendezvous** r) -> Status {
+      auto* remote_r = this->worker_env_->rendezvous_mgr->Find(step_id);
+      TF_RETURN_IF_ERROR(remote_r->Initialize(session));
+      *r = remote_r;
+      return Status::OK();
+    };
 
     optimizer.Optimize(lib, worker_env_->env, params.device, &subgraph,
                        /*shape_map=*/nullptr);
@@ -280,14 +290,15 @@
   return Status::OK();
 }
 
-Status GraphMgr::Register(const string& session, const GraphDef& gdef,
+Status GraphMgr::Register(const string& handle, const GraphDef& gdef,
+                          WorkerSession* session,
                           const GraphOptions& graph_options,
                           const DebugOptions& debug_options,
                           int64 collective_graph_key,
                           DistributedFunctionLibraryRuntime* cluster_flr,
-                          string* handle) {
+                          string* graph_handle) {
   Item* item = new Item;
-  Status s = InitItem(session, gdef, graph_options, debug_options,
+  Status s = InitItem(handle, gdef, session, graph_options, debug_options,
                       collective_graph_key, cluster_flr, item);
   if (!s.ok()) {
     item->Unref();
@@ -297,9 +308,9 @@
   // Inserts one item into table_.
   {
     mutex_lock l(mu_);
-    *handle = strings::Printf("%016llx", ++next_id_);
-    item->handle = *handle;
-    CHECK(table_.insert({*handle, item}).second);
+    *graph_handle = strings::Printf("%016llx", ++next_id_);
+    item->handle = *graph_handle;
+    CHECK(table_.insert({*graph_handle, item}).second);
   }
   return Status::OK();
 }
@@ -408,7 +419,8 @@
                             const NamedTensors& in, StatusCallback done) {
   const uint64 start_time_usecs = Env::Default()->NowMicros();
   string session_id_meta = strings::StrCat("RunGraph #id=", step_id, "#");
-  auto* activity = new tracing::ScopedActivity(session_id_meta);
+  auto* activity = new profiler::TraceMe(absl::string_view(session_id_meta),
+                                         profiler::TraceMeLevel::kInfo);
   // Lookup an item. Holds one ref while executing.
   Item* item = nullptr;
   {
@@ -470,29 +482,26 @@
     return;
   }
 
-  StartParallelExecutors(
-      handle, step_id, item, rendezvous, ce_handle, collector, cost_graph,
-      cancellation_manager,
-      [item, rendezvous, ce_handle, done, start_time_usecs, input_size,
-       activity](const Status& s) {
-        done(s);
-        metrics::RecordGraphInputTensors(input_size);
-        metrics::UpdateGraphExecTime(Env::Default()->NowMicros() -
-                                     start_time_usecs);
-        rendezvous->Unref();
-        item->Unref();
-        delete activity;
-        delete ce_handle;
-      });
+  StartParallelExecutors(handle, step_id, item, rendezvous, ce_handle,
+                         collector, cost_graph, cancellation_manager, session,
+                         [item, rendezvous, ce_handle, done, start_time_usecs,
+                          input_size, activity](const Status& s) {
+                           done(s);
+                           metrics::RecordGraphInputTensors(input_size);
+                           metrics::UpdateGraphExecTime(
+                               Env::Default()->NowMicros() - start_time_usecs);
+                           rendezvous->Unref();
+                           item->Unref();
+                           delete activity;
+                           delete ce_handle;
+                         });
 }
 
-void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
-                                      Item* item, Rendezvous* rendezvous,
-                                      CollectiveExecutor::Handle* ce_handle,
-                                      StepStatsCollector* collector,
-                                      CostGraphDef* cost_graph,
-                                      CancellationManager* cancellation_manager,
-                                      StatusCallback done) {
+void GraphMgr::StartParallelExecutors(
+    const string& handle, int64 step_id, Item* item, Rendezvous* rendezvous,
+    CollectiveExecutor::Handle* ce_handle, StepStatsCollector* collector,
+    CostGraphDef* cost_graph, CancellationManager* cancellation_manager,
+    WorkerSession* session, StatusCallback done) {
   const int num_units = item->units.size();
   CHECK_GE(num_units, 1);
   ScopedStepContainer* step_container = new ScopedStepContainer(
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index 5196046..fcd316d 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -74,11 +74,11 @@
 
   // Registers a graph. Fills in "handle". The registered graph retains a
   // reference to cluster_flr to do cross process function calls.
-  Status Register(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options,
+  Status Register(const string& handle, const GraphDef& gdef,
+                  WorkerSession* session, const GraphOptions& graph_options,
                   const DebugOptions& debug_options, int64 collective_graph_key,
                   DistributedFunctionLibraryRuntime* cluster_flr,
-                  string* handle);
+                  string* graph_handle);
 
   // Executes one step of a registered graph "handle".
   //
@@ -168,7 +168,7 @@
                               StepStatsCollector* collector,
                               CostGraphDef* cost_graph,
                               CancellationManager* cancellation_manager,
-                              StatusCallback done);
+                              WorkerSession* session, StatusCallback done);
 
   // Don't attempt to process cost models unless explicitly requested for at
   // least one of the items.
@@ -177,8 +177,8 @@
   void BuildCostModel(Item* item, StepStatsCollector* collector,
                       CostGraphDef* cost_graph);
 
-  Status InitItem(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options,
+  Status InitItem(const string& handle, const GraphDef& gdef,
+                  WorkerSession* session, const GraphOptions& graph_options,
                   const DebugOptions& debug_options, int64 collective_graph_key,
                   DistributedFunctionLibraryRuntime* cluster_flr, Item* item);
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 9d3e417..5c55067 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -507,17 +507,20 @@
   Call* get(int index) { return &calls_[index]; }
 
   // When the index-th call is done, updates the overall status.
-  void WhenDone(int index, const Status& s) {
+  void WhenDone(int index, const std::string& worker_name, const Status& s) {
     TRACEPRINTF("Partition %d %s", index, s.ToString().c_str());
     auto resp = get(index)->resp.get();
     if (resp->status_code() != error::Code::OK) {
       // resp->status_code will only be non-OK if s.ok().
       mutex_lock l(mu_);
-      ReportBadStatus(
-          Status(resp->status_code(), resp->status_error_message()));
+      ReportBadStatus(Status(resp->status_code(),
+                             strings::StrCat("From ", worker_name, ":\n",
+                                             resp->status_error_message())));
     } else if (!s.ok()) {
       mutex_lock l(mu_);
-      ReportBadStatus(s);
+      ReportBadStatus(Status(
+          s.code(),
+          strings::StrCat("From ", worker_name, ":\n", s.error_message())));
     }
     pending_.DecrementCount();
   }
@@ -531,7 +534,9 @@
 
   Status status() const {
     mutex_lock l(mu_);
-    return status_group_.as_status();
+    // Concat status objects in this StatusGroup to get the aggregated status,
+    // as each status in status_group_ is already a summarized status.
+    return status_group_.as_concatenated_status();
   }
 
  private:
@@ -540,10 +545,16 @@
   BlockingCounter pending_;
   mutable mutex mu_;
   StatusGroup status_group_ GUARDED_BY(mu_);
+  bool cancel_issued_ GUARDED_BY(mu_) = false;
 
   void ReportBadStatus(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // Start cancellation if we aren't already in an error state.
-    if (status_group_.ok()) {
+    VLOG(1) << "Master received error status " << s;
+    if (!cancel_issued_ && !StatusGroup::IsDerived(s)) {
+      // Only start cancelling other workers upon receiving a non-derived
+      // error.
+      cancel_issued_ = true;
+
+      VLOG(1) << "Master received error report. Cancelling remaining workers.";
       for (Call& call : calls_) {
         call.opts.StartCancel();
       }
@@ -686,13 +697,17 @@
     const Part& part = partitions_[i];
     RunManyGraphs::Call* call = calls.get(i);
     TRACEPRINTF("Partition %d %s", i, part.name.c_str());
-    part.worker->RunGraphAsync(
-        &call->opts, call->req.get(), call->resp.get(),
-        std::bind(&RunManyGraphs::WhenDone, &calls, i, std::placeholders::_1));
+    part.worker->RunGraphAsync(&call->opts, call->req.get(), call->resp.get(),
+                               std::bind(&RunManyGraphs::WhenDone, &calls, i,
+                                         part.name, std::placeholders::_1));
   }
 
   // Waits for the RunGraph calls.
-  call_opts->SetCancelCallback([&calls]() { calls.StartCancel(); });
+  call_opts->SetCancelCallback([&calls]() {
+    LOG(INFO) << "Client requested cancellation for RunStep, cancelling "
+                  "worker operations.";
+    calls.StartCancel();
+  });
   auto token = cm->get_cancellation_token();
   const bool success =
       cm->RegisterCallback(token, [&calls]() { calls.StartCancel(); });
@@ -1266,6 +1281,13 @@
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
+    if (session_opts_.config.experimental()
+            .share_cluster_devices_in_session()) {
+      for (const auto& remote_dev : devices_->devices()) {
+        *workers[i].request.add_cluster_device_attributes() =
+            remote_dev->attributes();
+      }
+    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc
index a043c5d..f0fc666 100644
--- a/tensorflow/core/distributed_runtime/remote_device.cc
+++ b/tensorflow/core/distributed_runtime/remote_device.cc
@@ -54,6 +54,16 @@
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteDevice);
 };
 
+void AsRemoteDevices(
+    Env* env,
+    const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes,
+    std::vector<std::unique_ptr<Device>>* remote_devices) {
+  // Append one RemoteDevice per attribute entry; the vector takes ownership.
+  for (const DeviceAttributes& attrs : device_attributes) {
+    remote_devices->emplace_back(new RemoteDevice(env, attrs));
+  }
+}
+
 void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
                       const string& worker_name, NewRemoteDevicesDone done) {
   WorkerInterface* wi = worker_cache->CreateWorker(worker_name);
diff --git a/tensorflow/core/distributed_runtime/remote_device.h b/tensorflow/core/distributed_runtime/remote_device.h
index 686af95..1b2a4cd 100644
--- a/tensorflow/core/distributed_runtime/remote_device.h
+++ b/tensorflow/core/distributed_runtime/remote_device.h
@@ -19,13 +19,23 @@
 #include <functional>
 #include <string>
 #include <vector>
+
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
+class DeviceAttributes;
 class Device;
 class Env;
 class WorkerCacheInterface;
 
+// Creates Remote Devices for the provided device attributes. Helpful when the
+// list of attributes is known, and doesn't need to be discovered via RPC.
+void AsRemoteDevices(
+    Env* env,
+    const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes,
+    std::vector<std::unique_ptr<Device>>* remote_devices);
+
 // NewRemoteDevices discovers available devices on the
 // 'remote_worker'.  The implementation uses 'channel_cache' to
 // discover how to communicate with the 'remote_worker' (via gRPC, for
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 506b7178..252d352 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -15,8 +15,8 @@
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cuda_library",
     "tf_cc_test",
+    "tf_cuda_library",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
@@ -223,6 +223,7 @@
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core/distributed_runtime:call_options",
         "//tensorflow/core/distributed_runtime:master_interface",
+        "//tensorflow/core/profiler/lib:traceme",
     ],
     alwayslink = 1,
 )
@@ -241,6 +242,7 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core/distributed_runtime:master",
+        "//tensorflow/core/profiler/lib:traceme",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index 17dd369..e3f2feb 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -32,7 +32,6 @@
 
 #include "grpcpp/alarm.h"
 #include "grpcpp/server_builder.h"
-
 #include "tensorflow/core/distributed_runtime/master.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
@@ -41,6 +40,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
 namespace tensorflow {
@@ -284,7 +284,7 @@
 #undef ENQUEUE_REQUEST
 
   // Start tracing, including the ID attached to the RPC.
-  tracing::ScopedActivity* TraceRpc(
+  profiler::TraceMe* TraceRpc(
       StringPiece name,
       const std::multimap<::grpc::string_ref, ::grpc::string_ref>& metadata) {
     StringPiece id;
@@ -292,7 +292,8 @@
     if (it != metadata.end()) {
       id = StringPiece(it->second.data(), it->second.size());
     }
-    return new tracing::ScopedActivity(name, id);
+    return new profiler::TraceMe([&] { return strings::StrCat(name, ":", id); },
+                                 profiler::TraceMeLevel::kInfo);
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index a845590..ffda704 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -26,6 +26,7 @@
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
 namespace tensorflow {
@@ -110,11 +111,12 @@
 
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
-  tracing::ScopedActivity* NewTraceRpc(StringPiece name,
-                                       ::grpc::ClientContext* ctx) {
+  profiler::TraceMe* NewTraceRpc(StringPiece name, ::grpc::ClientContext* ctx) {
     string trace_id = strings::StrCat(tracing::GetUniqueArg());
     ctx->AddMetadata(GrpcIdKey(), trace_id);
-    return new tracing::ScopedActivity(name, trace_id);
+    return new profiler::TraceMe(
+        [&] { return strings::StrCat(name, ":", trace_id); },
+        profiler::TraceMeLevel::kInfo);
   }
 
   template <typename Request, typename Response>
@@ -131,7 +133,7 @@
     Status s;
     for (int num_retries = 0;; ++num_retries) {
       ::grpc::ClientContext ctx;
-      std::unique_ptr<tracing::ScopedActivity> trace;
+      std::unique_ptr<profiler::TraceMe> trace;
       if (!trace_string.empty()) {
         trace.reset(NewTraceRpc(trace_string, &ctx));
       }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
index b3ea64b..279e982 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
@@ -23,6 +23,7 @@
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/mutex.h"
 
 // gRPC response caching.  Most WorkerService methods cannot be retried directly
 // as they will fail or deadlock.  To enable retrying, we can instead cache
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index ad0f8e5..5de831c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -1093,4 +1093,348 @@
   IsSingleFloatValue(outputs[0], kTestValue);
 }
 
+TEST(GrpcSessionTest, ErrorAggregationTwoWorkersTwoErrors) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+  auto& devs = cluster->devices();
+  const string& master = cluster->targets()[0];
+  // worker 1
+  const string w1_dev1 = devs[0].name();
+  // worker 2
+  const string w2_dev1 = devs[1].name();
+
+  LOG(INFO) << "master " << master << "w1_dev1 " << w1_dev1 << " w2_dev1 "
+            << w2_dev1;
+  GraphDef gdef;
+  std::vector<string> fetches;
+  {
+    // Set up a graph to test the error handling when two workers both report
+    // original errors. The expected behavior is:
+    //   1. The master issues a cancel operation upon receiving the first error.
+    //   2. The master may receive one or both errors depending on the timing
+    //      of the cancel operation.
+    //
+    // Set up:
+    // Set up two workers. Both workers report an error to the master without
+    // any delay.
+    Graph g(OpRegistry::Global());
+
+    // Worker 1. a_err runs on w1_dev1.
+    auto a = test::graph::Constant(&g, Tensor(1));
+    a->set_assigned_device_name(w1_dev1);
+
+    auto a_err = test::graph::Error(&g, a, "fantasia1!");
+    a_err->set_assigned_device_name(w1_dev1);
+
+    fetches.push_back(a_err->name());
+
+    // Worker 2. b_err runs on w2_dev1 and is fed only by the local constant b;
+    // it does not depend on worker 1.
+    auto b = test::graph::Constant(&g, Tensor(1));
+    b->set_assigned_device_name(w2_dev1);
+
+    auto b_err = test::graph::Error(&g, b, "fantasia2!");
+    b_err->set_assigned_device_name(w2_dev1);
+
+    fetches.push_back(b_err->name());
+
+    g.ToGraphDef(&gdef);
+  }
+
+  std::unique_ptr<Session> session(NewRemote(Options(master, 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(gdef));
+  {
+    std::vector<Tensor> outputs;
+    Status status = session->Run({}, fetches, {}, &outputs);
+    LOG(INFO) << status;
+    EXPECT_FALSE(status.ok());
+    // Status contains the error from either worker 1 or worker 2.
+    EXPECT_NE(status.ToString().find("fantasia"), string::npos);
+    EXPECT_EQ(status.code(), error::Code::INTERNAL);
+  }
+  // session->Close() shall clean up all states related to the session.
+  // E.g., deregisters subgraph with workers, etc.
+  TF_CHECK_OK(session->Close());
+
+  // Sleep a bit so that most of the asynchronous work finishes before
+  // the test process finishes.
+  Env::Default()->SleepForMicroseconds(2000000);
+}
+
+TEST(GrpcSessionTest, ErrorAggregationTwoWorkerRace) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(2, 0), 2, &cluster));
+  auto& devs = cluster->devices();
+  const string& master = cluster->targets()[0];
+  // worker 1
+  const string w1_dev1 = devs[0].name();
+  const string w1_dev2 = devs[1].name();
+  // worker 2
+  const string w2_dev1 = devs[2].name();
+
+  LOG(INFO) << "master " << master << "w1_dev1 " << w1_dev1 << " w1_dev2 "
+            << w1_dev2 << " w2_dev1 " << w2_dev1;
+  GraphDef gdef;
+  std::vector<string> fetches;
+  std::vector<string> targets;
+  {
+    // Set up a graph to test the error handling when a derived error is
+    // reported to master before the original error. The expected behavior is:
+    //    1. the original error will be received by the master and reported
+    //       to the user as the error status.
+    //
+    // Setup:
+    //
+    // Worker 1 generates the original error but it delays for 5 seconds before
+    // reporting to master. Worker 2 detects the error (via Rendezvous) and
+    // reports to the master before worker 1.
+    Graph g(OpRegistry::Global());
+
+    // Worker 1. a_err runs on w1_dev1 and a_delay runs on w1_dev2.
+    auto a = test::graph::Constant(&g, Tensor(1));
+    a->set_assigned_device_name(w1_dev1);
+
+    auto a_err = test::graph::Error(&g, a, "fantasia!");
+    a_err->set_assigned_device_name(w1_dev1);
+
+    auto a_delay = test::graph::Delay(&g, a, Microseconds(5000000));
+    a_delay->set_assigned_device_name(w1_dev2);
+
+    // We need to put a_delay in targets instead of fetches. Putting
+    // a_delay in fetches will cause the executor for w1_dev2 to report failure
+    // status as well, since the Rendezvous is already poisoned by the a_err op
+    // in w1_dev1.
+    targets.push_back(a_delay->name());
+    fetches.push_back(a_err->name());
+
+    // Worker 2. b2 depends on a_err and detects the error via the rendezvous
+    // from worker 1.
+    auto b = test::graph::Constant(&g, Tensor(3));
+    b->set_assigned_device_name(w2_dev1);
+    auto b2 = test::graph::Add(&g, b, a_err);
+    b2->set_assigned_device_name(w2_dev1);
+    fetches.push_back(b2->name());
+
+    g.ToGraphDef(&gdef);
+  }
+
+  std::unique_ptr<Session> session(NewRemote(Options(master, 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(gdef));
+  {
+    std::vector<Tensor> outputs;
+    Status status = session->Run({}, fetches, targets, &outputs);
+    LOG(INFO) << status;
+    EXPECT_FALSE(status.ok());
+    // assert status contains the root error
+    EXPECT_NE(status.ToString().find("fantasia!"), string::npos);
+    // assert status does not contain cancelled error.
+    EXPECT_EQ(status.ToString().find("Cancelled"), string::npos);
+    EXPECT_EQ(status.code(), error::Code::INTERNAL);
+  }
+  // session->Close() shall clean up all states related to the session.
+  // E.g., deregisters subgraph with workers, etc.
+  TF_CHECK_OK(session->Close());
+
+  // Sleep a bit so that most of the asynchronous work finishes before
+  // the test process finishes.
+  Env::Default()->SleepForMicroseconds(2000000);
+}
+
+TEST(GrpcSessionTest, ErrorAggregationThreeWorkerRaceVariant1) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(2, 0), 3, &cluster));
+  auto& devs = cluster->devices();
+  const string& master = cluster->targets()[0];
+  // worker 1
+  const string w1_dev1 = devs[0].name();
+  const string w1_dev2 = devs[1].name();
+  // worker 2
+  const string w2_dev1 = devs[2].name();
+  // worker 3
+  const string w3_dev1 = devs[4].name();
+
+  LOG(INFO) << "master " << master << "w1_dev1 " << w1_dev1 << " w1_dev2 "
+            << w1_dev2 << " w2_dev1 " << w2_dev1 << " w3_dev1 " << w3_dev1;
+  GraphDef gdef;
+  std::vector<string> fetches;
+  std::vector<string> targets;
+  {
+    // Set up a graph to test the error handling when a derived error is
+    // reported to master before the original error and a third worker is
+    // canceled by the master. The expected behavior is that
+    //    1. the original error will be received by the master,
+    //    2. the canceled error will be treated as a derived error.
+    //
+    // Setup:
+    //
+    // Worker 1 generates the original error but it delays for 5 seconds before
+    // reporting to master. Worker 2 detects the error (via Rendezvous) and
+    // reports to the master before worker 1. Worker 3 runs a delay op and will
+    // be canceled by the master.
+    Graph g(OpRegistry::Global());
+
+    // Worker 1. a_err runs on w1_dev1 and a_delay runs on w1_dev2.
+    auto a = test::graph::Constant(&g, Tensor(1));
+    a->set_assigned_device_name(w1_dev1);
+
+    auto a_err = test::graph::Error(&g, a, "fantasia!");
+    a_err->set_assigned_device_name(w1_dev1);
+
+    auto a_delay = test::graph::Delay(&g, a, Microseconds(5000000));
+    a_delay->set_assigned_device_name(w1_dev2);
+
+    // Putting a_delay in fetches would cause the executor for w1_dev2 to
+    // report a failure status as well (via the implicit SendOp, since the
+    // Rendezvous is already poisoned by a_err on w1_dev1), so use targets.
+    targets.push_back(a_delay->name());
+    fetches.push_back(a_err->name());
+
+    // Worker 2. b2 depends on a_err and detects the error via the rendezvous
+    // from worker 1.
+    auto b = test::graph::Constant(&g, Tensor(3));
+    b->set_assigned_device_name(w2_dev1);
+    auto b2 = test::graph::Add(&g, b, a_err);
+    b2->set_assigned_device_name(w2_dev1);
+    fetches.push_back(b2->name());
+
+    // Worker 3. Runs only a delay op. This worker will be cancelled by master
+    // when the master receives the root error from Worker 1.
+    auto c = test::graph::Constant(&g, Tensor(3));
+    c->set_assigned_device_name(w3_dev1);
+    auto c_delay = test::graph::Delay(&g, c, Microseconds(4000000));
+    c_delay->set_assigned_device_name(w3_dev1);
+
+    // Put c_delay in targets so that an implicit SendOp for c_delay to
+    // worker 1 is not generated.
+    targets.push_back(c_delay->name());
+
+    g.ToGraphDef(&gdef);
+  }
+
+  std::unique_ptr<Session> session(NewRemote(Options(master, 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(gdef));
+  {
+    std::vector<Tensor> outputs;
+    Status status = session->Run({}, fetches, targets, &outputs);
+    LOG(INFO) << status;
+    EXPECT_FALSE(status.ok());
+    // assert status contains the root error
+    EXPECT_NE(status.ToString().find("fantasia!"), string::npos);
+    // assert status does not contain cancelled or aborted error.
+    EXPECT_EQ(status.ToString().find("Cancelled"), string::npos);
+    EXPECT_EQ(status.ToString().find("Aborted"), string::npos);
+    EXPECT_EQ(status.code(), error::Code::INTERNAL);
+  }
+  // session->Close() shall clean up all state related to the session.
+  // E.g., deregisters subgraph with workers, etc.
+  TF_CHECK_OK(session->Close());
+
+  // Sleep a bit so that most of the asynchronous work finishes before
+  // the test process exits.
+  Env::Default()->SleepForMicroseconds(2000000);
+}
+
+TEST(GrpcSessionTest, ErrorAggregationThreeWorkerRaceVariant2) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(2, 0), 3, &cluster));
+  auto& devs = cluster->devices();
+  const string& master = cluster->targets()[0];
+  // worker 1
+  const string w1_dev1 = devs[0].name();
+  const string w1_dev2 = devs[1].name();
+  // worker 2
+  const string w2_dev1 = devs[2].name();
+  // worker 3
+  const string w3_dev1 = devs[4].name();
+
+  LOG(INFO) << "master " << master << "w1_dev1 " << w1_dev1 << " w1_dev2 "
+            << w1_dev2 << " w2_dev1 " << w2_dev1 << " w3_dev1 " << w3_dev1;
+  GraphDef gdef;
+  std::vector<string> fetches;
+  std::vector<string> targets;
+  {
+    // Set up a graph to test the error handling when a derived error is
+    // reported to master before the original error and a third worker is
+    // aborted from an implicit SendOp. The expected behavior is that
+    //    1. the original error will be received by the master,
+    //    2. the aborted error will be treated as a derived error.
+    //
+    // Setup:
+    //
+    // Worker 1 generates the original error but it delays for 5 seconds before
+    // reporting to master. Worker 2 detects the error (via Rendezvous) and
+    // reports to the master before worker 1. Worker 3 runs a delay op and an
+    // implicit SendOp (for sending tensor c_delay to Worker 1) and will be
+    // aborted by worker 1.
+    Graph g(OpRegistry::Global());
+
+    // Worker 1. a_err runs on w1_dev1 and a_delay runs on w1_dev2.
+    auto a = test::graph::Constant(&g, Tensor(1));
+    a->set_assigned_device_name(w1_dev1);
+
+    auto a_err = test::graph::Error(&g, a, "fantasia!");
+    a_err->set_assigned_device_name(w1_dev1);
+
+    auto a_delay = test::graph::Delay(&g, a, Microseconds(5000000));
+    a_delay->set_assigned_device_name(w1_dev2);
+
+    // Putting a_delay in fetches would cause the executor for w1_dev2 to
+    // report a failure status as well (via the implicit SendOp, since the
+    // Rendezvous is already poisoned by a_err on w1_dev1), so use targets.
+    targets.push_back(a_delay->name());
+    fetches.push_back(a_err->name());
+
+    // Worker 2. b2 depends on a_err and detects the error via the rendezvous
+    // from worker 1.
+    auto b = test::graph::Constant(&g, Tensor(3));
+    b->set_assigned_device_name(w2_dev1);
+    auto b2 = test::graph::Add(&g, b, a_err);
+    b2->set_assigned_device_name(w2_dev1);
+    fetches.push_back(b2->name());
+
+    // Worker 3. Runs only a delay op. Because c_delay is fetched (below), an
+    // implicit SendOp to Worker 1 is generated and this worker gets aborted.
+    auto c = test::graph::Constant(&g, Tensor(3));
+    c->set_assigned_device_name(w3_dev1);
+    auto c_delay = test::graph::Delay(&g, c, Microseconds(4000000));
+    c_delay->set_assigned_device_name(w3_dev1);
+
+    // Put c_delay in fetches so that an implicit SendOp for c_delay to worker 1
+    // is generated.
+    fetches.push_back(c_delay->name());
+
+    g.ToGraphDef(&gdef);
+  }
+
+  std::unique_ptr<Session> session(NewRemote(Options(master, 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(gdef));
+  {
+    std::vector<Tensor> outputs;
+    Status status = session->Run({}, fetches, targets, &outputs);
+    LOG(INFO) << status;
+    EXPECT_FALSE(status.ok());
+    // assert status contains the root error
+    EXPECT_NE(status.ToString().find("fantasia!"), string::npos);
+    // assert status does not contain cancelled or aborted error.
+    EXPECT_EQ(status.ToString().find("Cancelled"), string::npos);
+    EXPECT_EQ(status.ToString().find("Aborted"), string::npos);
+    EXPECT_EQ(status.code(), error::Code::INTERNAL);
+  }
+  // session->Close() shall clean up all state related to the session.
+  // E.g., deregisters subgraph with workers, etc.
+  TF_CHECK_OK(session->Close());
+
+  // Sleep a bit so that most of the asynchronous work finishes before
+  // the test process exits.
+  Env::Default()->SleepForMicroseconds(2000000);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 83b4189..1abfdf9 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -20,11 +20,11 @@
 
 #include "grpcpp/generic/generic_stub.h"
 #include "grpcpp/grpcpp.h"
-
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
@@ -133,6 +133,13 @@
                  strings::StrCat(s.error_message(),
                                  "\nAdditional GRPC error information:\n",
                                  context_->debug_error_string()));
+      // Always treat gRPC cancellation as a derived error. This ensures that
+      // other error types are preferred during status aggregation. (gRPC
+      // cancellation messages do not contain the original status message).
+      if (s.code() == tensorflow::error::Code::CANCELLED) {
+        s = StatusGroup::MakeDerived(s);
+      }
+
       done_(s);
       delete this;
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 14f0887..9fa7328 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -186,27 +186,35 @@
            RequestMessage, ResponseMessage>;
 
   // Handle all non-cancellable simple methods with a standard wrapper.
-#define HANDLE_CALL(method)                                                   \
+  // The boolean `may_block_on_compute_pool` indicates whether or not the
+  // operation may block on activities (such as op execution) that run on the
+  // compute pool.
+#define HANDLE_CALL(method, may_block_on_compute_pool)                        \
   void method##Handler(WorkerCall<method##Request, method##Response>* call) { \
-    Schedule([this, call]() {                                                 \
+    auto closure = [this, call]() {                                           \
       Status s = worker_->method(&call->request, &call->response);            \
       if (!s.ok()) {                                                          \
         VLOG(1) << "Bad response from " << #method << ": " << s;              \
       }                                                                       \
       call->SendResponse(ToGrpcStatus(s));                                    \
-    });                                                                       \
+    };                                                                        \
+    if ((may_block_on_compute_pool)) {                                        \
+      worker_->env()->env->SchedClosure(std::move(closure));                  \
+    } else {                                                                  \
+      worker_->env()->compute_pool->Schedule(std::move(closure));             \
+    }                                                                         \
     ENQUEUE_REQUEST(method, false);                                           \
   }
 
-  HANDLE_CALL(GetStatus);
-  HANDLE_CALL(CreateWorkerSession);
-  HANDLE_CALL(DeleteWorkerSession);
-  HANDLE_CALL(CleanupAll);
-  HANDLE_CALL(RegisterGraph);
-  HANDLE_CALL(DeregisterGraph);
-  HANDLE_CALL(CleanupGraph);
-  HANDLE_CALL(Logging);
-  HANDLE_CALL(Tracing);
+  HANDLE_CALL(GetStatus, false);
+  HANDLE_CALL(CreateWorkerSession, false);
+  HANDLE_CALL(DeleteWorkerSession, true);
+  HANDLE_CALL(CleanupAll, false);
+  HANDLE_CALL(RegisterGraph, false);
+  HANDLE_CALL(DeregisterGraph, false);
+  HANDLE_CALL(CleanupGraph, false);
+  HANDLE_CALL(Logging, false);
+  HANDLE_CALL(Tracing, false);
 
 #undef HANDLE_CALL
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
index b070dd1..28ac30d 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
@@ -71,7 +71,7 @@
         worker_session_("rpc_session", "/job:mnist/replica:1/task:2",
                         std::unique_ptr<WorkerCacheInterface>(cache_),
                         std::unique_ptr<DeviceMgr>(),
-                        std::unique_ptr<GraphMgr>()),
+                        std::unique_ptr<GraphMgr>(), nullptr),
         rmgr_(&env) {
     env.env = Env::Default();
   }
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 868f0f8..ace4e45 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -20,6 +20,7 @@
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/remote_device.h"
 #include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/cluster.pb.h"
@@ -40,18 +41,27 @@
               new WorkerCacheWrapper(default_worker_cache_.get())),
           worker_env->device_mgr,
           std::unique_ptr<GraphMgr>(
-              new GraphMgr(worker_env, worker_env->device_mgr)))),
+              new GraphMgr(worker_env, worker_env->device_mgr)),
+          nullptr)),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 /* static */
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
-  return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
-                         server_def.task_index());
+  return strings::StrCat("/job:", server_def.job_name(),
+                         "/replica:0/task:", server_def.task_index());
 }
 
 Status SessionMgr::CreateSession(const string& session,
                                  const ServerDef& server_def,
                                  bool isolate_session_state) {
+  return CreateSession(session, server_def, {}, isolate_session_state);
+}
+
+Status SessionMgr::CreateSession(
+    const string& session, const ServerDef& server_def,
+    const protobuf::RepeatedPtrField<DeviceAttributes>&
+        cluster_device_attributes,
+    bool isolate_session_state) {
   mutex_lock l(mu_);
   if (session.empty()) {
     return errors::InvalidArgument("Session must be non-empty.");
@@ -76,6 +86,14 @@
 
   std::shared_ptr<WorkerSession> worker_session;
 
+  std::unique_ptr<DeviceMgr> remote_devices;
+  if (!cluster_device_attributes.empty()) {
+    std::vector<std::unique_ptr<Device>> cluster_devices;
+    tensorflow::AsRemoteDevices(worker_env_->env, cluster_device_attributes,
+                                &cluster_devices);
+    remote_devices.reset(new DeviceMgr(std::move(cluster_devices)));
+  }
+
   if (isolate_session_state || server_def.cluster().job_size()) {
     if (server_def.cluster().job_size()) {
       VLOG(1) << "ClusterSpec propagation is enabled.";
@@ -96,16 +114,18 @@
     worker_session.reset(
         new WorkerSession(session, worker_name,
                           std::unique_ptr<WorkerCacheInterface>(worker_cache),
-                          std::move(device_mgr), std::move(graph_mgr)));
+                          std::move(device_mgr), std::move(graph_mgr),
+                          std::move(remote_devices)));
   } else {
-    // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so
+    // Borrow the WorkerEnv's DeviceMgr for the WorkerSession, so
     // that resources using it can use its devices after the
     // WorkerSession has been deleted.
     auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
     worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
         session, worker_name,
         std::unique_ptr<WorkerCacheInterface>(worker_cache),
-        worker_env_->device_mgr, std::move(graph_mgr));
+        worker_env_->device_mgr, std::move(graph_mgr),
+        std::move(remote_devices));
   }
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 04d1d61..22bbe82 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -48,6 +48,10 @@
   // Allocates state for a new session.
   Status CreateSession(const string& session, const ServerDef& server_def,
                        bool isolate_session_state);
+  Status CreateSession(
+      const string& session, const ServerDef& server_def,
+      const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes,
+      bool isolate_session_state);
 
   // Locates the worker session for a given session handle
   Status WorkerSessionForSession(const string& session_handle,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 6a78b4e..47c0651 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -45,9 +45,9 @@
 void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
                                       CreateWorkerSessionResponse* response,
                                       StatusCallback done) {
-  Status s = env_->session_mgr->CreateSession(request->session_handle(),
-                                              request->server_def(),
-                                              request->isolate_session_state());
+  Status s = env_->session_mgr->CreateSession(
+      request->session_handle(), request->server_def(),
+      request->cluster_device_attributes(), request->isolate_session_state());
   done(s);
 }
 
@@ -72,7 +72,7 @@
   }
   if (s.ok()) {
     s = session->graph_mgr->Register(
-        request->session_handle(), request->graph_def(),
+        request->session_handle(), request->graph_def(), session.get(),
         request->graph_options(), request->debug_options(),
         request->collective_graph_key(), session->cluster_flr.get(),
         response->mutable_graph_handle());
@@ -196,6 +196,7 @@
   }
   CancellationManager* cm = new CancellationManager;
   opts->SetCancelCallback([this, cm, step_id]() {
+    LOG(INFO) << "Cancellation requested for RunGraph.";
     cm->StartCancel();
     AbortStep(step_id);
   });
@@ -293,6 +294,7 @@
 
   // Before we start doing anything, we set the RPC cancellation.
   opts->SetCancelCallback([this, cm, step_id]() {
+    LOG(INFO) << "Cancellation requested for PartialRunGraph.";
     cm->StartCancel();
     AbortStep(step_id);
   });
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index c7d0c6b..1a716c6 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -14,6 +14,8 @@
 ==============================================================================*/
 #include "tensorflow/core/distributed_runtime/worker_session.h"
 
+#include "tensorflow/core/platform/monitoring.h"
+
 namespace tensorflow {
 
 namespace {
@@ -96,39 +98,55 @@
                              const string& worker_name,
                              std::unique_ptr<WorkerCacheInterface> worker_cache,
                              std::unique_ptr<DeviceMgr> device_mgr,
-                             std::unique_ptr<GraphMgr> graph_mgr)
+                             std::unique_ptr<GraphMgr> graph_mgr,
+                             std::unique_ptr<DeviceMgr> remote_device_mgr)
     : session_name(session_name),
       worker_name(worker_name),
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
       graph_mgr(std::move(graph_mgr)),
-      cluster_flr(
-          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      cluster_flr(new ClusterFunctionLibraryRuntime(
+          this, !session_name.empty(),
+          remote_device_mgr ? remote_device_mgr.get() : nullptr)),
       device_mgr_(std::move(device_mgr)),
-      borrowed_device_mgr_(nullptr) {}
+      borrowed_device_mgr_(nullptr),
+      remote_device_mgr_(std::move(remote_device_mgr)) {
+  // Starts exporting metrics through a platform-specific monitoring API (if
+  // provided). For builds using "tensorflow/core/platform/default", this is
+  // currently a no-op.
+  monitoring::StartExporter();
+}
 
 /* static */
 std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
     const string& session_name, const string& worker_name,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
-    DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr) {
-  return std::shared_ptr<WorkerSession>(
-      new WorkerSession(session_name, worker_name, std::move(worker_cache),
-                        borrowed_device_mgr, std::move(graph_mgr)));
+    DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
+    std::unique_ptr<DeviceMgr> remote_device_mgr) {
+  return std::shared_ptr<WorkerSession>(new WorkerSession(
+      session_name, worker_name, std::move(worker_cache), borrowed_device_mgr,
+      std::move(graph_mgr), std::move(remote_device_mgr)));
 }
 
 WorkerSession::WorkerSession(const string& session_name,
                              const string& worker_name,
                              std::unique_ptr<WorkerCacheInterface> worker_cache,
                              DeviceMgr* borrowed_device_mgr,
-                             std::unique_ptr<GraphMgr> graph_mgr)
+                             std::unique_ptr<GraphMgr> graph_mgr,
+                             std::unique_ptr<DeviceMgr> remote_device_mgr)
     : session_name(session_name),
       worker_name(worker_name),
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
       graph_mgr(std::move(graph_mgr)),
-      cluster_flr(
-          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      cluster_flr(new ClusterFunctionLibraryRuntime(this, !session_name.empty(),
+                                                    remote_device_mgr.get())),
       device_mgr_(nullptr),
-      borrowed_device_mgr_(borrowed_device_mgr) {}
+      borrowed_device_mgr_(borrowed_device_mgr),
+      remote_device_mgr_(std::move(remote_device_mgr)) {
+  // Starts exporting metrics through a platform-specific monitoring API (if
+  // provided). For builds using "tensorflow/core/platform/default", this is
+  // currently a no-op.
+  monitoring::StartExporter();
+}
 
 WorkerSession::~WorkerSession() {
   if (graph_mgr) {
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index f1faf49..90b656f 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -60,12 +60,14 @@
   WorkerSession(const string& session_name, const string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 std::unique_ptr<DeviceMgr> device_mgr,
-                std::unique_ptr<GraphMgr> graph_mgr);
+                std::unique_ptr<GraphMgr> graph_mgr,
+                std::unique_ptr<DeviceMgr> remote_device_mgr);
 
   static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr(
       const string& session_name, const string& worker_name,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
-      DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr);
+      DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
+      std::unique_ptr<DeviceMgr> remote_device_mgr);
 
   ~WorkerSession();
 
@@ -73,10 +75,12 @@
   WorkerSession(const string& session_name, const string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 DeviceMgr* borrowed_device_mgr,
-                std::unique_ptr<GraphMgr> graph_mgr);
+                std::unique_ptr<GraphMgr> graph_mgr,
+                std::unique_ptr<DeviceMgr> remote_device_mgr);
 
   const std::unique_ptr<DeviceMgr> device_mgr_;
   DeviceMgr* const borrowed_device_mgr_;  // Not owned.
+  const std::unique_ptr<DeviceMgr> remote_device_mgr_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/example/feature_util.cc b/tensorflow/core/example/feature_util.cc
index f0593ed..16a508b 100644
--- a/tensorflow/core/example/feature_util.cc
+++ b/tensorflow/core/example/feature_util.cc
@@ -103,6 +103,21 @@
 }
 
 template <>
+void ClearFeatureValues<protobuf_int64>(Feature* feature) {
+  feature->mutable_int64_list()->Clear();
+}
+
+template <>
+void ClearFeatureValues<float>(Feature* feature) {
+  feature->mutable_float_list()->Clear();
+}
+
+template <>
+void ClearFeatureValues<string>(Feature* feature) {
+  feature->mutable_bytes_list()->Clear();
+}
+
+template <>
 Features* GetFeatures<Features>(Features* proto) {
   return proto;
 }
diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index 32c6247..2cb895c 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -75,11 +75,13 @@
 //     FeatureType, belongs to the Features or Example proto.
 //   HasFeatureList(key, sequence_example) -> bool
 //     Returns true if SequenceExample has a feature_list with the key.
+//
 //   GetFeatureValues<FeatureType>(key, proto) -> RepeatedField<FeatureType>
 //     Returns values for the specified key and the FeatureType.
 //     Supported types for the proto: Example, Features.
 //   GetFeatureList(key, sequence_example) -> RepeatedPtrField<Feature>
 //     Returns Feature protos associated with a key.
+//
 //   AppendFeatureValues(begin, end, feature)
 //   AppendFeatureValues(container or initializer_list, feature)
 //     Copies values into a Feature.
@@ -87,6 +89,17 @@
 //   AppendFeatureValues(container or initializer_list, key, proto)
 //     Copies values into Features and Example protos with the specified key.
 //
+//   ClearFeatureValues<FeatureType>(feature)
+//     Clears the feature's repeated field of the given type.
+//
+//   SetFeatureValues(begin, end, feature)
+//   SetFeatureValues(container or initializer_list, feature)
+//     Clears a Feature, then copies values into it.
+//   SetFeatureValues(begin, end, key, proto)
+//   SetFeatureValues(container or initializer_list, key, proto)
+//     Clears the feature with the specified key in a Features or Example
+//     proto, then copies values into it.
+//
 // Auxiliary functions, it is unlikely you'll need to use them directly:
 //   GetFeatures(proto) -> Features
 //     A convenience function to get Features proto.
@@ -307,6 +320,67 @@
                                     proto);
 }
 
+// Clears the feature's repeated field (int64, float, or string).
+template <typename... FeatureType>
+void ClearFeatureValues(Feature* feature);
+
+// Clears the feature's repeated field (int64, float, or string). Copies
+// elements from the range, defined by [first, last) into the feature's repeated
+// field.
+template <typename IteratorType>
+void SetFeatureValues(IteratorType first, IteratorType last, Feature* feature) {
+  using FeatureType = typename internal::FeatureTrait<
+      typename std::iterator_traits<IteratorType>::value_type>::Type;
+  ClearFeatureValues<FeatureType>(feature);
+  AppendFeatureValues(first, last, feature);
+}
+
+// Clears the feature's repeated field (int64, float, or string). Copies all
+// elements from the initializer list into the feature's repeated field.
+template <typename ValueType>
+void SetFeatureValues(std::initializer_list<ValueType> container,
+                      Feature* feature) {
+  SetFeatureValues(container.begin(), container.end(), feature);
+}
+
+// Clears the feature's repeated field (int64, float, or string). Copies all
+// elements from the container into the feature's repeated field.
+template <typename ContainerType>
+void SetFeatureValues(const ContainerType& container, Feature* feature) {
+  using IteratorType = typename ContainerType::const_iterator;
+  SetFeatureValues<IteratorType>(container.begin(), container.end(), feature);
+}
+
+// Clears the repeated field (int64, float, or string) of the feature stored
+// under `key` in the proto. Copies elements from the range, defined by
+// [first, last) into that repeated field.
+template <typename IteratorType, typename ProtoType>
+void SetFeatureValues(IteratorType first, IteratorType last, const string& key,
+                      ProtoType* proto) {
+  SetFeatureValues(first, last, GetFeature(key, GetFeatures(proto)));
+}
+
+// Clears the repeated field (int64, float, or string) of the feature stored
+// under `key` in the proto. Copies all container elements into that field.
+template <typename ContainerType, typename ProtoType>
+void SetFeatureValues(const ContainerType& container, const string& key,
+                      ProtoType* proto) {
+  using IteratorType = typename ContainerType::const_iterator;
+  SetFeatureValues<IteratorType>(container.begin(), container.end(), key,
+                                 proto);
+}
+
+// Clears the repeated field (int64, float, or string) of the feature stored
+// under `key` in the proto. Copies all initializer list elements into it.
+template <typename ValueType, typename ProtoType>
+void SetFeatureValues(std::initializer_list<ValueType> container,
+                      const string& key, ProtoType* proto) {
+  using IteratorType =
+      typename std::initializer_list<ValueType>::const_iterator;
+  SetFeatureValues<IteratorType>(container.begin(), container.end(), key,
+                                 proto);
+}
+
 // Returns true if a feature with the specified key belongs to the Features.
 // The template parameter pack accepts zero or one template argument - which
 // is FeatureType. If the FeatureType not specified (zero template arguments)
diff --git a/tensorflow/core/example/feature_util_test.cc b/tensorflow/core/example/feature_util_test.cc
index 53d3634..869d786 100644
--- a/tensorflow/core/example/feature_util_test.cc
+++ b/tensorflow/core/example/feature_util_test.cc
@@ -256,6 +256,20 @@
   EXPECT_NEAR(3.3, tag_ro.Get(2), kTolerance);
 }
 
+TEST(SetFeatureValuesTest, FloatValuesUsingInitializerList) {
+  Example example;
+
+  // The first set of values should be overwritten by the second.
+  AppendFeatureValues({1.1, 2.2, 3.3}, "tag", &example);
+  SetFeatureValues({10.1, 20.2, 30.3}, "tag", &example);
+
+  auto tag_ro = GetFeatureValues<float>("tag", example);
+  ASSERT_EQ(3, tag_ro.size());
+  EXPECT_NEAR(10.1, tag_ro.Get(0), kTolerance);
+  EXPECT_NEAR(20.2, tag_ro.Get(1), kTolerance);
+  EXPECT_NEAR(30.3, tag_ro.Get(2), kTolerance);
+}
+
 TEST(AppendFeatureValuesTest, Int64ValuesUsingInitializerList) {
   Example example;
 
@@ -466,5 +480,97 @@
             "}\n");
 }
 
+TEST(SequenceExampleTest, SetContextFeatureValuesWithInitializerList) {
+  SequenceExample se;
+
+  // The first set of values should be overwritten by the second.
+  SetFeatureValues({101, 102, 103}, "ids", se.mutable_context());
+  SetFeatureValues({1, 2, 3}, "ids", se.mutable_context());
+
+  // These values should be appended without overwriting.
+  AppendFeatureValues({4, 5, 6}, "ids", se.mutable_context());
+
+  EXPECT_EQ(se.DebugString(),
+            "context {\n"
+            "  feature {\n"
+            "    key: \"ids\"\n"
+            "    value {\n"
+            "      int64_list {\n"
+            "        value: 1\n"
+            "        value: 2\n"
+            "        value: 3\n"
+            "        value: 4\n"
+            "        value: 5\n"
+            "        value: 6\n"
+            "      }\n"
+            "    }\n"
+            "  }\n"
+            "}\n");
+}
+
+TEST(SequenceExampleTest, SetFeatureValuesWithInitializerList) {
+  SequenceExample se;
+
+  // The first set of values should be overwritten by the second.
+  AppendFeatureValues({1, 2, 3}, "ids", se.mutable_context());
+  SetFeatureValues({4, 5, 6}, "ids", se.mutable_context());
+
+  // Two distinct features are added to the same feature list, so both will
+  // coexist in the output.
+  AppendFeatureValues({"cam1-0", "cam2-0"},
+                      GetFeatureList("images", &se)->Add());
+  SetFeatureValues({"cam1-1", "cam2-1"}, GetFeatureList("images", &se)->Add());
+
+  // The first set of values should be overwritten by the second.
+  AppendFeatureValues({"cam1-0", "cam2-0"},
+                      GetFeatureList("more-images", &se)->Add());
+  SetFeatureValues({"cam1-1", "cam2-1"},
+                   GetFeatureList("more-images", &se)->Mutable(0));
+
+  EXPECT_EQ(se.DebugString(),
+            "context {\n"
+            "  feature {\n"
+            "    key: \"ids\"\n"
+            "    value {\n"
+            "      int64_list {\n"
+            "        value: 4\n"
+            "        value: 5\n"
+            "        value: 6\n"
+            "      }\n"
+            "    }\n"
+            "  }\n"
+            "}\n"
+            "feature_lists {\n"
+            "  feature_list {\n"
+            "    key: \"images\"\n"
+            "    value {\n"
+            "      feature {\n"
+            "        bytes_list {\n"
+            "          value: \"cam1-0\"\n"
+            "          value: \"cam2-0\"\n"
+            "        }\n"
+            "      }\n"
+            "      feature {\n"
+            "        bytes_list {\n"
+            "          value: \"cam1-1\"\n"
+            "          value: \"cam2-1\"\n"
+            "        }\n"
+            "      }\n"
+            "    }\n"
+            "  }\n"
+            "  feature_list {\n"
+            "    key: \"more-images\"\n"
+            "    value {\n"
+            "      feature {\n"
+            "        bytes_list {\n"
+            "          value: \"cam1-1\"\n"
+            "          value: \"cam2-1\"\n"
+            "        }\n"
+            "      }\n"
+            "    }\n"
+            "  }\n"
+            "}\n");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index dd2d516..fb4f890 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -18,7 +18,6 @@
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
-#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -41,22 +40,6 @@
 
 Allocator::~Allocator() {}
 
-void RunResourceCtor(ResourceHandle* p, size_t n) {
-  for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
-}
-
-void RunResourceDtor(ResourceHandle* p, size_t n) {
-  for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
-}
-
-void Allocator::RunVariantCtor(Variant* p, size_t n) {
-  for (size_t i = 0; i < n; ++p, ++i) new (p) Variant();
-}
-
-void Allocator::RunVariantDtor(Variant* p, size_t n) {
-  for (size_t i = 0; i < n; ++p, ++i) p->~Variant();
-}
-
 // If true, cpu allocator collects more stats.
 static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index ad1d13c..861e855 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -18,25 +18,31 @@
 
 #include <stdlib.h>
 
+#include <functional>
 #include <limits>
 
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-class Variant;
-
 // Attributes for a single allocation call. Different calls to the same
 // allocator could potentially have different allocation attributes.
 struct AllocationAttributes {
+  AllocationAttributes() = default;
+
+  AllocationAttributes(bool no_retry_on_failure, bool allocation_will_be_logged,
+                       std::function<uint64()>* freed_by_func)
+      : no_retry_on_failure(no_retry_on_failure),
+        allocation_will_be_logged(allocation_will_be_logged),
+        freed_by_func(freed_by_func) {}
+
   // If the first attempt to allocate the memory fails, the allocation
   // should return immediately without retrying.
   // An example use case is optional scratch spaces where a failure
@@ -49,9 +55,11 @@
   // true.
   bool allocation_will_be_logged = false;
   // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
-  // a memory chunk whose last-freed count is at this value or earlier may be
+  // a memory chunk whose freed_at_count is at this value or earlier may be
   // returned.
-  std::function<uint64()> freed_by_func = nullptr;
+  std::function<uint64()>* freed_by_func = nullptr;  // Not owned.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
 };
 
 // Runtime statistics collected by an allocator. Exactly the same as
@@ -67,11 +75,20 @@
   // is known.
   absl::optional<int64> bytes_limit;
 
+  // Stats for reserved memory usage.
+  int64 bytes_reserved;       // Number of bytes reserved.
+  int64 peak_bytes_reserved;  // The peak number of bytes reserved.
+  // The upper limit on the number of bytes of reservable memory,
+  // if such a limit is known.
+  absl::optional<int64> bytes_reservable_limit;
+
   AllocatorStats()
       : num_allocs(0),
         bytes_in_use(0),
         peak_bytes_in_use(0),
-        largest_alloc_size(0) {}
+        largest_alloc_size(0),
+        bytes_reserved(0),
+        peak_bytes_reserved(0) {}
 
   string DebugString() const;
 };
@@ -109,51 +126,25 @@
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
 
-  // Convenience functions to do typed allocation.  C++ constructors
-  // and destructors are invoked for complex types if necessary,
-  // depending on the concrete Allocator implementation. May return
-  // NULL if the tensor has too many elements to represent in a single
-  // allocation.
-  template <typename T>
-  T* Allocate(size_t num_elements) {
-    return Allocate<T>(num_elements, AllocationAttributes());
-  }
-
-  template <typename T>
-  T* Allocate(size_t num_elements,
-              const AllocationAttributes& allocation_attr) {
-    // TODO(jeff): Do we need to allow clients to pass in alignment
-    // requirements?
-
-    if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
-      return NULL;
-    }
-
-    void* p = AllocateRaw(kAllocatorAlignment, sizeof(T) * num_elements,
-                          allocation_attr);
-    T* typed_p = reinterpret_cast<T*>(p);
-    if (typed_p) RunCtor<T>(typed_p, num_elements);
-    return typed_p;
-  }
-
-  template <typename T>
-  void Deallocate(T* ptr, size_t num_elements) {
-    if (ptr) {
-      RunDtor<T>(ptr, num_elements);
-      DeallocateRaw(ptr);
-    }
-  }
-
   // Returns true if this allocator tracks the sizes of allocations.
   // RequestedSize and AllocatedSize must be overridden if
   // TracksAllocationSizes is overridden to return true.
   virtual bool TracksAllocationSizes() const { return false; }
 
-  // Returns true if this allocator requires tensors with 0 elements
-  // to allocate buffers. This is false for most allocators, but may
-  // be used by special-case allocators that want to track tensor
-  // usage.
-  virtual bool ShouldAllocateEmptyTensors() const { return false; }
+  // Returns true if this allocator allocates an opaque handle rather than the
+  // requested number of bytes.
+  //
+  // This method returns false for most allocators, but may be used by
+  // special-case allocators that track tensor usage. If this method returns
+  // true, AllocateRaw() should be invoked for all values of `num_bytes`,
+  // including 0.
+  //
+  // NOTE: It is the caller's responsibility to track whether an allocated
+  // object is a buffer or an opaque handle. In particular, when this method
+  // returns `true`, users of this allocator must not run any constructors or
+  // destructors for complex objects, since there is no backing store for the
+  // tensor in which to place their outputs.
+  virtual bool AllocatesOpaqueHandle() const { return false; }
 
   // Returns the user-requested size of the data allocated at
   // 'ptr'.  Note that the actual buffer allocated might be larger
@@ -211,79 +202,9 @@
   // Clears the internal stats except for the `in_use` field.
   virtual void ClearStats() {}
 
- private:
-  // No constructors or destructors are run for simple types
-  template <typename T>
-  void RunCtor(T* p, size_t n) {
-    static_assert(is_simple_type<T>::value, "T is not a simple type.");
-  }
-
-  template <typename T>
-  void RunDtor(T* p, size_t n) {}
-
-  // custom constructors and destructors that can be overridden for
-  // non-standard allocators
-
-  // Runs string's default constructor for  p[0], p[1], ..., p[n-1].
-  virtual void RunStringCtor(string* p, size_t n) {
-    for (size_t i = 0; i < n; ++p, ++i) new (p) string();
-  }
-
-  // Runs string's default destructor for  p[0], p[1], ..., p[n-1].
-  virtual void RunStringDtor(string* p, size_t n) {
-    for (size_t i = 0; i < n; ++p, ++i) p->~string();
-  }
-
-  virtual void RunResourceCtor(ResourceHandle* p, size_t n) {
-    for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
-  }
-
-  // Runs string's default destructor for  p[0], p[1], ..., p[n-1].
-  virtual void RunResourceDtor(ResourceHandle* p, size_t n) {
-    for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
-  }
-
-  virtual void RunVariantCtor(Variant* p, size_t n);
-
-  virtual void RunVariantDtor(Variant* p, size_t n);
-
-  // TODO(jeff): Maybe provide some interface to give info about
-  // current allocation state (total number of bytes available for
-  // allocation, number of bytes free on device, etc.)
+  virtual void SetSafeFrontier(uint64 count) {}
 };
 
-// Allocator-specific constructors and destructors are used for
-// strings
-template <>
-inline void Allocator::RunCtor(string* p, size_t n) {
-  RunStringCtor(p, n);
-}
-
-template <>
-inline void Allocator::RunDtor(string* p, size_t n) {
-  RunStringDtor(p, n);
-}
-
-template <>
-inline void Allocator::RunCtor(ResourceHandle* p, size_t n) {
-  RunResourceCtor(p, n);
-}
-
-template <>
-inline void Allocator::RunDtor(ResourceHandle* p, size_t n) {
-  RunResourceDtor(p, n);
-}
-
-template <>
-inline void Allocator::RunCtor(Variant* p, size_t n) {
-  RunVariantCtor(p, n);
-}
-
-template <>
-inline void Allocator::RunDtor(Variant* p, size_t n) {
-  RunVariantDtor(p, n);
-}
-
 // An implementation of Allocator that delegates all calls to another Allocator.
 //
 // Useful to clients who want to override part of the functionality of another
@@ -314,8 +235,8 @@
     return wrapped_->TracksAllocationSizes();
   }
 
-  bool ShouldAllocateEmptyTensors() const override {
-    return wrapped_->TracksAllocationSizes();
+  bool AllocatesOpaqueHandle() const override {
+    return wrapped_->AllocatesOpaqueHandle();
   }
 
   size_t RequestedSize(const void* ptr) const override {
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index d9f3280..85b0fba 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -48,6 +48,7 @@
 // framework.  This definition allows us to access the one method we need.
 class ProcessStateInterface {
  public:
+  virtual ~ProcessStateInterface() {}
   virtual Allocator* GetCPUAllocator(int numa_node) = 0;
 };
 
diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index 85e8ba6..3d03b2d 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -102,14 +103,14 @@
     a->DeallocateRaw(ptrs[i]);
   }
   CheckStats(a, 1023, 0, 552640, 1024);
-  float* t1 = a->Allocate<float>(1024);
-  double* t2 = a->Allocate<double>(1048576);
+  float* t1 = TypedAllocator::Allocate<float>(a, 1024, {});
+  double* t2 = TypedAllocator::Allocate<double>(a, 1048576, {});
   CheckStats(a, 1025, 1048576 * sizeof(double) + 1024 * sizeof(float),
              1048576 * sizeof(double) + 1024 * sizeof(float),
              1048576 * sizeof(double));
 
-  a->Deallocate(t1, 1024);
-  a->Deallocate(t2, 1048576);
+  TypedAllocator::Deallocate(a, t1, 1024);
+  TypedAllocator::Deallocate(a, t2, 1048576);
 
   CheckStats(a, 1025, 0, 1048576 * sizeof(double) + 1024 * sizeof(float),
              1048576 * sizeof(double));
@@ -130,7 +131,8 @@
 
   // The maximum size_t value will definitely overflow.
   size_t count_to_allocate = std::numeric_limits<size_t>::max();
-  TestStruct* const test_pointer = a->Allocate<TestStruct>(count_to_allocate);
+  TestStruct* const test_pointer =
+      TypedAllocator::Allocate<TestStruct>(a, count_to_allocate, {});
 
   CHECK_EQ(test_pointer, reinterpret_cast<TestStruct*>(NULL));
 }
@@ -141,7 +143,8 @@
   // count_to_allocate is the smallest count that will cause overflow.
   const size_t count_to_allocate =
       (std::numeric_limits<size_t>::max() / sizeof(TestStruct)) + 1;
-  TestStruct* const test_pointer = a->Allocate<TestStruct>(count_to_allocate);
+  TestStruct* const test_pointer =
+      TypedAllocator::Allocate<TestStruct>(a, count_to_allocate, {});
 
   CHECK_EQ(test_pointer, reinterpret_cast<TestStruct*>(NULL));
 }
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 2107a25..3022e61 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -25,6 +25,7 @@
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+
 class BufRendezvous;
 class CancellationManager;
 class CompleteGroupRequest;
@@ -35,7 +36,6 @@
 class DeviceMgr;
 class GetStepSequenceRequest;
 class GetStepSequenceResponse;
-class Op;
 class Tensor;
 
 // Types of supported collective operations.
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 668ebbd..b4fdf8e 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -565,7 +565,7 @@
     TF_RETURN_IF_ERROR(CheckValidPadding(padding, explicit_paddings,
                                          /*num_dims=*/4, data_format));
   } else {
-    DCHECK(padding != Padding::EXPLICIT);
+    CHECK(padding != Padding::EXPLICIT);  // Crash ok.
   }
 
   DimensionHandle output_rows, output_cols;
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index e0d2db9..703c4a7 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
+
 #include <unordered_map>
 
 #include "tensorflow/core/framework/device_base.h"
@@ -22,6 +23,7 @@
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace tensorflow {
 namespace data {
@@ -48,6 +50,14 @@
     if (dataset_) dataset_->Ref();
   }
 
+  DatasetVariantWrapper& operator=(DatasetVariantWrapper&& other) {
+    if (&other == this) return *this;
+    std::swap(dataset_, other.dataset_);
+    return *this;
+  }
+
+  DatasetVariantWrapper& operator=(const DatasetVariantWrapper& other) = delete;
+
   ~DatasetVariantWrapper() {
     if (dataset_) dataset_->Unref();
   }
@@ -73,7 +83,7 @@
   }
 
  private:
-  DatasetBase* const dataset_;  // Owns one reference.
+  DatasetBase* dataset_;  // Owns one reference.
 };
 
 const char kWrappedDatasetVariantTypeName[] =
@@ -400,6 +410,26 @@
   return status;
 }
 
+Status DatasetBaseIterator::GetNext(IteratorContext* ctx,
+                                    std::vector<Tensor>* out_tensors,
+                                    bool* end_of_sequence) {
+  profiler::TraceMe activity(absl::string_view(params_.prefix),
+                             profiler::TraceMeLevel::kInfo);
+  RecordStart(ctx, /*stop_output=*/true);
+  Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
+  if (s.ok() && !*end_of_sequence) RecordElement(ctx);
+  RecordStop(ctx, /*start_output=*/true);
+  if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
+    s = errors::Internal(
+        "Iterator \"", params_.prefix,
+        "\" returned OutOfRange without setting `*end_of_sequence`. This "
+        "indicates that an error may have occurred. Original message: ",
+        s.error_message());
+    LOG(ERROR) << s;
+  }
+  return s;
+}
+
 void DatasetOpKernel::Compute(OpKernelContext* ctx) {
   DatasetBase* dataset = nullptr;
   MakeDataset(ctx, &dataset);
@@ -478,5 +508,28 @@
   }
 }
 
+namespace {
+class RunnerImpl : public Runner {
+ public:
+  void Run(const std::function<void()>& f) override {
+    f();
+
+    // NOTE: We invoke a virtual function to prevent `f` being tail-called, and
+    // thus ensure that this function remains on the stack until after `f`
+    // returns.
+    PreventTailCall();
+  }
+
+ private:
+  virtual void PreventTailCall() {}
+};
+}  // namespace
+
+/* static */
+Runner* Runner::get() {
+  static Runner* singleton = new RunnerImpl;
+  return singleton;
+}
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index c677869..0bcdf52 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -154,7 +154,7 @@
   }
 
   // Adds a node corresponding to the `DatasetType` to the Graph.
-  // Return value of `DatasetType::op_name()` is used as the op type for the
+  // Return value of `DatasetType::type_string()` is used as the op type for the
   // node.
   // Values for the output_types and output_shapes node attributes are also
   // written if those attributes are defined in the OpDef.
@@ -269,6 +269,19 @@
 class StatsAggregator;
 class FunctionHandleCache;
 
+// A utility class for running a function and ensuring that there is always a
+// `tensorflow::data` symbol on the stack.
+class Runner {
+ public:
+  virtual ~Runner() {}
+
+  // Runs the given function.
+  virtual void Run(const std::function<void()>& f) = 0;
+
+  // Returns a global singleton Runner.
+  static Runner* get();
+};
+
 // A cut-down version of `OpKernelContext` for running computations in
 // iterators. Note that we cannot simply use `OpKernelContext` here because we
 // might run computation in an iterator whose lifetime is not nested within the
@@ -296,9 +309,7 @@
           thread_factory(ctx->thread_factory()) {}
 
     explicit Params(OpKernelContext* ctx)
-        : env(ctx->env()),
-          flr(ctx->function_library()),
-          runner(*(ctx->runner())) {
+        : env(ctx->env()), flr(ctx->function_library()) {
       // NOTE: need reinterpret_cast because function.h forward-declares Device.
       DeviceBase* device =
           reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
@@ -312,6 +323,21 @@
       } else {
         runner_threadpool_size = port::NumSchedulableCPUs();
       }
+
+      // NOTE: Wrap every runner invocation in a call to Runner::get()->Run(),
+      // so that a symbol in the tensorflow::data namespace is always on the
+      // stack when executing a function inside a Dataset.
+      runner = std::bind(
+          [](
+              // Note: `ctx_runner` is a const reference to avoid copying it.
+              const std::function<void(std::function<void()>)>& ctx_runner,
+              std::function<void()> fn) {
+            std::function<void()> wrapped_fn = std::bind(
+                [](const std::function<void()>& fn) { Runner::get()->Run(fn); },
+                std::move(fn));
+            ctx_runner(std::move(wrapped_fn));
+          },
+          *ctx->runner(), std::placeholders::_1);
     }
 
     // The Allocator to be used to allocate the output of an iterator.
@@ -523,7 +549,7 @@
   }
 
  private:
-  friend class DatasetBase;  // for access to `AddCleanupFunction`
+  friend class DatasetBase;          // for access to `AddCleanupFunction`
   friend class DatasetBaseIterator;  // for access to `node_`
 
   // Registers a cleanup function to be called upon object destruction.
@@ -656,7 +682,8 @@
 
  protected:
   friend Status AsGraphDef(
-      OpKernelContext* ctx, DatasetBase* dataset,
+      OpKernelContext* ctx, const DatasetBase* dataset,
+      SerializationContext&& serialization_ctx,
       GraphDef* graph_def);  // For access to graph related members.
   friend class CapturedFunction;
 
@@ -716,22 +743,7 @@
   }
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                 bool* end_of_sequence) final {
-    tracing::ScopedActivity activity(params_.prefix);
-    RecordStart(ctx, true /* stop_output */);
-    Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
-    if (s.ok() && !*end_of_sequence) RecordElement(ctx);
-    RecordStop(ctx, true /* start_output */);
-    if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
-      s = errors::Internal(
-          "Iterator \"", params_.prefix,
-          "\" returned OutOfRange without setting `*end_of_sequence`. This "
-          "indicates that an error may have occurred. Original message: ",
-          s.error_message());
-      LOG(ERROR) << s;
-    }
-    return s;
-  }
+                 bool* end_of_sequence) final;
 
   Status Save(SerializationContext* ctx, IteratorStateWriter* writer) final {
     TF_RETURN_IF_ERROR(params_.dataset->Save(ctx, writer));
@@ -755,6 +767,22 @@
     return model::MakeUnknownNode(std::move(args));
   }
 
+  // When modeling is enabled, this method disables autotuning for the given
+  // iterator (and the transitive closure of its inputs).
+  void DisableAutotune(IteratorContext* ctx, IteratorBase* iterator) {
+    if (iterator->node_) {
+      iterator->node_->set_autotune(false);
+    }
+  }
+
+  // When modeling is enabled, this method enables autotuning for the given
+  // iterator (and the transitive closure of its inputs).
+  void EnableAutotune(IteratorContext* ctx, IteratorBase* iterator) {
+    if (iterator->node_) {
+      iterator->node_->set_autotune(true);
+    }
+  }
+
   // When modeling is enabled, this method records the fact that this iterator
   // has dequeued an element from an internal buffer.
   void RecordBufferDequeue(IteratorContext* ctx,
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 16dbb0d..ee20438 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -74,11 +74,11 @@
   }
 
   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
-  // "device_tensor" which is on a GPU device "device". "device_tensor"
+  // "device_tensor" which is on a non-CPU device "device". "device_tensor"
   // must be allocated to be of the same size as "cpu_tensor".
   virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
-                                     Tensor* device_tensor,
-                                     StatusCallback done) const {
+                                     Tensor* device_tensor, StatusCallback done,
+                                     bool sync_dst_compute = true) const {
     done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
   }
 
@@ -253,7 +253,7 @@
   // device memory tagged with an earlier freed-at count is really unencumbered
   // by pending uses.  For this to be useful the device memory allocator must
   // be tagging deallocated memory chunks using the same counter.
-  virtual uint64 SafeAllocFrontier() { return 0; }
+  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }
 
   // Copies `input_tensor` to `output_tensor`, where both tensors are on this
   // device. This function assumes that `output_tensor` has already been
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 5766109..b6ef479 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -17,6 +17,7 @@
 #define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_
 
 #include <vector>
+
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -27,6 +28,7 @@
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -628,8 +630,13 @@
   // In the cross-process scenario, runner isn't used for making the Async
   // RPC calls.
   struct Options {
-    // The id of the step that is calling this function.
-    int64 step_id = 0;
+    // Choose a step ID that is guaranteed not to clash with any
+    // Session-generated step ID. DirectSession only generates non-negative
+    // step IDs (contiguous, starting from 0), and MasterSession generates
+    // 56-bit random step IDs whose MSB is always 0, so a negative random step
+    // ID should suffice. Shift rather than std::abs(): abs(INT64_MIN) is UB.
+    const int64 step_id = -static_cast<int64>(random::New64() >> 1);
+
     Rendezvous* rendezvous = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
@@ -785,6 +792,9 @@
                    FunctionLibraryRuntime::LocalHandle handle,
                    gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
                    FunctionLibraryRuntime::DoneCallback done) = 0;
+
+  // DeviceMgr with *all* available devices.
+  virtual DeviceMgr* remote_device_mgr() const = 0;
 };
 
 // Extracts the actual type from "attr_values" based on its definition
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 82e58e4..4cc8d12 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -81,7 +81,7 @@
       // Args
       {"x: T"},
       // Return values
-      {"equal: T"},
+      {"equal: bool"},
       // Attr def
       {"T:{float, double, int32, int64, string}"},
       {
@@ -569,6 +569,23 @@
          {"output_shapes", "$output_shapes"}}}});
 }
 
+FunctionDef Unique() {
+  return FDH::Create(
+      // Name
+      "GetUnique",
+      // Args
+      {"x:T"},
+      // Return values
+      {"y:T", "idx: out_idx"},
+      // Attr def
+      {"T: type", "out_idx: {int32, int64} = DT_INT32"},
+      // Nodes
+      {
+          {{"result"}, "Unique", {"x"}, {{"T", "$T"}, {"out_idx", "$out_idx"}}},
+      },
+      {{"y", "result:y:0"}, {"idx", "result:idx:0"}});
+}
+
 void FunctionTestSchedClosure(std::function<void()> fn) {
   static thread::ThreadPool* w =
       new thread::ThreadPool(Env::Default(), "Test", 8);
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index 85398e8..9893d1d 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -135,6 +135,9 @@
 // x:T -> y: TensorSliceDatasetOp::Dataset
 FunctionDef MakeTensorSliceDataset();
 
+// x:T -> y: T, idx: out_idx
+FunctionDef Unique();
+
 void FunctionTestSchedClosure(std::function<void()> fn);
 
 }  // end namespace function
diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
index d67a418..664e8e2 100644
--- a/tensorflow/core/framework/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -208,10 +208,18 @@
       node_def->add_input(
           strings::StrCat(edge->src()->name(), ":", edge->src_output()));
     }
-
     // Add control inputs
+    std::vector<std::string> control_inputs;
+    control_inputs.reserve(control_edges.size());
     for (const Edge* edge : control_edges) {
-      node_def->add_input(strings::StrCat("^", edge->src()->name()));
+      control_inputs.push_back(strings::StrCat("^", edge->src()->name()));
+    }
+    // Sort the control inputs so that nodes that are semantically equivalent
+    // generate identical node_def.
+    std::sort(control_inputs.begin(), control_inputs.end());
+
+    for (const auto& input : control_inputs) {
+      node_def->add_input(input);
     }
 
     // Populate tensor_renaming.
diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc
index fcacc3b..4d1ae2c 100644
--- a/tensorflow/core/framework/kernel_def_builder.cc
+++ b/tensorflow/core/framework/kernel_def_builder.cc
@@ -34,6 +34,77 @@
   return *this;
 }
 
+// Restricts the int64 attr `attr_name` to the values listed in `allowed`.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<int64>(
+    const char* attr_name, gtl::ArraySlice<int64> allowed) {
+  auto* constraint = kernel_def_->add_constraint();
+  constraint->set_name(attr_name);
+  auto* allowed_values = constraint->mutable_allowed_values()->mutable_list();
+  for (const int64 integer : allowed) {
+    allowed_values->add_i(integer);
+  }
+  return *this;
+}
+
+// Single-value form: forwards a one-element slice to the ArraySlice overload.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<int64>(const char* attr_name,
+                                                          int64 allowed) {
+  // `allowed` is a by-value parameter, so it outlives the forwarded call.
+  return AttrConstraint(attr_name, gtl::ArraySlice<int64>(&allowed, 1));
+}
+
+// Restricts the string attr `attr_name` to the values listed in `allowed`.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<string>(
+    const char* attr_name, gtl::ArraySlice<string> allowed) {
+  auto* attr_constraint = kernel_def_->add_constraint();
+  attr_constraint->set_name(attr_name);
+  auto* value_list = attr_constraint->mutable_allowed_values()->mutable_list();
+  // Each permitted value is appended to the `s` field of the list.
+  for (const string& value : allowed) value_list->add_s(value);
+  return *this;
+}
+
+// Single-value form: forwards a one-element slice to the ArraySlice overload.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<string>(
+    const char* attr_name, string allowed) {
+  // `allowed` is a by-value parameter, so it outlives the forwarded call.
+  return AttrConstraint(attr_name, gtl::ArraySlice<string>(&allowed, 1));
+}
+
+// Restricts the string attr `attr_name` to the C strings listed in `allowed`.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<const char*>(
+    const char* attr_name, gtl::ArraySlice<const char*> allowed) {
+  auto* attr_constraint = kernel_def_->add_constraint();
+  attr_constraint->set_name(attr_name);
+  auto* value_list = attr_constraint->mutable_allowed_values()->mutable_list();
+  // Each permitted value is appended to the `s` field of the list.
+  for (const char* value : allowed) value_list->add_s(value);
+  return *this;
+}
+
+// Single-value form: forwards a one-element slice to the ArraySlice overload.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<const char*>(
+    const char* attr_name, const char* allowed) {
+  // `allowed` is a by-value parameter, so it outlives the forwarded call.
+  return AttrConstraint(attr_name, gtl::ArraySlice<const char*>(&allowed, 1));
+}
+
+// Restricts the bool attr `attr_name` to the single value `allowed`.
+template <>
+KernelDefBuilder& KernelDefBuilder::AttrConstraint<bool>(const char* attr_name,
+                                                         bool allowed) {
+  auto* attr_constraint = kernel_def_->add_constraint();
+  attr_constraint->set_name(attr_name);
+  attr_constraint->mutable_allowed_values()->mutable_list()->add_b(allowed);
+  return *this;
+}
+
 KernelDefBuilder& KernelDefBuilder::TypeConstraint(
     const char* attr_name, gtl::ArraySlice<DataType> allowed) {
   auto* constraint = kernel_def_->add_constraint();
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index d74453c..de0a884 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -43,6 +43,18 @@
   // particular type or list(type) attr (a further restriction than
   // what the Op allows).
   // Returns *this.
+  template <typename T>
+  KernelDefBuilder& AttrConstraint(const char* attr_name,
+                                   gtl::ArraySlice<T> allowed);
+
+  // Like AttrConstraint above but supports just a single value.
+  template <typename T>
+  KernelDefBuilder& AttrConstraint(const char* attr_name, T allowed);
+
+  // Specify that this kernel supports a limited set of values for a
+  // particular type or list(type) attr (a further restriction than
+  // what the Op allows).
+  // Returns *this.
   KernelDefBuilder& TypeConstraint(const char* attr_name,
                                    gtl::ArraySlice<DataType> allowed);
 
diff --git a/tensorflow/core/framework/kernel_def_builder_test.cc b/tensorflow/core/framework/kernel_def_builder_test.cc
index 31656c9..30bfd93 100644
--- a/tensorflow/core/framework/kernel_def_builder_test.cc
+++ b/tensorflow/core/framework/kernel_def_builder_test.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 #include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -72,6 +73,86 @@
   delete def;
 }
 
+TEST(KernelDefBuilderTest, Int64Constraint) {
+  const KernelDef* def =
+      KernelDefBuilder("B").Device(DEVICE_GPU).AttrConstraint("T", 5ll).Build();
+  KernelDef expected;
+  bool parsed = protobuf::TextFormat::ParseFromString(R"proto(
+                                          op: 'B'
+                                          device_type: 'GPU'
+                                          constraint {
+                                            name: 'T'
+                                            allowed_values { list { i: 5 } }
+                                          })proto",
+                                                      &expected);
+  EXPECT_TRUE(parsed);  // A malformed expectation must not pass silently.
+  EXPECT_EQ(def->DebugString(), expected.DebugString());
+  delete def;
+  // Constraints on several attrs are recorded in the order they were added.
+  def = KernelDefBuilder("C")
+            .Device(DEVICE_GPU)
+            .AttrConstraint("U", gtl::ArraySlice<int64>{5ll, 17ll})
+            .AttrConstraint("V", string("proto"))
+            .Build();
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"proto(
+        op: 'C'
+        device_type: 'GPU'
+        constraint {
+          name: 'U'
+          allowed_values { list { i: [ 5, 17 ] } }
+        }
+        constraint {
+          name: 'V'
+          allowed_values { list { s: 'proto' } }
+        })proto",
+      &expected);
+  EXPECT_TRUE(parsed);
+  EXPECT_EQ(def->DebugString(), expected.DebugString());
+  delete def;
+}
+
+TEST(KernelDefBuilderTest, StringConstraint) {
+  const KernelDef* def = KernelDefBuilder("B")
+                             .Device(DEVICE_GPU)
+                             .AttrConstraint("T", "hi")
+                             .Build();
+  KernelDef expected;
+  bool parsed = protobuf::TextFormat::ParseFromString(R"proto(
+                                          op: 'B'
+                                          device_type: 'GPU'
+                                          constraint {
+                                            name: 'T'
+                                            allowed_values { list { s: 'hi' } }
+                                          })proto",
+                                                      &expected);
+  EXPECT_TRUE(parsed);  // A malformed expectation must not pass silently.
+  EXPECT_EQ(def->DebugString(), expected.DebugString());
+  delete def;
+  // Constraints on several attrs are recorded in the order they were added.
+  def = KernelDefBuilder("C")
+            .Device(DEVICE_GPU)
+            .AttrConstraint("U", gtl::ArraySlice<const char*>{"boo", "ya"})
+            .AttrConstraint("V", string("proto"))
+            .Build();
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"proto(
+        op: 'C'
+        device_type: 'GPU'
+        constraint {
+          name: 'U'
+          allowed_values { list { s: [ 'boo', 'ya' ] } }
+        }
+        constraint {
+          name: 'V'
+          allowed_values { list { s: 'proto' } }
+        })proto",
+      &expected);
+  EXPECT_TRUE(parsed);
+  EXPECT_EQ(def->DebugString(), expected.DebugString());
+  delete def;
+}
+
 TEST(KernelDefBuilderTest, HostMemory) {
   const KernelDef* def = KernelDefBuilder("E")
                              .Device(DEVICE_GPU)
diff --git a/tensorflow/core/framework/logging.cc b/tensorflow/core/framework/logging.cc
new file mode 100644
index 0000000..7a819e7
--- /dev/null
+++ b/tensorflow/core/framework/logging.cc
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/logging.h"
+
+#include <iostream>
+
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+namespace logging {
+
+using Listeners = std::vector<void (*)(const char*)>;
+// Returns the shared listener list; created on first use, never deleted here.
+Listeners* GetListeners() {
+  static Listeners* const registry = new Listeners();
+  return registry;
+}
+
+bool RegisterListener(void (*listener)(const char*)) {
+  GetListeners()->emplace_back(listener);
+  return true;  // Registration is unconditional; success is always reported.
+}
+
+bool LogToListeners(string msg, string end) {
+  // With no listener registered there is nobody to notify; report that to
+  // the caller without building the message.
+  const Listeners& registered = *logging::GetListeners();
+  if (registered.empty()) return false;
+
+  // Every listener receives the same text: `msg` followed by `end`. The
+  // buffer outlives the loop, so the c_str() pointer stays valid throughout.
+  const string payload = strings::StrCat(msg, end);
+  for (void (*notify)(const char*) : registered) {
+    notify(payload.c_str());
+  }
+  return true;
+}
+
+}  // end namespace logging
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/framework/logging.h
similarity index 74%
rename from tensorflow/core/kernels/logging_ops.h
rename to tensorflow/core/framework/logging.h
index 92a8d63..9bde3d5 100644
--- a/tensorflow/core/kernels/logging_ops.h
+++ b/tensorflow/core/framework/logging.h
@@ -13,11 +13,10 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
-#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_
+#define TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_
 
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
+#include <string>
 
 namespace tensorflow {
 
@@ -27,7 +26,12 @@
 // Returns true if it is successfully registered.
 bool RegisterListener(void (*listener)(const char*));
 
+// Log string to active listeners. Returns true if any listeners were
+// registered.
+bool LogToListeners(std::string msg, std::string end = "\n");
+
 }  // namespace logging
+
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 8caea35..6e58935 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -72,6 +72,10 @@
                                                             : DEVICE_MEMORY;
 }
 
+MemoryType MTypeFromDTypeIntsOnDevice(const DataType dtype) {
+  return DataTypeAlwaysOnHost(dtype) ? HOST_MEMORY : DEVICE_MEMORY;
+}
+
 Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
                           const DeviceType& device_type, const NodeDef& ndef,
                           MemoryTypeVector* inp_mtypes,
@@ -100,8 +104,20 @@
   // to derive the correct input/output memory types. We should also split
   // host-memory and non host-memory arguments into separate type lists.
   if (!status.ok() || IsFunctionCallOp(ndef.op())) {
-    for (const auto& t : inp_dtypes) inp_mtypes->push_back(MTypeFromDType(t));
-    for (const auto& t : out_dtypes) out_mtypes->push_back(MTypeFromDType(t));
+    if (device_type.type_string() == "TPU") {
+      // Here we assume that if tf.function() is called within
+      // "with tf.device('/device:TPU:0')", the whole function will be compiled
+      // and executed on TPU. This is true today, but when we implement auto
+      // clustering on function body, this will no longer be true. For example,
+      // we might want to place string arguments on host.
+      for (const auto& t : inp_dtypes)
+        inp_mtypes->push_back(MTypeFromDTypeIntsOnDevice(t));
+      for (const auto& t : out_dtypes)
+        out_mtypes->push_back(MTypeFromDTypeIntsOnDevice(t));
+    } else {
+      for (const auto& t : inp_dtypes) inp_mtypes->push_back(MTypeFromDType(t));
+      for (const auto& t : out_dtypes) out_mtypes->push_back(MTypeFromDType(t));
+    }
     return Status::OK();
   }
 
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 6cff7e5..e132095 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -41,7 +41,8 @@
 // The formula used for computing the probability is derived by modeling the
 // problem as an M/M/1/K queue
 // (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue).
-int64 ComputeWaitTime(int64 output_time, int64 input_time, int64 buffer_size) {
+double ComputeWaitTime(double output_time, double input_time,
+                       int64 buffer_size) {
   if (output_time == 0 || input_time == 0) {
     return output_time;
   }
@@ -75,34 +76,40 @@
         Args{id_, name_, std::move(output)});
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is the sum of the self processing time and the average
+  // output time of inputs comprising the interleave "cycle".
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     if (inputs_.size() <= 1) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    int64 delta = NanosPerElementLocked() * (inputs_.size() - 1);
+    double delta = SelfProcessingTimeLocked() * (inputs_.size() - 1);
     input_times->back() += delta;
     auto cleanup = gtl::MakeCleanup(
         [input_times, delta]() { input_times->back() -= delta; });
-    int64 output_time =
-        static_cast<double>(OutputTimeForInputs(input_times) -
-                            inputs_.front()->OutputTime(input_times)) /
-        static_cast<double>(inputs_.size() - 1);
-    return NanosPerElementLocked() + output_time;
+    double output_time = (OutputTimeForInputs(input_times) -
+                          inputs_.front()->OutputTime(input_times)) /
+                         static_cast<double>(inputs_.size() - 1);
+    return SelfProcessingTimeLocked() + output_time;
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+  // The processing time is the sum of the self processing time and the average
+  // processing time of inputs comprising the interleave "cycle".
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
     if (inputs_.size() <= 1) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    int64 processing_time =
-        static_cast<double>(ProcessingTimeForInputs() -
-                            inputs_.front()->ProcessingTime()) /
+    double processing_time =
+        (ProcessingTimeForInputs() - inputs_.front()->TotalProcessingTime()) /
         static_cast<double>(inputs_.size() - 1);
-    return NanosPerElementLocked() + processing_time;
+    return SelfProcessingTimeLocked() + processing_time;
   }
 };
 
+// The first input of AsyncInterleaveMany corresponds to the input dataset whose
+// elements are used to create the (derived) input datasets whose elements are
+// interleaved as output.
+//
 // TODO(jsimsa): model the first input
 class AsyncInterleaveMany : public Node {
  public:
@@ -127,14 +134,19 @@
         Args{id_, name_, std::move(output)}, parameters);
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is estimated using `ComputeWaitTime(output_time,
+  // input_time, parallelism)`, where `output_time` is the sum of the
+  // self-processing time and the average output time of inputs comprising the
+  // interleave "cycle", `input_time` is specified through `input_times` and
+  // `buffer_size` is derived from parallelism.
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     if (inputs_.size() <= 1) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    int64 old_input_time = input_times->back();
-    int64 new_input_time = static_cast<double>(NanosPerElementLocked()) *
-                           static_cast<double>(inputs_.size() - 1);
+    double old_input_time = input_times->back();
+    double new_input_time =
+        SelfProcessingTimeLocked() * static_cast<double>(inputs_.size() - 1);
     input_times->push_back(new_input_time);
     auto cleanup =
         gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
@@ -143,23 +155,23 @@
       parallelism = std::min(static_cast<int>(parallelism),
                              static_cast<int>((*parameter)->value));
     }
-    int64 output_time =
-        static_cast<double>(OutputTimeForInputs(input_times) -
-                            inputs_.front()->OutputTime(input_times)) /
-        static_cast<double>(inputs_.size() - 1) / parallelism;
-    return ComputeWaitTime(NanosPerElementLocked() + output_time,
+    double output_time = (OutputTimeForInputs(input_times) -
+                          inputs_.front()->OutputTime(input_times)) /
+                         static_cast<double>(num_inputs() - 1) / parallelism;
+    return ComputeWaitTime(SelfProcessingTimeLocked() + output_time,
                            old_input_time, parallelism);
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+  // The processing time is the sum of the self processing time and the average
+  // processing time of inputs comprising the interleave "cycle".
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
     if (inputs_.size() <= 1) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    int64 processing_time =
-        ProcessingTimeForInputs() - inputs_.front()->ProcessingTime();
-    return NanosPerElementLocked() +
-           static_cast<double>(processing_time) /
-               static_cast<double>(inputs_.size() - 1);
+    double processing_time =
+        ProcessingTimeForInputs() - inputs_.front()->TotalProcessingTime();
+    return SelfProcessingTimeLocked() +
+           processing_time / static_cast<double>(num_inputs() - 1);
   }
 };
 
@@ -176,22 +188,27 @@
                                         ratio_);
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is the sum of the self processing time and the product of
+  // `ratio_` and the sum of output times of inputs.
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     if (ratio_ == 0) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    int64 old_input_time = input_times->back();
-    input_times->back() += static_cast<int64>(
-        static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio_);
+    double old_input_time = input_times->back();
+    input_times->back() +=
+        (old_input_time + SelfProcessingTimeLocked()) / ratio_;
     auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
       input_times->back() = old_input_time;
     });
-    return NanosPerElementLocked() + ratio_ * OutputTimeForInputs(input_times);
+    return SelfProcessingTimeLocked() +
+           ratio_ * OutputTimeForInputs(input_times);
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
-    return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
+  // The processing time is the sum of the self processing time and the product
+  // of `ratio_` and the sum of processing times of inputs.
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return SelfProcessingTimeLocked() + ratio_ * ProcessingTimeForInputs();
   }
 
  private:
@@ -221,31 +238,35 @@
         Args{id_, name_, std::move(output)}, ratio_, parameters);
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is estimated using `ComputeWaitTime(output_time,
+  // input_time, parallelism)`, where `output_time` is the sum of the self
+  // processing time and the product of `ratio_` and the sum of output times of
+  // inputs, `input_time` is specified through `input_times` and `buffer_size`
+  // is derived from parallelism.
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     double parallelism = 1.0;
     if (auto* parameter = gtl::FindOrNull(parameters_, "parallelism")) {
       parallelism = (*parameter)->value;
     }
     if (ratio_ == 0.0) {
-      int64 output_time =
-          static_cast<double>(NanosPerElementLocked()) / parallelism;
+      double output_time = SelfProcessingTimeLocked() / parallelism;
       return ComputeWaitTime(output_time, input_times->back(), parallelism);
     }
-    int64 old_input_time = input_times->back();
-    int64 new_input_time = static_cast<int64>(
-        static_cast<double>(NanosPerElementLocked()) / ratio_ / parallelism);
+    double old_input_time = input_times->back();
+    double new_input_time = SelfProcessingTimeLocked() / ratio_ / parallelism;
     input_times->push_back(new_input_time);
     auto cleanup =
         gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-    int64 output_time = static_cast<int64>(
-        static_cast<double>(NanosPerElementLocked()) / parallelism +
-        ratio_ * OutputTimeForInputs(input_times));
+    double output_time = SelfProcessingTimeLocked() / parallelism +
+                         ratio_ * OutputTimeForInputs(input_times);
     return ComputeWaitTime(output_time, old_input_time, parallelism);
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
-    return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
+  // The processing time is the sum of the self processing time and the product
+  // of `ratio_` and the sum of processing times of inputs.
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return SelfProcessingTimeLocked() + ratio_ * ProcessingTimeForInputs();
   }
 
  private:
@@ -264,40 +285,40 @@
     return std::make_shared<UnknownRatio>(Args{id_, name_, std::move(output)});
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is the sum of the self processing time and the product of
+  // the ratio estimate and the sum of output times of inputs.
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     if (num_elements_ == 0 || inputs_.empty() ||
         inputs_.front()->num_elements() == 0) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
     // TODO(jsimsa): The current implementation assumes that the number of input
     // elements consumed per output is the same across all inputs.
     std::shared_ptr<Node> input = inputs_.front();
     double ratio = static_cast<double>(input->num_elements()) /
                    static_cast<double>(num_elements_);
-    int64 old_input_time = input_times->back();
-    input_times->back() =
-        static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio;
+    double old_input_time = input_times->back();
+    input_times->back() = (old_input_time + SelfProcessingTimeLocked()) / ratio;
     auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
       input_times->back() = old_input_time;
     });
-    return NanosPerElementLocked() +
-           static_cast<int64>(
-               ratio * static_cast<double>(OutputTimeForInputs(input_times)));
+    return SelfProcessingTimeLocked() +
+           ratio * OutputTimeForInputs(input_times);
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+  // The processing time is the sum of the self processing time and the product
+  // of the ratio estimate and the sum of processing times of inputs.
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
     if (inputs_.empty() || num_elements_ == 0) {
-      return NanosPerElementLocked();
+      return SelfProcessingTimeLocked();
     }
-    // TODO(jsimsa): The current implementation that the number of input
+    // TODO(jsimsa): The current implementation assumes that the number of input
     // elements consumed per output is the same across all inputs.
     std::shared_ptr<Node> input = inputs_.front();
     double ratio = static_cast<double>(input->num_elements()) /
                    static_cast<double>(num_elements_);
-    return NanosPerElementLocked() +
-           static_cast<int64>(ratio *
-                              static_cast<double>(ProcessingTimeForInputs()));
+    return SelfProcessingTimeLocked() + ratio * ProcessingTimeForInputs();
   }
 };
 
@@ -313,12 +334,14 @@
     return std::make_shared<Unknown>(Args{id_, name_, std::move(output)});
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  // The output time is the sum of output times of inputs.
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     return OutputTimeForInputs(input_times);
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+  // The processing time is the sum of processing times of inputs.
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
     return ProcessingTimeForInputs();
   }
 };
@@ -415,13 +438,13 @@
     snapshot = output_->Snapshot(nullptr);
   }
   VLOG(2) << "Starting optimization of tunable parameters";
-  const int64 processing_time = ProcessingTime(snapshot);
+  const double processing_time = TotalProcessingTime(snapshot);
   auto parameters = CollectTunableParameters(snapshot);
   for (auto& pair : parameters) {
     pair.second->value = 1;
   }
   while (true) {
-    const int64 output_time = OutputTime(snapshot);
+    const double output_time = OutputTime(snapshot);
     bool all_max = true;
     for (auto& pair : parameters) {
       if (pair.second->value < pair.second->max) {
@@ -432,22 +455,15 @@
     if (output_time < processing_time / cpu_budget || all_max) {
       break;
     }
-    int64 best_delta = -1;
+    double best_delta = -1.0L;
     Parameter* best_parameter = nullptr;
     for (auto& pair : parameters) {
       if (pair.second->value == pair.second->max) {
         continue;
       }
       pair.second->value++;
-      int64 new_output_time = OutputTime(snapshot);
-      int64 delta = output_time - new_output_time;
-      if (delta < 0) {
-        VLOG(3) << "Increasing the parallelism of tunable parameter "
-                << pair.first << " resulted in slowdown (before=" << output_time
-                << ", after=" << new_output_time
-                << "). This should never happen because the latency "
-                   "should be monotonic w.r.t. to parallelism.";
-      }
+      double new_output_time = OutputTime(snapshot);
+      double delta = output_time - new_output_time;
       if (delta > best_delta) {
         best_delta = delta;
         best_parameter = pair.second.get();
@@ -455,11 +471,10 @@
       pair.second->value--;
     }
     if (!best_parameter) {
-      // This should never happen because we are using a model snapshot and
-      // the output time is monotonically decreasing w.r.t. parallelism.
       LOG(WARNING) << "Failed to find a tunable parameter that would "
-                      "decrease the output time, aborting the current "
-                      "optimization attempt.";
+                      "decrease the output time. This means that the "
+                      "autotuning optimization got stuck in a local maximum. "
+                      "The optimization attempt will be aborted.";
       return;
     }
     best_parameter->value++;
@@ -536,13 +551,19 @@
   return parameters;
 }
 
-int64 Model::OutputTime(std::shared_ptr<Node> node) {
-  std::vector<int64> input_times(1, 0);
+double Model::OutputTime(std::shared_ptr<Node> node) {
+  std::vector<double> input_times(1, 0);
+  // TODO(jsimsa): Now that we are accounting for buffer size in wait time
+  // computation, assuming that the input is infinitely fast will result in
+  // inaccurate estimates of the output latency.
+  //
+  // We should compute the output latency as a fixed point of the following
+  // equation: `output_time = node->OutputTime(input_times(1, output_time))`.
   return node->OutputTime(&input_times);
 }
 
-int64 Model::ProcessingTime(std::shared_ptr<Node> node) {
-  return node->ProcessingTime();
+double Model::TotalProcessingTime(std::shared_ptr<Node> node) {
+  return node->TotalProcessingTime();
 }
 
 }  // namespace model
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index cf50195..f373407 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -118,6 +118,8 @@
   explicit Node(Args args)
       : id_(args.id), name_(args.name), output_(args.output.get()) {}
 
+  virtual ~Node() {}
+
   // Increments the bytes buffered by the given delta.
   void add_buffered_bytes(int64 delta) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
@@ -136,6 +138,12 @@
     processing_time_ += delta;
   }
 
+  // Reports whether autotuning is enabled for this node (read under `mu_`).
+  bool autotune() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock shared_guard(mu_);
+    return autotune_;
+  }
+
   // Returns the number of bytes stored in this node's buffer.
   int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
@@ -213,11 +221,20 @@
     inputs_.remove(input);
   }
 
+  // Enables or disables autotuning for this node; takes `mu_` exclusively.
+  void set_autotune(bool autotune) LOCKS_EXCLUDED(mu_) {
+    mutex_lock exclusive_guard(mu_);
+    autotune_ = autotune;
+  }
+
   // Collects tunable parameters in the subtree rooted in this node.
   void CollectTunableParameters(
       std::map<string, std::shared_ptr<Parameter>>* parameters) const
       LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
+    if (!autotune_) {
+      return;
+    }
     for (auto& pair : parameters_) {
       if (pair.second->state->tunable) {
         parameters->insert(std::make_pair(long_name(), pair.second));
@@ -228,17 +245,31 @@
     }
   }
 
-  // Returns the per-element output time for this node.
-  int64 OutputTime(std::vector<int64>* input_times) const LOCKS_EXCLUDED(mu_) {
+  // Returns a human-readable representation of this node.
+  string DebugString() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
-    return OutputTimeLocked(input_times);
+    string result;
+    strings::StrAppend(&result, long_name(), ":\n");
+    strings::StrAppend(&result, "  autotune=", autotune_, "\n");
+    strings::StrAppend(&result, "  buffered_bytes=", buffered_bytes_, "\n");
+    strings::StrAppend(&result, "  processing_time=", processing_time_, "\n");
+    strings::StrAppend(&result, "  num_elements=", num_elements_, "\n");
+    string inputs;
+    for (auto& input : inputs_) {
+      strings::StrAppend(&inputs, input->long_name(), ",");
+    }
+    strings::StrAppend(&result, "  inputs={", inputs, "}\n");
+    for (auto& input : inputs_) {
+      strings::StrAppend(&result, input->DebugString());
+    }
+    return result;
   }
 
-  // Returns the per-element processing time spent in the subtree rooted in
-  // this node.
-  int64 ProcessingTime() const LOCKS_EXCLUDED(mu_) {
+  // Returns the per-element output time for this node.
+  double OutputTime(std::vector<double>* input_times) const
+      LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
-    return ProcessingTimeLocked();
+    return OutputTimeLocked(input_times);
   }
 
   // Returns a copy of this node, making a deep copy of its inputs and a
@@ -252,6 +283,7 @@
     std::shared_ptr<Node> result = Clone(output);
     {
       mutex_lock l2(result->mu_);
+      result->autotune_ = autotune_;
       result->buffered_bytes_ = buffered_bytes_;
       result->processing_time_ = processing_time_;
       result->num_elements_ = num_elements_;
@@ -263,57 +295,90 @@
     return result;
   }
 
+  // Returns the per-element CPU time spent in the subtree rooted in this node.
+  double TotalProcessingTime() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return TotalProcessingTimeLocked();
+  }
+
  protected:
+  // Returns the number of inputs.
+  int64 num_inputs() const SHARED_LOCKS_REQUIRED(mu_) {
+    int64 num_inputs = 0;
+    for (auto& input : inputs_) {
+      // Inputs for which autotuning is disabled are excluded.
+      if (input->autotune()) {
+        ++num_inputs;
+      }
+    }
+    return num_inputs;
+  }
+
   // Creates a clone of this node.
   virtual std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const
       SHARED_LOCKS_REQUIRED(mu_) = 0;
 
-  // Returns the per-element processing time spent in this node.
-  int64 NanosPerElementLocked() const SHARED_LOCKS_REQUIRED(mu_) {
-    if (num_elements_ == 0) {
-      return 0;
-    }
-    return static_cast<int64>(static_cast<double>(processing_time_) /
-                              static_cast<double>(num_elements_));
-  }
-
   // Returns the sum of per-element output time for the inputs of this node.
-  int64 OutputTimeForInputs(std::vector<int64>* input_times) const
+  double OutputTimeForInputs(std::vector<double>* input_times) const
       SHARED_LOCKS_REQUIRED(mu_) {
-    int64 sum = 0;
+    double sum = 0;
     for (auto& input : inputs_) {
-      sum += input->OutputTime(input_times);
+      // Inputs for which autotuning is disabled are excluded.
+      if (input->autotune()) {
+        sum += input->OutputTime(input_times);
+      }
     }
     return sum;
   }
 
   // Returns the per-element output time for this node.
-  virtual int64 OutputTimeLocked(std::vector<int64>* input_times) const
+  virtual double OutputTimeLocked(std::vector<double>* input_times) const
       SHARED_LOCKS_REQUIRED(mu_) = 0;
 
   // Returns the sum of per-element processing time for the inputs of this node.
   //
   // TODO(jsimsa): use processing time history as a prior for future inputs
-  int64 ProcessingTimeForInputs() const SHARED_LOCKS_REQUIRED(mu_) {
-    int64 sum = 0;
+  double ProcessingTimeForInputs() const SHARED_LOCKS_REQUIRED(mu_) {
+    double sum = 0;
     for (auto& input : inputs_) {
-      sum += input->ProcessingTime();
+      // Inputs for which autotuning is disabled are excluded.
+      if (input->autotune()) {
+        sum += input->SelfProcessingTimeLocked();
+      }
     }
     return sum;
   }
 
-  // Returns the per-element processing time spent in the subtree rooted in
-  // this node.
-  virtual int64 ProcessingTimeLocked() const SHARED_LOCKS_REQUIRED(mu_) = 0;
+  // Returns the per-element processing time spent in this node.
+  double SelfProcessingTimeLocked() const SHARED_LOCKS_REQUIRED(mu_) {
+    if (num_elements_ == 0) {
+      return 0;
+    }
+    return static_cast<double>(processing_time_) /
+           static_cast<double>(num_elements_);
+  }
+
+  // Returns the per-element CPU time spent in the subtree rooted in this node.
+  virtual double TotalProcessingTimeLocked() const
+      SHARED_LOCKS_REQUIRED(mu_) = 0;
 
   mutable mutex mu_;
   const int64 id_;
   const string name_;
+
+  // Indicates whether the subtree rooted in this node should be included in
+  // autotuning. In particular, if this is `false`, then the subtree is excluded
+  // from computation of output time and processing time.
+  bool autotune_ GUARDED_BY(mu_) = true;
   int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
   int64 processing_time_ GUARDED_BY(mu_) = 0;
   int64 num_elements_ GUARDED_BY(mu_) = 0;
   std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
   std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
+
+  // Inputs of this node. These can represent an iterator created from the input
+  // dataset but also other input iterators (e.g. created by the user-defined
+  // functions of `flat_map` or `interleave`).
   std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
 
   // The reference to the output node is not owned so that deletion of a
@@ -416,10 +481,10 @@
       std::shared_ptr<Node> node);
 
   // Collects the output time for the given node.
-  int64 OutputTime(std::shared_ptr<Node> node);
+  double OutputTime(std::shared_ptr<Node> node);
 
   // Collects the processing time for the given node.
-  int64 ProcessingTime(std::shared_ptr<Node> node);
+  double TotalProcessingTime(std::shared_ptr<Node> node);
 
   // Used for coordination between different input pipeline threads. Exclusive
   // access is required only when adding or removing nodes. Concurrent access to
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 1d7f407..52e6339 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -25,11 +25,11 @@
 namespace {
 
 class AsyncInterleaveManyTest
-    : public ::testing::TestWithParam<std::tuple<int64, int64>> {};
+    : public ::testing::TestWithParam<std::tuple<int64, double>> {};
 
 TEST_P(AsyncInterleaveManyTest, Model) {
   const int64 parallelism = std::get<0>(GetParam());
-  const int64 input_time = std::get<1>(GetParam());
+  const double input_time = std::get<1>(GetParam());
   std::shared_ptr<Node> async_interleave_many =
       model::MakeAsyncInterleaveManyNode(
           {0, "async_interleave_many", nullptr},
@@ -55,29 +55,29 @@
   auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() {
     async_interleave_many->remove_input(source2);
   });
-  std::vector<int64> input_times(1, input_time);
+  std::vector<double> input_times(1, input_time);
   async_interleave_many->add_processing_time(100);
   EXPECT_EQ(async_interleave_many->processing_time(), 100);
-  EXPECT_EQ(async_interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 0);
   EXPECT_EQ(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
   EXPECT_EQ(async_interleave_many->num_elements(), 1);
-  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100);
   EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
   EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100);
   EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
   EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100 + 250);
+  EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100 + 250);
   EXPECT_LE(async_interleave_many->OutputTime(&input_times),
             100 + 250 / parallelism);
   EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
-  EXPECT_EQ(async_interleave_many->ProcessingTime(), 50 + 250);
+  EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 50 + 250);
   EXPECT_LE(async_interleave_many->OutputTime(&input_times),
             50 + 250 / parallelism);
   EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
@@ -89,11 +89,11 @@
                                                               200)));
 
 class AsyncKnownRatioTest
-    : public ::testing::TestWithParam<std::tuple<int64, int64, int64>> {};
+    : public ::testing::TestWithParam<std::tuple<int64, double, int64>> {};
 
 TEST_P(AsyncKnownRatioTest, Model) {
   const int64 parallelism = std::get<0>(GetParam());
-  const int64 input_time = std::get<1>(GetParam());
+  const double input_time = std::get<1>(GetParam());
   const int64 num_inputs_per_output = std::get<2>(GetParam());
   std::shared_ptr<Node> async_known_many = model::MakeAsyncKnownRatioNode(
       {0, "async_known_many", nullptr}, num_inputs_per_output,
@@ -107,50 +107,51 @@
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", async_known_many});
   async_known_many->add_input(source2);
-  std::vector<int64> input_times(1, input_time);
+  std::vector<double> input_times(1, input_time);
   source1->add_processing_time(100);
-  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->TotalProcessingTime(), 0);
   EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->TotalProcessingTime(), 0);
   EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
+            num_inputs_per_output * 100);
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * 100);
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (100 + 200));
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (100 + 200));
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 200));
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 200));
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100));
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100));
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->add_processing_time(128);
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100));
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100));
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100) + 128);
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100) + 128 / parallelism);
   EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(async_known_many->ProcessingTime(),
+  EXPECT_EQ(async_known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100) + 64);
   EXPECT_LE(async_known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100) + 64 / parallelism);
@@ -174,25 +175,25 @@
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", interleave_many});
   interleave_many->add_input(source2);
-  std::vector<int64> input_times(1, 0);
+  std::vector<double> input_times(1, 0);
   interleave_many->add_processing_time(100);
   EXPECT_EQ(interleave_many->processing_time(), 100);
-  EXPECT_EQ(interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(interleave_many->TotalProcessingTime(), 0);
   EXPECT_EQ(interleave_many->OutputTime(&input_times), 0);
   interleave_many->record_element();
   EXPECT_EQ(interleave_many->num_elements(), 1);
-  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->TotalProcessingTime(), 100);
   EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->TotalProcessingTime(), 100);
   EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(interleave_many->ProcessingTime(), 350);
+  EXPECT_EQ(interleave_many->TotalProcessingTime(), 350);
   EXPECT_EQ(interleave_many->OutputTime(&input_times), 350);
   interleave_many->record_element();
-  EXPECT_EQ(interleave_many->ProcessingTime(), 300);
+  EXPECT_EQ(interleave_many->TotalProcessingTime(), 300);
   EXPECT_EQ(interleave_many->OutputTime(&input_times), 300);
 }
 
@@ -208,39 +209,43 @@
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", known_many});
   known_many->add_input(source2);
-  std::vector<int64> input_times(1, 0);
+  std::vector<double> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->TotalProcessingTime(), 0);
   EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->TotalProcessingTime(), 0);
   EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_EQ(known_many->TotalProcessingTime(), num_inputs_per_output * 100);
   EXPECT_EQ(known_many->OutputTime(&input_times), num_inputs_per_output * 100);
   source2->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (100 + 200));
+  EXPECT_EQ(known_many->TotalProcessingTime(),
+            num_inputs_per_output * (100 + 200));
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (100 + 200));
   source1->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 200));
+  EXPECT_EQ(known_many->TotalProcessingTime(),
+            num_inputs_per_output * (50 + 200));
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 200));
   source2->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->TotalProcessingTime(),
+            num_inputs_per_output * (50 + 100));
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100));
   known_many->add_processing_time(128);
-  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->TotalProcessingTime(),
+            num_inputs_per_output * (50 + 100));
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100));
   known_many->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(),
+  EXPECT_EQ(known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100) + 128);
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100) + 128);
   known_many->record_element();
-  EXPECT_EQ(known_many->ProcessingTime(),
+  EXPECT_EQ(known_many->TotalProcessingTime(),
             num_inputs_per_output * (50 + 100) + 64);
   EXPECT_EQ(known_many->OutputTime(&input_times),
             num_inputs_per_output * (50 + 100) + 64);
@@ -250,18 +255,18 @@
 
 TEST(SourceTest, Model) {
   std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
-  std::vector<int64> input_times(1, 0);
+  std::vector<double> input_times(1, 0);
   source->add_processing_time(100);
   EXPECT_EQ(source->processing_time(), 100);
-  EXPECT_EQ(source->ProcessingTime(), 0);
+  EXPECT_EQ(source->TotalProcessingTime(), 0);
   EXPECT_EQ(source->OutputTime(&input_times), 0);
   source->record_element();
   EXPECT_EQ(source->num_elements(), 1);
-  EXPECT_EQ(source->ProcessingTime(), 100);
+  EXPECT_EQ(source->TotalProcessingTime(), 100);
   EXPECT_EQ(source->OutputTime(&input_times), 100);
   source->record_element();
   EXPECT_EQ(source->num_elements(), 2);
-  EXPECT_EQ(source->ProcessingTime(), 50);
+  EXPECT_EQ(source->TotalProcessingTime(), 50);
   EXPECT_EQ(source->OutputTime(&input_times), 50);
 }
 
@@ -274,25 +279,25 @@
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", unknown_many});
   unknown_many->add_input(source2);
-  std::vector<int64> input_times(1, 0);
+  std::vector<double> input_times(1, 0);
   unknown_many->add_processing_time(100);
   EXPECT_EQ(unknown_many->processing_time(), 100);
-  EXPECT_EQ(unknown_many->ProcessingTime(), 0);
+  EXPECT_EQ(unknown_many->TotalProcessingTime(), 0);
   EXPECT_EQ(unknown_many->OutputTime(&input_times), 0);
   unknown_many->record_element();
   EXPECT_EQ(unknown_many->num_elements(), 1);
-  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->add_processing_time(100);
   source2->add_processing_time(200);
-  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(unknown_many->ProcessingTime(), 400);
+  EXPECT_EQ(unknown_many->TotalProcessingTime(), 400);
   EXPECT_EQ(unknown_many->OutputTime(&input_times), 400);
   unknown_many->record_element();
-  EXPECT_EQ(unknown_many->ProcessingTime(), 200);
+  EXPECT_EQ(unknown_many->TotalProcessingTime(), 200);
   EXPECT_EQ(unknown_many->OutputTime(&input_times), 200);
 }
 
@@ -305,36 +310,36 @@
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", unknown});
   unknown->add_input(source2);
-  std::vector<int64> input_times(1, 0);
+  std::vector<double> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 0);
   EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source2->add_processing_time(100);
-  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 0);
   EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   source2->record_element();
-  EXPECT_EQ(unknown->ProcessingTime(), 200);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 200);
   EXPECT_EQ(unknown->OutputTime(&input_times), 200);
   source1->record_element();
-  EXPECT_EQ(unknown->ProcessingTime(), 150);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 150);
   EXPECT_EQ(unknown->OutputTime(&input_times), 150);
   source2->record_element();
-  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown->OutputTime(&input_times), 100);
-  // Unknown node processing time should not affect its ProcessingTime() or
+  // Unknown node processing time should not affect its TotalProcessingTime() or
   // OutputTime().
   unknown->add_processing_time(100);
   EXPECT_EQ(unknown->processing_time(), 100);
-  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown->OutputTime(&input_times), 100);
-  // Unknown node number of elements should not affect its ProcessingTime() or
-  // OutputTime().
+  // Unknown node number of elements should not affect its TotalProcessingTime()
+  // or OutputTime().
   unknown->record_element();
   EXPECT_EQ(unknown->num_elements(), 1);
-  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->TotalProcessingTime(), 100);
   EXPECT_EQ(unknown->OutputTime(&input_times), 100);
 }
 
@@ -350,12 +355,12 @@
     return nullptr;
   }
 
-  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+  double OutputTimeLocked(std::vector<double>* input_times) const override
       SHARED_LOCKS_REQUIRED(mu_) {
     return 0;
   }
 
-  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+  double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
     return 0;
   }
 };
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index ded30a6..6c657c8 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -21,7 +21,6 @@
 
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
-#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
@@ -50,7 +49,7 @@
 
 AttrSlice::AttrSlice(const AttrValueMap* a) : ndef_(nullptr), attrs_(a) {}
 
-static string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) {
+string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) {
   string ret;
 
   // We sort the attrs so the output is deterministic.
@@ -120,6 +119,13 @@
   return FormatNodeForError(NodeDebugInfo(node_def));
 }
 
+string FormatNodeDefForError(
+    StringPiece node_name, bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info) {
+  return FormatNodeForError(NodeDebugInfo(
+      node_name, has_experimental_debug_info, experimental_debug_info));
+}
+
 void GetMergedOriginalNodeNames(const NodeDebugInfo& from,
                                 const NodeDebugInfo& to,
                                 std::set<string>* names) {
@@ -783,6 +789,8 @@
 Status AddPrefixAndSuffixToNode(StringPiece prefix, StringPiece suffix,
                                 NodeDef* node_def) {
   node_def->set_name(strings::StrCat(prefix, node_def->name(), suffix));
+
+  // Update frame name to avoid multiple LoopCond nodes in one frame.
   if (node_def->op() == "Enter" || node_def->op() == "RefEnter") {
     string frame_name;
     TF_RETURN_IF_ERROR(GetNodeAttr(*node_def, "frame_name", &frame_name));
@@ -790,6 +798,13 @@
     frame_name = strings::StrCat(prefix, frame_name, suffix);
     attr.set_s(frame_name);
   }
+
+  // Update colocation constraints.
+  auto class_attr = node_def->mutable_attr()->find("_class");
+  if (class_attr != node_def->mutable_attr()->end()) {
+    class_attr->second.set_s(strings::StrCat(prefix, class_attr->second.s()));
+  }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 5e8b53d..d85c53a 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -34,6 +35,7 @@
 // We forward declare protos so that kernels don't need to depend on them
 class NodeDef;
 class OpDef;
+class AttrSlice;
 
 // Name of the attribute used to encode node colocation constraints.
 //
@@ -50,12 +52,16 @@
 string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
 string SummarizeAttrs(const NodeDef& node_def);
+string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device);
 
 // Produces a formatted string pattern from the node which can uniquely identify
 // this node upstream to produce an informative error message. The pattern
 // followed is: {{node <node_name>}}
 string FormatNodeForError(const Node& node);
 string FormatNodeDefForError(const NodeDef& node_def);
+string FormatNodeDefForError(
+    StringPiece node_name, bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info);
 
 // Merges the original node names from the debug information of 'from' to the
 // debug information of 'to'.
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 829c623..605275a 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -702,9 +702,10 @@
     DataType type, const TensorShape& shape, Tensor* out_tensor,
     AllocatorAttributes attr, const AllocationAttributes& allocation_attr) {
   Allocator* a = get_allocator(attr);
-  AllocationAttributes logged_attr(allocation_attr);
-  logged_attr.allocation_will_be_logged = true;
-  Tensor new_tensor(a, type, shape, logged_attr);
+  Tensor new_tensor(a, type, shape,
+                    AllocationAttributes(allocation_attr.no_retry_on_failure,
+                                         /* allocation_will_be_logged= */ true,
+                                         allocation_attr.freed_by_func));
 
   if (!new_tensor.IsInitialized()) {
     return errors::ResourceExhausted(
@@ -750,6 +751,9 @@
       int64 alloc_size = a->AllocatedSize(out_temp->tensor_data().data());
       record_temp_memory_allocation(alloc_size, *out_temp);
     }
+  } else if (record_memory_consumption_) {
+    mutex_lock l(stats_mu_);
+    temp_memory_allocated_ += out_temp->TotalBytes();
   }
   return s;
 }
@@ -774,6 +778,10 @@
         int64 alloc_id = a->AllocationId(t->tensor_data().data());
         record_persistent_memory_allocation(alloc_size, alloc_id);
       }
+    } else if (record_memory_consumption_) {
+      mutex_lock l(stats_mu_);
+      persistent_memory_allocated_ +=
+          out_persistent->AccessTensor(this)->TotalBytes();
     }
   }
   return s;
@@ -977,7 +985,10 @@
 // This maps from 'op_type' + DeviceType to the set of KernelDefs and
 // factory functions for instantiating the OpKernel that matches the
 // KernelDef.
-typedef std::unordered_multimap<string, KernelRegistration> KernelRegistry;
+struct KernelRegistry {
+  mutex mu;
+  std::unordered_multimap<string, KernelRegistration> registry GUARDED_BY(mu);
+};
 
 #if defined(_WIN32)
 static const char kKernelLibPattern[] = "libtfkernel*.dll";
@@ -1105,9 +1116,12 @@
     // before some file libraries can initialize, which in turn crashes the
     // program flakily. Until we get rid of static initializers in kernel
     // registration mechanism, we have this workaround here.
-    reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry())
-        ->emplace(key, KernelRegistration(*kernel_def, kernel_class_name,
-                                          std::move(factory)));
+    auto global_registry =
+        reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry());
+    mutex_lock l(global_registry->mu);
+    global_registry->registry.emplace(
+        key,
+        KernelRegistration(*kernel_def, kernel_class_name, std::move(factory)));
   }
   delete kernel_def;
 }
@@ -1124,28 +1138,33 @@
 static const StringPiece kKernelAttr("_kernel");
 
 // TODO(irving): Replace with const Node& version below.
-Status FindKernelRegistration(const DeviceType& device_type,
-                              const NodeDef& node_def,
-                              const KernelRegistration** reg,
-                              bool* was_attr_mismatch) {
+Status FindKernelRegistration(
+    const DeviceType& device_type, StringPiece node_name,
+    bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info,
+    StringPiece node_op, AttrSlice node_attrs, const KernelRegistration** reg,
+    bool* was_attr_mismatch) {
   *reg = nullptr;
   *was_attr_mismatch = false;
   // Label defaults to empty if not found in NodeDef.
-  const string& label = GetNodeAttrString(node_def, kKernelAttr);
+  const string& label = GetNodeAttrString(node_attrs, kKernelAttr);
 
-  const string key = Key(node_def.op(), device_type, label);
-  auto regs = GlobalKernelRegistryTyped()->equal_range(key);
+  const string key = Key(node_op, device_type, label);
+  auto typed_registry = GlobalKernelRegistryTyped();
+  tf_shared_lock lock(typed_registry->mu);
+  auto regs = typed_registry->registry.equal_range(key);
   for (auto iter = regs.first; iter != regs.second; ++iter) {
     // If there is a kernel registered for the op and device_type,
     // check that the attrs match.
     bool match;
-    TF_RETURN_IF_ERROR(KernelAttrsMatch(iter->second.def, node_def, &match));
+    TF_RETURN_IF_ERROR(KernelAttrsMatch(iter->second.def, node_attrs, &match));
     if (match) {
       if (*reg != nullptr) {
         return errors::InvalidArgument(
             "Multiple OpKernel registrations match NodeDef '",
-            FormatNodeDefForError(node_def), "': '",
-            ProtoShortDebugString((*reg)->def), "' and '",
+            FormatNodeDefForError(node_name, has_experimental_debug_info,
+                                  experimental_debug_info),
+            "': '", ProtoShortDebugString((*reg)->def), "' and '",
             ProtoShortDebugString(iter->second.def), "'");
       }
       *reg = &iter->second;
@@ -1156,6 +1175,16 @@
   return Status::OK();
 }
 
+Status FindKernelRegistration(const DeviceType& device_type,
+                              const NodeDef& node_def,
+                              const KernelRegistration** reg,
+                              bool* was_attr_mismatch) {
+  return FindKernelRegistration(
+      device_type, node_def.name(), node_def.has_experimental_debug_info(),
+      node_def.experimental_debug_info(), node_def.op(),
+      AttrSlice(&node_def.attr()), reg, was_attr_mismatch);
+}
+
 }  // namespace
 
 bool KernelDefAvailable(const DeviceType& device_type,
@@ -1168,24 +1197,31 @@
 }
 
 // TODO(irving): Change const NodeDef& to const Node&
-Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
-                     const KernelDef** def, string* kernel_class_name) {
+Status FindKernelDef(
+    const DeviceType& device_type, StringPiece node_name,
+    bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info,
+    StringPiece node_op, StringPiece node_device, AttrSlice node_attrs,
+    const KernelDef** def, string* kernel_class_name) {
   const KernelRegistration* reg = nullptr;
   bool was_attr_mismatch;
-  TF_RETURN_IF_ERROR(
-      FindKernelRegistration(device_type, node_def, &reg, &was_attr_mismatch));
+  TF_RETURN_IF_ERROR(FindKernelRegistration(
+      device_type, node_name, has_experimental_debug_info,
+      experimental_debug_info, node_op, node_attrs, &reg, &was_attr_mismatch));
   if (reg == nullptr) {
     Status s = errors::NotFound(
-        "No registered '", node_def.op(), "' OpKernel for ",
+        "No registered '", node_op, "' OpKernel for ",
         DeviceTypeString(device_type), " devices compatible with node ",
-        FormatNodeDefForError(node_def));
+        FormatNodeDefForError(node_name, has_experimental_debug_info,
+                              experimental_debug_info));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
           &s, " (OpKernel was found, but attributes didn't match) ",
-          "Requested Attributes: ", SummarizeAttrs(node_def));
+          "Requested Attributes: ",
+          SummarizeAttrsHelper(node_attrs, node_device));
     }
-    errors::AppendToMessage(
-        &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
+    errors::AppendToMessage(&s,
+                            ".  Registered:", KernelsRegisteredForOp(node_op));
     return s;
   }
   if (def != nullptr) *def = &reg->def;
@@ -1193,6 +1229,14 @@
   return Status::OK();
 }
 
+Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
+                     const KernelDef** def, string* kernel_class_name) {
+  return FindKernelDef(
+      device_type, node_def.name(), node_def.has_experimental_debug_info(),
+      node_def.experimental_debug_info(), node_def.op(), node_def.device(),
+      AttrSlice(&node_def.attr()), def, kernel_class_name);
+}
+
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
     PrioritizedDeviceTypeVector* prioritized_device_types) {
@@ -1241,10 +1285,11 @@
 
 KernelList GetFilteredRegisteredKernels(
     const std::function<bool(const KernelDef&)>& predicate) {
-  const KernelRegistry* const typed_registry = GlobalKernelRegistryTyped();
+  KernelRegistry* const typed_registry = GlobalKernelRegistryTyped();
   KernelList kernel_list;
-  kernel_list.mutable_kernel()->Reserve(typed_registry->size());
-  for (const auto& p : *typed_registry) {
+  tf_shared_lock lock(typed_registry->mu);
+  kernel_list.mutable_kernel()->Reserve(typed_registry->registry.size());
+  for (const auto& p : typed_registry->registry) {
     const KernelDef& kernel_def = p.second.def;
     if (predicate(kernel_def)) {
       *kernel_list.add_kernel() = kernel_def;
@@ -1370,7 +1415,9 @@
 }  // namespace
 
 Status ValidateKernelRegistrations(const OpRegistryInterface& op_registry) {
-  for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
+  auto typed_registry = GlobalKernelRegistryTyped();
+  tf_shared_lock lock(typed_registry->mu);
+  for (const auto& key_registration : typed_registry->registry) {
     const KernelDef& kernel_def(key_registration.second.def);
     const OpRegistrationData* op_reg_data;
     const Status status = op_registry.LookUp(kernel_def.op(), &op_reg_data);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 6531b1a..d25f155 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -18,9 +18,9 @@
 
 #include <atomic>
 #include <functional>
-
 #include <utility>
 #include <vector>
+
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/control_flow.h"
@@ -28,6 +28,7 @@
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/rendezvous.h"
@@ -66,6 +67,7 @@
 
 class AsyncOpKernel;
 class CallFrameInterface;
+class DeviceMgr;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
 class OpKernelContext;       // declared below,
@@ -647,6 +649,8 @@
     // Mechanism used by this op kernel invocation to communicate with
     // computations running on other devices.
     Rendezvous* rendezvous = nullptr;
+    const std::function<Status(const int64, const DeviceMgr*, Rendezvous** r)>*
+        create_rendezvous = nullptr;  // Null unless the caller provides one.
 
     // Mechanism for executing a collective op that needs to coordinate
     // with parallel instances running on other devices.
@@ -1082,6 +1086,10 @@
   // An op kernel communicates with outside environment through
   // Rendezvous Send() and Recv().
   Rendezvous* rendezvous() const { return params_->rendezvous; }
+  Status create_rendezvous(const int64 step_id, const DeviceMgr* device_mgr,
+                           Rendezvous** r) const {
+    return (*params_->create_rendezvous)(step_id, device_mgr, r);
+  }
 
   CollectiveExecutor* collective_executor() const {
     return params_->collective_executor;
@@ -1225,6 +1233,8 @@
 
   bool input_is_ref(int index) const;
 
+  void set_record_memory_consumption(bool v) { record_memory_consumption_ = v; }
+
   // Used by OpKernel implementations to track actively running deferred ops.
   //
   // A deferred op is one whose Compute method returns (or whose ComputeAsync
@@ -1245,6 +1255,7 @@
 
  private:
   Allocator* get_allocator(AllocatorAttributes attr);
+  bool record_memory_consumption_ = false;
 
   // Internal method to add a tensor's buffer to the list of buffers
   // referenced during the execution of the Op, so that GPUs may
@@ -1433,6 +1444,17 @@
 // Checks whether a given kernel is registered on device_type.
 bool KernelDefAvailable(const DeviceType& device_type, const NodeDef& node_def);
 
+// If the node described by node_name, experimental_debug_info, node_op,
+// node_device and node_attrs has a corresponding kernel registered on
+// device_type, returns OK and fills in the kernel def and kernel_class_name.
+// <def> and <kernel_class_name> may be null.
+Status FindKernelDef(
+    const DeviceType& device_type, StringPiece node_name,
+    bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info,
+    StringPiece node_op, StringPiece node_device, AttrSlice node_attrs,
+    const KernelDef** def, string* kernel_class_name);
+
 // If node_def has a corresponding kernel registered on device_type,
 // returns OK and fill in the kernel def and kernel_class_name. <def> and
 // <kernel_class_name> may be null.
diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc
index e84143f..c50d916 100644
--- a/tensorflow/core/framework/rendezvous.cc
+++ b/tensorflow/core/framework/rendezvous.cc
@@ -182,7 +182,13 @@
 
     // There is an earliest waiter to consume this message.
     Item* item = queue->front();
-    queue->pop_front();
+
+    // Delete the queue when the last element has been consumed.
+    if (queue->size() == 1) {
+      table_.erase(key_hash);
+    } else {
+      queue->pop_front();
+    }
     mu_.unlock();
 
     // Notify the waiter by invoking its done closure, outside the
@@ -225,7 +231,13 @@
     // A message has already arrived and is queued in the table under
     // this key.  Consumes the message and invokes the done closure.
     Item* item = queue->front();
-    queue->pop_front();
+
+    // Delete the queue when the last element has been consumed.
+    if (queue->size() == 1) {
+      table_.erase(key_hash);
+    } else {
+      queue->pop_front();
+    }
     mu_.unlock();
 
     // Invokes the done() by invoking its done closure, outside scope
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index 47c009a..c4db98f 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -97,27 +97,30 @@
 
 // Does unlock and unref automatically when going out of scope, and also
 // supports early manual release.
-class ScopedUnlockUnrefVar {
+class SCOPED_LOCKABLE ScopedUnlockUnrefVar {
  public:
-  explicit ScopedUnlockUnrefVar(Var* var) : var_(var) {
+  explicit ScopedUnlockUnrefVar(Var* var) EXCLUSIVE_LOCK_FUNCTION(var_->mu())
+      : var_(var) {
     if (var_) {
       var_->mu()->lock();
     }
   }
-  void Release() {
+  void Release() UNLOCK_FUNCTION() {
     if (var_) {
       var_->mu()->unlock();
       var_->Unref();
       var_ = nullptr;
     }
   }
-  ~ScopedUnlockUnrefVar() { Release(); }
+  ~ScopedUnlockUnrefVar() UNLOCK_FUNCTION() { Release(); }
 
  private:
   Var* var_;
 
   ScopedUnlockUnrefVar(const ScopedUnlockUnrefVar&) = delete;
-  void operator=(const ScopedUnlockUnrefVar&) = delete;
+  ScopedUnlockUnrefVar(ScopedUnlockUnrefVar&&) = delete;
+  ScopedUnlockUnrefVar& operator=(const ScopedUnlockUnrefVar&) = delete;
+  ScopedUnlockUnrefVar& operator=(ScopedUnlockUnrefVar&&) = delete;
 };
 
 }  //  end namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
index 55790b6..fa7d46c 100644
--- a/tensorflow/core/framework/run_handler.cc
+++ b/tensorflow/core/framework/run_handler.cc
@@ -92,25 +92,18 @@
       handlers_.emplace_back(new RunHandler::Impl(this));
       free_handlers_.push_back(handlers_.back().get());
     }
-    // Set steal partitions to a fixed size steal domain of size 6 = 2 *
-    // kMinThreadsPerRequest.
+
     std::vector<std::pair<unsigned, unsigned>> steal_partitions(
         num_inter_op_threads);
-    int kStealDomainSize = std::min(6, num_inter_op_threads);
-    unsigned steal_start = 0, steal_end = kStealDomainSize;
+    std::vector<std::uint_fast32_t> start_vec(num_inter_op_threads);
+    std::vector<std::uint_fast32_t> end_vec(num_inter_op_threads);
+
+    ComputeInterOpStealingRanges(num_inter_op_threads, kMinThreadsPerDomain,
+                                 &start_vec, &end_vec);
     for (int i = 0; i < num_inter_op_threads; ++i) {
-      if (i > steal_start) {
-        if (steal_end + kStealDomainSize < num_inter_op_threads) {
-          steal_start = steal_end;
-          steal_end += kStealDomainSize;
-        } else {
-          steal_end = num_inter_op_threads;
-          steal_start = steal_end - kStealDomainSize;
-        }
-      }
-      steal_partitions[i] = std::make_pair(steal_start, steal_end);
-      VLOG(1) << "Steal partition i: " << i << " steal_start: " << steal_start
-              << " steal_end: " << steal_end;
+      steal_partitions[i] = std::make_pair(start_vec[i], end_vec[i]);
+      VLOG(1) << "Steal partition i: " << i << " steal_start: " << start_vec[i]
+              << " steal_end: " << end_vec[i];
     }
     inter_op_thread_pool_->SetStealPartitions(steal_partitions);
   }
@@ -183,6 +176,14 @@
   // inference).
   const int max_handlers_;
 
+  // Minimum number of threads allocated to process a request.
+  const int kMinThreadsPerRequest = 3;
+
+  // Minimum number of threads in a steal domain. Each thread will first try
+  // to steal from threads in the same domain before stealing from threads
+  // in different domains.
+  const int kMinThreadsPerDomain = 2 * kMinThreadsPerRequest;
+
   // Thread safe part.
   const std::unique_ptr<thread::ThreadPool> inter_op_thread_pool_;
 
@@ -209,7 +210,6 @@
   inter_op_start_.resize(num_active_requests);
   inter_op_limit_.resize(num_active_requests);
 
-  const int kMinThreadsPerRequest = 3;
   ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
                                  kMinThreadsPerRequest, &inter_op_start_,
                                  &inter_op_limit_);
diff --git a/tensorflow/core/framework/run_handler_util.cc b/tensorflow/core/framework/run_handler_util.cc
index 3087998..0fe99f0 100644
--- a/tensorflow/core/framework/run_handler_util.cc
+++ b/tensorflow/core/framework/run_handler_util.cc
@@ -54,4 +54,25 @@
     last_cumulative_weight = cumulative_weight;
   }
 }
+
+void ComputeInterOpStealingRanges(int num_threads, int min_threads_per_domain,
+                                  std::vector<std::uint_fast32_t>* start_vec,
+                                  std::vector<std::uint_fast32_t>* end_vec) {
+  int steal_domain_size = std::min(min_threads_per_domain, num_threads);
+  unsigned steal_start = 0, steal_end = steal_domain_size;
+  for (int i = 0; i < num_threads; ++i) {
+    if (i >= steal_end) {
+      if (steal_end + steal_domain_size < num_threads) {
+        steal_start = steal_end;
+        steal_end += steal_domain_size;
+      } else {
+        steal_end = num_threads;
+        steal_start = steal_end - steal_domain_size;
+      }
+    }
+    start_vec->at(i) = steal_start;
+    end_vec->at(i) = steal_end;
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler_util.h b/tensorflow/core/framework/run_handler_util.h
index c0c36ae..a6ee4e8 100644
--- a/tensorflow/core/framework/run_handler_util.h
+++ b/tensorflow/core/framework/run_handler_util.h
@@ -24,7 +24,7 @@
 // Assign thread ranges to requests.
 // Requests are numbered 0...num_active_requests-1, and
 // threads are numbered 0...num_threads-1.
-// On return, the range start_vec->at(i)...end_vec->at(i)-1
+// On return, the range [start_vec->at(i), end_vec->at(i))
 // indicates the subrange of the threads available to request i.
 // The ranges given to different requests may overlap.
 // Lower numbered requests will tend to be assigned more threads.
@@ -39,5 +39,12 @@
                                     std::vector<std::uint_fast32_t>* start_vec,
                                     std::vector<std::uint_fast32_t>* end_vec);
 
+// Assign a steal range to each thread. Threads are numbered 0...num_threads-1.
+// On return, the range [start_vec->at(i), end_vec->at(i)) indicates the steal
+// range of thread i. The ranges given to different threads may overlap.
+void ComputeInterOpStealingRanges(int num_threads, int min_threads_per_domain,
+                                  std::vector<std::uint_fast32_t>* start_vec,
+                                  std::vector<std::uint_fast32_t>* end_vec);
+
 }  // end namespace tensorflow
 #endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
diff --git a/tensorflow/core/framework/run_handler_util_test.cc b/tensorflow/core/framework/run_handler_util_test.cc
index a1928c1..1fcdc68 100644
--- a/tensorflow/core/framework/run_handler_util_test.cc
+++ b/tensorflow/core/framework/run_handler_util_test.cc
@@ -22,8 +22,9 @@
 namespace tensorflow {
 namespace {
 
-void VerifyFunction(int num_active_requests, int num_threads,
-                    int min_threads_per_request, bool print_stats = false) {
+void VerifySchedulingRanges(int num_active_requests, int num_threads,
+                            int min_threads_per_request,
+                            bool print_stats = false) {
   if (print_stats) {
     LOG(INFO) << "Test case# num_active_requests: " << num_active_requests
               << " num_threads: " << num_threads
@@ -82,12 +83,35 @@
          ++num_active_requests) {
       for (int num_threads = min_threads_per_request;
            num_threads <= kMaxThreads; ++num_threads) {
-        VerifyFunction(num_active_requests, num_threads,
-                       min_threads_per_request);
+        VerifySchedulingRanges(num_active_requests, num_threads,
+                               min_threads_per_request);
       }
     }
   }
 }
 
+TEST(RunHandlerUtilTest, TestComputeInterOpStealingRanges) {
+  // Checks the half-open steal range [start, end) assigned to every thread.
+  int num_inter_op_threads = 9;
+  std::vector<std::uint_fast32_t> start_vec(num_inter_op_threads);
+  std::vector<std::uint_fast32_t> end_vec(num_inter_op_threads);
+
+  // When there are 9 threads, there should be two thread groups.
+  // The first group has threads [0, 6) with stealing range [0, 6)
+  // The second group has threads [6, 9) with stealing range [3, 9)
+  ComputeInterOpStealingRanges(num_inter_op_threads, 6, &start_vec, &end_vec);
+  // Same element type as the output vectors, so the comparisons are not mixed.
+  const std::uint_fast32_t stealing_ranges[2][2] = {{0, 6}, {3, 9}};
+
+  for (int i = 0; i < num_inter_op_threads; ++i) {
+    const std::uint_fast32_t expected_start = stealing_ranges[i / 6][0];
+    const std::uint_fast32_t expected_end = stealing_ranges[i / 6][1];
+    string message =
+        strings::StrCat("Stealing range of thread ", i, " should be [",
+                        expected_start, ", ", expected_end, ")");
+    ASSERT_EQ(start_vec[i], expected_start) << message;
+    ASSERT_EQ(end_vec[i], expected_end) << message;
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index e97b3c5..1bd2a43 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -35,6 +35,7 @@
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
@@ -443,12 +444,14 @@
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n)
-    : BufferBase(a, a->Allocate<T>(n)), elem_(n) {}
+    : BufferBase(a, TypedAllocator::Allocate<T>(a, n, AllocationAttributes())),
+      elem_(n) {}
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n,
                   const AllocationAttributes& allocation_attr)
-    : BufferBase(a, a->Allocate<T>(n, allocation_attr)), elem_(n) {}
+    : BufferBase(a, TypedAllocator::Allocate<T>(a, n, allocation_attr)),
+      elem_(n) {}
 
 template <typename T>
 Buffer<T>::~Buffer() {
@@ -456,7 +459,7 @@
     if (LogMemory::IsEnabled()) {
       RecordDeallocation();
     }
-    alloc_->Deallocate<T>(static_cast<T*>(data()), elem_);
+    TypedAllocator::Deallocate<T>(alloc_, static_cast<T*>(data()), elem_);
   }
 }
 
@@ -734,7 +737,7 @@
     : shape_(shape), buf_(nullptr) {
   set_dtype(type);
   CHECK_NOTNULL(a);
-  if (shape_.num_elements() > 0 || a->ShouldAllocateEmptyTensors()) {
+  if (shape_.num_elements() > 0 || a->AllocatesOpaqueHandle()) {
     CASES(type, buf_ = new Buffer<T>(a, shape.num_elements()));
   }
   if (buf_ != nullptr && buf_->data() != nullptr && LogMemory::IsEnabled()) {
@@ -748,7 +751,7 @@
     : shape_(shape), buf_(nullptr) {
   set_dtype(type);
   CHECK_NOTNULL(a);
-  if (shape_.num_elements() > 0 || a->ShouldAllocateEmptyTensors()) {
+  if (shape_.num_elements() > 0 || a->AllocatesOpaqueHandle()) {
     CASES(type, buf_ = new Buffer<T>(a, shape.num_elements(), allocation_attr));
   }
   if (!allocation_attr.allocation_will_be_logged && buf_ != nullptr &&
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 6454cb8..edbdc29 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -43,6 +43,7 @@
 class Tensor;
 class TensorBuffer;
 class TensorCApi;
+class TensorCord;
 class TensorDescription;
 class TensorProto;
 class Var;
@@ -237,7 +238,8 @@
     return true;
 #else
     void* ptr = base<void>();
-    return reinterpret_cast<intptr_t>(ptr) % EIGEN_MAX_ALIGN_BYTES == 0;
+    return dtype() == DT_STRING ||
+           (reinterpret_cast<intptr_t>(ptr) % EIGEN_MAX_ALIGN_BYTES == 0);
 #endif
   }
 
@@ -606,6 +608,7 @@
 
   friend class DMAHelper;
   friend class TensorCApi;
+  friend class TensorCord;            // For access to buf_
   friend class TensorReference;       // For access to buf_
   friend class VariableOp;            // For access to set_shape
   friend class AutoReloadVariableOp;  // For access to set_shape
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index 3473a44..035bbef2 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -47,7 +47,7 @@
   TensorShapeRep(const TensorShapeRep& b);
   void operator=(const TensorShapeRep& b);
 
-  /// Move the specified shape.  After moving, <b> is safe for destruction and
+  /// Move the specified shape.  After moving, `b` is safe for destruction and
   // can be reassigned into, but its dimensions and number of elements can be
   // nonsensical (e.g., negative dimension sizes, or number of elements not
   // properly recomputed).
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/framework/typed_allocator.cc
similarity index 61%
rename from tensorflow/core/util/cuda_kernel_helper.h
rename to tensorflow/core/framework/typed_allocator.cc
index 04dba3e..25f15e5 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/framework/typed_allocator.cc
@@ -13,10 +13,20 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
-#define TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
+#include "tensorflow/core/framework/typed_allocator.h"
 
-// Forward to new header.
-#include "tensorflow/core/util/gpu_kernel_helper.h"
+#include "tensorflow/core/framework/variant.h"
 
-#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
+namespace tensorflow {
+
+/* static */
+void TypedAllocator::RunVariantCtor(Variant* p, size_t n) {
+  for (size_t i = 0; i < n; ++p, ++i) new (p) Variant();
+}
+
+/* static */
+void TypedAllocator::RunVariantDtor(Variant* p, size_t n) {
+  for (size_t i = 0; i < n; ++p, ++i) p->~Variant();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/typed_allocator.h b/tensorflow/core/framework/typed_allocator.h
new file mode 100644
index 0000000..37532fd
--- /dev/null
+++ b/tensorflow/core/framework/typed_allocator.h
@@ -0,0 +1,133 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_
+
+#include <limits>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/resource_handle.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class Variant;
+
+// Convenience functions to do typed allocation.  C++ constructors
+// and destructors are invoked for complex types if necessary.
+class TypedAllocator {
+ public:
+  // May return NULL if the tensor has too many elements to represent in a
+  // single allocation.
+  template <typename T>
+  static T* Allocate(Allocator* raw_allocator, size_t num_elements,
+                     const AllocationAttributes& allocation_attr) {
+    // TODO(jeff): Do we need to allow clients to pass in alignment
+    // requirements?
+
+    if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
+      return nullptr;
+    }
+
+    void* p =
+        raw_allocator->AllocateRaw(Allocator::kAllocatorAlignment,
+                                   sizeof(T) * num_elements, allocation_attr);
+    T* typed_p = reinterpret_cast<T*>(p);
+    if (typed_p) RunCtor<T>(raw_allocator, typed_p, num_elements);
+    return typed_p;
+  }
+
+  template <typename T>
+  static void Deallocate(Allocator* raw_allocator, T* ptr,
+                         size_t num_elements) {
+    if (ptr) {
+      RunDtor<T>(raw_allocator, ptr, num_elements);
+      raw_allocator->DeallocateRaw(ptr);
+    }
+  }
+
+ private:
+  // No constructors or destructors are run for simple types
+  template <typename T>
+  static void RunCtor(Allocator* raw_allocator, T* p, size_t n) {
+    static_assert(is_simple_type<T>::value, "T is not a simple type.");
+  }
+
+  template <typename T>
+  static void RunDtor(Allocator* raw_allocator, T* p, size_t n) {}
+
+  static void RunVariantCtor(Variant* p, size_t n);
+
+  static void RunVariantDtor(Variant* p, size_t n);
+};
+
+template <>
+/* static */
+inline void TypedAllocator::RunCtor(Allocator* raw_allocator, string* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    for (size_t i = 0; i < n; ++p, ++i) new (p) string();
+  }
+}
+
+template <>
+/* static */
+inline void TypedAllocator::RunDtor(Allocator* raw_allocator, string* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    for (size_t i = 0; i < n; ++p, ++i) p->~string();
+  }
+}
+
+template <>
+/* static */
+inline void TypedAllocator::RunCtor(Allocator* raw_allocator, ResourceHandle* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
+  }
+}
+
+template <>
+/* static */
+inline void TypedAllocator::RunDtor(Allocator* raw_allocator, ResourceHandle* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
+  }
+}
+
+template <>
+/* static */
+inline void TypedAllocator::RunCtor(Allocator* raw_allocator, Variant* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    RunVariantCtor(p, n);
+  }
+}
+
+template <>
+/* static */
+inline void TypedAllocator::RunDtor(Allocator* raw_allocator, Variant* p,
+                                    size_t n) {
+  if (!raw_allocator->AllocatesOpaqueHandle()) {
+    RunVariantDtor(p, n);
+  }
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index ef10ba1..39346df 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -40,7 +40,8 @@
 const char* const DEVICE_SYCL = "SYCL";
 
 const std::string DeviceName<Eigen::ThreadPoolDevice>::value = DEVICE_CPU;
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 const std::string DeviceName<Eigen::GpuDevice>::value = DEVICE_GPU;
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 5b6eb87..5f559f8 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -83,7 +83,8 @@
   static const std::string value;
 };
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 template <>
 struct DeviceName<Eigen::GpuDevice> {
   static const std::string value;
@@ -477,6 +478,12 @@
 // DEVICE_MEMORY otherwise.
 MemoryType MTypeFromDType(const DataType dtype);
 
+// Returns HOST_MEMORY if `dtype` is always on host, DEVICE_MEMORY otherwise.
+// The reason we have MTypeFromDType() and MTypeFromDTypeIntsOnDevice(): for
+// GPUs, we would like to keep int operations on host for performance concerns.
+// But for TPUs (and other devices), int operations are placed on device.
+MemoryType MTypeFromDTypeIntsOnDevice(const DataType dtype);
+
 // Types that always sit on host: DT_STRING, DT_STRING_REF, DT_RESOURCE.
 // For DT_RESOURCE, the handle always sits on host (even if the underlying
 // object has device-allocated resources).
diff --git a/tensorflow/core/framework/variant.cc b/tensorflow/core/framework/variant.cc
index d43e3c7..e61afea 100644
--- a/tensorflow/core/framework/variant.cc
+++ b/tensorflow/core/framework/variant.cc
@@ -23,9 +23,11 @@
 
 namespace tensorflow {
 
+Variant::~Variant() { clear(); }
+
 bool Variant::Decode(VariantTensorData data) {
   if (!is_empty()) {
-    return value_->Decode(std::move(data));
+    return GetValue()->Decode(std::move(data));
   }
   return true;
 }
@@ -35,7 +37,7 @@
   if (is_empty()) {
     return nullptr;
   }
-  return value_->RawPtr();
+  return GetValue()->RawPtr();
 }
 
 template <>
@@ -43,7 +45,7 @@
   if (is_empty()) {
     return nullptr;
   }
-  return value_->RawPtr();
+  return GetValue()->RawPtr();
 }
 
 template <>
diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h
index 10eabbc8..9b4ee86 100644
--- a/tensorflow/core/framework/variant.h
+++ b/tensorflow/core/framework/variant.h
@@ -23,6 +23,7 @@
 #include <unordered_map>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -68,7 +69,7 @@
 //
 //   string TypeName() const;
 //   void Encode(VariantTensorData* data) const;
-//   void Decode(VariantTensorData data);
+//   bool Decode(VariantTensorData data);
 //
 // Simple POD types can elide the Encode/Decode functions, they are provided by
 // helper methods.
@@ -149,39 +150,57 @@
 //
 class Variant {
  public:
-  constexpr Variant() noexcept = default;
+  Variant() noexcept : is_inline_(false) {}
 
-  Variant(const Variant& other)
-      : value_(other.is_empty() ? std::unique_ptr<ValueInterface>()
-                                : other.value_->Clone()) {}
+  ~Variant();
 
-  Variant(Variant&& other) noexcept = default;
+  Variant(const Variant& other);
+  Variant(Variant&& other) noexcept;
 
-  // Make sure that the type is CopyConstructible and not a tensorflow::Variant
-  // object itself. We want the copy constructor to be chosen for the
-  // tensorflow::Variant case.
+  // Make sure that the type is CopyConstructible and not a
+  // tensorflow::Variant object itself. We want the copy constructor to be
+  // chosen for the tensorflow::Variant case.
+  template <typename T, typename VT = typename std::decay<T>::type,
+            typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                        std::is_move_constructible<VT>::value,
+                                    void>::type* = nullptr>
+  Variant(T&& value);
+
   template <typename T, typename VT = typename std::decay<T>::type,
             typename std::enable_if<!std::is_same<Variant, VT>::value &&
                                         std::is_copy_constructible<VT>::value,
                                     void>::type* = nullptr>
-  Variant(T&& value)  // NOLINT
-      : value_(new Value<VT>(in_place, std::forward<T>(value))) {}
+  Variant(const T& value);
+
+  template <typename T, typename VT = typename std::decay<T>::type,
+            typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                        std::is_copy_constructible<VT>::value,
+                                    void>::type* = nullptr>
+  Variant& operator=(const T& value);
+
+  template <typename T, typename VT = typename std::decay<T>::type,
+            typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                        std::is_move_constructible<VT>::value,
+                                    void>::type* = nullptr>
+  Variant& operator=(T&& value);
 
   Variant& operator=(const Variant& rhs) {
+    if (&rhs == this) return *this;
     Variant(rhs).swap(*this);
     return *this;
   }
 
   Variant& operator=(Variant&& rhs) noexcept {
+    if (&rhs == this) return *this;
     Variant(std::move(rhs)).swap(*this);
     return *this;
   }
 
-  bool is_empty() const { return value_ == nullptr; }
+  bool is_empty() const { return GetValue() == nullptr; }
 
-  void clear() noexcept { value_.reset(); }
+  void clear() noexcept;
 
-  void swap(Variant& other) noexcept { value_.swap(other.value_); }
+  void swap(Variant& other) noexcept;
 
   // Note, unlike TypeName(), TypeId() does not return the TypeIndex
   // of the original type when a TensorValueDataProto is stored as the
@@ -191,12 +210,13 @@
     if (is_empty()) {
       return VoidTypeIndex;
     }
-    return value_->TypeId();
+    return GetValue()->TypeId();
   }
 
   string DebugString() const {
-    return strings::StrCat("Variant<type: ", TypeName(),
-                           " value: ", value_->DebugString(), ">");
+    return strings::StrCat(
+        "Variant<type: ", TypeName(),
+        " value: ", is_empty() ? "[empty]" : GetValue()->DebugString(), ">");
   }
 
   // Returns a pointer to the stored value if it is type T, or nullptr
@@ -205,7 +225,7 @@
   T* get() {
     const TypeIndex TTypeIndex = MakeTypeIndex<T>();
     if (is_empty() || (TTypeIndex != TypeId())) return nullptr;
-    return std::addressof(static_cast<Variant::Value<T>*>(value_.get())->value);
+    return std::addressof(static_cast<Variant::Value<T>*>(GetValue())->value);
   }
 
   // Returns a pointer to the stored value if it is type T, or nullptr
@@ -215,7 +235,7 @@
     const TypeIndex TTypeIndex = MakeTypeIndex<T>();
     if (is_empty() || (TTypeIndex != TypeId())) return nullptr;
     return std::addressof(
-        static_cast<const Variant::Value<T>*>(value_.get())->value);
+        static_cast<const Variant::Value<T>*>(GetValue())->value);
   }
 
   // Returns TypeNameVariant(value).
@@ -227,13 +247,13 @@
     if (is_empty()) {
       return "";
     }
-    return value_->TypeName();
+    return GetValue()->TypeName();
   }
 
   // Serialize the contents of the stored object into `data`.
   void Encode(VariantTensorData* data) const {
     if (!is_empty()) {
-      value_->Encode(data);
+      GetValue()->Encode(data);
     }
   }
 
@@ -243,26 +263,36 @@
   // Helper methods to directly serialize/deserialize from strings.
   void Encode(string* buf) const {
     if (!is_empty()) {
-      value_->Encode(buf);
+      GetValue()->Encode(buf);
     }
   }
   bool Decode(string buf) {
     if (!is_empty()) {
-      return value_->Decode(std::move(buf));
+      return GetValue()->Decode(std::move(buf));
     }
     return true;
   }
 
+  template <typename VT>
+  static constexpr bool CanInlineType() {
+    return ((sizeof(Value<VT>) <= InlineValue::kMaxValueSize) &&
+            (alignof(Value<VT>) <= kMaxInlineValueAlignSize));
+  }
+
  private:
   struct in_place_t {};
-  static constexpr in_place_t in_place{};
+  static constexpr in_place_t kInPlace{};
 
   struct ValueInterface {
     virtual ~ValueInterface() = default;
     virtual TypeIndex TypeId() const = 0;
     virtual void* RawPtr() = 0;
     virtual const void* RawPtr() const = 0;
-    virtual std::unique_ptr<ValueInterface> Clone() const = 0;
+    virtual ValueInterface* Clone() const = 0;
+    virtual void CloneInto(ValueInterface* memory) const = 0;
+    virtual void Swap(ValueInterface* memory) = 0;
+    virtual void MoveAssign(ValueInterface* memory) = 0;
+    virtual void MoveInto(ValueInterface* memory) = 0;
     virtual string TypeName() const = 0;
     virtual string DebugString() const = 0;
     virtual void Encode(VariantTensorData* data) const = 0;
@@ -277,6 +307,10 @@
     explicit Value(in_place_t /*tag*/, Args&&... args)
         : value(std::forward<Args>(args)...) {}
 
+    // NOTE(ebrevdo): Destructor must be explicitly defined for CUDA to happily
+    // build `alignof(Value<void*>)`.
+    ~Value() final = default;
+
     TypeIndex TypeId() const override {
       const TypeIndex value_type_index =
           MakeTypeIndex<typename std::decay<T>::type>();
@@ -287,8 +321,33 @@
 
     const void* RawPtr() const override { return &value; }
 
-    std::unique_ptr<ValueInterface> Clone() const override {
-      return std::unique_ptr<ValueInterface>(new Value(in_place, value));
+    ValueInterface* Clone() const override {
+      // NOTE: Use placement new here because we override `operator delete`,
+      // and need to match the call to `port::Free()` with a call to
+      // `port::Malloc()`.
+      auto* clone = static_cast<Value*>(port::Malloc(sizeof(Value)));
+      new (clone) Value(kInPlace, value);
+      return clone;
+    }
+
+    void MoveAssign(ValueInterface* memory) override {
+      CHECK(TypeId() == memory->TypeId())
+          << TypeId().name() << " vs. " << memory->TypeId().name();
+      static_cast<Value*>(memory)->value = std::move(value);
+    }
+
+    void CloneInto(ValueInterface* memory) const override {
+      new (memory) Value(kInPlace, value);
+    }
+
+    void MoveInto(ValueInterface* memory) override {
+      new (memory) Value(kInPlace, std::move(value));
+    }
+
+    void Swap(ValueInterface* memory) override {
+      CHECK(TypeId() == memory->TypeId())
+          << TypeId().name() << " vs. " << memory->TypeId().name();
+      std::swap(value, static_cast<Value*>(memory)->value);
     }
 
     string TypeName() const override { return TypeNameVariant(value); }
@@ -307,14 +366,363 @@
 
     bool Decode(string buf) override { return DecodeVariant(&buf, &value); }
 
+    // We override operator delete in order to selectively free memory
+    // depending on if Value<VT> is stored inline or on the heap:
+    //
+    // Value<VT> is stored inline if its size <= InlineValue::kMaxValueSize and
+    // its alignment <= kMaxInlineValueAlignSize.  This check is performed by
+    // CanInlineType<VT>().
+    //
+    // We only need to call its destructor in this case and then overwrite
+    // the inline memory with zeros.  Variant::clear() does this.
+    // Thus, in the inline case, the delete operator does nothing (calling
+    // delete on the memory location calls the destructor only).
+    //
+    // If !CanInlineType<VT>(), then it is stored as a pointer inside HeapValue.
+    // The memory buffer it resides in on the heap was allocated with
+    // port::Malloc, and it should be deallocated via port::Free.
+    //
+    // operator delete is stored in the vtable since ~ValueInterface is a
+    // virtual destructor; furthermore it has access to VT and can calculate
+    // CanInlineType<VT>().
+    static void operator delete(void* ptr);
+
+    static void operator delete(void*, void*) {
+      // Some compilers require an overridden class-specific deallocation
+      // function, which will be called if placement `new` throws an
+      // exception.
+    }
+
     T value;
   };
+  static constexpr int kMaxInlineValueAlignSize = alignof(Value<void*>);
+
+  using HeapValue = std::unique_ptr<ValueInterface>;
+
+  struct InlineValue {
+    // We try to size InlineValue so that sizeof(Variant) <= 64 and it can fit
+    // into the aligned space of a TensorBuffer.
+    static constexpr int kMaxValueSize = (64 - /*some extra padding=*/16);
+
+    typedef char ValueDataArray[kMaxValueSize];
+    alignas(kMaxInlineValueAlignSize) ValueDataArray value_data;
+    bool has_value = false;
+
+    explicit InlineValue() {}
+
+    InlineValue(const InlineValue& other) noexcept
+        : has_value(other.has_value) {
+      if (other.has_value) {
+        other.AsValueInterface()->CloneInto(AsValueInterface());
+      }
+    }
+
+    InlineValue(InlineValue&& other) noexcept : has_value(other.has_value) {
+      if (other.has_value) {
+        other.AsValueInterface()->MoveInto(AsValueInterface());
+        other.Cleanup();
+      }
+    }
+
+    void Cleanup() {
+      // **NOTE** This must be a no-op if the memory representation of
+      // InlineValue is all zeros, in order to properly interact with
+      // HeapOrInline::ResetMemory().
+      if (has_value) {
+        // This doesn't actually delete anything on the heap; the delete
+        // operator of Value<VT> is overridden to do nothing for inline
+        // values; the side-effect of delete is that the virtual destructor is
+        // called.
+        //
+        // We leave it to callers to overwrite the data buffer in value_data
+        // with new objects.
+        delete AsValueInterface();
+      }
+      has_value = false;
+    }
+
+    InlineValue& operator=(const InlineValue& other) {
+      if (&other == this) return *this;
+      Cleanup();
+      if (other.has_value) {
+        other.AsValueInterface()->CloneInto(AsValueInterface());
+      }
+      has_value = other.has_value;
+      return *this;
+    }
+
+    // Move-assigns `other` into this slot:
+    //  - `other` empty: our value (if any) is destroyed and we become empty.
+    //  - Same stored type: the payloads are swapped, so `other` keeps our old
+    //    value and stays non-empty.
+    //  - Otherwise: our value (if any) is destroyed, the payload of `other` is
+    //    move-constructed into our buffer, and `other` is left empty.
+    InlineValue& operator=(InlineValue&& other) {
+      if (&other == this) return *this;
+      if (!other.has_value) {
+        Cleanup();
+        return *this;
+      }
+      ValueInterface* src = other.AsValueInterface();
+      ValueInterface* dst = AsValueInterface();
+      if (has_value && dst->TypeId() == src->TypeId()) {
+        src->Swap(dst);
+        return *this;
+      }
+      // Reaching here means we are empty or hold a different type, so a typed
+      // move-assignment is never applicable; Cleanup() is a no-op when empty.
+      Cleanup();
+      src->MoveInto(dst);
+      other.Cleanup();
+      has_value = true;
+      return *this;
+    }
+
+    ValueInterface* AsValueInterface() {
+      return reinterpret_cast<ValueInterface*>(value_data);
+    }
+
+    const ValueInterface* AsValueInterface() const {
+      return reinterpret_cast<const ValueInterface*>(value_data);
+    }
+
+    // **WARNING** This must be a no-op when the byte-representation of
+    // InlineValue is all zeros.
+    ~InlineValue() { Cleanup(); }
+  };
 
   // value_ can point to any type T as wrapped by a ValueInterface.
   // The only real requirement is that T is default-constructible.
-  std::unique_ptr<ValueInterface> value_;
+  union HeapOrInline {
+    HeapOrInline() { ResetMemory(); }
+    explicit HeapOrInline(HeapValue&& v) : heap_value(std::move(v)) {}
+    explicit HeapOrInline(InlineValue&& v) : inline_value(std::move(v)) {}
+    ~HeapOrInline() {}  // Taken care of by owner.
+
+    // This must be called when modifying which element of HeapOrInline is
+    // being used, because the destructor of the new class may be called
+    // while the memory is still a representation of the old class.
+    // **WARNING** This code assumes that the destructors of HeapValue and
+    // InlineValue are no-ops when the internal representation is zeros.
+    //
+    // Example of when this is needed:
+    //   value.heap_value = HeapValue(...);
+    //   // Segfault.  This calls InlineValue::Cleanup on value.inline_value
+    //   // but the internal memory representation is that of HeapValue.
+    //   value.inline_value = InlineValue();
+    //
+    //   The correct way to do this:
+    //   value.heap_value = HeapValue(...);
+    //   value.ResetMemory();
+    //   value.inline_value = InlineValue();
+    void ResetMemory();
+
+    HeapValue heap_value;
+    InlineValue inline_value;
+  } value_;
+  bool is_inline_;
+
+  bool IsInlineValue() const { return is_inline_; }
+
+  ValueInterface* GetValue() {
+    if (IsInlineValue()) {
+      return value_.inline_value.AsValueInterface();
+    } else {
+      return value_.heap_value.get();
+    }
+  }
+
+  const ValueInterface* GetValue() const {
+    if (IsInlineValue()) {
+      return value_.inline_value.AsValueInterface();
+    } else {
+      return value_.heap_value.get();
+    }
+  }
+
+  // PRECONDITION: Called on construction or clear() has been called before
+  // this method.
+  template <typename T, typename VT>
+  void InsertValueMove(T&& value) {
+    if (is_inline_) {
+      Value<VT>* inline_value_data =
+          reinterpret_cast<Value<VT>*>(value_.inline_value.value_data);
+      new (inline_value_data) Value<VT>(kInPlace, std::forward<T>(value));
+      value_.inline_value.has_value = true;
+    } else {
+      auto* moved = static_cast<Value<VT>*>(port::Malloc(sizeof(Value<VT>)));
+      new (moved) Value<VT>(kInPlace, std::forward<T>(value));
+      value_.heap_value = HeapValue(moved);
+    }
+  }
+
+  // PRECONDITION: Called on construction or clear() has been called before
+  // this method.
+  template <typename T, typename VT>
+  void InsertValueCopy(const T& value) {
+    if (is_inline_) {
+      Value<VT>* inline_value_data =
+          reinterpret_cast<Value<VT>*>(value_.inline_value.value_data);
+      new (inline_value_data) Value<VT>(kInPlace, value);
+      value_.inline_value.has_value = true;
+    } else {
+      auto* moved = static_cast<Value<VT>*>(port::Malloc(sizeof(Value<VT>)));
+      new (moved) Value<VT>(kInPlace, value);
+      value_.heap_value = HeapValue(moved);
+    }
+  }
 };
 
+// Make sure that a Variant object can reside in a 64-byte aligned Tensor
+// buffer.
+static_assert(sizeof(Variant) <= 64,
+              "Expected internal representation to be at most 64 bytes.");
+
+inline Variant::Variant(const Variant& other) : is_inline_(other.is_inline_) {
+  if (!other.is_empty()) {
+    if (other.IsInlineValue()) {
+      value_.inline_value = InlineValue();
+      other.GetValue()->CloneInto(GetValue());
+      value_.inline_value.has_value = true;
+    } else {
+      value_.heap_value = HeapValue(other.GetValue()->Clone());
+      is_inline_ = false;
+    }
+  }
+}
+
+inline Variant::Variant(Variant&& other) noexcept
+    : is_inline_(other.is_inline_) {
+  if (!other.is_empty()) {
+    if (other.IsInlineValue()) {
+      value_.inline_value = InlineValue();
+      other.GetValue()->MoveInto(GetValue());
+      value_.inline_value.has_value = true;
+    } else {
+      value_.heap_value = std::move(other.value_.heap_value);
+      is_inline_ = false;
+    }
+  }
+}
+
+template <typename VT>
+void Variant::Value<VT>::operator delete(void* ptr) {
+  if (!CanInlineType<VT>()) port::Free(ptr);
+}
+
+template <typename T, typename VT,
+          typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                      std::is_move_constructible<VT>::value,
+                                  void>::type*>
+inline Variant::Variant(T&& value) : is_inline_(CanInlineType<VT>()) {
+  InsertValueMove<T, VT>(std::forward<T>(value));
+}
+
+template <typename T, typename VT,
+          typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                      std::is_copy_constructible<VT>::value,
+                                  void>::type*>
+inline Variant::Variant(const T& value) : is_inline_(CanInlineType<VT>()) {
+  InsertValueCopy<T, VT>(value);
+}
+
+template <typename T, typename VT,
+          typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                      std::is_move_constructible<VT>::value,
+                                  void>::type*>
+inline Variant& Variant::operator=(T&& value) {
+  clear();
+  is_inline_ = CanInlineType<VT>();
+  InsertValueMove<T, VT>(std::forward<T>(value));
+  return *this;
+}
+
+template <typename T, typename VT,
+          typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                      std::is_copy_constructible<VT>::value,
+                                  void>::type*>
+inline Variant& Variant::operator=(const T& value) {
+  clear();
+  is_inline_ = CanInlineType<VT>();
+  InsertValueCopy<T, VT>(value);
+  return *this;
+}
+
+inline void Variant::HeapOrInline::ResetMemory() {
+  memset(  // NOLINT: not TriviallyCopyable
+      this, 0, sizeof(Variant::HeapOrInline));
+}
+
+inline void Variant::clear() noexcept {
+  if (!is_empty()) {
+    if (IsInlineValue()) {
+      value_.inline_value.~InlineValue();
+    } else {
+      value_.heap_value.~HeapValue();
+    }
+    value_.ResetMemory();
+  }
+  is_inline_ = false;
+}
+
+inline void Variant::swap(Variant& other) noexcept {
+  if (is_empty()) {
+    if (other.IsInlineValue()) {
+      value_.ResetMemory();
+      value_.inline_value = std::move(other.value_.inline_value);
+      other.value_.ResetMemory();
+      other.value_.heap_value = HeapValue();
+      is_inline_ = true;
+      other.is_inline_ = false;
+    } else {
+      value_.ResetMemory();
+      value_.heap_value = std::move(other.value_.heap_value);
+      other.value_.ResetMemory();
+      other.value_.heap_value = HeapValue();
+      is_inline_ = false;
+      other.is_inline_ = false;
+    }
+  } else if (other.is_empty()) {
+    if (IsInlineValue()) {
+      other.value_.ResetMemory();
+      other.value_.inline_value = std::move(value_.inline_value);
+      value_.ResetMemory();
+      value_.heap_value = HeapValue();
+      other.is_inline_ = true;
+      is_inline_ = false;
+    } else {
+      other.value_.ResetMemory();
+      other.value_.heap_value = std::move(value_.heap_value);
+      value_.ResetMemory();
+      value_.heap_value = HeapValue();
+      other.is_inline_ = false;
+      is_inline_ = false;
+    }
+  } else {  // Both Variants have values.
+    if (other.IsInlineValue() && IsInlineValue()) {
+      std::swap(value_.inline_value, other.value_.inline_value);
+    } else if (!other.IsInlineValue() && !IsInlineValue()) {
+      std::swap(value_.heap_value, other.value_.heap_value);
+    } else if (other.IsInlineValue() && !IsInlineValue()) {
+      HeapValue v = std::move(value_.heap_value);
+      value_.ResetMemory();
+      value_.inline_value = std::move(other.value_.inline_value);
+      other.value_.ResetMemory();
+      other.value_.heap_value = std::move(v);
+      is_inline_ = true;
+      other.is_inline_ = false;
+    } else {  // !other.IsInlineValue() && IsInlineValue()
+      HeapValue v = std::move(other.value_.heap_value);
+      other.value_.ResetMemory();
+      other.value_.inline_value = std::move(value_.inline_value);
+      value_.ResetMemory();
+      value_.heap_value = std::move(v);
+      is_inline_ = false;
+      other.is_inline_ = true;
+    }
+  }
+}
+
 template <>
 void* Variant::get();
 
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index d98cf6b..8c654cc 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -62,6 +62,10 @@
     return GetMetadata<T>(value, PODResolver<T>());
   }
 
+  string& metadata_string() { return metadata_; }
+
+  const string& metadata_string() const { return metadata_; }
+
   // Tensors contained within objects being serialized.
   int tensors_size() const;
   const Tensor& tensors(int index) const;
diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc
index 8947f93..f12b0ea 100644
--- a/tensorflow/core/framework/variant_test.cc
+++ b/tensorflow/core/framework/variant_test.cc
@@ -13,15 +13,18 @@
 limitations under the License.
 ==============================================================================*/
 
-#include <vector>
-
 #include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_tensor_data.h"
+
+#include <xmmintrin.h>
+
+#include <vector>
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -29,17 +32,206 @@
 
 namespace {
 
-template <typename T>
+template <typename T, bool BIG>
 struct Wrapper {
   T value;
+  char big[BIG ? 256 : 1];  // 256-byte pad when BIG; never 0 (non-standard).
   string TypeName() const { return "POD"; }
 };
 
-using Int = Wrapper<int>;
-using Float = Wrapper<float>;
+template <bool BIG>
+using Int = Wrapper<int, BIG>;
+
+template <bool BIG>
+using Float = Wrapper<float, BIG>;
+
+template <bool BIG>
+class MaybeAlive {
+ public:
+  MaybeAlive() : alive_(false) {}
+
+  explicit MaybeAlive(bool alive) : alive_(alive) {
+    if (alive) ++live_counter_;
+  }
+
+  ~MaybeAlive() {
+    if (alive_) --live_counter_;
+  }
+
+  MaybeAlive(const MaybeAlive& rhs) : alive_(rhs.alive_) {
+    if (alive_) ++live_counter_;
+  }
+
+  MaybeAlive& operator=(const MaybeAlive& rhs) {
+    if (this == &rhs) return *this;
+    if (alive_) --live_counter_;
+    alive_ = rhs.alive_;
+    if (alive_) ++live_counter_;
+    return *this;
+  }
+
+  MaybeAlive(MaybeAlive&& rhs) : alive_(false) {
+    alive_ = std::move(rhs.alive_);
+    if (alive_) ++live_counter_;
+  }
+
+  MaybeAlive& operator=(MaybeAlive&& rhs) {
+    if (this == &rhs) return *this;
+    if (alive_) --live_counter_;
+    alive_ = std::move(rhs.alive_);
+    if (alive_) ++live_counter_;
+    return *this;
+  }
+
+  static int LiveCounter() { return live_counter_; }
+
+  string TypeName() const { return "MaybeAlive"; }
+  void Encode(VariantTensorData* data) const {}
+  bool Decode(VariantTensorData data) { return false; }
+
+ private:
+  bool alive_;
+  char big_[BIG ? 256 : 1];  // 256-byte pad when BIG; never 0 (non-standard).
+  static int live_counter_;
+};
+
+template <>
+int MaybeAlive<false>::live_counter_ = 0;
+template <>
+int MaybeAlive<true>::live_counter_ = 0;
+
+template <bool BIG>
+class DeleteCounter {
+ public:
+  DeleteCounter() : big_{}, counter_(nullptr) {}
+  explicit DeleteCounter(int* counter) : big_{}, counter_(counter) {}
+  ~DeleteCounter() {
+    if (counter_) ++*counter_;
+  }
+  // Need custom move operations because int* just gets copied on move, but we
+  // need to clear counter_ on move.
+  DeleteCounter& operator=(const DeleteCounter& rhs) = default;
+  DeleteCounter& operator=(DeleteCounter&& rhs) {
+    if (this == &rhs) return *this;
+    counter_ = rhs.counter_;
+    rhs.counter_ = nullptr;
+    return *this;
+  }
+  DeleteCounter(DeleteCounter&& rhs) {
+    counter_ = rhs.counter_;
+    rhs.counter_ = nullptr;
+  }
+  DeleteCounter(const DeleteCounter& rhs) = default;
+  char big_[BIG ? 256 : 1];  // 256-byte pad when BIG; never 0 (non-standard).
+  int* counter_;
+
+  string TypeName() const { return "DeleteCounter"; }
+  void Encode(VariantTensorData* data) const {}
+  bool Decode(VariantTensorData data) { return false; }
+};
 
 }  // end namespace
 
+TEST(VariantTest, MoveAndCopyBetweenBigAndSmall) {
+  Variant x;
+  int deleted_big = 0;
+  int deleted_small = 0;
+  x = DeleteCounter</*BIG=*/true>(&deleted_big);
+  EXPECT_EQ(deleted_big, 0);
+  x = DeleteCounter</*BIG=*/false>(&deleted_small);
+  EXPECT_EQ(deleted_big, 1);
+  EXPECT_EQ(deleted_small, 0);
+  x = DeleteCounter</*BIG=*/true>(&deleted_big);
+  EXPECT_EQ(deleted_big, 1);
+  EXPECT_EQ(deleted_small, 1);
+  x.clear();
+  EXPECT_EQ(deleted_big, 2);
+  EXPECT_EQ(deleted_small, 1);
+  DeleteCounter</*BIG=*/true> big(&deleted_big);
+  DeleteCounter</*BIG=*/false> small(&deleted_small);
+  EXPECT_EQ(deleted_big, 2);
+  EXPECT_EQ(deleted_small, 1);
+  x = big;
+  EXPECT_EQ(deleted_big, 2);
+  EXPECT_EQ(deleted_small, 1);
+  x = small;
+  EXPECT_EQ(deleted_big, 3);
+  EXPECT_EQ(deleted_small, 1);
+  x = std::move(big);
+  EXPECT_EQ(deleted_big, 3);
+  EXPECT_EQ(deleted_small, 2);
+  x = std::move(small);
+  EXPECT_EQ(deleted_big, 4);
+  EXPECT_EQ(deleted_small, 2);
+  x.clear();
+  EXPECT_EQ(deleted_big, 4);
+  EXPECT_EQ(deleted_small, 3);
+}
+
+TEST(VariantTest, MoveAndCopyBetweenBigAndSmallVariants) {
+  int deleted_big = 0;
+  int deleted_small = 0;
+  {
+    Variant x = DeleteCounter</*BIG=*/true>(&deleted_big);
+    Variant y = DeleteCounter</*BIG=*/false>(&deleted_small);
+    EXPECT_EQ(deleted_big, 0);
+    EXPECT_EQ(deleted_small, 0);
+    x = y;
+    EXPECT_EQ(deleted_big, 1);
+    EXPECT_EQ(deleted_small, 0);
+    x = x;
+    EXPECT_EQ(deleted_big, 1);
+    EXPECT_EQ(deleted_small, 0);
+    EXPECT_NE(x.get<DeleteCounter<false>>(), nullptr);
+    EXPECT_NE(y.get<DeleteCounter<false>>(), nullptr);
+    x = std::move(y);
+    EXPECT_EQ(deleted_small, 1);
+    EXPECT_NE(x.get<DeleteCounter<false>>(), nullptr);
+  }
+  EXPECT_EQ(deleted_big, 1);
+  EXPECT_EQ(deleted_small, 2);
+
+  deleted_big = 0;
+  deleted_small = 0;
+  {
+    Variant x = DeleteCounter</*BIG=*/false>(&deleted_small);
+    Variant y = DeleteCounter</*BIG=*/true>(&deleted_big);
+    EXPECT_EQ(deleted_big, 0);
+    EXPECT_EQ(deleted_small, 0);
+    x = y;
+    EXPECT_EQ(deleted_big, 0);
+    EXPECT_EQ(deleted_small, 1);
+    x = x;
+    EXPECT_EQ(deleted_big, 0);
+    EXPECT_EQ(deleted_small, 1);
+    EXPECT_NE(x.get<DeleteCounter<true>>(), nullptr);
+    EXPECT_NE(y.get<DeleteCounter<true>>(), nullptr);
+    x = std::move(y);
+    EXPECT_EQ(deleted_big, 1);
+    EXPECT_NE(x.get<DeleteCounter<true>>(), nullptr);
+  }
+  EXPECT_EQ(deleted_big, 2);
+  EXPECT_EQ(deleted_small, 1);
+}
+
+template <bool BIG>
+void TestDestructOnVariantMove() {
+  CHECK_EQ(MaybeAlive<BIG>::LiveCounter(), 0);
+  {
+    Variant a = MaybeAlive<BIG>(true);
+    Variant b = std::move(a);
+  }
+  EXPECT_EQ(MaybeAlive<BIG>::LiveCounter(), 0);
+}
+
+TEST(VariantTest, RHSDestructOnVariantMoveBig) {
+  TestDestructOnVariantMove</*BIG=*/true>();
+}
+
+TEST(VariantTest, RHSDestructOnVariantMoveSmall) {
+  TestDestructOnVariantMove</*BIG=*/false>();
+}
+
 TEST(VariantTest, Int) {
   Variant x;
   EXPECT_EQ(x.get<void>(), nullptr);
@@ -49,45 +241,125 @@
   EXPECT_EQ(x.TypeName(), "int");
 }
 
-TEST(VariantTest, Basic) {
+struct MayCreateAlignmentDifficulties {
+  int a;
+  __m128 b;
+};
+
+bool M128AllEqual(const __m128& a, const __m128& b) {
+  return _mm_movemask_ps(_mm_cmpeq_ps(a, b)) == 0xf;
+}
+
+TEST(VariantTest, NotAlignable) {
+  Variant x;
+  EXPECT_EQ(x.get<void>(), nullptr);
+  __m128 v = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+  x = MayCreateAlignmentDifficulties{-1, v};
+  EXPECT_NE(x.get<void>(), nullptr);
+  auto* x_val = x.get<MayCreateAlignmentDifficulties>();
+  // check that *x_val == x
+  Variant y = x;
+  EXPECT_EQ(x_val->a, -1);
+  EXPECT_TRUE(M128AllEqual(x_val->b, v));
+  auto* y_val = y.get<MayCreateAlignmentDifficulties>();
+  EXPECT_EQ(y_val->a, -1);
+  EXPECT_TRUE(M128AllEqual(y_val->b, v));
+  Variant z = std::move(y);
+  auto* z_val = z.get<MayCreateAlignmentDifficulties>();
+  EXPECT_EQ(z_val->a, -1);
+  EXPECT_TRUE(M128AllEqual(z_val->b, v));
+}
+
+template <bool BIG>
+void TestBasic() {
   Variant x;
   EXPECT_EQ(x.get<void>(), nullptr);
 
-  x = Int{42};
+  x = Int<BIG>{42};
 
   EXPECT_NE(x.get<void>(), nullptr);
-  EXPECT_NE(x.get<Int>(), nullptr);
-  EXPECT_EQ(x.get<Int>()->value, 42);
+  EXPECT_NE(x.get<Int<BIG>>(), nullptr);
+  EXPECT_EQ(x.get<Int<BIG>>()->value, 42);
   EXPECT_EQ(x.TypeName(), "POD");
 }
 
-TEST(VariantTest, ConstGet) {
+TEST(VariantTest, Basic) { TestBasic<false>(); }
+
+TEST(VariantTest, BasicBig) { TestBasic<true>(); }
+
+template <bool BIG>
+void TestConstGet() {
   Variant x;
   EXPECT_EQ(x.get<void>(), nullptr);
 
-  x = Int{42};
+  x = Int<BIG>{42};
 
   const Variant y = x;
 
   EXPECT_NE(y.get<void>(), nullptr);
-  EXPECT_NE(y.get<Int>(), nullptr);
-  EXPECT_EQ(y.get<Int>()->value, 42);
+  EXPECT_NE(y.get<Int<BIG>>(), nullptr);
+  EXPECT_EQ(y.get<Int<BIG>>()->value, 42);
 }
 
-TEST(VariantTest, Clear) {
+TEST(VariantTest, ConstGet) { TestConstGet<false>(); }
+
+TEST(VariantTest, ConstGetBig) { TestConstGet<true>(); }
+
+template <bool BIG>
+void TestClear() {
   Variant x;
   EXPECT_EQ(x.get<void>(), nullptr);
 
-  x = Int{42};
+  x = Int<BIG>{42};
 
   EXPECT_NE(x.get<void>(), nullptr);
-  EXPECT_NE(x.get<Int>(), nullptr);
-  EXPECT_EQ(x.get<Int>()->value, 42);
+  EXPECT_NE(x.get<Int<BIG>>(), nullptr);
+  EXPECT_EQ(x.get<Int<BIG>>()->value, 42);
 
   x.clear();
   EXPECT_EQ(x.get<void>(), nullptr);
 }
 
+TEST(VariantTest, Clear) { TestClear<false>(); }
+
+TEST(VariantTest, ClearBig) { TestClear<true>(); }
+
+template <bool BIG>
+void TestClearDeletes() {
+  Variant x;
+  EXPECT_EQ(x.get<void>(), nullptr);
+
+  int deleted_count = 0;
+  using DC = DeleteCounter<BIG>;
+  DC dc(&deleted_count);
+  EXPECT_EQ(deleted_count, 0);
+  x = dc;
+  EXPECT_EQ(deleted_count, 0);
+
+  EXPECT_NE(x.get<void>(), nullptr);
+  EXPECT_NE(x.get<DC>(), nullptr);
+
+  x.clear();
+  EXPECT_EQ(x.get<void>(), nullptr);
+  EXPECT_EQ(deleted_count, 1);
+
+  x = dc;
+  EXPECT_EQ(deleted_count, 1);
+
+  Variant y = x;
+  EXPECT_EQ(deleted_count, 1);
+
+  x.clear();
+  EXPECT_EQ(deleted_count, 2);
+
+  y.clear();
+  EXPECT_EQ(deleted_count, 3);
+}
+
+TEST(VariantTest, ClearDeletesOnHeap) { TestClearDeletes</*BIG=*/true>(); }
+
+TEST(VariantTest, ClearDeletesOnStack) { TestClearDeletes</*BIG=*/false>(); }
+
 TEST(VariantTest, Tensor) {
   Variant x;
   Tensor t(DT_FLOAT, {});
@@ -101,6 +373,16 @@
   EXPECT_EQ(x.TypeName(), "tensorflow::Tensor");
 }
 
+TEST(VariantTest, NontrivialTensorVariantCopy) {
+  Tensor variants(DT_VARIANT, {});
+  Tensor t(true);
+  test::FillValues<Variant>(&variants, gtl::ArraySlice<Variant>({t}));
+  const Tensor* t_c = variants.flat<Variant>()(0).get<Tensor>();
+  EXPECT_EQ(t_c->dtype(), t.dtype());
+  EXPECT_EQ(t_c->shape(), t.shape());
+  EXPECT_EQ(t_c->scalar<bool>()(), t.scalar<bool>()());
+}
+
 TEST(VariantTest, TensorProto) {
   Variant x;
   TensorProto t;
@@ -114,31 +396,41 @@
   EXPECT_EQ(x.get<TensorProto>()->tensor_shape().unknown_rank(), true);
 }
 
-TEST(VariantTest, CopyValue) {
+template <bool BIG>
+void TestCopyValue() {
   Variant x, y;
-  x = Int{10};
+  x = Int<BIG>{10};
   y = x;
 
-  EXPECT_EQ(x.get<Int>()->value, 10);
-  EXPECT_EQ(x.get<Int>()->value, y.get<Int>()->value);
+  EXPECT_EQ(x.get<Int<BIG>>()->value, 10);
+  EXPECT_EQ(x.get<Int<BIG>>()->value, y.get<Int<BIG>>()->value);
 }
 
-TEST(VariantTest, MoveValue) {
+TEST(VariantTest, CopyValue) { TestCopyValue<false>(); }
+
+TEST(VariantTest, CopyValueBig) { TestCopyValue<true>(); }
+
+template <bool BIG>
+void TestMoveValue() {
   Variant x;
   x = []() -> Variant {
     Variant y;
-    y = Int{10};
+    y = Int<BIG>{10};
     return y;
   }();
-  EXPECT_EQ(x.get<Int>()->value, 10);
+  EXPECT_EQ(x.get<Int<BIG>>()->value, 10);
 }
 
+TEST(VariantTest, MoveValue) { TestMoveValue<false>(); }
+
+TEST(VariantTest, MoveValueBig) { TestMoveValue<true>(); }
+
 TEST(VariantTest, TypeMismatch) {
   Variant x;
-  x = Int{10};
+  x = Int<false>{10};
   EXPECT_EQ(x.get<float>(), nullptr);
   EXPECT_EQ(x.get<int>(), nullptr);
-  EXPECT_NE(x.get<Int>(), nullptr);
+  EXPECT_NE(x.get<Int<false>>(), nullptr);
 }
 
 struct TensorList {
@@ -206,19 +498,26 @@
                 "Variant<type: TensorList value: ", data.DebugString(), ">"));
 }
 
-TEST(VariantTest, VariantArray) {
+template <bool BIG>
+void TestVariantArray() {
   Variant x[2];
-  x[0] = Int{2};
-  x[1] = Float{2.0f};
+  x[0] = Int<BIG>{2};
+  x[1] = Float<BIG>{2.0f};
 
-  EXPECT_EQ(x[0].get<Int>()->value, 2);
-  EXPECT_EQ(x[1].get<Float>()->value, 2.0f);
+  EXPECT_EQ(x[0].get<Int<BIG>>()->value, 2);
+  EXPECT_EQ(x[1].get<Float<BIG>>()->value, 2.0f);
 }
 
-TEST(VariantTest, PodUpdate) {
+TEST(VariantTest, VariantArray) { TestVariantArray<false>(); }
+
+TEST(VariantTest, VariantArrayBig) { TestVariantArray<true>(); }
+
+template <bool BIG>
+void PodUpdateTest() {
   struct Pod {
     int x;
     float y;
+    char big[BIG ? 256 : 1];  // 256-byte pad when BIG; never 0 (non-standard).
 
     string TypeName() const { return "POD"; }
   };
@@ -232,10 +531,16 @@
   EXPECT_EQ(x.get<Pod>()->x, 30);
 }
 
-TEST(VariantTest, EncodeDecodePod) {
+TEST(VariantTest, PodUpdate) { PodUpdateTest<false>(); }
+
+TEST(VariantTest, PodUpdateBig) { PodUpdateTest<true>(); }
+
+template <bool BIG>
+void TestEncodeDecodePod() {
   struct Pod {
     int x;
     float y;
+    char big[BIG ? 256 : 1];  // 256-byte pad when BIG; never 0 (non-standard).
 
     string TypeName() const { return "POD"; }
   };
@@ -247,14 +552,17 @@
   VariantTensorData serialized;
   x.Encode(&serialized);
 
-  Variant y;
-  y = Pod();
+  Variant y = Pod{};
   y.Decode(serialized);
 
   EXPECT_EQ(p.x, y.get<Pod>()->x);
   EXPECT_EQ(p.y, y.get<Pod>()->y);
 }
 
+TEST(VariantTest, EncodeDecodePod) { TestEncodeDecodePod<false>(); }
+
+TEST(VariantTest, EncodeDecodePodBig) { TestEncodeDecodePod<true>(); }
+
 TEST(VariantTest, EncodeDecodeTensor) {
   Variant x;
   Tensor t(DT_INT32, {});
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index 5abe77f..cbef1c2 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -25,6 +25,15 @@
 
 // Control flow info for a graph node.
 struct ControlFlowInfo {
+  // 'frame' and 'parent_frame' are pointers to:
+  //
+  // a) One of the Enter nodes corresponding to the loop body, if the node
+  //    executes inside a loop. If multiple tensors enter the while loop, it's
+  //    undefined which Enter node will be used.
+  //
+  // b) SOURCE node (node.id() == Graph::kSourceId), if the node is not inside
+  //    any of the while loops.
+
   const Node* frame = nullptr;         // frame of a node
   const Node* parent_frame = nullptr;  // parent frame of a node
   string frame_name;                   // frame name of a node
diff --git a/tensorflow/core/graph/gradients.cc b/tensorflow/core/graph/gradients.cc
index bec4171..75352fc 100644
--- a/tensorflow/core/graph/gradients.cc
+++ b/tensorflow/core/graph/gradients.cc
@@ -198,6 +198,9 @@
   void BackpropAlongEdge(const NodeOut& dst_grad, const NodeOut& src);
   void BackpropZerosAlongEdge(const NodeOut& src);
 
+  // Returns a node representing the sum of any backpropped gradients for 'src'.
+  // This will be an AddN node if there is more than one accumulated gradient.
+  // Returns zeros if there are no gradients, or the dtype is DT_BOOL.
   NodeOut SumGradients(const NodeOut& src);
 
   TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientBuilder);
@@ -296,7 +299,7 @@
   auto iter = backprops_.find(src);
   CHECK(iter != backprops_.end());
   const auto& grads = iter->second;
-  if (grads.empty()) {
+  if (grads.empty() || dtype == DT_BOOL) {
     // Nothing propagated back. The best we can come up is zeros.
     Node* zero_like = AddZerosLike(graph_, src);
     return {zero_like, 0};
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index e616f72..6574f3b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -315,9 +315,15 @@
 // NodeDebugInfo
 
 NodeDebugInfo::NodeDebugInfo(const Node& n) : NodeDebugInfo(n.def()) {}
-NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef) : name(ndef.name()) {
-  if (ndef.has_experimental_debug_info()) {
-    const auto& names = ndef.experimental_debug_info().original_node_names();
+NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef)
+    : NodeDebugInfo(ndef.name(), ndef.has_experimental_debug_info(),
+                    ndef.experimental_debug_info()) {}
+NodeDebugInfo::NodeDebugInfo(
+    StringPiece node_name, bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info)
+    : name(node_name) {
+  if (has_experimental_debug_info) {
+    const auto& names = experimental_debug_info.original_node_names();
     original_node_names.assign(names.begin(), names.end());
   }
 }
@@ -447,8 +453,6 @@
   DCHECK(!node->IsSink());
 
   // Remove any edges involving this node.
-  free_edges_.reserve(free_edges_.size() + node->in_edges_.size() +
-                      node->out_edges_.size());
   for (const Edge* e : node->in_edges_) {
     CHECK_EQ(e->src_->out_edges_.erase(e), size_t{1});
     edges_[e->id_] = nullptr;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 197058e..6913f50 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -40,7 +40,9 @@
 #include <functional>
 #include <string>
 #include <vector>
+
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/edgeset.h"
@@ -309,6 +311,8 @@
 
   NodeDebugInfo(const Node& n);
   NodeDebugInfo(const NodeDef& ndef);
+  NodeDebugInfo(StringPiece node_name, bool has_experimental_debug_info,
+                const NodeDef_ExperimentalDebugInfo& experimental_debug_info);
 };
 
 // Represents an input of a node, i.e., the `index`-th input to `node`.
@@ -766,15 +770,20 @@
   return IsIdentity(node) || IsControlFlow(node);
 }
 
+// NOTE: We declare Reference type of NodeIter and NeighborIter as Node* (see
+// https://en.cppreference.com/w/cpp/iterator/iterator).
+
 // Iterator for stepping through the nodes of a graph.
-class NodeIter {
+class NodeIter
+    : public std::iterator<std::forward_iterator_tag, Node, std::ptrdiff_t,
+                           /*Pointer*/ Node*, /*Reference*/ Node*> {
  public:
   NodeIter(const Graph* graph, int id);
-  bool operator==(const NodeIter& rhs);
-  bool operator!=(const NodeIter& rhs);
+  bool operator==(const NodeIter& rhs) const;
+  bool operator!=(const NodeIter& rhs) const;
   void operator++();
-  Node* operator*();
-  Node* operator->();
+  reference operator*() const;
+  pointer operator->() const;
 
  private:
   // Invariant: id_ == graph_->num_node_ids() || graph_->FindId(id_) != nullptr
@@ -783,14 +792,16 @@
 };
 
 // Iterator for stepping through the neighbors of a node.
-class NeighborIter {
+class NeighborIter
+    : public std::iterator<std::forward_iterator_tag, Node, std::ptrdiff_t,
+                           /*Pointer*/ Node*, /*Reference*/ Node*> {
  public:
   NeighborIter(EdgeSet::const_iterator iter, bool incoming);
-  bool operator==(const NeighborIter& rhs);
-  bool operator!=(const NeighborIter& rhs);
+  bool operator==(const NeighborIter& rhs) const;
+  bool operator!=(const NeighborIter& rhs) const;
   void operator++();
-  Node* operator*();
-  Node* operator->();
+  reference operator*() const;
+  pointer operator->() const;
 
  private:
   EdgeSet::const_iterator iter_;
@@ -802,12 +813,12 @@
 inline NodeIter::NodeIter(const Graph* graph, int id)
     : graph_(graph), id_(id) {}
 
-inline bool NodeIter::operator==(const NodeIter& rhs) {
+inline bool NodeIter::operator==(const NodeIter& rhs) const {
   DCHECK(graph_ == rhs.graph_);
   return id_ == rhs.id_;
 }
 
-inline bool NodeIter::operator!=(const NodeIter& rhs) {
+inline bool NodeIter::operator!=(const NodeIter& rhs) const {
   return !(*this == rhs);
 }
 
@@ -821,29 +832,29 @@
   }
 }
 
-inline Node* NodeIter::operator*() { return graph_->FindNodeId(id_); }
+inline Node* NodeIter::operator*() const { return graph_->FindNodeId(id_); }
 
-inline Node* NodeIter::operator->() { return graph_->FindNodeId(id_); }
+inline Node* NodeIter::operator->() const { return graph_->FindNodeId(id_); }
 
 inline NeighborIter::NeighborIter(EdgeSet::const_iterator iter, bool incoming)
     : iter_(iter), incoming_(incoming) {}
 
-inline bool NeighborIter::operator==(const NeighborIter& rhs) {
+inline bool NeighborIter::operator==(const NeighborIter& rhs) const {
   return iter_ == rhs.iter_ && incoming_ == rhs.incoming_;
 }
 
-inline bool NeighborIter::operator!=(const NeighborIter& rhs) {
+inline bool NeighborIter::operator!=(const NeighborIter& rhs) const {
   return !(*this == rhs);
 }
 
 inline void NeighborIter::operator++() { ++iter_; }
 
-inline Node* NeighborIter::operator*() {
+inline Node* NeighborIter::operator*() const {
   const Edge* e = *iter_;
   return incoming_ ? e->src() : e->dst();
 }
 
-inline Node* NeighborIter::operator->() {
+inline Node* NeighborIter::operator->() const {
   const Edge* e = *iter_;
   return incoming_ ? e->src() : e->dst();
 }
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 2b9609b..6e7393e 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -1132,7 +1132,7 @@
   Status FuseNode(std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
                   const MklLayoutRewritePass::FusionInfo fi);
 
-  // Fuse tranpose(to "NHWC") + mklop("NHWC") + transpose(to "NCHW") into
+  // Fuse transpose(to "NHWC") + mklop("NHWC") + transpose(to "NCHW") into
   // mklop("NCHW").
   // Here "mklop" can be any MKL-DNN supported op, such as Conv2D.
   static Status FuseTransposeMklOpTranspose(
@@ -1180,34 +1180,27 @@
           TF_CHECK_OK(GetNodeAttr(perm_node->def(), "value", &proto));
 
           DataType type;
-          GetNodeAttr(perm_node->def(), "dtype", &type);
+          TF_CHECK_OK(GetNodeAttr(perm_node->def(), "dtype", &type));
 
-          // Here we directly access to the "tensor_content", rather than
-          // "int_val". This is because we find "int_val" is
-          // not set properly under some circumstances.
+          Tensor tensor;
+          if (!tensor.FromProto(*proto)) {
+            TF_CHECK_OK(errors::InvalidArgument(
+                "Could not construct Tensor from TensorProto in node: ",
+                node->name()));
+            return false;
+          }
+          DCHECK_EQ(tensor.dims(), 1);
+          DCHECK_EQ(tensor.dim_size(0), perm.size());
           if (type == DT_INT32) {
-            const int type_size = 4;
-            const int* tensor_content =
-                reinterpret_cast<const int*>(proto->tensor_content().c_str());
-            const int tensor_content_size =
-                proto->tensor_content().size() / type_size;
-
-            std::vector<int> perm_value(tensor_content,
-                                        tensor_content + tensor_content_size);
-
-            return perm_value == perm;
+            const auto tensor_content = tensor.flat<int>().data();
+            for (int i = 0; i < perm.size(); ++i)
+              if (tensor_content[i] != perm[i]) return false;
+            return true;
           } else if (type == DT_INT64) {
-            const int type_size = 8;
-            const long* tensor_content =
-                reinterpret_cast<const long*>(proto->tensor_content().c_str());
-            const int tensor_content_size =
-                proto->tensor_content().size() / type_size;
-
-            std::vector<long> perm_value(tensor_content,
-                                         tensor_content + tensor_content_size);
-            std::vector<long> long_perm(perm.cbegin(), perm.cend());
-
-            return perm_value == long_perm;
+            const auto tensor_content = tensor.flat<int64>().data();
+            for (int i = 0; i < perm.size(); ++i)
+              if (tensor_content[i] != perm[i]) return false;
+            return true;
           }
           return false;
         }
@@ -1248,9 +1241,9 @@
   static bool DequantizeRewrite(const Node* n) {
     DCHECK(n);
     Node* input = nullptr;
-    n->input_node(0, &input);
+    TF_CHECK_OK(n->input_node(0, &input));
     string mode_string;
-    GetNodeAttr(n->def(), "mode", &mode_string);
+    TF_CHECK_OK(GetNodeAttr(n->def(), "mode", &mode_string));
     if (mode_string != "SCALED") {
       VLOG(1) << "DequantizeRewrite: Mode is not SCALED. "
               << "This case is not optimized by Intel MKL kernel, thus using "
@@ -1279,10 +1272,11 @@
     string data_format_str;
     TensorFormat data_format;
     std::vector<int32> ksize, strides;
-    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
-    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+    TF_CHECK_OK(GetNodeAttr(n->def(), "ksize", &ksize));
+    TF_CHECK_OK(GetNodeAttr(n->def(), "strides", &strides));
+    TF_CHECK_OK(GetNodeAttr(n->def(), "data_format", &data_format_str));
+    bool result = FormatFromString(data_format_str, &data_format);
+    DCHECK(result);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
     if (GetTensorDim(ksize, data_format, 'N') == 1 &&
@@ -1303,7 +1297,7 @@
     CHECK_NOTNULL(n);
 
     int depth_radius;
-    CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true);
+    TF_CHECK_OK(GetNodeAttr(n->def(), "depth_radius", &depth_radius));
 
     // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN
     // and use eigen node instead
@@ -1361,11 +1355,11 @@
   static bool QuantizeOpRewrite(const Node* n) {
     DCHECK(n);
     Node* filter_node = nullptr;
-    n->input_node(0, &filter_node);
+    TF_CHECK_OK(n->input_node(0, &filter_node));
     string mode_string;
     string round_mode_string;
-    GetNodeAttr(n->def(), "mode", &mode_string);
-    GetNodeAttr(n->def(), "round_mode", &round_mode_string);
+    TF_CHECK_OK(GetNodeAttr(n->def(), "mode", &mode_string));
+    TF_CHECK_OK(GetNodeAttr(n->def(), "round_mode", &round_mode_string));
     if (mode_string != "SCALED" || round_mode_string != "HALF_TO_EVEN") {
       VLOG(1) << "QuantizeOpRewrite: Mode is not SCALED and/or"
               << "rounding mode is not HALF_TO_EVEN. "
@@ -2150,7 +2144,7 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -2237,7 +2231,7 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -2261,7 +2255,7 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
   // Check if filter is a constant.
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("Tpaddings", Tpaddings);
@@ -2378,7 +2372,7 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
 
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -2535,7 +2529,7 @@
   }
 
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("Tinput", Tinput);
@@ -2743,7 +2737,7 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
 
   Node* filter_node = nullptr;
-  orig_node->input_node(1, &filter_node);
+  TF_CHECK_OK(orig_node->input_node(1, &filter_node));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -3412,6 +3406,16 @@
     return nullptr;
   }
 
+  // We make an exception for Conv2D, as the corresponding MKL ops
+  // do not yet support the case of padding == EXPLICIT.
+  if (n->type_string() == csinfo_.conv2d ||
+      n->type_string() == csinfo_.conv2d_grad_input ||
+      n->type_string() == csinfo_.conv2d_grad_filter) {
+    string padding;
+    TF_CHECK_OK(GetNodeAttr(n->def(), "padding", &padding));
+    if (padding == "EXPLICIT") return nullptr;
+  }
+
   // We make an exception for __MklDummyConv2DWithBias,
   // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their
   // names do not match Mkl node names.
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index cf321f9..fc55090 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -132,6 +132,7 @@
     tags = [
         "no_cuda_on_cpu_tap",
         "no_gpu",
+        "nomsan",  # TODO(b/132138608): Re-enable this.
     ],
     deps = [
         ":single_machine",
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index fb740a1..4d8e567 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -1607,14 +1607,14 @@
 
   Status InferShapes(const NodeDef& node, NodeContext* c) {
     // Infer the shapes of output tensors.
-    if (!c->op_data || c->op_data->shape_inference_fn == nullptr) {
+    if (!c->op_data || c->op_data->shape_inference_fn == nullptr ||
+        !c->inference_context->Run(c->op_data->shape_inference_fn).ok()) {
       // Annotate outputs with unknown shapes. Update output shapes with
       // annotated information later on if available.
+      // Note that shape inference function may return an error, but we ignore
+      // it, and use UnknownShape in that case.
       TF_RETURN_IF_ERROR(
           c->inference_context->Run(shape_inference::UnknownShape));
-    } else {
-      TF_RETURN_IF_ERROR(
-          c->inference_context->Run(c->op_data->shape_inference_fn));
     }
     Status status = Status::OK();
     auto it = fed_ports_.find(node.name());
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 21583fe..72d3100 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -39,6 +39,7 @@
 constexpr char kDepthwiseConv2dNativeBackpropInput[] =
     "DepthwiseConv2dNativeBackpropInput";
 constexpr char kMatMul[] = "MatMul";
+constexpr char kXlaEinsum[] = "XlaEinsum";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kSparseTensorDenseMatMul[] = "SparseTensorDenseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
@@ -68,6 +69,12 @@
 constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
 constexpr char kQuantizedMatMul[] = "QuantizedMatMul";
 constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
+// Dynamic control flow ops.
+constexpr char kSwitch[] = "Switch";
+constexpr char kMerge[] = "Merge";
+constexpr char kEnter[] = "Enter";
+constexpr char kExit[] = "Exit";
+constexpr char kNextIteration[] = "NextIteration";
 // Persistent ops.
 constexpr char kConst[] = "Const";
 constexpr char kVariable[] = "Variable";
@@ -177,6 +184,17 @@
   return count;
 }
 
+// Returns true if any subscript character occurs more than once in 'dim_str'.
+// The 'idx + 1 < size()' bound avoids size_t underflow for an empty string.
+bool CheckRepeatedDimensions(const string& dim_str) {
+  for (size_t idx = 0; idx + 1 < dim_str.size(); ++idx) {
+    if (dim_str.find(dim_str[idx], idx + 1) != std::string::npos) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace
 
 // Return a minimum shape if the shape is unknown. If known, return the original
@@ -250,6 +268,7 @@
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
       {kQuantizedMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kXlaEinsum, wrap(&OpLevelCostEstimator::PredictEinsum)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kGuaranteeConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -268,6 +287,11 @@
       {kSqueeze, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kSwitch, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kMerge, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kEnter, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kExit, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kNextIteration, wrap(&OpLevelCostEstimator::PredictIdentity)},
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
@@ -1236,6 +1260,157 @@
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const {
+  // Einsum computes a generalized contraction between tensors of arbitrary
+  // dimension as defined by the equation written in the Einstein summation
+  // convention. The number of tensors in the computation and the number of
+  // contractions can be arbitrarily long. The current model only contemplates
+  // Einsum equations, which can be translated into a single BatchMatMul
+  // operation. Einsum operations with more than two operands are not currently
+  // supported. Subscripts where an axis appears more than once for a single
+  // input and ellipsis are currently also excluded. See:
+  // https://www.tensorflow.org/api_docs/python/tf/einsum
+  // We distinguish four kinds of dimensions, depending on their placement in
+  // the equation:
+  // + B: Batch dimensions: Dimensions which appear in both operands and RHS.
+  // + K: Contracting dimensions: These appear in both inputs but not RHS.
+  // + M: Operand A dimensions: These appear in the first operand and the RHS.
+  // + N: Operand B dimensions: These appear in the second operand and the RHS.
+  // Then, the operation to estimate is BatchMatMul([B,M,K],[B,K,N])
+  const auto& op_info = op_context.op_info;
+
+  string equation = op_info.attr().at("equation").s();
+  std::vector<string> equation_split = absl::StrSplit(equation, "->");
+
+  if (equation_split.empty()) {
+    LOG(WARNING) << "Einsum with malformed equation";
+    return PredictCostOfAnUnknownOp(op_context);
+  }
+  std::vector<string> input_split = absl::StrSplit(equation_split[0], ',');
+
+  // Only two operands plus a RHS are modeled; the equation must list both.
+  if (op_info.inputs_size() != 2 || equation_split.size() != 2 ||
+      input_split.size() != 2) {
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+    return PredictCostOfAnUnknownOp(op_context);
+  }
+  string rhs_str = equation_split[1];
+  string a_input_str = input_split[0];
+  string b_input_str = input_split[1];
+
+  // Ellipsis are not currently supported
+  if (a_input_str.find("...") != std::string::npos ||
+      b_input_str.find("...") != std::string::npos) {
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
+            << ", ellipsis not supported";
+    return PredictCostOfAnUnknownOp(op_context);
+  }
+
+  const auto& a_input = op_info.inputs(0);
+  const auto& b_input = op_info.inputs(1);
+  const int matrix_rank = 2;
+
+  bool found_unknown_shapes = false;
+  bool a_input_shape_unknown = false;
+  bool b_input_shape_unknown = false;
+
+  TensorShapeProto a_input_shape = MaybeGetMinimumShape(
+      a_input.shape(), std::max(matrix_rank, a_input.shape().dim_size()),
+      &a_input_shape_unknown);
+  TensorShapeProto b_input_shape = MaybeGetMinimumShape(
+      b_input.shape(), std::max(matrix_rank, b_input.shape().dim_size()),
+      &b_input_shape_unknown);
+
+  found_unknown_shapes = a_input_shape_unknown || b_input_shape_unknown ||
+                         (a_input.shape().dim_size() < matrix_rank) ||
+                         (b_input.shape().dim_size() < matrix_rank);
+
+  if (a_input_str.size() != a_input_shape.dim_size() ||
+      b_input_str.size() != b_input_shape.dim_size()) {
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
+            << ", equation subscripts don't match tensor rank.";
+    return PredictCostOfAnUnknownOp(op_context);
+  }
+
+  // Subscripts repeated within a single operand or the RHS are not supported.
+  if (CheckRepeatedDimensions(a_input_str) ||
+      CheckRepeatedDimensions(b_input_str) ||
+      CheckRepeatedDimensions(rhs_str)) {
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
+            << ", Subscripts where axis appears more than once for a single "
+               "input are not yet supported";
+    return PredictCostOfAnUnknownOp(op_context);
+  }
+
+  OpInfo batch_matmul_op_info = op_info;
+  batch_matmul_op_info.mutable_inputs()->Clear();
+  batch_matmul_op_info.set_op("BatchMatMul");
+
+  AttrValue transpose_attribute;
+  transpose_attribute.set_b(false);
+  (*batch_matmul_op_info.mutable_attr())["transpose_a"] = transpose_attribute;
+  (*batch_matmul_op_info.mutable_attr())["transpose_b"] = transpose_attribute;
+
+  OpInfo::TensorProperties* a_matrix = batch_matmul_op_info.add_inputs();
+  TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
+  a_matrix->set_dtype(a_input.dtype());
+
+  OpInfo::TensorProperties* b_matrix = batch_matmul_op_info.add_inputs();
+  b_matrix->set_dtype(b_input.dtype());
+  TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
+
+  TensorShapeProto_Dim m_dim;
+  TensorShapeProto_Dim n_dim;
+  TensorShapeProto_Dim k_dim;
+
+  m_dim.set_size(1);
+  n_dim.set_size(1);
+  k_dim.set_size(1);
+
+  for (int i_idx = 0; i_idx < a_input_str.size(); ++i_idx) {
+    if (b_input_str.find(a_input_str[i_idx]) == std::string::npos) {
+      if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) {
+        VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+        return PredictCostOfAnUnknownOp(op_context);
+      }
+
+      m_dim.set_size(m_dim.size() * a_input_shape.dim(i_idx).size());
+      continue;
+    } else if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) {
+      // The dimension does not appear in the RHS, therefore it is a contracting
+      // dimension.
+      k_dim.set_size(k_dim.size() * a_input_shape.dim(i_idx).size());
+      continue;
+    }
+    // It appears in both input operands, therefore we place it as an outer
+    // dimension for the Batch Matmul.
+    *(a_matrix_shape->add_dim()) = a_input_shape.dim(i_idx);
+    *(b_matrix_shape->add_dim()) = a_input_shape.dim(i_idx);
+  }
+  for (int i_idx = 0; i_idx < b_input_str.size(); ++i_idx) {
+    if (a_input_str.find(b_input_str[i_idx]) == std::string::npos) {
+      if (rhs_str.find(b_input_str[i_idx]) == std::string::npos) {
+        VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+        return PredictCostOfAnUnknownOp(op_context);
+      }
+      n_dim.set_size(n_dim.size() * b_input_shape.dim(i_idx).size());
+    }
+  }
+
+  // The two inner-most dimensions of the Batch Matmul are added.
+  *(a_matrix_shape->add_dim()) = m_dim;
+  *(a_matrix_shape->add_dim()) = k_dim;
+  *(b_matrix_shape->add_dim()) = k_dim;
+  *(b_matrix_shape->add_dim()) = n_dim;
+
+  OpContext batch_matmul_op_context = op_context;
+  batch_matmul_op_context.op_info = batch_matmul_op_info;
+  Costs costs = PredictCosts(batch_matmul_op_context);
+  costs.inaccurate = costs.inaccurate || found_unknown_shapes;
+
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictSparseTensorDenseMatMul(
     const OpContext& op_context) const {
   const auto& op_info = op_context.op_info;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 31acc29..07edb83 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -147,6 +147,7 @@
   Costs PredictAvgPoolGrad(const OpContext& op_context) const;
   Costs PredictFusedBatchNorm(const OpContext& op_context) const;
   Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
+  Costs PredictEinsum(const OpContext& op_context) const;
 
   // Generic cost prediction method for fused operations.
   Costs PredictFusedOp(const OpContext& op_context,
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index f6c4f2a..dfd4784 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -135,6 +135,23 @@
   return op_context;
 }
 
+// Returns an OpContext for an XlaEinsum op; empty dims vectors add no input.
+OpContext DescribeEinsum(const std::vector<int>& dims_a,
+                         const std::vector<int>& dims_b,
+                         const string& equation) {
+  OpContext op_context;
+  OpInfo& einsum_info = op_context.op_info;
+  SetCpuDevice(&einsum_info);
+  einsum_info.set_op("XlaEinsum");
+  (*einsum_info.mutable_attr())["equation"].set_s(equation);
+  // Inputs are appended in operand order; an empty vector means "no operand".
+  for (const std::vector<int>* dims : {&dims_a, &dims_b}) {
+    if (dims->empty()) continue;
+    DescribeArbitraryRankInput(*dims, DT_FLOAT, &einsum_info);
+  }
+  return op_context;
+}
+
 // Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
 // estimation purposes.
 void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
@@ -1375,5 +1392,132 @@
   EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
                                      cost.intermediate_memory_time);
 }
+
+TEST_F(OpLevelCostEstimatorTest, Einsum) {
+  {  // Test a simple matrix multiplication.
+    auto cost = PredictCosts(DescribeEinsum({100, 50}, {100, 50}, "ik,jk->ij"));
+    EXPECT_EQ(Costs::Duration(104000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(100 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(4000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test a simple batch matrix multiplication.
+    auto cost = PredictCosts(
+        DescribeEinsum({25, 100, 50}, {100, 50, 25}, "Bik,jkB->Bij"));
+    EXPECT_EQ(Costs::Duration(25 * 104000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(25 * 100 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(25 * 4000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test multiple batch dimensions.
+    auto cost = PredictCosts(DescribeEinsum(
+        {25, 16, 100, 50}, {16, 100, 50, 25}, "BNik,NjkB->BNij"));
+    EXPECT_EQ(Costs::Duration(16 * 25 * 104000), cost.execution_time);
+    EXPECT_EQ(
+        Costs::Duration(16 * 25 * 100 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+        cost.compute_time);
+    EXPECT_EQ(Costs::Duration(16 * 25 * 4000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test multiple M dimensions.
+    auto cost =
+        PredictCosts(DescribeEinsum({25, 100, 50}, {100, 50}, "Aik,jk->Aij"));
+    EXPECT_EQ(Costs::Duration(2552000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(25 * 100 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(52000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test multiple N dimensions.
+    auto cost =
+        PredictCosts(DescribeEinsum({100, 50}, {25, 100, 50}, "ik,Bjk->ijB"));
+    EXPECT_EQ(Costs::Duration(2552000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(25 * 100 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(52000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test multiple contracting dimensions.
+    auto cost = PredictCosts(
+        DescribeEinsum({100, 50, 25}, {100, 50, 25}, "ikl,jkl->ij"));
+    EXPECT_EQ(Costs::Duration(2600000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(100 * 50 * 25 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(100000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test a simple matrix transpose.
+    auto cost = PredictCosts(DescribeEinsum({100, 50}, {}, "ij->ji"));
+    EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test a malformed Einsum equation: Mismatch between shapes and equation.
+    auto cost =
+        PredictCosts(DescribeEinsum({100, 50, 25}, {50, 100}, "ik,kl->il"));
+    EXPECT_EQ(Costs::Duration(52000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(52000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+
+    cost = PredictCosts(DescribeEinsum({100, 50}, {50, 100, 25}, "ik,kl->il"));
+    EXPECT_EQ(Costs::Duration(52000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(52000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test an unsupported Einsum: ellipsis
+    auto cost = PredictCosts(DescribeEinsum(
+        {100, 50, 25, 16}, {50, 100, 32, 12}, "ik...,kl...->il..."));
+    EXPECT_EQ(Costs::Duration(1568000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(1568000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test a malformed/unsupported Einsum: repeated indices
+    auto cost =
+        PredictCosts(DescribeEinsum({100, 100, 50}, {50, 100}, "iik,kl->il"));
+    EXPECT_EQ(Costs::Duration(202000), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(202000), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {  // Test missing shapes.
+    auto cost = PredictCosts(DescribeEinsum({-1, 50}, {100, 50}, "ik,jk->ij"));
+    EXPECT_EQ(Costs::Duration(3020), cost.execution_time);
+    EXPECT_EQ(Costs::Duration(1 * 50 * 100 * 2 / (1000 * 10 * 1e-3)),
+              cost.compute_time);
+    EXPECT_EQ(Costs::Duration(2020), cost.memory_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 4e55f02..bc69e77 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -37,6 +37,32 @@
 
 namespace {
 
+using ::absl::StrCat;
+using ::tensorflow::strings::HumanReadableNumBytes;
+
+constexpr char kAttrInputSrc[] = "input_source_";
+constexpr char kAttrSrcDevice[] = "send_device";
+constexpr char kAttrDstDevice[] = "recv_device";
+constexpr char kAttrTensorName[] = "tensor_name";
+constexpr char kChannelDevice[] = "Channel";
+
+float Round2(const float x) {
+  // Rounds x to two decimal places. std::round from <cmath> is avoided because
+  // not all platforms seem to support it (specifically Android).
+  return ::round(100.0 * x) / 100.0;
+}
+
+Costs& FindOrCreateZero(const string& op_name,
+                        std::map<string, Costs>* op_cost) {
+  // Returns the entry for op_name. A missing entry is created from ZeroCosts()
+  // because a default-constructed Costs sets some memory fields to unknown.
+  const auto found = op_cost->find(op_name);
+  if (found != op_cost->end()) {
+    return found->second;
+  }
+  return op_cost->emplace(op_name, Costs::ZeroCosts()).first->second;
+}
+
 // Key to the cached _Recv ops map, and its hash and predicate structures.
 struct RecvNodeDescriptor {
   const NodeDef* node;
@@ -62,9 +88,9 @@
     return a.node == b.node && a.port_num == b.port_num && a.device == b.device;
   }
 };
+
 }  // namespace
 
-// ReadyNodeManager
 const NodeDef* LIFOManager::GetCurrNode() {
   CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
   if (curr_pos_ == nodes_.end()) {
@@ -86,40 +112,35 @@
   curr_pos_ = nodes_.end();  // Reset curr_pos_.
 }
 
-FirstReadyManager::FirstReadyManager() : ReadyNodeManager() {
+HeapReadyManager::HeapReadyManager() : ReadyNodeManager() {
   std::make_heap(nodes_.begin(), nodes_.end());
 }
 
-void FirstReadyManager::Init(
-    const std::unordered_map<const NodeDef*, NodeState>* node_state) {
-  // Reset the node state since different instances of the scheduler can reuse
+Status HeapReadyManager::Init(
+    const std::unordered_map<const NodeDef*, NodeState>* node_map) {
+  // Resets the node state since different instances of the scheduler can reuse
   // the same node_manager.
-  node_state_ = node_state;
+  node_map_ = node_map;
   nodes_.clear();
   waiting_queue_.clear();
-  greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool {
-    if (node_state_->at(a).time_ready == node_state_->at(b).time_ready) {
-      // Use Node name as tie-breaker for deterministic node scheduling.
-      return a->name().compare(b->name()) > 0;
-    } else {
-      // Note: we need a node with minimum time_ready, not
-      // maximum; hence, using a > b for comparison function.
-      return node_state_->at(a).time_ready > node_state_->at(b).time_ready;
-    }
-  };
+
+  // Sets up the comparator for the heap.
+  greater_ = Greater();
+
+  return Status::OK();
 }
 
-const NodeDef* FirstReadyManager::GetCurrNode() {
+const NodeDef* HeapReadyManager::GetCurrNode() {
   if (nodes_.empty()) {
-    // Nothing in the node_; probably, the very first call. Move
-    // waiting_queue_ to node_.
+    // Nothing in nodes_; probably, the very first call. Move waiting_queue_
+    // to nodes_.
     DrainWaitingQueue();
     CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
   }
   return nodes_.front();
 }
 
-void FirstReadyManager::RemoveCurrNode() {
+void HeapReadyManager::RemoveCurrNode() {
   if (nodes_.empty()) {
     // Make sure that there is a node to be removed at the front of nodes_.
     GetCurrNode();
@@ -129,11 +150,11 @@
   DrainWaitingQueue();
 }
 
-bool FirstReadyManager::Empty() const {
+bool HeapReadyManager::Empty() const {
   return nodes_.empty() && waiting_queue_.empty();
 }
 
-void FirstReadyManager::DrainWaitingQueue() {
+void HeapReadyManager::DrainWaitingQueue() {
   for (const auto* node : waiting_queue_) {
     // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that
     // the first element is the node with minimum time_ready.
@@ -143,15 +164,54 @@
   waiting_queue_.clear();
 }
 
+std::function<bool(const NodeDef*, const NodeDef*)>
+FirstReadyManager::Greater() {
+  // Comparator for the ready-node heap: we need the node with the minimum
+  // time_ready (not the maximum), hence the a > b ordering below.
+  return [this](const NodeDef* a, const NodeDef* b) -> bool {
+    const auto& ready_a = node_map_->at(a).time_ready;
+    const auto& ready_b = node_map_->at(b).time_ready;
+    if (ready_a == ready_b) {
+      // Same time_ready: the node name breaks the tie deterministically.
+      return a->name().compare(b->name()) > 0;
+    }
+    return ready_a > ready_b;
+  };
+}
+
+std::function<bool(const NodeDef*, const NodeDef*)>
+PriorityReadyManager::Greater() {
+  // Compares by the priority number stored under each node's name.
+  return [this](const NodeDef* a, const NodeDef* b) -> bool {
+    return node_priority_.at(a->name()) > node_priority_.at(b->name());
+  };
+}
+
+Status PriorityReadyManager::SetPriority(
+    const std::unordered_map<string, int>& node_priority) {
+  // Validates the incoming map (not the currently stored node_priority_):
+  // every node must be assigned a distinct priority value.
+  std::unordered_set<int> priorities;
+  for (const auto& it : node_priority) {
+    if (!priorities.insert(it.second).second) {
+      return errors::InvalidArgument("Non-unique priority found");
+    }
+  }
+  // Replaces the stored priorities only after validation has succeeded.
+  node_priority_ = node_priority;
+  return Status::OK();
+}
+
 CompositeNodeManager::CompositeNodeManager()
     : ReadyNodeManager(), send_manager_(), recv_manager_() {}
 
-void CompositeNodeManager::Init(
-    const std::unordered_map<const NodeDef*, NodeState>* node_state) {
-  node_state_ = node_state;
-  send_manager_.Init(node_state);
-  recv_manager_.Init(node_state);
+Status CompositeNodeManager::Init(
+    const std::unordered_map<const NodeDef*, NodeState>* node_map) {
+  node_map_ = node_map;
+  TF_RETURN_IF_ERROR(send_manager_.Init(node_map));
+  TF_RETURN_IF_ERROR(recv_manager_.Init(node_map));
   curr_node_ = nullptr;
+  return Status::OK();
 }
 
 void CompositeNodeManager::AddNode(const NodeDef* node) {
@@ -160,7 +220,7 @@
   } else if (IsRecv(*node)) {
     recv_manager_.AddNode(node);
   } else {
-    const auto& device = node_state_->at(node).device_name;
+    const auto& device = node_map_->at(node).device_name;
     ops_lifo_map_[device].AddNode(node);
   }
 }
@@ -177,16 +237,16 @@
   for (auto& ops_lifo : ops_lifo_map_) {
     if (!ops_lifo.second.Empty()) {
       const auto* op = ops_lifo.second.GetCurrNode();
-      candidates.emplace_back(op, node_state_->at(op).time_ready);
+      candidates.emplace_back(op, node_map_->at(op).time_ready);
     }
   }
   if (!send_manager_.Empty()) {
     const auto* send = send_manager_.GetCurrNode();
-    candidates.emplace_back(send, node_state_->at(send).time_ready);
+    candidates.emplace_back(send, node_map_->at(send).time_ready);
   }
   if (!recv_manager_.Empty()) {
     const auto* recv = recv_manager_.GetCurrNode();
-    candidates.emplace_back(recv, node_state_->at(recv).time_ready);
+    candidates.emplace_back(recv, node_map_->at(recv).time_ready);
   }
   CHECK(!candidates.empty());
   auto first_ready = std::min_element(
@@ -225,7 +285,7 @@
   } else if (IsRecv(*node)) {
     recv_manager_.RemoveCurrNode();
   } else {
-    const auto device = node_state_->at(node).device_name;
+    const auto device = node_map_->at(node).device_name;
     ops_lifo_map_[device].RemoveCurrNode();
   }
   // Reset curr_node_ so that GetCurrNode() finds another node.
@@ -274,9 +334,6 @@
 }
 
 Status VirtualScheduler::Init(const GrapplerItem* item) {
-  grappler_item_ = item;
-  graph_properties_ = absl::make_unique<GraphProperties>(*item);
-
   initialized_ = false;
 
   // Clear all internal states so that the VirtualScheduler is reusable for
@@ -296,9 +353,10 @@
   // necessary information for emulating tensorflow op scheduling and
   // construct internal data structures (NodeState and DeviceState) for virtual
   // scheduling.
-  ready_nodes_->Init(GetNodeStates());
+  TF_RETURN_IF_ERROR(ready_nodes_->Init(GetNodeStates()));
 
-  // Construct graph properties.
+  // Constructs graph properties and performs shape inference.
+  graph_properties_ = absl::make_unique<GraphProperties>(*item);
   if (use_static_shapes_) {
     TF_RETURN_IF_ERROR(graph_properties_->InferStatically(
         true, use_aggressive_shape_inference_));
@@ -306,6 +364,7 @@
     TF_RETURN_IF_ERROR(graph_properties_->InferDynamically(cluster_));
   }
 
+  grappler_item_ = item;
   const auto& graph = grappler_item_->graph;
   const auto& fetch_nodes = grappler_item_->fetch;
   std::set<string> feed_nodes;
@@ -439,7 +498,7 @@
 
     feed_nodes.erase(curr_node->name());
 
-    if (IsPersistentNode(curr_node)) {
+    if (IsPersistent(*curr_node)) {
       auto& device_state = device_[curr_node_device];
       for (int port_num = 0;
            port_num < curr_node_state.output_properties.size(); ++port_num) {
@@ -514,17 +573,6 @@
   }
 }
 
-float VirtualScheduler::Round2(const float x) const {
-  // Not using std::round from <cmath> here because not all platforms seem to
-  // support that (specifically Android).
-  return ::round(100.0 * x) / 100.0;
-}
-
-bool VirtualScheduler::IsPersistentNode(const NodeDef* node) const {
-  // Variables are persistent nodes.
-  return IsVariable(*node);
-}
-
 string VirtualScheduler::DeviceName(const NodeDef* node) const {
   return placer_->get_canonical_device_name(*node);
 }
@@ -539,8 +587,8 @@
 string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
                                            const NodeDef* to) const {
   CHECK(!initialized_) << "ChannelDeviceName is called after Init().";
-  return kChannelDevice + "_from_" + SanitizedDeviceName(from) + "_to_" +
-         SanitizedDeviceName(to);
+  return StrCat(kChannelDevice, "_from_", SanitizedDeviceName(from), "_to_",
+                SanitizedDeviceName(to));
 }
 
 std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
@@ -562,9 +610,9 @@
   auto input_node_port_num = NodePosition(input_name);
   string src_name;
   if (input_node_port_num >= 0) {
-    src_name = strings::StrCat(from->name(), "_", input_node_port_num);
+    src_name = StrCat(from->name(), "_", input_node_port_num);
   } else {
-    src_name = strings::StrCat(from->name(), "_minus1");
+    src_name = StrCat(from->name(), "_minus1");
   }
 
   // _Send op.
@@ -694,17 +742,6 @@
   return it->second;
 }
 
-Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
-                                          std::map<string, Costs>* op_cost) {
-  auto it = op_cost->find(op_name);
-  if (it == op_cost->end()) {
-    // Note that default constructor of Costs sets some memory related fields
-    // to unknown values so we should explicitly initialize it with ZeroCosts.
-    it = op_cost->emplace(op_name, Costs::ZeroCosts()).first;
-  }
-  return it->second;
-}
-
 void VirtualScheduler::AddOutputNodesToReadyQueue(
     const NodeDef* node, const Costs::Duration& curr_time) {
   // Checks whether the Switch's output slots change over iterations.
@@ -769,7 +806,7 @@
     string node_description = GetOpDescription(op_context.op_info);
     op_counts_[node_description] += 1;
     op_costs_[node_description] =
-        std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+        std::make_pair(total_node_costs.execution_time.asMicroSeconds().count(),
                        !node_costs.inaccurate);
   }
 
@@ -788,7 +825,7 @@
   node_state.time_finished = curr_time;
 
   // Update device memory usage.
-  if (!IsPersistentNode(node)) {
+  if (!IsPersistent(*node)) {
     for (const auto& port_num_output_pair : node_state.outputs) {
       int port_num = port_num_output_pair.first;
       // There's a chance that a specific output is not used at all.
@@ -825,7 +862,7 @@
     input_state.num_outputs_executed[port]++;
     if (input_state.num_outputs_executed[port] ==
             input_state.outputs[port].size() &&
-        !IsPersistentNode(input)) {
+        !IsPersistent(*input)) {
       // All the outputs are executed; no reference to this output port of
       // input node.
       input_state.time_no_references[port] = curr_time;
@@ -838,7 +875,7 @@
     }
   }
 
-  if (!IsPersistentNode(node)) {
+  if (!IsPersistent(*node)) {
     // Now that output memory is added and used up nodes are deallocated,
     // check max memory usage.
     if (device.memory_usage > device.max_memory_usage) {
@@ -925,13 +962,10 @@
     VLOG(1) << "Device = " << name
             << ", num_nodes = " << state.nodes_executed.size()
             << ", wall_time_ns = " << wall_time_ns.count() << ", memory usage: "
-            << "persistent = "
-            << strings::HumanReadableNumBytes(persistent_memory_usage)
-            << ", peak = "
-            << strings::HumanReadableNumBytes(state.max_memory_usage)
-            << ", total = " << strings::HumanReadableNumBytes(max_memory_usage)
-            << ", at the end: "
-            << strings::HumanReadableNumBytes(state.memory_usage);
+            << "persistent = " << HumanReadableNumBytes(persistent_memory_usage)
+            << ", peak = " << HumanReadableNumBytes(state.max_memory_usage)
+            << ", total = " << HumanReadableNumBytes(max_memory_usage)
+            << ", at the end: " << HumanReadableNumBytes(state.memory_usage);
 
     // Overall statement about accuracy
     VLOG(1) << state.device_costs.num_ops_total
@@ -983,7 +1017,7 @@
                        static_cast<int64>(compute_cost),
                        static_cast<int64>(memory_cost),
                        static_cast<int64>(intermediate_memory_cost))
-                << " (" << strings::HumanReadableNumBytes(op_mem_usage) << " ["
+                << " (" << HumanReadableNumBytes(op_mem_usage) << " ["
                 << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
@@ -1068,7 +1102,7 @@
       // VirtualScheduler does not specify scratch pad memory usage.
       mem_stats->set_temp_memory_size(0);
       int64 persistent_memory_size = 0;
-      if (IsPersistentNode(node_def)) {
+      if (IsPersistent(*node_def)) {
         persistent_memory_size = total_output_size;
       }
       mem_stats->set_persistent_memory_size(persistent_memory_size);
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 9a67fa9..821353f 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -132,8 +132,10 @@
  public:
   ReadyNodeManager() {}
   virtual ~ReadyNodeManager() {}
-  virtual void Init(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state) {}
+  virtual Status Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_map) {
+    return Status::OK();
+  }
   virtual void AddNode(const NodeDef* node) = 0;
   virtual const NodeDef* GetCurrNode() = 0;
   virtual void RemoveCurrNode() = 0;
@@ -144,8 +146,6 @@
  public:
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
-  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
-      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override {
     CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
@@ -166,8 +166,6 @@
  public:
   LIFOManager() : ReadyNodeManager() {}
   ~LIFOManager() override {}
-  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
-      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override;
   void RemoveCurrNode() override;
@@ -182,21 +180,22 @@
   std::list<const NodeDef*>::iterator curr_pos_ = nodes_.end();
 };
 
-// FirstReadyManager picks a node with the minimum time_ready value.
-// Behavior is unknown if there are more than one nodes with the minimum
-// time_ready value (it depends on C++ STL push_heap and pop_heap).
-class FirstReadyManager : public ReadyNodeManager {
+// Abstract class that maintains a heap/priority queue for scheduling ready
+// nodes. Derived class needs to implement the Greater() function which returns
+// the comparator for the heap.
+class HeapReadyManager : public ReadyNodeManager {
  public:
-  FirstReadyManager();
-  void Init(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state) override;
-  ~FirstReadyManager() override {}
+  HeapReadyManager();
+  Status Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_map) override;
+  ~HeapReadyManager() override {}
   void AddNode(const NodeDef* node) override { waiting_queue_.push_back(node); }
   const NodeDef* GetCurrNode() override;
   void RemoveCurrNode() override;
   bool Empty() const override;
 
- private:
+ protected:
+  virtual std::function<bool(const NodeDef*, const NodeDef*)> Greater() = 0;
   // Move all the nodes in the waiting_queue_ to nodes_.
   void DrainWaitingQueue();
 
@@ -213,7 +212,38 @@
 
   // NodeState structure from VirtualScheduler to get time_ready of ready nodes.
   // Not owned by FirstReadyManager.
-  const std::unordered_map<const NodeDef*, NodeState>* node_state_;
+  const std::unordered_map<const NodeDef*, NodeState>* node_map_;
+};
+
+// FirstReadyManager picks a node with the minimum time_ready value.
+// When more than one node has the minimum time_ready value, the (unique) node
+// name is used as the tie-breaker, so the behavior is deterministic.
+class FirstReadyManager : public HeapReadyManager {
+ public:
+  FirstReadyManager() : HeapReadyManager() {}
+  ~FirstReadyManager() override {}
+
+ protected:
+  std::function<bool(const NodeDef*, const NodeDef*)> Greater() override;
+};
+
+// PriorityReadyManager uses the given node priorities when picking up next node
+// from all the ready nodes.
+class PriorityReadyManager : public HeapReadyManager {
+ public:
+  PriorityReadyManager() : HeapReadyManager() {}
+  ~PriorityReadyManager() override {}
+
+  // Note this should be called after Init().
+  Status SetPriority(const std::unordered_map<string, int>& node_priority);
+
+ protected:
+  std::function<bool(const NodeDef*, const NodeDef*)> Greater() override;
+
+ private:
+  // A map from unique node name to unique priority. Lower number means higher
+  // priority.
+  std::unordered_map<string, int> node_priority_;
 };
 
 // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
@@ -227,8 +257,8 @@
   CompositeNodeManager();
   ~CompositeNodeManager() override {}
 
-  void Init(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state) override;
+  Status Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_map) override;
   void AddNode(const NodeDef* node) override;
   const NodeDef* GetCurrNode() override;
   void RemoveCurrNode() override;
@@ -245,8 +275,8 @@
   FirstReadyManager recv_manager_;
 
   // NodeState structure from VirtualScheduler to get time_ready of ready nodes.
-  // Not owned by FirstReadyManager.
-  const std::unordered_map<const NodeDef*, NodeState>* node_state_;
+  // Not owned by CompositeNodeManager.
+  const std::unordered_map<const NodeDef*, NodeState>* node_map_;
 
   // Cached curr node. Set back to nullptr from RemoveCurrNode().
   const NodeDef* curr_node_;
@@ -303,13 +333,6 @@
   void enable_mem_usage_tracking() { track_mem_usage_snapshot_ = true; }
 
  private:
-  // Constants.
-  const string kAttrInputSrc = "input_source_";
-  const string kAttrSrcDevice = "send_device";
-  const string kAttrDstDevice = "recv_device";
-  const string kAttrTensorName = "tensor_name";
-  const string kChannelDevice = "Channel";
-
   // Methods called from Init(). Fails if initialize_ is set.
   void MaybeUpdateInputOutput(const NodeDef* node);
   NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
@@ -321,10 +344,6 @@
   string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
 
   // Helper methods.
-  Costs& FindOrCreateZero(const string& op_name,
-                          std::map<string, Costs>* op_cost);
-  float Round2(const float x) const;
-  bool IsPersistentNode(const NodeDef* node) const;
   void AddOutputNodesToReadyQueue(const NodeDef* node,
                                   const Costs::Duration& curr_time);
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 97b6586..3e867e3 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -14,8 +14,8 @@
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
@@ -26,6 +26,599 @@
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+// Device names:
+constexpr char kCPU0[] = "/job:localhost/replica:0/task:0/cpu:0";
+constexpr char kCPU1[] = "/job:localhost/replica:0/task:0/cpu:1";
+constexpr char kChannelFrom0To1[] = "Channel from CPU0 to CPU1";
+constexpr char kChannelFrom1To0[] = "Channel from CPU1 to CPU0";
+// Op names:
+constexpr char kConv2D[] = "Conv2D";
+constexpr char kSend[] = "_Send";
+constexpr char kRecv[] = "_Recv";
+
+class ReadyNodeManagerTest : public ::testing::Test {
+ protected:
+  ReadyNodeManagerTest() {
+    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
+    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
+    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
+    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
+    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
+    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
+    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
+  }
+
+  void NodeSetUp(const string& name, const string& op_name,
+                 const string& device_name, const uint64 time_ready,
+                 NodeDef* node) {
+    node->set_name(name);
+    node->set_op(op_name);
+    node->set_device(device_name);
+
+    node_states_[node] = NodeState();
+    node_states_[node].time_ready = time_ready;
+    node_states_[node].device_name = device_name;
+  }
+
+  NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
+  std::unordered_map<const NodeDef*, NodeState> node_states_;
+};
+
+// Tests that FIFOManager correctly returns the current node with only 1 node.
+TEST_F(ReadyNodeManagerTest, GetSingleNodeFIFOManager) {
+  FIFOManager manager = FIFOManager();
+  manager.AddNode(&node1_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+}
+
+// Tests that FIFOManager removes the only node contained within.
+TEST_F(ReadyNodeManagerTest, RemoveSingleNodeFIFOManager) {
+  FIFOManager manager = FIFOManager();
+  manager.AddNode(&node1_);
+
+  // Removes the only node in FIFOManager.
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Tests that FIFOManager can remove multiple nodes and returns the current node
+// in the right order.
+TEST_F(ReadyNodeManagerTest, GetAndRemoveMultipleFIFOManager) {
+  FIFOManager manager = FIFOManager();
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keeps checking current node while removing nodes from manager.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Tests that FIFOManager can remove multiple nodes and add more nodes, still
+// returning the current node in the right order.
+TEST_F(ReadyNodeManagerTest, AddAndRemoveMultipleFIFOManager) {
+  FIFOManager manager = FIFOManager();
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keeps checking current node as nodes are removed and added.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.AddNode(&node5_);
+  // GetCurrNode() should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.AddNode(&node6_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Tests that LIFOManager correctly returns the current node with only 1 node.
+TEST_F(ReadyNodeManagerTest, GetSingleNodeLIFOManager) {
+  LIFOManager manager = LIFOManager();
+  manager.AddNode(&node1_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+}
+
+// Tests that LIFOManager removes the only node contained within.
+TEST_F(ReadyNodeManagerTest, RemoveSingleNodeLIFOManager) {
+  LIFOManager manager = LIFOManager();
+  manager.AddNode(&node1_);
+
+  // Removes the only node in LIFOManager.
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Tests that LIFOManager can remove multiple nodes and returns the current node
+// in the right order.
+TEST_F(ReadyNodeManagerTest, GetAndRemoveMultipleLIFOManager) {
+  LIFOManager manager = LIFOManager();
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keeps checking current node while removing nodes from manager.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Tests that LIFOManager can remove multiple nodes (must be removing the
+// current node) and add more nodes, still returning the current node in the
+// right order.
+TEST_F(ReadyNodeManagerTest, AddAndRemoveMultipleLIFOManager) {
+  LIFOManager manager = LIFOManager();
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keeps checking current node as nodes are removed and added.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.AddNode(&node6_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, GetSingleNodeFirstReadyManager) {
+  FirstReadyManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  manager.AddNode(&node1_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+}
+
+TEST_F(ReadyNodeManagerTest, RemoveSingleNodeFirstReadyManager) {
+  FirstReadyManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, GetAndRemoveMultipleFirstReadyManager) {
+  FirstReadyManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  // Insert nodes in some random order.
+  manager.AddNode(&node2_);
+  manager.AddNode(&node1_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node6_);
+
+  // In whatever order we insert nodes, we get the same order based on nodes'
+  // time_ready.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, GetCurrNodeFirstReadyManager) {
+  FirstReadyManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+
+  // Inserts nodes in some random order.
+  manager.AddNode(&node2_);
+  manager.AddNode(&node1_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node6_);
+
+  // Among these nodes, node6 has the smallest time_ready, hence, GetCurrNode()
+  // should return it.
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+
+  // Now inserts a few other nodes, but their time_ready's are even smaller than
+  // that of Node6. Before calling RemoveCurrNode(), GetCurrNode() should return
+  // the same node, Node6, in this case.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeSetUp("Node7", kConv2D, kCPU0, 5, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU0, 4, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU0, 3, &node9);
+
+  manager.AddNode(&node7);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+
+  manager.AddNode(&node8);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+
+  manager.RemoveCurrNode();
+  // Now Node6 is removed, and GetCurrNode() will return Node8.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+
+  // Again, AddNode shouldn't change GetCurrNode().
+  manager.AddNode(&node9);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node9");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node7");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, DeterminismInFirstReadyManager) {
+  FirstReadyManager manager1;
+  TF_EXPECT_OK(manager1.Init(&node_states_));
+  FirstReadyManager manager2;
+  TF_EXPECT_OK(manager2.Init(&node_states_));
+
+  // 6 nodes with same time_ready.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeDef node10;
+  NodeDef node11;
+  NodeDef node12;
+  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU0, 1000, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU0, 1000, &node9);
+  NodeSetUp("Node10", kConv2D, kCPU0, 1000, &node10);
+  NodeSetUp("Node11", kConv2D, kCPU0, 1000, &node11);
+  NodeSetUp("Node12", kConv2D, kCPU0, 1000, &node12);
+
+  // Adds the above 6 nodes to manager1.
+  manager1.AddNode(&node7);
+  manager1.AddNode(&node8);
+  manager1.AddNode(&node9);
+  manager1.AddNode(&node10);
+  manager1.AddNode(&node11);
+  manager1.AddNode(&node12);
+
+  // Adds the above 6 nodes to manager2, but in a different order.
+  manager2.AddNode(&node8);
+  manager2.AddNode(&node11);
+  manager2.AddNode(&node9);
+  manager2.AddNode(&node10);
+  manager2.AddNode(&node7);
+  manager2.AddNode(&node12);
+
+  // Expects both managers return the same nodes for deterministic node
+  // scheduling.
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_TRUE(manager1.Empty());
+  EXPECT_TRUE(manager2.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, GetAndRemoveMultiplePriorityReadyManager) {
+  PriorityReadyManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+
+  // Sets up node priorities.
+  std::unordered_map<string, int> node_priority = {{"Node1", 1}, {"Node2", 2},
+                                                   {"Node3", 3}, {"Node4", 4},
+                                                   {"Node5", 5}, {"Node6", 6}};
+  TF_EXPECT_OK(manager.SetPriority(node_priority));
+
+  // Inserts nodes in some random order.
+  manager.AddNode(&node2_);
+  manager.AddNode(&node1_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node6_);
+
+  // Expects nodes scheduled based on priority.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, RemoveSingleNodeCompositeNodeManager) {
+  CompositeNodeManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, GetAndRemoveMultipleComopsiteNodeManager) {
+  CompositeNodeManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keeps checking current node as nodes are removed and added.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.AddNode(&node6_);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, MultiDeviceSendRecvComopsiteNodeManager) {
+  CompositeNodeManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  // Additional nodes on kCPU1.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeSetUp("Node7", kConv2D, kCPU1, 1001, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU1, 2001, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU1, 3001, &node9);
+
+  // Send and Recv nodes.
+  NodeDef send1;
+  NodeDef send2;
+  NodeDef recv1;
+  NodeDef recv2;
+  NodeSetUp("Send1", kSend, kChannelFrom0To1, 2002, &send1);
+  NodeSetUp("Send2", kSend, kChannelFrom1To0, 2005, &send2);
+  NodeSetUp("Recv1", kRecv, kCPU0, 2003, &recv1);
+  NodeSetUp("Recv2", kRecv, kCPU1, 2004, &recv2);
+
+  // Inserts nodes.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node6_);
+  manager.AddNode(&node7);
+  manager.AddNode(&node8);
+  manager.AddNode(&node9);
+  manager.AddNode(&send1);
+  manager.AddNode(&send2);
+  manager.AddNode(&recv1);
+  manager.AddNode(&recv2);
+
+  // On kCPU0, the last one is node6_; on kCPU1, the last one is node9;
+  // so choose one that has earliest time_ready among node6_, node9,
+  // Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node6");
+  manager.RemoveCurrNode();
+  // Then, the next one on kCPU0 is node5_; choose the earliest time_ready node
+  // among node5_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node5");
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Send1");
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send2, Recv1, and Recv2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Recv1");
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send2, and Recv2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Recv2");
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, and Send2.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Send2");
+  manager.RemoveCurrNode();
+  // Next, choose between node4_, node9.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node4");
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node9.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node9");
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node8.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node7.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node7");
+  manager.RemoveCurrNode();
+  // Then, just the nodes on kCPU0 -- LIFO.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node3");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node2");
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node1");
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(ReadyNodeManagerTest, DeterminismInCompositeNodeManager) {
+  CompositeNodeManager manager;
+  TF_EXPECT_OK(manager.Init(&node_states_));
+  CompositeNodeManager manager2;
+  TF_EXPECT_OK(manager2.Init(&node_states_));
+
+  // 6 nodes with mixed ops and devices; time_ready is either 999 or 1000.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeDef node10;
+  NodeDef node11;
+  NodeDef node12;
+  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
+  NodeSetUp("Node8", kSend, kCPU0, 1000, &node8);
+  NodeSetUp("Node9", kRecv, kCPU0, 1000, &node9);
+  NodeSetUp("Node10", kConv2D, kCPU0, 999, &node10);
+  NodeSetUp("Node11", kRecv, kCPU0, 999, &node11);
+  NodeSetUp("Node12", kConv2D, kCPU1, 1000, &node12);
+
+  // Adds Nodes 7 to 9 to manager.
+  manager.AddNode(&node7);
+  manager.AddNode(&node8);
+  manager.AddNode(&node9);
+
+  // Expects _Send, _Recv, and then the other op, in that order, when the
+  // candidate nodes have the same time_ready.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kSend);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node9");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kRecv);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node7");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kConv2D);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Adds Nodes 7 to 9 to manager, but in a different order.
+  manager.AddNode(&node9);
+  manager.AddNode(&node8);
+  manager.AddNode(&node7);
+
+  // Expects same order (_Send, _Recv, and the other op), regardless of Add
+  // order.
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kSend);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node9");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kRecv);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node7");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kConv2D);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Conv2D's time_ready < Send's time_ready; Expects Conv2D first.
+  manager.AddNode(&node8);
+  manager.AddNode(&node10);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node10");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kConv2D);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kSend);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Recv's time_ready < Send's time_ready; Expects Recv first.
+  manager.AddNode(&node11);
+  manager.AddNode(&node8);
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node11");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kRecv);
+  manager.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), "Node8");
+  EXPECT_EQ(manager.GetCurrNode()->op(), kSend);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Node7 and 12 are normal ops with the same time_ready, placed on different
+  // devices. These two nodes are added to manager and manager2, but in
+  // different orders; Expects GetCurrNode() returns the nodes in the same
+  // order.
+  manager.AddNode(&node7);
+  manager.AddNode(&node12);
+
+  manager2.AddNode(&node12);
+  manager2.AddNode(&node7);
+
+  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
 
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
@@ -53,14 +646,6 @@
 class VirtualSchedulerTest : public ::testing::Test {
  protected:
   VirtualSchedulerTest() {
-    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
-    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
-    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
-    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
-    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
-    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
-    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
-
     // Initializes cluster_ and scheduler_.
     std::unordered_map<string, DeviceProperties> devices;
 
@@ -77,19 +662,6 @@
         /*use_aggressive_shape_inference=*/true, cluster_.get());
   }
 
-  NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
-  std::unordered_map<const NodeDef*, NodeState> node_states_;
-
-  // Device names:
-  const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0";
-  const string kCPU1 = "/job:localhost/replica:0/task:0/cpu:1";
-  const string kChannelFrom0To1 = "Channel from CPU0 to CPU1";
-  const string kChannelFrom1To0 = "Channel from CPU1 to CPU0";
-  // Op names:
-  const string kSend = "_Send";
-  const string kRecv = "_Recv";
-  const string kConv2D = "Conv2D";
-
   DeviceProperties GetDummyCPUDevice() {
     // Create CPU with 2 cores, 4 Ghz freq, 2 GB/s mem bandwidth.
     // - 8 Gflops
@@ -102,18 +674,6 @@
     return cpu_device;
   }
 
-  void NodeSetUp(const string& name, const string& op_name,
-                 const string& device_name, const uint64 time_ready,
-                 NodeDef* node) {
-    node->set_name(name);
-    node->set_op(op_name);
-    node->set_device(device_name);
-
-    node_states_[node] = NodeState();
-    node_states_[node].time_ready = time_ready;
-    node_states_[node].device_name = device_name;
-  }
-
   // Three Conv2Ds with only two in fetch nodes.
   void CreateGrapplerItemWithConv2Ds() {
     Scope s = Scope::NewRootScope().WithDevice(kCPU0);
@@ -1673,7 +2233,7 @@
     EXPECT_EQ(expected.size(), test_elements.size());
   }
 
-  // Helper method tthat checks name - port pairs.
+  // Helper method that checks name - port pairs.
   void ValidateMemoryUsageSnapshot(
       const std::vector<string>& expected_names, const int port_num_expected,
       const std::unordered_set<std::pair<const NodeDef*, int>,
@@ -1724,559 +2284,6 @@
   const int depth_out_ = 16;
 };
 
-// Test that FIFOManager correctly returns the current node with only 1 node.
-TEST_F(VirtualSchedulerTest, GetSingleNodeFIFOManager) {
-  // Init.
-  FIFOManager manager = FIFOManager();
-
-  // Add the node to FIFOManager.
-  manager.AddNode(&node1_);
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-}
-
-// Test that FIFOManager removes the only node contained within.
-TEST_F(VirtualSchedulerTest, RemoveSingleNodeFIFOManager) {
-  // Init.
-  FIFOManager manager = FIFOManager();
-
-  // Add the node to FIFOManager.
-  manager.AddNode(&node1_);
-
-  // Remove the only node in FIFOManager.
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-// Test that FIFOManager can remove multiple nodes and returns the current node
-// in the right order
-TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleFIFOManager) {
-  // Init.
-  FIFOManager manager = FIFOManager();
-
-  // Add the nodes to FIFOManager.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-
-  // Keep checking current node while removing nodes from manager.
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-// Test that FIFOManager can remove multiple nodes and add more nodes, still
-// returning the current node in the right order
-TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleFIFOManager) {
-  // Init.
-  FIFOManager manager = FIFOManager();
-
-  // Add the nodes to FIFOManager.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-
-  // Keep checking current node as nodes are removed and added.
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.AddNode(&node5_);
-  // GetCurrNode()  should return the same node even if some nodes are added,
-  // until RemoveCurrNode() is called.
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.AddNode(&node6_);
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-// Test that LIFOManager correctly returns the current node with only 1 node.
-TEST_F(VirtualSchedulerTest, GetSingleNodeLIFOManager) {
-  // Init.
-  LIFOManager manager = LIFOManager();
-
-  // Add the node to LIFOManager.
-  manager.AddNode(&node1_);
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-}
-
-// Test that LIFOManager removes the only node contained within.
-TEST_F(VirtualSchedulerTest, RemoveSingleNodeLIFOManager) {
-  // Init.
-  LIFOManager manager = LIFOManager();
-
-  // Add the node to LIFOManager.
-  manager.AddNode(&node1_);
-
-  // Remove the only node in LIFOManager.
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-// Test that LIFOManager can remove multiple nodes and returns the current node
-// in the right order
-TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleLIFOManager) {
-  // Init.
-  LIFOManager manager = LIFOManager();
-
-  // Add the nodes to LIFOManager.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-
-  // Keep checking current node while removing nodes from manager.
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-// Test that LIFOManager can remove multiple nodes (must be removing the current
-// node) and add more nodes, still returning the current node in the right order
-TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleLIFOManager) {
-  // Init.
-  LIFOManager manager = LIFOManager();
-
-  // Add the nodes to LIFOManager.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-
-  // Keep checking current node as nodes are removed and added.
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.AddNode(&node5_);
-  // GetCurrNode()  should return the same node even if some nodes are added,
-  // until RemoveCurrNode() is called.
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.AddNode(&node6_);
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, GetSingleNodeFirstReadyManager) {
-  FirstReadyManager manager;
-  manager.Init(&node_states_);
-
-  manager.AddNode(&node1_);
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-}
-
-TEST_F(VirtualSchedulerTest, RemoveSingleNodeFirstReadyManager) {
-  FirstReadyManager manager;
-  manager.Init(&node_states_);
-  manager.AddNode(&node1_);
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleFirstReadyManager) {
-  FirstReadyManager manager;
-  manager.Init(&node_states_);
-  // Insert nodes in some random order.
-  manager.AddNode(&node2_);
-  manager.AddNode(&node1_);
-  manager.AddNode(&node4_);
-  manager.AddNode(&node5_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node6_);
-
-  // In whatever order we insert nodes, we get the same order based on nodes'
-  // time_ready.
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
-  FirstReadyManager manager;
-  manager.Init(&node_states_);
-  // Insert nodes in some random order.
-  manager.AddNode(&node2_);
-  manager.AddNode(&node1_);
-  manager.AddNode(&node4_);
-  manager.AddNode(&node5_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node6_);
-
-  // Among these nodes, node6 has the smallest time_ready, hence, GetCurrNode()
-  // should return it.
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  // Now insret a few other nodes, but their time_ready's are even smaller than
-  // that of Node6. Before calling RemoveCurrNode(), GetCurrNode() should return
-  // the same node, Node6, in this case.
-
-  NodeDef node7;
-  NodeDef node8;
-  NodeDef node9;
-  NodeSetUp("Node7", kConv2D, kCPU0, 5, &node7);
-  NodeSetUp("Node8", kConv2D, kCPU0, 4, &node8);
-  NodeSetUp("Node9", kConv2D, kCPU0, 3, &node9);
-
-  manager.AddNode(&node7);
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-
-  manager.AddNode(&node8);
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-
-  manager.RemoveCurrNode();
-  // Now Node6 is removed, and GetCurrNode() will return Node8.
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-
-  // Again, AddNode shouldn't change GetCurrNode().
-  manager.AddNode(&node9);
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, DeterminismInFirstReadyManager) {
-  FirstReadyManager manager1;
-  manager1.Init(&node_states_);
-  FirstReadyManager manager2;
-  manager2.Init(&node_states_);
-
-  // 6 nodes with same time_ready.
-  NodeDef node7;
-  NodeDef node8;
-  NodeDef node9;
-  NodeDef node10;
-  NodeDef node11;
-  NodeDef node12;
-  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
-  NodeSetUp("Node8", kConv2D, kCPU0, 1000, &node8);
-  NodeSetUp("Node9", kConv2D, kCPU0, 1000, &node9);
-  NodeSetUp("Node10", kConv2D, kCPU0, 1000, &node10);
-  NodeSetUp("Node11", kConv2D, kCPU0, 1000, &node11);
-  NodeSetUp("Node12", kConv2D, kCPU0, 1000, &node12);
-
-  // Add the above 6 nodes to manager1.
-  manager1.AddNode(&node7);
-  manager1.AddNode(&node8);
-  manager1.AddNode(&node9);
-  manager1.AddNode(&node10);
-  manager1.AddNode(&node11);
-  manager1.AddNode(&node12);
-
-  // Add the above 6 nodes to manager2, but in a different order.
-  manager2.AddNode(&node8);
-  manager2.AddNode(&node11);
-  manager2.AddNode(&node9);
-  manager2.AddNode(&node10);
-  manager2.AddNode(&node7);
-  manager2.AddNode(&node12);
-
-  // Expect both managers return the same nodes for deterministic node
-  // scheduling.
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager1.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-
-  EXPECT_TRUE(manager1.Empty());
-  EXPECT_TRUE(manager2.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, RemoveSingleNodeCompositeNodeManager) {
-  CompositeNodeManager manager;
-  manager.Init(&node_states_);
-  manager.AddNode(&node1_);
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, RemoveSingleNodeComopsiteNodeManager) {
-  CompositeNodeManager manager;
-  manager.Init(&node_states_);
-
-  manager.AddNode(&node1_);
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleComopsiteNodeManager) {
-  CompositeNodeManager manager;
-  manager.Init(&node_states_);
-
-  // Add the nodes to LIFOManager.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-
-  // Keep checking current node as nodes are removed and added.
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.AddNode(&node5_);
-  // GetCurrNode()  should return the same node even if some nodes are added,
-  // until RemoveCurrNode() is called.
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.AddNode(&node6_);
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, MultiDeviceSendRecvComopsiteNodeManager) {
-  CompositeNodeManager manager;
-  manager.Init(&node_states_);
-  // Additional nodes on kCPU1
-  NodeDef node7;
-  NodeDef node8;
-  NodeDef node9;
-  NodeSetUp("Node7", kConv2D, kCPU1, 1001, &node7);
-  NodeSetUp("Node8", kConv2D, kCPU1, 2001, &node8);
-  NodeSetUp("Node9", kConv2D, kCPU1, 3001, &node9);
-
-  // Send and Recv nodes.
-  NodeDef send1;
-  NodeDef send2;
-  NodeDef recv1;
-  NodeDef recv2;
-  NodeSetUp("Send1", kSend, kChannelFrom0To1, 2002, &send1);
-  NodeSetUp("Send2", kSend, kChannelFrom1To0, 2005, &send2);
-  NodeSetUp("Recv1", kRecv, kCPU0, 2003, &recv1);
-  NodeSetUp("Recv2", kRecv, kCPU1, 2004, &recv2);
-
-  // Insert nodes.
-  manager.AddNode(&node1_);
-  manager.AddNode(&node2_);
-  manager.AddNode(&node3_);
-  manager.AddNode(&node4_);
-  manager.AddNode(&node5_);
-  manager.AddNode(&node6_);
-  manager.AddNode(&node7);
-  manager.AddNode(&node8);
-  manager.AddNode(&node9);
-  manager.AddNode(&send1);
-  manager.AddNode(&send2);
-  manager.AddNode(&recv1);
-  manager.AddNode(&recv2);
-
-  // on kCPU0; last one is node6_, on kCPU1: last one is node9;
-  // so choose one that has earliest time_ready among node6_, node9,
-  // Send1, Send2, Recv1, and Recv2.
-  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Then, the next one on kCPU0 is node5_; choose the earliest time_ready node
-  // among node5_, node9, Send1, Send2, Recv1, and Recv2.
-  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose among node4_, node9, Send1, Send2, Recv1, and Recv2.
-  EXPECT_EQ("Send1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose among node4_, node9, Sen2, Recv1, and Recv2.
-  EXPECT_EQ("Recv1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose among node4_, node9, Send2, and Recv2.
-  EXPECT_EQ("Recv2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose among node4_, node9, and Send2.
-  EXPECT_EQ("Send2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose between node4_, node9.
-  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose between node3_, node9.
-  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose between node3_, node8.
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Next, choose between node3_, node7.
-  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  // Then, just the nodes on kCPU1 -- LIFO.
-  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
-TEST_F(VirtualSchedulerTest, DeterminismInCompositeNodeManager) {
-  CompositeNodeManager manager;
-  manager.Init(&node_states_);
-  CompositeNodeManager manager2;
-  manager2.Init(&node_states_);
-
-  // 6 nodes with same time_ready.
-  NodeDef node7;
-  NodeDef node8;
-  NodeDef node9;
-  NodeDef node10;
-  NodeDef node11;
-  NodeDef node12;
-  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
-  NodeSetUp("Node8", kSend, kCPU0, 1000, &node8);
-  NodeSetUp("Node9", kRecv, kCPU0, 1000, &node9);
-  NodeSetUp("Node10", kConv2D, kCPU0, 999, &node10);
-  NodeSetUp("Node11", kRecv, kCPU0, 999, &node11);
-  NodeSetUp("Node12", kConv2D, kCPU1, 1000, &node12);
-
-  // Add Nodes 7 to 9 to manager.
-  manager.AddNode(&node7);
-  manager.AddNode(&node8);
-  manager.AddNode(&node9);
-
-  // It should return _Send, Recv, and the other op order, when the candidate
-  // nodes have same time_ready.
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
-  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
-  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-
-  // Add Nodes 7 to 9 to manager, but in a different order.
-  manager.AddNode(&node9);
-  manager.AddNode(&node8);
-  manager.AddNode(&node7);
-
-  // Expect same order (_Send, _Recv, and the other op), regardless of Add
-  // order.
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
-  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
-  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-
-  // Conv2D's time_ready < Send's time_ready; Expect Conv2D first.
-  manager.AddNode(&node8);
-  manager.AddNode(&node10);
-  EXPECT_EQ("Node10", manager.GetCurrNode()->name());
-  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-
-  // Recv's time_ready < Send' time_ready; Expect Recv first.
-  manager.AddNode(&node11);
-  manager.AddNode(&node8);
-  EXPECT_EQ("Node11", manager.GetCurrNode()->name());
-  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
-  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
-  manager.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-
-  // Node7 and 12 are normal ops with the same time_ready, placed on different
-  // devices. These two nodes are added to manager and manager2, but in
-  // different orders; Expect GetCurrNode() returns the nodes in the same order.
-  manager.AddNode(&node7);
-  manager.AddNode(&node12);
-
-  manager2.AddNode(&node12);
-  manager2.AddNode(&node7);
-
-  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
-  manager.RemoveCurrNode();
-  manager2.RemoveCurrNode();
-  EXPECT_TRUE(manager.Empty());
-}
-
 // Create small graph, run predict costs on it, make sure the costs from the
 // summary match the hand-calculated costs.
 TEST_F(VirtualSchedulerTest, SummaryCostTest) {
@@ -2517,11 +2524,10 @@
 
   // There is one Conv2D that takes x and f, but f is variable, so it should be
   // in persistent nodes.
-  // f is variable.
-  ValidateMemoryUsageSnapshot({"f"}, 0 /* port_num_expected */,
+  ValidateMemoryUsageSnapshot({"f", "Const/Const"}, /*port_num_expected=*/0,
                               cpu_state.persistent_nodes);
   // Only x in peak memory usage snapshot.
-  ValidateMemoryUsageSnapshot({"x"}, 0 /* port_num_expected */,
+  ValidateMemoryUsageSnapshot({"x"}, /*port_num_expected=*/0,
                               cpu_state.mem_usage_snapshot_at_peak);
 }
 
@@ -2898,5 +2904,6 @@
   EXPECT_GT(ops_executed.count("Recv"), 0);
 }
 
+}  // namespace
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index c6f9487..920900c 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -36,16 +36,15 @@
     if (gpu_manager != nullptr) {
       int num_gpus = gpu_manager->VisibleDeviceCount();
       for (int i = 0; i < num_gpus; i++) {
-        auto exec_status = gpu_manager->ExecutorForDevice(i);
-        if (exec_status.ok()) {
-          se::StreamExecutor* se = exec_status.ValueOrDie();
-          const se::DeviceDescription& desc = se->GetDeviceDescription();
+        auto desc_status = gpu_manager->DescriptionForDevice(i);
+        if (desc_status.ok()) {
+          auto desc = desc_status.ConsumeValueOrDie();
           int cc_major = 0;
           int cc_minor = 0;
-          desc.cuda_compute_capability(&cc_major, &cc_minor);
+          desc->cuda_compute_capability(&cc_major, &cc_minor);
           std::pair<int, int> cuda_compute_capability(cc_major, cc_minor);
           int min_gpu_core_count = 8;
-          if (desc.core_count() >= min_gpu_core_count &&
+          if (desc->core_count() >= min_gpu_core_count &&
               cuda_compute_capability >= min_cuda_compute_capability) {
             num_eligible_gpus++;
           }
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 401f4d7..49de09a 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -149,8 +149,8 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_topology_view",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
@@ -268,6 +268,7 @@
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:canonicalizer",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
@@ -603,6 +604,7 @@
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/utils:canonicalizer",
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
@@ -744,12 +746,14 @@
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "shape_optimizer_test",
     srcs = ["shape_optimizer_test.cc"],
+    tags = ["no_cuda_on_cpu_tap"],
     deps = [
         ":shape_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index e0eb8e4..6f801d4 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -39,6 +39,7 @@
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/canonicalizer.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/grappler/utils/traversal.h"
@@ -1758,7 +1759,7 @@
   bool IsSupported(const NodeDef* node) const override {
     // Note: div_no_nan(a, sqrt(b)) => mul_no_nan(a, rsqrt(b))
     // for b == 0 would result in a / Inf instead of 0.
-    return IsAnyDiv(*node) && !IsDivNoNan(*node);
+    return IsAnyDiv(*node) && !IsDivNoNan(*node) && !IsFloorDiv(*node);
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
@@ -2114,26 +2115,32 @@
     TF_RETURN_IF_TRUE(NumNonControlOutputs(*source, *ctx().node_map) != 1);
 
     const NodeDef* mul = source;
-
-    // TODO(jingyue): handle the case where `scale` is 0-th operand.
-    NodeDef* scale;  // scalar multiplier fot the input tensor
+    int input_idx = 0;
+    int scale_idx = 1;
+    NodeDef* scale;  // scalar multiplier for the input tensor
     NodeDef* input;
-    TF_RETURN_IF_ERROR(GetInputNode(mul->input(1), &scale));
-    TF_RETURN_IF_ERROR(GetInputNode(mul->input(0), &input));
-
-    // Check that 'scale * weight' can be const folded.
+    TF_RETURN_IF_ERROR(GetInputNode(mul->input(scale_idx), &scale));
+    TF_RETURN_IF_ERROR(GetInputNode(mul->input(input_idx), &input));
+    if (!IsConstant(*scale) && IsConstant(*input)) {
+      VLOG(3) << "Swapped inputs to mul";
+      std::swap(scale_idx, input_idx);
+      std::swap(scale, input);
+    }
     TF_RETURN_IF_TRUE(!IsConstant(*scale));
-    TF_RETURN_IF_ERROR(CheckAttrsExist(*scale, {"dtype", "value"}));
-    TF_RETURN_IF_ERROR(CheckAttrExists(*weights, "dtype"));
-    TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() !=
-                      weights->attr().at("dtype").type());
 
-    // Check that `scale` is a scalar.
+    // Check that one of the inputs to mul is a constant scalar.
     const TensorProto& scale_tensor = scale->attr().at("value").tensor();
     bool scale_is_a_scalar = scale_tensor.has_tensor_shape() &&
                              scale_tensor.tensor_shape().dim_size() == 0;
     TF_RETURN_IF_TRUE(!scale_is_a_scalar);
 
+    // Check that 'scale * weight' can be const folded.
+    TF_RETURN_IF_TRUE(!IsConstant(*scale));
+    TF_RETURN_IF_ERROR(CheckAttrsExist(*scale, {"dtype"}));
+    TF_RETURN_IF_ERROR(CheckAttrExists(*weights, "dtype"));
+    TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() !=
+                      weights->attr().at("dtype").type());
+
     // At this point all preconditions are met, and we safely do the rewrite.
     VLOG(3) << "Fold multiply into conv: conv=" << conv->name()
             << " mul=" << mul->name() << " weights=" << weights->name();
@@ -2148,7 +2155,7 @@
     // Link in its inputs.
     scaled_weights->add_input(conv->input(1));
     ctx().node_map->AddOutput(weights->name(), scaled_weights->name());
-    scaled_weights->add_input(mul->input(1));
+    scaled_weights->add_input(mul->input(scale_idx));
     ctx().node_map->AddOutput(scale->name(), scaled_weights->name());
     ForwardControlDependencies(scaled_weights, {source});
 
@@ -2159,7 +2166,7 @@
     AddToOptimizationQueue(conv);
 
     // Update `tail` node to bypass `mul` because it's folded to the weights.
-    tail->set_input(0, mul->input(0));
+    tail->set_input(0, mul->input(input_idx));
     ctx().node_map->UpdateInput(tail->name(), mul->name(), input->name());
     AddToOptimizationQueue(tail);
     *simplified_node_name = conv->name();
@@ -3326,6 +3333,21 @@
     return node;
   }
 
+  void RemoveRepresentative(NodeDef* node) {
+    auto it = memoized_signatures_.find(node);
+    if (it == memoized_signatures_.end()) return;
+
+    std::vector<NodeDef*>& candidates = rep_[it->second];
+    for (int i = 0; i < candidates.size(); ++i) {
+      if (candidates[i] == node) {
+        std::swap(candidates[i], candidates[candidates.size() - 1]);
+        candidates.resize(candidates.size() - 1);
+        break;
+      }
+    }
+    memoized_signatures_.erase(node);
+  }
+
  private:
   uint64 ComputeSignature(const NodeDef& node);
   bool SameNode(const NodeDef& node1, const NodeDef& node2) const;
@@ -3355,6 +3377,9 @@
   return h;
 }
 
+// PRECONDITION:
+//  Node input orders are assumed to be canonicalized, i.e. control inputs for
+//  all nodes as well as regular inputs for commutative nodes must be sorted.
 bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   if (node1.op() != node2.op()) {
     return false;
@@ -3370,38 +3395,13 @@
   }
 
   // Compare inputs.
-  if (IsCommutative(node1)) {
-    std::vector<string> inputs1(node1.input().begin(), node1.input().end());
-    std::sort(inputs1.begin(), inputs1.end());
-    std::vector<string> inputs2(node2.input().begin(), node2.input().end());
-    std::sort(inputs2.begin(), inputs2.end());
-    return inputs1 == inputs2;
-  } else {
-    // The order or ordinary inputs matters.
-    int index = 0;
-    for (; index < node1.input_size(); ++index) {
-      if (IsControlInput(node1.input(index))) {
-        break;
-      } else if (node1.input(index) != node2.input(index)) {
-        return false;
-      }
-    }
-    // The order of control inputs does not matter.
-    if (index < node1.input_size()) {
-      std::vector<string> ctrl_inputs1(node1.input().begin() + index,
-                                       node1.input().end());
-      std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end());
-      std::vector<string> ctrl_inputs2(node2.input().begin() + index,
-                                       node2.input().end());
-      std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end());
-      return ctrl_inputs1 != ctrl_inputs2;
-    }
+  auto it1 = node1.input().begin();
+  auto it2 = node2.input().begin();
+  for (; it1 != node1.input().end(); ++it1, ++it2) {
+    if (*it1 != *it2) return false;
   }
 
   // Compare attributes.
-  if (node1.attr().size() != node2.attr().size()) {
-    return false;
-  }
   for (const auto& attr1 : node1.attr()) {
     auto it = node2.attr().find(attr1.first);
     if (it == node2.attr().end()) return false;
@@ -3429,6 +3429,10 @@
 }
 
 void ArithmeticOptimizer::DedupComputations() {
+  CanonicalizeGraph(optimized_graph_);
+  // LOG(INFO) << "Graph after canonicalization: \n"
+  //           << optimized_graph_->DebugString();
+
   GraphTopologyView graph_view;
   if (!graph_view.InitializeFromGraph(*optimized_graph_).ok()) {
     LOG(WARNING) << "Failed to initialize GraphTopologyView.";
@@ -3478,26 +3482,38 @@
       if (feeds_inplace_op.find(rep) != feeds_inplace_op.end()) {
         continue;
       }
-      VLOG(3) << "Remove duplicated node: node=" << node->name()
-              << " representative=" << rep->name();
       const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
       std::vector<NodeDef*> fanouts(tmp.begin(), tmp.end());
       for (NodeDef* fanout : fanouts) {
+        // Update consumers of node.
+        bool updated_fanout = false;
         for (int i = 0; i < fanout->input_size(); ++i) {
           string* fanout_input = fanout->mutable_input(i);
+
           const int position =
               NodePositionIfSameNode(*fanout_input, node->name());
           // Update name in-place.
           if (position < -1) {
             continue;
-          } else if (position > 0) {
-            *fanout_input = StrCat(rep->name(), ":", position);
-          } else if (position == 0) {
-            *fanout_input = rep->name();
           } else {
-            *fanout_input = StrCat("^", rep->name());
+            if (!updated_fanout) {
+              // The signature of the fanout node will change. Remove it from
+              // `nodes` so its stale memoized signature is dropped.
+              nodes.RemoveRepresentative(fanout);
+            }
+            updated_fanout = true;
+            if (position > 0) {
+              *fanout_input = StrCat(rep->name(), ":", position);
+            } else if (position == 0) {
+              *fanout_input = rep->name();
+            } else {
+              *fanout_input = StrCat("^", rep->name());
+            }
           }
-          node_map_->AddOutput(rep->name(), fanout->name());
+        }
+        if (updated_fanout) {
+          node_map_->UpdateInput(fanout->name(), node->name(), rep->name());
+          CanonicalizeNode(fanout);
         }
       }
       duplicates.insert(i);
@@ -3513,21 +3529,6 @@
   }
 }
 
-void ArithmeticOptimizer::ForwardControlDependencies(
-    NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
-  for (const auto& src : src_nodes) {
-    for (int i = src->input_size() - 1; i >= 0; --i) {
-      if (IsControlInput(src->input(i))) {
-        *target_node->add_input() = src->input(i);
-        node_map_->AddOutput(NodeName(src->input(i)), target_node->name());
-      } else {
-        break;
-      }
-    }
-  }
-  DedupControlInputs(target_node);
-}
-
 Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
@@ -3540,7 +3541,8 @@
                                   &feed_nodes_, opt_level_);
   const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
 
-  // Stop pipeline after first stage returning non-empty simplified tensor name.
+  // Stop pipeline after first stage returning non-empty simplified tensor
+  // name.
   const auto stop = [](const string& result) { return !result.empty(); };
   GraphOptimizerStagePipeline<string> pipeline(stop);
 
@@ -3658,19 +3660,19 @@
   fetch_nodes_known_ = !item.fetch.empty();
   GrapplerItem optimized_item(item);
   optimized_graph_ = &optimized_item.graph;
-  node_map_.reset(new NodeMap(optimized_graph_));
 
+  node_map_.reset(new NodeMap(optimized_graph_));
   for (const auto& feed : item.feed) {
     feed_nodes_.insert(NodeName(feed.first));
   }
 
-  // Disable restricted graph rewrites.
+  // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
       item.optimization_options().allow_non_differentiable_rewrites;
 
   // Perform topological sort on the graph in order to help DedupComputations
-  // and AddOpsRewrite to optimize larger subgraphs starting from the roots with
-  // more inputs.
+  // and AddOpsRewrite to optimize larger subgraphs starting from the roots
+  // with more inputs.
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
   GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 94508c6..d9ce9f6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -163,11 +163,10 @@
   EXPECT_EQ(output.node_size(), 5);
   const NodeDef* new_div = node_map.GetNode("div");
   ASSERT_NE(new_div, nullptr);
-  ASSERT_EQ(new_div->input_size(), 4);
+  ASSERT_EQ(new_div->input_size(), 3);
   EXPECT_EQ(new_div->input(0), "check1");
   EXPECT_EQ(new_div->input(1), "check1");
   EXPECT_EQ(new_div->input(2), "^assert1");
-  EXPECT_EQ(new_div->input(3), "^assert1");
 
   auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}});
   EXPECT_EQ(tensors.size(), 1);
@@ -507,8 +506,8 @@
   const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6"));
   ASSERT_NE(mul_node, nullptr);
   ASSERT_EQ(mul_node->input_size(), 2);
-  EXPECT_EQ(mul_node->input(0), "Placeholder");
-  EXPECT_EQ(mul_node->input(1), HoistAddName("Add_6"));
+  EXPECT_EQ(mul_node->input(0), HoistAddName("Add_6"));
+  EXPECT_EQ(mul_node->input(1), "Placeholder");
 
   const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
   ASSERT_NE(add_6_node, nullptr);
@@ -1578,47 +1577,53 @@
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldMulToTransposeConv) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_FLOAT,
-                                   ops::Placeholder::Shape({8, 28, 28, 3}));
-  Output scale = ops::Const(s.WithOpName("scale"), 1.0f / 255.0f, {});
-  Output scaled_inputs =
-      ops::Multiply(s.WithOpName("scaled_inputs"), inputs, scale);
-  Output perm_nhwc_to_nchw =
-      ops::Const(s.WithOpName("perm_nhwc_to_nchw"), {0, 3, 1, 2}, {4});
-  Output inputs_nchw = ops::Transpose(s.WithOpName("inputs_nchw"),
-                                      scaled_inputs, perm_nhwc_to_nchw);
-  Output weights = ops::Const(s.WithOpName("weights"),
-                              Input::Initializer(127.0f, {5, 5, 3, 16}));
-  Output conv =
-      ops::Conv2D(s.WithOpName("conv"), inputs_nchw, weights, {1, 1, 1, 1},
-                  "VALID", ops::Conv2D::DataFormat("NCHW"));
-  Output outputs = ops::Identity(s.WithOpName("outputs"), conv);
+  for (bool swap_inputs : {false, true}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_FLOAT,
+                                     ops::Placeholder::Shape({1, 28, 28, 3}));
+    Output scale = ops::Const(s.WithOpName("scale"), 1.0f / 255.0f, {});
+    Output scaled_inputs = ops::Multiply(s.WithOpName("scaled_inputs"),
+                                         swap_inputs ? scale : inputs,
+                                         swap_inputs ? inputs : scale);
+    Output perm_nhwc_to_nchw =
+        ops::Const(s.WithOpName("perm_nhwc_to_nchw"), {0, 3, 1, 2}, {4});
+    Output inputs_nchw = ops::Transpose(s.WithOpName("inputs_nchw"),
+                                        scaled_inputs, perm_nhwc_to_nchw);
+    Output weights = ops::Const(s.WithOpName("weights"),
+                                Input::Initializer(127.0f, {5, 5, 3, 4}));
+    Output conv =
+        ops::Conv2D(s.WithOpName("conv"), inputs_nchw, weights, {1, 1, 1, 1},
+                    "VALID", ops::Conv2D::DataFormat("NCHW"));
+    Output outputs = ops::Identity(s.WithOpName("outputs"), conv);
 
-  GrapplerItem item;
-  item.fetch = {"outputs"};
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GrapplerItem item;
+    item.fetch = {"outputs"};
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  GraphDef output;
-  ArithmeticOptimizer optimizer;
-  EnableOnlyFoldMultipleIntoConv(&optimizer);
-  OptimizeTwiceAndPrune(&optimizer, &item, &output);
+    //    LOG(INFO) << "Before:\n" << item.graph.DebugString();
+    GraphDef output;
+    ArithmeticOptimizer optimizer;
+    EnableOnlyFoldMultipleIntoConv(&optimizer);
+    OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
-  NodeMap node_map(&output);
+    //    LOG(INFO) << "After:\n"  << output.DebugString();
+    NodeMap node_map(&output);
+    // `conv` is now a folded convolution with scaled weights.
+    const NodeDef* folded_conv = node_map.GetNode(conv.node()->name());
+    ASSERT_NE(folded_conv, nullptr);
 
-  // `conv` is now a folded convolution with scaled weights.
-  const NodeDef* folded_conv = node_map.GetNode(conv.node()->name());
-  ASSERT_NE(folded_conv, nullptr);
+    const NodeDef* folded_conv_weights =
+        node_map.GetNode(folded_conv->input(1));
+    ASSERT_NE(folded_conv_weights, nullptr);
+    EXPECT_EQ(folded_conv_weights->op(), "Mul");
 
-  const NodeDef* folded_conv_weights = node_map.GetNode(folded_conv->input(1));
-  ASSERT_NE(folded_conv_weights, nullptr);
-  EXPECT_EQ(folded_conv_weights->op(), "Mul");
-
-  // Its input should be a transpose of `inputs`.
-  const NodeDef* transpose = node_map.GetNode(NodeName(folded_conv->input(0)));
-  ASSERT_NE(transpose, nullptr);
-  ASSERT_EQ(transpose->input_size(), 2);
-  EXPECT_EQ(transpose->input(0), "inputs");
+    // Its input should be a transpose of `inputs`.
+    const NodeDef* transpose =
+        node_map.GetNode(NodeName(folded_conv->input(0)));
+    ASSERT_NE(transpose, nullptr);
+    ASSERT_EQ(transpose->input_size(), 2);
+    EXPECT_EQ(transpose->input(0), "inputs");
+  }
 }
 
 TEST_F(ArithmeticOptimizerTest, NotFoldMulAcrossPreservedTranspose) {
@@ -1921,8 +1926,8 @@
   auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
   auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
   auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT);
-  auto add_ab = ops::Add(sx.WithOpName("Add_ab"), a, b);
-  auto add_abc = ops::Add(sy.WithOpName("Add_abc"), add_ab, c);
+  auto add_bc = ops::Add(sx.WithOpName("Add_bc"), b, c);
+  auto add_abc = ops::Add(sy.WithOpName("Add_abc"), a, add_bc);
 
   auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
 
@@ -1948,9 +1953,9 @@
   //
   //     +
   //    / \
-  //   +   c      -->    AddN(a, b, c)
-  //  / \
-  // a   b
+  //   a   +         -->    AddN(a, b, c)
+  //      / \
+  //     b   c
   EXPECT_EQ(output.node_size(), 5);
 
   NodeMap node_map(&output);
@@ -2544,6 +2549,43 @@
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, ConvertSqrtDivToRsqrtMulExcludeFloorDiv) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  auto y = ops::Const(s.WithOpName("y"), {3.0f, 4.0f}, {1, 2});
+  Output sqrt_y = ops::Sqrt(s.WithOpName("sqrt_y"), y);
+  Output div_x_sqrt_y = ops::FloorDiv(s.WithOpName("output"), x, sqrt_y);
+
+  GrapplerItem item;
+  item.fetch = {"output"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlySqrtDivToRsqrtMul(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(tensors.size(), 1);
+
+  test::ExpectTensorNear<float>(tensors[0], tensors_expected[0], 1e-6);
+  EXPECT_EQ(output.node_size(), item.graph.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "output") {
+      EXPECT_EQ(node.op(), "FloorDiv");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "x");
+      EXPECT_EQ(node.input(1), "sqrt_y");
+    } else if (node.name() == "sqrt_y") {
+      EXPECT_EQ(node.op(), "Sqrt");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "y");
+    }
+  }
+}
+
 TEST_F(ArithmeticOptimizerTest, FuseSquaredDiff) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 4029e9c..bd195f2 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -2382,20 +2382,28 @@
   return false;
 }
 
-bool ConstantFolding::IsReductionCandidateForSimplification(
-    const NodeDef& node, const GraphProperties& properties,
-    TensorShapeProto* input_tensor_shape, TensorShapeProto* output_tensor_shape,
-    bool* is_single_element_op) const {
+bool ConstantFolding::IsReductionWithConstantIndices(
+    const NodeDef& node, bool* indices_is_empty) const {
   // Ensure its an appropriate Reduce node.
   if (!IsReduction(node) || node.input_size() < 2) {
     return false;
   }
   // Ensure that the axes to reduce by are constant.
   NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-  if (!IsReallyConstant(*reductions_indices)) {
+  if (!IsReallyConstant(*reductions_indices) ||
+      !reductions_indices->attr().count("value")) {
     return false;
   }
+  const TensorShapeProto& reduction_indices_shape =
+      reductions_indices->attr().at("value").tensor().tensor_shape();
+  *indices_is_empty = TensorShape(reduction_indices_shape).num_elements() == 0;
+  return true;
+}
 
+bool ConstantFolding::IsReductionCandidateForSimplification(
+    const NodeDef& node, const GraphProperties& properties,
+    TensorShapeProto* input_tensor_shape, TensorShapeProto* output_tensor_shape,
+    bool* is_single_element_op) const {
   // Get the properties of the input & output tensors and check if they both
   // contain a single element.
   if (!properties.HasInputProperties(node.name()) ||
@@ -2460,9 +2468,34 @@
   return simplifiable;
 }
 
+bool ConstantFolding::ReplaceReductionWithIdentity(NodeDef* node) const {
+  // Replace the reduction node with an identity node, that can be further
+  // optimized by other passes.
+  DataType output_type;
+  if (node->attr().count("T") != 0) {
+    output_type = node->attr().at("T").type();
+  } else if (IsAny(*node) || IsAll(*node)) {
+    output_type = DT_BOOL;
+  } else {
+    return false;
+  }
+  node->set_op("Identity");
+  node->clear_attr();
+  (*node->mutable_attr())["T"].set_type(output_type);
+  *node->mutable_input(1) = AsControlDependency(node->input(1));
+  return true;
+}
+
 bool ConstantFolding::SimplifyReduction(GraphDef* optimized_graph,
                                         const GraphProperties& properties,
                                         NodeDef* node) {
+  bool indices_is_empty = false;
+  if (!IsReductionWithConstantIndices(*node, &indices_is_empty)) {
+    return false;
+  }
+  if (indices_is_empty) {
+    return ReplaceReductionWithIdentity(node);
+  }
   bool is_single_element_op = false;
   TensorShapeProto input_tensor_shape, output_tensor_shape;
   if (!IsReductionCandidateForSimplification(
@@ -2524,20 +2557,7 @@
     (*node->mutable_attr())["Tshape"] = attr_type_indices;
     return true;
   } else if (simplifiable_to_identity) {
-    // Replace the reduction node with an identity node, that can be further
-    // optimized by the model pruner.
-    DataType output_type;
-    if (node->attr().count("T") != 0) {
-      output_type = node->attr().at("T").type();
-    } else {
-      // This is an 'any' or 'all' reduction. The output is always boolean.
-      output_type = DT_BOOL;
-    }
-    node->set_op("Identity");
-    node->clear_attr();
-    (*node->mutable_attr())["T"].set_type(output_type);
-    *node->mutable_input(1) = AsControlDependency(node->input(1));
-    return true;
+    return ReplaceReductionWithIdentity(node);
   }
   return false;
 }
@@ -3137,11 +3157,11 @@
     for (auto interval : constant_input_runs) {
       // Push the constant inputs in the interval to a child node than can be
       // constant folded.
-      const string new_node_name = OptimizedNodeName(
-          *node, strings::StrCat("_partial_split_", interval.first));
-      if (node_map_->NodeExists(new_node_name)) {
-        break;
-      }
+      string new_node_name = OptimizedNodeName(*node, "_partial_split");
+      do {
+        new_node_name += strings::StrCat("_", interval.first);
+      } while (node_map_->NodeExists(new_node_name));
+
       NodeDef* added_node = optimized_graph->add_node();
       *added_node = *node;
       added_node->set_name(new_node_name);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 45b1ca2..b4c39a5 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -153,6 +153,11 @@
   bool SimplifyReshape(const GraphProperties& properties, bool use_shape_info,
                        NodeDef* node);
 
+  // Returns true iff the node is a reduction and its reduction indices are
+  // constant. Sets *indices_is_empty to true if the set of dimensions to reduce
+  // along is empty (this happens often in the gradient graphs).
+  bool IsReductionWithConstantIndices(const NodeDef& node,
+                                      bool* indices_is_empty) const;
   // Returns true if theres a possibility that a Reduce node could be simplified
   // to an Identity/Reshape.
   bool IsReductionCandidateForSimplification(
@@ -160,11 +165,12 @@
       TensorShapeProto* input_tensor_shape,
       TensorShapeProto* output_tensor_shape, bool* is_single_element_op) const;
   // Returns true iff this reduction can be reduced to an identity (i.e if the
-  // set of dimensions to reduce along is empty). This happens often in the
-  // gradient graphs.
+  // input dimensions to reduce along are all of size 1 and keep_dims is true).
   bool IsReductionSimplifiableToIdentity(
       const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims,
       const gtl::InlinedVector<TensorValue, 4>& reduction_indices_vector) const;
+  // Changes a reduction into an Identity op, returning true on success.
+  bool ReplaceReductionWithIdentity(NodeDef* node) const;
   // Simplifies a Reduction operation to an Identity/Reshape operation if
   // applicable.
   bool SimplifyReduction(GraphDef* optimized_graph,
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 22d8ccc..11c362c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2366,6 +2366,40 @@
   CompareGraphs(want, got);
 }
 
+TEST_F(ConstantFoldingTest, MergeConcat_PartialFolding) {
+  Scope scope = Scope::NewRootScope();
+  Output c1 = ops::Const(scope.WithOpName("c1"), 1.0f, {2, 2});
+  Output c2 = ops::Const(scope.WithOpName("c2"), 2.0f, {2, 2});
+  Output c3 = ops::Const(scope.WithOpName("c3"), 3.0f, {2, 2});
+  Output c4 = ops::Const(scope.WithOpName("c4"), 4.0f, {2, 2});
+  Output ph = ops::Placeholder(scope.WithOpName("ph"), DT_FLOAT,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output axis = ops::Const(scope.WithOpName("axis"), 0, {});
+
+  ops::Concat concat1(scope.WithOpName("concat1"), {c1, c2, ph}, axis);
+  ops::Concat concat2(scope.WithOpName("concat2"), {c3, c4, Output(concat1)},
+                      axis);
+
+  GrapplerItem item;
+  item.fetch = {"concat2"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("ConstantFolding/concat2_partial_split_0_0", "Const", {}, {}, &want);
+  AddNode("axis", "Const", {}, {}, &want);
+  AddNode("ph", "Placeholder", {}, {}, &want);
+  AddNode("concat2", "ConcatV2",
+          {"ConstantFolding/concat2_partial_split_0_0", "ph", "axis"}, {},
+          &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -2477,8 +2511,12 @@
   attr = attr.KeepDims(true);
   Output p2 = ops::Prod(scope.WithOpName("p2"), v2, c2, attr);
 
+  // Test with unknown input shape.
+  Output a = ops::Placeholder(scope.WithOpName("a"), DT_FLOAT);
+  Output p3 = ops::Prod(scope.WithOpName("p3"), a, i, attr);
+
   GrapplerItem item;
-  item.fetch = {"s", "p2"};
+  item.fetch = {"s", "p2", "p3"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
   ConstantFolding optimizer(/*cpu_device=*/nullptr);
@@ -2500,19 +2538,28 @@
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("v2", node.input(0));
       EXPECT_EQ("^c2", node.input(1));
+    } else if (node.name() == "p3") {
+      found++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+      EXPECT_EQ("^i", node.input(1));
     }
   }
-  EXPECT_EQ(2, found);
+  EXPECT_EQ(3, found);
 
   auto v_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
   auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 1}));
-  auto tensors_expected =
-      EvaluateNodes(item.graph, item.fetch, {{"v", v_t}, {"v2", v2_t}});
-  EXPECT_EQ(2, tensors_expected.size());
-  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}, {"v2", v2_t}});
-  EXPECT_EQ(2, tensors.size());
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch,
+                                        {{"v", v_t}, {"v2", v2_t}, {"a", a_t}});
+  EXPECT_EQ(3, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch, {{"v", v_t}, {"v2", v2_t}, {"a", a_t}});
+  EXPECT_EQ(3, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+  test::ExpectTensorNear<float>(tensors_expected[2], tensors[2], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) {
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 643acb3..d3858d6 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -17,7 +17,6 @@
         ":filter_with_random_uniform_fusion",
         ":hoist_random_uniform",
         ":latency_all_edges",
-        ":make_numa_aware",
         ":make_sloppy",
         ":map_and_batch_fusion",
         ":map_and_filter_fusion",
@@ -26,7 +25,9 @@
         ":map_vectorization",
         ":meta_optimizer",
         ":noop_elimination",
+        ":parallel_batch",
         ":shuffle_and_repeat_fusion",
+        ":slack",
     ],
 )
 
@@ -206,6 +207,7 @@
         "graph_utils.h",
     ],
     deps = [
+        "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -321,38 +323,6 @@
 )
 
 cc_library(
-    name = "make_numa_aware",
-    srcs = ["make_numa_aware.cc"],
-    hdrs = ["make_numa_aware.h"],
-    deps = [
-        ":graph_utils",
-        ":optimizer_base",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-    ] + tf_protos_all(),
-    alwayslink = 1,
-)
-
-tf_cc_test(
-    name = "make_numa_aware_test",
-    srcs = ["make_numa_aware_test.cc"],
-    deps = [
-        ":graph_test_utils",
-        ":graph_utils",
-        ":make_numa_aware",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//tensorflow/core/grappler:grappler_item",
-    ],
-)
-
-cc_library(
     name = "make_sloppy",
     srcs = ["make_sloppy.cc"],
     hdrs = ["make_sloppy.h"],
@@ -604,24 +574,6 @@
 )
 
 cc_library(
-    name = "rebatch",
-    srcs = ["rebatch.cc"],
-    hdrs = ["rebatch.h"],
-    deps = [
-        ":graph_utils",
-        ":optimizer_base",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "//tensorflow/core:framework",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-        "//tensorflow/core/grappler/utils:functions",
-        "//tensorflow/core:lib",
-    ] + tf_protos_all(),
-    alwayslink = 1,
-)
-
-cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
     hdrs = [
@@ -668,6 +620,54 @@
 )
 
 cc_library(
+    name = "parallel_batch",
+    srcs = ["parallel_batch.cc"],
+    hdrs = ["parallel_batch.h"],
+    deps = [
+        ":optimizer_base",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "parallel_batch_test",
+    srcs = ["parallel_batch_test.cc"],
+    deps = [
+        ":graph_test_utils",
+        ":graph_utils",
+        ":parallel_batch",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
+cc_library(
+    name = "rebatch",
+    srcs = ["rebatch.cc"],
+    hdrs = ["rebatch.h"],
+    deps = [
+        ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core:lib",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+cc_library(
     name = "shuffle_and_repeat_fusion",
     srcs = ["shuffle_and_repeat_fusion.cc"],
     hdrs = [
@@ -703,6 +703,45 @@
 )
 
 cc_library(
+    name = "slack",
+    srcs = ["slack.cc"],
+    hdrs = [
+        "slack.h",
+    ],
+    deps = [
+        ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "slack_test",
+    srcs = ["slack_test.cc"],
+    deps = [
+        ":function_utils",
+        ":graph_utils",
+        ":slack",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core/kernels/data",
+    ],
+)
+
+cc_library(
     name = "vectorization_utils",
     srcs = ["vectorization_utils.cc"],
     hdrs = [
diff --git a/tensorflow/core/grappler/optimizers/data/auto_shard.cc b/tensorflow/core/grappler/optimizers/data/auto_shard.cc
index 906b078..0652df4 100644
--- a/tensorflow/core/grappler/optimizers/data/auto_shard.cc
+++ b/tensorflow/core/grappler/optimizers/data/auto_shard.cc
@@ -50,7 +50,8 @@
     "ZipDataset"
 };
 
-constexpr std::array<const char*, 22> kPassThroughOps = {
+constexpr std::array<const char*, 23> kPassThroughOps = {
+    "_Retval",
     "BatchDataset",
     "BatchDatasetV2",
     "ExperimentalMapAndBatchDataset",
@@ -121,6 +122,9 @@
   new_node.add_input(num_shards_node->name());
   new_node.add_input(index_node->name());
 
+  // Ensure that each shard will have at least one element.
+  (*(new_node.mutable_attr()))["require_non_empty"].set_b(true);
+
   // Add shapes and other attributes
   NodeDef* add_after = graph->GetNode(add_before.input(0));
 
@@ -282,16 +286,14 @@
   // function in flat_map.
   if (IsDatasetNodeOfType(node, kFuncDatasetOps) &&
       ReaderOpInFunction(node, *flib)) {
-    TF_RETURN_IF_ERROR(ProcessDatasetSourceNode(graph, node, nodes_to_delete,
-                                                num_workers, index));
-    return Status::OK();
+    return ProcessDatasetSourceNode(graph, node, nodes_to_delete, num_workers,
+                                    index);
   }
 
   if (IsDatasetNodeOfType(node, kReaderDatasetOps)) {
     // We reached a reader dataset directly and we try to shard input 0.
-    TF_RETURN_IF_ERROR(ProcessDatasetSourceNode(graph, node, nodes_to_delete,
-                                                num_workers, index));
-    return Status::OK();
+    return ProcessDatasetSourceNode(graph, node, nodes_to_delete, num_workers,
+                                    index);
   }
 
   if (!IsDatasetNodeOfType(node, kPassThroughOps)) {
@@ -324,17 +326,16 @@
   // that dataset, in effect giving a piece to each worker. Finally, we remove
   // occurences from randomness from before that point in the graph (e.g. things
   // like ShuffleDataset) to ensure that `shard` returns a sensible result.
-
-  NodeDef sink_node;
-  TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node));
-  Status s = RecursivelyHandleOp(sink_node, num_workers, index, &flib, &graph,
+  NodeDef* sink_node;
+  TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node));
+  Status s = RecursivelyHandleOp(*sink_node, num_workers, index, &flib, &graph,
                                  &nodes_to_delete);
 
   if (!s.ok() && errors::IsNotFound(s)) {
     LOG(WARNING) << "Cannot find shardable dataset, adding a shard node at "
                  << "the end of the dataset instead. This may have performance "
                  << "implications.";
-    TF_RETURN_IF_ERROR(AddShardNode(&graph, sink_node, num_workers, index));
+    TF_RETURN_IF_ERROR(AddShardNode(&graph, *sink_node, num_workers, index));
   } else if (!s.ok()) {
     return s;
   }
diff --git a/tensorflow/core/grappler/optimizers/data/filter_with_random_uniform_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_with_random_uniform_fusion.cc
index 9e1a888..08af725 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_with_random_uniform_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/filter_with_random_uniform_fusion.cc
@@ -269,8 +269,11 @@
   // Nothing to do for FilterWithRandomUniformFusion
 }
 
-REGISTER_GRAPH_OPTIMIZER_AS(FilterWithRandomUniformFusion,
-                            "filter_with_random_uniform_fusion");
+// TODO(b/131229793): The current implementation of the optimization is brittle
+// as it depends on the order of inputs to commutative nodes. Make the
+// optimization robust to the input ordering before re-enabling it.
+// REGISTER_GRAPH_OPTIMIZER_AS(FilterWithRandomUniformFusion,
+//                             "filter_with_random_uniform_fusion");
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index 483b958..758f778 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -301,37 +301,30 @@
   return Status::OK();
 }
 
-// Tries to find a Sink node in the graph. A sink node is defined as a node
-// that has at least one input and no outputs. If there are multiple of these,
-// this might return any one of them. This is useful to identify the final
-// Dataset op in the graph but in some cases there might be multiple Identity
-// ops added to the end and this would return the last Identity op in that case.
+// Looks up the single fetch node of `item` in `graph` and stores a pointer to
+// it in `*fetch_node`.
+//
+// Returns InvalidArgument if `item` does not have exactly one fetch, or if the
+// fetch name does not resolve to a node in `graph`. `*fetch_node` is only
+// written when the returned status is OK.
+Status GetFetchNode(const MutableGraphView& graph, const GrapplerItem& item,
+                    NodeDef** fetch_node) {
+  if (item.fetch.size() != 1) {
+    return errors::InvalidArgument(
+        "Expected only one fetch node but there were ", item.fetch.size(), ": ",
+        absl::StrJoin(item.fetch, ", "));
+  }
 
-Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node) {
-  absl::flat_hash_map<string, int> all_node_names;
-  absl::flat_hash_map<string, int> node_input_map;
-  for (int i = 0; i < graph_def.node_size(); ++i) {
-    all_node_names.insert_or_assign(graph_def.node(i).name(), i);
-    node_input_map.insert_or_assign(graph_def.node(i).name(), 0);
-  }
-  // Counts how many graph nodes for each input name. Candidate sink
-  // nodes are ones which are inputs into zero nodes.
-  for (const NodeDef& node : graph_def.node()) {
-    for (const string& input_name : node.input()) {
-      node_input_map[input_name]++;
-    }
-  }
-  for (const auto& it : node_input_map) {
-    if (it.second == 0) {
-      const NodeDef& sink_graph_node = graph_def.node(all_node_names[it.first]);
-      if (sink_graph_node.input_size() == 0) {
-        continue;
-      }
-      *sink_node = sink_graph_node;
-      return Status::OK();
-    }
-  }
-  return errors::InvalidArgument("Failed to find a sink node");
+  // GetNode returns nullptr when no node has that name; report it as an error
+  // rather than handing callers a pointer they will dereference.
+  NodeDef* node = graph.GetNode(item.fetch.at(0));
+  if (node == nullptr) {
+    return errors::InvalidArgument("Fetch node ", item.fetch.at(0),
+                                   " was not found in the graph");
+  }
+  *fetch_node = node;
+
+  return Status::OK();
 }
 
 }  // namespace graph_utils
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 0253b6d..417a8c4 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -24,6 +24,7 @@
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -144,8 +145,10 @@
 // and renaming nodes does not mutate any edges.
 Status EnsureNodeNamesUnique(Graph* g);
 
-// Returns the sink node (i.e. last node) in the graph.
-Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node);
+// Returns the item's fetch node, if there is exactly one. Otherwise, returns an
+// error.
+Status GetFetchNode(const MutableGraphView& graph, const GrapplerItem& item,
+                    NodeDef** fetch_node);
 
 }  // namespace graph_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 8108c84..93df72a 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -270,37 +270,45 @@
   EXPECT_NE(const_0->name(), const_2->name());
 }
 
-TEST(GraphUtilsTest, TestFindSinkNodeStandard) {
-  GraphDef graph_def;
-  MutableGraphView graph(&graph_def);
+TEST(GraphUtilsTest, TestGetFetchNode) {
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
 
-  AddNode("node1", "Identity", {}, {}, &graph);
-  AddNode("node2", "Identity", {"node1"}, {}, &graph);
-  NodeDef* node3 = AddNode("node3", "Identity", {"node2"}, {}, &graph);
+  NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph);
+  NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph);
+  NodeDef* node3 = AddNode("node3", "Identity", {node2->name()}, {}, &graph);
+  item.fetch.push_back(node3->name());  // node3 is the only fetch.
 
-  NodeDef sink_node;
-  TF_EXPECT_OK(FindSinkNode(graph_def, &sink_node));
-  EXPECT_EQ(sink_node.name(), node3->name());
+  NodeDef* sink_node = nullptr;
+  TF_ASSERT_OK(GetFetchNode(graph, item, &sink_node));  // Deref follows.
+  EXPECT_EQ(sink_node->name(), node3->name());
 }
 
-TEST(GraphUtilsTest, TestFindSinkNodeNoSingleSink) {
-  GraphDef graph_def;
-  MutableGraphView graph(&graph_def);
+TEST(GraphUtilsTest, TestGetFetchNodeMultipleFetches) {
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
 
-  AddNode("node1", "Identity", {}, {}, &graph);
-  AddNode("node2", "Identity", {}, {}, &graph);
+  NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph);
+  NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph);
+  NodeDef* node3 = AddNode("node3", "Identity", {node2->name()}, {}, &graph);
+  item.fetch.push_back(node2->name());  // Two fetches: must be rejected.
+  item.fetch.push_back(node3->name());
 
-  NodeDef sink_node;
-  Status s = FindSinkNode(graph_def, &sink_node);
+  NodeDef* sink_node = nullptr;
+  Status s = GetFetchNode(graph, item, &sink_node);
   EXPECT_FALSE(s.ok());
 }
 
-TEST(GraphUtilsTest, TestFindSinkNodeGraphDefEmpty) {
-  GraphDef graph_def;
-  MutableGraphView graph(&graph_def);
+TEST(GraphUtilsTest, TestGetFetchNodeNoFetches) {
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
 
-  NodeDef sink_node;
-  Status s = FindSinkNode(graph_def, &sink_node);
+  NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph);
+  NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph);
+  AddNode("node3", "Identity", {node2->name()}, {}, &graph);
+
+  NodeDef* sink_node = nullptr;  // item.fetch is left empty on purpose.
+  Status s = GetFetchNode(graph, item, &sink_node);
   EXPECT_FALSE(s.ok());
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 189046f..4bd8dec 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -83,14 +83,6 @@
       // node corresponds to a `Dataset` op.
       continue;
     }
-    MutableGraphView::OutputPort output_port =
-        graph.GetOutputPort(node.name(), 0);
-    auto fanout = graph.GetFanout(output_port);
-    if (fanout.size() > 1) {
-      LOG(WARNING) << node.name() << " has fanout size " << fanout.size();
-      continue;
-    }
-    // fanout will have size 0 for last dataset node in the pipeline.
     NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
     TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), latency_node->name()));
     stats->num_changes++;
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
deleted file mode 100644
index 221f4c2..0000000
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
-
-#include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
-#include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-
-namespace tensorflow {
-namespace grappler {
-namespace {
-
-NodeDef MakeNumaAwareNode(const NodeDef& node, MutableGraphView* graph) {
-  NodeDef numa_aware_node = node;
-  graph_utils::SetUniqueGraphNodeName("make_numa_aware", graph->graph(),
-                                      &numa_aware_node);
-  numa_aware_node.set_op("ExperimentalNumaMapAndBatchDataset");
-  return numa_aware_node;
-}
-
-}  // namespace
-
-Status MakeNumaAware::OptimizeAndCollectStats(Cluster* cluster,
-                                              const GrapplerItem& item,
-                                              GraphDef* output,
-                                              OptimizationStats* stats) {
-  *output = item.graph;
-  MutableGraphView graph(output);
-  absl::flat_hash_set<string> nodes_to_delete;
-
-  for (const NodeDef& node : item.graph.node()) {
-    if (node.op() != "ExperimentalMapAndBatchDataset") continue;
-
-    auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
-    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), numa_node->name()));
-    nodes_to_delete.insert(node.name());
-    stats->num_changes++;
-  }
-  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
-  return Status::OK();
-}
-
-REGISTER_GRAPH_OPTIMIZER_AS(MakeNumaAware, "make_numa_aware");
-
-}  // namespace grappler
-}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
deleted file mode 100644
index 4b83fb6..0000000
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
-
-#include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/function_testlib.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_test_utils.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace grappler {
-namespace {
-
-TEST(MakeNumaAwareTest, ReplaceSimple) {
-  using test::function::NDef;
-  GrapplerItem item;
-  item.graph = test::function::GDef(
-      {
-          NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-          NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-          NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-          NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
-          NDef("batch_size", "Const", {}, {{"value", 3}, {"dtype", DT_INT32}}),
-          NDef("num_parallel_calls", "Const", {},
-               {{"value", 5}, {"dtype", DT_INT32}}),
-          NDef("drop_remainder", "Const", {},
-               {{"value", 0}, {"dtype", DT_BOOL}}),
-          graph_tests_utils::MakeMapAndBatchNode(
-              "map_and_batch", "range", "batch_size", "num_parallel_calls",
-              "drop_remainder"),
-      },
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
-
-  MakeNumaAware optimizer;
-  GraphDef output;
-  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
-
-  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
-                                               output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
-      "ExperimentalNumaMapAndBatchDataset", output));
-}
-
-TEST(MapAndBatchNumaAawareReplacementTest, ReplaceWithExtraChild) {
-  using test::function::NDef;
-  GrapplerItem item;
-  item.graph = test::function::GDef(
-      {
-          NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-          NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-          NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-          NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
-          NDef("batch_size", "Const", {}, {{"value", 3}, {"dtype", DT_INT32}}),
-          NDef("num_parallel_calls", "Const", {},
-               {{"value", 5}, {"dtype", DT_INT32}}),
-          NDef("drop_remainder", "Const", {},
-               {{"value", 0}, {"dtype", DT_BOOL}}),
-          graph_tests_utils::MakeMapAndBatchNode(
-              "map_and_batch", "range", "batch_size", "num_parallel_calls",
-              "drop_remainder"),
-          NDef("cache", "CacheDataset", {"map_and_batch"}, {}),
-      },
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
-
-  MakeNumaAware optimizer;
-  GraphDef output;
-  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
-
-  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
-                                               output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
-      "ExperimentalNumaMapAndBatchDataset", output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
-
-  int numa_map_and_batch_component_id = graph_utils::FindGraphNodeWithOp(
-      "ExperimentalNumaMapAndBatchDataset", output);
-  auto& numa_map_and_batch_component =
-      output.node(numa_map_and_batch_component_id);
-  EXPECT_EQ(numa_map_and_batch_component.input(0), "range");
-
-  int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output);
-  auto& cache_node = output.node(cache_id);
-  EXPECT_EQ(cache_node.input(0), numa_map_and_batch_component.name());
-}
-
-}  // namespace
-}  // namespace grappler
-}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
index eccbad9..121acda 100644
--- a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
@@ -35,6 +35,38 @@
 using ConfigMap =
     std::map<string, tensorflow::RewriterConfig_CustomGraphOptimizer>;
 
+// tf.data optimizations, in the order we want to perform them.
+//
+// The array size must equal the number of entries: a larger size would leave
+// trailing null pointers in the list.
+//
+// NOTE(review): "filter_with_random_uniform_fusion" is listed here although
+// its REGISTER_GRAPH_OPTIMIZER_AS call is currently commented out
+// (b/131229793); confirm that unregistered names are skipped, not errors.
+constexpr std::array<const char*, 14> kTFDataOptimizations = {
+    "noop_elimination",
+    "shuffle_and_repeat_fusion",
+    "map_fusion",
+    "filter_fusion",
+    "filter_with_random_uniform_fusion",
+    "map_and_filter_fusion",
+    "hoist_random_uniform",
+    "map_parallelization",
+    "map_and_batch_fusion",
+    "map_vectorization",
+    "latency_all_edges",
+    "make_sloppy",
+    "parallel_batch",
+    "slack"};
+
+// Standard grappler optimizations, in the order we want to perform them.
+constexpr std::array<const char*, 5> kGrapplerOptimizations = {
+    "pruning",
+    "function",
+    "shape",
+    "arithmetic",
+    "dependency"};
+
 // Parses a list of string optimizer configurations into a map from
 // optimizer name -> rewriter config for that optimizer.
 Status ToConfigMap(
@@ -80,13 +105,12 @@
   GrapplerItem optimized_item = item;
 
   // Perform optimizations in a meaningful order.
-  for (const auto& optimization :
-       {"noop_elimination", "shuffle_and_repeat_fusion", "map_fusion",
-        "filter_fusion", "filter_with_random_uniform_fusion",
-        "map_and_filter_fusion", "hoist_random_uniform", "map_parallelization",
-        "map_and_batch_fusion", "map_vectorization", "make_numa_aware",
-        "latency_all_edges", "make_sloppy", "pruning", "function", "shape",
-        "arithmetic", "dependency"}) {
+  for (const auto& optimization : kTFDataOptimizations) {
+    TF_RETURN_IF_ERROR(
+        ApplyOptimization(optimization, cluster, &optimized_item));
+  }
+
+  for (const auto& optimization : kGrapplerOptimizations) {
     TF_RETURN_IF_ERROR(
         ApplyOptimization(optimization, cluster, &optimized_item));
   }
diff --git a/tensorflow/core/grappler/optimizers/data/parallel_batch.cc b/tensorflow/core/grappler/optimizers/data/parallel_batch.cc
new file mode 100644
index 0000000..fbc69c3
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/parallel_batch.cc
@@ -0,0 +1,53 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/parallel_batch.h"
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Marks every BatchDatasetV2 and PaddedBatchDatasetV2 node in the item's graph
+// with the boolean attribute `parallel_copy = true`, counting one change per
+// rewritten node in `stats`. `cluster` is not used.
+Status ParallelBatch::OptimizeAndCollectStats(Cluster* cluster,
+                                              const GrapplerItem& item,
+                                              GraphDef* output,
+                                              OptimizationStats* stats) {
+  *output = item.graph;
+  // NOTE(review): `graph` is never referenced below. Confirm whether the view
+  // is constructed only for its side effects or can simply be removed.
+  MutableGraphView graph(output);
+
+  // Edit the nodes of the output copy in place; the input item is untouched.
+  for (NodeDef& node : *output->mutable_node()) {
+    if (node.op() == "BatchDatasetV2" || node.op() == "PaddedBatchDatasetV2") {
+      (*node.mutable_attr())["parallel_copy"].set_b(true);
+      stats->num_changes++;
+    }
+  }
+  return Status::OK();
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(ParallelBatch, "parallel_batch");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h b/tensorflow/core/grappler/optimizers/data/parallel_batch.h
similarity index 73%
rename from tensorflow/core/grappler/optimizers/data/make_numa_aware.h
rename to tensorflow/core/grappler/optimizers/data/parallel_batch.h
index 81dbb31..8fa6413 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
+++ b/tensorflow/core/grappler/optimizers/data/parallel_batch.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,20 +13,20 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_
 
 #include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MakeNumaAware : public TFDataOptimizerBase {
+class ParallelBatch : public TFDataOptimizerBase {
  public:
-  MakeNumaAware() = default;
-  ~MakeNumaAware() override = default;
+  ParallelBatch() = default;
+  ~ParallelBatch() override = default;
 
-  string name() const override { return "make_numa_aware"; }
+  string name() const override { return "parallel_batch"; }
 
   Status Init(
       const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
@@ -44,4 +44,4 @@
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_
diff --git a/tensorflow/core/grappler/optimizers/data/parallel_batch_test.cc b/tensorflow/core/grappler/optimizers/data/parallel_batch_test.cc
new file mode 100644
index 0000000..2572d1b
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/parallel_batch_test.cc
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/parallel_batch.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_test_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(ParallelBatch, Batch) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("batch_size", "Const", {}, {{"value", 5}, {"dtype", DT_INT32}}),
+       NDef("drop_remainder", "Const", {},
+            {{"value", false}, {"dtype", DT_BOOL}}),
+       NDef("batch", "BatchDatasetV2",
+            {"range", "batch_size", "drop_remainder"}, {})});
+  ParallelBatch optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  // Fatal check: the node's index is dereferenced on the following lines.
+  ASSERT_TRUE(graph_utils::ContainsGraphNodeWithName("batch", output));
+  const int index = graph_utils::FindGraphNodeWithName("batch", output);
+  EXPECT_TRUE(output.node(index).attr().at("parallel_copy").b());
+}
+
+TEST(ParallelBatch, PaddedBatch) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("batch_size", "Const", {}, {{"value", 5}, {"dtype", DT_INT32}}),
+       NDef("drop_remainder", "Const", {},
+            {{"value", false}, {"dtype", DT_BOOL}}),
+       NDef("batch", "PaddedBatchDatasetV2",
+            {"range", "batch_size", "drop_remainder"}, {})});
+  ParallelBatch optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  // Fatal check: the node's index is dereferenced on the following lines.
+  ASSERT_TRUE(graph_utils::ContainsGraphNodeWithName("batch", output));
+  const int index = graph_utils::FindGraphNodeWithName("batch", output);
+  EXPECT_TRUE(output.node(index).attr().at("parallel_copy").b());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc
index 3bb3d64..6e69037 100644
--- a/tensorflow/core/grappler/optimizers/data/rebatch.cc
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc
@@ -41,8 +41,9 @@
 namespace {
 
 constexpr char kCastOp[] = "Cast";
-constexpr char kRealDivOp[] = "RealDiv";
 constexpr char kConstOp[] = "Const";
+constexpr char kIdentityOp[] = "Identity";
+constexpr char kRealDivOp[] = "RealDiv";
 
 constexpr std::array<const char*, 5> kBatchDatasetOps = {
     "BatchDataset",
@@ -77,22 +78,27 @@
     "WindowDataset"
 };
 
-constexpr std::array<const char*, 3> kFuncDatasetOps = {
+constexpr std::array<const char*, 4> kFuncDatasetOps = {
+    "ExperimentalGroupByWindowDataset",
     "FlatMapDataset",
     "InterleaveDataset",
-    "ParallelInterleaveDatasetV2"
+    "ParallelInterleaveDatasetV2",
 };
 
+// Name of the function attr to rewrite for each op in kFuncDatasetOps.
+const std::map<string, const char*>* const kFuncDatasetOpFuncs =
+    new std::map<string, const char*>({
+        {"ExperimentalGroupByWindowDataset", "reduce_func"},
+        {"FlatMapDataset", "f"},
+        {"InterleaveDataset", "f"},
+        {"ParallelInterleaveDatasetV2", "f"}});
+
 constexpr std::array<const char*, 9> kSourceDatasetOps = {
-    "FixedLengthRecordDataset",
-    "FixedLengthRecordDatasetV2",
-    "GeneratorDataset",
-    "RangeDataset",
-    "SparseTensorsSliceDataset",
-    "TensorDataset",
-    "TensorSliceDataset",
-    "TextLineDataset",
-    "TFRecordDataset"
+    "FixedLengthRecordDataset",  "FixedLengthRecordDatasetV2",
+    "GeneratorDataset",          "RangeDataset",
+    "SparseTensorsSliceDataset", "TensorDataset",
+    "TensorSliceDataset",        "TextLineDataset",
+    "TFRecordDataset",
 };
 
 NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
@@ -135,14 +141,33 @@
   return false;
 }
 
+// Splits known batch (0th) dims by num_workers; rank-0 / -1 dims are kept.
+Status UpdateOutputShapes(const string& node_name, int64 num_workers,
+                          MutableGraphView* graph) {
+  NodeDef* node = graph->GetNode(node_name);
+  const bool is_identity = node->op() == kIdentityOp;
+  if (is_identity || !node->attr().count("output_shapes")) return Status::OK();
+  AttrValue& output_shapes = (*node->mutable_attr())["output_shapes"];
+  for (auto& shape : *output_shapes.mutable_list()->mutable_shape()) {
+    if (shape.dim_size() == 0 || shape.dim(0).size() < 0) continue;
+    shape.mutable_dim(0)->set_size(shape.dim(0).size() / num_workers);
+  }
+  return Status::OK();
+}
+
 // Given a "batch" dataset node, modifies the batch_size input to divide the
 // current batch size by num_workers.
 Status MutateBatchSize(const NodeDef& node, int64 num_workers,
                        MutableGraphView* graph) {
-  // TODO(rohanj): Fix up the output_shapes attribute as well. For this Dataset
-  // as well as all the downstream datasets.
-  // For all the batching datasets the batch_size is input number 1.
-  NodeDef* batch_size_node = graph_utils::GetInputNode(node, *graph, 1);
+  // For all the batching datasets the batch_size is input number 1 except for
+  // MapAndBatchDataset.
+  int64 batch_size_arg_index = 1;
+  if (node.op() == "ExperimentalMapAndBatchDataset") {
+    // For MapAndBatch we take the 3rd last input.
+    batch_size_arg_index = node.input_size() - 3;
+  }
+  NodeDef* batch_size_node =
+      graph_utils::GetInputNode(node, *graph, batch_size_arg_index);
   // By the time this optimization is run, the batch_size is computed and
   // is a constant.
   if (batch_size_node->op() != kConstOp) {
@@ -168,7 +193,7 @@
   // multiple nodes sharing the same batch size constant node. This is also
   // why we don't delete batch_size_node as well.
   TF_RETURN_IF_ERROR(graph->UpdateRegularFaninByPort(
-      node.name(), 1, {new_batch_size_node->name(), 0}));
+      node.name(), batch_size_arg_index, {new_batch_size_node->name(), 0}));
   return Status::OK();
 }
 
@@ -187,7 +212,8 @@
                            FunctionLibraryDefinition* flib,
                            MutableGraphView* graph) {
   if (IsDatasetNodeOfType(node, kBatchDatasetOps)) {
-    return MutateBatchSize(node, num_workers, graph);
+    TF_RETURN_IF_ERROR(MutateBatchSize(node, num_workers, graph));
+    TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph));
   } else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
     // For all multiple input datasets, all inputs are datasets themselves.
     for (int i = 0; i < node.input_size(); ++i) {
@@ -195,14 +221,17 @@
       TF_RETURN_IF_ERROR(
           RecursivelyHandleOp(*input_node, num_workers, flib, graph));
     }
+    TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph));
   } else if (IsDatasetNodeOfType(node, kPassThroughOps)) {
     // For all the dataset ops that are pass through, the input dataset is
     // input 0.
     NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
     TF_RETURN_IF_ERROR(
         RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+    TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph));
   } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
-    const string func_name = node.attr().at("f").func().name();
+    const string func_name =
+        node.attr().at(kFuncDatasetOpFuncs->at(node.op())).func().name();
     const FunctionDef* fdef = flib->Find(func_name);
     GrapplerFunctionItem f_item;
     TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
@@ -226,6 +255,7 @@
 
       // Replace optimized function with a new FunctionDef.
       TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func));
+      TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph));
     } else {
       VLOG(2) << "Failed to optimize dataset function. Error: "
               << s.error_message();
@@ -256,10 +286,10 @@
 
   FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
 
-  NodeDef sink_node;
-  TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node));
+  NodeDef* sink_node;
+  TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node));
   TF_RETURN_IF_ERROR(
-      RecursivelyHandleOp(sink_node, num_workers, &flib, &graph));
+      RecursivelyHandleOp(*sink_node, num_workers, &flib, &graph));
   *output->mutable_library() = flib.ToProto();
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/data/slack.cc b/tensorflow/core/grappler/optimizers/data/slack.cc
new file mode 100644
index 0000000..8096435
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/slack.cc
@@ -0,0 +1,100 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/slack.h"
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+constexpr char kRetValOp[] = "_Retval";  // Fetch op marking function items.
+
+}  // namespace
+
+// Sets `slack_period` on the first PrefetchDataset found walking back from the
+// single fetch node. Items whose fetch is a _Retval (functions) are untouched.
+Status Slack::OptimizeAndCollectStats(Cluster* cluster,
+                                      const GrapplerItem& item,
+                                      GraphDef* output,
+                                      OptimizationStats* stats) {
+  if (slack_period_ < 1)
+    return errors::InvalidArgument("Invalid `slack_period` parameter: ",
+                                   slack_period_);
+
+  *output = item.graph;
+  MutableGraphView graph(output);
+  for (const auto& fetch_name : item.fetch) {
+    // If the GrapplerItem is derived from a FunctionDef, we don't optimize it,
+    // because we only want to add slack to the prefetch on the main dataset
+    // pipeline.
+    auto fetch = graph.GetNode(fetch_name);
+    if (fetch == nullptr || fetch->op() == kRetValOp) {
+      // Heuristic: If the fetch nodes are Retval ops, this item is from a
+      // function.
+      return Status::OK();
+    }
+  }
+  if (item.fetch.size() != 1) {
+    return errors::InvalidArgument(
+        "Expected only one fetch node but there were ", item.fetch.size(), ": ",
+        absl::StrJoin(item.fetch, ", "));
+  }
+  // Walk the input pipeline backwards from the fetch node to find the last
+  // PrefetchDataset node in the pipeline.
+  // TODO(rachelim): This doesn't do the right thing when the "final" prefetch
+  // is nested under an interleave or flat_map. Make this work, similar to
+  // `auto_shard.cc` and `rebatch.cc`.
+  NodeDef* dataset_node = graph.GetNode(item.fetch.at(0));
+  // A null node means a dangling input: stop instead of dereferencing it.
+  while (dataset_node != nullptr) {
+    if (dataset_node->op() == "PrefetchDataset") {
+      // operator[] inserts the attr when absent and overwrites it otherwise.
+      (*dataset_node->mutable_attr())["slack_period"].set_i(slack_period_);
+      stats->num_changes++;
+      return Status::OK();
+    }
+    const bool has_upstream =
+        dataset_node->op() == "Identity" ||
+        (absl::EndsWith(dataset_node->op(), "Dataset") &&
+         dataset_node->input_size() > 0);
+    if (!has_upstream) break;
+    dataset_node = graph_utils::GetInputNode(*dataset_node, graph);
+  }
+  return Status::OK();
+}
+
+void Slack::Feedback(Cluster* cluster, const GrapplerItem& item,
+                     const GraphDef& optimize_output, double result) {
+  // Intentionally empty: this optimizer does nothing with feedback.
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(Slack, "slack");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/slack.h b/tensorflow/core/grappler/optimizers/data/slack.h
new file mode 100644
index 0000000..fcdc2e9
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/slack.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_
+
+#include "absl/strings/numbers.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimization sets the `slack_period` attr of the terminal
+// PrefetchDataset node in an input pipeline.
+class Slack : public TFDataOptimizerBase {
+ public:
+  Slack() = default;
+  ~Slack() override = default;
+
+  string name() const override { return "slack"; }
+  // Requires a "slack_period" parameter whose string value is an integer.
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    if (!config) return errors::InvalidArgument("Config parameter required.");
+    const auto it = config->parameter_map().find("slack_period");
+    if (it == config->parameter_map().end())
+      return errors::InvalidArgument("Missing `slack_period` parameter.");
+    if (!absl::SimpleAtoi(it->second.s(), &slack_period_)) {
+      return errors::InvalidArgument("Invalid `slack_period` parameter: ",
+                                     it->second.s());
+    }
+    return Status::OK();
+  }
+
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  int64 slack_period_ = -1;  // Invalid (< 1) until Init() stores a value.
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_
diff --git a/tensorflow/core/grappler/optimizers/data/slack_test.cc b/tensorflow/core/grappler/optimizers/data/slack_test.cc
new file mode 100644
index 0000000..6a6a2c4
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/slack_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/slack.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// Builds RangeDataset(0, 10, 1) -> PrefetchDataset(buffer_size 1) in `item`
+// and appends the prefetch node's name to `item->fetch`.
+void SetupGrapplerItem(GrapplerItem *item) {
+  MutableGraphView graph(&item->graph);
+
+  // Output signature (one scalar int64) shared by both dataset nodes.
+  AttrValue shapes, types;
+  SetAttrValue(std::vector<TensorShape>({{}}), &shapes);
+  SetAttrValue(std::vector<DataType>({DT_INT64}), &types);
+  std::vector<std::pair<string, AttrValue>> dataset_attrs;
+  dataset_attrs.emplace_back("output_shapes", shapes);
+  dataset_attrs.emplace_back("output_types", types);
+
+  NodeDef *start = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step = graph_utils::AddScalarConstNode<int64>(1, &graph);
+  std::vector<string> range_inputs = {start->name(), stop->name(),
+                                      step->name()};
+  NodeDef *range = graph_utils::AddNode("RangeDataset", "RangeDataset",
+                                        range_inputs, dataset_attrs, &graph);
+
+  NodeDef *buffer_size = graph_utils::AddScalarConstNode<int64>(1, &graph);
+  std::vector<string> prefetch_inputs = {range->name(), buffer_size->name()};
+  NodeDef *prefetch =
+      graph_utils::AddNode("PrefetchDataset", "PrefetchDataset",
+                           prefetch_inputs, dataset_attrs, &graph);
+  item->fetch.push_back(prefetch->name());
+}
+
+struct ParameterizedSlackTest
+    : ::testing::TestWithParam<std::tuple<string, int>> {};  // (text, expected)
+
+TEST_P(ParameterizedSlackTest, BasicTest) {
+  GrapplerItem item;
+  SetupGrapplerItem(&item);
+  // Param 0 is the "slack_period" string passed to Init() through the config.
+  Slack optimizer;
+  tensorflow::RewriterConfig_CustomGraphOptimizer config;
+  (*config.mutable_parameter_map())["slack_period"].set_s(
+      std::get<0>(GetParam()));
+  TF_ASSERT_OK(optimizer.Init(&config));
+  // Param 1 is the integer expected on the optimized PrefetchDataset node.
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("PrefetchDataset", output));
+  NodeDef optimized_prefetch_node =
+      output.node(graph_utils::FindGraphNodeWithOp("PrefetchDataset", output));
+  EXPECT_EQ(optimized_prefetch_node.attr().at("slack_period").i(),
+            std::get<1>(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(DifferentSlackEveryValues, ParameterizedSlackTest,
+                         ::testing::Values(std::make_tuple("1", 1),
+                                           std::make_tuple("8", 8)));
+
+TEST(SlackTest, TestFailWithoutInit) {
+  GrapplerItem item;
+  Slack optimizer;
+  GraphDef output;
+  Status result = optimizer.Optimize(nullptr, item, &output);
+  // Init() was never called, so slack_period_ is still its -1 default.
+  EXPECT_FALSE(result.ok());
+  EXPECT_TRUE(errors::IsInvalidArgument(result));
+}
+
+TEST(SlackTest, TestFailWithInvalidSlackEveryParam) {
+  GrapplerItem item;
+  SetupGrapplerItem(&item);
+  // "0" is a well-formed integer, so Init() itself is expected to succeed.
+  Slack optimizer;
+  tensorflow::RewriterConfig_CustomGraphOptimizer config;
+  (*config.mutable_parameter_map())["slack_period"].set_s("0");
+  TF_ASSERT_OK(optimizer.Init(&config));
+  // The failure is expected from the optimization step instead.
+  GraphDef output;
+  Status result = optimizer.Optimize(nullptr, item, &output);
+
+  EXPECT_FALSE(result.ok());
+  EXPECT_TRUE(errors::IsInvalidArgument(result));
+}
+
+TEST(SlackTest, TestFunctionNotOptimized) {
+  GrapplerFunctionItem item;
+  FunctionDefLibrary lib_def;
+  FunctionDef *fdef = lib_def.add_function();
+  fdef->mutable_signature()->set_name("nested_function");
+  auto *input_arg = fdef->mutable_signature()->add_input_arg();
+  input_arg->set_name("args_0");
+  input_arg->set_type(DT_INT64);
+  auto *output_arg = fdef->mutable_signature()->add_output_arg();
+  output_arg->set_name("identity");
+  output_arg->set_type(DT_VARIANT);
+  fdef->mutable_signature()->set_is_stateful(true);
+  // Body: args_0 -> TensorDataset -> PrefetchDataset -> Identity (returned).
+  AttrValue shapes_attr;
+  SetAttrValue(std::vector<TensorShape>({{}}), &shapes_attr);
+  AttrValue types_attr;
+  SetAttrValue(std::vector<DataType>({DT_INT64}), &types_attr);
+  NodeDef *tensor_dataset_node =
+      function_utils::AddNode("TensorDataset", "TensorDataset", {"args_0"},
+                              {std::make_pair("output_shapes", shapes_attr),
+                               std::make_pair("Toutput_types", types_attr)},
+                              fdef);
+  NodeDef *prefetch_node = function_utils::AddNode(
+      "PrefetchDataset", "PrefetchDataset",
+      {strings::StrCat(tensor_dataset_node->name(), ":handle:0"), "args_0"},
+      {std::make_pair("output_shapes", shapes_attr),
+       std::make_pair("output_types", types_attr)},
+      fdef);
+
+  AttrValue variant_type_attr;
+  SetAttrValue(DT_VARIANT, &variant_type_attr);
+  NodeDef *identity_node = function_utils::AddNode(
+      "Identity", "Identity",
+      {strings::StrCat(prefetch_node->name(), ":handle:0"),
+       strings::StrCat("^", tensor_dataset_node->name())},
+      {std::make_pair("T", variant_type_attr)}, fdef);
+  // Return the Identity output; keep TensorDataset as a control output.
+  (*fdef->mutable_ret())["identity"] =
+      strings::StrCat(identity_node->name(), ":output:0");
+  (*fdef->mutable_control_ret())[tensor_dataset_node->name()] =
+      tensor_dataset_node->name();
+  fdef->mutable_signature()->add_control_output(tensor_dataset_node->name());
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
+
+  TF_ASSERT_OK(
+      MakeGrapplerFunctionItem(*fdef, flib, /*graph_def_version=*/27, &item));
+  // A valid slack period (8) is configured; the attr must still read as 0.
+  GraphDef output;
+  Slack optimizer;
+  tensorflow::RewriterConfig_CustomGraphOptimizer config;
+  (*config.mutable_parameter_map())["slack_period"].set_s("8");
+  TF_ASSERT_OK(optimizer.Init(&config));
+
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("PrefetchDataset", output));
+  NodeDef optimized_prefetch_node =
+      output.node(graph_utils::FindGraphNodeWithOp("PrefetchDataset", output));
+  // Should not set slack for function items.
+  EXPECT_EQ(optimized_prefetch_node.attr().at("slack_period").i(), 0);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 1788fb9..3541613 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -295,6 +295,7 @@
     }
     node->set_op("NoOp");
     node->clear_attr();
+    DedupControlInputs(node);
     nodes_to_simplify->PushBack(node_to_idx_[node]);
     return;
   }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 5c51972..630fcde 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -28,9 +28,9 @@
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/lower_functional_ops.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/lower_if_op.h"
+#include "tensorflow/core/common_runtime/lower_while_op.h"
 #include "tensorflow/core/common_runtime/placer.h"
-#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -39,59 +39,26 @@
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/grappler/graph_topology_view.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
-#include "tensorflow/core/grappler/utils/topological_sort.h"
-#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-// WARNING: Code in this file implicitly assumes that function input and output
-// arguments are plain tensors (tensor lists are not supported). Function inputs
-// and outputs are always expanded to a single placeholder or output tensor.
-// With this assumption, the calling node's input/output ports always match
-// function input/output arguments.
-//
-// This is guaranteed by the implementation of MakeGrapplerFunctionItem.
+constexpr const char* const kFuncAttr = FunctionLibraryDefinition::kFuncAttr;
 
 // Mark functions that were created as a result of function specialization.
-constexpr char kGrapplerSpecializedFuncAttr[] = "_GrapplerSpecializedFunc";
-
-// Name of the attribute that defines the function for indirect function calls.
-constexpr char kFuncAttrName[] = "f";
-
-constexpr char kNoInlineAttr[] = "_noinline";
-
-// Name of the node that will have control edges from function input nodes, and
-// also used as a new destination for incoming control edges.
-constexpr char kInputsReadyNodeName[] = "inputs_ready";
-
-// Name of the node that will have control edges from function control output
-// nodes, and also used as a new source of outgoing control edges. This node
-// will guarantee that all side-effects inside function body will be executed
-// after function inlining.
-constexpr char kSideEffectsExecutedNodeName[] = "side_effects_executed";
-
-bool AttrIsTrue(const FunctionDef& func, const string& attr) {
-  return func.attr().count(attr) != 0 && func.attr().at(attr).b();
-}
-
-bool MarkedSpecialized(const FunctionDef& func) {
-  return AttrIsTrue(func, kGrapplerSpecializedFuncAttr);
-}
-
-bool MarkedNoInline(const FunctionDef& func) {
-  return AttrIsTrue(func, kNoInlineAttr);
-}
+constexpr const char* const kGrapplerSpecializedFuncAttr =
+    "_GrapplerSpecializedFunc";
 
 // There are two ways of calling a Tensorflow function:
 //
@@ -113,7 +80,7 @@
     return false;
   }
 
-  auto* func_attr = AttrSlice(func_node).Find(kFuncAttrName);
+  auto* func_attr = AttrSlice(func_node).Find(kFuncAttr);
   return func_attr != nullptr && func_attr->has_func() &&
          func_attr->func().name() == func.signature().name();
 }
@@ -124,7 +91,7 @@
     return AttrSlice(func_node);
 
   } else if (IsIndirectFunctionCall(func, func_node)) {
-    auto* func_attr = AttrSlice(func_node).Find(kFuncAttrName);
+    auto* func_attr = AttrSlice(func_node).Find(kFuncAttr);
     return AttrSlice(&func_attr->func().attr());
 
   } else {
@@ -292,52 +259,21 @@
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
   }
-
-  FunctionLibraryDefinition* mutable_function_library() {
-    return &function_library_;
-  }
-
-  FunctionLibraryRuntime* mutable_function_library_runtime() {
-    InitializeFunctionLibraryRuntime();
-    return flr_;
-  }
+  FunctionLibraryDefinition& function_library() { return function_library_; }
 
   const absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
   tensor_mapping() const {
     return tensor_mapping_;
   }
 
-  const absl::flat_hash_map<string, std::vector<string>>& control_overrides()
-      const {
-    return control_overrides_;
-  }
-
   const GraphView& graph_view() const { return graph_view_; }
 
-  const DeviceSet* devices() const {
-    // Create fake devices lazily only if we need a DeviceSet.
-    if (available_devices_.empty() && !item_->devices().empty()) {
-      for (const string& name : item_->devices()) {
-        auto device = absl::make_unique<FakeDevice>(name);
-        available_device_set_.AddDevice(device.get());
-        available_devices_.push_back(std::move(device));
-      }
-    }
-    return &available_device_set_;
-  }
-
   bool IsFetchNode(const string& node_name) const {
     return absl::c_any_of(item_->fetch, [&](const string& fetch) {
       return ParseTensorName(fetch).node() == node_name;
     });
   }
 
-  bool IsKeepOp(const string& node_name) const {
-    return absl::c_any_of(item_->keep_ops, [&](const string& keep_node) {
-      return keep_node == node_name;
-    });
-  }
-
   bool IsTrulyConst(const string& name) const {
     return TrulyConstNode(name) != nullptr;
   }
@@ -381,17 +317,6 @@
     }
   }
 
-  void AddControlOverrides(const NodeDef& func_node,
-                           const std::vector<string>& control_overrides) {
-    VLOG(4) << "Add control overrides: from=" << func_node.name() << " to: ["
-            << absl::StrJoin(control_overrides, ", ") << "]";
-
-    control_overrides_[func_node.name()].reserve(control_overrides.size());
-    for (const string& control_override : control_overrides) {
-      control_overrides_[func_node.name()].push_back(control_override);
-    }
-  }
-
  private:
   static absl::flat_hash_map<string, const NodeDef*> InferTrulyConstNodes(
       const GrapplerItem& item, const GraphDef& graph) {
@@ -410,39 +335,12 @@
     return const_nodes;
   }
 
-  void InitializeFunctionLibraryRuntime() {
-    if (!flr_) {
-      Env* env = Env::Default();
-      std::vector<std::unique_ptr<Device>> devices;
-      devices.push_back(absl::make_unique<FakeDevice>(env, "/device:CPU:0"));
-      device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
-      OptimizerOptions optimizer_opts;
-      optimizer_opts.set_do_function_inlining(true);
-      process_flr_.reset(new ProcessFunctionLibraryRuntime(
-          device_mgr_.get(), env, item_->graph.versions().producer(),
-          &function_library_, optimizer_opts));
-      flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
-    }
-  }
-
   const GrapplerItem* item_;  // must outlive this object
   RewriterConfig::Toggle opt_level_;
 
   // Function library constructed from current graph.
   FunctionLibraryDefinition function_library_;
 
-  // These fields initialized lazily only if needed.
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
-  FunctionLibraryRuntime* flr_ = nullptr;
-
-  // List of available `FakedDevices` (lazily initialized, see devices()).
-  mutable std::vector<std::unique_ptr<Device>> available_devices_;
-
-  // DeviceSet of fake devices (`FakeDevice`) constructed from
-  // item_.devices() (lazily initialized).
-  mutable DeviceSet available_device_set_;
-
   // Nodes that are Const and not in feed.
   absl::flat_hash_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
@@ -450,24 +348,15 @@
                       const FunctionSpecialization>
       specialized_functions_;
 
-  // After function inlining and specialization, the optimized graph might be in
-  // invalid state, nodes can read from non-existing function call nodes that
-  // were inlined, or they can read from output index that is no longer valid
-  // after unused outputs pruning.
+  // After function specialization, the optimized graph might be in an invalid
+  // state: nodes can read from an output index that is no longer valid after
+  // unused outputs pruning.
   //
   // Tensor mapping that has to be applied to the graph after all functions
   // optimizations (invalidated tensor id -> optimized graph tensor id).
   absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
       tensor_mapping_;
 
-  // When we inline a function into the optimized graph, we no longer have the
-  // function call node to anchor control dependencies. Instead we must expand
-  // each function call control output edge into multiple control dependencies
-  // to all side-effectful ops inside the function body.
-  //
-  // Invalidated function call node name -> Inlined side-effectful nodes
-  absl::flat_hash_map<string, std::vector<string>> control_overrides_;
-
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
 
@@ -594,11 +483,10 @@
   // Keep only non-const inputs.
   std::vector<string> keep_inputs;
   const auto& inputs = specialized_func_node->input();
-  std::copy_if(inputs.begin(), inputs.end(), std::back_inserter(keep_inputs),
-               [&](const string& input) {
-                 return specialization.const_inputs.find(input) ==
-                        specialization.const_inputs.end();
-               });
+  absl::c_copy_if(inputs, std::back_inserter(keep_inputs),
+                  [&](const string& input) {
+                    return !specialization.const_inputs.contains(input);
+                  });
 
   specialized_func_node->clear_input();
   for (const auto& keep : keep_inputs) specialized_func_node->add_input(keep);
@@ -612,7 +500,7 @@
     }
 
     for (const string& ctrl : specialization.control_deps) {
-      if (existing_control_deps.find(ctrl) == existing_control_deps.end()) {
+      if (!existing_control_deps.contains(ctrl)) {
         VLOG(3) << "Forward control dependency: input=" << ctrl;
         specialized_func_node->add_input(ctrl);
       }
@@ -640,8 +528,7 @@
     const string& input = func_node.input(i);
     if (IsControlInput(input)) break;
 
-    if (specialization.const_inputs.find(input) ==
-        specialization.const_inputs.end()) {
+    if (!specialization.const_inputs.contains(input)) {
       DataType dt = tin->list().type(i);
       (*attr)["Tin"].mutable_list()->add_type(dt);
     }
@@ -665,8 +552,7 @@
 
   // Keep output types of active outputs only.
   for (int i = 0; i < tout->list().type_size(); ++i) {
-    if (specialization.active_outputs.find(i) !=
-        specialization.active_outputs.end()) {
+    if (specialization.active_outputs.contains(i)) {
       DataType dt = tout->list().type(i);
       (*attr)["Tout"].mutable_list()->add_type(dt);
     }
@@ -682,7 +568,7 @@
 
   } else if (IsIndirectFunctionCall(func, func_node)) {
     auto* attr = specialized_func_node->mutable_attr();
-    (*attr)[kFuncAttrName].mutable_func()->set_name(specialized_func_name);
+    (*attr)[kFuncAttr].mutable_func()->set_name(specialized_func_name);
 
   } else {
     return errors::InvalidArgument("Unknown function call site");
@@ -852,8 +738,7 @@
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
-  TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library()->AddFunctionDef(specialized_func));
+  TF_RETURN_IF_ERROR(ctx->function_library().AddFunctionDef(specialized_func));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -873,1125 +758,590 @@
 }
 
 // -------------------------------------------------------------------------- //
-// Inline direct functions calls.
+// Inline function calls into a graph using function inlining implementation
+// from common_runtime:
 //
-// When we inline direct function calls, we instantiate the function body from
-// its FunctionDef and caller node attributes, and embed the instantiated graph
-// into the "main graph". When we do that, we must preserve the function call
-// semantics:
-//
-// 1) All input nodes must be executed before any of function body nodes will
-//    start executing.
-// 2) All function body nodes must be executed before any of the nodes, reading
-//    outputs of the function will start executing.
-// 3) All nodes with side effects inside a function must be executed, this is
-//    different from the nodes with side effects in the main graph, that can be
-//    pruned if they are not in transitive dependency set of any of the fetch
-//    nodes.
-// 4) All nodes of the function body must be execute on the device specified by
-//    the function caller node.
-//
-// To guarantee that function call semantics are preserved after inlining, we
-// insert an IdentityN node before the inlined function body, and hook all
-// inputs into that, and we insert another IdentityN node to hook all function
-// outputs to it.
+// 1) Convert GraphDef to Graph.
+// 2) Inline function calls.
+// 3) Convert Graph back to the GraphDef.
 
-// Returns `Status::OK()` iff `node` is a direct function call of `func`, and we
-// know how to inline it into the main graph, otherwise returns and error
-// indicating why the function call is not inlinable.
-Status IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
-                                     const FunctionDef& func,
-                                     const NodeDef& func_node) {
-  // Indirect function calls (PartitionedCallOp) have automatic control
-  // dependencies and inlined separately from direct function calls.
-  if (!IsDirectFunctionCall(func, func_node)) {
-    return errors::InvalidArgument("Unsupported function call type: ",
-                                   SummarizeNodeDef(func_node));
-  }
+constexpr const char* const kLowerUsingSwitchMergeAttr =
+    LowerFunctionalOpsPass::kLowerUsingSwitchMergeAttr;
+constexpr const char* const kLowerAsMultiDeviceFunctionAttr =
+    LowerFunctionalOpsPass::kLowerAsMultiDeviceFunctionAttr;
 
-  // For direct function  calls we insert IdentityN nodes before/after inlined
-  // function body to preserve function call semantics (all inputs evaluated
-  // before function evaluation starts, and all function body nodes finished
-  // before output consumed by other nodes).
-  if (func.signature().input_arg_size() == 0) {
-    return errors::FailedPrecondition(
-        "Can't inline direct function call with empty inputs: ",
-        SummarizeNodeDef(func_node));
-  }
+using KeepCallerNode = InlineFunctionBodyOptions::KeepCallerNode;
+using OutputControlSource = InlineFunctionBodyOptions::OutputControlSource;
 
-  // TODO(ezhulenev): Relax constraint on output args?
-  if (func.signature().output_arg_size() == 0) {
-    return errors::FailedPrecondition(
-        "Can't inline direct function call with empty outputs: ",
-        SummarizeNodeDef(func_node));
-  }
-
-  // Function must execute all the nodes in a function body that might have side
-  // effects. After inlining these nodes into the main graph, we can no longer
-  // guarantee that. For now we disable inlining functions with side effects.
-  //
-  // Attaching control dependency to the output IdentityN node is not safe,
-  // because it might be split or pruned in a later optimization pass.
-  //
-  // Indirect function calls (via PartitionedCallOp) have automatic dependency
-  // tracking, and allow us to safely inline functions with side effects.
-  bool has_side_effects =
-      absl::c_any_of(func.node_def(), [&ctx](const NodeDef& node) {
-        return !IsFreeOfSideEffect(node, &ctx.function_library());
-      });
-  if (has_side_effects) {
-    return errors::FailedPrecondition(
-        "Can't inline function with side-effects in the function body: ",
-        SummarizeNodeDef(func_node));
-  }
-
-  // We ignore `_noinline` marker in aggressive mode.
-  bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
-  if (MarkedNoInline(func) && !aggressive) {
-    return errors::FailedPrecondition(
-        "Can't inline function marked with '_noinline': ",
-        SummarizeNodeDef(func_node));
-  }
-
-  // Function specialization and inlining must be mutually exclusive.
-  if (MarkedSpecialized(func)) {
-    return errors::FailedPrecondition(
-        "Can't inline function created in Grappler function specialization: ",
-        SummarizeNodeDef(func_node));
-  }
-
-  return Status::OK();
+// Checks if boolean attribute is defined and its value is 'true'.
+bool CheckBoolAttr(const Node* n, absl::string_view attr_name) {
+  bool match;
+  Status s = GetNodeAttr(n->attrs(), attr_name, &match);
+  return s.ok() && match;
 }
 
-// Create an IdentityN node to hook the function inputs to: this ensures that
-// they're all evaluated before the evaluation of the function body starts.
-NodeDef InlinedFunctionInputsNode(const NodeDef& func_node,
-                                  const GrapplerFunctionItem& item) {
-  NodeDef inputs;
-  inputs.set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
-  inputs.set_op("IdentityN");
-  inputs.set_device(func_node.device());
-  *inputs.mutable_input() = func_node.input();
-  AttrValue::ListValue* type_list =
-      (*inputs.mutable_attr())["T"].mutable_list();
-
-  for (const InputArgInstantiation& input_arg : item.inputs()) {
-    type_list->add_type(input_arg.data_type);
-  }
-
-  return inputs;
+// Checks if string attribute is defined and it's not empty.
+bool CheckStringAttr(const Node* n, absl::string_view attr_name) {
+  string match;
+  Status s = GetNodeAttr(n->attrs(), attr_name, &match);
+  return s.ok() && !match.empty();
 }
 
-// Create an IdentityN node to hook the function outputs to: this ensures that
-// the function body is fully evaluated before its fanout gets scheduled.
-NodeDef InlinedFunctionOutputsNode(
-    const NodeDef& func_node, const GrapplerFunctionItem& item,
-    const absl::flat_hash_map<absl::string_view, absl::string_view>
-        output_tensors) {
-  NodeDef outputs;
-  outputs.set_name(func_node.name());
-  outputs.set_op("IdentityN");
-  outputs.set_device(func_node.device());
-  AttrValue::ListValue* type_list =
-      (*outputs.mutable_attr())["T"].mutable_list();
-
-  for (const OutputArgInstantiation& output_arg : item.outputs()) {
-    const absl::string_view output_tensor =
-        output_tensors.at(output_arg.node_name);
-    type_list->add_type(output_arg.data_type);
-    outputs.add_input(strings::StrCat(func_node.name(), "/", output_tensor));
-  }
-
-  return outputs;
+bool LowerUsingSwitchMergeIsOn(const Node* n) {
+  return CheckBoolAttr(n, kLowerUsingSwitchMergeAttr);
 }
 
-Status InlineDirectFunctionCall(const NodeDef& func_node,
-                                const FunctionDef& func,
-                                const FunctionOptimizerContext& ctx,
-                                GraphDef* optimized_graph) {
-  VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
-  TF_RETURN_IF_ERROR(IsInlinableDirectFunctionCall(ctx, func, func_node));
+bool LowerAsMultiDeviceFunctionIsOn(const Node* n) {
+  return CheckBoolAttr(n, kLowerAsMultiDeviceFunctionAttr);
+}
 
-  const AttrSlice func_instantiation_attr =
-      FunctionInstantiationAttributes(func, func_node);
+bool MarkedForTpuCompilation(const Node* n) {
+  static constexpr const char* const kTpuReplicateAttr = "_tpu_replicate";
+  return CheckStringAttr(n, kTpuReplicateAttr);
+}
 
-  GrapplerFunctionItem item;
-  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
-                                                ctx.function_library(),
-                                                ctx.graph_version(), &item);
+bool MarkedForXlaCompilation(const Node* n) {
+  static constexpr const char* const kXlaClusterAttr = "_xla_compile_id";
+  return CheckStringAttr(n, kXlaClusterAttr);
+}
 
-  if (!item_status.ok()) {
-    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
-                                   " instantiated by ", func_node.name(),
-                                   ". Error: ", item_status.error_message());
-  }
+// Validates that all side effects inside function body will be executed after
+// function inlining. We do it by looking for a path from stateful ops, to one
+// of the output control sources.
+//
+// When a function is run via FunctionLibraryRuntime, we do not have to check
+// this, because `PruneFunctionBody` has special pruning rules for stateful ops.
+Status ValidateSideEffectsExecution(
+    const FunctionBody& fbody, OutputControlSource output_control_source,
+    bool has_outgoing_control_edges,
+    bool validate_outgoing_control_edge = true) {
+  // ReadVariableOp is marked as stateful because it consumes DT_RESOURCE, but
+  // it can't generate any observable side-effect.
+  static constexpr const char* const kReadVariableOp = "ReadVariableOp";
 
-  // Mapping from input arg node name to function input position.
-  absl::flat_hash_map<absl::string_view, int> input_args_idx;
-  for (const InputArgInstantiation& input_arg : item.inputs()) {
-    const int idx = input_args_idx.size();
-    input_args_idx[input_arg.node_name] = idx;
-  }
+  // Find all nodes that can produce side effects in the function body graph. We
+  // use 'is_stateful()' bit as an approximation of "has side effects" property.
+  std::vector<const Node*> fbody_side_effects;
+  absl::c_copy_if(fbody.graph->nodes(), std::back_inserter(fbody_side_effects),
+                  [](const Node* n) {
+                    return n->op_def().is_stateful() && !n->IsArg() &&
+                           !n->IsRetval() &&
+                           n->type_string() != kReadVariableOp;
+                  });
 
-  // Mapping from the '_Retval' node name to the output tensor.
-  absl::flat_hash_map<absl::string_view, absl::string_view> output_tensors;
-  for (const NodeDef& func_body_node : item.function_body().node()) {
-    if (!IsRetval(func_body_node)) continue;
-    if (func_body_node.input_size() != 1) {
-      return errors::Internal("_Retval node must have single input: ",
-                              SummarizeNodeDef(func_body_node));
-    }
-    output_tensors.emplace(func_body_node.name(), func_body_node.input(0));
-  }
-
-  // Hook inlined function inputs to IdentityN node.
-  NodeDef* func_inputs = optimized_graph->add_node();
-  *func_inputs = InlinedFunctionInputsNode(func_node, item);
-
-  for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
-    const string& node_name = func_body_node.name();
-
-    // Skip function output nodes.
-    if (IsRetval(func_body_node)) continue;
-
-    // Turn _Arg nodes added in place of input arguments into identity nodes.
-    const auto input_arg_idx = input_args_idx.find(node_name);
-    if (input_arg_idx != input_args_idx.end()) {
-      CHECK_EQ(0, func_body_node.input_size());
-      func_body_node.set_op("Identity");
-      func_body_node.mutable_attr()->erase("index");
-      func_body_node.mutable_attr()->erase("shape");
-      func_body_node.add_input(
-          strings::StrCat(func_inputs->name(), ":", input_arg_idx->second));
+  // When the graph is executed in a TF-2.0 context with automatic control
+  // dependencies tracking, the absence of an outgoing control edge indicates
+  // that no one is interested in observing side effects, so it is safe to
+  // inline the function body, even if some side-effects will not be executed.
+  if (!fbody_side_effects.empty() && !has_outgoing_control_edges) {
+    const string error_message =
+        "Can't guarantee execution of function side-effects after inlining. "
+        "Function call node has no outgoing control edges.";
+    if (validate_outgoing_control_edge) {
+      return errors::Internal(error_message);
     } else {
-      // Update the input names if any.
-      for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
-      }
-      // If the node has no input, make hook it up to the func_inputs node to
-      // ensure it runs in the same frame as the other nodes of the function
-      // body.
-      if (func_body_node.input_size() == 0) {
-        *func_body_node.add_input() = AsControlDependency(func_inputs->name());
-      }
-    }
-
-    // Add the function node name as a prefix 1) to node name to avoid
-    // collisions; 2) to frame name to avoid multiple LoopCond nodes in one
-    // frame after inlining.
-    const string prefix = strings::StrCat(func_node.name(), "/");
-    TF_RETURN_IF_ERROR(
-        AddPrefixAndSuffixToNode(prefix, "" /* suffix */, &func_body_node));
-
-    // Make sure the node is placed.
-    func_body_node.set_device(func_node.device());
-
-    // Move the node to the main graph.
-    optimized_graph->add_node()->Swap(&func_body_node);
-  }
-
-  DCHECK(output_tensors.size() == item.output_size())
-      << "Each function output must be mapped to an output tensor";
-
-  // Hook inlined function outputs to IdentityN node.
-  NodeDef* func_outputs = optimized_graph->add_node();
-  *func_outputs = InlinedFunctionOutputsNode(func_node, item, output_tensors);
-
-  return Status::OK();
-}
-
-Status InlineSymbolicGradient(const NodeDef& node,
-                              FunctionOptimizerContext* ctx,
-                              GraphDef* optimized_graph) {
-  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
-
-  GraphDef graph_def;
-
-  // Create a node to anchor the gradient inputs
-  NodeDef* inlined_input = graph_def.add_node();
-  inlined_input->set_name("FunctionInputs");
-  inlined_input->set_op("IdentityN");
-  AttrValue::ListValue* type_list =
-      (*inlined_input->mutable_attr())["T"].mutable_list();
-  for (const auto& type : node.attr().at("Tin").list().type()) {
-    type_list->add_type(static_cast<DataType>(type));
-  }
-
-  // Add the gradient node
-  NodeDef* inlined = graph_def.add_node();
-  *inlined = node;
-  inlined->clear_input();
-  for (int i = 0; i < node.attr().at("Tin").list().type_size(); ++i) {
-    inlined->add_input(strings::StrCat(inlined_input->name(), ":", i));
-  }
-
-  // Create a node to anchor the gradient outputs
-  NodeDef* inlined_output = graph_def.add_node();
-  inlined_output->set_name("FunctionOutputs");
-  inlined_output->set_op("IdentityN");
-  type_list = (*inlined_output->mutable_attr())["T"].mutable_list();
-  for (const auto& type : node.attr().at("Tout").list().type()) {
-    type_list->add_type(static_cast<DataType>(type));
-  }
-  for (int i = 0; i < node.attr().at("Tout").list().type_size(); ++i) {
-    inlined_output->add_input(strings::StrCat(inlined->name(), ":", i));
-  }
-
-  // Convert the graphdef to a graph
-  GraphConstructorOptions graph_ctor_opts;
-  graph_ctor_opts.allow_internal_ops = true;
-  graph_ctor_opts.expect_device_spec = false;
-  Graph graph(ctx->function_library());
-  TF_RETURN_IF_ERROR(
-      ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
-
-  FunctionLibraryRuntime* flr = ctx->mutable_function_library_runtime();
-
-  // 1. Inline symbolic gradient node.
-  const bool expanded = ExpandInlineFunctions(flr, &graph);
-  if (!expanded) {
-    return errors::Internal("Failed to expand SymbolicGradient op");
-  }
-
-  // TODO(ezhulenev): InlineFunctionBody in common_runtime/function silently
-  // fails to inline function into the graph, and leaves the graph unmodified.
-  // We check that graph has our symbolic gradient inlined, otherwise we return
-  // a error.
-  const auto is_symbolic_gradient_op = [&](const Node* node) {
-    return node->name() == inlined->name() &&
-           node->type_string() == "SymbolicGradient";
-  };
-  for (Node* node : graph.nodes()) {
-    if (is_symbolic_gradient_op(node)) {
-      return errors::Internal("Failed to inline symbolic gradient node: ",
-                              SummarizeNode(*node));
+      VLOG(3) << error_message;
     }
   }
 
-  // 2. Recursively inline nested function calls.
-  int iteration = 0;
-  while (ExpandInlineFunctions(flr, &graph)) {
-    if (++iteration >= 50) {
-      VLOG(2) << "Break symbolic gradient inlining loop at iteration #"
-              << iteration;
-      break;
-    }
+  // Find all nodes in the function body that will be used as control sources.
+  absl::flat_hash_set<const Node*> control_sources;
+  if (output_control_source == OutputControlSource::kDataOutputs) {
+    control_sources = {fbody.ret_nodes.begin(), fbody.ret_nodes.end()};
+  } else if (output_control_source == OutputControlSource::kControlOutputs) {
+    control_sources = {fbody.control_ret_nodes.begin(),
+                       fbody.control_ret_nodes.end()};
   }
 
-  GraphDef inlined_graph_def;
-  graph.ToGraphDef(&inlined_graph_def);
-
-  // Add the default values of attributes to the nodes that have been inlined.
-  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def,
-                                               *graph.op_registry(), 0, true));
-
-  // Add the inlined nodes to the graph
-  for (NodeDef& inlined_node : *inlined_graph_def.mutable_node()) {
-    if (inlined_node.name() == "FunctionOutputs") {
-      inlined_node.set_name(node.name());
-      for (int i = 0; i < inlined_node.input_size(); ++i) {
-        inlined_node.set_input(
-            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
-      }
-    } else if (inlined_node.name() == "FunctionInputs") {
-      inlined_node.set_name(
-          AddPrefixToNodeName(inlined_node.name(), node.name()));
-      inlined_node.clear_input();
-      for (int i = 0; i < node.input_size(); ++i) {
-        inlined_node.add_input(node.input(i));
-      }
-    } else {
-      inlined_node.set_name(
-          AddPrefixToNodeName(inlined_node.name(), node.name()));
-      for (int i = 0; i < inlined_node.input_size(); ++i) {
-        inlined_node.set_input(
-            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
-      }
-      // If the node has no input, hook it up to the function input node to make
-      // sure it runs in the same frame as the other nodes of the function body.
-      if (inlined_node.input_size() == 0) {
-        *inlined_node.add_input() = AsControlDependency(
-            AddPrefixToNodeName("FunctionInputs", node.name()));
-      }
-    }
-    inlined_node.set_device(node.device());
-    optimized_graph->add_node()->Swap(&inlined_node);
-  }
-
-  return Status::OK();
-}
-
-// -------------------------------------------------------------------------- //
-// Inline indirect functions calls (aka PartitionedCallOp).
-//
-// When we inline indirect function calls, we instantiate the function body from
-// its FunctionDef and caller node attributes, and embed the instantiated graph
-// into the "main graph".
-//
-// In contrast to direct function calls, `PartitionedCallOp` has automatic
-// dependency tracking via input/output control edges, and we relax some of the
-// constraints that we have for direct function call inlining.
-//
-// Automatic control dependency rules:
-//
-// 1) "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data
-//    type) input argument it "captures" the mutable resource.  This is
-//    implemented by automatically adding a incoming control edge from the
-//    previous side-effectful op touching that resource, and an outgoing control
-//    edge to the next side-effectful op using the same resource. This
-//    serializes the mutations of the resource to make graph execution
-//    deterministic.
-//
-// 2) All stateful ops inside a function body are guaranteed to execute in
-//    program order, this is achieved by adding control edges between stateful
-//    ops at graph construction time.
-//
-// 3) Furthermore, all ops accepting the same resource as an input are
-//    guaranteed to run in program order. This is also done by adding control
-//    edges at graph construction time. The last op touching the resource
-//    will have an outgoing control edge to all function return nodes, which
-//    will guarantee that all side effects to the resource will happen before
-//    function completion.
-//
-// Function call inlining must preserve side effect visibility:
-//
-// 1) All side effects to the captured resources, that happened before function
-//    call must be visible to the function body nodes using that resources.
-// 2) All side effects to the captured resources, that happened inside function
-//    body, must be visible to every op/function using that resource after the
-//    function call completed.
-//
-// To guarantee that these properties are preserved after inlining we:
-//
-// 1) Create "input_control" NoOp. Function call node incoming control edges
-//    will be forwarded *to* this node. Function inputs (Identity nodes) will
-//    have a control edge *from* this node. If function has no inputs, by
-//    construction it must have nodes without inputs in the function body, and
-//    in this case these nodes will have a control edge *from* this node.
-
-// 2) Create "output_control" NoOp. All nodes that have incoming control edge
-//    *from* the function call node, will be forwarded to this node. Function
-//    outputs (Identity nodes) will have a control edge *to* this node. This
-//    will guarantee that nodes that have control dependency on the function
-//    call, will observe all side-effects (guaranteed by graph construction with
-//    automatic control dependencies tracking).
-//
-// If after function instantiation we find a stateful or a dataset op inside
-// the function body, that is not reachable from any of the function outputs (or
-// if the function has no outputs), we do not inline it, because we can't
-// guarantee that these nodes will be executed in correct order (or executed at
-// all) after inlining.
-//
-// We do not try to add any extra control edges to make sure that all
-// side-effectful nodes will be executed, that should be handled at graph
-// construction time.
-
-struct MaybeDeadOutput {
-  const NodeDef* dead_tensor_src;
-  const NodeDef* output_node_dst;
-};
-
-// Finds all function outputs that might return a dead tensor. This can happen
-// if there is no `Merge` node on the path from the `Switch` node, to the
-// function output.
-Status MaybeDeadOutputs(const FunctionOptimizerContext& ctx,
-                        const GrapplerFunctionItem& item,
-                        std::vector<MaybeDeadOutput>* maybe_dead) {
-  VLOG(3) << "Find function outputs that might return dead tensors: item.id="
-          << item.id;
-  DCHECK(maybe_dead->empty()) << "Input argument must be an empty vector";
-
-  std::vector<const NodeDef*> dead_tensor_srcs;
-  for (const NodeDef& node : item.graph.node()) {
-    if (IsSwitch(node)) {
-      VLOG(4) << "Add dead tensors source. Switch node: " << node.name();
-      dead_tensor_srcs.push_back(&node);
-      continue;
-    }
-
-    // Regular (aka 'direct') function call can also produce dead tensors if
-    // the function body has mergeless switches.
-    const FunctionDef* func = ctx.function_library().Find(node.op());
-    if (func != nullptr) {
-      GrapplerFunctionItem func_item;
-      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
-          *func, FunctionInstantiationAttributes(*func, node),
-          ctx.function_library(), ctx.graph_version(), &func_item));
-
-      std::vector<MaybeDeadOutput> func_dead_outputs;
-      TF_RETURN_IF_ERROR(MaybeDeadOutputs(ctx, func_item, &func_dead_outputs));
-
-      if (!func_dead_outputs.empty()) {
-        VLOG(4) << "Add dead tensors source. Function call: " << node.op()
-                << " node=" << node.name();
-        dead_tensor_srcs.push_back(&node);
-      }
-    }
-  }
-
-  // If we do not have dead tensor sources in the function body, it's
-  // guaranteed that all output tensors can't become dead.
-  if (dead_tensor_srcs.empty()) return Status::OK();
-
-  // Names of the function body nodes that return function output values.
-  absl::flat_hash_set<absl::string_view> output_nodes;
-  for (const auto& output_arg : item.outputs()) {
-    output_nodes.insert(output_arg.node_name);
-  }
-
-  GraphTopologyView topology_view;
-  TF_RETURN_IF_ERROR(topology_view.InitializeFromGraph(item.graph));
-
-  for (const NodeDef* dead_tensor_src : dead_tensor_srcs) {
-    DfsTraversal(topology_view, {dead_tensor_src},
-                 TraversalDirection::kFollowOutputs,
-                 // Stop traversal when reached first `Merge` node.
-                 DfsPredicates::Advance(
-                     [](const NodeDef* node) { return !IsMerge(*node); }),
-                 // If we reached output node, add MaybeDeadOutput edge.
-                 DfsCallbacks::PreOrder([&](const NodeDef* node) {
-                   if (output_nodes.find(node->name()) != output_nodes.end()) {
-                     maybe_dead->push_back({dead_tensor_src, node});
-                   }
-                 }));
-  }
-
-  return Status::OK();
-}
-
-// Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
-// we know how to inline it into the main graph, otherwise returns and error
-// indicating why the function call is not inlinable.
-Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
-                                       const FunctionDef& func,
-                                       const NodeDef& func_node) {
-  // We inline direct function calls above, using different rules.
-  if (!IsIndirectFunctionCall(func, func_node)) {
-    return errors::InvalidArgument("Unsupported function call type: ",
-                                   SummarizeNodeDef(func_node));
-  }
-
-  if (MarkedNoInline(func)) {
-    return errors::FailedPrecondition(
-        "Can't inline function marked with '_noinline': ",
-        SummarizeNodeDef(func_node));
-  }
-
-  // Function specialization and inlining must be mutually exclusive.
-  if (MarkedSpecialized(func)) {
-    return errors::FailedPrecondition(
-        "Can't inline function created in Grappler function specialization: ",
-        SummarizeNodeDef(func_node));
-  }
-
-  // We can't inline functions that are in a fetch set, because it would
-  // invalidate fetch tensors (function call node fully inlined and doesn't
-  // exist in the optimized graph).
-  if (ctx.IsFetchNode(func_node.name())) {
-    return errors::FailedPrecondition(
-        "Can't inline function in a Grappler item fetch set: ",
-        SummarizeNodeDef(func_node));
-  }
-
-  return Status::OK();
-}
-
-// Checks that all side-effects will be executed in well defined order. We do it
-// by checking if there is a path from stateful/dataset ops to one of the
-// control output nodes.
-Status CheckThatSideEffectsWillExecute(
-    const FunctionOptimizerContext& ctx,
-    const GraphTopologyView& graph_topo_view,
-    const absl::flat_hash_set<string> control_output_nodes) {
-  // In aggressive mode we just print a warning for side-effectful nodes that
-  // might not be executed after inlining.
-  const bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
-
-  for (const NodeDef& func_body_node : graph_topo_view.graph()->node()) {
-    const bool node_must_execute =
-        IsDataset(func_body_node) ||
-        IsStateful(func_body_node, &ctx.function_library());
-
-    // If op has DT_RESOURCE argument it will be marked as stateful, though if
-    // it only reads from that resource, it's allowed to prune it, because it
-    // can't produce any visible side-effects.
-    const bool read_only = IsReadVariableOp(func_body_node);
-
-    // _Retval marked as stateful, but we will remove it before inlining.
-    const bool retval = IsRetval(func_body_node);
-
-    if (read_only || retval || !node_must_execute) continue;
-
-    VLOG(3) << "Check that node " << func_body_node.name()
+  for (const Node* side_effect : fbody_side_effects) {
+    VLOG(4) << "Check that node " << side_effect->name()
             << " will execute after inlining.";
     bool will_execute = false;
 
-    // Check if we reached one of the output nodes.
-    const auto callbacks = DfsCallbacks::PreOrder([&](const NodeDef* node) {
-      if (control_output_nodes.contains(node->name())) {
-        VLOG(4) << "Found a path to control output node: " << node->name();
+    const auto is_control_source = [&](const Node* n) -> void {
+      const auto it = control_sources.find(n);
+      if (it != control_sources.end()) {
+        VLOG(4) << "Found a path to control source: " << side_effect->name()
+                << " ---> " << (*it)->name();
         will_execute = true;
       }
-    });
+    };
 
-    // Stop if we already proved that node will execute.
-    const auto predicates = DfsPredicates::Enter(
-        [&](const NodeDef* node) { return !will_execute; });
-
-    DfsTraversal(graph_topo_view, {&func_body_node},
-                 TraversalDirection::kFollowOutputs, predicates, callbacks);
+    DFSFrom(*fbody.graph, {side_effect}, /*enter=*/is_control_source,
+            /*leave=*/{}, NodeComparatorName{});
 
     if (!will_execute) {
-      const string error_message = absl::StrCat(
+      return errors::Internal(
           "Can't guarantee execution of a side-effectful node, that is not "
-          "reachable from function outputs. Function body node: ",
-          SummarizeNodeDef(func_body_node));
-
-      if (aggressive) {
-        LOG(WARNING) << error_message;
-      } else {
-        return errors::Internal(error_message);
-      }
+          "reachable from function control source. Function body node: ",
+          SummarizeNode(*side_effect));
     }
   }
 
   return Status::OK();
 }
 
-Status PlaceInlinedFunctionBody(
-    const NodeDef& func_node, const GrapplerFunctionItem& item,
-    const absl::flat_hash_map<absl::string_view, int>& input_args_idx,
-    FunctionOptimizerContext* ctx, GraphDef* placed_graph_def) {
-  // Control flow lowering and Placer works with a Graph object.
-  std::unique_ptr<Graph> func_body_graph =
-      absl::make_unique<Graph>(ctx->function_library());
+// Validates that no dead tensor can reach function output.
+Status ValidateNoDeadOutputs(const FunctionLibraryDefinition& flib_def,
+                             const FunctionBody& fbody) {
+  absl::flat_hash_set<const Node*> output_nodes = {fbody.ret_nodes.begin(),
+                                                   fbody.ret_nodes.end()};
 
-  GraphConstructorOptions opts;
-  TF_RETURN_IF_ERROR(
-      ConvertGraphDefToGraph(opts, item.graph, func_body_graph.get()));
+  // Find all nodes that can produce dead tensors.
+  std::vector<const Node*> dead_tensor_sources;
+  for (const Node* n : fbody.graph->nodes()) {
+    if (n->IsSwitch()) {
+      VLOG(4) << "Add dead tensors source. Switch node: " << n->name();
+      dead_tensor_sources.push_back(n);
+      continue;
+    }
 
-  // ------------------------------------------------------------------------ //
-  // Grappler receives the graph after PRE_PLACEMENT, Placer, and POST_PLACEMENT
-  // passes, so each node has a valid device assignment. Also V2 control
-  // flow ops (functional If and While) should have been lowered to V1 control
-  // flow (Switch and Merge nodes). To keep the graph valid for execution we
-  // must assign device to every inlined graph node, and also lower the control
-  // flow.
+    // Native function call can also produce dead tensors if the function body
+    // has mergeless switches.
+    const FunctionDef* fdef = flib_def.Find(n->type_string());
+    if (fdef != nullptr) {
+      std::unique_ptr<FunctionBody> nested_fbody;
 
-  GraphOptimizationPassOptions opt_options;
-  opt_options.graph = &func_body_graph;
-  opt_options.flib_def = ctx->mutable_function_library();
+      NameAttrList func;
+      TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(n->def(), &func));
+      TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, AttrSlice(&func.attr()),
+                                                 &flib_def, &nested_fbody));
 
-  // TODO(ezhulenev): Should we run full PRE_PLACEMENT pass here? And
-  // POST_PLACEMENT after placer?
-  LowerFunctionalOpsPass pass(/*lower_function_calls=*/false,
-                              /*keep_lowered_nodes_fetchable=*/false);
-  TF_RETURN_IF_ERROR(pass.Run(opt_options));
-
-  // ------------------------------------------------------------------------ //
-  // Before placing the function body nodes we pin input arguments to the
-  // same device as their corresponding input nodes.
-
-  for (Node* func_body_node : func_body_graph->nodes()) {
-    const auto input_arg_idx = input_args_idx.find(func_body_node->name());
-
-    if (input_arg_idx != input_args_idx.end()) {
-      const int input_idx = input_arg_idx->second;
-      const GraphView::OutputPort output_port =
-          ctx->graph_view().GetRegularFanin({&func_node, input_idx});
-
-      const string& input_device = output_port.node->device();
-
-      if (!input_device.empty()) {
-        VLOG(3) << "Pin inlined function input node '" << func_body_node->name()
-                << "' to the '" << output_port.node->device() << "' device.";
-        func_body_node->set_requested_device(output_port.node->device());
-      } else {
-        VLOG(3) << "Inlined function input node '" << func_body_node->name()
-                << "' device is undefined.";
+      if (!ValidateNoDeadOutputs(flib_def, *nested_fbody).ok()) {
+        VLOG(4) << "Add dead tensors source. Function call: " << func.name()
+                << " node=" << n->name();
+        dead_tensor_sources.push_back(n);
       }
     }
   }
 
+  for (const Node* dead_tensor_source : dead_tensor_sources) {
+    bool has_dead_output = false;
+
+    const auto is_output_node = [&](const Node* n) -> void {
+      const auto it = output_nodes.find(n);
+      if (it != output_nodes.end()) {
+        VLOG(4) << "Found a path to output node from dead tensor source: "
+                << dead_tensor_source->name() << " ---> " << (*it)->name();
+        has_dead_output = true;
+      }
+    };
+
+    // DFSFrom follows an edge only when the filter returns true: do not walk
+    // past a Merge node, and follow nothing more once a dead output is found.
+    const auto follow_edge = [&has_dead_output](const Edge& edge) -> bool {
+      return !edge.src()->IsMerge() && !has_dead_output;
+    };
+
+    DFSFrom(*fbody.graph, {dead_tensor_source}, /*enter=*/is_output_node,
+            /*leave=*/{}, NodeComparatorName{}, /*edge_filter=*/follow_edge);
+
+    if (has_dead_output) {
+      return errors::Internal(
+          "Can't inline a function with dead outputs. Dead tensor source: ",
+          SummarizeNode(*dead_tensor_source));
+    }
+  }
+
+  return Status::OK();
+}
+
+// Makes an instance of FunctionBody for inlining from a Node.
+Status MakeFunctionBodyForInlining(const Node& node,
+                                   const FunctionLibraryDefinition& flib_def,
+                                   std::unique_ptr<FunctionBody>* fbody) {
+  VLOG(3) << "Make function body for inlining: " << SummarizeNode(node);
+
+  // Finds a FunctionDef in a library and verifies that it exists.
+  const auto find_fdef = [&flib_def, &node](
+                             const string& name,
+                             const FunctionDef** fdef) -> Status {
+    if ((*fdef = flib_def.Find(name)) == nullptr) {
+      return errors::Internal(
+          "Was not able to find a function definition (name=", name,
+          ") for a function call: ", SummarizeNode(node));
+    }
+    return Status::OK();
+  };
+
+  // SymbolicGradient is a special "function call" op, which has been
+  // deprecated for a while, but we still support it for compatibility reasons.
+  if (node.type_string() == FunctionLibraryDefinition::kGradientOp) {
+    NameAttrList func;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), kFuncAttr, &func));
+
+    const string grad = flib_def.FindGradient(func.name());
+
+    if (!grad.empty()) {
+      // Function has a custom gradient registered in a library.
+      const FunctionDef* grad_fdef;
+      TF_RETURN_IF_ERROR(find_fdef(grad, &grad_fdef));
+
+      VLOG(4) << "Instantiate a custom SymbolicGradient: gradient=" << grad
+              << " (function=" << func.name() << ")";
+      TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+          *grad_fdef, AttrSlice(&func.attr()), &flib_def, fbody));
+
+    } else if (flib_def.Find(func.name()) == nullptr) {
+      // Function is not really a function, but a primitive op.
+      gradient::Creator creator;
+      TF_RETURN_IF_ERROR(gradient::GetOpGradientCreator(func.name(), &creator));
+      if (creator == nullptr) {
+        return errors::InvalidArgument("No gradient is defined for ",
+                                       func.name());
+      }
+      FunctionDef grad_fdef;
+      TF_RETURN_IF_ERROR(creator(AttrSlice(&func.attr()), &grad_fdef));
+
+      VLOG(4) << "Instantiate a SymbolicGradient for a primitive op: "
+              << func.name();
+      TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+          grad_fdef, AttrSlice(&func.attr()), &flib_def, fbody));
+
+    } else {
+      // Build a gradient graph from the function body.
+      const FunctionDef* fdef;
+      TF_RETURN_IF_ERROR(find_fdef(func.name(), &fdef));
+
+      VLOG(4) << "Instantiate a SymbolicGradient for a function: "
+              << func.name();
+      TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, AttrSlice(&func.attr()),
+                                                 &flib_def, fbody));
+      *fbody = SymbolicGradient(**fbody);
+    }
+
+  } else {
+    NameAttrList func;
+    TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node.def(), &func));
+    const FunctionDef* fdef;
+    TF_RETURN_IF_ERROR(find_fdef(func.name(), &fdef));
+
+    VLOG(4) << "Instantiate a function call: function=" << func.name();
+    TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, AttrSlice(&func.attr()),
+                                               &flib_def, fbody));
+  }
+
+  return Status::OK();
+}
+
+// Adds control edges from each data input to the 'caller' to enforce strict
+// inputs semantics (all inputs are ready and alive). This is required when:
+//
+//  1) The function takes resources as inputs, and it doesn't have incoming
+//     control edges. In Tensorflow v2 context (eager mode) this should never
+//     happen, because automatic control dependencies tracking will add a
+//     control edge from the last op touching the resource. However such graphs
+//     might be produced by legacy v1 code without automatic dependency
+//     tracking. In this case strict function call semantics is required for
+//     enforcing side effects execution order.
+//
+//  2) One of the inputs is consuming Enter[is_constant=true] node, in which
+//     case it will be always alive, and potentially can lead to partial
+//     function execution after the last loop execution.
+//
+// Both of these cases would be considered illegal by construction in Tensorflow
+// V2, however we have to guarantee that graphs constructed with Tensorflow V1
+// will produce correct results.
+void AddStrictInputSemantics(Node* caller, Graph* g) {
+  const bool has_incoming_control_edges =
+      absl::c_any_of(caller->in_edges(),
+                     [](const Edge* edge) { return edge->IsControlEdge(); });
+
+  const bool has_resource_input =
+      absl::c_any_of(caller->input_types(),
+                     [](const DataType dtype) { return dtype == DT_RESOURCE; });
+
+  const bool has_constant_enter_input =
+      absl::c_any_of(caller->in_edges(), [](const Edge* edge) {
+        Node* src = edge->src();
+        return src->IsEnter() && CheckBoolAttr(src, "is_constant");
+      });
+
+  const bool requires_strict_semantics =
+      (!has_incoming_control_edges && has_resource_input) ||  // Case #1
+      (has_constant_enter_input);                             // Case #2
+  if (!requires_strict_semantics) return;
+
+  std::vector<const Node*> data_inputs;
+  data_inputs.reserve(caller->in_edges().size());
+
+  for (const Edge* edge : caller->in_edges()) {
+    if (edge->IsControlEdge()) continue;
+    data_inputs.push_back(edge->src());
+  }
+
+  VLOG(3) << "Add control edges from all data inputs to enforce strict "
+             "semantics with regard to function inputs";
+  for (const Node* node : data_inputs) {
+    g->AddControlEdge(g->FindNodeId(node->id()), caller);
+  }
+}
+
+// Adds a control edge from a frame node if the 'caller' is executing inside a
+// While loop (see control_flow.h for the 'frame' node explanation).
+void AddFrameForwardingControlEdge(const std::vector<ControlFlowInfo>& info,
+                                   Node* caller, Graph* g) {
+  // All nodes added to the graph by v2 control flow lowering and function
+  // inlining are guaranteed to have control edges to nested function calls.
+  if (caller->id() >= info.size()) return;
+
+  // Check if a lowered node is executing inside a while loop.
+  const Node* frame = info[caller->id()].frame;
+  const bool is_in_while_loop = frame->id() != Graph::kSourceId;
+  if (!is_in_while_loop) return;
+
+  // Check if a node already has an incoming control edge. All incoming edges
+  // must be from the same execution frame (executor.cc invariant), so if we
+  // already have an incoming control edge, it's guaranteed that it will "carry"
+  // the same frame as all regular inputs.
+  const bool has_incoming_control_edges =
+      absl::c_any_of(caller->in_edges(),
+                     [](const Edge* edge) { return edge->IsControlEdge(); });
+  if (has_incoming_control_edges) return;
+
+  VLOG(3) << "Add a frame forwarding control edge: from=" << frame->name()
+          << " to=" << caller->name();
+  g->AddControlEdge(g->FindNodeId(frame->id()), caller);
+}
+
+// Inlines all function calls that are safe for inlining into the main graph.
+// Also lowers control flow V2 ops (functional If/While) into the V1 low level
+// ops (Switch/Merge/...).
+//
+// Runs a placer after inlining, to keep all nodes in a graph placed.
+Status InlineFunctionCalls(const GrapplerItem& item,
+                           const RewriterConfig::Toggle opt_level,
+                           GraphDef* output_graph) {
+  bool is_aggressive = opt_level == RewriterConfig::AGGRESSIVE;
+  VLOG(2) << "Inline function calls: grappler_item_id=" << item.id
+          << " (aggressive_mode=" << is_aggressive << ")";
+
+  FunctionLibraryDefinition flib_def =
+      FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library());
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(flib_def);
+
+  GraphConstructorOptions graph_constructor_options;
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_constructor_options,
+                                            item.graph, graph.get()));
+
+  using NodeNames = absl::flat_hash_set<absl::string_view>;
+  NodeNames fetch_nodes;
+  fetch_nodes.reserve(item.fetch.size());
+  for (const string& fetch : item.fetch) {
+    fetch_nodes.insert(ParseTensorName(fetch).node());
+  }
+  NodeNames keep_nodes(item.keep_ops.begin(), item.keep_ops.end());
+
+  std::vector<string> inlined_function_names;
+
+  // If a function call is inside a While loop, it must have an incoming control
+  // edge, because it will be used to pass execution frame into the function
+  // body. All nodes without inputs in the function body (e.g. Const and NoOp)
+  // will get an extra control edge from the 'input_control_node'.
+  std::vector<ControlFlowInfo> control_flow_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph.get(), &control_flow_info));
+
+  // Function inlining always adds new nodes to the end of the list, so we keep
+  // iterating until we are out of nodes.
+  for (int i = 2; i < graph->num_node_ids(); ++i) {
+    Node* n = graph->FindNodeId(i);
+    if (n == nullptr) continue;  // deleted node
+
+    // Special case for lowering functional control flow ops. We do not rely on
+    // LowerFunctionalOpsPass because in Grappler we have to be more restrictive
+    // about what type of function calls we are allowed to inline.
+    if (LowerUsingSwitchMergeIsOn(n)) {
+      VLOG(2) << "Lower functional control flow op: " << SummarizeNode(*n);
+      AddStrictInputSemantics(n, graph.get());
+      AddFrameForwardingControlEdge(control_flow_info, n, graph.get());
+
+      if (n->type_string() == "If") {
+        TF_RETURN_IF_ERROR(RewriteIfNode(n, graph.get(), flib_def, false));
+      } else if (n->type_string() == "While") {
+        TF_RETURN_IF_ERROR(RewriteWhileNode(n, graph.get(), flib_def, false));
+      }
+      continue;
+    }
+
+    // Skip nodes that are not function calls.
+    if (!IsFunctionCall(flib_def, *n)) continue;
+    // Skip function calls that we plan to compile later.
+    if (MarkedForTpuCompilation(n)) continue;
+    if (MarkedForXlaCompilation(n)) continue;
+
+    // Function body that we will inline into the main graph. It can be a
+    // function instantiation, or a gradient function instantiated from
+    // SymbolicGradient op.
+    std::unique_ptr<FunctionBody> fbody;
+    TF_RETURN_IF_ERROR(MakeFunctionBodyForInlining(*n, flib_def, &fbody));
+
+    InlineFunctionBodyOptions inline_options;
+    // Ignore '_noinline' flag in aggressive mode.
+    inline_options.ignore_noinline = is_aggressive;
+
+    // Function calls created after inlining If/While ops are always inlined as
+    // multi-device functions and are not required to pass additional Grappler
+    // validations (side effects execution validation below).
+    bool force_inline_as_multi_device = LowerAsMultiDeviceFunctionIsOn(n);
+
+    // `PartitionedCall` is a TF-2.0 function call mechanism for multi-device
+    // functions:
+    // a) Function can be multi-device, and we can't override device placements.
+    // b) Automatic control dependencies tracking guarantees that all function
+    //    side-effectful nodes will have a path to one of the control outputs.
+    //    Control outputs and control edges between side-effectful (stateful)
+    //    nodes are used to explicitly mark the nodes that must execute, and to
+    //    define their execution order.
+    if (n->IsPartitionedCall() || force_inline_as_multi_device) {
+      inline_options.override_device = false;
+      inline_options.initialize_empty_device = true;
+      inline_options.output_control_src = OutputControlSource::kControlOutputs;
+    } else {
+      inline_options.override_device = true;
+      inline_options.output_control_src = OutputControlSource::kDataOutputs;
+    }
+
+    if (fetch_nodes.contains(n->name())) {
+      inline_options.keep_caller_node = KeepCallerNode::kFetchable;
+    } else if (keep_nodes.contains(n->name())) {
+      inline_options.keep_caller_node = KeepCallerNode::kTargetable;
+    } else {
+      inline_options.keep_caller_node = KeepCallerNode::kDoNotKeep;
+    }
+
+    // Basic validation rules defined in common_runtime shared by all functions.
+    Status can_inline_function_call =
+        ValidateInlining(n, fbody.get(), inline_options);
+
+    // Additional validation rules defined only in Grappler.
+    // TODO(ezhulenev): Move it to common_runtime InlineFunctionBodyOptions?
+    if (can_inline_function_call.ok()) {
+      bool has_outgoing_control_edges = absl::c_any_of(
+          n->out_edges(),
+          [](const Edge* edge) { return edge->IsControlEdge(); });
+
+      can_inline_function_call = ValidateSideEffectsExecution(
+          *fbody, inline_options.output_control_src,
+          has_outgoing_control_edges);
+
+      if (!can_inline_function_call.ok() &&
+          (is_aggressive || force_inline_as_multi_device)) {
+        VLOG(2) << "Ignore error: " << can_inline_function_call.error_message();
+        can_inline_function_call = Status::OK();
+      }
+    }
+    if (can_inline_function_call.ok()) {
+      can_inline_function_call = ValidateNoDeadOutputs(flib_def, *fbody);
+    }
+
+    if (can_inline_function_call.ok()) {
+      VLOG(2) << "Inline function call node: " << n->name();
+      AddStrictInputSemantics(n, graph.get());
+      AddFrameForwardingControlEdge(control_flow_info, n, graph.get());
+
+      TF_RETURN_IF_ERROR(InlineFunctionBody(flib_def, graph.get(), n,
+                                            fbody.get(), inline_options));
+      inlined_function_names.push_back(fbody->fdef.signature().name());
+
+    } else {
+      VLOG(2) << "Failed to inline function call node: "
+              << can_inline_function_call.error_message();
+    }
+  }
+
+  VLOG(4) << "Inlined " << inlined_function_names.size()
+          << " function calls: " << absl::StrJoin(inlined_function_names, ", ");
+
   // ------------------------------------------------------------------------ //
-  // After placing nodes corresponding to the function inputs, we need to assign
-  // device placements to all other function body nodes.
+  // Grappler receives the graph after PRE_PLACEMENT, Placer, and POST_PLACEMENT
+  // passes, so each node has a valid device assignment. After function inlining
+  // and control flow V2 lowering we have to keep graph placed.
 
-  const DeviceSet* devices = ctx->devices();
+  if (inlined_function_names.empty()) {
+    VLOG(3) << "Not placing graph after function inlining"
+            << " (did not inline any of the function calls).";
 
-  if (devices->devices().empty()) {
-    // If there are no devices available for placer, we do not place function
-    // body nodes. This happens when Grappler optimizing function library, or
-    // when graph optimized "offline", without active runtime session, for
-    // example as a part of batch job for graph analysis/optimization.
-    // GrapplerItem instantiated from a function library doesn't have to be
-    // fully placed after all optimization, it will be placed by the function
-    // library runtime before execution.
-    VLOG(3) << "Do not place instantiated function body.";
+  } else if (item.devices().empty()) {
+    // If there are no devices available for placer, we do not place graph after
+    // function inlining. This happens when Grappler is optimizing the function
+    // library, or when a graph is optimized "offline", without an active
+    // runtime session, for example as a part of a batch job for graph
+    // analysis/optimization. GrapplerItem instantiated from a function library
+    // doesn't have to be fully placed after all optimizations; it will be
+    // placed by the function library runtime before execution.
+    VLOG(3) << "Not placing graph after function inlining"
+            << " (device set is empty)";
+
   } else {
     // If we are running in an active runtime session, Grappler will get the
     // graph after initial placing is done, and we should have devices for the
     // placer.
-    VLOG(3) << "Run placer for instantiated function body. Devices: ["
-            << absl::StrJoin(
-                   devices->devices(), ", ",
-                   [](string* out, const Device* d) { out->append(d->name()); })
-            << "]";
+    VLOG(3) << "Run placer for the graph after function inlining. "
+            << "Devices: [" << absl::StrJoin(item.devices(), ", ") << "]";
 
-    // Use function caller node device as a default for placer.
-    const Device* default_device =
-        devices->FindDeviceByName(func_node.device());
+    DeviceSet device_set;                               // does not own devices
+    std::vector<std::unique_ptr<Device>> fake_devices;  // owns fake devices
 
-    Placer placer(func_body_graph.get(), devices, default_device);
+    for (const string& name : item.devices()) {
+      auto device = absl::make_unique<FakeDevice>(name);
+      device_set.AddDevice(device.get());
+      fake_devices.push_back(std::move(device));
+    }
+
+    Placer placer(graph.get(), item.id, &device_set);
     TF_RETURN_IF_ERROR(placer.Run());
   }
 
-  // Convert Graph back to the placed GraphDef.
-  func_body_graph->ToGraphDef(placed_graph_def);
-
+  graph->ToGraphDef(output_graph);
   return Status::OK();
 }
 
-Status InlineIndirectFunctionCall(const NodeDef& func_node,
-                                  const FunctionDef& func,
-                                  FunctionOptimizerContext* ctx,
-                                  GraphDef* optimized_graph) {
-  VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
-  VLOG(4) << "Inlined function definition: " << DebugString(func);
-  TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
+// Restores tensor mapping after function specialization: all inputs must be
+// connected to valid nodes.
+void RestoreTensorMapping(const FunctionOptimizerContext& ctx,
+                          GraphDef* optimized_graph) {
+  if (ctx.tensor_mapping().empty()) return;
 
-  const AttrSlice func_instantiation_attr =
-      FunctionInstantiationAttributes(func, func_node);
-
-  GrapplerFunctionItem item;
-  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
-                                                ctx->function_library(),
-                                                ctx->graph_version(), &item);
-
-  if (!item_status.ok()) {
-    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
-                                   " instantiated by ", func_node.name(),
-                                   ". Error: ", item_status.error_message());
-  }
-
-  // `PartitionedCallOp` invokes functions with `allow_dead_tensors = true` to
-  // reset dead flag, and return default initialized tensors instead of a dead
-  // tensors. There is no way to express this in a regular Tensorflow graph, so
-  // we choose not to inline if a function can have dead tensors as an output
-  // position. In practice `mergeless switches` should not exists in a function
-  // body, because tf-eager will only use v2 control flow ops.
-  std::vector<MaybeDeadOutput> maybe_dead_outputs;
-  TF_RETURN_IF_ERROR(MaybeDeadOutputs(*ctx, item, &maybe_dead_outputs));
-  if (!maybe_dead_outputs.empty()) {
-    struct MaybeDeadOutputFormatter {
-      void operator()(string* out, const MaybeDeadOutput& md) const {
-        absl::StrAppend(out, SummarizeNodeDef(*md.dead_tensor_src));
-      }
-    };
-    return errors::FailedPrecondition(
-        "Can't inline function with dead outputs. Dead tensor sources (size = ",
-        maybe_dead_outputs.size(), "): ",
-        absl::StrJoin(maybe_dead_outputs, "\n", MaybeDeadOutputFormatter()));
-  }
-
-  GraphView::InputPort control_input_port =
-      ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
-  GraphView::OutputPort control_output_port =
-      ctx->graph_view().GetOutputPort(func_node.name(), Graph::kControlSlot);
-
-  // Nodes that have side effects to the captured resources.
-  std::vector<string> happens_before;
-  absl::c_transform(
-      ctx->graph_view().GetFanin(control_input_port),
-      std::back_inserter(happens_before),
-      [](const GraphView::OutputPort port) { return port.node->name(); });
-
-  VLOG(3) << "Happens before set (size = " << happens_before.size()
-          << "): " << absl::StrJoin(happens_before, ", ");
-
-  // Nodes that must observe side effects to the captured resources.
-  std::vector<string> happens_after;
-  absl::c_transform(
-      ctx->graph_view().GetFanout(control_output_port),
-      std::back_inserter(happens_after),
-      [](const GraphView::InputPort port) { return port.node->name(); });
-
-  VLOG(3) << "Happens after set (size = " << happens_after.size()
-          << "): " << absl::StrJoin(happens_after, ", ");
-
-  // Regular (data) inputs to the function call.
-  std::vector<SafeTensorId> inputs;
-  for (const string& input : func_node.input()) {
-    SafeTensorId tensor_id = ParseTensorName(input);
-    if (tensor_id.index() == Graph::kControlSlot) break;
-    inputs.push_back(tensor_id);
-  }
-
-  // Mapping from input argument node to function input position.
-  absl::flat_hash_map<absl::string_view, int> input_args_idx;
-  for (const InputArgInstantiation& input_arg : item.inputs()) {
-    const int idx = input_args_idx.size();
-    input_args_idx[input_arg.node_name] = idx;
-  }
-
-  const string prefix = strings::StrCat(func_node.name(), "/");
-
-  // ------------------------------------------------------------------------ //
-  // IMPORTANT: Actual inputs will be added to the following nodes at the very
-  // last stage, because we don't want to have invalid edges in a function body
-  // graph (control edges that depend on the nodes in the "outer" optimized
-  // graph).
-
-  // If one of the function inputs is a dead tensor, we must not execute any of
-  // the function body nodes, and let the dead tensor flag propagate through the
-  // inlined function body. We add NoOp inputs_ready node, and add control edges
-  // to it from all input nodes. Inlined function arguments (Identity nodes)
-  // will have a control dependency on it.
+  // During function specialization, we might prune unused function outputs. We
+  // need to "close the holes" that might appear in the function outputs.
   //
-  // TODO(ezhulenev): We do not need to provide this guarantee for ALL nodes in
-  // the function body. We must only ensure that we do not generate observable
-  // side effects.
+  // Example: prune unused output "f:1"
   //
-  // If the function call node has incoming control edges, we will update them
-  // to use this node as destination, to ensure side-effects execution order.
-  NodeDef* inputs_ready_node = nullptr;
-  if (func_node.input_size() > 0) {
-    inputs_ready_node = item.graph.add_node();
-    inputs_ready_node->set_op("NoOp");
-    inputs_ready_node->set_name(kInputsReadyNodeName);
-  }
-
-  // All nodes that have a control edge from the function call node, will be
-  // updated to have a control edge from 'side_effects_executed_node`. This node
-  // will have control edges from all function control outputs (see
-  // `control_ret` in FunctionDef). This a "barrier" that guarantees that all
-  // ops with side effects in the function body were executed
+  //   f = my_func[T=float](...)          f = my_func_specialized[T=float](...)
+  //   a = Identity(f:0)             ->   a = Identity(f:0)
+  //   b = Identity(f:2)                  b = Identity(f:1)
   //
-  // If the function call node has no outgoing control edges, it means that no
-  // one is interested in the function side-effect affecting captured resources.
-  //
-  // If node is in keep_ops set, it means that it must execute. This could
-  // happen if the graph is an instantiation of a function with control output.
-  NodeDef* side_effects_executed_node = nullptr;
-  if (!happens_after.empty() || ctx->IsKeepOp(func_node.name())) {
-    side_effects_executed_node = item.graph.add_node();
-    side_effects_executed_node->set_op("NoOp");
-    side_effects_executed_node->set_name(kSideEffectsExecutedNodeName);
-  }
+  // Tensor mapping (size=1): [f:2 -> f:1]
+  for (NodeDef& node : *optimized_graph->mutable_node()) {
+    for (int idx = 0; idx < node.input_size(); ++idx) {
+      TensorId input_tensor = ParseTensorName(node.input(idx));
+      if (input_tensor.index() == Graph::kControlSlot) break;
 
-  // If function executed only for the regular data outputs, it's totally safe
-  // to prune side-effects. If side-effects order is important, it must be
-  // captured at graph construction time via control edges.
-  if (item.control_output_size() > 0 && happens_after.empty()) {
-    VLOG(2) << "Function has control outputs and empty happens after set.";
-  }
-
-  // ------------------------------------------------------------------------ //
-  // If we have a node inside the function body without inputs (e.g. Const), we
-  // must attach a control dependency to it, to make sure that if a function
-  // call happens inside a loop, the node will be evaluated in correct frame.
-  //
-  // If the function call node has no inputs and no control dependencies, it
-  // means that it can't be a function call inside a loop, and we can safely
-  // insert that node without inputs into the main graph.
-  //
-  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
-  // the function is called inside a loop.
-  std::vector<string> empty_inputs_hook;
-  if (inputs_ready_node != nullptr) {
-    empty_inputs_hook.push_back(inputs_ready_node->name());
-  }
-
-  // ------------------------------------------------------------------------ //
-  // Grappler called after PRE_PLACEMENT and PLACEMENT passes, so we have to
-  // make sure that after inlining all nodes will have valid device assignment.
-
-  GraphDef placed_graph_def;
-  TF_RETURN_IF_ERROR(PlaceInlinedFunctionBody(func_node, item, input_args_idx,
-                                              ctx, &placed_graph_def));
-
-  // ------------------------------------------------------------------------ //
-  // Mapping from the '_Retval' node name to the output tensor. We build this
-  // mapping after the placement, because we might have inlined some of the
-  // functional If/While nodes (see a call to LowerFunctionalOpsPass).
-  absl::flat_hash_map<string, string> output_tensors;
-
-  for (const NodeDef& func_body_node : placed_graph_def.node()) {
-    if (!IsRetval(func_body_node)) continue;
-    if (func_body_node.input_size() != 1) {
-      return errors::Internal("_Retval node must have single input: ",
-                              SummarizeNodeDef(func_body_node));
-    }
-    output_tensors.emplace(func_body_node.name(), func_body_node.input(0));
-  }
-
-  // ------------------------------------------------------------------------ //
-  // After all nodes placed we need to prepare them for inlining into the
-  // optimized graph: turn placeholders into identities, update nodes
-  // connectivity, etc...
-
-  const auto inlined_node_name = [&func_node](const string& name) -> string {
-    return AddPrefixToNodeName(name, /*prefix=*/func_node.name());
-  };
-
-  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    const string& node_name = func_body_node.name();
-
-    // Turn _Arg nodes added in place of input arguments into identity nodes.
-    const auto input_arg_idx = input_args_idx.find(node_name);
-    if (input_arg_idx != input_args_idx.end()) {
-      DCHECK_EQ(0, func_body_node.input_size());
-      func_body_node.set_op("Identity");
-      func_body_node.mutable_attr()->erase("index");
-      func_body_node.mutable_attr()->erase("shape");
-      const int input_idx = input_arg_idx->second;
-      func_body_node.add_input(inputs[input_idx].ToString());
-
-      // Add a control dependency on 'inputs_ready' node, to guarantee that all
-      // inputs are alive and all side-effects executed before function body.
-      if (inputs_ready_node) {
-        func_body_node.add_input(
-            AsControlDependency(inlined_node_name(inputs_ready_node->name())));
-      }
-    } else {
-      // Update inputs of the regular function body nodes.
-      for (string& input : *func_body_node.mutable_input()) {
-        input = inlined_node_name(input);
-      }
-
-      // Check if we need to ensure node execution in correct loop frame.
-      bool node_needs_empty_inputs_hook =
-          // We have a node to hook and node has no inputs.
-          !empty_inputs_hook.empty() && func_body_node.input_size() == 0 &&
-          // Inputs ready node will always have edge from main graph. If
-          // function call has no regular and control inputs, we will not add
-          // inputs_ready node to the function body graph.
-          node_name != kInputsReadyNodeName &&
-          // The node acting as a return barrier for execution of side effects
-          // might not have any inputs (in case function has no control outputs,
-          // but we still added it because of non-empty happens-after set), so
-          // we must make sure it's executed in correct frame.
-          (node_name != kSideEffectsExecutedNodeName ||
-           item.control_output_size() == 0);
-
-      if (node_needs_empty_inputs_hook) {
-        *func_body_node.add_input() =
-            AsControlDependency(inlined_node_name(empty_inputs_hook[0]));
-      }
-    }
-
-    // Add the function node name as a prefix 1) to node name to avoid
-    // collisions; 2) to frame name to avoid multiple LoopCond nodes in one
-    // frame after inlining.
-    TF_RETURN_IF_ERROR(
-        AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &func_body_node));
-
-    // After inlining into the optimized graph, NodeDef must have all attributes
-    // defined, which is not required for a node in a FunctionDef.
-    const OpDef* op_def;
-    TF_RETURN_IF_ERROR(
-        ctx->function_library().LookUpOpDef(func_body_node.op(), &op_def));
-    AddDefaultsToNodeDef(*op_def, &func_body_node);
-  }
-
-  // ------------------------------------------------------------------------ //
-  // Check that after inlining all side-effects will be executed in well defined
-  // order. We do it by checking if there is a path from stateful/dataset ops to
-  // one of the control output nodes.
-
-  // Names of the inlined control output nodes.
-  absl::flat_hash_set<string> inlined_control_output_nodes;
-  for (const ControlOutput& control_output : item.control_outputs()) {
-    inlined_control_output_nodes.insert(
-        inlined_node_name(control_output.node_name));
-  }
-
-  // Construct a graph topology view for DFS traversals (skip invalid edges for
-  // input nodes connected to nodes in the optimized graph).
-  GraphTopologyView placed_topo_view(/*skip_invalid_edges=*/true);
-  TF_RETURN_IF_ERROR(placed_topo_view.InitializeFromGraph(placed_graph_def));
-  TF_RETURN_IF_ERROR(CheckThatSideEffectsWillExecute(
-      *ctx, placed_topo_view, inlined_control_output_nodes));
-
-  // ------------------------------------------------------------------------ //
-  // Move all the nodes to the optimized graph after successful preprocessing.
-
-  if (inputs_ready_node != nullptr) {
-    string inlined_node = inlined_node_name(inputs_ready_node->name());
-    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
-
-    absl::flat_hash_set<string> input_nodes;
-    for (const string& input : func_node.input()) {
-      SafeTensorId tensor = ParseTensorName(input);
-
-      // Input node might have been a function call that was already inlined.
-      auto it = ctx->tensor_mapping().find(tensor);
-      while (it != ctx->tensor_mapping().end()) {
-        tensor = it->second;
-        it = ctx->tensor_mapping().find(tensor);
-      }
-
-      if (input_nodes.insert(tensor.node()).second) {
-        placed_graph_def.mutable_node(*node_idx)->add_input(
-            AsControlDependency(tensor.node()));
+      auto mapping = ctx.tensor_mapping().find(input_tensor);
+      if (mapping != ctx.tensor_mapping().end()) {
+        node.set_input(idx, mapping->second.ToString());
       }
     }
   }
-
-  if (side_effects_executed_node != nullptr) {
-    string inlined_node = inlined_node_name(side_effects_executed_node->name());
-    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
-
-    // Add control edges from all control output nodes.
-    for (const string& node_name : inlined_control_output_nodes) {
-      placed_graph_def.mutable_node(*node_idx)->add_input(
-          AsControlDependency(node_name));
-    }
-
-    // Forward all control dependencies in the optimized graph to the new node.
-    ctx->AddControlOverrides(func_node, {inlined_node});
-  }
-
-  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    // We bypass _Retval nodes and fetch tensors from `retval.input(0)`.
-    if (IsRetval(func_body_node)) continue;
-    optimized_graph->add_node()->Swap(&func_body_node);
-  }
-
-  // Indirect function call is fully inlined into the optimized graph, and we do
-  // not copy the original function call node, so we have to setup tensor
-  // mapping from old output tensors, to the outputs of inlined nodes.
-  int output_idx = 0;
-  for (const OutputArgInstantiation& output : item.outputs()) {
-    const string& output_tensor = output_tensors.at(output.node_name);
-
-    const SafeTensorId from_tensor(func_node.name(), output_idx++);
-    const SafeTensorId to_tensor = ParseTensorName(output_tensor);
-
-    const SafeTensorId inlined_to_tensor =
-        SafeTensorId(absl::StrCat(func_node.name(), "/", to_tensor.node()),
-                     to_tensor.index());
-
-    ctx->AddTensorMapping(from_tensor, inlined_to_tensor);
-  }
-
-  // If function call node was in keep_ops set, it means that we need to keep a
-  // node with the same name in the optimized graph. We forward all data
-  // consumers to inlined nodes, and we verify that the node is not in a fetch
-  // set, so it's safe to assume that the function call node is only required
-  // for a control edge source.
-  if (ctx->IsKeepOp(func_node.name())) {
-    VLOG(4) << "Add NoOp for inlined function in keep ops set.";
-    NodeDef* keep_func_node = optimized_graph->add_node();
-    keep_func_node->set_op("NoOp");
-    keep_func_node->set_name(func_node.name());
-    keep_func_node->set_device(func_node.device());
-    keep_func_node->add_input(
-        AsControlDependency(inlined_node_name(kSideEffectsExecutedNodeName)));
-  }
-
-  VLOG(3) << "Successfully inlined indirect function call: "
-          << SummarizeNodeDef(func_node);
-
-  return Status::OK();
-}
-
-// Restores graph invariants after function specialization and inlining: all
-// inputs must be connected to valid nodes.
-Status RestoreGraphInvariants(const FunctionOptimizerContext& ctx,
-                              GraphDef* optimized_graph) {
-  // After function specialization and inlining graph might be in invalid
-  // state, and some nodes can read tensors that do not exists anymore in the
-  // optimized graph: function call node was fully inlined into the graph, or
-  // output index was invalidated by the output pruning.
-
-  if (!ctx.tensor_mapping().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      for (int idx = 0; idx < node.input_size(); ++idx) {
-        TensorId input_tensor = ParseTensorName(node.input(idx));
-        if (input_tensor.index() == Graph::kControlSlot) break;
-
-        auto mapping = ctx.tensor_mapping().find(input_tensor);
-        if (mapping != ctx.tensor_mapping().end()) {
-          node.set_input(idx, mapping->second.ToString());
-        }
-      }
-    }
-  }
-
-  // Function inlining instantiates function body directly into the optimized
-  // graph, and we might end up with control dependencies to the nodes that no
-  // longer exist in a graph. We need to apply control overrides to all
-  // invalidated nodes, and rewire control dependencies to the control outputs
-  // node (it's also possible to rewrite singe control edge into multiple edges
-  // to inlined side-effectful nodes).
-
-  if (!ctx.control_overrides().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      // Keep track of new control inputs to the node.
-      absl::flat_hash_set<string> add_ctrl_inputs;
-
-      // Remove all invalidated control inputs.
-      for (int idx = 0; idx < node.input_size(); /* see below */) {
-        // TODO(ezhulenev): Use non-allocating TensorId after migrating
-        // `control_overrides()` to absl::flat_hash_set.
-        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
-
-        auto overrides = ctx.control_overrides().find(input_tensor.node());
-        if (overrides != ctx.control_overrides().end()) {
-          // If this happens it's a bug in the function inlining.
-          if (input_tensor.index() != Graph::kControlSlot) {
-            return errors::Internal(
-                "Illegal input edge from inlined function call node");
-          }
-          // Remove control dependency to the inlined function call node.
-          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
-          node.mutable_input()->RemoveLast();
-
-          // Keep track of all overrides.
-          for (const string& override : overrides->second) {
-            add_ctrl_inputs.insert(AsControlDependency(override));
-          }
-        } else {
-          // Go to the next input only if the current one was not invalidated,
-          // otherwise we need to check the swapped input as well.
-          ++idx;
-        }
-      }
-
-      // Add overrides to the node inputs.
-      for (const string& ctrl_input : add_ctrl_inputs) {
-        node.add_input(ctrl_input);
-      }
-    }
-  }
-
-  return Status::OK();
 }
 
 }  // namespace
 
 Status FunctionOptimizer::RunFunctionOptimizerPass(
-    const GrapplerItem& item, const GraphDef& graph, const int iteration,
-    std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
-    bool* graph_has_unoptimized_function_calls) const {
-  VLOG(3) << absl::Substitute(
-      "Run function optimizer pass (iteration = $0): grappler_item_id = $1",
-      iteration, item.id);
+    const GrapplerItem& item, GraphDef* optimized_graph) const {
+  VLOG(3) << "Run function optimizer pass: grappler_item_id=" << item.id;
 
-  FunctionOptimizerContext ctx(item, opt_level_, graph);
+  // Inline all function calls into a graph using common_runtime/function
+  // implementation (see `InlineFunctionBody` function documentation).
+  GraphDef graph_after_inlining;
+  TF_RETURN_IF_ERROR(
+      InlineFunctionCalls(item, opt_level_, &graph_after_inlining));
 
-  bool inline_gradients = options_.enable_symbolic_gradient_inlining;
-  bool inline_func = options_.enable_function_inlining;
-  bool specialize_func = options_.enable_function_specialization;
+  // Specialize function calls that we could not inline.
+  FunctionOptimizerContext ctx(item, opt_level_, graph_after_inlining);
 
-  // We will process all the nodes in topological order, to correctly handle
-  // inlining of function call chains.
-  std::vector<const NodeDef*> topo_ordered_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(graph, &topo_ordered_nodes));
-
-  for (const NodeDef* node : topo_ordered_nodes) {
-    // Each node optimization can modify optimized graph only by adding new
+  for (const NodeDef& node : graph_after_inlining.node()) {
+    // Function specialization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
     const auto is_graph_modified = [&]() {
@@ -2000,150 +1350,50 @@
       return num_nodes > num_nodes_before;
     };
 
-    // Copy node from the `graph` to the `optimized_graph`.
-    const auto copy_node = [&]() { *optimized_graph->add_node() = *node; };
+    // Copy node from the `graph_after_inlining` to the `optimized_graph`.
+    const auto copy_node = [&]() { *optimized_graph->add_node() = node; };
 
-    // If we already failed to optimize this node during one of the previous
-    // passes, we just give up, and do not try on more time.
-    if (skip_nodes->find(node->name()) != skip_nodes->end()) {
-      VLOG(3) << "Skip optimization for node: " << node->name();
+    // Find if a node is a function call (direct or indirect).
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+    if (func == nullptr) {
       copy_node();
       continue;
     }
 
-// Skip errors if optimized graph was not modified before error happened.
-#define TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(...)                     \
-  do {                                                             \
-    const Status _status = (__VA_ARGS__);                          \
-    if (TF_PREDICT_FALSE(!_status.ok() && is_graph_modified()))    \
-      return _status;                                              \
-    if (TF_PREDICT_FALSE(!_status.ok() && !is_graph_modified())) { \
-      VLOG(3) << "Skip error: " << _status.error_message();        \
-      skip_nodes->insert(node->name());                            \
-      copy_node();                                                 \
-    }                                                              \
-  } while (0)
+    const string& func_name = func->signature().name();
 
-    // ---------------------------------------------------------------------- //
-    // 1. Inline symbolic gradients into the optimized graph.                 //
-    // ---------------------------------------------------------------------- //
+    // Specialize it to its instantiation context if it has something worth
+    // specializing.
+    bool specialization_worthy = IsParametrized(*func) ||
+                                 HasTrulyConstInputs(node, ctx) ||
+                                 HasUnusedOutputs(node, *func, ctx);
+    // Do not specialize if function has custom gradient.
+    const string grad_func = ctx.function_library().FindGradient(func_name);
 
-    if (IsSymbolicGradient(*node) && inline_gradients) {
-      // Inline symbolic gradients only if the corresponding function is not
-      // marked as `_noinline`.
-      const auto* f_attr = gtl::FindOrNull(node->attr(), "f");
-      const string f_name = f_attr != nullptr ? f_attr->func().name() : "";
-      const FunctionDef* func = ctx.function_library().Find(f_name);
-      if (func && !MarkedNoInline(*func)) {
-        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            InlineSymbolicGradient(*node, &ctx, optimized_graph));
-        continue;
-      } else {
-        VLOG(2) << "Skip SymbolicGradient inlining: function=" << f_name;
-        skip_nodes->insert(node->name());
+    if (grad_func.empty() && specialization_worthy) {
+      // TODO(ezhulenev): Specialize function call if input has a known shape.
+      // Specialize function body for its instantiation attributes and inputs.
+      Status status = SpecializeFunction(node, *func, &ctx, optimized_graph);
+      if (!status.ok() && is_graph_modified()) {
+        return status;
+      } else if (!status.ok() && !is_graph_modified()) {
+        VLOG(3) << "Skip specialization error: " << status.error_message();
+        copy_node();
       }
+      continue;
+    } else {
+      VLOG(2) << "Skip function specialization: " << func->signature().name();
+      copy_node();
     }
-
-    // ---------------------------------------------------------------------- //
-    // 2. Inline or specialize function calls.                                //
-    // ---------------------------------------------------------------------- //
-
-    // Find if a node is a function call (direct or indirect).
-    const FunctionDef* func = FindFunctionCall(ctx, *node);
-
-    if (func != nullptr) {
-      const string& func_name = func->signature().name();
-
-      const bool is_direct_func = IsDirectFunctionCall(*func, *node);
-      const bool is_indirect_func = IsIndirectFunctionCall(*func, *node);
-
-      // 2a. Inline direct function call if it's inlinable.
-      if (inline_func && is_direct_func) {
-        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, *node);
-        if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-              InlineDirectFunctionCall(*node, *func, ctx, optimized_graph));
-          continue;
-        } else {
-          VLOG(2) << inlinable.error_message();
-          skip_nodes->insert(node->name());
-        }
-      }
-
-      // 2b. Inline indirect function call if it's inlinable.
-      if (inline_func && is_indirect_func) {
-        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, *node);
-        if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-              InlineIndirectFunctionCall(*node, *func, &ctx, optimized_graph));
-          continue;
-        } else {
-          VLOG(2) << inlinable.error_message();
-          skip_nodes->insert(node->name());
-        }
-      }
-
-      // 2c. Specialize it to its instantiation context if can't be inlined,
-      // and it has something worth specializing.
-      bool specialization_worthy = IsParametrized(*func) ||
-                                   HasTrulyConstInputs(*node, ctx) ||
-                                   HasUnusedOutputs(*node, *func, ctx);
-
-      // Do not specialize if function has custom gradient.
-      const string grad_func = ctx.function_library().FindGradient(func_name);
-
-      if (specialize_func && grad_func.empty() && specialization_worthy) {
-        // TODO(ezhulenev): Specialize function call if input has a known shape.
-        // Specialize function body for its instantiation attributes and inputs.
-        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            SpecializeFunction(*node, *func, &ctx, optimized_graph));
-        continue;
-      } else {
-        VLOG(2) << "Skip function specialization: " << func->signature().name();
-        skip_nodes->insert(node->name());
-      }
-    }
-
-    // ---------------------------------------------------------------------- //
-    // If we reached this point, node was not handled by any of the stages
-    // (inline, specialize), simply copy the node to the optimized graph.
-    copy_node();
-
-#undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
-  TF_RETURN_IF_ERROR(RestoreGraphInvariants(ctx, optimized_graph));
+  RestoreTensorMapping(ctx, optimized_graph);
 
   // Preserve the graph version.
-  *optimized_graph->mutable_versions() = graph.versions();
-
+  *optimized_graph->mutable_versions() = item.graph.versions();
   // Prune unreachable function from the library.
-  if (options_.enable_trim_function_library) {
-    *optimized_graph->mutable_library() =
-        PruneFunctionLibrary(ctx.function_library(), *optimized_graph);
-  } else {
-    *optimized_graph->mutable_library() = ctx.function_library().ToProto();
-  }
-
-  // Before returning we check if after single optimization pass we have more
-  // unoptimized function calls.
-  *graph_has_unoptimized_function_calls = false;
-  for (const NodeDef& node : optimized_graph->node()) {
-    // Check if we can inline symbolic gradient.
-    if (IsSymbolicGradient(node) && inline_gradients &&
-        skip_nodes->count(node.name()) == 0) {
-      *graph_has_unoptimized_function_calls = true;
-      break;
-    }
-
-    // Check if after inlining we have unoptimized function calls.
-    const FunctionDef* func = FindFunctionCall(ctx, node);
-    if (func != nullptr && !MarkedSpecialized(*func) &&
-        skip_nodes->count(node.name()) == 0) {
-      *graph_has_unoptimized_function_calls = true;
-      break;
-    }
-  }
+  *optimized_graph->mutable_library() =
+      PruneFunctionLibrary(ctx.function_library(), *optimized_graph);
 
   return Status::OK();
 }
@@ -2156,35 +1406,7 @@
     return Status::OK();
   }
 
-  // Do not retry failed function inlining or specialization.
-  std::unordered_set<string> skip_nodes;
-  bool graph_has_unoptimized_function_calls = false;
-
-  // We'll keep running function optimizer pass until we inlined and optimized
-  // all function call nodes.
-  int iteration = 0;
-  constexpr int kMaxIterations = 50;
-
-  // 1. Run first optimizer pass with GrapplerItem.graph.
-  TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
-      item, item.graph, 0, &skip_nodes, optimized_graph,
-      &graph_has_unoptimized_function_calls));
-
-  // 2. If after function inlining we have unoptimized function calls, we have
-  // to run function optimization pass one more time.
-  while (graph_has_unoptimized_function_calls) {
-    if (iteration++ > kMaxIterations) {
-      VLOG(1) << "Break function optimizer loop at iteration #" << iteration;
-      break;
-    }
-
-    GraphDef workspace_graph;
-    workspace_graph.Swap(optimized_graph);
-
-    TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
-        item, workspace_graph, iteration, &skip_nodes, optimized_graph,
-        &graph_has_unoptimized_function_calls));
-  }
+  TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(item, optimized_graph));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index ab90281..8c96bbc 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -41,25 +41,15 @@
  private:
   friend class FunctionOptimizerTest;
 
-  struct FunctionOptimizerOptions {
-    bool enable_function_inlining = true;
-    bool enable_function_specialization = true;
-    bool enable_symbolic_gradient_inlining = true;
-    bool enable_trim_function_library = true;
-  };
-
   // Runs a single function optimizer pass over the `graph`. All nodes that are
   // not function calls will be copied from the `graph` to the
   // `optimized_graph`. Function call nodes inlined or specialized, and
   // instantiated function body or specialized function call nodes will be added
   // to the `optimized_graph`.
-  Status RunFunctionOptimizerPass(
-      const GrapplerItem& item, const GraphDef& graph, const int iteration,
-      std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
-      bool* graph_has_unoptimized_function_calls) const;
+  Status RunFunctionOptimizerPass(const GrapplerItem& item,
+                                  GraphDef* optimized_graph) const;
 
   RewriterConfig::Toggle opt_level_;
-  FunctionOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 8917243..1455399 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -33,12 +33,7 @@
 constexpr char kDevice[] = "/job:localhost/replica:0/task:0/device:CPU:0";
 }  // namespace
 
-class FunctionOptimizerTest : public GrapplerTest {
- protected:
-  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
-    optimizer->options_.enable_function_specialization = false;
-  }
-};
+class FunctionOptimizerTest : public GrapplerTest {};
 
 TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
   using test::function::NDef;
@@ -59,52 +54,23 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  int count = 0;
-  for (const NodeDef& node : output.node()) {
-    if (node.name() == "y/inlined_inputs") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-    } else if (node.name() == "y/x") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "y/two") {
-      count++;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^y/inlined_inputs", node.input(0));
-    } else if (node.name() == "y/scale") {
-      count++;
-      EXPECT_EQ("Cast", node.op());
-      EXPECT_EQ(kDevice, node.device());
-    } else if (node.name() == "y/y") {
-      count++;
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/scale", node.input(1));
-    } else if (node.name() == "y") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y", node.input(0));
-    } else if (node.name() == "z") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-    }
-  }
-  EXPECT_EQ(7, count);
+  const string arg0 = "Func/y/input/_0";
+  const string ret0 = "Func/y/output/_1";
+
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  GraphDef expected = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}),
+       NDef(arg0, "Identity", {"x"}, {{"T", DT_FLOAT}}),
+       NDef("y/two", "Const", {}, {{"dtype", DT_INT64}, {"value", kTwo}}),
+       NDef("y/scale", "Cast", {"y/two"},
+            {{"DstT", DT_FLOAT}, {"SrcT", DT_INT64}}),
+       NDef("y/y", "Mul", {arg0, "y/scale"}, {{"T", DT_FLOAT}}),
+       NDef(ret0, "Identity", {"y/y"}, {{"T", DT_FLOAT}}),
+       NDef("z", "Identity", {ret0}, {{"T", DT_FLOAT}})},
+      {});
+  for (NodeDef& node : *expected.mutable_node()) node.set_device(kDevice);
+
+  CompareGraphs(expected, output);
 
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
@@ -115,82 +81,6 @@
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, InlineFunction_SkipErrorsIfGraphNotModified) {
-  using test::function::NDef;
-
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-
-  // Standard XTimesTwo() function.
-  FunctionDef x_times_two = test::function::XTimesTwo();
-
-  // Function signature has non-type attribute (currently not supported).
-  FunctionDef my_identity_n = FunctionDefHelper::Create(
-      // Name
-      "MyIdentityN",
-      // Args
-      {"x: N*T"},
-      // Return values
-      {"out: N*T"},
-      // Attrs
-      {"N:int", "T:{float, double, int32, int64}"},
-      // Nodes (just forward inputs through IdentityN)
-      {
-          {{"Id"}, "IdentityN", {"x"}, {{"T", "$T"}, {"N", "$N"}}},
-      },
-      // Output mapping
-      {{"out", "Id:output:0"}});
-
-  GrapplerItem item;
-  item.graph = test::function::GDef(
-      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("y1", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("y2", "MyIdentityN", {"x"}, {{"T", DT_FLOAT}, {"N", 1}}, kDevice),
-       NDef("z1", "Identity", {"y1:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("z2", "Identity", {"y2:0"}, {{"T", DT_FLOAT}}, kDevice)},
-      // FunctionLib
-      {x_times_two, my_identity_n});
-
-  GraphDef output;
-  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-
-  // Verify that only MyIdentityN is in the function library after optimization.
-  ASSERT_EQ(1, output.library().function().size());
-  EXPECT_EQ("MyIdentityN", output.library().function(0).signature().name());
-
-  // And that XTimesTwo was successfully inlined.
-  int found = 0;
-  for (const NodeDef& node : output.node()) {
-    if (node.name() == "y1/inlined_inputs") {
-      found++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-    } else if (node.name() == "y1") {
-      found++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y1/y", node.input(0));
-    } else if (node.name() == "y2") {
-      found++;
-      EXPECT_EQ("MyIdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-    }
-  }
-  EXPECT_EQ(3, found);
-
-  Tensor pi = test::AsScalar<float>(3.14f);
-  item.fetch = {"z1"};
-  item.feed.emplace_back("x", pi);
-  auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized = item.WithGraph(std::move(output));
-  auto tensors = EvaluateFetchNodes(optimized);
-  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
-}
-
 TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
   using test::function::NDef;
 
@@ -234,53 +124,12 @@
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  int count = 0;
+  // Calls to XTimesTwo were removed from the graph.
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "y/inlined_inputs") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-    } else if (node.name() == "y/x") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "y/two") {
-      count++;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^y/inlined_inputs", node.input(0));
-      EXPECT_EQ(kDevice, node.device());
-    } else if (node.name() == "y/y") {
-      count++;
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/two", node.input(1));
-    } else if (node.name() == "y") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y", node.input(0));
-    } else if (node.name() == "z") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-    } else if (node.name() == "y/enter") {
-      count++;
-      EXPECT_TRUE(IsEnter(node));
-      const string frame_name = node.attr().at("frame_name").s();
-      EXPECT_EQ("y/frame", frame_name);
-    }
+    EXPECT_NE(node.op(), "XTimesTwo");
   }
-  EXPECT_EQ(7, count);
+  // And the function itself was removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
 
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
@@ -324,47 +173,12 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  int count = 0;
+  // Function call was removed from the graph.
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "y/inlined_inputs") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-    } else if (node.name() == "y/in") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "y/Linear_func") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/in", node.input(0));
-    } else if (node.name() == "y/Exp") {
-      count++;
-      EXPECT_EQ("Exp", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Linear_func", node.input(0));
-    } else if (node.name() == "y") {
-      count++;
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Exp", node.input(0));
-    } else if (node.name() == "z") {
-      count++;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-    }
+    EXPECT_NE(node.op(), "Exp_func");
   }
-  EXPECT_EQ(6, count);
+  // And the function itself was removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
 
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
@@ -413,6 +227,13 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
+  // Function call was removed from the graph.
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.op(), "ForwardInputs");
+  }
+  // And the function itself was removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
+
   item.fetch = {"z0", "z1", "z2"};
   item.feed.emplace_back("x0", test::AsScalar<float>(3.14f));
   item.feed.emplace_back("x1", test::AsScalar<float>(2.7f));
@@ -431,7 +252,6 @@
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -449,7 +269,7 @@
 
   GrapplerItem item;
   item.graph = test::function::GDef(
-      {NDef("y", "GenerateTwo", {}, {}, kDevice),
+      {NDef("y", "GenerateTwo", {}, {{"T", DT_FLOAT}}, kDevice),
        NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
@@ -459,8 +279,18 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // For now we won't inline the function.
-  EXPECT_EQ(item.graph.DebugString(), output.DebugString());
+  // Function call was removed from the graph.
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.op(), "GenerateTwo");
+  }
+  // And the function itself was removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
+
+  item.fetch = {"z"};
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
@@ -494,58 +324,13 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  int count = 0;
+  // Function calls were removed from the graph.
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "square/inlined_inputs" && ++count) {
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("a", node.input(0));
-    } else if (node.name() == "square/x" && ++count) {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "square/output/inlined_inputs" && ++count) {
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("square/x", node.input(0));
-      EXPECT_EQ("square/x", node.input(1));
-    } else if (node.name() == "square/output/x" && ++count) {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "square/output/y" && ++count) {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
-    } else if (node.name() == "square/output/output" && ++count) {
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("square/output/x", node.input(0));
-      EXPECT_EQ("square/output/y", node.input(1));
-    } else if (node.name() == "square/output" && ++count) {
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output/output", node.input(0));
-    } else if (node.name() == "square" && ++count) {
-      EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output", node.input(0));
-    } else if (node.name() == "outputs" && ++count) {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(kDevice, node.device());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square:0", node.input(0));
-    }
+    EXPECT_NE(node.op(), "MySquare");
+    EXPECT_NE(node.op(), "MyMul");
   }
-  EXPECT_EQ(9, count);
+  // And functions were removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
 
   item.fetch = {"outputs"};
   item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
@@ -592,6 +377,13 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
+  // SymbolicGradient calls were removed from the graph.
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.op(), "SymbolicGradient");
+  }
+  // And functions were removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
+
   std::vector<Tensor> expected =
       EvaluateNodes(item.graph, {"out1", "out2"}, {});
   std::vector<Tensor> optimized = EvaluateNodes(output, {"out1", "out2"}, {});
@@ -634,34 +426,12 @@
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(13, output.node_size());
-  EXPECT_EQ("Const", output.node(0).name());
-  EXPECT_EQ("Const_1", output.node(1).name());
-  EXPECT_EQ("SymbolicGradient/FunctionInputs", output.node(2).name());
-  EXPECT_EQ("SymbolicGradient", output.node(3).name());
-  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Identity",
-            output.node(4).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/input/_0",
-            output.node(5).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/input/_1",
-            output.node(6).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/output/_2",
-            output.node(7).name());
-  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Func/_1/dx",
-            output.node(8).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/input/_3",
-            output.node(9).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/input/_4",
-            output.node(10).name());
-  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/output/_5",
-            output.node(11).name());
-  EXPECT_EQ("out", output.node(12).name());
-  for (int i = 2; i < 4; ++i) {
-    EXPECT_EQ("IdentityN", output.node(i).op());
+  // SymbolicGradient calls were removed from the graph.
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.op(), "SymbolicGradient");
   }
-  for (int i = 4; i < 11; ++i) {
-    EXPECT_EQ("Identity", output.node(i).op());
-  }
+  // And functions were removed from the library.
+  EXPECT_EQ(output.library().function_size(), 0);
 
   std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out"}, {});
   std::vector<Tensor> optimized = EvaluateNodes(output, {"out"}, {});
@@ -737,6 +507,10 @@
   item.feed.emplace_back("a", pi);
   item.feed.emplace_back("b", pi);
 
+  const string input_x = "Func/c/input/_0";
+  const string input_y = "Func/c/input/_1";
+  const string output_z = "Func/c/output/_2";
+
   // If device set is empty, inlined function body must not be placed.
   {
     GraphDef optimized_graph;
@@ -748,14 +522,14 @@
 
          // Function body nodes are not placed, however function input nodes
          // must copy device assignment from input arguments.
-         NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}),
-         NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-              kDevice),
-         NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-              kDevice),
-         NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}),
+         NDef(input_x, "Identity", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+         NDef(input_y, "Identity", {"b"}, {{"T", DT_FLOAT}}, kDevice),
+         // TODO(ezhulenev): Currently the inlined function body is "implicitly
+         // placed" with the 'inline_options.initialize_empty_device' flag.
+         NDef("c/mul", "Mul", {input_x, input_y}, {{"T", DT_FLOAT}}, kDevice),
+         NDef(output_z, "Identity", {"c/mul"}, {{"T", DT_FLOAT}}, kDevice),
 
-         NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+         NDef("d", "Identity", {output_z}, {{"T", DT_FLOAT}}, kDevice)},
         // Function library.
         {mul_func});
 
@@ -779,14 +553,12 @@
         {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
-         NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
-         NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-              kDevice),
-         NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-              kDevice),
-         NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
+         NDef(input_x, "Identity", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+         NDef(input_y, "Identity", {"b"}, {{"T", DT_FLOAT}}, kDevice),
+         NDef("c/mul", "Mul", {input_x, input_y}, {{"T", DT_FLOAT}}, kDevice),
+         NDef(output_z, "Identity", {"c/mul"}, {{"T", DT_FLOAT}}, kDevice),
 
-         NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+         NDef("d", "Identity", {output_z}, {{"T", DT_FLOAT}}, kDevice)},
         // Function library.
         {mul_func});
 
@@ -874,54 +646,68 @@
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Initialize variable with one of the placeholders.
-       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}},
+            kDevice),
        NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
             kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/inputs_ready", "NoOp", {"^a", "^b", "^v", "^init_v"}, {},
+       NDef("Func/f1/input_control_node/_0", "NoOp", {"^init_v"}, {}, kDevice),
+
+       NDef("Func/f1/input/_1", "Identity",  // input: 'x'
+            {"a", "^Func/f1/input_control_node/_0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("Func/f1/input/_2", "Identity",  // input: 'y'
+            {"b", "^Func/f1/input_control_node/_0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("Func/f1/input/_3", "Identity",  // input: 'v'
+            {"v", "^Func/f1/input_control_node/_0"}, {{"T", DT_RESOURCE}},
             kDevice),
 
-       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f1/v", "Identity", {"v:0", "^f1/inputs_ready"},
-            {{"T", DT_RESOURCE}}, kDevice),
-
-       NDef("f1/one", "Const", {"^f1/inputs_ready"},
+       NDef("f1/one", "Const", {"^Func/f1/input_control_node/_0"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
-       NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
+       NDef("f1/mul", "Mul", {"Func/f1/input/_1", "Func/f1/input/_2"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/add", "AssignAddVariableOp", {"Func/f1/input/_3", "f1/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
 
-       NDef("f1/side_effects_executed", "NoOp", {"^f1/add"}, {}, kDevice),
+       NDef("Func/f1/output/_4", "Identity", {"f1/mul"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("Func/f1/output_control_node/_5", "NoOp", {"^f1/add"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
-       // and input nodes read directly from the inlined nodes of the first
-       // function call.
-       NDef("f2/inputs_ready", "NoOp",
-            {"^v", "^f1/mul", "^f1/side_effects_executed"}, {}, kDevice),
+       // and input nodes read from the output nodes of the first function call.
+       NDef("Func/f2/input_control_node/_6", "NoOp",
+            {"^Func/f1/output_control_node/_5"}, {}, kDevice),
 
-       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+       NDef("Func/f2/input/_7", "Identity",  // input: 'x'
+            {"Func/f1/output/_4", "^Func/f2/input_control_node/_6"},
             {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+       NDef("Func/f2/input/_8", "Identity",  // input: 'y'
+            {"Func/f1/output/_4", "^Func/f2/input_control_node/_6"},
             {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/v", "Identity", {"v:0", "^f2/inputs_ready"},
-            {{"T", DT_RESOURCE}}, kDevice),
+       NDef("Func/f2/input/_9", "Identity",  // input: 'v'
+            {"v", "^Func/f2/input_control_node/_6"}, {{"T", DT_RESOURCE}},
+            kDevice),
 
-       NDef("f2/one", "Const", {"^f2/inputs_ready"},
+       NDef("f2/one", "Const", {"^Func/f2/input_control_node/_6"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
-       NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
+       NDef("f2/add", "AssignAddVariableOp", {"Func/f2/input/_9", "f2/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"Func/f2/input/_7", "Func/f2/input/_8"},
+            {{"T", DT_FLOAT}}, kDevice),
 
-       NDef("f2/side_effects_executed", "NoOp", {"^f2/add"}, {}, kDevice),
+       NDef("Func/f2/output/_10", "Identity", {"f2/mul"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("Func/f2/output_control_node/_11", "NoOp", {"^f2/add"}, {},
+            kDevice),
 
-       // Return values read directly from inlined nodes.
-       NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       // Return values read from inlined output nodes.
+       NDef("out_1", "Identity", {"Func/f2/output/_10"}, {{"T", DT_FLOAT}},
+            kDevice),
        NDef("out_2", "ReadVariableOp",
-            {"v", "^f1/side_effects_executed", "^f2/side_effects_executed"},
+            {"v", "^Func/f1/output_control_node/_5",
+             "^Func/f2/output_control_node/_11"},
             {{"dtype", DT_FLOAT}}, kDevice)},
 
       // Function library.
@@ -981,20 +767,22 @@
   GraphDef optimized_graph;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
 
+  const string input_x = "Func/c/input/_0";
+  const string input_y = "Func/c/input/_1";
+  const string output_z = "Func/c/output/_2";
+
   GraphDef expected = test::function::GDef(
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
 
        // Function must be inlined and `mul` node placed on a requested device,
        // and input `Identity` nodes must be colocated with their source nodes.
-       NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, cpu0),
-       NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-            cpu0),
-       NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
-            cpu1),
-       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef(input_x, "Identity", {"a"}, {{"T", DT_FLOAT}}, cpu0),
+       NDef(input_y, "Identity", {"b"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/mul", "Mul", {input_x, input_y}, {{"T", DT_FLOAT}}, cpu1),
+       NDef(output_z, "Identity", {"c/mul"}, {{"T", DT_FLOAT}}, cpu1),
 
-       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
+       NDef("d", "Identity", {output_z}, {{"T", DT_FLOAT}}, cpu0)},
       // Function library.
       {mul_func});
 
@@ -1033,8 +821,10 @@
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
+       NDef("c", "NoOp", {}, {}, kDevice),
+
        // Call function first time.
-       NDef("f1", "PartitionedCall", {"a", "b"},
+       NDef("f1", "PartitionedCall", {"a", "b", "^c"},
             {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
              {"Tout", DataTypeSlice{DT_FLOAT}},
              {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
@@ -1060,31 +850,49 @@
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
+       NDef("c", "NoOp", {}, {}, kDevice),
+
        // Function body of a first function call inlined into the graph.
-       NDef("f1/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
-       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+       NDef("Func/f1/input_control_node/_0", "NoOp", {"^c"}, {}, kDevice),
+
+       NDef("Func/f1/input/_1", "Identity",  // input: 'x'
+            {"a", "^Func/f1/input_control_node/_0"}, {{"T", DT_FLOAT}},
             kDevice),
-       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+       NDef("Func/f1/input/_2", "Identity",  // input: 'y'
+            {"b", "^Func/f1/input_control_node/_0"}, {{"T", DT_FLOAT}},
             kDevice),
-       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
-       // Control input from `inputs_ready` node is added to ensure correct
-       // frame execution.
-       NDef("f1/side_effects_executed", "NoOp", {"^f1/inputs_ready"}, {},
+
+       NDef("f1/mul", "Mul", {"Func/f1/input/_1", "Func/f1/input/_2"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("Func/f1/output/_3", "Identity", {"f1/mul"}, {{"T", DT_FLOAT}},
             kDevice),
+       // Control input from `input_control_node` node is added to ensure
+       // correct frame execution.
+       NDef("Func/f1/output_control_node/_4", "NoOp",
+            {"^Func/f1/input_control_node/_0"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
-       // and input nodes read directly from the inlined nodes of the first
+       // and input nodes read directly from the output nodes of the first
        // function call, and control dependency edge removed.
-       NDef("f2/inputs_ready", "NoOp", {"^f1/mul", "^f1/side_effects_executed"},
-            {}, kDevice),
-       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
-            {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
-            {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("Func/f2/input_control_node/_5", "NoOp",
+            {"^Func/f1/output_control_node/_4"}, {}, kDevice),
 
-       // Return directly from inlined node of f2.
-       NDef("out", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+       NDef("Func/f2/input/_6", "Identity",
+            {"Func/f1/output/_3", "^Func/f2/input_control_node/_5"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("Func/f2/input/_7", "Identity",
+            {"Func/f1/output/_3", "^Func/f2/input_control_node/_5"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("f2/mul", "Mul", {"Func/f2/input/_6", "Func/f2/input/_7"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("Func/f2/output/_8", "Identity", {"f2/mul"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Return directly from output node of f2.
+       NDef("out", "Identity", {"Func/f2/output/_8"}, {{"T", DT_FLOAT}},
+            kDevice)},
 
       // Function library.
       {mul_func});
@@ -1227,22 +1035,24 @@
        NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("fn/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
-       NDef("fn/x", "Identity", {"a:0", "^fn/inputs_ready"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("fn/cond", "Identity", {"b:0", "^fn/inputs_ready"},
-            {{"T", DT_BOOL}}, kDevice),
-       NDef("fn/switch", "Switch", {"fn/x:0", "fn/cond:0"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("fn/if_false", "Identity", {"fn/switch:0"}, {{"T", DT_FLOAT}},
+       NDef("Func/fn/input/_0", "Identity", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("Func/fn/input/_1", "Identity", {"b"}, {{"T", DT_BOOL}}, kDevice),
+
+       NDef("fn/switch", "Switch", {"Func/fn/input/_0", "Func/fn/input/_1"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("fn/if_false", "Identity", {"fn/switch"}, {{"T", DT_FLOAT}},
             kDevice),
        NDef("fn/if_true", "Identity", {"fn/switch:1"}, {{"T", DT_FLOAT}},
             kDevice),
-       NDef("fn/merge", "Merge", {"fn/if_false:0", "fn/if_true:0"},
+       NDef("fn/merge", "Merge", {"fn/if_false", "fn/if_true"},
             {{"T", DT_FLOAT}, {"N", 2}}, kDevice),
 
-       // Return directly from inlined node.
-       NDef("out", "Identity", {"fn/merge:0"}, {{"T", DT_FLOAT}}, kDevice)},
+       NDef("Func/fn/output/_2", "Identity", {"fn/merge"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Return directly from inlined function output node.
+       NDef("out", "Identity", {"Func/fn/output/_2"}, {{"T", DT_FLOAT}},
+            kDevice)},
 
       // Function library.
       {no_dead_outputs});
@@ -1312,22 +1122,25 @@
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Inlined inputs of `b` node.
-       NDef("b/inputs_ready", "NoOp", {"^a"}, {}, kDevice),
-       NDef("b/x", "Identity", {"a:0", "^b/inputs_ready"}, {{"T", DT_FLOAT}},
-            kDevice),
+       NDef("Func/b/input/_0", "Identity", {"a"}, {{"T", DT_FLOAT}}, kDevice),
 
        // Inlined inputs of `square` node inside inlined `MySquare` function.
-       NDef("b/square/inputs_ready", "NoOp", {"^b/x"}, {}, kDevice),
-       NDef("b/square/x", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+       NDef("Func/b/square/input/_2", "Identity", {"Func/b/input/_0"},
             {{"T", DT_FLOAT}}, kDevice),
-       NDef("b/square/y", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+       NDef("Func/b/square/input/_3", "Identity", {"Func/b/input/_0"},
             {{"T", DT_FLOAT}}, kDevice),
 
        // Inlined mul node from the `MyMul` function.
-       NDef("b/square/mul", "Mul", {"b/square/x", "b/square/y"},
+       NDef("b/square/mul", "Mul",
+            {"Func/b/square/input/_2", "Func/b/square/input/_3"},
             {{"T", DT_FLOAT}}, kDevice),
 
-       NDef("c", "Identity", {"b/square/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+       NDef("Func/b/square/output/_4", "Identity", {"b/square/mul"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("Func/b/output/_1", "Identity", {"Func/b/square/output/_4"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("c", "Identity", {"Func/b/output/_1"}, {{"T", DT_FLOAT}}, kDevice)},
       // Function library.
       {mul_func});
 
@@ -1379,7 +1192,7 @@
            }},
       },
       /* Mapping between function returns and function node outputs. */
-      {{"z", "if_node:output:0"}});
+      {{"z", "if_node:output:0"}}, {{"side_effect", "if_node"}});
 
   // Build a computation graph for:
   //   is_add: bool
@@ -1403,7 +1216,7 @@
              {"f", FDH::FunctionRef("AddOrMul")}},
             kDevice),
 
-       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+       NDef("d", "Identity", {"c", "^c"}, {{"T", DT_FLOAT}}, kDevice)},
       // Function library.
       {add_or_mul_func, add_func, mul_func});
 
@@ -1925,7 +1738,7 @@
     } else if (node.name() == "use_fn4_2" && ++found) {
       EXPECT_EQ("fn4:0", node.input(0));
     } else if (node.name() == "use_fn5_0" && ++found) {
-      EXPECT_EQ("fn5:0", node.input(0));
+      EXPECT_EQ("fn5", node.input(0));
     } else if (node.name() == "use_fn5_2" && ++found) {
       EXPECT_EQ("fn5:1", node.input(0));
     }
@@ -2086,7 +1899,7 @@
     } else if (node.name() == "use_fn4_2" && ++found) {
       EXPECT_EQ("fn4:0", node.input(0));
     } else if (node.name() == "use_fn5_0" && ++found) {
-      EXPECT_EQ("fn5:0", node.input(0));
+      EXPECT_EQ("fn5", node.input(0));
     } else if (node.name() == "use_fn5_2" && ++found) {
       EXPECT_EQ("fn5:1", node.input(0));
     }
@@ -2112,10 +1925,10 @@
 TEST_F(FunctionOptimizerTest, PruningUselessLibraryFunctions) {
   using test::function::NDef;
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  DisableFunctionSpecialization(&optimizer);
   auto func = test::function::XTimesTwo();
   (*func.mutable_attr())["_noinline"].set_b(true);
   GrapplerItem item;
+  item.id = "test_graph";
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, "/device:CPU:0"),
        NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, "/device:CPU:0"),
@@ -2130,8 +1943,9 @@
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(output.library().function().size(), 1);
-  EXPECT_EQ(output.library().function(0).signature().name(), "XTimesTwo");
+  ASSERT_EQ(output.library().function().size(), 1);
+  EXPECT_EQ(output.library().function(0).signature().name(),
+            "XTimesTwo_specialized_for_y_at_test_graph");
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc
index 7dff0b5..ddb09ea 100644
--- a/tensorflow/core/grappler/optimizers/implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc
@@ -188,7 +188,16 @@
                                         const GrapplerItem& item,
                                         GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
-  TF_RETURN_IF_ERROR(LoadFunctions(*optimized_graph));
+  auto status = LoadFunctions(*optimized_graph);
+  // Eat up the error from function loading, since this optimizer might run
+  // several times, and might try to run against functions generated by
+  // function_optimizer from previous runs, which will fail due to function
+  // signature mismatch.
+  if (!status.ok()) {
+    LOG(WARNING) << "Skipping optimization due to error while loading function "
+                 << "libraries: " << status;
+    return Status::OK();
+  }
   return SelectImplementation(optimized_graph);
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 6e68a53..f4be384 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -15,6 +15,7 @@
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
+#include "absl/strings/str_join.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -40,6 +41,7 @@
 #include "tensorflow/core/grappler/optimizers/remapper.h"
 #include "tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+#include "tensorflow/core/grappler/utils/canonicalizer.h"
 #include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
@@ -97,18 +99,6 @@
   }
 }
 
-Status CompressConstants(GraphDef* graph) {
-  for (int i = 0; i < graph->node_size(); ++i) {
-    NodeDef* node = graph->mutable_node(i);
-    if ((IsConstant(*node) || IsHostConstant(*node)) &&
-        HasNodeAttr(*node, "value")) {
-      AttrValue& attr_val = (*node->mutable_attr())["value"];
-      tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor());
-    }
-  }
-  return Status::OK();
-}
-
 // A helper function to decide whether to enable the automatic mixed precision
 // optimizer.
 bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) {
@@ -388,6 +378,7 @@
                           reinterpret_cast<uintptr_t>(optimized_graph)),
           *optimized_graph);
     }
+
     for (const auto& optimizer : optimizers) {
       GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
       // Some optimizers can run only once.
@@ -446,9 +437,6 @@
                                     optimized_graph, &optimization_result));
   }
 
-  // Compress the constants in the final graph.
-  TF_RETURN_IF_ERROR(CompressConstants(optimized_graph));
-
   bool is_optimized = std::find_if(optimization_result.results.begin(),
                                    optimization_result.results.end(),
                                    [](const OptimizerResult& result) {
@@ -459,6 +447,9 @@
   optimization_results_.push_back(optimization_result);
 
   if (is_optimized) {
+    // Compress the constants in the graph.
+    CompressConstants(optimized_graph);
+
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version.
@@ -775,6 +766,8 @@
     Status added_device = item.AddDevice(d->name());
     if (!added_device.ok()) VLOG(3) << added_device.error_message();
   }
+  VLOG(3) << "Grappler available devices: "
+          << absl::StrJoin(item.devices(), ", ");
 
   // Add fetches so that the graph can be pruned.
   item.fetch.swap(ret_node_names);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index dca7b60..da30c7b 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -364,32 +364,25 @@
   for (const FunctionDef* optimized_func : optimized_funcs) {
     count = 0;
     for (const NodeDef& node : optimized_func->node_def()) {
-      if (node.name() == "my_mul/inlined_inputs" && ++count) {
-        EXPECT_EQ("IdentityN", node.op());
-        EXPECT_EQ(2, node.input_size());
+      if (node.name() == "Func/my_mul/input/_0" && ++count) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
         EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("x", node.input(1));
-      } else if (node.name() == "my_mul/x" && ++count) {
+      } else if (node.name() == "Func/my_mul/input/_1" && ++count) {
         EXPECT_EQ("Identity", node.op());
         EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
-      } else if (node.name() == "my_mul/y" && ++count) {
-        EXPECT_EQ("Identity", node.op());
-        EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
+        EXPECT_EQ("x", node.input(0));
       } else if (node.name() == "my_mul/mul" && ++count) {
         EXPECT_EQ("Mul", node.op());
         EXPECT_EQ(2, node.input_size());
-        EXPECT_EQ("my_mul/x:output:0", node.input(0));
-        EXPECT_EQ("my_mul/y:output:0", node.input(1));
-      } else if (node.name() == "my_mul" && ++count) {
-        EXPECT_EQ("IdentityN", node.op());
-        EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/mul:z:0", node.input(0));
+        EXPECT_EQ("Func/my_mul/input/_0:output:0", node.input(0));
+        EXPECT_EQ("Func/my_mul/input/_1:output:0", node.input(1));
       }
       EXPECT_TRUE(node.device().empty());
     }
-    EXPECT_EQ(5, count);
+    EXPECT_EQ(3, count);
+    ASSERT_EQ(1, optimized_func->ret().size());
+    EXPECT_EQ("Func/my_mul/output/_2:output:0", optimized_func->ret().at("z"));
   }
 
   item.fetch = {"out_s", "out_q"};
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 65611ec..a6ab196 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -540,6 +540,7 @@
   (*attr)["T"] = src_attr.at("T");
   (*attr)["strides"] = src_attr.at("strides");
   (*attr)["padding"] = src_attr.at("padding");
+  (*attr)["explicit_paddings"] = src_attr.at("explicit_paddings");
   (*attr)["dilations"] = src_attr.at("dilations");
   (*attr)["data_format"] = src_attr.at("data_format");
   (*attr)["use_cudnn_on_gpu"] = src_attr.at("use_cudnn_on_gpu");
@@ -878,9 +879,11 @@
   FusedBatchNorm                        fused_batch_norm;
   ContractionWithBiasAdd                contract_with_bias;
   ContractionWithBiasAddAndActivation   contract_with_bias_and_activation;
+#ifndef INTEL_MKL
   ContractionWithBatchNorm              contract_with_batch_norm;
   ContractionWithBatchNormAndActivation contract_with_batch_norm_and_activation;
   ContractionWithSqueezeAndBiasAdd      contract_with_squeeze_and_bias;
+#endif  // INTEL_MKL
   // clang-format on
 
   // Processing graph in reverse-topological sorted order allows to remap
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index e0cd49a..2591423 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -406,7 +406,9 @@
   auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
 
   std::vector<int> strides = {1, 1, 1, 1};
-  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  auto conv = ops::Conv2D(
+      s.WithOpName("conv"), input, filter, strides, "EXPLICIT",
+      ops::Conv2D::Attrs().ExplicitPaddings({0, 0, 1, 2, 3, 4, 0, 0}));
   ops::FusedBatchNorm::Attrs attrs;
   attrs = attrs.IsTraining(false);
   auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
index 7dae0e3..08237d7 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
@@ -73,15 +73,29 @@
           continue;
         }
         // Rewrite the reduction of the shape dimensions as a Size operation.
+        NodeDef size_node(*fanout.node);
         const DataType type = input_props[0].dtype();
-        fanout.node->set_op("Size");
-        fanout.node->set_input(0, node.input(0));
-        fanout.node->set_input(1, AsControlDependency(node));
-        fanout.node->mutable_attr()->erase("Tidx");
-        fanout.node->mutable_attr()->erase("keep_dims");
-        (*fanout.node->mutable_attr())["out_type"] =
-            fanout.node->attr().at("T");
-        (*fanout.node->mutable_attr())["T"].set_type(type);
+        size_node.set_op("Size");
+        size_node.set_input(0, node.input(0));
+        size_node.set_input(1, AsControlDependency(node));
+        size_node.mutable_attr()->erase("Tidx");
+        size_node.mutable_attr()->erase("keep_dims");
+        (*size_node.mutable_attr())["out_type"] = fanout.node->attr().at("T");
+        (*size_node.mutable_attr())["T"].set_type(type);
+
+        // The corresponding Size kernel might not exist on the device where
+        // Prod was placed, so assign the Size kernel to the same device as the
+        // input.
+        size_node.set_device(node.device());
+
+        // In the unlikely event that "Size" is not registered on the input
+        // device, skip the optimization.
+        Status s = IsKernelRegisteredForNode(size_node);
+        if (!s.ok()) {
+          continue;
+        }
+
+        fanout.node->Swap(&size_node);
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc b/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc
index 95a5ecc..ff2d838 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc
@@ -14,7 +14,9 @@
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -29,7 +31,9 @@
 class ShapeOptimizerTest : public GrapplerTest {};
 
 TEST_F(ShapeOptimizerTest, OptimizeShapeProduct) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  // Set the device to CPU zero, because the shape optimizer will only optimize
+  // Prod to Size when a concrete Size kernel is available.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/cpu:0");
   Output a = ops::Const(s.WithOpName("a"), 3.14f, {32, 16});
   Output c = ops::Shape(s.WithOpName("c"), a);
   Output d = ops::Const(s.WithOpName("d"), 0, {1});
@@ -68,6 +72,63 @@
               tensors_actual[1].scalar<int>()(), 0);
 }
 
+TEST_F(ShapeOptimizerTest, OptimizeShapeProductMissingKernel) {
+  {
+    // Skip this test if no GPU is available.
+    std::vector<std::unique_ptr<Device>> devices;
+    SessionOptions session_options;
+    session_options.config.mutable_gpu_options()
+        ->set_per_process_gpu_memory_fraction(0.1);
+    session_options.env = Env::Default();
+    TF_CHECK_OK(DeviceFactory::GetFactory(DEVICE_GPU)
+                    ->AddDevices(session_options, "", &devices));
+    bool found_gpu = false;
+    for (const auto& d : devices) {
+      if (d->device_type() == DEVICE_GPU) {
+        found_gpu = true;
+        break;
+      }
+    }
+    if (!found_gpu) {
+      LOG(INFO) << "Skipping test that requires GPU.";
+      return;
+    }
+  }
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/cpu:0");
+  Output a = ops::Const(s.WithOpName("a"), string("Hello"), {32, 16});
+  Output c = ops::Shape(s.WithOpName("c"), a);
+  Output d = ops::Const(s.WithOpName("d"), 0, {1});
+  ops::ReduceProd::Attrs attrs;
+  Output e = ops::ReduceProd(s.WithDevice("/gpu:0").WithOpName("e"), c, d,
+                             attrs.KeepDims(false));
+
+  GrapplerItem item;
+  item.fetch = {"e"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+
+  GraphDef output;
+  ShapeOptimizer optimizer;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "e") {
+      found++;
+      EXPECT_EQ("Size", node.op());
+      EXPECT_EQ("a", node.input(0));
+      EXPECT_EQ("/cpu:0", node.device());
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_actual = EvaluateNodes(output, item.fetch);
+  EXPECT_NEAR(tensors_expected[0].scalar<int>()(),
+              tensors_actual[0].scalar<int>()(), 0);
+}
+
 TEST_F(ShapeOptimizerTest, OptimizeShapeRatio) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 3.14f, {32, 32});
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 34bbf94..acbb81a 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -493,13 +493,26 @@
   return Status::OK();
 }
 
-Status IsKernelRegisteredForNode(const NodeDef& node) {
+Status IsKernelRegisteredForNode(
+    absl::string_view node_name, bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info,
+    absl::string_view node_op, absl::string_view node_device,
+    AttrSlice node_attrs) {
   DeviceNameUtils::ParsedName parsed_name;
-  if (!DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+  if (!DeviceNameUtils::ParseFullName(node_device, &parsed_name)) {
     return errors::InvalidArgument("Could not parse device name: ",
-                                   node.device());
+                                   node_device);
   }
-  return FindKernelDef(DeviceType(parsed_name.type), node, nullptr, nullptr);
+  return FindKernelDef(DeviceType(parsed_name.type), node_name,
+                       has_experimental_debug_info, experimental_debug_info,
+                       node_op, node_device, node_attrs, nullptr, nullptr);
+}
+
+Status IsKernelRegisteredForNode(const NodeDef& node) {
+  return IsKernelRegisteredForNode(node.name(),
+                                   node.has_experimental_debug_info(),
+                                   node.experimental_debug_info(), node.op(),
+                                   node.device(), AttrSlice(&node.attr()));
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 6d09376..700e431 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -298,6 +298,11 @@
 
 // Returns Status::OK() if a kernel is registered for node.op() on the device
 // type corresponding to node.device().
+Status IsKernelRegisteredForNode(
+    absl::string_view node_name, bool has_experimental_debug_info,
+    const NodeDef_ExperimentalDebugInfo& experimental_debug_info,
+    absl::string_view node_op, absl::string_view node_device,
+    AttrSlice node_attrs);
 Status IsKernelRegisteredForNode(const NodeDef& node);
 
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor);
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index b24f27d..2c3ae8b 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -277,3 +277,29 @@
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "canonicalizer",
+    srcs = ["canonicalizer.cc"],
+    hdrs = ["canonicalizer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "canonicalizer_test",
+    size = "small",
+    srcs = ["canonicalizer_test.cc"],
+    deps = [
+        ":canonicalizer",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/canonicalizer.cc b/tensorflow/core/grappler/utils/canonicalizer.cc
new file mode 100644
index 0000000..a30d97b
--- /dev/null
+++ b/tensorflow/core/grappler/utils/canonicalizer.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/canonicalizer.h"
+
+#include <algorithm>
+
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+void CanonicalizeNode(NodeDef* node) {
+  if (node->input_size() < 2) return;
+  // Partition control and regular inputs.
+  int index = 0;
+  for (; index < node->input_size(); ++index) {
+    if (IsControlInput(node->input(index))) {
+      break;
+    }
+  }
+  auto* input = node->mutable_input();
+  // Maybe sort regular inputs.
+  if (IsCommutative(*node) && index > 0) {
+    std::sort(input->begin(), input->begin() + index);
+  }
+  // Sort and dedup control inputs.
+  if (index < node->input_size()) {
+    std::sort(input->begin() + index, input->end());
+    input->erase(std::unique(input->begin() + index, input->end()),
+                 input->end());
+  }
+}
+
+void CanonicalizeGraph(GraphDef* graph) {
+  for (int i = 0; i < graph->node_size(); ++i) {
+    CanonicalizeNode(graph->mutable_node(i));
+  }
+}
+
+void CompressConstants(GraphDef* graph) {
+  for (int i = 0; i < graph->node_size(); ++i) {
+    NodeDef* node = graph->mutable_node(i);
+    if ((IsConstant(*node) || IsHostConstant(*node)) &&
+        HasNodeAttr(*node, "value")) {
+      AttrValue& attr_val = (*node->mutable_attr())["value"];
+      tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor());
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/canonicalizer.h b/tensorflow/core/grappler/utils/canonicalizer.h
new file mode 100644
index 0000000..a913fc2
--- /dev/null
+++ b/tensorflow/core/grappler/utils/canonicalizer.h
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Canonicalizes node by performing the following steps:
+//  - sorting and deduplicating control inputs,
+//  - sorting data inputs if the node represents a commutative op.
+void CanonicalizeNode(NodeDef* node);
+
+// Canonicalizes all nodes in graph.
+void CanonicalizeGraph(GraphDef* graph);
+
+// Compresses Const and HostConstant nodes in the graph to the smallest
+// representation possible, either
+//   a) truncated repeated field representation, or
+//   b) raw serialized byte format.
+// Each node is only modified if it is larger than 64 bytes and compression
+// reduces its size by more than 50%.
+void CompressConstants(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_
diff --git a/tensorflow/core/grappler/utils/canonicalizer_test.cc b/tensorflow/core/grappler/utils/canonicalizer_test.cc
new file mode 100644
index 0000000..2a1ba92
--- /dev/null
+++ b/tensorflow/core/grappler/utils/canonicalizer_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/canonicalizer.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeNode(const string& op) {
+  NodeDef node;
+  node.set_name("node");
+  node.set_op(op);
+  *node.add_input() = "b";
+  *node.add_input() = "a";
+  *node.add_input() = "^z";
+  *node.add_input() = "^y";
+  *node.add_input() = "^x";
+  *node.add_input() = "^z";
+  return node;
+}
+
+void Verify(const NodeDef& node) {
+  EXPECT_EQ(node.name(), "node");
+  ASSERT_EQ(node.input_size(), 5);
+  if (node.op() == "Div") {
+    EXPECT_EQ(node.input(0), "b");
+    EXPECT_EQ(node.input(1), "a");
+  } else {
+    EXPECT_EQ(node.input(0), "a");
+    EXPECT_EQ(node.input(1), "b");
+  }
+  EXPECT_EQ(node.input(2), "^x");
+  EXPECT_EQ(node.input(3), "^y");
+  EXPECT_EQ(node.input(4), "^z");
+}
+
+TEST(CanonicalizeNode, NonCommutative) {
+  NodeDef node = MakeNode("Div");
+  CanonicalizeNode(&node);
+  Verify(node);
+}
+
+TEST(CanonicalizeNode, Commutative) {
+  NodeDef node = MakeNode("Mul");
+  CanonicalizeNode(&node);
+  Verify(node);
+}
+
+TEST(CanonicalizeGraph, Simple) {
+  GraphDef graph;
+  *graph.add_node() = MakeNode("Div");
+  *graph.add_node() = MakeNode("Mul");
+  CanonicalizeGraph(&graph);
+  for (auto node : graph.node()) {
+    Verify(node);
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.cc b/tensorflow/core/grappler/verifiers/structure_verifier.cc
index 2b438b5..819605d 100644
--- a/tensorflow/core/grappler/verifiers/structure_verifier.cc
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.cc
@@ -42,7 +42,7 @@
 
   std::vector<const NodeDef*> topo_order;
   status_group.Update(ComputeTopologicalOrder(graph, &topo_order));
-  return status_group.as_status();
+  return status_group.as_concatenated_status();
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index e2e32ed..e377571 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -31,6 +31,7 @@
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
     "if_android",
+    "if_cuda_or_rocm",
     "if_not_windows",
     "tf_cc_binary",
     "tf_cc_shared_object",
@@ -64,6 +65,11 @@
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//tensorflow:tensorflow.bzl", "if_nccl")
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+    "if_rocm_is_configured",
+)
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -197,6 +203,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler/lib:traceme",
     ] + if_nccl([
         "@local_config_nccl//:nccl",
         "//tensorflow/core/nccl:nccl_lib",
@@ -741,6 +748,7 @@
         "//tensorflow:android": [],
         "//tensorflow:arm": [],
         "//tensorflow:ios": [],
+        "//tensorflow:linux_aarch64": [],
         "//tensorflow:linux_ppc64le": [],
         "//conditions:default": [
             "TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL",
@@ -753,6 +761,7 @@
         "//tensorflow:android": [],
         "//tensorflow:arm": [],
         "//tensorflow:ios": [],
+        "//tensorflow:linux_aarch64": [],
         "//tensorflow:linux_ppc64le": [],
         "//conditions:default": ["@mkl_dnn//:mkldnn_single_threaded"],
     }),
@@ -932,6 +941,7 @@
         ":edit_distance_op",
         ":extract_image_patches_op",
         ":extract_volume_patches_op",
+        ":fingerprint_op",
         ":gather_nd_op",
         ":gather_op",
         ":guarantee_const_op",
@@ -1022,6 +1032,28 @@
 )
 
 tf_kernel_library(
+    name = "fingerprint_op",
+    prefix = "fingerprint_op",
+    deps = ARRAY_DEPS,
+)
+
+tf_cc_test(
+    name = "fingerprint_op_test",
+    size = "small",
+    srcs = ["fingerprint_op_test.cc"],
+    kernels = [":fingerprint_op"],
+    deps = [
+        ":ops_testutil",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+tf_kernel_library(
     name = "gather_nd_op",
     prefix = "gather_nd_op",
     deps = ARRAY_DEPS,
@@ -1260,6 +1292,8 @@
     deps = if_cuda([
         ":cuda_solvers",
         "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]) + ARRAY_DEPS,
 )
 
@@ -1268,7 +1302,9 @@
     deps = [
         ":ragged_gather_op",
         ":ragged_range_op",
+        ":ragged_tensor_from_variant_op",
         ":ragged_tensor_to_sparse_kernel",
+        ":ragged_tensor_to_variant_op",
     ],
 )
 
@@ -1339,6 +1375,56 @@
 )
 
 tf_kernel_library(
+    name = "ragged_tensor_to_variant_op",
+    srcs = ["ragged_tensor_to_variant_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "ragged_tensor_from_variant_op",
+    srcs = ["ragged_tensor_from_variant_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "ragged_tensor_to_variant_op_test",
+    size = "small",
+    srcs = ["ragged_tensor_to_variant_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ragged_tensor_to_variant_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "ragged_tensor_from_variant_op_test",
+    size = "small",
+    srcs = ["ragged_tensor_from_variant_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ragged_tensor_from_variant_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_kernel_library(
     name = "cudnn_rnn_kernels",
     srcs = ["cudnn_rnn_ops.cc"],
     visibility = ["//visibility:public"],
@@ -1520,10 +1606,10 @@
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/stream_executor/cuda:cudnn_plugin",
     ],
 )
 
@@ -1648,6 +1734,7 @@
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/stream_executor/cuda:cudnn_plugin",
     ],
 )
 
@@ -2229,7 +2316,9 @@
     deps = DYNAMIC_DEPS + [
         ":fill_functor",
         ":gather_functor",
-    ] + if_cuda(["@cub_archive//:cub"]),
+    ] + if_cuda(["@cub_archive//:cub"]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -2472,6 +2561,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/profiler/lib:traceme",
     ],
 )
 
@@ -2495,6 +2585,7 @@
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
@@ -3009,6 +3100,7 @@
         ":self_adjoint_eig_op",
         ":self_adjoint_eig_v2_op",
         ":svd_op",
+        ":tridiagonal_matmul_op",
         ":tridiagonal_solve_op",
     ],
 )
@@ -3148,6 +3240,12 @@
 )
 
 tf_kernel_library(
+    name = "tridiagonal_matmul_op",
+    srcs = ["tridiagonal_matmul_op.cc"],
+    deps = LINALG_DEPS,
+)
+
+tf_kernel_library(
     name = "tridiagonal_solve_op",
     srcs = ["tridiagonal_solve_op.cc"],
     gpu_srcs = ["tridiagonal_solve_op_gpu.cu.cc"],
@@ -3481,7 +3579,11 @@
     name = "reduction_ops",
     gpu_srcs = ["reduction_gpu_kernels.cu.h"],
     prefix = "reduction_ops",
-    deps = MATH_DEPS + [":transpose_functor"] + if_cuda(["@cub_archive//:cub"]),
+    deps = MATH_DEPS + [":transpose_functor"] + if_cuda([
+        "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -3503,7 +3605,11 @@
         "scan_ops_gpu_float.cu.cc",
         "scan_ops_gpu_half.cu.cc",
     ],
-    deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
+    deps = MATH_DEPS + if_cuda([
+        "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -3665,19 +3771,6 @@
     ],
 )
 
-tf_cc_test(
-    name = "batch_matmul_op_common_test",
-    size = "small",
-    srcs = ["batch_matmul_op_common_test.cc"],
-    deps = [
-        ":batch_matmul_op",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cuda_cc_test(
     name = "batch_matmul_op_test",
     size = "small",
@@ -3943,6 +4036,8 @@
     ] + if_cuda([
         "@cub_archive//:cub",
         "@local_config_cuda//cuda:cudnn_header",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]),
 )
 
@@ -4023,11 +4118,16 @@
 tf_kernel_library(
     name = "bias_op",
     prefix = "bias_op",
-    deps = NN_DEPS + [":redux_functor"] + if_cuda([
+    deps = NN_DEPS + [
+        ":redux_functor",
+    ] + if_cuda_or_rocm([
         ":reduction_ops",
+    ]) + if_cuda([
         "@cub_archive//:cub",
         "//tensorflow/core:stream_executor",
         "//tensorflow/stream_executor/cuda:cuda_stream",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]),
 )
 
@@ -4060,9 +4160,12 @@
 tf_kernel_library(
     name = "softmax_op",
     prefix = "softmax_op",
-    deps = NN_DEPS + if_cuda([
+    deps = NN_DEPS + if_cuda_or_rocm([
         ":reduction_ops",
+    ]) + if_cuda([
         "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]),
 )
 
@@ -4095,7 +4198,11 @@
         "topk_op_gpu_int8.cu.cc",
         "topk_op_gpu_uint8.cu.cc",
     ],
-    deps = NN_DEPS + if_cuda(["@cub_archive//:cub"]),
+    deps = NN_DEPS + if_cuda([
+        "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -4118,7 +4225,9 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
-    ] + if_cuda(["@cub_archive//:cub"]),
+    ] + if_cuda(["@cub_archive//:cub"]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -4129,7 +4238,9 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
-    ] + if_cuda(["@cub_archive//:cub"]),
+    ] + if_cuda(["@cub_archive//:cub"]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_kernel_library(
@@ -4142,7 +4253,9 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_grad",
-    ] + if_cuda(["@cub_archive//:cub"]),
+    ] + if_cuda(["@cub_archive//:cub"]) + if_rocm([
+        "@rocprim_archive//:rocprim",
+    ]),
 )
 
 tf_cuda_cc_test(
@@ -4364,6 +4477,7 @@
     deps = [
         ":decode_compressed_op",
         ":decode_csv_op",
+        ":decode_padded_raw_op",
         ":decode_raw_op",
         ":example_parsing_ops",
         ":parse_tensor_op",
@@ -4391,6 +4505,12 @@
 )
 
 tf_kernel_library(
+    name = "decode_padded_raw_op",
+    prefix = "decode_padded_raw_op",
+    deps = PARSING_DEPS,
+)
+
+tf_kernel_library(
     name = "decode_compressed_op",
     prefix = "decode_compressed_op",
     deps = [
@@ -4709,9 +4829,12 @@
     deps = SPARSE_DEPS + [
         ":bounds_check",
         "//third_party/eigen3",
-    ] + if_cuda([
+    ] + if_cuda_or_rocm([
         ":reduction_ops",
+    ]) + if_cuda([
         "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]),
 )
 
@@ -4973,9 +5096,11 @@
         ":string_format_op",
         ":string_join_op",
         ":string_length_op",
+        ":string_lower_op",
         ":string_split_op",
         ":string_strip_op",
         ":string_to_hash_bucket_op",
+        ":string_upper_op",
         ":substr_op",
         ":unicode_ops",
         ":unicode_script_op",
@@ -5112,6 +5237,24 @@
 )
 
 tf_kernel_library(
+    name = "string_lower_op",
+    prefix = "string_lower_op",
+    deps = STRING_DEPS + [
+        "@com_google_absl//absl/strings",
+        "@icu//:common",
+    ],
+)
+
+tf_kernel_library(
+    name = "string_upper_op",
+    prefix = "string_upper_op",
+    deps = STRING_DEPS + [
+        "@com_google_absl//absl/strings",
+        "@icu//:common",
+    ],
+)
+
+tf_kernel_library(
     name = "substr_op",
     prefix = "substr_op",
     deps = STRING_DEPS,
@@ -5203,9 +5346,12 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-    ] + if_cuda([
+    ] + if_cuda_or_rocm([
         ":reduction_ops",
+    ]) + if_cuda([
         "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@rocprim_archive//:rocprim",
     ]),
 )
 
@@ -5558,8 +5704,6 @@
     name = "mobile_srcs",
     srcs = [
         "avgpooling_op.h",
-        "batch_matmul_op_common.cc",
-        "batch_matmul_op_common.h",
         "batch_util.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
@@ -5750,7 +5894,6 @@
         "initializable_lookup_table.h",
         "inplace_ops.cc",
         "inplace_ops_functor.h",
-        "logging_ops.h",
         "lookup_table_init_op.h",
         "lookup_table_op.h",
         "lookup_util.h",
@@ -6031,6 +6174,7 @@
 )
 
 ANDROID_TEXTUAL_HDRS = [
+    "eigen_spatial_convolutions-inl.h",
     "gather_nd_op_cpu_impl.h",
     "gemm_functors.h",
     "mirror_pad_op_cpu_impl.h",
@@ -6060,7 +6204,6 @@
             "*_3d*",
             "*.cu.*",
             # Ops already in android_srcs
-            "batch_matmul_op_common.cc",
             "pooling_ops_common.cc",
             # Ops which we are currently excluding because they are likely
             # not used on Android. Those ops also do not compile if included,
@@ -6100,6 +6243,8 @@
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
+            "string_lower_op.cc",  # Requires ICU for unicode.
+            "string_upper_op.cc",  # Requires ICU for unicode.
             "unicode_ops.cc",
             "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
@@ -6707,6 +6852,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_graph_util",
         "@gemmlowp",
     ] + mkl_deps(),
 )
@@ -7049,6 +7195,8 @@
         ":bounds_check",
         ":conv_ops",
         ":ops_util",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -7159,6 +7307,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_graph_util",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "@gemmlowp",
@@ -7277,26 +7426,22 @@
         "no_op.h",
         "reference_gemm.h",
     ],
-    deps = if_mkl(
-        [
-            ":concat_lib_hdrs",
-            ":conv_ops",
-            ":cwise_op",
-            ":eigen_helpers",
-            ":image_resizer_state",
-            ":ops_util",
-            ":pooling_ops",
-            ":quantization_utils",
-            ":transpose_functor",
-            "//third_party/eigen3",
-            "@gemmlowp",
-            "@mkl_dnn",
-            "//tensorflow/core:core_cpu",
-            "//tensorflow/core:framework",
-            "//tensorflow/core:lib",
-            "//third_party/mkl:intel_binary_blob",
-        ],
-    ),
+    deps = [
+        ":concat_lib_hdrs",
+        ":conv_ops",
+        ":cwise_op",
+        ":eigen_helpers",
+        ":image_resizer_state",
+        ":ops_util",
+        ":pooling_ops",
+        ":quantization_utils",
+        ":transpose_functor",
+        "//third_party/eigen3",
+        "@gemmlowp",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ] + mkl_deps(),
 )
 
 tf_cc_test_mkl(
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 631702f..7fa3395 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -99,7 +99,8 @@
 REGISTER_KERNEL(double);
 #undef REGISTER_KERNEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
@@ -382,7 +383,8 @@
     Name("AdjustContrastv2").Device(DEVICE_CPU).TypeConstraint<float>("T"),
     AdjustContrastOpv2<CPUDevice, float>);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
index 9030b9d..0b9142c 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -56,7 +56,8 @@
 // BM_AdjustContrast_cpu_1_299_299     179084     340186  2181  751.9M items/s
 // BM_AdjustContrast_gpu_32_299_299     85276     123665  4189  2.9G items/s
 BM_AdjustContrastDev(cpu, 1, 299, 299);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 BM_AdjustContrastDev(gpu, 32, 299, 299);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
index 2c20d3d..e072dc4 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 3b8d339..afe34b0 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -211,7 +211,8 @@
 
 #undef REGISTER_ADDN_CPU
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
 TF_CALL_int64(REGISTER_ADDN_GPU);
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
index 1dcf460..85bdc24 100644
--- a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index d840b2d..59c8fff 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -17,7 +17,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -150,7 +151,8 @@
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
index 1b82ae6..bd7c4b4 100644
--- a/tensorflow/core/kernels/argmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 199f551..9c7dddc 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -83,7 +83,8 @@
       context->allocate_temp(DataTypeToEnum<T>::value, output_shape, output));
   if (output->NumElements() > 0) {
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
     if (std::is_same<Device, GPUDevice>::value) {
       ConcatGPU<T>(context, inputs_flat, output, &output_flat);
       return Status::OK();
@@ -173,7 +174,8 @@
   return Status::OK();
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 // Handles the general case, on GPU.
 template <typename T>
@@ -198,7 +200,8 @@
     return Status::OK();
   }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // TODO(olston, apassos): Handle non-CPU cases.
 // return SplitGPU<T>(context, input, sizes, outputs);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -517,7 +520,6 @@
       return;
     }
     FunctionLibraryRuntime::Options opts;
-    opts.step_id = last_task_context->step_id();
     opts.step_container = last_task_context->step_container();
     opts.cancellation_manager = last_task_context->cancellation_manager();
     opts.stats_collector = last_task_context->stats_collector();
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 617951b..1798b27 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -30,12 +30,12 @@
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batch_matmul_op_common.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/matmul_bcast.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index e099dcf..b07c5fd 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -266,7 +266,7 @@
 TF_CALL_REAL_NUMBER_TYPES(REGISTER);
 #undef REGISTER
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA
 #define REGISTER(T)                                        \
   REGISTER_KERNEL_BUILDER(Name("BatchToSpaceND")           \
                               .Device(DEVICE_GPU)          \
@@ -282,6 +282,6 @@
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 #undef REGISTER
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/betainc_op.cc b/tensorflow/core/kernels/betainc_op.cc
index 8683006..f9f6d9f 100644
--- a/tensorflow/core/kernels/betainc_op.cc
+++ b/tensorflow/core/kernels/betainc_op.cc
@@ -122,7 +122,8 @@
 REGISTER_KERNELS(double);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC_NDIM(T, NDIM)                               \
diff --git a/tensorflow/core/kernels/betainc_op_gpu.cu.cc b/tensorflow/core/kernels/betainc_op_gpu.cu.cc
index 4c8b0aa..2b7ce39 100644
--- a/tensorflow/core/kernels/betainc_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/betainc_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 285cded..2e7316f 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -10,7 +10,11 @@
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+    "tf_cc_test",
+)
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_proto_library",
@@ -52,6 +56,25 @@
 cc_library(
     name = "tree_helper",
     hdrs = ["tree_helper.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_cc_test(
+    name = "tree_helper_test",
+    srcs = ["tree_helper_test.cc"],
+    deps = [
+        ":tree_helper",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
@@ -72,6 +95,7 @@
         ":tree_helper",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//third_party/eigen3",
     ],
 )
 
@@ -84,6 +108,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+        "//third_party/eigen3",
     ],
 )
 
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index cc6ac36..097df8a 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -15,6 +15,7 @@
 
 #include <vector>
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/boosted_trees/tree_helper.h"
@@ -24,6 +25,7 @@
 
 const char INEQUALITY_DEFAULT_LEFT[] = "inequality_default_left";
 
+// V1 Op. Deprecated. BoostedTreesCalculateBestFeatureSplitOp is V2.
 class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
  public:
   explicit BoostedTreesCalculateBestGainsPerFeatureOp(
@@ -45,6 +47,8 @@
     OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
                                                 &stats_summary_list));
     const int64 num_buckets = stats_summary_list[0].dim_size(1);
+    // Check for single logit: 1 gradient + 1 hessian value.
+    DCHECK_EQ(stats_summary_list[0].dim_size(2), 2);
     std::vector<TTypes<float, 3>::ConstTensor> stats_summary;
     stats_summary.reserve(stats_summary_list.size());
     for (const auto& tensor : stats_summary_list) {
@@ -84,6 +88,10 @@
                    context->output_list("right_node_contribs_list",
                                         &output_right_node_contribs_list));
 
+    // Use identity later to convert float to Eigen::Matrix type for input to
+    // CalculateWeightsAndGains. This op only supports single dimension logits.
+    Eigen::MatrixXf identity;
+    identity.setIdentity(1, 1);
     // Get the best split info per node for each feature.
     for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
       std::vector<float> cum_grad;
@@ -120,30 +128,32 @@
         float best_contrib_for_right = 0.0;
         // Parent gain.
         float parent_gain;
-        float unused;
-        CalculateWeightsAndGains(total_grad, total_hess, l1, l2, &unused,
-                                 &parent_gain);
+        Eigen::VectorXf unused(1);
+        CalculateWeightsAndGains(total_grad * identity, total_hess * identity,
+                                 l1, l2, &unused, &parent_gain);
 
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
           const float cum_grad_bucket = cum_grad[bucket];
           const float cum_hess_bucket = cum_hess[bucket];
           // Left child.
-          float contrib_for_left;
+          Eigen::VectorXf contrib_for_left(1);
           float gain_for_left;
-          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
+          CalculateWeightsAndGains(cum_grad_bucket * identity,
+                                   cum_hess_bucket * identity, l1, l2,
                                    &contrib_for_left, &gain_for_left);
           // Right child.
-          float contrib_for_right;
+          // Single-logit op: the contribution is a 1-element vector.
+          Eigen::VectorXf contrib_for_right(1);
           float gain_for_right;
-          CalculateWeightsAndGains(total_grad - cum_grad_bucket,
-                                   total_hess - cum_hess_bucket, l1, l2,
-                                   &contrib_for_right, &gain_for_right);
+          CalculateWeightsAndGains((total_grad - cum_grad_bucket) * identity,
+                                   (total_hess - cum_hess_bucket) * identity,
+                                   l1, l2, &contrib_for_right, &gain_for_right);
 
           if (GainIsLarger(gain_for_left + gain_for_right, best_gain)) {
             best_gain = gain_for_left + gain_for_right;
             best_bucket = bucket;
-            best_contrib_for_left = contrib_for_left;
-            best_contrib_for_right = contrib_for_right;
+            best_contrib_for_left = contrib_for_left[0];
+            best_contrib_for_right = contrib_for_right[0];
           }
         }  // for bucket
         output_node_ids.push_back(node_id);
@@ -191,9 +201,8 @@
         // Adjust the gains to penalize by tree complexity.
         output_gains_vec(i) = output_gains[i] - tree_complexity;
         output_thresholds_vec(i) = output_thresholds[i];
-        // Logits are 1-dimensional for now.
-        // TODO(nponomareva): Consider multi-dimensional logits.
         output_left_node_contribs_matrix(i, 0) = output_left_node_contribs[i];
+        // This op only supports 1-dimensional logits.
         output_right_node_contribs_matrix(i, 0) = output_right_node_contribs[i];
       }
     }  // for f
@@ -204,17 +213,20 @@
   int num_features_;
 };
 
+// V1 op that only supports single dimensional logit.
 REGISTER_KERNEL_BUILDER(
     Name("BoostedTreesCalculateBestGainsPerFeature").Device(DEVICE_CPU),
     BoostedTreesCalculateBestGainsPerFeatureOp);
 
+// V2 Op.
 class BoostedTreesCalculateBestFeatureSplitOp : public OpKernel {
  public:
   explicit BoostedTreesCalculateBestFeatureSplitOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
-    // TODO(crawles): Using logits_dim_ for multi-class split.
     OP_REQUIRES_OK(context, context->GetAttr("logits_dimension", &logits_dim_));
+    // TODO(crawles): multiclass support.
+    DCHECK_EQ(logits_dim_, 1);
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -231,6 +243,8 @@
         stats_summary_t->tensor<float, 4>();
     const int64 feature_dims = stats_summary_t->dim_size(1);
     const int64 num_buckets = stats_summary_t->dim_size(2);
+    const int64 hessian_dim = stats_summary_t->dim_size(3) - logits_dim_;
+    DCHECK_GT(hessian_dim, 0);
 
     const Tensor* l1_t;
     OP_REQUIRES_OK(context, context->input("l1", &l1_t));
@@ -259,8 +273,8 @@
     std::vector<string> output_split_types;
 
     for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
-      std::vector<float> cum_grad;
-      std::vector<float> cum_hess;
+      std::vector<Eigen::VectorXf> cum_grad;
+      std::vector<Eigen::VectorXf> cum_hess;
       cum_grad.reserve(num_buckets);
       cum_hess.reserve(num_buckets);
 
@@ -268,25 +282,34 @@
       float best_bucket = 0;
       float best_f_dim = 0;
       string best_split_type = INEQUALITY_DEFAULT_LEFT;
-      float best_contrib_for_left = 0.0;
-      float best_contrib_for_right = 0.0;
+      // TODO(crawles): multi-class support; as Eigen::VectorXf.
+      float best_contrib_for_left = 0;
+      float best_contrib_for_right = 0;
       // Parent gain.
       float parent_gain;
-      float unused;
+      Eigen::VectorXf unused(logits_dim_);
 
       for (int f_dim = 0; f_dim < feature_dims; ++f_dim) {
         cum_grad.clear();
         cum_hess.clear();
-        float total_grad = 0.0;
-        float total_hess = 0.0;
+        Eigen::VectorXf total_grad = Eigen::VectorXf::Zero(logits_dim_);
+        Eigen::VectorXf total_hess = Eigen::VectorXf::Zero(hessian_dim);
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
-          total_grad += stats_summary(node_id, f_dim, bucket, 0);
-          total_hess += stats_summary(node_id, f_dim, bucket, 1);
+          for (int i = 0; i < logits_dim_; ++i) {
+            total_grad[i] += stats_summary(node_id, f_dim, bucket, i);
+          }
+          // Hessian entries (diagonal or full) follow the logits_dim_
+          // gradient entries along the last dimension of stats_summary.
+          for (int i = 0; i < hessian_dim; ++i) {
+            total_hess[i] +=
+                stats_summary(node_id, f_dim, bucket, logits_dim_ + i);
+          }
           cum_grad.push_back(total_grad);
           cum_hess.push_back(total_hess);
         }
 
-        if (total_hess < min_node_weight) {
+        // TODO(crawles): Check if grad is almost zero.
+        if (total_hess.norm() < min_node_weight) {
           // Do not split the node because not enough hessian.
           break;
         }
@@ -296,26 +319,29 @@
         }
 
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
-          const float cum_grad_bucket = cum_grad[bucket];
-          const float cum_hess_bucket = cum_hess[bucket];
+          const Eigen::VectorXf cum_grad_bucket = cum_grad[bucket];
+          const Eigen::VectorXf cum_hess_bucket = cum_hess[bucket];
           // Left child.
-          float contrib_for_left;
+          Eigen::VectorXf contrib_for_left(logits_dim_);
           float gain_for_left;
           CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
                                    &contrib_for_left, &gain_for_left);
           // Right child.
-          float contrib_for_right;
+          // TODO(crawles): consider accumulating right grad/hessians when doing
+          // cum_grad/hessian (if this becomes a bottleneck).
+          const Eigen::VectorXf grad_for_right = total_grad - cum_grad_bucket;
+          const Eigen::VectorXf hess_for_right = total_hess - cum_hess_bucket;
+          Eigen::VectorXf contrib_for_right(logits_dim_);
           float gain_for_right;
-          CalculateWeightsAndGains(total_grad - cum_grad_bucket,
-                                   total_hess - cum_hess_bucket, l1, l2,
+          CalculateWeightsAndGains(grad_for_right, hess_for_right, l1, l2,
                                    &contrib_for_right, &gain_for_right);
-
           if (GainIsLarger(gain_for_left + gain_for_right, best_gain)) {
             best_gain = gain_for_left + gain_for_right;
             best_bucket = bucket;
             best_f_dim = f_dim;
-            best_contrib_for_left = contrib_for_left;
-            best_contrib_for_right = contrib_for_right;
+            // TODO(crawles): multi-class support.
+            best_contrib_for_left = contrib_for_left[0];
+            best_contrib_for_right = contrib_for_right[0];
           }
         }  // for bucket
       }    // for f_dim
@@ -363,6 +389,7 @@
 
     // output_left_node_contribs
     Tensor* output_left_node_contribs_t;
+    // TODO(crawles): Using logits_dim_ for multi-class split.
     OP_REQUIRES_OK(
         context, context->allocate_output("left_node_contribs", {num_nodes, 1},
                                           &output_left_node_contribs_t));
@@ -401,6 +428,7 @@
   int logits_dim_;
 };
 
+// V2 op. Multi-class support is still a TODO; logits_dimension must be 1.
 REGISTER_KERNEL_BUILDER(
     Name("BoostedTreesCalculateBestFeatureSplit").Device(DEVICE_CPU),
     BoostedTreesCalculateBestFeatureSplitOp);
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 2548723..68cf99a 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/boosted_trees/resources.h"
@@ -261,7 +262,7 @@
     const Tensor* mean_gradients_t;
     OP_REQUIRES_OK(context,
                    context->input("mean_gradients", &mean_gradients_t));
-
+    const int32 logits_dim = mean_gradients_t->dim_size(1);
     const Tensor* mean_hessians_t;
     OP_REQUIRES_OK(context, context->input("mean_hessians", &mean_hessians_t));
 
@@ -274,14 +275,18 @@
     const auto l2 = l2_t->scalar<float>()();
 
     // For now, assume 1-dimensional weight on leaves.
-    float logits;
+    Eigen::VectorXf logits_vector(1);
     float unused_gain;
 
-    // TODO(nponomareva): change this when supporting multiclass.
-    const float gradients_mean = mean_gradients_t->flat<float>()(0);
-    const float hessians_mean = mean_hessians_t->flat<float>()(0);
-    CalculateWeightsAndGains(gradients_mean, hessians_mean, l1, l2, &logits,
-                             &unused_gain);
+    // TODO(crawles): Support multiclass.
+    DCHECK_EQ(logits_dim, 1);
+    Eigen::VectorXf gradients_mean(1);
+    Eigen::VectorXf hessians_mean(1);
+    gradients_mean[0] = mean_gradients_t->flat<float>()(0);
+    hessians_mean[0] = mean_hessians_t->flat<float>()(0);
+    CalculateWeightsAndGains(gradients_mean, hessians_mean, l1, l2,
+                             &logits_vector, &unused_gain);
+    const float logits = logits_vector[0];
 
     float current_bias = 0.0;
     bool continue_centering = true;
diff --git a/tensorflow/core/kernels/boosted_trees/tree_helper.h b/tensorflow/core/kernels/boosted_trees/tree_helper.h
index 8b18d9e..4a4aafd 100644
--- a/tensorflow/core/kernels/boosted_trees/tree_helper.h
+++ b/tensorflow/core/kernels/boosted_trees/tree_helper.h
@@ -16,6 +16,12 @@
 #ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_TREE_HELPER_H_
 #define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_TREE_HELPER_H_
 #include <cmath>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/QR"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
@@ -29,38 +35,81 @@
   return g1 - g2 >= kTolerance;
 }
 
-static void CalculateWeightsAndGains(const float g, const float h,
-                                     const float l1, const float l2,
-                                     float* weight, float* gain) {
+// Solves hessian_and_reg * x = g by QR; *weight = -x, *gain = -g^T * weight.
+static void MultiDimLogitSolveForWeightAndGain(
+    const Eigen::MatrixXf& hessian_and_reg, const Eigen::VectorXf& g,
+    Eigen::VectorXf* weight, float* gain) {
+  *weight = -hessian_and_reg.colPivHouseholderQr().solve(g);
+  *gain = -g.transpose() * (*weight);
+}
+
+// Computes the leaf weight and the gain for gradients g and hessians h with
+// L1/L2 regularization l1 and l2. Three layouts are handled:
+//  - g.size() == 1: scalar closed form with L1 and L2.
+//  - h.size() == g.size()^2: full Hessian (flattened); L2 only.
+//  - h.size() == g.size(): diagonal Hessian; L2 only.
+// For any other h.size(), *weight and *gain are left unmodified.
+static void CalculateWeightsAndGains(const Eigen::VectorXf& g,
+                                     const Eigen::VectorXf& h, const float l1,
+                                     const float l2, Eigen::VectorXf* weight,
+                                     float* gain) {
   const float kEps = 1e-15;
-  // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
-  // (g+l1*sgn(w))^2/(h+l2).
-  // This is because for each leaf we optimize
-  // 1/2(h+l2)*w^2+g*w+l1*abs(w)
-  float g_with_l1 = g;
-  // Apply L1 regularization.
-  // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
-  // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
-  // For g from (-l1, l1), thus there is no solution => set to 0.
-  if (l1 > 0) {
-    if (g > l1) {
-      g_with_l1 -= l1;
-    } else if (g < -l1) {
-      g_with_l1 += l1;
-    } else {
-      *weight = 0.0;
-      *gain = 0.0;
-      return;
+  int32 logits_dim = g.size();
+  if (logits_dim == 1) {
+    // The formula for weight is -(g+l1*sgn(w))/(h+l2), for gain it is
+    // (g+l1*sgn(w))^2/(h+l2).
+    // This is because for each leaf we optimize
+    // 1/2(h+l2)*w^2+g*w+l1*abs(w)
+    float g_with_l1 = g[0];
+    // Apply L1 regularization.
+    // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
+    // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
+    // For g in [-l1, l1] neither assumption holds => weight and gain are 0.
+    if (l1 > 0) {
+      if (g[0] > l1) {
+        g_with_l1 -= l1;
+      } else if (g[0] < -l1) {
+        g_with_l1 += l1;
+      } else {
+        weight->coeffRef(0) = 0.0;
+        *gain = 0.0;
+        return;
+      }
     }
-  }
-  // Apply L2 regularization.
-  if (h + l2 <= kEps) {
-    // Avoid division by 0 or infinitesimal.
-    *weight = 0;
-    *gain = 0;
-  } else {
-    *weight = -g_with_l1 / (h + l2);
-    *gain = -g_with_l1 * (*weight);
+    // Apply L2 regularization.
+    if (h[0] + l2 <= kEps) {
+      // Avoid division by 0 or infinitesimal.
+      weight->coeffRef(0) = 0;
+      *gain = 0;
+    } else {
+      weight->coeffRef(0) = -g_with_l1 / (h[0] + l2);
+      *gain = -g_with_l1 * weight->coeffRef(0);
+    }
+  } else if (h.size() == logits_dim * logits_dim) { /* Full Hessian */
+    Eigen::MatrixXf identity;
+    identity.setIdentity(logits_dim, logits_dim);
+    // TODO(crawles): figure out L1 regularization for matrix form.
+    Eigen::MatrixXf hessian_and_reg =
+        h.reshaped(logits_dim, logits_dim) + l2 * identity;
+    MultiDimLogitSolveForWeightAndGain(hessian_and_reg, g, weight, gain);
+  } else if (h.size() == logits_dim) { /* Diagonal Hessian approximation. */
+    // TODO(crawles): figure out L1 regularization for matrix form.
+    Eigen::ArrayXf hessian_and_reg = h.array() + l2;
+    // Invertible only if no regularized diagonal entry is exactly zero.
+    const bool invertible = (hessian_and_reg != 0.0f).all();
+    if (invertible) {
+      // Array operations are element-wise. The formulas match the full
+      // Hessian case, simplified because the Hessian is diagonal.
+      Eigen::ArrayXf ones = Eigen::ArrayXf::Ones(logits_dim);
+      Eigen::ArrayXf temp = ones / hessian_and_reg;
+      *weight = -temp * g.array();
+      *gain = (-g.array() * (*weight).array()).sum();
+    } else {
+      // Hessian is not invertible. We will go the same route as in full
+      // hessian to get an approximate solution.
+      MultiDimLogitSolveForWeightAndGain(hessian_and_reg.matrix().asDiagonal(),
+                                         g, weight, gain);
+    }
   }
 }
 
diff --git a/tensorflow/core/kernels/boosted_trees/tree_helper_test.cc b/tensorflow/core/kernels/boosted_trees/tree_helper_test.cc
new file mode 100644
index 0000000..91c9c2a
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/tree_helper_test.cc
@@ -0,0 +1,129 @@
+// Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/kernels/boosted_trees/tree_helper.h"
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+using std::vector;
+
+namespace tensorflow {
+namespace {
+
+const double kDelta = 1e-5;
+
+TEST(TreeHelper, MulticlassFullHessianTest) {
+  const int kNumClasses = 4;
+  Eigen::VectorXf g(kNumClasses);
+  g << 0.5, 0.33, -9, 1;
+  Eigen::VectorXf h(kNumClasses * kNumClasses);
+  h << 3, 5, 7, 8, 5, 4, 1, 5, 7, 1, 8, 4, 8, 5, 4, 9;
+  float l1 = 0;
+  float l2 = 0.3;
+  Eigen::VectorXf weight(kNumClasses);
+  float gain;
+  CalculateWeightsAndGains(g, h, l1, l2, &weight, &gain);
+  std::vector<float> expected_weight = {0.9607576, 0.4162569, 0.9863192,
+                                        -1.5820024};
+  for (int i = 0; i < kNumClasses; ++i) {
+    EXPECT_NEAR(expected_weight[i], weight[i], kDelta);
+  }
+  EXPECT_NEAR(9.841132, gain, kDelta);
+}
+
+TEST(TreeHelper, MulticlassDiagonalHessianTest) {
+  const int kNumClasses = 4;
+  Eigen::VectorXf g(kNumClasses);
+  g << 0.5, 0.33, -9, 1;
+  float l1 = 0.1;  // Not used by the multi-dimensional code paths.
+  // Normal case: element-wise division (diagonal) vs. QR solve (full).
+  {
+    float l2 = 0.3;
+    // Full Hessian.
+    Eigen::VectorXf h_full(kNumClasses * kNumClasses);
+    h_full << 3, 0, 0, 0, 0, 4, 0, 0, 0, 0, 8, 0, 0, 0, 0, 9;
+    Eigen::VectorXf weight_full(kNumClasses);
+    float gain_full;
+    CalculateWeightsAndGains(g, h_full, l1, l2, &weight_full, &gain_full);
+    // Diagonal Hessian.
+    Eigen::VectorXf h_diagonal(kNumClasses);
+    h_diagonal << 3, 4, 8, 9;
+    Eigen::VectorXf weight_diagonal(kNumClasses);
+    float gain_diagonal;
+    CalculateWeightsAndGains(g, h_diagonal, l1, l2, &weight_diagonal,
+                             &gain_diagonal);
+
+    for (int i = 0; i < kNumClasses; ++i) {
+      EXPECT_NEAR(weight_full[i], weight_diagonal[i], kDelta);
+    }
+    EXPECT_NEAR(gain_full, gain_diagonal, kDelta);
+  }
+  // Zero entries in diagonal, no regularization; use matrix solver, just like
+  // full Hessian.
+  {
+    float l2 = 0.0;
+    // Full Hessian.
+    Eigen::VectorXf h_full(kNumClasses * kNumClasses);
+    h_full << 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 9;
+    Eigen::VectorXf weight_full(kNumClasses);
+    float gain_full;
+    CalculateWeightsAndGains(g, h_full, l1, l2, &weight_full, &gain_full);
+    // Diagonal Hessian.
+    Eigen::VectorXf h_diagonal(kNumClasses);
+    h_diagonal << 3, 0, 8, 9;
+    Eigen::VectorXf weight_diagonal(kNumClasses);
+    float gain_diagonal;
+    CalculateWeightsAndGains(g, h_diagonal, l1, l2, &weight_diagonal,
+                             &gain_diagonal);
+
+    for (int i = 0; i < kNumClasses; ++i) {
+      EXPECT_NEAR(weight_full[i], weight_diagonal[i], kDelta);
+    }
+    EXPECT_EQ(gain_full, gain_diagonal);
+  }
+}
+
+TEST(TreeHelper, DiagonalHessianVsSingleClass) {
+  float l1 = 0;
+  float l2 = 0.3;
+  // Solve as multi-class using 2 logits. For cross entropy loss, gradient and
+  // hessian are only non-zero when label probability > 0. For this example the
+  // one-hot label would be [0, 1].
+  Eigen::VectorXf g_diagonal(2);
+  g_diagonal << 0, -0.25;
+  // Diagonal Hessian.
+  Eigen::VectorXf h_diagonal(2);
+  h_diagonal << 0, 0.11;
+  Eigen::VectorXf weight_diagonal(2);
+  float gain_diagonal;
+  CalculateWeightsAndGains(g_diagonal, h_diagonal, l1, l2, &weight_diagonal,
+                           &gain_diagonal);
+  // Single logit.
+  Eigen::VectorXf g_single(1);
+  g_single << -0.25;
+  Eigen::VectorXf h_single(1);
+  h_single << 0.11;
+  Eigen::VectorXf weight_single(1);
+  float gain_single;
+  CalculateWeightsAndGains(g_single, h_single, l1, l2, &weight_single,
+                           &gain_single);
+
+  EXPECT_NEAR(weight_diagonal[1], weight_single[0], kDelta);
+  EXPECT_NEAR(gain_diagonal, gain_single, kDelta);  // Different code paths.
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index 29c47b0..51caca5 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -15,7 +15,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -100,7 +101,8 @@
 TF_CALL_ALL_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 namespace functor {
 #define DECLARE_GPU_TEMPLATE(Type)                               \
diff --git a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
index 4d9a8b0..aae1fb7 100644
--- a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 05ca1ce..91a9dbc 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -168,7 +168,8 @@
   return work_ == nullptr ? Unimplemented() : Status::OK();
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 class GpuCastOp : public CastOpBase {
  public:
   explicit GpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
@@ -222,7 +223,8 @@
 
 REGISTER_KERNEL_BUILDER(Name("Cast").Device(DEVICE_CPU), CpuCastOp);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define REGISTER_CAST_GPU(srctype, dsttype)                    \
   REGISTER_KERNEL_BUILDER(Name("Cast")                         \
                               .TypeConstraint<srctype>("SrcT") \
diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc
index 5046971..85cedfc 100644
--- a/tensorflow/core/kernels/cast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 154bede..266e2ce 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -99,7 +99,8 @@
 
 CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Same, for GPU.
 CastFunctorType GetGpuCastFromBool(DataType dst_dtype);
 
diff --git a/tensorflow/core/kernels/cast_op_impl_bfloat.cc b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
index 2b41956..0a399d0 100644
--- a/tensorflow/core/kernels/cast_op_impl_bfloat.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
@@ -27,7 +27,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype) {
   if (dst_dtype == DT_FLOAT) {
     return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out,
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index 98cb49f..d08a45a 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromBool(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, bool);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_complex128.cc b/tensorflow/core/kernels/cast_op_impl_complex128.cc
index d610bd3..9bd0e11 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex128.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex128.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<double>);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_complex64.cc b/tensorflow/core/kernels/cast_op_impl_complex64.cc
index cb1018f..bb7fd86 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex64.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<float>);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index a9a129b..8637f3d 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromDouble(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, double);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index c7a918e..c2418e9 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -27,7 +27,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, GPUDevice, float);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_half.cc b/tensorflow/core/kernels/cast_op_impl_half.cc
index 041c205..1581b6b 100644
--- a/tensorflow/core/kernels/cast_op_impl_half.cc
+++ b/tensorflow/core/kernels/cast_op_impl_half.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromHalf(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, Eigen::half);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index a3fdef7..b322006 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromInt16(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int16);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index cc43c74..154fd14 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromInt32(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int32);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index acc550f..1f4ebc9 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromInt64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int64);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index e0cad33..00a72ab 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromInt8(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int8);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index 31d171a..2981fe9 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromUint16(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint16);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_uint32.cc b/tensorflow/core/kernels/cast_op_impl_uint32.cc
index fc6c67b..b94540d 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint32.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromUint32(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint32);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_uint64.cc b/tensorflow/core/kernels/cast_op_impl_uint64.cc
index 70bf90c..e04c0a2 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint64.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromUint64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint64);
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index da36526..20c5729 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -25,7 +25,8 @@
   return nullptr;
 }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromUint8(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint8);
   return nullptr;
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.cc b/tensorflow/core/kernels/collective_nccl_reducer.cc
index 0e70ac2..8fd6b15 100644
--- a/tensorflow/core/kernels/collective_nccl_reducer.cc
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/core/common_runtime/collective_util.h"
 #include "tensorflow/core/nccl/nccl_manager.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace tensorflow {
 namespace {
@@ -177,7 +178,8 @@
     // `WaitForDependencies` may block if the collective instances on which this
     // op depends have not yet launched.  When this function returns, this op is
     // ready to go.
-    tracing::ScopedActivity activity("WaitForDependencies");
+    profiler::TraceMe activity("WaitForDependencies",
+                               profiler::TraceMeLevel::kInfo);
     col_ctx_->col_exec->WaitForDependencies(*col_params_);
     NcclManager::instance()->SignalMultiNodeReady(nccl_collective_key);
   }
@@ -186,17 +188,17 @@
     // `NcclManager` will enqueue the NCCL kernel on the NCCL stream.  Thus the
     // implementation of `Launched` keeps track of the number of devices that
     // have launched.
-    tracing::ScopedActivity activity("Schedule");
+    profiler::TraceMe activity("Schedule", profiler::TraceMeLevel::kInfo);
     col_ctx_->col_exec->Launched(*col_params_);
   }
 
   // Wait for nccl op and group_size copy to succeed, then do final_op.
   {
-    tracing::ScopedActivity activity("GroupSizeCopy");
+    profiler::TraceMe activity("GroupSizeCopy", profiler::TraceMeLevel::kInfo);
     group_size_ready.WaitForNotification();
   }
   {
-    tracing::ScopedActivity activity("Nccl");
+    profiler::TraceMe activity("Nccl", profiler::TraceMeLevel::kInfo);
     nccl_done.WaitForNotification();
   }
   Status final_status =
diff --git a/tensorflow/core/kernels/collective_nccl_reducer_test.cc b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
index 26c92f1..00dfa72 100644
--- a/tensorflow/core/kernels/collective_nccl_reducer_test.cc
+++ b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
@@ -18,6 +18,7 @@
 #include "tensorflow/core/kernels/collective_nccl_reducer.h"
 
 #include <algorithm>
+
 #include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/device.h"
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index eb172f8..6c817f7 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -119,7 +119,8 @@
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index 6427d20..227490a 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 7303a34..5e338fe 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -47,7 +47,8 @@
     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
         inputs,
     typename TTypes<T, 2>::Matrix* output);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 template <typename T>
 void ConcatGPU(
     OpKernelContext* c,
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index c3421e4..09180d6 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -17,7 +17,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif
 
@@ -92,7 +93,8 @@
 
 REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define REGISTER_KERNEL(D, TYPE)                                      \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("Const").Device(DEVICE_##D).TypeConstraint<TYPE>("dtype"), \
@@ -216,7 +218,8 @@
 #undef REGISTER_KERNEL_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL(GPU, Eigen::half);
 REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
@@ -300,7 +303,8 @@
                         ZerosLikeOp<CPUDevice, int32>);
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
 REGISTER_KERNEL(bfloat16, GPU);
@@ -353,7 +357,8 @@
                         OnesLikeOp<CPUDevice, int32>);
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
 REGISTER_KERNEL(bfloat16, GPU);
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
index 4599232..36c30da 100644
--- a/tensorflow/core/kernels/constant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index c618c36..e0171ea 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -86,7 +86,8 @@
 
 TEST_F(ConstantOpTest, PersistentMemoryTracking) {
   PersistentMemoryTrackingTest(false);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   PersistentMemoryTrackingTest(true);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index 1bac2a1..22b10ad 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -57,11 +57,16 @@
                             Filter filter, int row_stride, int col_stride,
                             int row_dilation, int col_dilation,
                             const Eigen::PaddingType& padding,
-                            const OutputKernel& output_kernel) {
-  // Need to swap row/col when calling Eigen.
-  output.device(d) =
-      Eigen::SpatialConvolution(input, filter, col_stride, row_stride, padding,
-                                col_dilation, row_dilation, output_kernel);
+                            const OutputKernel& output_kernel,
+                            int padding_top = 0, int padding_bottom = 0,
+                            int padding_left = 0, int padding_right = 0) {
+  // Need to swap row/col, padding_top/padding_left, and
+  // padding_bottom/padding_right when calling Eigen. Eigen expects the tensor
+  // in NWHC format, but the tensor given is in NHWC.
+  output.device(d) = Eigen::SpatialConvolution(
+      input, filter, col_stride, row_stride, padding, col_dilation,
+      row_dilation, output_kernel, padding_left, padding_right, padding_top,
+      padding_bottom);
 }
 
 template <typename Device, typename T,
@@ -76,6 +81,18 @@
     SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
                            row_dilation, col_dilation, padding, output_kernel);
   }
+  void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+                  typename TTypes<T, 4>::ConstTensor input,
+                  typename TTypes<T, 4>::ConstTensor filter, int row_stride,
+                  int col_stride, int row_dilation, int col_dilation,
+                  int padding_top, int padding_bottom, int padding_left,
+                  int padding_right,
+                  const OutputKernel& output_kernel = OutputKernel()) {
+    SpatialConvolutionFunc(
+        d, output, input, filter, row_stride, col_stride, row_dilation,
+        col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
+        padding_top, padding_bottom, padding_left, padding_right);
+  }
 };
 
 template <typename Device, typename OutputKernel>
@@ -93,6 +110,22 @@
                                   row_dilation, output_kernel)
             .template cast<Eigen::half>();
   }
+  void operator()(const Device& d,
+                  typename TTypes<Eigen::half, 4>::Tensor output,
+                  typename TTypes<Eigen::half, 4>::ConstTensor input,
+                  typename TTypes<Eigen::half, 4>::ConstTensor filter,
+                  int row_stride, int col_stride, int row_dilation,
+                  int col_dilation, int padding_top, int padding_bottom,
+                  int padding_left, int padding_right,
+                  const OutputKernel& output_kernel = OutputKernel()) {
+    output.device(d) =
+        Eigen::SpatialConvolution(
+            input.cast<float>(), filter.cast<float>(), col_stride, row_stride,
+            Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
+            output_kernel, padding_left, padding_right, padding_top,
+            padding_bottom)
+            .template cast<Eigen::half>();
+  }
 };
 
 template <typename Device, typename T>
@@ -146,42 +179,50 @@
 
 // Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format.
 //
-// Note: Currently OIHW is the only supported destination format. Support for
-// OHWI format will be added in a follow-up change.
+// Note: Currently supports OIHW and OHWI destination formats.
 template <typename Device, typename T, typename IndexType, int NDIMS>
 struct TransformFilter {
   void operator()(const Device& d, FilterTensorFormat dst_filter_format,
                   typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
                   typename TTypes<T, NDIMS, IndexType>::Tensor out) {
+    // NOTE: Source filter format is always HWIO.
+    Eigen::DSizes<IndexType, NDIMS - 2> spatial_dims;
+    for (int i = 0; i < spatial_dims.rank(); ++i) {
+      spatial_dims[i] = in.dimension(i);
+    }
+
     // Merge the spatial dimensions together to speed up the shuffle operation.
     Eigen::DSizes<IndexType, 3> merged_dims;
-    merged_dims[0] = in.dimension(0);  // spatial dimensions
-    for (int i = 1; i < NDIMS - 2; ++i) {
-      merged_dims[0] *= in.dimension(i);
-    }
-    merged_dims[1] = in.dimension(NDIMS - 2);  // input filters
-    merged_dims[2] = in.dimension(NDIMS - 1);  // output filters
+    merged_dims[0] = spatial_dims.TotalSize();  // product of spatial dims [H*W]
+    merged_dims[1] = in.dimension(NDIMS - 2);   // input filters           [I]
+    merged_dims[2] = in.dimension(NDIMS - 1);   // output filters          [O]
 
-    DCHECK(dst_filter_format == FORMAT_OIHW)
-        << "Unsupported destination filter format: "
-        << ToString(dst_filter_format);
-    // Source filter format is FORMAT_HWIO and spatial dimensions HW are merged
-    // in the beginning.
-    Eigen::DSizes<IndexType, 3> shuffling_perm =
-        Eigen::DSizes<IndexType, 3>(2, 1, 0);
-
+    // Shuffle tensor with merged spatial dimensions.
+    Eigen::DSizes<IndexType, 3> shuffling_perm;
+    // Expand shuffled tensor into final dimensions.
     Eigen::DSizes<IndexType, NDIMS> expanded_dims;
-    int out_index = 0;
-    for (int merged_dim = 0; merged_dim < merged_dims.rank(); ++merged_dim) {
-      if (shuffling_perm[merged_dim] == 0) {
-        for (int spatial_dim = 0; spatial_dim < NDIMS - 2; ++spatial_dim) {
-          expanded_dims[out_index++] = in.dimension(spatial_dim);
-        }
-      } else {
-        constexpr int kLastSpatialDim = NDIMS - 3;
-        expanded_dims[out_index++] =
-            in.dimension(kLastSpatialDim + shuffling_perm[merged_dim]);
+
+    if (dst_filter_format == FORMAT_OIHW) {
+      shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 1, 0);
+
+      expanded_dims[0] = merged_dims[2];  // [O]
+      expanded_dims[1] = merged_dims[1];  // [I]
+      for (int i = 0; i < spatial_dims.rank(); ++i) {
+        expanded_dims[2 + i] = spatial_dims[i];
       }
+
+    } else if (dst_filter_format == FORMAT_OHWI) {
+      shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 0, 1);
+
+      expanded_dims[0] = merged_dims[2];          // [O]
+      expanded_dims[NDIMS - 1] = merged_dims[1];  // [I]
+      for (int i = 0; i < spatial_dims.rank(); ++i) {
+        expanded_dims[1 + i] = spatial_dims[i];
+      }
+
+    } else {
+      DCHECK(false) << "Unsupported destination filter format: "
+                    << ToString(dst_filter_format);
     }
 
     out.device(d) =
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 820a92b..a8c218a 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -434,13 +434,22 @@
     combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
     CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
 
-    CHECK(dst_filter_format == FORMAT_OIHW)
-        << "Unsupported output layout: " << ToString(dst_filter_format);
+    if (dst_filter_format == FORMAT_OIHW) {
+      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+                                   config.block_count, config.thread_per_block,
+                                   0, d.stream(), config.virtual_thread_count,
+                                   in.data(), combined_dims, out.data()));
 
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
-                                 config.block_count, config.thread_per_block, 0,
-                                 d.stream(), config.virtual_thread_count,
-                                 in.data(), combined_dims, out.data()));
+    } else if (dst_filter_format == FORMAT_OHWI) {
+      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>,
+                                   config.block_count, config.thread_per_block,
+                                   0, d.stream(), config.virtual_thread_count,
+                                   in.data(), combined_dims, out.data()));
+
+    } else {
+      LOG(ERROR) << "Unsupported filter format: "
+                 << ToString(dst_filter_format);
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 168a91a..e755c3e 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -208,14 +208,9 @@
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(
-        context, padding_ != Padding::EXPLICIT,
-        errors::Unimplemented("Current CPU implementation does not support "
-                              "EXPLICIT padding yet."));
-    std::vector<int64> explicit_paddings;
     OP_REQUIRES_OK(context,
-                   context->GetAttr("explicit_paddings", &explicit_paddings));
-    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                               /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
@@ -247,11 +242,12 @@
                                 filter_sizes.vec<int32>(), &filter_shape));
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensions(
-                       "Conv2DCustomBackpropFilter", /*num_spatial_dims=*/2,
-                       input.shape(), filter_shape, out_backprop.shape(),
-                       strides_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(
+        context,
+        ConvBackpropComputeDimensionsV2(
+            "Conv2DCustomBackpropFilter", /*num_spatial_dims=*/2, input.shape(),
+            filter_shape, out_backprop.shape(), /*dilations=*/{1, 1, 1, 1},
+            strides_, padding_, explicit_paddings_, data_format_, &dims));
 
     Tensor* filter_backprop;
     OP_REQUIRES_OK(context,
@@ -264,6 +260,12 @@
 
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
+    if (padding_ == Padding::EXPLICIT) {
+      pad_top = explicit_paddings_[2];
+      pad_bottom = explicit_paddings_[3];
+      pad_left = explicit_paddings_[4];
+      pad_right = explicit_paddings_[5];
+    }
     OP_REQUIRES_OK(
         context,
         GetWindowedOutputSizeVerbose(
@@ -402,6 +404,7 @@
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   TensorFormat data_format_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 471c73f..4c1a0d9 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -299,14 +299,9 @@
                 errors::InvalidArgument(
                     "Current libxsmm and customized CPU implementations do "
                     "not yet support dilation rates larger than 1."));
-    OP_REQUIRES(
-        context, padding_ != Padding::EXPLICIT,
-        errors::Unimplemented("Current CPU implementation does not support "
-                              "EXPLICIT padding yet."));
-    std::vector<int64> explicit_paddings;
     OP_REQUIRES_OK(context,
-                   context->GetAttr("explicit_paddings", &explicit_paddings));
-    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                               /*num_dims=*/4, data_format_));
   }
 
@@ -325,10 +320,11 @@
 
     ConvBackpropDimensions dims;
     OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensions(
+                   ConvBackpropComputeDimensionsV2(
                        "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2,
                        input_shape, filter.shape(), out_backprop.shape(),
-                       strides_, padding_, data_format_, &dims));
+                       /*dilations=*/{1, 1, 1, 1}, strides_, padding_,
+                       explicit_paddings_, data_format_, &dims));
 
     Tensor* in_backprop = nullptr;
     OP_REQUIRES_OK(context,
@@ -375,6 +371,12 @@
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
 #endif
+    if (padding_ == Padding::EXPLICIT) {
+      pad_top = explicit_paddings_[2];
+      pad_bottom = explicit_paddings_[3];
+      pad_left = explicit_paddings_[4];
+      pad_right = explicit_paddings_[5];
+    }
     OP_REQUIRES_OK(
         context,
         GetWindowedOutputSizeVerbose(
@@ -536,6 +538,7 @@
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   TensorFormat data_format_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
@@ -617,12 +620,6 @@
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    if (!std::is_same<Device, GPUDevice>::value) {
-      OP_REQUIRES(
-          context, padding_ != Padding::EXPLICIT,
-          errors::Unimplemented("Current CPU implementation does not support "
-                                "EXPLICIT padding yet."));
-    }
     OP_REQUIRES_OK(context,
                    context->GetAttr("explicit_paddings", &explicit_paddings_));
     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index f5ec3d9..8050320 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -25,6 +25,7 @@
 #include "tensorflow/core/kernels/conv_ops.h"
 
 #include <string.h>
+
 #include <map>
 #include <vector>
 
@@ -70,11 +71,12 @@
   void operator()(OpKernelContext* ctx, const Tensor& input,
                   const Tensor& filter, int row_stride, int col_stride,
                   int row_dilation, int col_dilation, const Padding& padding,
-                  Tensor* output, TensorFormat data_format) {
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
+                  TensorFormat data_format) {
     CHECK(data_format == FORMAT_NHWC) << "Generic conv implementation only "
                                          "supports NHWC tensor format for now.";
     if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
-        col_stride == 1) {
+        col_stride == 1 && (padding == SAME || padding == VALID)) {
       // For 1x1 kernel, the 2D convolution is reduced to matrix
       // multiplication.
       //
@@ -110,10 +112,20 @@
           input.shaped<T, 2>({input.dim_size(0), k}),
           filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair);
     } else {
-      functor::SpatialConvolution<Device, T>()(
-          ctx->eigen_device<Device>(), output->tensor<T, 4>(),
-          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride,
-          row_dilation, col_dilation, BrainPadding2EigenPadding(padding));
+      if (padding == EXPLICIT) {
+        functor::SpatialConvolution<Device, T>()(
+            ctx->eigen_device<Device>(), output->tensor<T, 4>(),
+            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride,
+            row_dilation, col_dilation, static_cast<int>(explicit_paddings[2]),
+            static_cast<int>(explicit_paddings[3]),
+            static_cast<int>(explicit_paddings[4]),
+            static_cast<int>(explicit_paddings[5]));
+      } else {
+        functor::SpatialConvolution<Device, T>()(
+            ctx->eigen_device<Device>(), output->tensor<T, 4>(),
+            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride,
+            row_dilation, col_dilation, BrainPadding2EigenPadding(padding));
+      }
     }
   }
 };
@@ -133,18 +145,19 @@
                                 "NHWC tensor format for now."));
       return;
     }
-    // TODO(reedwm): Enable explicit padding on the CPU.
-    OP_REQUIRES(
-        ctx, padding != Padding::EXPLICIT,
-        errors::Unimplemented("Generic conv implementation does not support "
-                              "EXPLICIT padding yet."));
     const int64 in_depth = GetTensorDim(input, data_format, 'C');
     OP_REQUIRES(ctx, in_depth == filter.dim_size(2),
                 errors::Unimplemented("Generic conv implementation does not "
                                       "support grouped convolutions for now."));
+    for (int64 explicit_padding : explicit_paddings) {  // must fit in int
+      if (!FastBoundsCheck(explicit_padding, std::numeric_limits<int>::max())) {
+        ctx->SetStatus(errors::InvalidArgument("explicit padding too large"));
+        return;
+      }
+    }
     LaunchGeneric<CPUDevice, T>()(ctx, input, filter, row_stride, col_stride,
-                                  row_dilation, col_dilation, padding, output,
-                                  data_format);
+                                  row_dilation, col_dilation, padding,
+                                  explicit_paddings, output, data_format);
   }
 };
 
@@ -549,6 +562,15 @@
 template struct LaunchConv2DOp<CPUDevice, double>;
 
 #if GOOGLE_CUDA
+// Returns true if the given StreamExecutor is for a Volta or newer NVIDIA GPU.
+bool IsVoltaOrLater(const se::StreamExecutor& stream_exec) {
+  int major, minor;
+  CHECK(stream_exec  // Crash OK
+            .GetDeviceDescription()
+            .cuda_compute_capability(&major, &minor));
+  return major >= 7;
+}
+
 int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                            int64 default_value_in_bytes) {
   const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
@@ -664,6 +686,23 @@
     return;
   }
 
+  // Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16
+  // in NHWC data layout. In all other configurations it's more efficient to
+  // run computation in NCHW data format.
+  const bool compute_in_nhwc =
+      DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent());
+
+  // We only do one directional conversion: NHWC->NCHW. We never convert in the
+  // other direction. Grappler layout optimizer selects preferred layout and
+  // adds necessary annotations to the graph.
+  // TODO(ezhulenev): Convert in other direction for fp16?
+  const TensorFormat compute_data_format =
+      compute_in_nhwc && data_format == FORMAT_NHWC ? FORMAT_NHWC : FORMAT_NCHW;
+
+  VLOG(3) << "Compute Conv2D with cuDNN:"
+          << " data_format=" << ToString(data_format)
+          << " compute_data_format=" << ToString(compute_data_format);
+
   const int64 out_batch = GetTensorDim(*output, data_format, 'N');
   const int64 out_rows = GetTensorDim(*output, data_format, 'H');
   const int64 out_cols = GetTensorDim(*output, data_format, 'W');
@@ -696,6 +735,11 @@
     // cuDNN only supports padding the same amount on the left and right sides,
     // and on the top and bottom sides. So we manually create a new padded
     // input tensor such that we can pass it to cuDNN.
+    VLOG(4) << "Pad input tensor:"
+            << " padding_top=" << padding_top
+            << " padding_bottom=" << padding_bottom
+            << " padding_left=" << padding_left
+            << " padding_right=" << padding_right;
 
     // TODO(reedwm): In some cases, we can avoid an allocation even if the two
     // padding sides are different. For example, if the input is 2x2, the filter
@@ -738,8 +782,9 @@
     in_cols = new_in_cols;
   }
 
-  if (data_format == FORMAT_NHWC) {
-    // Convert the input tensor from NHWC to NCHW.
+  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
+    VLOG(4) << "Convert the input tensor from NHWC to NCHW.";
+
     TensorShape nchw_shape =
         ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
     if (in_depths > 1) {
@@ -755,28 +800,48 @@
       // If depth <= 1, then just reshape.
       CHECK(input.CopyFrom(input, nchw_shape));
     }
+  } else {
+    CHECK(data_format == compute_data_format)  // Crash OK
+        << "Illegal data and compute format pair:"
+        << " data_format=" << ToString(data_format)
+        << " compute_data_format=" << ToString(compute_data_format);
   }
 
   CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
       << "Negative row or col paddings: (" << common_padding_rows << ", "
       << common_padding_cols << ")";
+
+  constexpr auto kComputeInNHWC =
+      std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
+                      se::dnn::FilterLayout::kOutputYXInput);
+  constexpr auto kComputeInNCHW =
+      std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
+                      se::dnn::FilterLayout::kOutputInputYX);
+
+  se::dnn::DataLayout compute_data_layout;
+  se::dnn::FilterLayout filter_layout;
+
+  std::tie(compute_data_layout, filter_layout) =
+      compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;
+
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
       .set_height(in_rows)
       .set_width(in_cols)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(compute_data_layout);
   se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(out_batch)
       .set_height(out_rows)
       .set_width(out_cols)
       .set_feature_map_count(out_depths)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(compute_data_layout);
   se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(patch_rows)
       .set_input_filter_width(patch_cols)
       .set_input_feature_map_count(patch_depths)
-      .set_output_feature_map_count(filter.dim_size(3));
+      .set_output_feature_map_count(filter.dim_size(3))
+      .set_layout(filter_layout);
   se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(row_dilation)
       .set_horizontal_dilation_rate(col_dilation)
@@ -787,22 +852,42 @@
       .set_group_count(in_depths / patch_depths);
 
   Tensor transformed_filter;
-  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
-                          DataTypeToEnum<T>::value,
-                          TensorShape({filter.dim_size(3), filter.dim_size(2),
-                                       filter.dim_size(0), filter.dim_size(1)}),
-                          &transformed_filter));
-  functor::TransformFilter<GPUDevice, T, int, 4>()(
-      ctx->eigen_device<GPUDevice>(), FORMAT_OIHW,
-      To32Bit(filter.tensor<T, 4>()),
-      To32Bit(transformed_filter.tensor<T, 4>()));
+
+  // Allocates `transformed_filter` and fills it from the HWIO `filter`.
+  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
+    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
+            << " to " << ToString(dst_format);
+    TensorShape dst_shape =
+        dst_format == FORMAT_OIHW
+            ? TensorShape({filter.dim_size(3), filter.dim_size(2),
+                           filter.dim_size(0), filter.dim_size(1)})
+            : TensorShape({filter.dim_size(3), filter.dim_size(0),
+                           filter.dim_size(1), filter.dim_size(2)});
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
+                                          &transformed_filter));
+    functor::TransformFilter<GPUDevice, T, int, 4>()(
+        ctx->eigen_device<GPUDevice>(), dst_format,
+        To32Bit(filter.tensor<T, 4>()),
+        To32Bit(transformed_filter.tensor<T, 4>()));
+    return Status::OK();
+  };
+  // Checked here: OP_REQUIRES_OK inside the lambda would only exit the lambda.
+  if (compute_data_format == FORMAT_NCHW) {
+    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
+  } else if (compute_data_format == FORMAT_NHWC) {
+    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
+  } else {
+    ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
+                                           ToString(compute_data_format)));
+    return;
+  }
 
   Tensor transformed_output;
-  if (data_format == FORMAT_NHWC) {
-    // Only allocate temporary memory when a layout transformation is needed.
+  if (data_format != compute_data_format) {
+    VLOG(4) << "Allocate temporary memory for output in compute data format";
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                ShapeFromFormat(FORMAT_NCHW, out_batch,
+                                ShapeFromFormat(compute_data_format, out_batch,
                                                 out_rows, out_cols, out_depths),
                                 &transformed_output));
   } else {
@@ -830,7 +915,7 @@
       in_depths,                // in_depths
       {{in_rows,                // in_rows
         in_cols}},              // in_cols
-      FORMAT_NCHW,              // compute_data_format
+      compute_data_format,      // compute_data_format
       out_depths,               // out_depths
       {{patch_rows,             // filter_rows
         patch_cols,             // filter_cols
@@ -889,6 +974,11 @@
     AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
   }
 
+  VLOG(4) << "Convolution Algorithm: "
+          << algorithm_config.algorithm()->algo_id();
+  VLOG(4) << "tensor_ops_enabled: "
+          << algorithm_config.algorithm()->tensor_ops_enabled();
+
   DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
@@ -904,8 +994,8 @@
         ") filter shape(", filter.shape().DebugString(), ")"));
   }
 
-  // Convert the output tensor back from NCHW to NHWC.
-  if (data_format == FORMAT_NHWC) {
+  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
+    VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
     functor::NCHWToNHWC<GPUDevice, T, 4>()(
         ctx->eigen_device<GPUDevice>(),
         const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
diff --git a/tensorflow/core/kernels/conv_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_ops_benchmark_test.cc
index 259a2f2..a03f62b 100644
--- a/tensorflow/core/kernels/conv_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_ops_benchmark_test.cc
@@ -29,7 +29,7 @@
 namespace tensorflow {
 
 ////////////////////////////////////////////////////////////////////////////////
-// Performance benchmarks for the FusedConv2Op.                               //
+// Performance benchmarks for the Conv2DOp and FusedConv2Op.                  //
 ////////////////////////////////////////////////////////////////////////////////
 
 struct Conv2DGraph {
@@ -63,19 +63,27 @@
   Node* activation;
 };
 
+template <typename T>
 static Tensor MakeRandomTensor(const TensorShape& shape) {
-  Tensor tensor(DT_FLOAT, TensorShape(shape));
-  tensor.flat<float>() = tensor.flat<float>().setRandom();
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape(shape));
+  tensor.flat<T>() = tensor.flat<T>().setRandom();
   return tensor;
 }
 
 // Creates a simple Tensorflow graph with single Conv2D node.
+template <typename T>
 static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth,
-                          int filter_w, int filter_h, int out_depth) {
+                          int filter_w, int filter_h, int out_depth,
+                          TensorFormat data_format = FORMAT_NHWC) {
   Graph* graph = new Graph(OpRegistry::Global());
 
-  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
-  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor images_t = data_format == FORMAT_NHWC
+                        ? MakeRandomTensor<T>({batch, height, width, in_depth})
+                        : MakeRandomTensor<T>({batch, in_depth, height, width});
+
+  // Filter is always in HWIO.
+  Tensor filter_t =
+      MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth});
 
   Node* images = test::graph::Constant(graph, images_t, "images");
   Node* filter = test::graph::Constant(graph, filter_t, "filter");
@@ -84,33 +92,35 @@
   TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "Conv2D")
                   .Input(images)
                   .Input(filter)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Attr("strides", {1, 1, 1, 1})
                   .Attr("padding", "SAME")
+                  .Attr("data_format", ToString(data_format))
                   .Finalize(graph, &conv2d));
 
   return {graph, conv2d};
 }
 
 // Creates a Tensorflow graph with a Conv2D node followed by BiasAdd.
-static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width,
-                                          int in_depth, int filter_w,
-                                          int filter_h, int out_depth) {
-  Conv2DGraph conv_graph =
-      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+template <typename T>
+static Conv2DWithBiasGraph Conv2DWithBias(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, TensorFormat data_format = FORMAT_NHWC) {
+  Conv2DGraph conv_graph = Conv2D<T>(batch, height, width, in_depth, filter_w,
+                                     filter_h, out_depth, data_format);
 
   Graph* graph = conv_graph.graph;
   Node* conv2d = conv_graph.conv2d;
 
-  Tensor bias_t = MakeRandomTensor({out_depth});
+  Tensor bias_t = MakeRandomTensor<T>({out_depth});
   Node* bias = test::graph::Constant(graph, bias_t, "bias");
 
   Node* out;
   TF_CHECK_OK(NodeBuilder(graph->NewName("bias"), "BiasAdd")
                   .Input(conv2d)
                   .Input(bias)
-                  .Attr("T", DT_FLOAT)
-                  .Attr("data_format", "NHWC")
+                  .Attr("T", DataTypeToEnum<T>::value)
+                  .Attr("data_format", ToString(data_format))
                   .Finalize(graph, &out));
 
   return {graph, conv2d, out};
@@ -118,11 +128,14 @@
 
 // Creates a Tensorflow graph with a Conv2D node followed by BiasAdd and
 // activation (Relu, Relu6, etc...).
+template <typename T>
 static Conv2DWithBiasAndActivationGraph Conv2DWithBiasAndActivation(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const string& activation_type) {
-  Conv2DWithBiasGraph conv_graph = Conv2DWithBias(
-      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+    int out_depth, const string& activation_type,
+    TensorFormat data_format = FORMAT_NHWC) {
+  Conv2DWithBiasGraph conv_graph =
+      Conv2DWithBias<T>(batch, height, width, in_depth, filter_w, filter_h,
+                        out_depth, data_format);
 
   Graph* graph = conv_graph.graph;
   Node* conv2d = conv_graph.conv2d;
@@ -131,27 +144,27 @@
   Node* activation;
   TF_CHECK_OK(NodeBuilder(graph->NewName("activation"), activation_type)
                   .Input(bias)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Finalize(graph, &activation));
 
   return {graph, conv2d, bias, activation};
 }
 
 // Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm.
-static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height,
-                                                    int width, int in_depth,
-                                                    int filter_w, int filter_h,
-                                                    int out_depth) {
-  Conv2DGraph conv_graph =
-      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+template <typename T>
+static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, TensorFormat data_format = FORMAT_NHWC) {
+  Conv2DGraph conv_graph = Conv2D<T>(batch, height, width, in_depth, filter_w,
+                                     filter_h, out_depth, data_format);
 
   Graph* graph = conv_graph.graph;
   Node* conv2d = conv_graph.conv2d;
 
-  Tensor scale_t = MakeRandomTensor({out_depth});
-  Tensor offset_t = MakeRandomTensor({out_depth});
-  Tensor mean_t = MakeRandomTensor({out_depth});
-  Tensor variance_t = MakeRandomTensor({out_depth});
+  Tensor scale_t = MakeRandomTensor<T>({out_depth});
+  Tensor offset_t = MakeRandomTensor<T>({out_depth});
+  Tensor mean_t = MakeRandomTensor<T>({out_depth});
+  Tensor variance_t = MakeRandomTensor<T>({out_depth});
 
   Node* scale = test::graph::Constant(graph, scale_t, "scale");
   Node* offset = test::graph::Constant(graph, offset_t, "offset");
@@ -165,8 +178,9 @@
                   .Input(offset)
                   .Input(mean)
                   .Input(variance)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Attr("is_training", false)
+                  .Attr("data_format", ToString(data_format))
                   .Finalize(graph, &out));
 
   return {graph, conv2d, out};
@@ -174,11 +188,14 @@
 
 // Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm and
 // activation (Relu, Relu6, etc...).
+template <typename T>
 static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const string& activation_type) {
-  Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm(
-      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+    int out_depth, const string& activation_type,
+    TensorFormat data_format = FORMAT_NHWC) {
+  Conv2DWithBatchNormGraph conv_graph =
+      Conv2DWithBatchNorm<T>(batch, height, width, in_depth, filter_w, filter_h,
+                             out_depth, data_format);
 
   Graph* graph = conv_graph.graph;
   Node* conv2d = conv_graph.conv2d;
@@ -187,7 +204,7 @@
   Node* activation;
   TF_CHECK_OK(NodeBuilder(graph->NewName("activation"), activation_type)
                   .Input(batch_norm)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Finalize(graph, &activation));
 
   return {graph, conv2d, batch_norm, activation};
@@ -195,15 +212,22 @@
 
 // Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and
 // fuses into it additional computations (e.g. Relu).
+template <typename T>
 static Graph* FusedConv2DWithBias(int batch, int height, int width,
                                   int in_depth, int filter_w, int filter_h,
                                   int out_depth,
-                                  const std::vector<string>& fused_ops = {}) {
+                                  const std::vector<string>& fused_ops = {},
+                                  TensorFormat data_format = FORMAT_NHWC) {
   Graph* graph = new Graph(OpRegistry::Global());
 
-  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
-  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
-  Tensor bias_t = MakeRandomTensor({out_depth});
+  Tensor images_t = data_format == FORMAT_NHWC
+                        ? MakeRandomTensor<T>({batch, height, width, in_depth})
+                        : MakeRandomTensor<T>({batch, in_depth, height, width});
+
+  // Filter is always in HWIO.
+  Tensor filter_t =
+      MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth});
+  Tensor bias_t = MakeRandomTensor<T>({out_depth});
 
   Node* images = test::graph::Constant(graph, images_t, "images");
   Node* filter = test::graph::Constant(graph, filter_t, "filter");
@@ -217,7 +241,7 @@
                   .Input(filter)
                   .Attr("num_args", 1)
                   .Input(args)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Attr("strides", {1, 1, 1, 1})
                   .Attr("padding", "SAME")
                   .Attr("fused_ops", fused_ops)
@@ -228,17 +252,24 @@
 
 // Creates a tensorflow graph with a single FusedConv2D (with FusedBatchNorm)
 // node and fuses into it additional computations (e.g. Relu).
+template <typename T>
 static Graph* FusedConv2DWithBatchNorm(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const std::vector<string>& fused_ops = {}) {
+    int out_depth, const std::vector<string>& fused_ops = {},
+    TensorFormat data_format = FORMAT_NHWC) {
   Graph* graph = new Graph(OpRegistry::Global());
 
-  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
-  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
-  Tensor scale_t = MakeRandomTensor({out_depth});
-  Tensor offset_t = MakeRandomTensor({out_depth});
-  Tensor mean_t = MakeRandomTensor({out_depth});
-  Tensor variance_t = MakeRandomTensor({out_depth});
+  Tensor images_t = data_format == FORMAT_NHWC
+                        ? MakeRandomTensor<T>({batch, height, width, in_depth})
+                        : MakeRandomTensor<T>({batch, in_depth, height, width});
+
+  // Filter is always in HWIO.
+  Tensor filter_t =
+      MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth});
+  Tensor scale_t = MakeRandomTensor<T>({out_depth});
+  Tensor offset_t = MakeRandomTensor<T>({out_depth});
+  Tensor mean_t = MakeRandomTensor<T>({out_depth});
+  Tensor variance_t = MakeRandomTensor<T>({out_depth});
 
   Node* images = test::graph::Constant(graph, images_t, "images");
   Node* filter = test::graph::Constant(graph, filter_t, "filter");
@@ -255,7 +286,7 @@
                   .Input(filter)
                   .Attr("num_args", 4)
                   .Input(args)
-                  .Attr("T", DT_FLOAT)
+                  .Attr("T", DataTypeToEnum<T>::value)
                   .Attr("strides", {1, 1, 1, 1})
                   .Attr("padding", "SAME")
                   .Attr("fused_ops", fused_ops)
@@ -273,6 +304,10 @@
 //   FH: filter height
 //   FW: filter width
 
+// -------------------------------------------------------------------------- //
+// The following benchmarks always use the 'float' data type and NHWC layout.
+// -------------------------------------------------------------------------- //
+
 #define BM_SETUP(N, H, W, C, type, LABEL, NAME)                               \
   testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
   testing::SetLabel(LABEL);
@@ -280,39 +315,41 @@
 #define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
   name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
 
-#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL)                       \
-  static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) {  \
-    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
-    test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph).Run(iters); \
-  }                                                                          \
+#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL)                      \
+  static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                              \
+    test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph)     \
+        .Run(iters);                                                        \
+  }                                                                         \
   BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
 
 #define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
   static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH,       \
                       FC)(int iters) {                                   \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                           \
-    test::Benchmark(#type, Conv2DWithBias(N, H, W, C, FW, FH, FC).graph) \
+    test::Benchmark(#type,                                               \
+                    Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph) \
         .Run(iters);                                                     \
   }                                                                      \
   BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
 
-#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)      \
-  static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH,  \
-                      FC)(int iters) {                                     \
-    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                             \
-    test::Benchmark(                                                       \
-        #type,                                                             \
-        Conv2DWithBiasAndActivation(N, H, W, C, FW, FH, FC, "Relu").graph) \
-        .Run(iters);                                                       \
-  }                                                                        \
+#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)         \
+  static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH,     \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type, Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, \
+                                                              FH, FC, "Relu") \
+                               .graph)                                        \
+        .Run(iters);                                                          \
+  }                                                                           \
   BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
 
 #define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
   static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH,       \
                       FC)(int iters) {                                        \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
-    test::Benchmark(#type,                                                    \
-                    FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \
+    test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
+                                                      {"BiasAdd"}))           \
         .Run(iters);                                                          \
   }                                                                           \
   BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
@@ -321,8 +358,8 @@
   static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
                       FC)(int iters) {                                         \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
-    test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC,         \
-                                               {"BiasAdd", "Relu"}))           \
+    test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC,  \
+                                                      {"BiasAdd", "Relu"}))    \
         .Run(iters);                                                           \
   }                                                                            \
   BENCHMARK(                                                                   \
@@ -332,7 +369,8 @@
   static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH,       \
                       FC)(int iters) {                                        \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
-    test::Benchmark(#type, Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \
+    test::Benchmark(#type,                                                    \
+                    Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph) \
         .Run(iters);                                                          \
   }                                                                           \
   BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
@@ -341,8 +379,8 @@
   static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
                       FC)(int iters) {                                         \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
-    test::Benchmark(#type, Conv2DWithBatchNormAndActivation(N, H, W, C, FW,    \
-                                                            FH, FC, "Relu")    \
+    test::Benchmark(#type, Conv2DWithBatchNormAndActivation<float>(            \
+                               N, H, W, C, FW, FH, FC, "Relu")                 \
                                .graph)                                         \
         .Run(iters);                                                           \
   }                                                                            \
@@ -353,8 +391,8 @@
   static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
                       FC)(int iters) {                                       \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
-    test::Benchmark(#type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,  \
-                                                    {"FusedBatchNorm"}))     \
+    test::Benchmark(#type, FusedConv2DWithBatchNorm<float>(                  \
+                               N, H, W, C, FW, FH, FC, {"FusedBatchNorm"}))  \
         .Run(iters);                                                         \
   }                                                                          \
   BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
@@ -364,9 +402,9 @@
   static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C,   \
                       FW, FH, FC)(int iters) {                                \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
-    test::Benchmark(#type,                                                    \
-                    FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,          \
-                                             {"FusedBatchNorm", "Relu"}))     \
+    test::Benchmark(                                                          \
+        #type, FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC,        \
+                                               {"FusedBatchNorm", "Relu"}))   \
         .Run(iters);                                                          \
   }                                                                           \
   BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
@@ -500,4 +538,63 @@
 BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
 #endif
 
+// Macro arguments names: --------------------------------------------------- //
+//      T: data type
+// FORMAT: data format (NHWC or NCHW)
+//      N: batch size
+//      H: height
+//      W: width
+//      C: channels
+//     FC: filter count
+//     FH: filter height
+//     FW: filter width
+
+// -------------------------------------------------------------------------- //
+// The following benchmarks compare the performance of different data formats
+// for different data types. They make sense only when CUDA is enabled,
+// because on CPU we only support data in NHWC.
+// -------------------------------------------------------------------------- //
+
+#define BM_LONG_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \
+  name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
+
+#define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type)                 \
+  static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH,    \
+                           FC)(int iters) {                                   \
+    BM_SETUP(N, H, W, C, type, "", Conv2D);                                   \
+    test::Benchmark(#type,                                                    \
+                    Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC));
+
+#if GOOGLE_CUDA
+using fp32 = float;
+using fp16 = Eigen::half;
+
+// ResNet50-ish convolutions.
+#define BENCHMARK_DTYPE(BATCH, T)                             \
+  BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 1, 1, 64, gpu);    \
+  BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 1, 1, 256, gpu);   \
+  BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 256, 1, 1, 64, gpu);   \
+  BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 3, 3, 64, gpu);    \
+                                                              \
+  BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 128, 1, 1, 128, gpu);  \
+  BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 128, 1, 1, 512, gpu);  \
+  BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 512, 1, 1, 128, gpu);  \
+  BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 512, 3, 3, 128, gpu);  \
+                                                              \
+  BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 1, 1, 256, gpu);  \
+  BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \
+  BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \
+  BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 3, 3, 256, gpu);
+
+BENCHMARK_DTYPE(32, fp32);
+BENCHMARK_DTYPE(32, fp16);
+
+BENCHMARK_DTYPE(64, fp32);
+BENCHMARK_DTYPE(64, fp16);
+
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 3284262..933cdaa 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -91,18 +91,20 @@
  public:
   LaunchFusedConv2DWithOutputKernel(int row_stride, int col_stride,      //
                                     int row_dilation, int col_dilation,  //
-                                    Padding padding)
+                                    Padding padding,
+                                    const std::vector<int64>& explicit_paddings)
       : row_stride_(row_stride),
         col_stride_(col_stride),
         row_dilation_(row_dilation),
         col_dilation_(col_dilation),
-        padding_(padding) {}
+        padding_(padding),
+        explicit_paddings_(explicit_paddings) {}
 
   template <typename OutputKernel>
   void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
                   const Tensor& input, const Tensor& filter, Tensor* output) {
     if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
-        row_stride_ == 1 && col_stride_ == 1) {
+        row_stride_ == 1 && col_stride_ == 1 && padding_ != EXPLICIT) {
       int conv_width = 1;  // Width for the convolution step.
       for (int i = 0; i < 3; ++i) {
         conv_width *= output->dim_size(i);
@@ -135,11 +137,22 @@
           output_kernel);
 
     } else {
-      functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
-          ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
-          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_, col_stride_,
-          row_dilation_, col_dilation_, BrainPadding2EigenPadding(padding_),
-          output_kernel);
+      if (padding_ == EXPLICIT) {
+        functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
+            ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
+            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_,
+            col_stride_, row_dilation_, col_dilation_,
+            static_cast<int>(explicit_paddings_[2]),
+            static_cast<int>(explicit_paddings_[3]),
+            static_cast<int>(explicit_paddings_[4]),
+            static_cast<int>(explicit_paddings_[5]), output_kernel);
+      } else {
+        functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
+            ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
+            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_,
+            col_stride_, row_dilation_, col_dilation_,
+            BrainPadding2EigenPadding(padding_), output_kernel);
+      }
     }
   }
 
@@ -149,6 +162,7 @@
   int row_dilation_;
   int col_dilation_;
   const Padding padding_;
+  const std::vector<int64>& explicit_paddings_;
 };
 
 template <typename T>
@@ -180,7 +194,8 @@
 
     LaunchFusedConv2DWithOutputKernel<T> conv2d(
         dimensions.stride_rows, dimensions.stride_cols,
-        dimensions.dilation_rows, dimensions.dilation_cols, params.padding);
+        dimensions.dilation_rows, dimensions.dilation_cols, params.padding,
+        params.explicit_paddings);
 
     switch (fusion) {
       case FusedComputationType::kUndefined:
@@ -371,8 +386,6 @@
     const int64 patch_cols = filter.dim_size(1);
     const int64 patch_depths = filter.dim_size(2);
 
-    int64 padding_rows = 0;
-    int64 padding_cols = 0;
     const int64 out_batch = GetTensorDim(*output, params.data_format, 'N');
     const int64 out_rows = GetTensorDim(*output, params.data_format, 'H');
     const int64 out_cols = GetTensorDim(*output, params.data_format, 'W');
@@ -387,44 +400,61 @@
                 errors::InvalidArgument("bias depth must be equal to out depth",
                                         bias.shape().DebugString()));
 
-    if (params.padding == SAME) {
-      // Total padding on rows and cols is
-      // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
-      // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
-      // where (R', C') are output dimensions, (R, C) are input dimensions, S
-      // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
-      // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
-      // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
-      // we pad more on the right and bottom than on the top and left.
-      padding_rows = std::max<int>(
-          0, (out_rows - 1) * dimensions.stride_rows +
-                 (patch_rows - 1) * dimensions.dilation_rows + 1 - in_rows);
-      padding_cols = std::max<int>(
-          0, (out_cols - 1) * dimensions.stride_cols +
-                 (patch_cols - 1) * dimensions.dilation_cols + 1 - in_cols);
-      const bool rows_odd = (padding_rows % 2 != 0);
-      const bool cols_odd = (padding_cols % 2 != 0);
-      if (rows_odd || cols_odd) {
-        Tensor transformed_input;
-        int64 new_in_rows = in_rows + rows_odd;
-        int64 new_in_cols = in_cols + cols_odd;
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(
-                           DataTypeToEnum<T>::value,
-                           ShapeFromFormat(params.data_format, in_batch,
-                                           new_in_rows, new_in_cols, in_depths),
-                           &transformed_input));
+    const int64 common_padding_rows =
+        std::min(dimensions.pad_rows_before, dimensions.pad_rows_after);
+    const int64 common_padding_cols =
+        std::min(dimensions.pad_cols_before, dimensions.pad_cols_after);
+    if (dimensions.pad_rows_before != dimensions.pad_rows_after ||
+        dimensions.pad_cols_before != dimensions.pad_cols_after) {
+      // cuDNN only supports padding the same amount on the left and right
+      // sides, and on the top and bottom sides. So we manually create a new
+      // padded input tensor such that we can pass it to cuDNN.
 
-        functor::PadInput<GPUDevice, T, int, 4>()(
-            context->eigen_device<GPUDevice>(),
-            To32Bit(input_param.tensor<T, 4>()), {{0, 0}},
-            {{rows_odd, cols_odd}}, To32Bit(transformed_input.tensor<T, 4>()),
-            params.data_format);
-
-        input = transformed_input;
-        in_rows = new_in_rows;
-        in_cols = new_in_cols;
+      // TODO(reedwm): In some cases, we can avoid an allocation even if the two
+      // padding sides are different. For example, if the input is 2x2, the
+      // filter is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the
+      // result is the same as if the padding were (1, 1, 1, 1). Changing the
+      // padding in such a way would allow us to avoid the allocation.
+      Tensor transformed_input;
+      const int64 padding_rows_diff =
+          std::abs(dimensions.pad_rows_after - dimensions.pad_rows_before);
+      const int64 padding_cols_diff =
+          std::abs(dimensions.pad_cols_after - dimensions.pad_cols_before);
+      const int64 new_in_rows = in_rows + padding_rows_diff;
+      const int64 new_in_cols = in_cols + padding_cols_diff;
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(
+                         DataTypeToEnum<T>::value,
+                         ShapeFromFormat(params.data_format, in_batch,
+                                         new_in_rows, new_in_cols, in_depths),
+                         &transformed_input));
+      const int64 input_pad_top =
+          dimensions.pad_rows_before - common_padding_rows;
+      const int64 input_pad_bottom =
+          dimensions.pad_rows_after - common_padding_rows;
+      const int64 input_pad_left =
+          dimensions.pad_cols_before - common_padding_cols;
+      const int64 input_pad_right =
+          dimensions.pad_cols_after - common_padding_cols;
+      bool in_bounds =
+          FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
+          FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
+          FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
+          FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
+      if (!in_bounds) {
+        context->SetStatus(errors::InvalidArgument("Padding is too large."));
+        return;
       }
+      functor::PadInput<GPUDevice, T, int, 4>()(
+          context->eigen_device<GPUDevice>(),
+          To32Bit(input_param.tensor<T, 4>()),
+          {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+          {{static_cast<int>(input_pad_bottom),
+            static_cast<int>(input_pad_right)}},
+          To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
+      input = transformed_input;
+      in_rows = new_in_rows;
+      in_cols = new_in_cols;
     }
 
     if (params.data_format == FORMAT_NHWC) {
@@ -447,8 +477,8 @@
       }
     }
 
-    CHECK(padding_rows >= 0) << "Negative padding rows";  // Crash OK
-    CHECK(padding_cols >= 0) << "Negative padding cols";  // Crash OK
+    CHECK(common_padding_rows >= 0) << "Negative padding rows";  // Crash OK
+    CHECK(common_padding_cols >= 0) << "Negative padding cols";  // Crash OK
 
     se::dnn::ActivationMode dnn_activation_mode;
     switch (fusion) {
@@ -481,8 +511,8 @@
         .set_horizontal_dilation_rate(dimensions.dilation_cols)
         .set_vertical_filter_stride(dimensions.stride_rows)
         .set_horizontal_filter_stride(dimensions.stride_cols)
-        .set_zero_padding_height(padding_rows / 2)
-        .set_zero_padding_width(padding_cols / 2)
+        .set_zero_padding_height(common_padding_rows)
+        .set_zero_padding_width(common_padding_cols)
         .set_group_count(in_depths / patch_depths);
     se::dnn::BatchDescriptor output_desc;
     output_desc.set_count(out_batch)
@@ -547,8 +577,8 @@
               dimensions.dilation_cols}},  // dilation_cols
             {{dimensions.stride_rows,      // stride_rows
               dimensions.stride_cols}},    // stride_cols
-            {{padding_rows,                // padding_rows
-              padding_cols}},              // padding_cols
+            {{common_padding_rows,         // padding_rows
+              common_padding_cols}},       // padding_cols
             dtype,                         // tensor datatype
             device_id,                     // device_id
         },
@@ -579,8 +609,7 @@
       auto status = FindBestConvolveAlgorithm<T>(
           conv_parameters, launch, context, stream,
           [&](absl::Span<const tensorflow::AutotuneResult> results) {
-            LogFusedConvAutotuneResults(
-                se::dnn::ConvolutionKind::FORWARD,
+            LogFusedConvForwardAutotuneResults(
                 se::dnn::ToDataType<T>::value, input_desc, filter_desc,
                 output_desc, conv_desc, 1.0, 0.0, dnn_activation_mode,
                 stream->parent(), results);
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index b566fc4..bb0cd9e 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -607,15 +607,18 @@
   }
 
   void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
-                         const Tensor& bias_data, Tensor* output,
-                         bool allow_gpu_device = false, int stride = 1) {
+                         const Tensor& bias_data, const std::string& padding,
+                         const std::vector<int>& explicit_paddings,
+                         Tensor* output, bool allow_gpu_device = false,
+                         int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
 
     ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
-        {1, stride, stride, 1}, "SAME");
+        {1, stride, stride, 1}, padding,
+        ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
 
     ops::BiasAdd with_bias = ops::BiasAdd(
         root.WithOpName("with_bias"), conv,
@@ -626,15 +629,17 @@
 
   void RunConv2DWithBiasAndActivation(
       const Tensor& input_data, const Tensor& filter_data,
-      const Tensor& bias_data, const string& activation_type, Tensor* output,
-      bool allow_gpu_device = false, int stride = 1) {
+      const Tensor& bias_data, const string& activation_type,
+      const std::string& padding, const std::vector<int>& explicit_paddings,
+      Tensor* output, bool allow_gpu_device = false, int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
 
     ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
-        {1, stride, stride, 1}, "SAME");
+        {1, stride, stride, 1}, padding,
+        ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
 
     ops::BiasAdd with_bias = ops::BiasAdd(
         root.WithOpName("with_bias"), conv,
@@ -653,20 +658,20 @@
     RunAndFetch(root, "with_activation", output, allow_gpu_device);
   }
 
-  void RunConv2DWithBatchNorm(const Tensor& input_data,
-                              const Tensor& filter_data,
-                              const Tensor& scale_data,
-                              const Tensor& offset_data,
-                              const Tensor& mean_data,
-                              const Tensor& variance_data, Tensor* output,
-                              bool allow_gpu_device = false, int stride = 1) {
+  void RunConv2DWithBatchNorm(
+      const Tensor& input_data, const Tensor& filter_data,
+      const Tensor& scale_data, const Tensor& offset_data,
+      const Tensor& mean_data, const Tensor& variance_data,
+      const std::string& padding, const std::vector<int>& explicit_paddings,
+      Tensor* output, bool allow_gpu_device = false, int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
 
     ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
-        {1, stride, stride, 1}, "SAME");
+        {1, stride, stride, 1}, padding,
+        ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
 
     ops::FusedBatchNorm::Attrs attr;
     attr = attr.IsTraining(false);
@@ -686,7 +691,8 @@
       const Tensor& input_data, const Tensor& filter_data,
       const Tensor& scale_data, const Tensor& offset_data,
       const Tensor& mean_data, const Tensor& variance_data,
-      const string& activation_type, Tensor* output,
+      const string& activation_type, const std::string& padding,
+      const std::vector<int>& explicit_paddings, Tensor* output,
       bool allow_gpu_device = false, int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
 
@@ -694,7 +700,8 @@
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
-        {1, stride, stride, 1}, "SAME");
+        {1, stride, stride, 1}, padding,
+        ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
 
     ops::FusedBatchNorm::Attrs attr;
     attr = attr.IsTraining(false);
@@ -723,8 +730,11 @@
 
   void RunFusedConv2DOp(const Tensor& input_data, const Tensor& filter_data,
                         const std::vector<Tensor>& args_data,
-                        const std::vector<string>& fused_ops, Tensor* output,
-                        bool allow_gpu_device = false, int stride = 1) {
+                        const std::vector<string>& fused_ops,
+                        const std::string& padding,
+                        const std::vector<int>& explicit_paddings,
+                        Tensor* output, bool allow_gpu_device = false,
+                        int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
 
     DataType dtype = DataTypeToEnum<T>::v();
@@ -750,7 +760,8 @@
                      .Attr("num_args", num_args)
                      .Attr("T", dtype)
                      .Attr("strides", {1, stride, stride, 1})
-                     .Attr("padding", "SAME")
+                     .Attr("padding", padding)
+                     .Attr("explicit_paddings", explicit_paddings)
                      .Attr("fused_ops", fused_ops)
                      .Finalize(&fused_conv2d));
 
@@ -851,21 +862,26 @@
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
   // FusedConv2D.
   void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            const std::vector<int>& explicit_paddings = {},
                             int depth = kDepth, int image_width = kImageWidth,
                             int image_height = kImageHeight,
                             int image_batch_count = kImageBatchCount) {
+    std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
     const BiasAddGraphRunner run_default =
-        [this](const Tensor& input_data, const Tensor& filter_data,
-               const Tensor& bias_data, Tensor* out) {
-          RunConv2DWithBias(input_data, filter_data, bias_data, out);
+        [this, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBias(input_data, filter_data, bias_data, padding,
+                            explicit_paddings, out);
         };
 
-    const BiasAddGraphRunner run_fused = [this](const Tensor& input_data,
-                                                const Tensor& filter_data,
-                                                const Tensor& bias_data,
-                                                Tensor* out) {
-      RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"}, out);
-    };
+    const BiasAddGraphRunner run_fused =
+        [this, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& bias_data, Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"},
+                           padding, explicit_paddings, out);
+        };
 
     VerifyBiasAddTensorsNear(depth, image_width, image_height,
                              image_batch_count, filter_size, filter_count,
@@ -876,24 +892,29 @@
   // to FusedConv2D.
   void VerifyConv2DWithBiasAndActivation(
       const string& activation, int filter_size, int filter_count,
-      int depth = kDepth, int image_width = kImageWidth,
-      int image_height = kImageHeight,
+      const std::vector<int>& explicit_paddings = {}, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
       int image_batch_count = kImageBatchCount) {
+    std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
     const BiasAddGraphRunner run_default =
-        [this, &activation](const Tensor& input_data, const Tensor& filter_data,
-                            const Tensor& bias_data, Tensor* out) {
+        [this, &activation, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& bias_data, Tensor* out) {
           RunConv2DWithBiasAndActivation(
-              input_data, filter_data, bias_data, activation, out,
+              input_data, filter_data, bias_data, activation, padding,
+              explicit_paddings, out,
               /*allow_gpu_device=*/activation == "Relu");
         };
 
-    const BiasAddGraphRunner run_fused =
-        [this, &activation](const Tensor& input_data, const Tensor& filter_data,
-                            const Tensor& bias_data, Tensor* out) {
-          RunFusedConv2DOp(input_data, filter_data, {bias_data},
-                           {"BiasAdd", activation}, out,
-                           /*allow_gpu_device=*/activation == "Relu");
-        };
+    const BiasAddGraphRunner run_fused = [this, &activation, explicit_paddings,
+                                          padding](const Tensor& input_data,
+                                                   const Tensor& filter_data,
+                                                   const Tensor& bias_data,
+                                                   Tensor* out) {
+      RunFusedConv2DOp(input_data, filter_data, {bias_data},
+                       {"BiasAdd", activation}, padding, explicit_paddings, out,
+                       /*allow_gpu_device=*/activation == "Relu");
+    };
 
     VerifyBiasAddTensorsNear(depth, image_width, image_height,
                              image_batch_count, filter_size, filter_count,
@@ -903,27 +924,30 @@
   // Verifies that computing Conv2D+FusedBatchNorm in a graph is identical to
   // FusedConv2D.
   void VerifyConv2DWithBatchNorm(int filter_size, int filter_count,
+                                 const std::vector<int>& explicit_paddings = {},
                                  int depth = kDepth,
                                  int image_width = kImageWidth,
                                  int image_height = kImageHeight,
                                  int image_batch_count = kImageBatchCount) {
+    std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
     const BatchNormGraphRunner run_default =
-        [this](const Tensor& input_data, const Tensor& filter_data,
-               const Tensor& scale_data, const Tensor& offset_data,
-               const Tensor& mean_data, const Tensor& variance_data,
-               Tensor* out) {
+        [this, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& scale_data, const Tensor& offset_data,
+            const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
           RunConv2DWithBatchNorm(input_data, filter_data, scale_data,
-                                 offset_data, mean_data, variance_data, out);
+                                 offset_data, mean_data, variance_data, padding,
+                                 explicit_paddings, out);
         };
 
     const BatchNormGraphRunner run_fused =
-        [this](const Tensor& input_data, const Tensor& filter_data,
-               const Tensor& scale_data, const Tensor& offset_data,
-               const Tensor& mean_data, const Tensor& variance_data,
-               Tensor* out) {
+        [this, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& scale_data, const Tensor& offset_data,
+            const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
           RunFusedConv2DOp(input_data, filter_data,
                            {scale_data, offset_data, mean_data, variance_data},
-                           {"FusedBatchNorm"}, out);
+                           {"FusedBatchNorm"}, padding, explicit_paddings, out);
         };
 
     VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
@@ -935,27 +959,29 @@
   // identical to FusedConv2D.
   void VerifyConv2DWithBatchNormAndActivation(
       const string& activation, int filter_size, int filter_count,
-      int depth = kDepth, int image_width = kImageWidth,
-      int image_height = kImageHeight,
+      const std::vector<int>& explicit_paddings = {}, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
       int image_batch_count = kImageBatchCount) {
+    std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
     const BatchNormGraphRunner run_default =
-        [this, &activation](const Tensor& input_data, const Tensor& filter_data,
-                            const Tensor& scale_data, const Tensor& offset_data,
-                            const Tensor& mean_data,
-                            const Tensor& variance_data, Tensor* out) {
+        [this, &activation, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& scale_data, const Tensor& offset_data,
+            const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
           RunConv2DWithBatchNormAndActivation(
               input_data, filter_data, scale_data, offset_data, mean_data,
-              variance_data, activation, out);
+              variance_data, activation, padding, explicit_paddings, out);
         };
 
     const BatchNormGraphRunner run_fused =
-        [this, &activation](const Tensor& input_data, const Tensor& filter_data,
-                            const Tensor& scale_data, const Tensor& offset_data,
-                            const Tensor& mean_data,
-                            const Tensor& variance_data, Tensor* out) {
+        [this, &activation, explicit_paddings, padding](
+            const Tensor& input_data, const Tensor& filter_data,
+            const Tensor& scale_data, const Tensor& offset_data,
+            const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
           RunFusedConv2DOp(input_data, filter_data,
                            {scale_data, offset_data, mean_data, variance_data},
-                           {"FusedBatchNorm", activation}, out);
+                           {"FusedBatchNorm", activation}, padding,
+                           explicit_paddings, out);
         };
 
     VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
@@ -997,6 +1023,13 @@
   this->VerifyConv2DWithBias(filter_size, filter_count);
 }
 
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count,
+                             /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
+}
+
 TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) {
   const int filter_size = 1;
   const int filter_count = 12;
@@ -1024,6 +1057,17 @@
   }
 }
 
+TYPED_TEST_P(FusedConv2DWithBiasOpTest,
+             ExplicitPaddingConvolutionAndActivation) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  for (const string& activation : {"Relu", "Relu6", "Elu"}) {
+    this->VerifyConv2DWithBiasAndActivation(
+        activation, filter_size, filter_count,
+        /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
+  }
+}
+
 // -------------------------------------------------------------------------- //
 // Conv2D + FusedBatchNorm + {Activation}                                     //
 // -------------------------------------------------------------------------- //
@@ -1046,6 +1090,14 @@
   this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
 }
 
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(
+      filter_size, filter_count,
+      /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
+}
+
 TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) {
   const int filter_size = 1;
   const int filter_count = 12;
@@ -1074,21 +1126,36 @@
   }
 }
 
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest,
+             ExplicitPaddingConvolutionAndActivation) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  for (const string& activation : {"Relu", "Relu6", "Elu"}) {
+    this->VerifyConv2DWithBatchNormAndActivation(
+        activation, filter_size, filter_count,
+        /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
+  }
+}
+
 REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest,          //
                             OneByOneConvolution,                //
                             ImageSizeConvolution,               //
                             SpatialConvolution,                 //
+                            ExplicitPaddingConvolution,         //
                             OneByOneConvolutionAndActivation,   //
                             ImageSizeConvolutionAndActivation,  //
-                            SpatialConvolutionAndActivation);
+                            SpatialConvolutionAndActivation,    //
+                            ExplicitPaddingConvolutionAndActivation);
 
 REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,     //
                             OneByOneConvolution,                //
                             ImageSizeConvolution,               //
                             SpatialConvolution,                 //
+                            ExplicitPaddingConvolution,         //
                             OneByOneConvolutionAndActivation,   //
                             ImageSizeConvolutionAndActivation,  //
-                            SpatialConvolutionAndActivation);
+                            SpatialConvolutionAndActivation,    //
+                            ExplicitPaddingConvolutionAndActivation);
 
 using FusedBiasAddDataTypes = ::testing::Types<float, double>;
 INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest,
diff --git a/tensorflow/core/kernels/cuda_sparse.cc b/tensorflow/core/kernels/cuda_sparse.cc
index e0b0a2f..51a4d9c 100644
--- a/tensorflow/core/kernels/cuda_sparse.cc
+++ b/tensorflow/core/kernels/cuda_sparse.cc
@@ -211,6 +211,44 @@
 
 TF_CALL_LAPACK_TYPES(GTSV_INSTANCE);
 
+#define GTSV_NO_PIVOT_INSTANCE(Scalar, sparse_prefix)                          \
+  template <>                                                                  \
+  Status CudaSparse::GtsvNoPivot<Scalar>(int m, int n, const Scalar* dl,       \
+                                         const Scalar* d, const Scalar* du,    \
+                                         Scalar* B, int ldb) const {           \
+    DCHECK(initialized_);                                                      \
+    return GtsvImpl(SPARSE_FN(gtsv_nopivot, sparse_prefix), *cusparse_handle_, \
+                    m, n, dl, d, du, B, ldb);                                  \
+  }
+
+TF_CALL_LAPACK_TYPES(GTSV_NO_PIVOT_INSTANCE);
+
+template <typename Scalar, typename SparseFn>
+static inline Status GtsvStridedBatchImpl(SparseFn op,
+                                          cusparseHandle_t cusparse_handle,
+                                          int m, const Scalar* dl,
+                                          const Scalar* d, const Scalar* du,
+                                          Scalar* x, int batchCount,
+                                          int batchStride) {
+  TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, AsCudaComplex(dl),
+                                 AsCudaComplex(d), AsCudaComplex(du),
+                                 AsCudaComplex(x), batchCount, batchStride));
+  return Status::OK();
+}
+
+#define GTSV_STRIDED_BATCH_INSTANCE(Scalar, sparse_prefix)                   \
+  template <>                                                                \
+  Status CudaSparse::GtsvStridedBatch<Scalar>(                               \
+      int m, const Scalar* dl, const Scalar* d, const Scalar* du, Scalar* x, \
+      int batchCount, int batchStride) const {                               \
+    DCHECK(initialized_);                                                    \
+    return GtsvStridedBatchImpl(SPARSE_FN(gtsvStridedBatch, sparse_prefix),  \
+                                *cusparse_handle_, m, dl, d, du, x,          \
+                                batchCount, batchStride);                    \
+  }
+
+TF_CALL_LAPACK_TYPES(GTSV_STRIDED_BATCH_INSTANCE);
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cuda_sparse.h b/tensorflow/core/kernels/cuda_sparse.h
index e3770fb..e7c9708 100644
--- a/tensorflow/core/kernels/cuda_sparse.h
+++ b/tensorflow/core/kernels/cuda_sparse.h
@@ -109,6 +109,24 @@
   Status Gtsv(int m, int n, const Scalar *dl, const Scalar *d, const Scalar *du,
               Scalar *B, int ldb) const;
 
+  // Solves tridiagonal system of equations without pivoting.
+  // See:
+  // https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-gtsv_nopivot
+  // Returns Status::OK() if the kernel was launched successfully.
+  template <typename Scalar>
+  Status GtsvNoPivot(int m, int n, const Scalar *dl, const Scalar *d,
+                     const Scalar *du, Scalar *B, int ldb) const;
+
+  // Solves a batch of tridiagonal systems of equations. Supports only a
+  // single right-hand side per system. Doesn't do pivoting.
+  // See:
+  // https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-gtsvstridedbatch
+  // Returns Status::OK() if the kernel was launched successfully.
+  template <typename Scalar>
+  Status GtsvStridedBatch(int m, const Scalar *dl, const Scalar *d,
+                          const Scalar *du, Scalar *x, int batchCount,
+                          int batchStride) const;
+
  private:
   bool initialized_;
   OpKernelContext *context_;  // not owned.
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 269a145..d35520b 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -51,6 +51,14 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+        "//tensorflow/core/grappler/optimizers/data",
+        "//tensorflow/core/grappler/optimizers/data:function_utils",
+        "//tensorflow/core/grappler/optimizers/data:graph_utils",
     ],
 )
 
@@ -87,6 +95,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/time",
     ],
@@ -178,6 +187,26 @@
     ],
 )
 
+tf_cc_test(
+    name = "batch_dataset_op_test",
+    size = "small",
+    srcs = ["batch_dataset_op_test.cc"],
+    deps = [
+        ":batch_dataset_op",
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "shard_dataset_op",
     srcs = ["shard_dataset_op.cc"],
@@ -189,6 +218,26 @@
     ],
 )
 
+tf_cc_test(
+    name = "shard_dataset_op_test",
+    size = "small",
+    srcs = ["shard_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        ":shard_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "window_dataset_op",
     srcs = ["window_dataset_op.cc"],
@@ -201,6 +250,26 @@
     ],
 )
 
+tf_cc_test(
+    name = "window_dataset_op_test",
+    size = "small",
+    srcs = ["window_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        ":window_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "padded_batch_dataset_op",
     srcs = ["padded_batch_dataset_op.cc"],
@@ -212,6 +281,27 @@
     ],
 )
 
+tf_cc_test(
+    name = "padded_batch_dataset_op_test",
+    size = "small",
+    srcs = ["padded_batch_dataset_op_test.cc"],
+    deps = [
+        ":concatenate_dataset_op",
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":padded_batch_dataset_op",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "filter_dataset_op",
     srcs = ["filter_dataset_op.cc"],
@@ -227,6 +317,29 @@
     ],
 )
 
+tf_cc_test(
+    name = "filter_dataset_op_test",
+    size = "small",
+    srcs = ["filter_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":filter_dataset_op",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:unique_op",
+    ],
+)
+
 tf_kernel_library(
     name = "filter_by_component_dataset_op",
     srcs = ["filter_by_component_dataset_op.cc"],
@@ -239,6 +352,26 @@
     ],
 )
 
+tf_cc_test(
+    name = "filter_by_component_dataset_op_test",
+    size = "small",
+    srcs = ["filter_by_component_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":filter_by_component_dataset_op",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "map_dataset_op",
     srcs = ["map_dataset_op.cc"],
@@ -646,6 +779,24 @@
     ],
 )
 
+tf_cc_test(
+    name = "shuffle_dataset_op_test",
+    size = "small",
+    srcs = ["shuffle_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        ":shuffle_dataset_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "sparse_tensor_slice_dataset_op",
     srcs = ["sparse_tensor_slice_dataset_op.cc"],
@@ -876,31 +1027,11 @@
     ],
 )
 
-cc_library(
-    name = "graph_rewrite_dataset",
-    srcs = ["graph_rewrite_dataset.cc"],
-    hdrs = ["graph_rewrite_dataset.h"],
-    deps = [
-        ":captured_function",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:grappler_item_builder",
-        "//tensorflow/core/grappler/clusters:virtual_cluster",
-        "//tensorflow/core/grappler/optimizers:meta_optimizer",
-        "//tensorflow/core/grappler/optimizers/data",
-        "//tensorflow/core/grappler/optimizers/data:function_utils",
-        "//tensorflow/core/grappler/optimizers/data:graph_utils",
-    ],
-)
-
 tf_kernel_library(
     name = "optimize_dataset_op",
     srcs = ["optimize_dataset_op.cc"],
     deps = [
-        ":graph_rewrite_dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index f9ce0d9..e36d270 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -13,8 +13,12 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -28,7 +32,11 @@
  public:
   explicit BatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
-        op_version_(ctx->def().op() == "BatchDataset" ? 1 : 2) {}
+        op_version_(ctx->def().op() == "BatchDataset" ? 1 : 2) {
+    if (ctx->HasAttr("parallel_copy")) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("parallel_copy", &parallel_copy_));
+    }
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
@@ -45,17 +53,19 @@
                                                     &drop_remainder));
     }
 
-    *output = new Dataset(ctx, batch_size, drop_remainder, input);
+    *output =
+        new Dataset(ctx, batch_size, drop_remainder, parallel_copy_, input);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
-            const DatasetBase* input)
+            bool parallel_copy, const DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)),
           batch_size_(batch_size),
           drop_remainder_(drop_remainder),
+          parallel_copy_(parallel_copy),
           input_(input) {
       input_->Ref();
 
@@ -114,8 +124,11 @@
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
       Node* drop_remainder = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this, {input_graph_node, batch_size, drop_remainder}, output));
+      AttrValue parallel_copy;
+      b->BuildAttrValue(parallel_copy_, &parallel_copy);
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, batch_size, drop_remainder},
+                        {{"parallel_copy", parallel_copy}}, output));
       return Status::OK();
     }
 
@@ -167,13 +180,14 @@
           return Status::OK();
         }
 
-        // Copy the retrieved batch elements into one output tensor
-        // per tuple component.
-        // NOTE(mrry): If the input or output sizes are statically
-        // known, we could potentially read the input values in-place
-        // into their respective slice locations. This would require a
-        // different GetNext() overload that supports zero-copy, and might
-        // make sense in an optimization pass.
+        // Copy the retrieved batch elements into one output tensor per tuple
+        // component.
+        //
+        // NOTE(mrry): If the input or output sizes are statically known, we
+        // could potentially read the input values in-place into their
+        // respective slice locations. This would require a different GetNext()
+        // overload that supports zero-copy, and might make sense in an
+        // optimization pass.
         const size_t num_tuple_components = batch_elements[0].size();
         const int64 num_batch_elements = batch_elements.size();
         for (size_t component_index = 0; component_index < num_tuple_components;
@@ -191,21 +205,53 @@
           Tensor& batch_component = out_tensors->back();
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
+          //
+          // Every element's shape is validated before any copy is scheduled:
+          // with `parallel_copy_` the copy tasks capture locals of this
+          // function by reference, so it must not return while tasks are
+          // still in flight.
           for (size_t i = 0; i < num_batch_elements; ++i) {
             if (batch_elements[i][component_index].shape() !=
                 first_element.shape()) {
               return errors::InvalidArgument(
                   "Cannot batch tensors with different shapes in component ",
                   component_index, ". First element had shape ",
                   first_element.shape().DebugString(), " and element ", i,
                   " had shape ",
                   batch_elements[i][component_index].shape().DebugString(),
                   ".");
             }
-            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
-                std::move(batch_elements[i][component_index]), &batch_component,
-                i));
           }
+          // Moves element `index` of this component into its batch slice.
+          auto copy_element_fn = [component_index, &batch_elements,
+                                  &batch_component](int64 index) {
+            return batch_util::CopyElementToSlice(
+                std::move(batch_elements[index][component_index]),
+                &batch_component, index);
+          };
+          BlockingCounter counter(num_batch_elements);
+          Status status;
+          mutex status_mu;  // Guards `status` when copies run in parallel.
+          for (int64 i = 0; i < num_batch_elements; ++i) {
+            if (TF_PREDICT_FALSE(dataset()->parallel_copy_)) {
+              (*ctx->runner())(
+                  [i, &status, &status_mu, &counter, &copy_element_fn]() {
+                    Status s = copy_element_fn(i);
+                    {
+                      mutex_lock l(status_mu);
+                      status.Update(s);
+                    }
+                    counter.DecrementCount();
+                  });
+            } else {
+              status.Update(copy_element_fn(i));
+              counter.DecrementCount();
+            }
+          }
+          // Wait for all outstanding copies before reading `status` or
+          // letting the captured locals go out of scope.
+          counter.Wait();
+          TF_RETURN_IF_ERROR(status);
         }
         *end_of_sequence = false;
         return Status::OK();
@@ -247,11 +293,13 @@
 
     const int64 batch_size_;
     const bool drop_remainder_;
+    const bool parallel_copy_;
     const DatasetBase* const input_;
     std::vector<PartialTensorShape> output_shapes_;
   };
 
   const int op_version_;
+  bool parallel_copy_ = false;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BatchDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/batch_dataset_op_test.cc b/tensorflow/core/kernels/data/batch_dataset_op_test.cc
new file mode 100644
index 0000000..4f77d85
--- /dev/null
+++ b/tensorflow/core/kernels/data/batch_dataset_op_test.cc
@@ -0,0 +1,719 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "batch_dataset_v2";  // Name of the test node.
+constexpr char kOpName[] = "BatchDatasetV2";       // Op type under test.
+
+class BatchDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Builds a `kOpName` kernel for node `kNodeName` with the given attrs.
+  Status CreateBatchDatasetOpKernel(
+      bool parallel_copy, const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      std::unique_ptr<OpKernel>* batch_dataset_op_kernel) {
+    NodeDef node_def = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "batch_size", "drop_remainder"},
+        {{"parallel_copy", parallel_copy},
+         {"output_types", output_types},
+         {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def, batch_dataset_op_kernel));
+    return Status::OK();
+  }
+
+  // Validates `inputs` against `op_kernel`, then creates its kernel context.
+  Status CreateBatchDatasetContext(
+      OpKernel* const op_kernel,
+      gtl::InlinedVector<TensorValue, 4>* const inputs,
+      std::unique_ptr<OpKernelContext>* context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+};
+
+struct RangeDatasetParam {
+  int64 start;  // First value produced by the input range dataset.
+  int64 end;    // Upper bound; the test cases below treat it as exclusive.
+  int64 step;   // Increment between consecutive values.
+};
+
+struct TestCase {
+  RangeDatasetParam range_dataset_param;  // Arguments for the input dataset.
+  Tensor batch_size;      // Scalar int64 `batch_size` op input.
+  Tensor drop_remainder;  // Scalar bool `drop_remainder` op input.
+  bool parallel_copy;     // Value of the `parallel_copy` attr.
+  std::vector<Tensor> expected_outputs;  // Expected batches, in order.
+  DataTypeVector expected_output_dtypes;  // Also used as `output_types` attr.
+  std::vector<PartialTensorShape> expected_output_shapes;  // `output_shapes`.
+  int64 expected_cardinality;    // Expected `Cardinality()` of the dataset.
+  std::vector<int> breakpoints;  // Iteration counts for save/restore checks.
+};
+
+// Test Case 1: `drop_remainder` = false, `parallel_copy` = true; batch size 4
+// evenly divides the 12-element input range, giving three full batches.
+TestCase TestCase1() {
+  return {
+      /*range_dataset_param*/ {0, 12, 1},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+      /*parallel_copy*/ true,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}), {0, 1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}), {4, 5, 6, 7}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}),
+                                               {8, 9, 10, 11})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({4})},
+      /*expected_cardinality*/ 3,
+      /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 2: `drop_remainder` = true, `parallel_copy` = false; batch size 4
+// evenly divides the 12-element input range, so nothing is dropped.
+TestCase TestCase2() {
+  return {
+      /*range_dataset_param*/ {0, 12, 1},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*parallel_copy*/ false,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}), {0, 1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}), {4, 5, 6, 7}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({4}),
+                                               {8, 9, 10, 11})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({4})},
+      /*expected_cardinality*/ 3,
+      /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 3: `drop_remainder` = false, `parallel_copy` = false; batch size 3
+// cannot evenly divide 10 elements, so the last batch holds one element.
+TestCase TestCase3() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+          /*parallel_copy*/ false,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {0, 1, 2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {3, 4, 5}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {6, 7, 8}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({1}), {9})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1})},
+          /*expected_cardinality*/ 4,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 4: `drop_remainder` = true, `parallel_copy` = true; batch size 3
+// cannot evenly divide 10 elements, so the trailing element is dropped.
+TestCase TestCase4() {
+  return {
+      /*range_dataset_param*/ {0, 10, 1},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*parallel_copy*/ true,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {0, 1, 2}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {3, 4, 5}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {6, 7, 8})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({3})},
+      /*expected_cardinality*/ 3,
+      /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 5: `drop_remainder` = true, `parallel_copy` = true; `batch_size`
+// (12) exceeds the 10 input elements, so no batch is produced.
+TestCase TestCase5() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {12}),
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+          /*parallel_copy*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({12})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 6: `drop_remainder` = false, `parallel_copy` = true; `batch_size`
+// (12) exceeds the 10 input elements, so one 10-element batch is produced.
+TestCase TestCase6() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {12}),
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+          /*parallel_copy*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape({10}), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1})},
+          /*expected_cardinality*/ 1,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 7: `drop_remainder` = false, `parallel_copy` = false; the input
+// range {0, 0, 1} is empty, so no batch is produced.
+TestCase TestCase7() {
+  return {/*range_dataset_param*/ {0, 0, 1},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+          /*parallel_copy*/ false,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({4})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 8: an invalid (negative) `batch_size` of -1.
+TestCase InvalidBatchSizeTestCase() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-1}),
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+          /*parallel_copy*/ false,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({3})},
+          /*expected_cardinality*/ 3,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+class ParameterizedBatchDatasetOpTest  // Fixture for the TEST_P cases below.
+    : public BatchDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParameterizedBatchDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Make an iterator over the batch dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      batch_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Drain it; ASSERTs abort before `back()` or the deref could be invalid.
+  bool end_of_sequence = false;
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_ASSERT_OK(  // Fatal: a failed call may leave `end_of_sequence` unset.
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      ASSERT_LT(expected_outputs_it, test_case.expected_outputs.end());
+      TF_EXPECT_OK(ExpectEqual(out_tensors.back(), *expected_outputs_it));
+      expected_outputs_it++;
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // The dataset must carry the node name the kernel was created with.
+  EXPECT_EQ(batch_dataset->node_name(), kNodeName);
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // The dataset's type string must equal the op name.
+  EXPECT_EQ(batch_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // The dataset must report the dtypes passed as `output_types`.
+  TF_EXPECT_OK(VerifyTypesMatch(batch_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // The dataset's shapes must be compatible with the `output_shapes` attr.
+  TF_EXPECT_OK(VerifyShapesCompatible(batch_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Cardinality must match the expected number of batches.
+  EXPECT_EQ(batch_dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Serializing the dataset must succeed; the written data is not inspected.
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(batch_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Make an iterator over the batch dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      batch_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The iterator must report the dtypes passed as `output_types`.
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Make an iterator over the batch dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      batch_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The iterator's shapes must be compatible with the `output_shapes` attr.
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Make an iterator over the batch dataset with prefix "Iterator".
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      batch_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The iterator's prefix must be the given prefix followed by "::Batch".
+  EXPECT_EQ(iterator->prefix(), "Iterator::Batch");
+}
+
+TEST_P(ParameterizedBatchDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Instantiate the op kernel with this case's attrs.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  // Build the input range dataset and store it in a DT_VARIANT tensor.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Create the batch dataset from (input_dataset, batch_size, drop_remainder).
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  TF_ASSERT_OK(CreateDataset(batch_dataset_kernel.get(),
+                             batch_dataset_context.get(), &batch_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(batch_dataset);
+  // Make an iterator over the batch dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      batch_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Save and restore the iterator at each breakpoint, then advance past it.
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  for (int breakpoint : test_case.breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *batch_dataset, &iterator));
+    // ASSERTs abort before `back()` or the deref below could be invalid.
+    while (cur_iteration <= breakpoint) {
+      TF_ASSERT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        ASSERT_LT(expected_outputs_it, test_case.expected_outputs.end());
+        TF_EXPECT_OK(ExpectEqual(out_tensors.back(), *expected_outputs_it));
+        expected_outputs_it++;
+      }
+      cur_iteration++;
+    }
+    // At or beyond the expected cardinality the iterator must be exhausted.
+    if (breakpoint >= test_case.expected_cardinality) {
+      EXPECT_TRUE(end_of_sequence);
+      EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+// Runs each TEST_P above once per valid configuration (test cases 1-7).
+INSTANTIATE_TEST_SUITE_P(BatchDatasetOpTest, ParameterizedBatchDatasetOpTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {TestCase1(), TestCase2(), TestCase3(),
+                              TestCase4(), TestCase5(), TestCase6(),
+                              TestCase7()})));
+
+TEST_F(BatchDatasetOpTest, InvalidBatchSize) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TestCase test_case = InvalidBatchSizeTestCase();  // batch_size is -1.
+  std::unique_ptr<OpKernel> batch_dataset_kernel;
+  TF_ASSERT_OK(CreateBatchDatasetOpKernel(
+      test_case.parallel_copy, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &batch_dataset_kernel));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Kernel and context creation succeed; only dataset creation must fail.
+  Tensor batch_size = test_case.batch_size;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs{&range_dataset_tensor, &batch_size,
+                                            &drop_remainder};
+  std::unique_ptr<OpKernelContext> batch_dataset_context;
+  TF_ASSERT_OK(CreateBatchDatasetContext(batch_dataset_kernel.get(), &inputs,
+                                         &batch_dataset_context));
+  DatasetBase* batch_dataset;
+  EXPECT_EQ(CreateDataset(batch_dataset_kernel.get(),
+                          batch_dataset_context.get(), &batch_dataset)
+                .code(),
+            tensorflow::error::INVALID_ARGUMENT);
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 39bbbc6..fdb84e8 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -19,6 +19,7 @@
 #include "absl/time/clock.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
@@ -101,6 +102,113 @@
   int64 processing_time_ GUARDED_BY(mu_) = 0;
 };
 
+Status RunShortCircuit(const ShortCircuitInfo& info,
+                       const std::vector<Tensor>& args,
+                       const std::vector<Tensor>& captured_inputs,
+                       std::vector<Tensor>* rets) {
+  // Forwards inputs straight to `rets` without running the function. An index
+  // below `args.size()` selects an argument; any other index selects the
+  // captured input at `index - args.size()`.
+  const size_t num_args = args.size();
+  for (const int index : info.indices) {
+    const bool is_arg = index < num_args;
+    rets->push_back(is_arg ? args[index] : captured_inputs[index - num_args]);
+  }
+  return Status::OK();
+}
+
+Status RunShortCircuit(const ShortCircuitInfo& info, std::vector<Tensor>&& args,
+                       const std::vector<Tensor>& captured_inputs,
+                       std::vector<Tensor>* rets) {
+  // `args` is owned here, so entries flagged in `info.can_move` are moved.
+  const size_t num_args = args.size();
+  for (size_t i = 0; i < info.indices.size(); ++i) {
+    const int index = info.indices[i];
+    if (index >= num_args) {
+      rets->push_back(captured_inputs[index - num_args]);
+    } else if (info.can_move[i]) {
+      rets->push_back(std::move(args[index]));
+    } else {
+      rets->push_back(args[index]);
+    }
+  }
+  return Status::OK();
+}
+
+Status CreateShortCircuitInfo(OpKernelConstruction* ctx,
+                              const NameAttrList& func,
+                              ShortCircuitInfo* info) {
+  // Determines whether every output of `func` is one of its inputs, possibly
+  // routed through Identity nodes. If so, `info` receives the output-to-input
+  // mapping; otherwise `info->indices` ends up empty.
+  FunctionLibraryRuntime* lib = ctx->function_library();
+  auto& mapping = info->indices;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(
+      lib->Instantiate(func.name(), AttrSlice(&func.attr()), &handle));
+  // The handle is only needed to inspect the function body; release it on
+  // every exit path.
+  auto cleanup = gtl::MakeCleanup([lib, handle]() {
+    Status status = lib->ReleaseHandle(handle);
+    if (!status.ok()) {
+      LOG(WARNING) << "Failed to release handle: " << status.error_message();
+    }
+  });
+
+  // Stateful functions are never short-circuited: the whole function runs.
+  if (lib->IsStateful(func.name())) return Status::OK();
+
+  const FunctionBody* body = lib->GetFunctionBody(handle);
+  const size_t num_rets = body->ret_nodes.size();
+  mapping.resize(num_rets);
+  for (size_t i = 0; i < num_rets; ++i) {
+    // Walk from the return node past any chain of Identity nodes.
+    Node* source;
+    TF_RETURN_IF_ERROR(body->ret_nodes[i]->input_node(0, &source));
+    while (source->def().op() == "Identity") {
+      TF_RETURN_IF_ERROR(source->input_node(0, &source));
+    }
+    if (source->def().op() != FunctionLibraryDefinition::kArgOp) {
+      // An output that is not a plain argument rules out short-circuiting.
+      mapping.clear();
+      break;
+    }
+    TF_RETURN_IF_ERROR(GetNodeAttr(source->def(), "index", &(mapping[i])));
+  }
+
+  // Compute the `can_move` vector: an output may take ownership of its input
+  // only at the last position that refers to that input.
+  if (!mapping.empty()) {
+    std::map<int, int> last_use;
+    for (size_t i = 0; i < mapping.size(); ++i) {
+      last_use[mapping[i]] = i;
+    }
+    auto& can_move = info->can_move;
+    can_move.resize(mapping.size());
+    for (size_t i = 0; i < mapping.size(); ++i) {
+      can_move[i] = last_use[mapping[i]] == i;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status CreateFunctionLibraryDefinition(
+    const FunctionLibraryDefinition* lib_def, const string& func_name,
+    std::unique_ptr<FunctionLibraryDefinition>* result) {
+  // `*result` gets `func_name` plus the definitions reachable from it.
+  DCHECK(lib_def != nullptr);
+  const FunctionDef* definition = lib_def->Find(func_name);
+  if (TF_PREDICT_FALSE(definition == nullptr)) {
+    return errors::FailedPrecondition(strings::StrCat(
+        "Could not find required function definition ", func_name));
+  }
+  *result = absl::make_unique<FunctionLibraryDefinition>(
+      lib_def->ReachableDefinitions(*definition));
+  return (*result)->AddFunctionDef(*definition);
+}
+
 }  // namespace
 
 Status MakeIteratorFromInputElement(
@@ -129,31 +237,46 @@
 }
 
 /* static */
-Status CapturedFunction::Create(
-    const NameAttrList& func, OpKernelContext* ctx, const string& argument_name,
-    Params params, std::unique_ptr<CapturedFunction>* out_function) {
-  OpInputList inputs;
-  TF_RETURN_IF_ERROR(ctx->input_list(argument_name, &inputs));
-  std::vector<Tensor> captured_inputs(inputs.begin(), inputs.end());
-  return Create(func, ctx, std::move(captured_inputs), std::move(params),
-                out_function);
+Status FunctionMetadata::Create(
+    OpKernelConstruction* ctx, const string& func_name, Params params,
+    std::shared_ptr<FunctionMetadata>* out_metadata) {
+  NameAttrList func;  // Read from the attribute of `ctx` named `func_name`.
+  TF_RETURN_IF_ERROR(ctx->GetAttr(func_name, &func));
+  return Create(ctx, std::move(func), params, out_metadata);  // Other overload.
+}
+
+Status FunctionMetadata::Create(
+    OpKernelConstruction* ctx, NameAttrList&& func, Params params,
+    std::shared_ptr<FunctionMetadata>* out_metadata) {
+  out_metadata->reset(new FunctionMetadata(std::move(func), params));  // NOTE: stays set even if a later step fails.
+  TF_RETURN_IF_ERROR(CreateFunctionLibraryDefinition(
+      ctx->function_library()->GetFunctionLibraryDefinition(),
+      (*out_metadata)->func_.name(), &(*out_metadata)->lib_def_));
+  TF_RETURN_IF_ERROR(CreateShortCircuitInfo(
+      ctx, (*out_metadata)->func_, &(*out_metadata)->short_circuit_info_));
+  return Status::OK();
 }
 
 /* static */
 Status CapturedFunction::Create(
-    const NameAttrList& func, OpKernelContext* ctx,
-    std::vector<Tensor>&& captured_inputs, Params params,
+    OpKernelContext* ctx,
+    const std::shared_ptr<const FunctionMetadata> metadata,
+    const string& argument_name,
     std::unique_ptr<CapturedFunction>* out_function) {
-  if (params.lib_def == nullptr)
-    return errors::Internal(
-        "After cl/242905426 the CapturedFunction factories require the "
-        "FunctionLibraryDefinition parameter to be set. The expectation is "
-        "that any tf.data op kernel that uses the CapturedFunction mechanism "
-        "to invoke user-defined functions will create an instance of "
-        "FunctionLibraryDefinition in its constructor. See map_dataset_op.cc "
-        "for a code example.");
-  *out_function = absl::WrapUnique(new CapturedFunction(
-      func, std::move(captured_inputs), std::move(params)));
+  OpInputList inputs;
+  TF_RETURN_IF_ERROR(ctx->input_list(argument_name, &inputs));
+  std::vector<Tensor> captured_inputs(inputs.begin(), inputs.end());
+  return Create(ctx, metadata, std::move(captured_inputs), out_function);
+}
+
+/* static */
+Status CapturedFunction::Create(
+    OpKernelContext* ctx,
+    const std::shared_ptr<const FunctionMetadata> metadata,
+    std::vector<Tensor>&& captured_inputs,
+    std::unique_ptr<CapturedFunction>* out_function) {
+  *out_function = absl::WrapUnique(
+      new CapturedFunction(metadata, std::move(captured_inputs)));
   return Status::OK();
 }
 
@@ -175,9 +298,8 @@
     other_arguments->emplace_back(node);
     other_arguments_types->emplace_back(t.dtype());
   }
-
-  TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name(), *lib_def_));
-
+  TF_RETURN_IF_ERROR(
+      b->AddFunction(ctx, metadata_->func().name(), *metadata_->lib_def()));
   return Status::OK();
 }
 
@@ -187,27 +309,28 @@
   // The context's runtime will be used for all subsequent calls.
   FunctionLibraryRuntime* lib = ctx->flr();
   FunctionLibraryRuntime::InstantiateOptions inst_opts;
-  inst_opts.lib_def = lib_def_.get();
+  inst_opts.lib_def = metadata_->lib_def();
   inst_opts.create_kernels_eagerly = true;
-  if (!use_inter_op_parallelism_) {
+  if (!metadata_->use_inter_op_parallelism()) {
     inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR";
   }
-  inst_opts.is_multi_device_function = is_multi_device_function_;
+  inst_opts.is_multi_device_function = metadata_->is_multi_device_function();
 
   // We infer the target device from the function library runtime.
   DCHECK(lib->device() != nullptr);
   inst_opts.target = lib->device()->name();
 
-  if (is_multi_device_function_) {
+  if (metadata_->is_multi_device_function()) {
     // Compute devices of non-captured inputs.
     //
     // We infer the number of non-captured inputs by subtracting the number
     // of captured inputs from the number of input arguments and we infer the
     // input devices from the function library runtime.
-    const FunctionDef* fdef = lib_def_->Find(func_.name());
+    const FunctionDef* fdef =
+        metadata_->lib_def()->Find(metadata_->func().name());
     if (fdef == nullptr) {
       return errors::InvalidArgument(
-          "Failed to find function ", func_.name(),
+          "Failed to find function ", metadata_->func().name(),
           " in function library: ", lib->GetFunctionLibraryDefinition());
     }
     size_t num_non_captured_inputs =
@@ -216,6 +339,7 @@
       inst_opts.input_devices.push_back(inst_opts.target);
     }
     // Compute devices of captured inputs.
+    // TODO(jsimsa): Correctly handle tensors on devices other than CPU:0.
     Device* cpu_device;
     TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
     for (auto& input : captured_inputs_) {
@@ -224,18 +348,22 @@
         const ResourceHandle& handle = input.flat<ResourceHandle>()(0);
         inst_opts.input_devices.push_back(handle.device());
       } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
-        // TODO(jsimsa): Correctly handle tensors on devices other than CPU:0.
         inst_opts.input_devices.push_back(cpu_device->name());
       } else {
         // Fall back to using the function library runtime device.
         inst_opts.input_devices.push_back(inst_opts.target);
       }
     }
+
+    for (size_t i = 0; i < fdef->signature().output_arg_size(); ++i) {
+      inst_opts.output_devices.push_back(inst_opts.target);
+    }
   }
 
   FunctionLibraryRuntime::Handle f_handle;
   TF_RETURN_IF_ERROR(ctx->function_handle_cache()->Instantiate(
-      func_.name(), AttrSlice(&func_.attr()), inst_opts, &f_handle));
+      metadata_->func().name(), AttrSlice(&metadata_->func().attr()), inst_opts,
+      &f_handle));
 
   DataTypeVector ret_types;
   TF_RETURN_IF_ERROR(lib->GetRetTypes(f_handle, &ret_types));
@@ -381,8 +509,13 @@
 Status InstantiatedCapturedFunction::Run(IteratorContext* ctx,
                                          std::vector<Tensor>&& args,
                                          std::vector<Tensor>* rets) const {
+  auto& info = captured_func_->short_circuit_info();
+  if (!info.indices.empty()) {
+    return RunShortCircuit(info, std::move(args),
+                           captured_func_->captured_inputs(), rets);
+  }
+
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
   ScopedStepContainer step_container(
       f_opts.step_id, [this](const string& name) {
         lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
@@ -418,8 +551,12 @@
 Status InstantiatedCapturedFunction::RunWithBorrowedArgs(
     IteratorContext* ctx, const std::vector<Tensor>& args,
     std::vector<Tensor>* rets) const {
+  auto& info = captured_func_->short_circuit_info();
+  if (!info.indices.empty()) {
+    return RunShortCircuit(info, args, captured_func_->captured_inputs(), rets);
+  }
+
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
   ScopedStepContainer step_container(
       f_opts.step_id, [this](const string& name) {
         lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
@@ -454,8 +591,12 @@
 
 Status InstantiatedCapturedFunction::RunInstantiated(
     const std::vector<Tensor>& args, std::vector<Tensor>* rets) {
+  auto& info = captured_func_->short_circuit_info();
+  if (!info.indices.empty()) {
+    return RunShortCircuit(info, args, captured_func_->captured_inputs(), rets);
+  }
+
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
   ScopedStepContainer step_container(
       f_opts.step_id, [this](const string& name) {
         lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
@@ -491,6 +632,19 @@
 void InstantiatedCapturedFunction::RunAsync(
     IteratorContext* ctx, std::vector<Tensor>&& args, std::vector<Tensor>* rets,
     FunctionLibraryRuntime::DoneCallback done, const string& prefix) const {
+  auto& info = captured_func_->short_circuit_info();
+  if (!info.indices.empty()) {
+    // Run the `done` callback on a threadpool thread, because it will
+    // potentially do a non-trivial amount of (e.g. copying) work, and we may
+    // want to run that concurrently with the next invocation.
+    Status s = RunShortCircuit(info, std::move(args),
+                               captured_func_->captured_inputs(), rets);
+    (*ctx->runner())(
+        std::bind([s](FunctionLibraryRuntime::DoneCallback& done) { done(s); },
+                  std::move(done)));
+    return;
+  }
+
   // NOTE(mrry): This method does not transfer ownership of `ctx`, and it may
   // be deleted before `done` is called. Take care not to capture `ctx` in any
   // code that may execute asynchronously in this function.
@@ -498,7 +652,6 @@
       std::move(args), &captured_func_->captured_inputs(), ret_types_);
 
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
   ResourceMgr* resource_mgr = lib_->device()->resource_manager();
   ScopedStepContainer* step_container = new ScopedStepContainer(
       f_opts.step_id, [resource_mgr](const string& name) {
@@ -567,14 +720,10 @@
   lib_->Run(f_opts, f_handle_, frame, std::move(callback));
 }
 
-CapturedFunction::CapturedFunction(const NameAttrList& func,
-                                   std::vector<Tensor> captured_inputs,
-                                   Params params)
-    : func_(func),
-      captured_inputs_(std::move(captured_inputs)),
-      use_inter_op_parallelism_(params.use_inter_op_parallelism),
-      is_multi_device_function_(params.is_multi_device_function),
-      lib_def_(std::move(params.lib_def)) {}
+CapturedFunction::CapturedFunction(
+    const std::shared_ptr<const FunctionMetadata> metadata,
+    std::vector<Tensor> captured_inputs)
+    : metadata_(metadata), captured_inputs_(std::move(captured_inputs)) {}
 
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index 79afb96..6a074d0 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -20,6 +20,7 @@
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -87,16 +88,6 @@
                 FunctionLibraryRuntime::DoneCallback done,
                 const string& prefix) const;
 
-  // Returns a step ID for use when running an `InstantiatedCapturedFunction`.
-  static int64 generate_step_id() {
-    // Choose a step ID that is guaranteed not to clash with any
-    // Session-generated step ID. DirectSession only generates
-    // non-negative step IDs (contiguous, starting from 0), and
-    // MasterSession generates 56-bit random step IDs whose MSB is
-    // always 0, so a negative random step ID should suffice.
-    return -std::abs(static_cast<int64>(random::New64()));
-  }
-
  private:
   InstantiatedCapturedFunction(
       FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
@@ -115,26 +106,79 @@
   TF_DISALLOW_COPY_AND_ASSIGN(InstantiatedCapturedFunction);
 };
 
+struct ShortCircuitInfo {
+  std::vector<int> indices;  // indices[i]: input forwarded as output i.
+  std::vector<bool> can_move;  // can_move[i]: output i may be moved from args.
+};
+
+// Metadata shared across all captures of the same function.
+class FunctionMetadata {
+ public:
+  struct Params {
+    bool is_multi_device_function = false;
+    bool use_inter_op_parallelism = true;
+  };
+
+  // Creates a new instance of the `FunctionMetadata` class, reading the
+  // function from the attribute of `ctx` that is named `func_name`.
+  static Status Create(tensorflow::OpKernelConstruction* ctx,
+                       const string& func_name, Params params,
+                       std::shared_ptr<FunctionMetadata>* out_metadata);
+
+  // Creates a new instance of the `FunctionMetadata` class, using the provided
+  // function, which is moved into the new instance.
+  static Status Create(tensorflow::OpKernelConstruction* ctx,
+                       NameAttrList&& func, Params params,
+                       std::shared_ptr<FunctionMetadata>* out_metadata);
+
+  // Returns the function's name together with its attribute list.
+  const NameAttrList& func() const { return func_; }
+
+  // Indicates whether the function is a multi-device function.
+  bool is_multi_device_function() const { return is_multi_device_function_; }
+
+  // Returns a borrowed pointer to the function library that contains the
+  // transitive closure of definitions used by the function.
+  const FunctionLibraryDefinition* lib_def() const { return lib_def_.get(); }
+
+  // Returns short-circuit information (empty `indices`: no short-circuit).
+  const ShortCircuitInfo& short_circuit_info() const {
+    return short_circuit_info_;
+  }
+
+  // Indicates whether to use inter-op parallelism for execution of the
+  // function.
+  bool use_inter_op_parallelism() const { return use_inter_op_parallelism_; }
+
+ private:
+  FunctionMetadata(NameAttrList&& func, Params params)
+      : func_(std::move(func)),
+        is_multi_device_function_(params.is_multi_device_function),
+        use_inter_op_parallelism_(params.use_inter_op_parallelism) {}
+
+  NameAttrList func_;
+  bool is_multi_device_function_ = false;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_ = nullptr;
+  ShortCircuitInfo short_circuit_info_;
+  bool use_inter_op_parallelism_ = true;
+};
+
 // A `CapturedFunction` encapsulates a TensorFlow function, plus any "captured"
 // arguments that it closed over in the user program.
 class CapturedFunction {
  public:
-  struct Params {
-    bool use_inter_op_parallelism = true;
-    bool is_multi_device_function = false;
-    std::shared_ptr<FunctionLibraryDefinition> lib_def = nullptr;
-  };
-
   // Creates a new instance using a list of named attributes, fetching captured
   // inputs from a context argument.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument_name, Params params,
+  static Status Create(OpKernelContext* ctx,
+                       const std::shared_ptr<const FunctionMetadata> metadata,
+                       const string& argument_name,
                        std::unique_ptr<CapturedFunction>* out_function);
 
   // Creates a new instance using a list of named attributes, using provided
   // captured inputs.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       std::vector<Tensor>&& captured_inputs, Params params,
+  static Status Create(OpKernelContext* ctx,
+                       const std::shared_ptr<const FunctionMetadata> metadata,
+                       std::vector<Tensor>&& captured_inputs,
                        std::unique_ptr<CapturedFunction>* out_function);
 
   // Adds the definition of this captured function into the given graph,
@@ -157,27 +201,37 @@
   }
 
   // Returns the named list of function arguments.
-  const NameAttrList& func() const { return func_; }
+  const NameAttrList& func() const { return metadata_->func(); }
 
   // Indicates whether the function is multi-device.
-  bool is_multi_device_function() const { return is_multi_device_function_; }
+  bool is_multi_device_function() const {
+    return metadata_->is_multi_device_function();
+  }
 
   // Returns the transitive set of function definition required to instantiate
   // this function.
-  const FunctionLibraryDefinition* lib_def() const { return lib_def_.get(); }
+  const FunctionLibraryDefinition* lib_def() const {
+    return metadata_->lib_def();
+  }
+
+  // If every function output corresponds to one of its inputs, the method
+  // returns the mapping from output indices to input indices. Otherwise, it
+  // returns an empty list.
+  const ShortCircuitInfo& short_circuit_info() const {
+    return metadata_->short_circuit_info();
+  }
 
   // Indicates whether the function should use inter op parallelism.
-  bool use_inter_op_parallelism() const { return use_inter_op_parallelism_; }
+  bool use_inter_op_parallelism() const {
+    return metadata_->use_inter_op_parallelism();
+  }
 
  private:
-  CapturedFunction(const NameAttrList& func,
-                   std::vector<Tensor> captured_inputs, Params params);
+  CapturedFunction(const std::shared_ptr<const FunctionMetadata> metadata,
+                   std::vector<Tensor> captured_inputs);
 
-  const NameAttrList func_;
+  const std::shared_ptr<const FunctionMetadata> metadata_;
   const std::vector<Tensor> captured_inputs_;
-  const bool use_inter_op_parallelism_;
-  const bool is_multi_device_function_;
-  std::shared_ptr<const FunctionLibraryDefinition> lib_def_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
 };
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index 6963dcd..d75634a 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -32,7 +32,8 @@
     DatasetBase* dataset;
     OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
     GraphDef graph_def;
-    OP_REQUIRES_OK(ctx, AsGraphDef(ctx, dataset, &graph_def));
+    OP_REQUIRES_OK(
+        ctx, AsGraphDef(ctx, dataset, SerializationContext({}), &graph_def));
     Tensor* result;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &result));
     result->scalar<string>()() = graph_def.SerializeAsString();
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
index b03abed..6765a5a 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.cc
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -18,12 +18,39 @@
 namespace tensorflow {
 namespace data {
 
+template <typename T>
+Status IsEqual(const Tensor& t1, const Tensor& t2) {
+  // OK only if dtype, shape and every element (read as T) agree.
+  if (t1.dtype() != t2.dtype()) {
+    return tensorflow::errors::Internal(
+        "Two tensors have different dtypes: ", DataTypeString(t1.dtype()),
+        " vs. ", DataTypeString(t2.dtype()));
+  }
+  if (!t1.IsSameSize(t2)) {
+    return tensorflow::errors::Internal(
+        "Two tensors have different shapes: ", t1.shape().DebugString(),
+        " vs. ", t2.shape().DebugString());
+  }
+
+  const auto lhs = t1.flat<T>();
+  const auto rhs = t2.flat<T>();
+  const auto num_elements = lhs.size();
+  for (int i = 0; i < num_elements; ++i) {
+    if (lhs(i) != rhs(i)) {
+      return tensorflow::errors::Internal(
+          "Two tensors have different values "
+          "at [",
+          i, "]: ", lhs(i), " vs. ", rhs(i));
+    }
+  }
+  return Status::OK();
+}
+
 Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
-  EXPECT_EQ(a.dtype(), b.dtype());
   switch (a.dtype()) {
-#define CASE(type)                       \
-  case DataTypeToEnum<type>::value:      \
-    test::ExpectTensorEqual<type>(a, b); \
+#define CASE(DT)                           \
+  case DataTypeToEnum<DT>::value:          \
+    TF_RETURN_IF_ERROR(IsEqual<DT>(a, b)); \
     break;
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_string(CASE);
@@ -36,7 +63,7 @@
 }
 
 template <typename T>
-bool compare(Tensor t1, Tensor t2) {
+bool compare(const Tensor& t1, const Tensor& t2) {
   auto flat_t1 = t1.flat<T>();
   auto flat_t2 = t2.flat<T>();
   auto length = std::min(flat_t1.size(), flat_t2.size());
@@ -49,7 +76,7 @@
 
 Status DatasetOpsTestBase::ExpectEqual(std::vector<Tensor> produced_tensors,
                                        std::vector<Tensor> expected_tensors,
-                                       bool expect_items_equal) {
+                                       bool compare_order) {
   if (produced_tensors.size() != expected_tensors.size()) {
     return Status(tensorflow::errors::Internal(
         "The two tensor vectors have different size (", produced_tensors.size(),
@@ -64,7 +91,7 @@
         ")"));
   }
 
-  if (expect_items_equal) {
+  if (!compare_order) {
     const DataType& dtype = produced_tensors[0].dtype();
     switch (dtype) {
 #define CASE(DT)                                                \
@@ -190,6 +217,7 @@
     OpKernelContext* const op_context,
     std::unique_ptr<IteratorContext>* iterator_context) {
   IteratorContext::Params params(op_context);
+  params.resource_mgr = op_context->resource_manager();
   function_handle_cache_ = absl::make_unique<FunctionHandleCache>(flr_);
   params.function_handle_cache = function_handle_cache_.get();
   *iterator_context = absl::make_unique<IteratorContext>(params);
@@ -228,6 +256,7 @@
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
   device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+  resource_mgr_ = absl::make_unique<ResourceMgr>("default_container");
 
   FunctionDefLibrary proto;
   for (const auto& fdef : flib) *(proto.add_function()) = fdef;
@@ -269,6 +298,7 @@
   step_container_ =
       absl::make_unique<ScopedStepContainer>(0, [](const string&) {});
   params_->step_container = step_container_.get();
+  params_->resource_manager = resource_mgr_.get();
   checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
   slice_reader_cache_ =
       absl::make_unique<checkpoint::TensorSliceReaderCacheWrapper>();
diff --git a/tensorflow/core/kernels/data/dataset_test_base.h b/tensorflow/core/kernels/data/dataset_test_base.h
index f3a0d0a..d82a0c3 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.h
+++ b/tensorflow/core/kernels/data/dataset_test_base.h
@@ -52,11 +52,11 @@
   static Status ExpectEqual(const Tensor& a, const Tensor& b);
 
   // The method validates whether the two tensor vectors have the same tensors.
-  // If `expect_items_equal` is true, the method will only evaluate the two
+  // If `compare_order` is false, the method will only evaluate whether the two
   // vectors have the same elements regardless of order.
   static Status ExpectEqual(std::vector<Tensor> produced_tensors,
                             std::vector<Tensor> expected_tensors,
-                            bool expect_items_equal);
+                            bool compare_order);
 
   // Creates a tensor with the specified dtype, shape, and value.
   template <typename T>
@@ -206,6 +206,7 @@
   std::function<void(std::function<void()>)> runner_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<ResourceMgr> resource_mgr_;
   std::unique_ptr<OpKernelContext::Params> params_;
   std::unique_ptr<checkpoint::TensorSliceReaderCacheWrapper>
       slice_reader_cache_;
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 4127026..d6101c1 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -17,20 +17,128 @@
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 namespace data {
+namespace {
 
-Status AsGraphDef(OpKernelContext* ctx, DatasetBase* dataset,
+void AddFakeSinks(FunctionDef* function_def) {
+  // Routes every function output through a new Identity ("FakeSink<N>") node.
+  int sink_id = 0;
+  auto& ret_map = *function_def->mutable_ret();
+  for (const auto& output_arg : function_def->signature().output_arg()) {
+    NodeDef* sink = function_def->add_node_def();
+    tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+        strings::StrCat("FakeSink", sink_id++), function_def, sink);
+    sink->set_op("Identity");
+    sink->add_input(ret_map.at(output_arg.name()));
+    (*sink->mutable_attr())["T"].set_type(output_arg.type());
+    ret_map[output_arg.name()] = strings::StrCat(sink->name(), ":output:0");
+  }
+}
+
+void RemoveFakeSinks(FunctionDef* function_def) {
+  // Map from identity node names to their input tensor strings
+  std::map<string, string> passthrough;
+  for (const auto& node : function_def->node_def()) {
+    if (node.op() != "Identity" || node.input_size() != 1) continue;
+    passthrough[node.name()] = node.input(0);
+  }
+  // Point each return value that comes from such a node at the node's input.
+  auto& ret_map = *function_def->mutable_ret();
+  for (const auto& output_arg : function_def->signature().output_arg()) {
+    const string& tensor = ret_map.at(output_arg.name());
+    const auto match = passthrough.find(tensor.substr(0, tensor.find(':')));
+    if (match != passthrough.end()) {
+      ret_map[output_arg.name()] = match->second;
+    }
+  }
+}
+
+// Runs Grappler's meta optimizer, configured by `config_factory`, over
+// `graph_def` in place; `*output_node` is updated to name the new fetch node.
+Status ApplyRewrites(OpKernelContext* ctx,
+                     const std::function<RewriterConfig(void)> config_factory,
+                     bool optimize_function_library, GraphDef* graph_def,
+                     string* output_node) {
+  // Add an identity node as the fetch node, otherwise we might get 'placeholder
+  // is both fed and fetched' errors in some cases when using input list with
+  // placeholder dataset nodes.
+  NodeDef* node = graph_def->mutable_node()->Add();
+  tensorflow::grappler::graph_utils::SetUniqueGraphNodeName("Sink", graph_def,
+                                                            node);
+  node->set_op("Identity");
+  node->add_input(*output_node);
+  (*node->mutable_attr())["T"].set_type(DT_VARIANT);
+  *output_node = node->name();
+
+  // Add fake sink node to graph and functions to allow rewriting the actual
+  // sink nodes.
+  //
+  // TODO(b/118820916): When MetaOptimizer adds provisions for function retvals
+  // to be optimizable, we will no longer need this.
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    AddFakeSinks(&function_def);
+  }
+
+  // Create metagraph.
+  MetaGraphDef meta_graph_def;
+  (*meta_graph_def.mutable_graph_def()) = *graph_def;
+  // Grappler determines fetch ops from collection 'train_op'.
+  CollectionDef collection_def;
+  collection_def.mutable_node_list()->add_value(*output_node);
+  (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
+
+  // Create Grappler item.
+  tensorflow::grappler::ItemConfig item_config;
+  item_config.apply_optimizations = true;
+  std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
+      tensorflow::grappler::GrapplerItemFromMetaGraphDef(
+          "graph", meta_graph_def, item_config);
+  // The factory yields nullptr when the metagraph cannot be converted.
+  if (grappler_item == nullptr) {
+    return errors::Internal("Failed to create a Grappler item for rewriting.");
+  }
+  grappler_item->optimization_options().optimize_function_library =
+      optimize_function_library;
+  std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+  tensorflow::grappler::VirtualCluster cluster(device_map);
+
+  // Run data optimizer using grappler's meta optimizer.
+  tensorflow::ConfigProto config;
+  *config.mutable_graph_options()->mutable_rewrite_options() = config_factory();
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      *grappler_item, config, ctx->device(), &cluster, graph_def));
+
+  // Remove the fake sinks now that optimization is done (see TODO above).
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    RemoveFakeSinks(&function_def);
+  }
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status AsGraphDef(OpKernelContext* ctx, const DatasetBase* dataset,
+                  SerializationContext&& serialization_ctx,
                   GraphDef* graph_def) {
   GraphDefBuilder b;
   DatasetBase::DatasetGraphDefBuilder db(&b);
   Node* output_node = nullptr;
-  SerializationContext serialization_ctx({});
   TF_RETURN_IF_ERROR(
       db.AddInputDataset(&serialization_ctx, dataset, &output_node));
   // Insert a purely symbolic _Retval node to indicate to consumers which Tensor
@@ -44,63 +152,57 @@
   return Status::OK();
 }
 
-Status ComputeShortCircuitIndices(OpKernelConstruction* ctx,
-                                  const NameAttrList& func,
-                                  std::vector<int>* indices) {
-  FunctionLibraryRuntime::Handle fn_handle;
-  TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
-      func.name(), AttrSlice(&func.attr()), &fn_handle));
-  auto cleanup = gtl::MakeCleanup([ctx, fn_handle]() {
-    Status s = ctx->function_library()->ReleaseHandle(fn_handle);
-    if (!s.ok()) {
-      LOG(WARNING) << "Failed to release handle: " << s.error_message();
-    }
-  });
+// Rewrites `input`: serializes it to a GraphDef, applies the rewrites selected
+// by `config_factory`, and runs the rewritten graph to produce the dataset
+// stored in `*rewritten_input`, on which an extra reference is taken (Ref()).
+Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input,
+                      std::function<RewriterConfig(void)> config_factory,
+                      bool optimize_function_library,
+                      DatasetBase** rewritten_input) {
+  SerializationContext::Params params;
+  std::vector<std::pair<string, Tensor>> input_list;
+  params.input_list = &input_list;
+  params.optimization_only = true;
+  SerializationContext serialization_ctx(params);
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(
+      AsGraphDef(ctx, input, std::move(serialization_ctx), &graph_def));
 
-  // If the function contains any stateful operations, we conservatively execute
-  // the entire function.
-  if (ctx->function_library()->IsStateful(func.name())) {
-    indices->clear();
-    return Status::OK();
-  }
-
-  const FunctionBody* fn_body =
-      ctx->function_library()->GetFunctionBody(fn_handle);
-  indices->resize(fn_body->ret_nodes.size());
-
-  for (size_t i = 0; i < fn_body->ret_nodes.size(); ++i) {
-    Node* ret_node = fn_body->ret_nodes[i];
-    Node* ret_input_node;
-    TF_RETURN_IF_ERROR(ret_node->input_node(0, &ret_input_node));
-
-    while (ret_input_node->def().op() == "Identity") {
-      TF_RETURN_IF_ERROR(ret_input_node->input_node(0, &ret_input_node));
-    }
-
-    if (ret_input_node->def().op() == FunctionLibraryDefinition::kArgOp) {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(ret_input_node->def(), "index", &((*indices)[i])));
-    } else {
-      indices->clear();
-      break;
+  string output_node;
+  for (const auto& node : graph_def.node()) {
+    if (node.op() == "_Retval") {
+      output_node = node.input(0);
     }
   }
+
+  VLOG(3) << "Before graph rewrites: " << graph_def.DebugString();
+  TF_RETURN_IF_ERROR(ApplyRewrites(ctx, config_factory,
+                                   optimize_function_library, &graph_def,
+                                   &output_node));
+  VLOG(3) << "After graph rewrites: " << graph_def.DebugString();
+
+  // Instantiate the optimized input pipeline by running the optimized graph
+  // using the optimized function library.
+  FunctionLibraryRuntime* flr = nullptr;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr = nullptr;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def = nullptr;
+  TF_RETURN_IF_ERROR(
+      ctx->function_library()->Clone(&lib_def, &pflr, &flr, true));
+  // Some functions may have been modified without having their names changed
+  // (for example, nested dataset graphs from FlatMap or Interleave).
+  TF_RETURN_IF_ERROR(AddToFunctionLibrary(lib_def.get(), graph_def.library()));
+
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+  std::vector<Tensor> outputs;
+  GraphRunner graph_runner(flr->device());
+  TF_RETURN_IF_ERROR(
+      graph_runner.Run(&graph, flr, input_list, {output_node}, &outputs));
+  TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], rewritten_input));
+  (*rewritten_input)->Ref();
   return Status::OK();
 }
 
-std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
-  std::map<int, int> last_use;
-  for (size_t i = 0; i < indices.size(); ++i) {
-    last_use[indices[i]] = i;
-  }
-  std::vector<bool> can_move;
-  can_move.resize(indices.size());
-  for (size_t i = 0; i < indices.size(); ++i) {
-    can_move[i] = last_use[indices[i]] == i;
-  }
-  return can_move;
-}
-
 Status VerifyTypesMatch(const DataTypeVector& expected,
                         const DataTypeVector& received) {
   if (expected.size() != received.size()) {
@@ -277,20 +379,5 @@
       std::move(runner), std::placeholders::_1);
 }
 
-Status CreateFunctionLibraryDefinition(
-    const FunctionLibraryDefinition* lib_def, const string& func_name,
-    std::shared_ptr<FunctionLibraryDefinition>* result) {
-  DCHECK(lib_def != nullptr);
-  const FunctionDef* fdef = lib_def->Find(func_name);
-  if (TF_PREDICT_FALSE(fdef == nullptr)) {
-    return tensorflow::errors::FailedPrecondition(tensorflow::strings::StrCat(
-        "Could not find required function definition ", func_name));
-  }
-  *result = std::make_shared<FunctionLibraryDefinition>(
-      lib_def->ReachableDefinitions(*fdef));
-  TF_RETURN_IF_ERROR((*result)->AddFunctionDef(*fdef));
-  return Status::OK();
-}
-
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 81bf36c..9d8f4a3 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -22,28 +22,15 @@
 namespace data {
 
 // Returns a GraphDef representation of the given dataset.
-Status AsGraphDef(OpKernelContext* ctx, DatasetBase* dataset,
+Status AsGraphDef(OpKernelContext* ctx, const DatasetBase* dataset,
+                  SerializationContext&& serialization_ctx,
                   GraphDef* graph_def);
 
-// This method is used to determine whether we can short-circuit the evaluation
-// of the user-defined function `func`. Short-circuting is possible if every
-// function output corresponds to one of its inputs (e.g. `f(x) = x`, `f(x,y) =
-// (y,x)`, or `f(x) = (x,x)`).
-//
-// If short-circuiting is possible, the method stores the mapping from output
-// indices to input indices in `indices`. Otherwise, `indices` will be empty.
-//
-// Returns non-ok status if analysis of the function fails.
-//
-// TODO(jsimsa): Extend this to support constants as well.
-Status ComputeShortCircuitIndices(OpKernelConstruction* ctx,
-                                  const NameAttrList& func,
-                                  std::vector<int>* indices);
-
-// Given a vector that maps output indices to input indices, return a vector
-// that identifies for which output indices can we move the input (assuming
-// output indices are processed left to right).
-std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
+// Rewrites `input` per `config_factory`; the result carries a new reference.
+Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input,
+                      std::function<RewriterConfig(void)> config_factory,
+                      bool optimize_function_library,
+                      DatasetBase** rewritten_input);
 
 // Returns Status::OK() if `expected` and `received` types match,
 // errors::InvalidArgument otherwise.
@@ -108,10 +95,6 @@
 std::function<void(std::function<void()>)> RunnerWithMaxParallelism(
     std::function<void(std::function<void()>)> runner, int max_parallelism);
 
-Status CreateFunctionLibraryDefinition(
-    const FunctionLibraryDefinition* lib_def, const string& func_name,
-    std::shared_ptr<FunctionLibraryDefinition>* result);
-
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
index 23ae9d4..a553b8a 100644
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -25,26 +25,6 @@
 namespace data {
 namespace {
 
-TEST(DatasetUtilsTest, ComputeMoveVector) {
-  struct TestCase {
-    std::vector<int> indices;
-    std::vector<bool> expected;
-  };
-
-  TestCase test_cases[] = {
-      TestCase{{}, {}},
-      TestCase{{1}, {true}},
-      TestCase{{1, 1}, {false, true}},
-      TestCase{{1, 2}, {true, true}},
-      TestCase{{1, 1, 2}, {false, true, true}},
-      TestCase{{1, 2, 2}, {true, false, true}},
-  };
-
-  for (auto& test_case : test_cases) {
-    EXPECT_EQ(test_case.expected, ComputeMoveVector(test_case.indices));
-  }
-}
-
 TEST(DatasetUtilsTest, VariantTensorDataRoundtrip) {
   VariantTensorData data;
   VariantTensorDataWriter writer(&data);
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 0c99cca..ccd5844 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -22,6 +22,21 @@
 )
 
 tf_kernel_library(
+    name = "auto_shard_dataset_op",
+    srcs = ["auto_shard_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/optimizers/data:auto_shard",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
     name = "choose_fastest_branch_dataset_op",
     srcs = ["choose_fastest_branch_dataset_op.cc"],
     deps = [
@@ -29,6 +44,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels/data:captured_function",
         "//tensorflow/core/kernels/data:dataset_utils",
         "//tensorflow/core/kernels/data:take_dataset_op",
@@ -69,21 +85,6 @@
 )
 
 tf_kernel_library(
-    name = "auto_shard_dataset_op",
-    srcs = ["auto_shard_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler/optimizers/data:auto_shard",
-        "//tensorflow/core/kernels/data:graph_rewrite_dataset",
-    ],
-)
-
-tf_kernel_library(
     name = "group_by_reducer_dataset_op",
     srcs = ["group_by_reducer_dataset_op.cc"],
     deps = [
@@ -123,18 +124,6 @@
 )
 
 tf_kernel_library(
-    name = "indexed_dataset_op",
-    srcs = ["indexed_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core/kernels/data:dataset_utils",
-        "//third_party/eigen3",
-    ],
-)
-
-tf_kernel_library(
     name = "lmdb_dataset_op",
     srcs = ["lmdb_dataset_op.cc"],
     deps = [
@@ -199,24 +188,6 @@
 )
 
 tf_kernel_library(
-    name = "numa_map_and_batch_dataset_op",
-    srcs = ["numa_map_and_batch_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:array_ops_op_lib",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
-        "//tensorflow/core/kernels:inplace_ops",
-        "//tensorflow/core/kernels/data:captured_function",
-        "//tensorflow/core/kernels/data:dataset_utils",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-tf_kernel_library(
     name = "parallel_interleave_dataset_op",
     srcs = ["parallel_interleave_dataset_op.cc"],
     deps = [
@@ -276,7 +247,7 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/optimizers/data:rebatch",
-        "//tensorflow/core/kernels/data:graph_rewrite_dataset",
+        "//tensorflow/core/kernels/data:dataset_utils",
     ],
 )
 
@@ -337,6 +308,18 @@
 )
 
 tf_kernel_library(
+    name = "snapshot_dataset_op",
+    srcs = ["snapshot_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
     name = "sql_dataset_op",
     srcs = [
         "sql_dataset_op.cc",
@@ -446,12 +429,10 @@
         ":group_by_reducer_dataset_op",
         ":group_by_window_dataset_op",
         ":ignore_errors_dataset_op",
-        ":indexed_dataset_op",
         ":lmdb_dataset_op",
         ":map_and_batch_dataset_op",
         ":matching_files_dataset_op",
         ":non_serializable_dataset_op",
-        ":numa_map_and_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parse_example_dataset_op",
         ":prefetching_kernels",
@@ -462,6 +443,7 @@
         ":set_stats_aggregator_dataset_op",
         ":sleep_dataset_op",
         ":sliding_window_dataset_op",
+        ":snapshot_dataset_op",
         ":sql_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc
index 3728c64..7531225 100644
--- a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace data {
@@ -24,17 +25,12 @@
 class AutoShardDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit AutoShardDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
+      : UnaryDatasetOpKernel(ctx) {}
 
  protected:
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    int64 index;
-    int64 num_workers;
+    int64 index, num_workers;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
     OP_REQUIRES(
         ctx, num_workers > 0,
@@ -45,69 +41,39 @@
                 errors::InvalidArgument("index must be between 0 and ",
                                         num_workers - 1));
 
-    Dataset* dataset = new Dataset(ctx, input, num_workers, index,
-                                   output_types_, output_shapes_);
-    const Status s = dataset->Optimize(ctx);
+    auto config_factory = [num_workers, index]() {
+      return CreateConfig(num_workers, index);
+    };
 
-    if (s.ok()) {
-      *output = dataset;
-    } else {
-      dataset->Unref();
-      OP_REQUIRES_OK(ctx, s);
-    }
+    // We only want to optimize functions for some particular datasets like
+    // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+    // function optimization and explicitly handle function modifications
+    // for those datasets in the rewrite.
+    OP_REQUIRES_OK(ctx,
+                   RewriteDataset(ctx, input, std::move(config_factory),
+                                  /*optimize_function_library=*/false, output));
   }
 
  private:
-  class Dataset : public GraphRewriteDataset {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const int64 num_workers, const int64 index,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
-          num_workers_(num_workers),
-          index_(index) {}
+  static RewriterConfig CreateConfig(int64 num_workers, int64 index) {
+    RewriterConfig rewriter_config;
+    rewriter_config.set_fail_on_optimizer_errors(true);
+    rewriter_config.add_optimizers(kOptimizerName);
+    rewriter_config.set_meta_optimizer_iterations(
+        RewriterConfig_NumIterationsType_ONE);
+    auto custom_optimizer = rewriter_config.add_custom_optimizers();
+    custom_optimizer->set_name(kOptimizerName);
+    AttrValue num_workers_attr;
+    num_workers_attr.set_i(num_workers);
+    (*custom_optimizer->mutable_parameter_map())["num_workers"] =
+        num_workers_attr;
 
-    string DebugString() const override {
-      return "AutoShardDatasetOp::Dataset";
-    }
+    AttrValue index_attr;
+    index_attr.set_i(index);
+    (*custom_optimizer->mutable_parameter_map())["index"] = index_attr;
 
-   private:
-    bool ShouldOptimizeFunctions() override {
-      // We only want to optimize functions for some particular datasets like
-      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
-      // function optimization and explicitly handle function modifications
-      // for those datasets in the rewrite.
-      return false;
-    }
-
-    RewriterConfig CreateGrapplerRewriteConfig() override {
-      RewriterConfig rewriter_config;
-      rewriter_config.set_fail_on_optimizer_errors(true);
-      rewriter_config.add_optimizers(kOptimizerName);
-      rewriter_config.set_meta_optimizer_iterations(
-          RewriterConfig_NumIterationsType_ONE);
-      auto custom_optimizer = rewriter_config.add_custom_optimizers();
-      custom_optimizer->set_name(kOptimizerName);
-      AttrValue num_workers_attr;
-      num_workers_attr.set_i(num_workers_);
-      (*custom_optimizer->mutable_parameter_map())["num_workers"] =
-          num_workers_attr;
-
-      AttrValue index_attr;
-      index_attr.set_i(index_);
-      (*custom_optimizer->mutable_parameter_map())["index"] = index_attr;
-
-      return rewriter_config;
-    }
-
-    const int64 num_workers_;
-    const int64 index_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
+    return rewriter_config;
+  }
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExperimentalAutoShardDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc
index e8ec235..8b4bafe 100644
--- a/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
@@ -128,13 +129,15 @@
 class ChooseFastestBranchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ChooseFastestBranchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        lib_def_(std::make_shared<FunctionLibraryDefinition>(
-            ctx->function_library()
-                ->GetFunctionLibraryDefinition()
-                ->default_registry(),
-            FunctionDefLibrary{})) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &funcs_));
+      : UnaryDatasetOpKernel(ctx) {
+    std::vector<NameAttrList> funcs;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &funcs));
+    func_metadatas_.resize(funcs.size());
+    for (int i = 0; i < funcs.size(); ++i) {
+      OP_REQUIRES_OK(
+          ctx, FunctionMetadata::Create(ctx, std::move(funcs[i]), /*params=*/{},
+                                        &func_metadatas_[i]));
+    }
     OP_REQUIRES_OK(ctx, ctx->GetAttr("num_elements_per_branch",
                                      &num_elements_per_branch_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -142,17 +145,8 @@
     OP_REQUIRES_OK(ctx, ctx->GetAttr("other_arguments_lengths",
                                      &other_arguments_lengths_));
 
-    for (const auto& func : funcs_) {
-      std::shared_ptr<FunctionLibraryDefinition> result;
-      OP_REQUIRES_OK(
-          ctx, CreateFunctionLibraryDefinition(
-                   ctx->function_library()->GetFunctionLibraryDefinition(),
-                   func.name(), &result));
-      OP_REQUIRES_OK(ctx, lib_def_->AddLibrary(*result));
-    }
-
     OP_REQUIRES(
-        ctx, funcs_.size() == other_arguments_lengths_.size(),
+        ctx, func_metadatas_.size() == other_arguments_lengths_.size(),
         errors::InvalidArgument(
             "branches and other_arguments_lengths must have the same length."));
   }
@@ -174,36 +168,32 @@
                                         "divisible by `ratio_denominator`."));
 
     std::vector<std::unique_ptr<CapturedFunction>> captured_funcs(
-        funcs_.size());
+        func_metadatas_.size());
     OpInputList inputs;
     OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
 
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
     // Keeps track of starting index into other_arguments for a given function.
     int index = 0;
-    for (int i = 0; i < funcs_.size(); ++i) {
+    for (int i = 0; i < func_metadatas_.size(); ++i) {
       std::vector<Tensor> captured_args;
       captured_args.reserve(other_arguments_lengths_[i]);
       int end_index = index + other_arguments_lengths_[i];
       for (; index < end_index; ++index) {
         captured_args.push_back(inputs[index]);
       }
-      OP_REQUIRES_OK(ctx, CapturedFunction::Create(funcs_[i], ctx,
+      OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_metadatas_[i],
                                                    std::move(captured_args),
-                                                   params, &captured_funcs[i]));
+                                                   &captured_funcs[i]));
     }
-    *output =
-        new Dataset(ctx, input, funcs_, std::move(captured_funcs),
-                    output_types_, output_shapes_, num_elements_per_branch_,
-                    ratio_numerator_, ratio_denominator_);
+    *output = new Dataset(ctx, input, std::move(captured_funcs), output_types_,
+                          output_shapes_, num_elements_per_branch_,
+                          ratio_numerator_, ratio_denominator_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, DatasetBase* input,
-            const std::vector<NameAttrList>& funcs,
             std::vector<std::unique_ptr<CapturedFunction>> captured_funcs,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
@@ -211,7 +201,6 @@
             int64 ratio_denominator)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          funcs_(funcs),
           captured_funcs_(std::move(captured_funcs)),
           output_types_(output_types),
           output_shapes_(output_shapes),
@@ -292,7 +281,12 @@
 
       // branches
       AttrValue branches_attr;
-      b->BuildAttrValue(funcs_, &branches_attr);
+      std::vector<NameAttrList> funcs;
+      funcs.resize(captured_funcs_.size());
+      for (int i = 0; i < captured_funcs_.size(); ++i) {
+        funcs[i] = captured_funcs_[i]->func();
+      }
+      b->BuildAttrValue(funcs, &branches_attr);
 
       // other_arguments_lengths
       AttrValue other_arguments_lengths_attr;
@@ -323,15 +317,15 @@
      public:
       explicit ChooseFastestIterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            instantiated_captured_funcs_(dataset()->funcs_.size()),
-            histograms_(dataset()->funcs_.size()) {}
+            instantiated_captured_funcs_(dataset()->captured_funcs_.size()),
+            histograms_(dataset()->captured_funcs_.size()) {}
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
 
-        for (int i = 0; i < dataset()->funcs_.size(); ++i) {
+        for (int i = 0; i < dataset()->captured_funcs_.size(); ++i) {
           TF_RETURN_IF_ERROR(dataset()->captured_funcs_[i]->Instantiate(
               ctx, &instantiated_captured_funcs_[i]));
         }
@@ -347,7 +341,7 @@
                              bool* end_of_sequence) override {
         {  // Locking scope
           mutex_lock l(mu_);
-          if (branch_index_ < dataset()->funcs_.size()) {
+          if (branch_index_ < dataset()->captured_funcs_.size()) {
             // Still running experiments
             if (!current_iterator_) {
               TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, branch_index_,
@@ -420,7 +414,7 @@
 
         // Restore state of `current_iterator_` if it exists.
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          if (branch_index_ < dataset()->funcs_.size()) {
+          if (branch_index_ < dataset()->captured_funcs_.size()) {
             TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, branch_index_,
                                                    /*is_experiment=*/true));
           } else {
@@ -537,7 +531,6 @@
     };  // class Iterator
 
     const DatasetBase* const input_;
-    std::vector<NameAttrList> funcs_;
     const std::vector<std::unique_ptr<CapturedFunction>> captured_funcs_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
@@ -549,11 +542,10 @@
   int64 ratio_numerator_;
   int64 ratio_denominator_;
   int64 num_elements_per_branch_;
+  std::vector<std::shared_ptr<FunctionMetadata>> func_metadatas_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  std::vector<NameAttrList> funcs_;
   std::vector<int32> other_arguments_lengths_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };  // class ChooseFastestBranchDatasetOp
 
 // Register the kernel implementation for ChooseFastestBranchDataset.
diff --git a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index df4b2db..88e53df 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -31,52 +31,41 @@
 class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit GroupByReducerDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        lib_def_(std::make_shared<FunctionLibraryDefinition>(
-            ctx->function_library()
-                ->GetFunctionLibraryDefinition()
-                ->default_registry(),
-            FunctionDefLibrary{})) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("finalize_func", &finalize_func_));
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "key_func", /*params=*/{},
+                                                 &key_func_metadata_));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "init_func", /*params=*/{},
+                                            &init_func_metadata_));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "reduce_func", /*params=*/{},
+                                            &reduce_func_metadata_));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "finalize_func", /*params=*/{},
+                                            &finalize_func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-
-    for (const auto& func :
-         {key_func_, init_func_, reduce_func_, finalize_func_}) {
-      std::shared_ptr<FunctionLibraryDefinition> result;
-      OP_REQUIRES_OK(
-          ctx, CreateFunctionLibraryDefinition(
-                   ctx->function_library()->GetFunctionLibraryDefinition(),
-                   func.name(), &result));
-      OP_REQUIRES_OK(ctx, lib_def_->AddLibrary(*result));
-    }
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-
     std::unique_ptr<CapturedFunction> captured_key_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(key_func_, ctx,
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, key_func_metadata_,
                                                  "key_func_other_arguments",
-                                                 params, &captured_key_func));
+                                                 &captured_key_func));
     std::unique_ptr<CapturedFunction> captured_init_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(init_func_, ctx,
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, init_func_metadata_,
                                                  "init_func_other_arguments",
-                                                 params, &captured_init_func));
+                                                 &captured_init_func));
     std::unique_ptr<CapturedFunction> captured_reduce_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            reduce_func_, ctx, "reduce_func_other_arguments",
-                            params, &captured_reduce_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, reduce_func_metadata_,
+                                                 "reduce_func_other_arguments",
+                                                 &captured_reduce_func));
     std::unique_ptr<CapturedFunction> captured_finalize_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(finalize_func_, ctx,
-                                      "finalize_func_other_arguments", params,
-                                      &captured_finalize_func));
+    OP_REQUIRES_OK(ctx,
+                   CapturedFunction::Create(ctx, finalize_func_metadata_,
+                                            "finalize_func_other_arguments",
+                                            &captured_finalize_func));
 
     *output = new Dataset(
         ctx, input, std::move(captured_key_func), std::move(captured_init_func),
@@ -156,13 +145,13 @@
           &finalize_func_other_arguments_types));
 
       AttrValue key_func;
-      b->BuildAttrValue(this->key_func(), &key_func);
+      b->BuildAttrValue(captured_key_func_->func(), &key_func);
       AttrValue init_func;
-      b->BuildAttrValue(this->init_func(), &init_func);
+      b->BuildAttrValue(captured_init_func_->func(), &init_func);
       AttrValue reduce_func;
-      b->BuildAttrValue(this->reduce_func(), &reduce_func);
+      b->BuildAttrValue(captured_reduce_func_->func(), &reduce_func);
       AttrValue finalize_func;
-      b->BuildAttrValue(this->finalize_func(), &finalize_func);
+      b->BuildAttrValue(captured_finalize_func_->func(), &finalize_func);
 
       AttrValue key_func_other_arguments_types_attr;
       b->BuildAttrValue(key_func_other_arguments_types,
@@ -406,20 +395,6 @@
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_finalize_func_;
     };
 
-    const NameAttrList& key_func() const { return captured_key_func_->func(); }
-
-    const NameAttrList& init_func() const {
-      return captured_init_func_->func();
-    }
-
-    const NameAttrList& reduce_func() const {
-      return captured_reduce_func_->func();
-    }
-
-    const NameAttrList& finalize_func() const {
-      return captured_finalize_func_->func();
-    }
-
     const DatasetBase* const input_;
     const std::unique_ptr<CapturedFunction> captured_key_func_;
     const std::unique_ptr<CapturedFunction> captured_init_func_;
@@ -429,13 +404,12 @@
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
+  std::shared_ptr<FunctionMetadata> key_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> init_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> reduce_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> finalize_func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList key_func_;
-  NameAttrList init_func_;
-  NameAttrList reduce_func_;
-  NameAttrList finalize_func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index fd0d677..1bdb25b 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -31,107 +31,58 @@
 // description of the following op.
 class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using KeyFunction =
-      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
-                           const std::vector<Tensor>&, std::vector<Tensor>*)>;
-
   explicit GroupByWindowDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        lib_def_(std::make_shared<FunctionLibraryDefinition>(
-            ctx->function_library()
-                ->GetFunctionLibraryDefinition()
-                ->default_registry(),
-            FunctionDefLibrary{})) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size_func", &window_size_func_));
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "key_func", /*params=*/{},
+                                                 &key_func_metadata_));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "reduce_func", /*params=*/{},
+                                            &reduce_func_metadata_));
+    OP_REQUIRES_OK(
+        ctx, FunctionMetadata::Create(ctx, "window_size_func", /*params=*/{},
+                                      &window_size_func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-
-    for (const auto& func : {key_func_, reduce_func_, window_size_func_}) {
-      std::shared_ptr<FunctionLibraryDefinition> result;
-      OP_REQUIRES_OK(
-          ctx, CreateFunctionLibraryDefinition(
-                   ctx->function_library()->GetFunctionLibraryDefinition(),
-                   func.name(), &result));
-      OP_REQUIRES_OK(ctx, lib_def_->AddLibrary(*result));
-    }
-
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(
-                            ctx, key_func_, &key_short_circuit_indices_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-
     std::unique_ptr<CapturedFunction> captured_key_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(key_func_, ctx,
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, key_func_metadata_,
                                                  "key_func_other_arguments",
-                                                 params, &captured_key_func));
+                                                 &captured_key_func));
+
     std::unique_ptr<CapturedFunction> captured_reduce_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            reduce_func_, ctx, "reduce_func_other_arguments",
-                            params, &captured_reduce_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, reduce_func_metadata_,
+                                                 "reduce_func_other_arguments",
+                                                 &captured_reduce_func));
+
     std::unique_ptr<CapturedFunction> captured_window_size_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(window_size_func_, ctx,
-                                      "window_size_func_other_arguments",
-                                      params, &captured_window_size_func));
+    OP_REQUIRES_OK(ctx,
+                   CapturedFunction::Create(ctx, window_size_func_metadata_,
+                                            "window_size_func_other_arguments",
+                                            &captured_window_size_func));
 
-    KeyFunction key_fn;
-    if (key_short_circuit_indices_.empty()) {
-      key_fn = [](IteratorContext* ctx,
-                  InstantiatedCapturedFunction* inst_captured_key_func,
-                  const std::vector<Tensor>& args,
-                  std::vector<Tensor>* out_tensors) {
-        return inst_captured_key_func->RunWithBorrowedArgs(ctx, args,
-                                                           out_tensors);
-      };
-    } else {
-      int key_index = key_short_circuit_indices_[0];
-      key_fn = [key_index](IteratorContext* ctx,
-                           InstantiatedCapturedFunction* inst_captured_key_func,
-                           const std::vector<Tensor>& args,
-                           std::vector<Tensor>* out_tensors) {
-        const Tensor& key = args[key_index];
-        if (key.dtype() != DT_INT64 || key.NumElements() != 1) {
-          return errors::InvalidArgument(
-              "Key function `f` must return a scalar int64.");
-        }
-        out_tensors->push_back(key);
-        return Status::OK();
-      };
-    }
-
-    *output = new Dataset(ctx, input, key_func_, reduce_func_,
-                          window_size_func_, std::move(captured_key_func),
+    *output = new Dataset(ctx, input, std::move(captured_key_func),
                           std::move(captured_reduce_func),
-                          std::move(captured_window_size_func),
-                          std::move(key_fn), output_types_, output_shapes_);
+                          std::move(captured_window_size_func), output_types_,
+                          output_shapes_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& key_func, const NameAttrList& reduce_func,
-            const NameAttrList& window_size_func,
             std::unique_ptr<CapturedFunction> captured_key_func,
             std::unique_ptr<CapturedFunction> captured_reduce_func,
             std::unique_ptr<CapturedFunction> captured_window_size_func,
-            KeyFunction key_fn, const DataTypeVector& output_types,
+            const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          key_func_(key_func),
-          reduce_func_(reduce_func),
-          window_size_func_(window_size_func),
           captured_key_func_(std::move(captured_key_func)),
           captured_reduce_func_(std::move(captured_reduce_func)),
           captured_window_size_func_(std::move(captured_window_size_func)),
-          key_fn_(std::move(key_fn)),
           output_types_(output_types),
           output_shapes_(output_shapes) {
       input_->Ref();
@@ -182,11 +133,11 @@
           &window_size_func_other_arguments_types));
 
       AttrValue key_func;
-      b->BuildAttrValue(key_func_, &key_func);
+      b->BuildAttrValue(captured_key_func_->func(), &key_func);
       AttrValue reduce_func;
-      b->BuildAttrValue(reduce_func_, &reduce_func);
+      b->BuildAttrValue(captured_reduce_func_->func(), &reduce_func);
       AttrValue window_size_func;
-      b->BuildAttrValue(window_size_func_, &window_size_func);
+      b->BuildAttrValue(captured_window_size_func_->func(), &window_size_func);
 
       AttrValue key_func_other_arguments_types_attr;
       b->BuildAttrValue(key_func_other_arguments_types,
@@ -266,9 +217,8 @@
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(
-                  dataset()->key_fn_(ctx, instantiated_key_func_.get(),
-                                     next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(instantiated_key_func_->RunWithBorrowedArgs(
+                  ctx, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -521,8 +471,9 @@
             GetDatasetFromVariantTensor(return_values[0], &returned_dataset));
 
         // Create an iterator for the dataset that was returned by `f`.
-        return returned_dataset->MakeIterator(ctx, prefix(),
-                                              &current_group_iterator_);
+        return returned_dataset->MakeIterator(
+            ctx, strings::StrCat(prefix(), "::Reduce"),
+            &current_group_iterator_);
       }
 
       mutex mu_;
@@ -540,24 +491,18 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList key_func_;
-    const NameAttrList reduce_func_;
-    const NameAttrList window_size_func_;
     const std::unique_ptr<CapturedFunction> captured_key_func_;
     const std::unique_ptr<CapturedFunction> captured_reduce_func_;
     const std::unique_ptr<CapturedFunction> captured_window_size_func_;
-    const KeyFunction key_fn_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
+  std::shared_ptr<FunctionMetadata> key_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> reduce_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> window_size_func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList key_func_;
-  NameAttrList reduce_func_;
-  NameAttrList window_size_func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
-  std::vector<int> key_short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
deleted file mode 100644
index 758eef0..0000000
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
+++ /dev/null
@@ -1,547 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-// TODO(saeta): Urgh, this is ugly.
-class MaterializedIndexedDataset {
- public:
-  virtual ~MaterializedIndexedDataset() = default;
-
-  // Retrieve the element at a given index. The output tensors are stored in
-  // out_tensors.
-  //
-  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
-  // returned.
-  //
-  // Get is thread-safe.
-  virtual Status Get(IteratorContext&& ctx, uint64 index,
-                     std::vector<Tensor>* out_tensors) const = 0;
-
-  // Size determines the number of elements in this IndexedDataset.
-  //
-  // Size is thread-safe.
-  virtual Status Size(uint64* size) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-};
-
-// IndexedDataset represents a dataset that supports random access in addition
-// to iterator-based sequential access.
-//
-// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
-// significant (backwards incompatible) changes!
-class IndexedDataset : public DatasetBase {
- public:
-  explicit IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
-
-  // Materialize (if necessary) the dataset, and return a pointer.
-  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
-  virtual Status MaterializeDataset(
-      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
-};
-
-// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
-// rest of the TensorFlow runtime.
-//
-// Most IndexedDataset's will be private members of classes inheriting from this
-// class.
-class IndexedDatasetOpKernel : public OpKernel {
- public:
-  explicit IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeIndexedDataset(OpKernelContext* ctx,
-                                  IndexedDataset** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-class MaterializedDatasetResource : public ResourceBase {
- public:
-  MaterializedDatasetResource(
-      const DataTypeVector& output_dtypes,
-      const std::vector<PartialTensorShape>& output_shapes)
-      : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
-
-  string DebugString() const override {
-    return "Materialized IndexedDataset resource";
-  }
-
-  Status Get(IteratorContext&& ctx, uint64 index,
-             std::vector<Tensor>* out_tensors) {
-    std::shared_ptr<MaterializedIndexedDataset> captured(materialized_);
-    if (captured) {
-      return captured->Get(std::move(ctx), index, out_tensors);
-    } else {
-      return errors::FailedPrecondition(
-          "Get() failed because the MaterializedIndexedDataset has not been "
-          "initialized. Ensure that you have run the materialization operation "
-          "for this MaterializedIndexedDataset before retrieving elements.");
-    }
-  }
-
-  // TODO(saeta): Implement Save and Restore
-
-  const DataTypeVector& output_dtypes() const { return output_dtypes_; }
-  const std::vector<PartialTensorShape>& output_shapes() const {
-    return output_shapes_;
-  }
-
-  Status set_materialized_dataset(
-      const std::shared_ptr<MaterializedIndexedDataset>& dataset) {
-    if (dataset) {
-      TF_RETURN_IF_ERROR(
-          VerifyTypesMatch(output_dtypes_, dataset->output_dtypes()));
-      TF_RETURN_IF_ERROR(
-          VerifyShapesCompatible(output_shapes_, dataset->output_shapes()));
-    }
-    materialized_ = dataset;
-    return Status::OK();
-  }
-
- private:
-  std::shared_ptr<MaterializedIndexedDataset> materialized_;
-  const DataTypeVector output_dtypes_;
-  const std::vector<PartialTensorShape> output_shapes_;
-};
-
-// A wrapper class for storing an `IndexedDataset` instance in a DT_VARIANT
-// tensor. Objects of the wrapper class own a reference on an instance of an
-// `IndexedTensor` and the wrapper's copy constructor and destructor take care
-// of managing the reference count.
-//
-// NOTE: This is not a feature-complete implementation of the DT_VARIANT
-// specification. In particular, we cannot currently serialize an arbitrary
-// `IndexedDataset` object, so the `Encode()` and `Decode()` methods are not
-// implemented.
-//
-// NOTE(saeta): When `IndexedDataset`s get merged into core, we can instead just
-// use `tensorflow::DatasetVariantWrapper`.
-class IndexedDatasetVariantWrapper {
- public:
-  IndexedDatasetVariantWrapper() : dataset_(nullptr) {}
-
-  // Transfers ownership of `dataset` to `*this`.
-  explicit IndexedDatasetVariantWrapper(IndexedDataset* dataset)
-      : dataset_(dataset) {}
-
-  IndexedDatasetVariantWrapper(const IndexedDatasetVariantWrapper& other)
-      : dataset_(other.dataset_) {
-    if (dataset_) dataset_->Ref();
-  }
-
-  ~IndexedDatasetVariantWrapper() {
-    if (dataset_) dataset_->Unref();
-  }
-
-  IndexedDataset* get() const { return dataset_; }
-
-  string TypeName() const { return "tensorflow::IndexedDatasetVariantWrapper"; }
-  string DebugString() const {
-    if (dataset_) {
-      return dataset_->DebugString();
-    } else {
-      return "<Uninitialized IndexedDatasetVariantWrapper>";
-    }
-  }
-
-  void Encode(VariantTensorData* data) const {
-    LOG(ERROR) << "The Encode() method is not implemented for "
-                  "IndexedDatasetVariantWrapper objects.";
-  }
-
-  bool Decode(const VariantTensorData& data) {
-    LOG(ERROR) << "The Decode() method is not implemented for "
-                  "IndexedDatasetVariantWrapper objects.";
-    return false;
-  }
-
- private:
-  IndexedDataset* const dataset_;  // Owns one reference.
-};
-
-Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
-                                          IndexedDataset** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor.shape()))) {
-    return errors::InvalidArgument(
-        "IndexedDataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  const Variant& variant = tensor.scalar<Variant>()();
-  const IndexedDatasetVariantWrapper* wrapper =
-      variant.get<IndexedDatasetVariantWrapper>();
-  if (wrapper == nullptr) {
-    return errors::InvalidArgument("Tensor must be an IndexedDataset object.");
-  }
-  *out_dataset = wrapper->get();
-  if (*out_dataset == nullptr) {
-    return errors::Internal("Read uninitialized IndexedDataset variant.");
-  }
-  return Status::OK();
-}
-
-Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
-                                          Tensor* tensor) {
-  if (!(tensor->dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor->shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  tensor->scalar<Variant>()() = IndexedDatasetVariantWrapper(dataset);
-  return Status::OK();
-}
-
-void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
-  IndexedDataset* dataset = nullptr;
-  MakeIndexedDataset(ctx, &dataset);
-
-  if (ctx->status().ok()) {
-    OP_REQUIRES(ctx, dataset != nullptr,
-                errors::Internal("MakeIndexedDataset did not correctly "
-                                 "construct the IndexedDataset"));
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
-    OP_REQUIRES_OK(ctx, StoreIndexedDatasetInVariantTensor(dataset, output));
-  }
-}
-
-class MaterializedHandleOp : public OpKernel {
- public:
-  explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  ~MaterializedHandleOp() override {
-    if (resource_ != nullptr) {
-      resource_->Unref();
-      if (cinfo_.resource_is_private_to_kernel()) {
-        if (!cinfo_.resource_manager()
-                 ->template Delete<MaterializedDatasetResource>(
-                     cinfo_.container(), cinfo_.name())
-                 .ok()) {
-          // Do nothing; the resource can have been deleted by session resets.
-          // Note: cargo-culted from $tf/core/framework/resource_op_kernel.h
-        }
-      }
-    }
-  }
-
-  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    {
-      mutex_lock l(mu_);
-      if (resource_ == nullptr) {
-        ResourceMgr* mgr = context->resource_manager();
-        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
-
-        MaterializedDatasetResource* resource;
-        OP_REQUIRES_OK(context,
-                       mgr->LookupOrCreate<MaterializedDatasetResource>(
-                           cinfo_.container(), cinfo_.name(), &resource,
-                           [this](MaterializedDatasetResource** ret)
-                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                                 *ret = new MaterializedDatasetResource(
-                                     output_dtypes_, output_shapes_);
-                                 return Status::OK();
-                               }));
-        Status s = VerifyResource(resource);
-        if (TF_PREDICT_FALSE(!s.ok())) {
-          resource->Unref();
-          context->SetStatus(s);
-          return;
-        }
-
-        resource_ = resource;
-      }
-    }
-    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, cinfo_.container(), cinfo_.name(),
-                                MakeTypeIndex<MaterializedDatasetResource>()));
-  }
-
- private:
-  // During the first Compute(), resource is either created or looked up using
-  // shared_name. In the latter case, the resource found should be verified if
-  // it is compatible with this op's configuration. The verification may fail in
-  // cases such as two graphs asking queues of the same shared name to have
-  // inconsistent capacities.
-  Status VerifyResource(MaterializedDatasetResource* resource) {
-    TF_RETURN_IF_ERROR(
-        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
-    TF_RETURN_IF_ERROR(
-        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
-    return Status::OK();
-  }
-
-  mutex mu_;
-  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
-  MaterializedDatasetResource* resource_ GUARDED_BY(mu_) = nullptr;
-  DataTypeVector output_dtypes_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-// TODO(saeta): Make async.
-class MaterializeDatasetOp : public OpKernel {
- public:
-  explicit MaterializeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IndexedDataset* dataset;
-    OP_REQUIRES_OK(ctx,
-                   GetIndexedDatasetFromVariantTensor(ctx->input(0), &dataset));
-
-    MaterializedDatasetResource* materialized_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &materialized_resource));
-    core::ScopedUnref unref(materialized_resource);
-    std::shared_ptr<MaterializedIndexedDataset> materialized;
-    OP_REQUIRES_OK(ctx, dataset->MaterializeDataset(&materialized));
-    OP_REQUIRES_OK(
-        ctx, materialized_resource->set_materialized_dataset(materialized));
-  }
-};
-
-// TODO(saeta): Make async
-class IndexedDatasetGet : public OpKernel {
- public:
-  explicit IndexedDatasetGet(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    MaterializedDatasetResource* materialized_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0),
-                                       &materialized_resource));
-    auto cleanup = gtl::MakeCleanup([materialized_resource] {
-      materialized_resource->Unref();  // Note: can't use core::ScopedUnref.
-    });
-
-    const Tensor* index_t;
-    OP_REQUIRES_OK(ctx, ctx->input("index", &index_t));
-    // TODO(saeta): Support batch reads (indexes should be non-scalar!)
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(index_t->shape()),
-                errors::InvalidArgument("index must be a scalar"));
-    const uint64 index = index_t->scalar<uint64>()();
-
-    std::vector<Tensor> out_tensors;
-    Status s =
-        materialized_resource->Get(IteratorContext(ctx), index, &out_tensors);
-
-    // Note: Unref materialized_resource to avoid destruction races. (Important
-    // in a [future] async op implementation.)
-    cleanup.release()();
-
-    if (!s.ok()) {
-      ctx->SetStatus(s);
-    } else {
-      auto expected_shapes = materialized_resource->output_shapes();
-      auto expected_types = materialized_resource->output_dtypes();
-      for (size_t i = 0; i < out_tensors.size(); ++i) {
-        OP_REQUIRES(
-            ctx, expected_shapes[i].IsCompatibleWith(out_tensors[i].shape()),
-            errors::Internal(
-                "Materialized dataset output at index ", i,
-                " is incompatible with the expected shape. (Expected: ",
-                expected_shapes[i], ", got: ", out_tensors[i].shape(), ")"));
-        OP_REQUIRES(ctx, out_tensors[i].dtype() == expected_types[i],
-                    errors::Internal("Materialized dataset output at index ", i,
-                                     " was not the expected dtype. (Expected: ",
-                                     expected_types[i],
-                                     ", got: ", out_tensors[i].dtype(), ")"));
-        ctx->set_output(i, out_tensors[i]);
-      }
-    }
-  }
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalMaterializedIndexDatasetHandle").Device(DEVICE_CPU),
-    MaterializedHandleOp);
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIndexedDatasetMaterialize").Device(DEVICE_CPU),
-    MaterializeDatasetOp);
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
-    IndexedDatasetGet);
-
-class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
- public:
-  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
-
-  void MakeIndexedDataset(OpKernelContext* ctx,
-                          IndexedDataset** output) override {
-    uint64 size = -1;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
-    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
-    *output = new Dataset(ctx, size);
-  }
-
-  class Dataset : public IndexedDataset {
-   public:
-    Dataset(OpKernelContext* ctx, uint64 size)
-        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
-
-    Status MaterializeDataset(
-        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      (*materialized) = std::make_shared<Materialized>(this);
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
-      return *dtypes;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
-      return *shapes;
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return absl::make_unique<Iterator>(Iterator::Params{
-          this, strings::StrCat(prefix, "::IdentityIndexedDataset")});
-    }
-
-    string DebugString() const override {
-      return "IdentityIndexedDataset::Dataset";
-    }
-
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** node) const override {
-      return errors::Unimplemented(
-          "identity_indexed_dataset.AsGraphDefInternal");
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (cur_ < dataset()->size_) {
-          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
-                                    TensorShape({}));
-          out_tensors->back().scalar<uint64>()() = cur_++;
-          *end_of_sequence = false;
-          return Status::OK();
-        }
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-     private:
-      mutex mu_;
-      uint64 cur_ GUARDED_BY(mu_);
-    };
-
-    class Materialized : public MaterializedIndexedDataset {
-     public:
-      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
-        dataset->Ref();
-      }
-
-      ~Materialized() override {
-        // TODO(saeta): Pull this into MaterializedIndexedDataset
-        dataset_->Unref();
-      }
-
-      const DataTypeVector& output_dtypes() const override {
-        return dataset_->output_dtypes();
-      }
-
-      const std::vector<PartialTensorShape>& output_shapes() const override {
-        return dataset_->output_shapes();
-      }
-
-      Status Get(IteratorContext&& ctx, uint64 index,
-                 std::vector<Tensor>* out_tensors) const override {
-        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
-                  << ")";
-        if (index >= dataset_->size_) {
-          // Note: use InvalidArgument instead of OutOfRange error because many
-          // things consider OutOfRange to be a "clean termination" error.
-          return errors::InvalidArgument(
-              "Index ", index,
-              " is out of range for this dataset. (Size is: ", dataset_->size_,
-              ".)");
-        }
-        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
-                                  TensorShape({}));
-        out_tensors->back().scalar<uint64>()() = index;
-        return Status::OK();
-      }
-
-      Status Size(uint64* size) const override {
-        *size = dataset_->size_;
-        return Status::OK();
-      }
-
-     private:
-      const Dataset* const dataset_;  // Not owned.
-    };
-
-    const uint64 size_;
-    std::shared_ptr<Materialized> materialized_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
-    IdentityIndexedDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index bd39635..9453f2b 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -48,24 +48,14 @@
 // description of the following op.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using MapAndBatchIteratorFunction =
-      std::function<void(IteratorContext*, InstantiatedCapturedFunction*,
-                         const string&, std::vector<Tensor>,
-                         std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
-
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", /*params=*/{},
+                                                 &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
-    OP_REQUIRES_OK(
-        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
  protected:
@@ -90,83 +80,36 @@
                    ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    data::CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
-
-    MapAndBatchIteratorFunction map_func;
-    CapturedFunction* raw_captured_func = captured_func.get();
-    if (short_circuit_indices_.empty()) {
-      map_func = [](IteratorContext* ctx,
-                    InstantiatedCapturedFunction* instantiated_captured_func,
-                    const string& prefix, std::vector<Tensor> args,
-                    std::shared_ptr<std::vector<Tensor>> out_tensors,
-                    StatusCallback done) {
-        instantiated_captured_func->RunAsync(
-            ctx, std::move(args), out_tensors.get(), std::move(done), prefix);
-      };
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(short_circuit_indices_);
-      const auto& indices = short_circuit_indices_;
-      map_func = [raw_captured_func, indices, can_move](
-                     IteratorContext* ctx,
-                     InstantiatedCapturedFunction* instantiated_captured_func,
-                     const string& prefix, std::vector<Tensor> args,
-                     std::shared_ptr<std::vector<Tensor>> out_tensors,
-                     StatusCallback done) {
-        const std::vector<Tensor>& captured_inputs =
-            raw_captured_func->captured_inputs();
-        size_t num_args = args.size();
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (indices[i] < num_args) {
-            if (can_move[i]) {
-              out_tensors->push_back(std::move(args[indices[i]]));
-            } else {
-              out_tensors->push_back(args[indices[i]]);
-            }
-          } else {
-            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
-          }
-        }
-        // Run the `done` callback on a threadpool thread, because it will
-        // potentially do a lot of copying work, and we want to run that
-        // concurrently with the next invocation.
-        (*ctx->runner())(std::bind(std::move(done), Status::OK()));
-      };
-    }
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
     if (num_parallel_calls == model::kAutoTune) {
       metrics::RecordTFDataAutotune(kDatasetName);
     }
 
-    *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
+    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_,
-                          std::move(captured_func), std::move(map_func),
-                          preserve_cardinality_);
+                          std::move(captured_func), preserve_cardinality_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
             int64 num_parallel_calls, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::unique_ptr<CapturedFunction> captured_func,
-            MapAndBatchIteratorFunction map_func, bool preserve_cardinality)
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           batch_size_(batch_size),
           num_parallel_calls_(num_parallel_calls),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
           captured_func_(std::move(captured_func)),
-          map_func_(std::move(map_func)),
           preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
@@ -176,8 +119,7 @@
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)},
-          map_func_);
+          Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -219,7 +161,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
       AttrValue preserve_cardinality_attr;
@@ -243,14 +185,12 @@
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params,
-                        MapAndBatchIteratorFunction map_func)
+      explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
                 params.dataset->num_parallel_calls_, mu_, cond_var_)),
-            map_func_(std::move(map_func)),
             max_batch_results_(std::min(kMaxBatchResults,
                                         (params.dataset->num_parallel_calls_ +
                                          params.dataset->batch_size_ - 1) /
@@ -475,9 +415,9 @@
 
         // Apply the map function on `input_element`, storing the result in
         // `return_values`, and invoking `done` when finished.
-        map_func_(ctx.get(), instantiated_captured_func_.get(), prefix(),
-                  std::move(input_element), std::move(return_values),
-                  std::move(done));
+        instantiated_captured_func_->RunAsync(
+            ctx.get(), std::move(input_element), return_values.get(),
+            std::move(done), prefix());
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -779,7 +719,6 @@
       const std::shared_ptr<condition_variable> cond_var_;
       // Identifies the maximum number of parallel calls.
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
-      const MapAndBatchIteratorFunction map_func_;
 
       // Counts the number of outstanding calls for this batch.
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
@@ -800,23 +739,19 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const int64 batch_size_;
     const int64 num_parallel_calls_;
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const MapAndBatchIteratorFunction map_func_;
     const bool preserve_cardinality_;
   };
 
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
   bool preserve_cardinality_;
-  std::vector<int> short_circuit_indices_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
deleted file mode 100644
index 8dfd87c..0000000
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ /dev/null
@@ -1,1154 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#define EIGEN_USE_THREADS
-
-#include <atomic>
-#include <utility>
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/kernels/inplace_ops_functor.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/numa.h"
-#include "tensorflow/core/platform/tracing.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-// kWindowSize is the fixed constant controlling the number of batch outputs
-// each NumaWorkerBlock may be processing at a time. This is currently a
-// constant and not user configurable to enable future performance optimizations
-// in the implementation.
-const int64 kWindowSize = 10;
-
-// Define a helper for more consistent logging.
-#define WORKER_VLOG(verbose_level)                                           \
-  VLOG(verbose_level) << "WorkerThread (" << numa_node << ", " << thread_num \
-                      << "): "
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit NumaMapAndBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    // TODO(saeta): Implement support for preserve_cardinality logic.
-    OP_REQUIRES_OK(
-        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
-  }
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 batch_size;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size));
-    OP_REQUIRES(
-        ctx, batch_size > 0,
-        errors::InvalidArgument("batch_size must be greater than zero."));
-
-    int64 num_parallel_calls;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
-                                            &num_parallel_calls));
-    OP_REQUIRES(
-        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
-        errors::InvalidArgument(
-            "num_parallel_calls must be greater than zero."));
-
-    bool drop_remainder;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.use_inter_op_parallelism = false;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
-
-    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
-                          drop_remainder, output_types_, output_shapes_, func_,
-                          std::move(captured_func));
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
-            int64 num_parallel_calls, bool drop_remainder,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          batch_size_(batch_size),
-          num_parallel_calls_(num_parallel_calls),
-          drop_remainder_(drop_remainder),
-          output_types_(output_types),
-          output_shapes_(output_shapes),
-          func_(func),
-          captured_func_(std::move(captured_func)) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::NumaMapAndBatch")});
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return "NumaMapAndBatchDatasetOp::Dataset";
-    }
-
-    // TODO(b/120482302): Note that this is inaccurate until
-    // NumaMapAndBatchMapDataset modified to preserve cardinality.
-    int64 Cardinality() const override {
-      int64 n = input_->Cardinality();
-      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
-        return n;
-      }
-      return n / batch_size_ +
-             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
-      Node* batch_size_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
-      Node* num_parallel_calls_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(num_parallel_calls_, &num_parallel_calls_node));
-      Node* drop_remainder_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
-      std::vector<Node*> other_arguments;
-      DataTypeVector other_arguments_types;
-      TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
-                                                    &other_arguments_types));
-      AttrValue f;
-      b->BuildAttrValue(func_, &f);
-      AttrValue other_arguments_types_attr;
-      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
-
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {std::make_pair(0, input_graph_node),
-           std::make_pair(2, batch_size_node),
-           std::make_pair(3, num_parallel_calls_node),
-           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
-          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
-          {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
-          output));
-      return Status::OK();
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            mu_(std::make_shared<mutex>()),
-            autotune_cond_var_(std::make_shared<condition_variable>()),
-            num_parallel_calls_(std::make_shared<model::SharedState>(
-                params.dataset->num_parallel_calls_, mu_, autotune_cond_var_)) {
-      }
-
-      ~Iterator() override {
-        mutex_lock l(*mu_);
-        cancelled_ = true;
-        VLOG(3) << "NumaMapAndBatchIterator::~Iterator: cancelling operations.";
-        for (size_t i = 0; i < workers_.size(); ++i) {
-          workers_[i]->manager.Cancel();
-        }
-        VLOG(3) << "NumaMapAndBatchIterator::~Iterator: waiting for threads to "
-                   "shut down.";
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == model::kAutoTune) {
-          num_parallel_calls_->value = ctx->runner_threadpool_size();
-        }
-        TF_RETURN_IF_ERROR(
-            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_func_->Instantiate(
-            ctx, &instantiated_captured_func_));
-        return Status::OK();
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        auto cleanup = gtl::MakeCleanup(
-            [] { VLOG(3) << "GetNextInternal call returning."; });
-        NumaWorkerBlock* worker = nullptr;
-        {
-          mutex_lock l(*mu_);
-          VLOG(3) << "GetNextInternal call; current block: " << cur_block_;
-          if (global_end_of_input_) {
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-          TF_RETURN_IF_ERROR(EnsureBackgroundThreadsStarted(ctx));
-          worker = workers_[cur_block_].get();
-          cur_block_ = (cur_block_ + 1) % workers_.size();
-        }
-        bool global_end_of_input_local = false;
-        Status s = worker->manager.GetBatch(ctx, dataset()->drop_remainder_,
-                                            &global_end_of_input_local,
-                                            out_tensors, end_of_sequence);
-        if (global_end_of_input_local) {
-          mutex_lock l(*mu_);
-          global_end_of_input_ = global_end_of_input_local;
-        }
-        return s;
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeAsyncKnownRatioNode(
-            std::move(args), dataset()->batch_size_,
-            {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/ctx->runner_threadpool_size())});
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(*mu_);
-        for (size_t i = 0; i < workers_.size(); ++i) {
-          if (!workers_[i]->manager.Quiesce()) {
-            return errors::Cancelled(
-                "The iterator was deleted before it could reach a "
-                "checkpointable state.");
-          }
-        }
-
-        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("num_workers"), workers_.size()));
-
-        for (size_t i = 0; i < workers_.size(); ++i) {
-          size_t index = (cur_block_ + i) % workers_.size();
-          TF_RETURN_IF_ERROR(workers_[index]->manager.Save(writer, this, i));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock l(*mu_);
-        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        int64 num_workers = -1;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("num_workers"), &num_workers));
-        // Note: num_workers can be 0 if the iterator wasn't started when
-        // first checkpointed.
-        if (num_workers < 0) {
-          return errors::DataLoss(
-              "When restoring from checkpoint, we encountered a data "
-              "consistency error: num_workers has an invalid value: ",
-              num_workers);
-        }
-        if (port::NUMAEnabled()) {
-          int actual_numa_domains = port::NUMANumNodes();
-          if (actual_numa_domains != num_workers && num_workers > 0) {
-            LOG(WARNING) << "# NUMA domains mismatch when restoring from "
-                            "checkpoint: checkpoint has "
-                         << num_workers
-                         << " NUMA domains, while this host has: "
-                         << actual_numa_domains << " NUMA domains.";
-          }
-        }
-        if (num_workers > 1 && !port::NUMAEnabled()) {
-          LOG(WARNING) << "NUMA is not enabled for this process, but restoring "
-                          "a checkpoint that assumes "
-                       << num_workers << " NUMA domains.";
-        }
-        workers_.resize(num_workers);
-        for (size_t i = 0; i < num_workers; ++i) {
-          workers_[i] = absl::make_unique<NumaWorkerBlock>(this);
-          TF_RETURN_IF_ERROR(
-              workers_[i]->manager.Restore(ctx, reader, this, i));
-        }
-        cur_block_ = 0;
-        return Status::OK();
-      }
-
-     private:
-      // NumaBlockManager manages all the state for a set of threads pinned to a
-      // single NUMA domain.
-      //
-      // The methods can be divided into 3 categories based on who should call
-      // them:
-      //
-      //  (1) RunnerThread: WaitForInputSpace, PushInputs, SetEndOfInput.
-      //  (2) WorkerThread: RetrieveInput, GetBatchTensors.
-      //      RecordBatchEntryComplete
-      //  (3) Client threads: GetBatch, Cancel, Save, Restore.
-      //
-      // Internally, we manage state in a circular buffer of size `kWindowSize`.
-      // There are 3 pointers into the circular buffer, and must maintain the
-      // following order: (1) next_input_batch_ (corresponding to the next input
-      // batch to be pulled from the input iterator), (2) next_input_
-      // (corresponding to the batch the WorkerThreads should pull from for
-      // their next inputs), and (3) next_output_ corresponding to the next
-      // value to be consumed by the output iterator.
-      //
-      // Methods return errors::Cancelled if the iteration is cancelled before
-      // completing.
-      //
-      // NumaBlockManager is thread safe.
-      class NumaBlockManager {
-       public:
-        explicit NumaBlockManager(Iterator* itr) : itr_(itr) {}
-
-        // WaitForInputSpace blocks until there is space in the circular buffer
-        // to begin processing a new batch of elements.
-        //
-        // Returns true when there is space, false if the Iterator is cancelled.
-        bool WaitForInputSpace(IteratorContext* ctx) {
-          mutex_lock l(mu_);
-
-          size_t next = (next_input_batch_ + 1) % kWindowSize;
-          DCHECK(next < kWindowSize) << next;
-
-          // Wait for space in the circular buffer.
-          while (!cancelled_ && batches_[next].state != BatchState::kEmpty) {
-            VLOG(3) << "Waiting for input space; next: " << next
-                    << ", next_output_: " << next_output_
-                    << ", next_input_batch_: " << next_input_batch_;
-            itr_->RecordStop(ctx);
-            runner_cond_var_.wait(l);
-            itr_->RecordStart(ctx);
-          }
-          if (cancelled_) {
-            VLOG(3) << "WaitForInputSpace cancelled.";
-            return false;
-          }
-
-          DCHECK(batches_[next].state == BatchState::kEmpty);
-
-          next_input_batch_ = next;
-          return true;
-        }
-
-        // PushInputs sets the inputs for the next batch as retrieved from the
-        // input iterator.
-        void PushInputs(const Status& status,
-                        std::vector<std::vector<Tensor>> inputs) {
-          mutex_lock l(mu_);
-
-          DCHECK(next_input_ < kWindowSize) << next_input_;
-          DCHECK(batches_[next_input_batch_].state == BatchState::kEmpty);
-          DCHECK(batches_[next_input_batch_].next_input_to_process == 0)
-              << batches_[next_input_batch_].next_input_to_process;
-          DCHECK(batches_[next_input_batch_].status.ok())
-              << batches_[next_input_batch_].status;
-
-          batches_[next_input_batch_].inputs.swap(inputs);
-          batches_[next_input_batch_].state = BatchState::kInputsFilled;
-          batches_[next_input_batch_].status.Update(status);
-          if (batches_[next_input_batch_].status.ok()) {
-            worker_cond_var_.notify_all();
-          } else {
-            client_cond_var_.notify_all();
-            batches_[next_input_batch_].error_index = 0;
-          }
-        }
-
-        // SetEndOfInput records the fact that we have reached the end of the
-        // input iterator, and that we should return end_of_sequence = true when
-        // we have exhaused all buffered batches.
-        void SetEndOfInput() {
-          mutex_lock l(mu_);
-          reached_eof_ = true;
-          worker_cond_var_.notify_all();
-          client_cond_var_.notify_all();
-        }
-
-        // RetrieveInput gets the next input tuple to be mapped by a worker
-        // thread.
-        //
-        // Returns true if an input was retrieved, false if the iterator has
-        // been cancelled.
-        bool RetrieveInput(IteratorContext* ctx, std::vector<Tensor>* input,
-                           uint64* index, size_t* sequence_number) {
-          mutex_lock l(mu_);
-
-          // Wait for inputs to be ready.
-          while (!cancelled_ &&
-                 batches_[next_input_].state != BatchState::kInputsFilled) {
-            itr_->RecordStop(ctx);
-            worker_cond_var_.wait(l);
-            itr_->RecordStart(ctx);
-          }
-
-          if (cancelled_) {
-            return false;
-          }
-
-          DCHECK(batches_[next_input_].next_input_to_process <
-                 batches_[next_input_].inputs.size())
-              << "next_input_: " << next_input_ << ", next_input_to_process: "
-              << batches_[next_input_].next_input_to_process
-              << ", inputs.size(): " << batches_[next_input_].inputs.size()
-              << ", state: " << static_cast<int32>(batches_[next_input_].state)
-              << ", this: " << this;
-          *index = batches_[next_input_].next_input_to_process;
-          *sequence_number = next_input_;
-          input->swap(batches_[next_input_]
-                          .inputs[batches_[next_input_].next_input_to_process]);
-          // Increment pointers.
-          batches_[next_input_].next_input_to_process++;
-
-          if (batches_[next_input_].next_input_to_process ==
-              batches_[next_input_].inputs.size()) {
-            batches_[next_input_].state = BatchState::kAllMapsStarted;
-            next_input_ = (next_input_ + 1) % kWindowSize;
-          }
-          return true;
-        }
-
-        // GetBatchTensors returns a pointer to the output batch tensors for the
-        // worker thread to copy into.
-        //
-        // allocate_output is a function taking a batch size, and a pointer to
-        // the output tuple of Tensors to allocate them. The allocate_output
-        // function is called at most once per output batch.
-        std::vector<Tensor>* GetBatchTensors(
-            size_t sequence_number,
-            std::function<void(size_t, std::vector<Tensor>*)> allocate_output) {
-          mutex_lock l(mu_);
-          DCHECK(sequence_number < kWindowSize) << sequence_number;
-          DCHECK(batches_[sequence_number].state == BatchState::kInputsFilled ||
-                 batches_[sequence_number].state == BatchState::kAllMapsStarted)
-              << sequence_number;
-
-          if (batches_[sequence_number].outputs.empty()) {
-            allocate_output(batches_[sequence_number].inputs.size(),
-                            &batches_[sequence_number].outputs);
-          }
-          return &batches_[sequence_number].outputs;
-        }
-
-        // RecordBatchEntryComplete records an element of the batch has finished
-        // copying into the output tensors.
-        void RecordBatchEntryComplete(size_t sequence_number, uint64 index,
-                                      Status s) {
-          mutex_lock l(mu_);
-          DCHECK(sequence_number < kWindowSize) << sequence_number;
-          DCHECK(batches_[sequence_number].state == BatchState::kInputsFilled ||
-                 batches_[sequence_number].state == BatchState::kAllMapsStarted)
-              << sequence_number;
-
-          batches_[sequence_number].num_outputs_complete++;
-          if (!s.ok() && batches_[sequence_number].error_index > index) {
-            batches_[sequence_number].status = s;
-            batches_[sequence_number].error_index = index;
-          }
-
-          if (batches_[sequence_number].num_outputs_complete ==
-              batches_[sequence_number].inputs.size()) {
-            DCHECK(batches_[sequence_number].state ==
-                   BatchState::kAllMapsStarted);
-            batches_[sequence_number].state = BatchState::kOutputsComplete;
-            batches_[sequence_number].inputs.clear();  // Eagerly save memory.
-            batches_[sequence_number].inputs.shrink_to_fit();
-            client_cond_var_.notify_all();
-          }
-        }
-
-        // GetBatch retrieves the next output batch tensors.
-        Status GetBatch(IteratorContext* ctx, bool drop_remainder,
-                        bool* global_eof, std::vector<Tensor>* out_tensor,
-                        bool* end_of_sequence) {
-          mutex_lock l(mu_);
-          // Wait until one of 3 conditions occurs:
-          //  (1) we're cancelled.
-          //  (2) the state becomes kOutputsComplete
-          //  (3) state is empty && reached_eof.
-          while (!cancelled_ &&
-                 batches_[next_output_].state != BatchState::kOutputsComplete &&
-                 !(reached_eof_ &&
-                   batches_[next_output_].state == BatchState::kEmpty)) {
-            VLOG(3) << "Waiting in GetBatch.";
-            itr_->RecordStop(ctx);
-            client_cond_var_.wait(l);
-            itr_->RecordStart(ctx);
-          }
-
-          if (cancelled_) {
-            return errors::Cancelled(
-                "Cancelled in NumaMapAndBatch::GetNext call.");
-          }
-
-          if (reached_eof_ &&
-              batches_[next_output_].state == BatchState::kEmpty) {
-            VLOG(4) << "GetBatch returning end of sequence.";
-            *end_of_sequence = true;
-            *global_eof = true;
-            return Status::OK();
-          }
-
-          VLOG(3) << "Returning output index: " << next_output_
-                  << ", this: " << this;
-
-          *end_of_sequence = false;
-          Status s = batches_[next_output_].status;
-          if (s.ok()) {
-            out_tensor->swap(batches_[next_output_].outputs);
-          }
-          // Handle early termination.
-          if (errors::IsOutOfRange(s)) {
-            *global_eof = true;
-            s = Status::OK();
-            if (drop_remainder || batches_[next_output_].error_index == 0) {
-              *end_of_sequence = true;
-            } else {
-              std::vector<Tensor> true_outputs;
-              for (size_t i = 0; i < batches_[next_output_].outputs.size();
-                   ++i) {
-                TensorShape component_shape(
-                    batches_[next_output_].outputs[i].shape());
-                component_shape.set_dim(0, batches_[next_output_].error_index);
-                AllocatorAttributes attr;
-                attr.set_gpu_compatible(true);
-                true_outputs.emplace_back(
-                    ctx->allocator(attr),
-                    batches_[next_output_].outputs[i].dtype(), component_shape);
-                TF_RETURN_IF_ERROR(CopyPartialBatch(
-                    &true_outputs.back(), batches_[next_output_].outputs[i],
-                    batches_[next_output_].error_index));
-              }
-              out_tensor->swap(true_outputs);
-            }
-          }
-
-          batches_[next_output_].Reset();
-          next_output_ = (next_output_ + 1) % kWindowSize;
-          runner_cond_var_.notify_all();
-
-          return s;
-        }
-
-        void Cancel() {
-          mutex_lock l(mu_);
-          VLOG(3) << "Cancelling NUMA block.";
-          cancelled_ = true;
-          runner_cond_var_.notify_all();
-          worker_cond_var_.notify_all();
-          client_cond_var_.notify_all();
-        }
-
-        // Waits until all the worker threads have completed their work and all
-        // internal state has reached a "safe-point" where we can safely
-        // checkpoint.
-        //
-        // Returns true if completed successfully, false if cancelled while
-        // waiting.
-        bool Quiesce() {
-          mutex_lock l(mu_);
-          VLOG(3) << "Waiting until the operations have quiesced.";
-          while (!cancelled_ && !AllMapOperationsFinished()) {
-            client_cond_var_.wait(l);
-          }
-          if (cancelled_) {
-            return false;
-          }
-          return true;
-        }
-
-        Status Save(IteratorStateWriter* writer, Iterator* itr, size_t index) {
-          mutex_lock l(mu_);
-          string prefix = itr->full_name(strings::StrCat("numa_block_", index));
-          if (reached_eof_) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                strings::StrCat(prefix, "_end_of_input"), ""));
-          }
-          for (size_t i = 0; i < kWindowSize; ++i) {
-            size_t index = (next_output_ + i) % kWindowSize;
-            if (batches_[index].state == BatchState::kEmpty) {
-              break;
-            }
-            string batch_prefix = strings::StrCat(prefix, "_batch_", i);
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                strings::StrCat(batch_prefix, "_code"),
-                static_cast<int64>(batches_[index].status.code())));
-            if (!batches_[index].status.ok()) {
-              TF_RETURN_IF_ERROR(
-                  writer->WriteScalar(strings::StrCat(batch_prefix, "_msg"),
-                                      batches_[index].status.error_message()));
-              TF_RETURN_IF_ERROR(writer->WriteScalar(
-                  strings::StrCat(batch_prefix, "_error_index"),
-                  batches_[index].error_index));
-            }
-
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                strings::StrCat(batch_prefix, "_output_size"),
-                batches_[index].outputs.size()));
-            for (size_t j = 0; j < batches_[index].outputs.size(); ++j) {
-              string tensor_prefix =
-                  strings::StrCat(batch_prefix, "_output_", j);
-              if (!batches_[index].status.ok()) {
-                DCHECK(batches_[index].error_index >= 0 &&
-                       batches_[index].error_index <
-                           itr_->dataset()->batch_size_);
-                // If the batch is not full, we only store the first
-                // `error_index` values. The rest of the batch tensor might not
-                // be initialized, and accessing that will raise msan errors.
-                TF_RETURN_IF_ERROR(writer->WriteTensor(
-                    tensor_prefix, batches_[index].outputs[j].Slice(
-                                       0, batches_[index].error_index)));
-              } else {
-                TF_RETURN_IF_ERROR(writer->WriteTensor(
-                    tensor_prefix, batches_[index].outputs[j]));
-              }
-            }
-          }
-          return Status::OK();
-        }
-
-        Status Restore(IteratorContext* ctx, IteratorStateReader* reader,
-                       Iterator* itr, size_t index) {
-          mutex_lock l(mu_);
-          if (reached_eof_) {
-            return errors::FailedPrecondition(
-                "Already reached the end of the sequence.");
-          }
-          string prefix = itr->full_name(strings::StrCat("numa_block_", index));
-          reached_eof_ =
-              reader->Contains(strings::StrCat(prefix, "_end_of_input"));
-          for (size_t i = 0; i < kWindowSize; ++i) {
-            string batch_prefix = strings::StrCat(prefix, "_batch_", i);
-            if (!reader->Contains(strings::StrCat(batch_prefix, "_code"))) {
-              break;
-            }
-            Batch batch;
-            batch.state = BatchState::kOutputsComplete;
-            int64 code_int;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                strings::StrCat(batch_prefix, "_code"), &code_int));
-            error::Code code = static_cast<error::Code>(code_int);
-            if (code != error::Code::OK) {
-              string error_message;
-              TF_RETURN_IF_ERROR(reader->ReadScalar(
-                  strings::StrCat(batch_prefix, "_msg"), &error_message));
-              batch.status = Status(code, error_message);
-              int64 error_index_int = -1;
-              TF_RETURN_IF_ERROR(reader->ReadScalar(
-                  strings::StrCat(batch_prefix, "_error_index"),
-                  &error_index_int));
-              if (error_index_int < 0 ||
-                  error_index_int > itr->dataset()->batch_size_) {
-                return errors::FailedPrecondition(
-                    "Error index out of bounds when restoring from checkpoint; "
-                    "error index: ",
-                    error_index_int);
-              }
-              batch.error_index = static_cast<size_t>(error_index_int);
-            }
-            int64 output_size = -1;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                strings::StrCat(batch_prefix, "_output_size"), &output_size));
-            batch.outputs.reserve(output_size);
-            for (size_t j = 0; j < output_size; ++j) {
-              string tensor_name = strings::StrCat(batch_prefix, "_output_", j);
-              Tensor t;
-              TF_RETURN_IF_ERROR(reader->ReadTensor(tensor_name, &t));
-              batch.outputs.emplace_back(std::move(t));
-            }
-            batches_[i] = std::move(batch);
-          }
-          return Status::OK();
-        }
-
-       private:
-        bool AllMapOperationsFinished() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-          for (size_t i = 0; i < kWindowSize; ++i) {
-            if (batches_[i].state == BatchState::kInputsFilled ||
-                batches_[i].state == BatchState::kAllMapsStarted) {
-              return false;
-            }
-            if (batches_[i].state != BatchState::kOutputsComplete &&
-                !reached_eof_) {
-              return false;
-            }
-          }
-          return true;
-        }
-
-        // Batches begin in the `kEmpty` state. Once the RunnerThread has
-        // filled the `inputs` to a `Batch`, it transitions to the
-        // `kInputsFilled` state. At this point, the Worker threads run the map
-        // function and copy the outputs appropriately. Once all worker threads
-        // have started, it transitions to `kAllMapsStarted`. After the outputs
-        // are complete, the GetNext call can consume the outputs, and return
-        // the batch to the kEmpty state.
-        enum class BatchState {
-          kEmpty,
-          kInputsFilled,
-          kAllMapsStarted,
-          kOutputsComplete,
-        };
-
-        // Batch captures all the state of an output batch as it progresses
-        // through the machinery. Once the RunnerThread fills inputs, it
-        // transitions to `kInputsFilled`. At this point, the worker threads can
-        // work on it, incrementing outputs_complete for every element of the
-        // input set that is copied into the output Tensors. Once all the input
-        // tuples have been processed (i.e. num_outputs_complete ==
-        // inputs.size()), it transitions to the `kOutputsComplete` stage, where
-        // it is ready to be returned by a `GetBatch` call (called from
-        // `GetNextInternal`).
-        struct Batch {
-          BatchState state;
-          // Aggregates the Status of the input iterator's GetNext
-          // calls, in addition to the Status of the map function invocations.
-          //
-          // In the case where multiple non-OK statuses are encountered, we
-          // return the first one encountered.
-          Status status;
-          // In order to return the correct error status, we keep track of the
-          // error_index.
-          size_t error_index;
-          // The batch_size input tuples (or fewer in the case of the last
-          // batch).
-          // TODO(saeta): Avoid re-allocating vectors all the time!
-          std::vector<std::vector<Tensor>> inputs;
-          std::vector<Tensor> outputs;
-          size_t next_input_to_process;
-          size_t num_outputs_complete;
-
-          Batch() { Reset(); }
-
-          // Resets the Batch state (e.g. after consuming the outputs).
-          void Reset() {
-            state = BatchState::kEmpty;
-            status = Status::OK();
-            inputs.clear();
-            inputs.shrink_to_fit();
-            outputs.clear();
-            outputs.shrink_to_fit();
-            next_input_to_process = 0;
-            num_outputs_complete = 0;
-            error_index = -1;
-          }
-        };
-
-        Iterator* itr_;  // Not owned.
-        mutex mu_;
-        Batch batches_[kWindowSize] GUARDED_BY(mu_);
-        size_t next_input_batch_ GUARDED_BY(mu_) = -1;
-        size_t next_input_ GUARDED_BY(mu_) = 0;
-        size_t next_output_ GUARDED_BY(mu_) = 0;
-        bool cancelled_ GUARDED_BY(mu_) = false;
-        bool reached_eof_ GUARDED_BY(mu_) = false;
-
-        // The runner thread waits on this condition variable for space to be
-        // available. When the client thread takes a value out of the circular
-        // buffer, it notifies this condition variable that space is now
-        // available.
-        condition_variable runner_cond_var_ GUARDED_BY(mu_);
-        // The worker threads wait on this condition variable for available
-        // inputs. When the runner thread makes new inputs available, it
-        // notifies this condition variable.
-        condition_variable worker_cond_var_ GUARDED_BY(mu_);
-        // The client threads wait on this condition variable for available
-        // batched outputs. When worker threads complete a batch, they notify
-        // this condition variable.
-        condition_variable client_cond_var_ GUARDED_BY(mu_);
-      };
-      // Mark NumaBlockManager as a friend of Iterator in order to call
-      // protected Iterator methods during checkpointing.
-      friend NumaBlockManager;
-
-      struct NumaWorkerBlock {
-        NumaBlockManager manager;
-        // TODO(saeta): Migrate to BackgroundWorker.
-        std::vector<std::unique_ptr<Thread>> threads;
-
-        explicit NumaWorkerBlock(Iterator* itr) : manager(itr) {}
-      };
-
-      static void CustomNumaWorkerBlockDeleter(NumaWorkerBlock* ptr) {
-        ptr->~NumaWorkerBlock();
-        port::NUMAFree(ptr, sizeof(NumaWorkerBlock));
-      }
-      static void DefaultNumaWorkerBlockDeleter(NumaWorkerBlock* ptr) {
-        delete ptr;
-      }
-
-      static Status CopyPartialBatch(Tensor* output, const Tensor& value,
-                                     int64 num_elements) {
-        switch (value.dtype()) {
-#define HANDLE_TYPE(type)                                         \
-  case DataTypeToEnum<type>::value: {                             \
-    auto output_t = output->flat_outer_dims<type>();              \
-    auto value_t = value.flat_outer_dims<type>();                 \
-    for (size_t i = 0; i < num_elements; i++) {                   \
-      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
-    }                                                             \
-    return Status::OK();                                          \
-  }
-          TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-          default:
-            return errors::InvalidArgument("Unsupported data type: ",
-                                           DataTypeString(value.dtype()));
-        }
-        return Status::OK();
-      }
-
-      Status EnsureBackgroundThreadsStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (curr_num_parallel_calls_ >= num_parallel_calls_->value) {
-          // All necessary threads have been started.
-          curr_num_parallel_calls_ = num_parallel_calls_->value;
-          return Status::OK();
-        }
-
-        VLOG(4) << "Starting workers";
-        bool numa_enabled = port::NUMAEnabled();
-
-        if (!numa_enabled) {
-          LOG(INFO) << "NUMA not enabled on this host.";
-        }
-
-        int num_numa_nodes = port::NUMANumNodes();
-        if (num_numa_nodes < 1) {
-          return errors::Internal("The number of NUMA nodes is invalid: ",
-                                  num_numa_nodes);
-        }
-
-        // Only resize when empty to support restoring from checkpoints.
-        if (workers_.empty()) {
-          VLOG(3) << "# NUMA Nodes: " << num_numa_nodes
-                  << ", # Parallel Calls: " << num_parallel_calls_->value;
-          workers_.resize(num_numa_nodes);
-        } else {
-          num_numa_nodes = workers_.size();
-        }
-
-        // Round up num_parallel_calls, with a minimum of 1.
-        const size_t num_threads_per_block =
-            std::max(1LL, (num_parallel_calls_->value + num_numa_nodes - 1) /
-                              num_numa_nodes);
-
-        VLOG(3) << "Starting " << num_threads_per_block * num_numa_nodes
-                << " worker threads, with " << num_threads_per_block
-                << " threads per block.";
-
-        // Only allocate new_ctx if required.
-        std::shared_ptr<IteratorContext> new_ctx;
-
-        for (int i = 0; i < num_numa_nodes; ++i) {
-          if (!workers_[i]) {
-            if (numa_enabled) {
-              // Allocate in appropriate NUMA domain.
-              // 4k page align.
-              void* ptr = port::NUMAMalloc(i, sizeof(NumaWorkerBlock), 0);
-              if (ptr != nullptr) {
-                NumaWorkerBlock* block = new (ptr) NumaWorkerBlock(this);
-                workers_[i] =
-                    std::unique_ptr<NumaWorkerBlock,
-                                    std::function<void(NumaWorkerBlock*)>>(
-                        block, CustomNumaWorkerBlockDeleter);
-              } else {
-                LOG(ERROR) << "Could not NUMA-allocate worker block: " << i;
-              }
-            }
-            // If the NUMA allocation fails, or NUMA is not enabled.
-            if (!workers_[i]) {
-              workers_[i] =
-                  std::unique_ptr<NumaWorkerBlock,
-                                  std::function<void(NumaWorkerBlock*)>>(
-                      new NumaWorkerBlock(this), DefaultNumaWorkerBlockDeleter);
-            }
-          }
-          // Be sure to start threads if num_parallel_calls_ has changed.
-          for (size_t j = workers_[i]->threads.size();
-               j < num_threads_per_block; ++j) {
-            VLOG(3) << "Starting worker " << i << ", " << j;
-            if (!new_ctx) {
-              new_ctx = std::make_shared<IteratorContext>(*ctx);
-            }
-            workers_[i]->threads.emplace_back(ctx->StartThread(
-                strings::StrCat("tf_data_numa_map_and_batch_", i, "_", j),
-                [this, new_ctx, i, j]() { WorkerThread(new_ctx, i, j); }));
-            VLOG(3) << "Worker " << i << ", " << j << " successfully started.";
-          }
-        }
-        if (!runner_thread_) {
-          if (!new_ctx) {
-            new_ctx = std::make_shared<IteratorContext>(*ctx);
-          }
-          runner_thread_ =
-              ctx->StartThread("tf_data_numa_map_and_batch",
-                               [this, new_ctx] { RunnerThread(new_ctx); });
-        }
-        VLOG(3) << "All workers & runner thread started.";
-        return Status::OK();
-      }
-
-      void AllocateOutput(IteratorContext* ctx, size_t batch_size,
-                          const std::vector<Tensor>& map_fn_outputs,
-                          std::vector<Tensor>* batch_outputs) {
-        DCHECK(dataset()->output_dtypes().size() ==
-               dataset()->output_shapes().size());
-        DCHECK(map_fn_outputs.size() == dataset()->output_dtypes().size());
-        for (size_t i = 0; i < dataset()->output_dtypes().size(); ++i) {
-          TensorShape component_shape({static_cast<uint32>(batch_size)});
-          component_shape.AppendShape(map_fn_outputs.at(i).shape());
-          AllocatorAttributes attr;
-          attr.set_gpu_compatible(true);
-          batch_outputs->emplace_back(ctx->allocator(attr),
-                                      map_fn_outputs.at(i).dtype(),
-                                      component_shape);
-        }
-      }
-
-      void RunnerThread(std::shared_ptr<IteratorContext> ctx)
-          LOCKS_EXCLUDED(mu_) {
-        RecordStart(ctx.get());
-        auto cleanup = gtl::MakeCleanup([this, &ctx] {
-          // Set end of input on all the managers in order to clean up in an
-          // orderly fashion.
-          VLOG(3) << "Setting End of Input on workers_[*]->manager";
-          for (size_t i = 0; i < workers_.size(); ++i) {
-            workers_[i]->manager.SetEndOfInput();
-          }
-          RecordStop(ctx.get());
-        });
-
-        const size_t num_blocks = workers_.size();
-
-        while (true) {
-          for (size_t block = 0; block < num_blocks; ++block) {
-            VLOG(4) << "RunnerThread waiting for input space in block: "
-                    << block;
-            if (TF_PREDICT_FALSE(
-                    !workers_[block]->manager.WaitForInputSpace(ctx.get()))) {
-              VLOG(3) << "RunnerThread exiting due to cancellation.";
-              return;
-            }
-            VLOG(4) << "RunnerThread has space; pulling on upstream for block "
-                    << block;
-
-            Status s;
-            std::vector<std::vector<Tensor>> inputs;
-            bool end_of_sequence = false;
-            for (size_t i = 0; i < dataset()->batch_size_; ++i) {
-              std::vector<Tensor> tuple;
-              s.Update(
-                  input_impl_->GetNext(ctx.get(), &tuple, &end_of_sequence));
-              if (!s.ok()) {
-                break;
-              }
-              if (end_of_sequence) {
-                VLOG(4) << "Runner thread encountered end of sequence.";
-                if (dataset()->drop_remainder_) {
-                  return;
-                }
-                break;
-              }
-              inputs.push_back(std::move(tuple));
-            }
-
-            VLOG(4) << "Moving inputs to block " << block
-                    << ", which has size: " << inputs.size();
-            if (!s.ok() || !inputs.empty()) {
-              workers_[block]->manager.PushInputs(s, std::move(inputs));
-              VLOG(4) << "Inputs moved into block " << block;
-            }
-            if (end_of_sequence) {
-              return;
-            }
-          }
-        }
-      }
-
-      void WorkerThread(std::shared_ptr<IteratorContext> ctx,
-                        const int numa_node, const int thread_num) {
-        RecordStart(ctx.get());
-        WORKER_VLOG(3) << "started.";
-        auto stop_cleanup =
-            gtl::MakeCleanup([this, numa_node, thread_num, &ctx]() {
-              RecordStop(ctx.get());
-              WORKER_VLOG(3) << "exiting.";
-            });
-
-        NumaWorkerBlock* block = workers_[numa_node].get();
-        port::NUMASetThreadNodeAffinity(numa_node);
-        const int num_numa_nodes = port::NUMANumNodes();
-        const int minimum_num_parallel_calls = thread_num * num_numa_nodes;
-
-        while (true) {
-          // Put threads to sleep based on autotuner.
-          {
-            mutex_lock l(*mu_);
-            while (minimum_num_parallel_calls >= num_parallel_calls_->value &&
-                   !cancelled_) {
-              RecordStop(ctx.get());
-              autotune_cond_var_->wait(l);
-              RecordStart(ctx.get());
-            }
-            if (cancelled_) {
-              return;
-            }
-          }
-
-          std::vector<Tensor> input;
-          uint64 index = 0;
-          size_t sequence_number = 0;
-          WORKER_VLOG(4) << "retrieving input.";
-          {
-            tracing::ScopedActivity trace(
-                "NumaMapAndBatch::Iterator::Worker::RetrieveInput");
-            if (!block->manager.RetrieveInput(ctx.get(), &input, &index,
-                                              &sequence_number)) {
-              return;
-            }
-          }
-
-          WORKER_VLOG(4) << "retrieved input; index: " << index
-                         << ", sequence_number: " << sequence_number;
-
-          std::vector<Tensor> return_values;
-          Status s;
-          {
-            tracing::ScopedActivity trace(
-                "NumaMapAndBatch::Iterator::Worker::FunctionExecution");
-            s = instantiated_captured_func_->Run(ctx.get(), std::move(input),
-                                                 &return_values);
-          }
-          WORKER_VLOG(4) << "ran function for index: " << index
-                         << ", sequence_number: " << sequence_number;
-
-          if (s.ok()) {
-            std::vector<Tensor>* output = block->manager.GetBatchTensors(
-                sequence_number,
-                [this, ctx, &return_values](size_t batch_size,
-                                            std::vector<Tensor>* output) {
-                  AllocateOutput(ctx.get(), batch_size, return_values, output);
-                });
-            WORKER_VLOG(4) << "copying tensors to batch output.";
-            {
-              tracing::ScopedActivity trace(
-                  "NumaMapAndBatch::Iterator::Worker::BatchCopy");
-              for (size_t i = 0; i < return_values.size() && s.ok(); ++i) {
-                Tensor& tensor = return_values.at(i);
-                Tensor* batch = &output->at(i);
-                if (tensor.NumElements() !=
-                    (batch->NumElements() / batch->dim_size(0))) {
-                  s.Update(errors::InvalidArgument(
-                      "Cannot add tensor to the batch: number of elements does "
-                      "not match. Shapes are: [tensor]: ",
-                      tensor.shape().DebugString(),
-                      ", [batch]: ", batch->shape().DebugString()));
-                  break;
-                }
-                s.Update(batch_util::CopyElementToSlice(std::move(tensor),
-                                                        batch, index));
-              }
-            }
-          }
-
-          block->manager.RecordBatchEntryComplete(sequence_number, index, s);
-          WORKER_VLOG(4) << "finished index: " << index
-                         << ", sequence_number: " << sequence_number;
-        }
-      }
-
-      // mu_ protects shared internal state and is used to coordinate between
-      // the auto-tuner, client threads, worker threads, and the runner thread.
-      const std::shared_ptr<mutex> mu_;
-      const std::shared_ptr<condition_variable> autotune_cond_var_;
-      // The maximum number of parallel calls (can be auto-tuned).
-      const std::shared_ptr<model::SharedState> num_parallel_calls_;
-      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
-
-      // Caches the last-seen value of num_parallel_calls_->value to
-      // short-circuit starting workers.
-      int64 curr_num_parallel_calls_ GUARDED_BY(*mu_) = 0;
-
-      std::unique_ptr<IteratorBase> input_impl_;
-      int64 cur_block_ GUARDED_BY(*mu_) = 0;
-      bool global_end_of_input_ GUARDED_BY(*mu_) = false;
-      bool cancelled_ GUARDED_BY(*mu_) = false;
-      std::vector<std::unique_ptr<NumaWorkerBlock,
-                                  std::function<void(NumaWorkerBlock*)>>>
-          workers_;  // Const after initialization.
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
-    };
-
-    const DatasetBase* const input_;
-    const int64 batch_size_;
-    const int64 num_parallel_calls_;
-    const bool drop_remainder_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-    const NameAttrList func_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-  };
-
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-  bool preserve_cardinality_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalNumaMapAndBatchDataset").Device(DEVICE_CPU),
-    NumaMapAndBatchDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
index 38ce2ad..d89518e 100644
--- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -38,13 +38,10 @@
  public:
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", /*params=*/{},
+                                                 &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       interleave_func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -79,15 +76,13 @@
         errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
     OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
-                                      std::move(params), &captured_func));
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
     *output =
-        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
-                    cycle_length, block_length, sloppy, buffer_output_elements,
+        new Dataset(ctx, input, std::move(captured_func), cycle_length,
+                    block_length, sloppy, buffer_output_elements,
                     prefetch_input_elements, output_types_, output_shapes_);
   }
 
@@ -95,14 +90,12 @@
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, bool sloppy, int64 buffer_output_elements,
             int64 prefetch_input_elements, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          interleave_func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -157,7 +150,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(interleave_func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -1058,7 +1051,6 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList interleave_func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
@@ -1069,10 +1061,9 @@
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList interleave_func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
index 0397ca0..3078005 100644
--- a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace data {
@@ -24,11 +25,7 @@
 class RebatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit RebatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
+      : UnaryDatasetOpKernel(ctx) {}
 
  protected:
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -39,58 +36,32 @@
         ctx, num_workers > 0,
         errors::InvalidArgument("num_workers must be greater than zero."));
 
-    Dataset* dataset =
-        new Dataset(ctx, input, num_workers, output_types_, output_shapes_);
-    Status s = dataset->Optimize(ctx);
-    if (s.ok()) {
-      *output = dataset;
-    } else {
-      dataset->Unref();
-      OP_REQUIRES_OK(ctx, s);
-    }
+    auto config_factory = [num_workers]() { return CreateConfig(num_workers); };
+
+    // We only want to optimize functions for some particular datasets like
+    // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+    // function optimization and explicitly handle function modifications
+    // for those datasets in the rewrite.
+    OP_REQUIRES_OK(ctx,
+                   RewriteDataset(ctx, input, std::move(config_factory),
+                                  /*optimize_function_library=*/false, output));
   }
 
  private:
-  class Dataset : public GraphRewriteDataset {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const int64 num_workers, const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
-          num_workers_(num_workers) {}
-
-    string DebugString() const override { return "RebatchDatasetOp::Dataset"; }
-
-   private:
-    bool ShouldOptimizeFunctions() override {
-      // We only want to optimize functions for some particular datasets like
-      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
-      // function optimization and explicitly handle function modifications
-      // for those datasets in the rewrite.
-      return false;
-    }
-
-    RewriterConfig CreateGrapplerRewriteConfig() override {
-      RewriterConfig rewriter_config;
-      rewriter_config.set_fail_on_optimizer_errors(true);
-      rewriter_config.add_optimizers(kOptimizerName);
-      rewriter_config.set_meta_optimizer_iterations(
-          RewriterConfig_NumIterationsType_ONE);
-      auto custom_optimizer = rewriter_config.add_custom_optimizers();
-      custom_optimizer->set_name(kOptimizerName);
-      AttrValue num_workers_attr;
-      num_workers_attr.set_i(num_workers_);
-      (*custom_optimizer->mutable_parameter_map())["num_workers"] =
-          num_workers_attr;
-      return rewriter_config;
-    }
-
-    const int64 num_workers_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
+  // Returns a Grappler config that fails on optimizer errors and performs one
+  // meta-optimizer iteration of `kOptimizerName`, which receives `num_workers`
+  // through its "num_workers" parameter.
+  static RewriterConfig CreateConfig(int64 num_workers) {
+    RewriterConfig config;
+    config.set_meta_optimizer_iterations(RewriterConfig_NumIterationsType_ONE);
+    config.set_fail_on_optimizer_errors(true);
+    config.add_optimizers(kOptimizerName);
+    auto* rebatch = config.add_custom_optimizers();
+    rebatch->set_name(kOptimizerName);
+    // Fill the freshly created map entry in place.
+    (*rebatch->mutable_parameter_map())["num_workers"].set_i(num_workers);
+    return config;
+  }
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExperimentalRebatchDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 32000ad..0fcfadb 100644
--- a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -34,16 +34,15 @@
  public:
   explicit ScanDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    FunctionMetadata::Params params;
+    params.is_multi_device_function = true;
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "f", params, &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Tstate", &state_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -55,13 +54,11 @@
                                       initial_state_inputs.end());
 
     std::unique_ptr<CapturedFunction> captured_func;
-    data::CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
-    *output = new Dataset(ctx, input, func_, std::move(initial_state),
+    *output = new Dataset(ctx, input, std::move(initial_state),
                           std::move(captured_func), state_types_, output_types_,
                           output_shapes_, preserve_cardinality_);
   }
@@ -70,7 +67,7 @@
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func, std::vector<Tensor> initial_state,
+            std::vector<Tensor> initial_state,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
@@ -78,7 +75,6 @@
             bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           initial_state_(std::move(initial_state)),
           captured_func_(std::move(captured_func)),
           state_types_(state_types),
@@ -125,7 +121,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue state_types;
       b->BuildAttrValue(state_types_, &state_types);
       AttrValue other_arguments_types_attr;
@@ -278,7 +274,6 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const std::vector<Tensor> initial_state_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector state_types_;
@@ -287,12 +282,11 @@
     const bool preserve_cardinality_;
   };
 
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector state_types_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
   bool preserve_cardinality_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExperimentalScanDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc
new file mode 100644
index 0000000..f96f75f
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc
@@ -0,0 +1,487 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/compression.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/io/record_writer.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/base64.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/protobuf/data/experimental/snapshot.pb.h"
+#include "tensorflow/core/util/batch_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+enum SnapshotMode { READER = 0, WRITER = 1, PASSTHROUGH = 2 };
+
+const uint64 kReaderBufferSize = 8 * 1024 * 1024;  // 8 MB
+
+const char* kCompressionType = io::compression::kGzip;
+
+const uint64 kOneDayInMicroseconds = 24L * 60L * 60L * 1e6L;
+
+const uint64 kNumElementsPerShard = 10000;
+
+const char kSnapshotFilename[] = "snapshot.metadata";
+
+string GetCurrentSnapshotDataFilename(uint64 next_index,
+                                      const string& run_dir) {
+  // Shard number; `unsigned long long` keeps the argument matched to %llu.
+  const unsigned long long id = next_index / kNumElementsPerShard;
+  return absl::StrCat(run_dir, "/", strings::Printf("%08llu", id), ".snapshot");
+}
+
+// Writes `metadata` as one record to <fingerprint_dir>/<kSnapshotFilename>,
+// creating the directory if needed. Returns the first error encountered.
+Status WriteMetadataFile(const string& fingerprint_dir,
+                         const experimental::SnapshotMetadataRecord& metadata) {
+  string metadata_filename =
+      absl::StrCat(fingerprint_dir, "/", kSnapshotFilename);
+  TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(fingerprint_dir));
+
+  std::unique_ptr<WritableFile> file;
+  TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(metadata_filename, &file));
+  auto writer = absl::make_unique<io::RecordWriter>(file.get());
+  TF_RETURN_IF_ERROR(writer->WriteRecord(metadata.SerializeAsString()));
+  TF_RETURN_IF_ERROR(writer->Close());
+  // Close the file explicitly so flush/close failures are not silently lost.
+  return file->Close();
+}
+
+// Reads <fingerprint_dir>/<kSnapshotFilename> into `metadata`. Returns the
+// FileExists() status when the file is missing, any open/read error, or
+// DataLoss when the record cannot be parsed -- instead of aborting on them.
+Status ReadMetadataFile(const string& fingerprint_dir,
+                        experimental::SnapshotMetadataRecord* metadata) {
+  string filename = absl::StrCat(fingerprint_dir, "/", kSnapshotFilename);
+  TF_RETURN_IF_ERROR(Env::Default()->FileExists(filename));
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_RETURN_IF_ERROR(Env::Default()->NewRandomAccessFile(filename, &file));
+  string record_bytes;
+  auto reader = absl::make_unique<io::SequentialRecordReader>(file.get());
+  TF_RETURN_IF_ERROR(reader->ReadRecord(&record_bytes));
+  if (metadata->ParseFromString(record_bytes)) return Status::OK();
+  return errors::DataLoss("Unable to parse snapshot metadata: ", filename);
+}
+
+// Picks the iterator mode from the metadata read result:
+//   - WRITER:      the metadata file was not found, or an unfinalized
+//                  snapshot is older than kOneDayInMicroseconds.
+//   - READER:      the snapshot has been finalized.
+//   - PASSTHROUGH: an unfinalized snapshot was created within that window.
+SnapshotMode DetermineOpState(
+    const Status& file_status,
+    const experimental::SnapshotMetadataRecord& metadata) {
+  // No metadata file yet: this run produces the snapshot.
+  if (errors::IsNotFound(file_status)) return WRITER;
+  // File found, snapshot has been finalized.
+  if (metadata.finalized()) return READER;
+
+  // TODO(frankchn): Make this timestamp configurable.
+  const auto write_deadline =
+      Env::Default()->NowMicros() - kOneDayInMicroseconds;
+  const bool writer_expired = metadata.creation_timestamp() < write_deadline;
+
+  // An unexpired timestamp means someone else is already writing, so only
+  // pass the input through; once it has expired we write regardless.
+  return writer_expired ? WRITER : PASSTHROUGH;
+}
+
+class SnapshotDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SnapshotDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  // Builds the snapshot dataset: reads the scalar "path" argument and
+  // fingerprints the serialized GraphDef of `input` for the new Dataset.
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string path;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "path", &path));
+
+    GraphDef graph_def;
+    OP_REQUIRES_OK(
+        ctx, AsGraphDef(ctx, input, SerializationContext({}), &graph_def));
+    // TODO(frankchn): Find a better way than SerializeToStringDeterministic().
+    // This is not deterministic across different builds of binaries right now.
+    string graph_def_serialized;
+    OP_REQUIRES(
+        ctx, SerializeToStringDeterministic(graph_def, &graph_def_serialized),
+        errors::Internal("Failed to serialize the input dataset graph."));
+    string graph_fingerprint = strings::StrCat(
+        strings::Hex(Fingerprint64(graph_def_serialized), strings::kZeroPad16));
+    *output = new Dataset(ctx, input, path, graph_fingerprint);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, const string& path,
+            const string& graph_fingerprint)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          dir_(path),
+          graph_fingerprint_(graph_fingerprint) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Snapshot")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override { return "SnapshotDatasetOp::Dataset"; }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* path = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(dir_, &path));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node, path}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        fingerprint_dir_ =
+            absl::StrCat(dataset()->dir_, "/", dataset()->graph_fingerprint_);
+
+        experimental::SnapshotMetadataRecord metadata;
+        Status s = ReadMetadataFile(fingerprint_dir_, &metadata);
+        state_ = DetermineOpState(s, metadata);
+
+        switch (state_) {
+          case WRITER:
+            iterator_ = absl::make_unique<SnapshotWriterIterator>(
+                SnapshotWriterIterator::Params{
+                    dataset(), strings::StrCat(prefix(), "Impl")},
+                fingerprint_dir_);
+            break;
+          case READER:
+            iterator_ = absl::make_unique<SnapshotReaderIterator>(
+                SnapshotReaderIterator::Params{
+                    dataset(), strings::StrCat(prefix(), "Impl")},
+                fingerprint_dir_, metadata);
+            break;
+          case PASSTHROUGH:
+            iterator_ = absl::make_unique<SnapshotPassthroughIterator>(
+                SnapshotPassthroughIterator::Params{
+                    dataset(), strings::StrCat(prefix(), "Impl")});
+            break;
+        }
+
+        return iterator_->Initialize(ctx);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        return iterator_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // TODO(frankchn): Make save iterators work
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // TODO(frankchn): Make iterator restores work
+        return Status::OK();
+      }
+
+     private:
+      class SnapshotReaderIterator : public DatasetIterator<Dataset> {
+       public:
+        explicit SnapshotReaderIterator(
+            const Params& params, const string& fingerprint_dir,
+            const experimental::SnapshotMetadataRecord& metadata)
+            : DatasetIterator<Dataset>(params),
+              fingerprint_dir_(fingerprint_dir),
+              metadata_(metadata) {}
+
+        Status Initialize(IteratorContext* ctx) override {
+          mutex_lock l(mu_);
+
+          run_id_ = metadata_.run_id();
+          run_dir_ = absl::StrCat(fingerprint_dir_, "/", run_id_);
+          return Status::OK();
+        }
+
+        // Reads the next record from the current shard file, switching to the
+        // next shard file at shard boundaries. A missing or exhausted shard
+        // ends the sequence; open, read and parse failures are returned.
+        Status GetNextInternal(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) override {
+          mutex_lock l(mu_);
+          string snapshot_data_filename =
+              GetCurrentSnapshotDataFilename(next_index_, run_dir_);
+          if (current_read_filename_ != snapshot_data_filename) {
+            current_reader_.reset();
+            current_read_file_.reset();
+
+            // The current implementation here assumes that tensors are stored
+            // in files which are named sequentially. If a file doesn't exist
+            // when we try reading that item, we assume that we have reached the
+            // end of the snapshot.
+            Status s = Env::Default()->FileExists(snapshot_data_filename);
+            if (!s.ok()) {
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+
+            TF_RETURN_IF_ERROR(Env::Default()->NewRandomAccessFile(
+                snapshot_data_filename, &current_read_file_));
+            auto reader_options =
+                io::RecordReaderOptions::CreateRecordReaderOptions(
+                    kCompressionType);
+            reader_options.buffer_size = kReaderBufferSize;
+            current_reader_ = absl::make_unique<io::SequentialRecordReader>(
+                current_read_file_.get(), reader_options);
+            current_read_filename_ = snapshot_data_filename;
+          }
+
+          string record_bytes;
+          Status s = current_reader_->ReadRecord(&record_bytes);
+          if (errors::IsOutOfRange(s)) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          TF_RETURN_IF_ERROR(s);
+
+          experimental::SnapshotRecord record;
+          if (!record.ParseFromString(record_bytes)) {
+            return errors::DataLoss("Unable to parse SnapshotRecord.");
+          }
+          for (int i = 0; i < record.tensor_size(); ++i) {
+            Tensor t;
+            if (!t.FromProto(record.tensor(i))) {
+              return errors::DataLoss("Unable to parse Tensor from proto.");
+            }
+            out_tensors->push_back(t);
+          }
+          // The flag is not otherwise written on this path; set it explicitly.
+          *end_of_sequence = false;
+          next_index_++;
+          return Status::OK();
+        }
+
+       private:
+        const string fingerprint_dir_;
+        const experimental::SnapshotMetadataRecord metadata_;
+        string run_id_ GUARDED_BY(mu_);
+        string run_dir_ GUARDED_BY(mu_);
+
+        std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+
+        string current_read_filename_ GUARDED_BY(mu_);
+        std::unique_ptr<RandomAccessFile> current_read_file_ GUARDED_BY(mu_);
+        std::unique_ptr<io::SequentialRecordReader> current_reader_
+            GUARDED_BY(mu_);
+
+        int64 next_index_ GUARDED_BY(mu_) = 0;
+
+        mutex mu_;
+      };
+
+      class SnapshotWriterIterator : public DatasetIterator<Dataset> {
+       public:
+        explicit SnapshotWriterIterator(const Params& params,
+                                        const string& fingerprint_dir)
+            : DatasetIterator<Dataset>(params),
+              fingerprint_dir_(fingerprint_dir) {}
+
+        Status Initialize(IteratorContext* ctx) override {
+          mutex_lock l(mu_);
+
+          run_id_ = strings::StrCat(
+              strings::Hex(random::New64(), strings::kZeroPad4));
+          run_dir_ = absl::StrCat(fingerprint_dir_, "/", run_id_);
+
+          TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir_));
+
+          experimental::SnapshotMetadataRecord metadata;
+          metadata.set_creation_timestamp(Env::Default()->NowMicros());
+          metadata.set_graph_fingerprint(dataset()->graph_fingerprint_);
+          metadata.set_run_id(run_id_);
+          metadata.set_finalized(false);
+
+          TF_RETURN_IF_ERROR(WriteMetadataFile(fingerprint_dir_, metadata));
+
+          return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        }
+
+        // Pulls the next element from the input, appends it to the current
+        // shard file (rolling over to a new shard file when `next_index_`
+        // maps to a different filename), and hands it to the caller. At end
+        // of input the snapshot is finalized if this run still owns it.
+        Status GetNextInternal(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) override {
+          mutex_lock l(mu_);
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+
+          if (*end_of_sequence) {
+            experimental::SnapshotMetadataRecord metadata;
+            TF_RETURN_IF_ERROR(ReadMetadataFile(fingerprint_dir_, &metadata));
+            // Only the run whose id is recorded in the metadata may finalize.
+            if (metadata.run_id() == run_id_) {
+              if (current_writer_) TF_RETURN_IF_ERROR(current_writer_->Close());
+              if (current_write_file_)
+                TF_RETURN_IF_ERROR(current_write_file_->Close());
+              current_writer_.reset();
+              current_write_file_.reset();
+              current_write_filename_ = "";
+
+              metadata.set_finalized(true);
+              TF_RETURN_IF_ERROR(WriteMetadataFile(fingerprint_dir_, metadata));
+            } else {
+              // TODO(frankchn): We lost the race, remove all snapshots.
+            }
+            return Status::OK();
+          }
+
+          string snapshot_data_filename =
+              GetCurrentSnapshotDataFilename(next_index_, run_dir_);
+
+          // Shard boundary (or first element): close the previous shard, if
+          // any, and start writing a new one.
+          if (current_write_filename_ != snapshot_data_filename) {
+            if (current_writer_) TF_RETURN_IF_ERROR(current_writer_->Close());
+            if (current_write_file_)
+              TF_RETURN_IF_ERROR(current_write_file_->Close());
+            current_writer_.reset();
+            current_write_file_.reset();
+
+            auto writer_options =
+                io::RecordWriterOptions::CreateRecordWriterOptions(
+                    kCompressionType);
+            TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(
+                snapshot_data_filename, &current_write_file_));
+            current_writer_ = absl::make_unique<io::RecordWriter>(
+                current_write_file_.get(), writer_options);
+            current_write_filename_ = snapshot_data_filename;
+          }
+
+          // Serialize all components of the element into one record. Iterate
+          // by const reference to avoid copying each Tensor.
+          experimental::SnapshotRecord record;
+          for (const Tensor& out_tensor : *out_tensors) {
+            TensorProto* t = record.add_tensor();
+            out_tensor.AsProtoTensorContent(t);
+          }
+          TF_RETURN_IF_ERROR(
+              current_writer_->WriteRecord(record.SerializeAsString()));
+          next_index_++;
+          return Status::OK();
+        }
+
+       private:
+        std::unique_ptr<IteratorBase> input_impl_;
+
+        const string fingerprint_dir_;
+        string run_id_ GUARDED_BY(mu_);
+        string run_dir_ GUARDED_BY(mu_);
+
+        string current_write_filename_ GUARDED_BY(mu_);
+        std::unique_ptr<WritableFile> current_write_file_ GUARDED_BY(mu_);
+        std::unique_ptr<io::RecordWriter> current_writer_ GUARDED_BY(mu_);
+
+        uint64 next_index_ GUARDED_BY(mu_) = 0;
+
+        mutex mu_;
+      };
+
+      class SnapshotPassthroughIterator : public DatasetIterator<Dataset> {
+       public:
+        explicit SnapshotPassthroughIterator(const Params& params)
+            : DatasetIterator<Dataset>(params) {}
+
+        Status Initialize(IteratorContext* ctx) override {
+          return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        }
+
+        Status GetNextInternal(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) override {
+          return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        }
+
+       private:
+        std::unique_ptr<IteratorBase> input_impl_;
+      };
+
+      string fingerprint_dir_;
+      SnapshotMode state_;
+
+      std::unique_ptr<IteratorBase> iterator_;
+    };
+
+    const DatasetBase* const input_;
+    const string dir_;
+    const string graph_fingerprint_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("SnapshotDataset").Device(DEVICE_CPU),
+                        SnapshotDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
index ab54dc9..79c5ec0 100644
--- a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
@@ -32,81 +32,32 @@
 
 class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using LoopIteratorPredicate =
-      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
-                           std::vector<Tensor>&, bool*)>;
-
   explicit TakeWhileDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
-    OP_REQUIRES_OK(
-        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
-    OP_REQUIRES(
-        ctx, short_circuit_indices_.size() <= 1,
-        errors::InvalidArgument("`predicate` has more than one return value."));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(
+                            ctx, "predicate", /*params=*/{}, &func_metadata_));
+    OP_REQUIRES(ctx, func_metadata_->short_circuit_info().indices.size() <= 1,
+                errors::InvalidArgument(
+                    "predicate function has more than one return value."));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
     std::unique_ptr<CapturedFunction> captured_func;
-    data::CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
-
-    LoopIteratorPredicate loop_pred;
-    if (short_circuit_indices_.empty()) {
-      loop_pred = [](IteratorContext* ctx,
-                     InstantiatedCapturedFunction* inst_captured_func,
-                     const std::vector<Tensor>& args, bool* end_of_sequence) {
-        std::vector<Tensor> result;
-        TF_RETURN_IF_ERROR(
-            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
-
-        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-            result[0].NumElements() != 1) {
-          return errors::InvalidArgument(
-              "`predicate` must returns a scalar bool tensor.");
-        }
-        *end_of_sequence = !result[0].scalar<bool>()();
-        return Status::OK();
-      };
-    } else {
-      int predicate_index = short_circuit_indices_[0];
-      loop_pred = [predicate_index](
-                      IteratorContext* ctx,
-                      InstantiatedCapturedFunction* inst_captured_func,
-                      const std::vector<Tensor>& args, bool* end_of_sequence) {
-        const Tensor& predicate = args[predicate_index];
-        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-          return errors::InvalidArgument(
-              "`predicate` must returns a scalar bool tensor.");
-        }
-        *end_of_sequence = !predicate.scalar<bool>()();
-        return Status::OK();
-      };
-    }
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          std::move(loop_pred));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
+    *output = new Dataset(ctx, input, std::move(captured_func));
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func,
-            LoopIteratorPredicate loop_pred)
+            std::unique_ptr<CapturedFunction> captured_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
-          captured_func_(std::move(captured_func)),
-          loop_pred_(std::move(loop_pred)) {
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -115,8 +66,7 @@
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::TakeWhile")},
-          loop_pred_);
+          Iterator::Params{this, strings::StrCat(prefix, "::TakeWhile")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -145,7 +95,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f_attr;
-      b->BuildAttrValue(func_, &f_attr);
+      b->BuildAttrValue(captured_func_->func(), &f_attr);
 
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
@@ -162,8 +112,8 @@
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params, LoopIteratorPredicate loop_pred)
-          : DatasetIterator<Dataset>(params), loop_pred_(loop_pred) {}
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
@@ -189,8 +139,16 @@
           input_impl_.reset();
           return Status::OK();
         }
-        TF_RETURN_IF_ERROR(loop_pred_(ctx, instantiated_captured_func_.get(),
-                                      *out_tensors, end_of_sequence));
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(instantiated_captured_func_->RunWithBorrowedArgs(
+            ctx, *out_tensors, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "`predicate` must returns a scalar bool tensor.");
+        }
+        *end_of_sequence = !result[0].scalar<bool>()();
         if (*end_of_sequence) {
           out_tensors->clear();
         }
@@ -228,18 +186,13 @@
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
-      const LoopIteratorPredicate loop_pred_;
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const LoopIteratorPredicate loop_pred_;
   };
 
-  NameAttrList func_;
-  std::vector<int> short_circuit_indices_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExperimentalTakeWhileDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
index 3b9b319..bbcc84d 100644
--- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
@@ -130,7 +130,13 @@
             return Status::OK();
           }
 
-          matched = out_tensors->back().scalar<bool>()();
+          const Tensor& last_component = out_tensors->back();
+          if (last_component.NumElements() != 1 ||
+              last_component.dtype() != DT_BOOL) {
+            return errors::InvalidArgument(
+                "Last component must be a bool scalar.");
+          }
+          matched = last_component.scalar<bool>()();
           out_tensors->pop_back();
           if (!matched) {
             // Clear the output tensor list since it didn't match.
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc
new file mode 100644
index 0000000..04627df
--- /dev/null
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc
@@ -0,0 +1,589 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "filter_by_last_component_dataset";
+constexpr char kOpName[] = "FilterByLastComponentDataset";
+
+// Fixture providing helpers that create the `FilterByLastComponentDataset`
+// op kernel, its kernel context, and a `TensorSliceDataset` input tensor.
+class FilterByLastComponentDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a `TensorSliceDataset` from `tensor_vector` and stores it in the
+  // variant tensor `dataset_tensor`.
+  Status CreateTensorSliceDatasetTensor(
+      std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) {
+    DatasetBase *slice_dataset;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDataset(
+        "tensor_slice_node", tensor_vector, &slice_dataset));
+    return StoreDatasetInVariantTensor(slice_dataset, dataset_tensor);
+  }
+
+  // Creates the `FilterByLastComponentDataset` op kernel, passing
+  // `output_types` and `output_shapes` as the attrs of the same names.
+  Status CreateFilterByLastComponentDatasetKernel(
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    NodeDef filter_node = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    return CreateOpKernel(filter_node, op_kernel);
+  }
+
+  // Checks `inputs` against `op_kernel` and, if they are valid, creates the
+  // kernel context for the `FilterByLastComponentDataset` op.
+  Status CreateFilterByLastComponentDatasetContext(
+      OpKernel *const op_kernel,
+      gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    return CreateOpKernelContext(op_kernel, inputs, context);
+  }
+};
+
+struct TestCase {  // One scenario: inputs plus the values the tests expect.
+  std::vector<Tensor> input_tensors;     // fed to the TensorSliceDataset input
+  std::vector<Tensor> expected_outputs;  // flattened tensors from GetNext
+  DataTypeVector expected_output_dtypes;  // also passed as the op attr
+  std::vector<PartialTensorShape> expected_output_shapes;  // also an op attr
+  int64 expected_cardinality;    // compared against Cardinality()
+  std::vector<int> breakpoints;  // save/restore points used by Roundtrip
+};
+
+// Test case 1: simple case; flags {true, false, true} keep slices 0 and 2.
+TestCase TestCase1() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                   {0, 1, 2, 3, 4, 5}),
+           DatasetOpsTestBase::CreateTensor<bool>(TensorShape{3, 1},
+                                                  {true, false, true})},
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {0, 1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {4, 5})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({2})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test case 2: the input dataset is empty, so no elements are expected.
+TestCase TestCase2() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})},
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {0}};
+}
+
+// Test case 3: the input dataset has only one component (the bool flag).
+TestCase TestCase3() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<bool>(TensorShape{3, 1},
+                                                  {true, false, true})},
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_BOOL},
+          /*expected_output_shapes*/ {PartialTensorShape({1})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test case 4: invalid input; the last component has two elements per slice.
+TestCase InvalidLastComponentShape() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                   {0, 1, 2, 3, 4, 5}),
+           DatasetOpsTestBase::CreateTensor<bool>(
+               TensorShape{3, 2}, {true, false, true, true, false, true})},
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({2})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {}};
+}
+
+// Test case 5: invalid input; the last component is `int`, not DT_BOOL.
+TestCase InvalidLastComponentDType() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                   {0, 1, 2, 3, 4, 5}),
+           DatasetOpsTestBase::CreateTensor<int>(TensorShape{3}, {1, 1, 0})},
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({2})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {}};
+}
+
+class ParameterizedFilterByLastComponentDatasetOpTest
+    : public FilterByLastComponentDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};  // TEST_P fixture
+
+// Drains the iterator and compares the emitted tensors with the expectation.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, GetNext) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    std::vector<Tensor> next;
+    // ASSERT, not EXPECT: a failed GetNext may never set `end_of_sequence`.
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+    out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+// The dataset must report the node name used to create its kernel.
+TEST_F(FilterByLastComponentDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = TestCase1();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->node_name(), kNodeName);
+}
+
+// The dataset must report the op name as its type string.
+TEST_F(FilterByLastComponentDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = TestCase1();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->type_string(), kOpName);
+}
+
+// The dataset's output dtypes must match the dtypes that were passed to the
+// kernel as its `output_types` attr.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  TF_EXPECT_OK(VerifyTypesMatch(dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+// The dataset's output shapes must be compatible with the shapes that were
+// passed to the kernel as its `output_shapes` attr.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+// The dataset's cardinality must equal the value recorded in the test case
+// (`kUnknownCardinality` for every case instantiated in this file).
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, Cardinality) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+// Saving the dataset through a `VariantTensorDataWriter` and flushing the
+// writer must both succeed.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetSave) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+// An iterator created from the dataset must report the same output dtypes
+// that were passed to the kernel.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+// An iterator created from the dataset must report output shapes that are
+// compatible with the shapes passed to the kernel.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+// An iterator created with the prefix "Iterator" must expose the prefix
+// "Iterator::FilterByLastComponent".
+TEST_F(FilterByLastComponentDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = TestCase1();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::FilterByLastComponent");
+}
+
+// Saves and restores the iterator at every breakpoint and checks that the
+// elements gathered across all restores still match the expected outputs.
+TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, Roundtrip) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+
+  // Instantiate the op kernel from the attrs recorded in the test case.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+      test_case.expected_output_dtypes, test_case.expected_output_shapes,
+      &kernel));
+
+  // Wrap the input tensors in a TensorSliceDataset and run the op on it.
+  Tensor input_dataset(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+      kernel.get(), &inputs, &context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  for (int breakpoint : test_case.breakpoints) {
+    // Checkpoint the iterator and rebuild it from the checkpoint.
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *dataset, &iterator));
+
+    // Keep pulling elements until the iteration index passes the breakpoint.
+    for (; cur_iteration <= breakpoint; ++cur_iteration) {
+      std::vector<Tensor> next;
+      TF_EXPECT_OK(
+          iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+      out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+    }
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+INSTANTIATE_TEST_SUITE_P(FilterByLastComponentDatasetOpTest,  // cases 1-3
+                         ParameterizedFilterByLastComponentDatasetOpTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {TestCase1(), TestCase2(), TestCase3()})));
+
+// Iterating over a dataset whose last component is not a single bool element
+// must fail with INVALID_ARGUMENT.
+TEST_F(FilterByLastComponentDatasetOpTest, InvalidLastComponent) {
+  int thread_num = 2;
+  int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  std::vector<TestCase> test_cases = {InvalidLastComponentShape(),
+                                      InvalidLastComponentDType()};
+  for (const TestCase &test_case : test_cases) {
+    // Build the kernel, its input dataset and the dataset under test.
+    std::unique_ptr<OpKernel> kernel;
+    TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel(
+        test_case.expected_output_dtypes, test_case.expected_output_shapes,
+        &kernel));
+
+    Tensor input_dataset(DT_VARIANT, TensorShape({}));
+    std::vector<Tensor> slice_inputs = test_case.input_tensors;
+    TF_ASSERT_OK(
+        CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset));
+    gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset});
+    std::unique_ptr<OpKernelContext> context;
+    TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext(
+        kernel.get(), &inputs, &context));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(
+        dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+    // The very first GetNext call is expected to report the invalid input.
+    std::vector<Tensor> next;
+    bool end_of_sequence = false;
+    EXPECT_EQ(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence).code(),
+        tensorflow::error::INVALID_ARGUMENT);
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 1098047..688d120 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -33,82 +34,33 @@
 
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using FilterIteratorPredicate =
-      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
-                           std::vector<Tensor>, bool*)>;
-
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
-    OP_REQUIRES_OK(
-        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
-    OP_REQUIRES(ctx, short_circuit_indices_.size() <= 1,
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(
+                            ctx, "predicate", /*params=*/{}, &func_metadata_));
+    OP_REQUIRES(ctx, func_metadata_->short_circuit_info().indices.size() <= 1,
                 errors::InvalidArgument(
                     "predicate function has more than one return value."));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
     std::unique_ptr<CapturedFunction> captured_func;
-    data::CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
-    FilterIteratorPredicate filter_pred;
-    if (short_circuit_indices_.empty()) {
-      filter_pred = [](IteratorContext* ctx,
-                       InstantiatedCapturedFunction* inst_captured_func,
-                       const std::vector<Tensor>& args, bool* out_matched) {
-        std::vector<Tensor> result;
-        TF_RETURN_IF_ERROR(
-            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
-
-        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-            result[0].NumElements() != 1) {
-          return errors::InvalidArgument(
-              "Filter predicate `f` must return a scalar bool.");
-        }
-        *out_matched = result[0].scalar<bool>()();
-        return Status::OK();
-      };
-    } else {
-      int predicate_index = short_circuit_indices_[0];
-      filter_pred = [predicate_index](
-                        IteratorContext* ctx,
-                        InstantiatedCapturedFunction* inst_captured_func,
-                        const std::vector<Tensor>& args, bool* out_matched) {
-        const Tensor& predicate = args[predicate_index];
-        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-          return errors::InvalidArgument(
-              "Filter predicate `f` must return a scalar bool.");
-        }
-        *out_matched = predicate.scalar<bool>()();
-        return Status::OK();
-      };
-    }
-
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          std::move(filter_pred));
+    *output = new Dataset(ctx, input, std::move(captured_func));
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func,
-            FilterIteratorPredicate filter_pred)
+            std::unique_ptr<CapturedFunction> captured_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
-          captured_func_(std::move(captured_func)),
-          filter_pred_(std::move(filter_pred)) {
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -140,7 +92,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -191,8 +143,20 @@
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(dataset()->filter_pred_(
-              ctx, instantiated_captured_func_.get(), *out_tensors, &matched));
+          std::vector<Tensor> result;
+          TF_RETURN_IF_ERROR(instantiated_captured_func_->RunWithBorrowedArgs(
+              ctx, *out_tensors, &result));
+
+          if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+              result[0].NumElements() != 1) {
+            // Clear the output tensor list since there were errors with Filter
+            // prediction result.
+            out_tensors->clear();
+            return errors::InvalidArgument(
+                "Filter predicate `f` must return a scalar bool.");
+          }
+          matched = result[0].scalar<bool>()();
+
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -270,15 +234,11 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const FilterIteratorPredicate filter_pred_;
   };
 
  private:
-  NameAttrList func_;
-  std::vector<int> short_circuit_indices_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FilterDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/filter_dataset_op_test.cc b/tensorflow/core/kernels/data/filter_dataset_op_test.cc
new file mode 100644
index 0000000..b145600
--- /dev/null
+++ b/tensorflow/core/kernels/data/filter_dataset_op_test.cc
@@ -0,0 +1,593 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "filter_dataset";
+constexpr char kOpName[] = "FilterDataset";
+
+class FilterDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Builds a `TensorSliceDataset` from `tensor_vector` and stores it in the
+  // variant tensor `dataset_tensor`. Any failing step's status is returned.
+  Status CreateTensorSliceDatasetTensor(
+      std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) {
+    DatasetBase *sliced_dataset;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDataset(
+        "tensor_slice_node", tensor_vector, &sliced_dataset));
+    Status stored =
+        StoreDatasetInVariantTensor(sliced_dataset, dataset_tensor);
+    return stored;
+  }
+
+  // Creates a `FilterDataset` op kernel whose "predicate" attr is `func`.
+  Status CreateFilterDatasetKernel(
+      const FunctionDefHelper::AttrValueWrapper &func,
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    // The node has a single input ("input_dataset") and empty "Targuments".
+    NodeDef filter_node_def = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset"},
+        {{"predicate", func},
+         {"Targuments", {}},
+         {"output_types", output_types},
+         {"output_shapes", output_shapes}});
+    return CreateOpKernel(filter_node_def, op_kernel);
+  }
+
+  // Checks `inputs` against `op_kernel`, then creates a new `FilterDataset`
+  // op kernel context for them.
+  Status CreateFilterDatasetContext(
+      OpKernel *const op_kernel,
+      gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    return CreateOpKernelContext(op_kernel, inputs, context);
+  }
+};
+
+struct TestCase {
+  std::vector<Tensor> input_tensors;  // Sliced into the input dataset.
+  FunctionDefHelper::AttrValueWrapper func;  // Used as the "predicate" attr.
+  std::vector<FunctionDef> func_lib;  // Given to InitFunctionLibraryRuntime.
+  std::vector<Tensor> expected_outputs;  // Compared with GetNext() results.
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;  // Compared with Cardinality().
+  std::vector<int> breakpoints;  // Iteration bounds used by Roundtrip.
+};
+
+template <typename T>
+std::vector<Tensor> ConvertToTensorVec(std::vector<T> values) {
+  std::vector<Tensor> result;  // One shape-{1} tensor per input value.
+  result.reserve(values.size());
+  for (const T &element : values) {
+    result.push_back(
+        DatasetOpsTestBase::CreateTensor<T>(TensorShape({1}), {element}));
+  }
+  return result;
+}
+
+// Test case 1: normal case -- only the three zero elements are expected.
+TestCase TestCase1() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{9, 1}, {0, 0, 0, 3, 4, 5, 6, 7, 8})},
+          /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}),
+          /*func_lib*/ {test::function::IsZero()},
+          /*expected_outputs*/
+          ConvertToTensorVec<int64>({0, 0, 0}),
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({1})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {0, 2, 6}};
+}
+
+// Test case 2: the input dataset is empty, so no outputs are expected.
+TestCase TestCase2() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})},
+          /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}),
+          /*func_lib*/ {test::function::IsZero()},
+          /*expected_outputs*/
+          ConvertToTensorVec<int64>({}),
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {0, 2, 6}};
+}
+
+// Invalid function case 1: the filter function returns two outputs.
+TestCase InvalidFuncTestCase1() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{3, 3}, {0, 0, 0, 3, 4, 5, 6, 7, 8})},
+          /*func*/
+          FunctionDefHelper::FunctionRef(
+              "GetUnique", {{"T", DT_INT64}, {"out_idx", DT_INT32}}),
+          /*func_lib*/ {test::function::Unique()},
+          /*expected_outputs*/
+          ConvertToTensorVec<int64>({}),
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({3, 1})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {}};
+}
+
+// Invalid function case 2: the filter function returns a 1-D bool tensor.
+TestCase InvalidFuncTestCase2() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{3, 3, 1}, {0, 0, 0, 3, 4, 5, 6, 7, 8})},
+          /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}),
+          /*func_lib*/ {test::function::IsZero()},
+          /*expected_outputs*/
+          ConvertToTensorVec<int64>({}),
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({3, 1})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {}};
+}
+
+// Invalid function case 3: the filter function returns a scalar int64 tensor.
+TestCase InvalidFuncTestCase3() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{9}, {0, 0, 0, 3, 4, 5, 6, 7, 8})},
+          /*func*/ FunctionDefHelper::FunctionRef("NonZero", {{"T", DT_INT64}}),
+          /*func_lib*/ {test::function::NonZero()},
+          /*expected_outputs*/
+          ConvertToTensorVec<int64>({}),
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ kUnknownCardinality,
+          /*breakpoints*/ {}};
+}
+
+class ParameterizedFilterDatasetOpTest
+    : public FilterDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};  // TEST_P fixture.
+
+TEST_P(ParameterizedFilterDatasetOpTest, GetNext) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    std::vector<Tensor> next;
+    // Assert so a persistent GetNext error cannot spin this loop forever.
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+    out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+TEST_F(FilterDatasetOpTest, DatasetNodeName) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  EXPECT_EQ(filter_dataset->node_name(), kNodeName);
+}
+
+TEST_F(FilterDatasetOpTest, DatasetTypeString) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  EXPECT_EQ(filter_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, DatasetOutputDtypes) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  TF_EXPECT_OK(VerifyTypesMatch(filter_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, DatasetOutputShapes) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  TF_EXPECT_OK(VerifyShapesCompatible(filter_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, Cardinality) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  EXPECT_EQ(filter_dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, DatasetSave) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(filter_dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, IteratorOutputDtypes) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  // Create an iterator over the filter dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, IteratorOutputShapes) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  // Create an iterator over the filter dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_F(FilterDatasetOpTest, IteratorOutputPrefix) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  // Create an iterator over the filter dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::Filter");
+}
+
+TEST_P(ParameterizedFilterDatasetOpTest, Roundtrip) {
+  // Initialize the thread pool and the function library runtime.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num));
+
+  // Build the kernel, feed it a TensorSliceDataset, and create the dataset.
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateFilterDatasetKernel(
+      test_case.func, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &kernel));
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> slice_inputs = test_case.input_tensors;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+  gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+  std::unique_ptr<OpKernelContext> context;
+  TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+  DatasetBase *filter_dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+  core::ScopedUnref scoped_unref(filter_dataset);
+
+  // Create an iterator over the filter dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  for (int breakpoint : test_case.breakpoints) {
+    // Save the iterator state and immediately restore it from that snapshot.
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *filter_dataset, &iterator));
+
+    // Keep calling GetNext until the iteration count passes the breakpoint.
+    for (; cur_iteration <= breakpoint; ++cur_iteration) {
+      std::vector<Tensor> next;
+      TF_EXPECT_OK(
+          iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+      out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+    }
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+// Runs each TEST_P of the parameterized fixture on both valid test cases.
+INSTANTIATE_TEST_SUITE_P(FilterDatasetOpTest, ParameterizedFilterDatasetOpTest,
+                         ::testing::Values(TestCase1(), TestCase2()));
+
+TEST_F(FilterDatasetOpTest, InvalidFuncs) {
+  // One function library runtime is shared by all three invalid cases.
+  const int thread_num = 2;
+  const int cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(
+      {test::function::IsZero(), test::function::Unique(),
+       test::function::NonZero()},
+      cpu_num));
+
+  std::vector<TestCase> test_cases(
+      {InvalidFuncTestCase1(), InvalidFuncTestCase2(), InvalidFuncTestCase3()});
+  for (const auto &test_case : test_cases) {
+    // Build a FilterDataset over the sliced input for this test case.
+    std::unique_ptr<OpKernel> kernel;
+    TF_ASSERT_OK(CreateFilterDatasetKernel(
+        test_case.func, test_case.expected_output_dtypes,
+        test_case.expected_output_shapes, &kernel));
+    Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+    std::vector<Tensor> slice_inputs = test_case.input_tensors;
+    TF_ASSERT_OK(
+        CreateTensorSliceDatasetTensor(&slice_inputs, &input_dataset_tensor));
+    gtl::InlinedVector<TensorValue, 4> inputs({&input_dataset_tensor});
+    std::unique_ptr<OpKernelContext> context;
+    TF_ASSERT_OK(CreateFilterDatasetContext(kernel.get(), &inputs, &context));
+    DatasetBase *filter_dataset;
+    TF_ASSERT_OK(CreateDataset(kernel.get(), context.get(), &filter_dataset));
+    core::ScopedUnref scoped_unref(filter_dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(CreateIteratorContext(context.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator",
+                                              &iterator));
+
+    // The first GetNext must fail and leave `out_tensors` empty.
+    bool end_of_sequence = false;
+    std::vector<Tensor> out_tensors;
+    EXPECT_EQ(
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)
+            .code(),
+        tensorflow::error::INVALID_ARGUMENT);
+    EXPECT_TRUE(out_tensors.empty());
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index e93771f..d3e571a 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -32,38 +32,31 @@
   explicit FlatMapDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", /*params=*/{},
+                                                 &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
     std::unique_ptr<CapturedFunction> captured_func;
-    data::CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_);
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
+    *output = new Dataset(ctx, input, std::move(captured_func), output_types_,
+                          output_shapes_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
           output_shapes_(output_shapes) {
@@ -99,7 +92,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -255,7 +248,6 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
@@ -264,8 +256,7 @@
   const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FlatMapDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 8864ce4..a3c39a4 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -154,47 +154,34 @@
 };
 
 GeneratorDatasetOp::GeneratorDatasetOp(OpKernelConstruction* ctx)
-    : DatasetOpKernel(ctx),
-      lib_def_(std::make_shared<FunctionLibraryDefinition>(
-          ctx->function_library()
-              ->GetFunctionLibraryDefinition()
-              ->default_registry(),
-          FunctionDefLibrary{})) {
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("next_func", &next_func_));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("finalize_func", &finalize_func_));
+    : DatasetOpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "init_func", /*params=*/{},
+                                               &init_func_metadata_));
+  OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "next_func", /*params=*/{},
+                                               &next_func_metadata_));
+  OP_REQUIRES_OK(ctx,
+                 FunctionMetadata::Create(ctx, "finalize_func", /*params=*/{},
+                                          &finalize_func_metadata_));
   OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
   OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-
-  for (const auto& func : {init_func_, next_func_, finalize_func_}) {
-    std::shared_ptr<FunctionLibraryDefinition> result;
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func.name(), &result));
-    OP_REQUIRES_OK(ctx, lib_def_->AddLibrary(*result));
-  }
 }
 
 void GeneratorDatasetOp::MakeDataset(OpKernelContext* ctx,
                                      DatasetBase** output) {
-  CapturedFunction::Params params;
-  params.lib_def = lib_def_;
-
   std::unique_ptr<CapturedFunction> init_func;
-  OP_REQUIRES_OK(
-      ctx, CapturedFunction::Create(init_func_, ctx, "init_func_other_args",
-                                    params, &init_func));
+  OP_REQUIRES_OK(ctx,
+                 CapturedFunction::Create(ctx, init_func_metadata_,
+                                          "init_func_other_args", &init_func));
 
   std::unique_ptr<CapturedFunction> next_func;
-  OP_REQUIRES_OK(
-      ctx, CapturedFunction::Create(next_func_, ctx, "next_func_other_args",
-                                    params, &next_func));
+  OP_REQUIRES_OK(ctx,
+                 CapturedFunction::Create(ctx, next_func_metadata_,
+                                          "next_func_other_args", &next_func));
 
   std::unique_ptr<CapturedFunction> finalize_func;
-  OP_REQUIRES_OK(ctx, CapturedFunction::Create(finalize_func_, ctx,
+  OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, finalize_func_metadata_,
                                                "finalize_func_other_args",
-                                               params, &finalize_func));
+                                               &finalize_func));
 
   *output =
       new Dataset(ctx, std::move(init_func), std::move(next_func),
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.h b/tensorflow/core/kernels/data/generator_dataset_op.h
index f2a9c29..951440ee 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.h
+++ b/tensorflow/core/kernels/data/generator_dataset_op.h
@@ -17,6 +17,7 @@
 #define TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_
 
 #include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
 
 namespace tensorflow {
 namespace data {
@@ -32,10 +33,9 @@
 
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList init_func_;
-  NameAttrList next_func_;
-  NameAttrList finalize_func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  std::shared_ptr<FunctionMetadata> init_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> next_func_metadata_ = nullptr;
+  std::shared_ptr<FunctionMetadata> finalize_func_metadata_ = nullptr;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
deleted file mode 100644
index fd91bdc..0000000
--- a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
-#include <memory>
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
-#include "tensorflow/core/protobuf/rewriter_config.pb.h"
-
-namespace tensorflow {
-namespace data {
-
-GraphRewriteDataset::~GraphRewriteDataset() {
-  input_->Unref();
-  if (optimized_input_) {
-    optimized_input_->Unref();
-  }
-}
-
-Status GraphRewriteDataset::Optimize(OpKernelContext* ctx) {
-  GraphDefBuilder b;
-  DatasetGraphDefBuilder db(&b);
-  Node* input_node = nullptr;
-  SerializationContext::Params params;
-  std::vector<std::pair<string, Tensor>> input_list;
-  params.input_list = &input_list;
-  params.optimization_only = true;
-  SerializationContext serialization_ctx(params);
-  TF_RETURN_IF_ERROR(
-      db.AddInputDataset(&serialization_ctx, input_, &input_node));
-  string output_node = input_node->name();
-
-  GraphDef graph_def;
-  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-  VLOG(3) << "Before optimization: " << graph_def.DebugString();
-
-  TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
-  VLOG(3) << "After optimization: " << graph_def.DebugString();
-
-  // Instantiate the optimized input pipeline by running the optimized graph
-  // using the optimized function library.
-  TF_RETURN_IF_ERROR(
-      ctx->function_library()->Clone(&lib_def_, &pflr_, &flr_, true));
-
-  // Create a FunctionHandleCache.
-  function_handle_cache_ = absl::make_unique<FunctionHandleCache>(flr_);
-
-  // Some functions may have been modified without having their names
-  // changed (for example, nested dataset graphs from FlatMap or
-  // Interleave).
-  TF_RETURN_IF_ERROR(AddToFunctionLibrary(lib_def_.get(), graph_def.library()));
-
-  Graph graph(OpRegistry::Global());
-  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
-  std::vector<Tensor> outputs;
-  GraphRunner graph_runner(flr_->device());
-
-  TF_RETURN_IF_ERROR(
-      graph_runner.Run(&graph, flr_, input_list, {output_node}, &outputs));
-  TF_RETURN_IF_ERROR(
-      GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
-  optimized_input_->Ref();
-  return Status::OK();
-}
-
-Status GraphRewriteDataset::AsGraphDefInternal(SerializationContext* ctx,
-                                               DatasetGraphDefBuilder* b,
-                                               Node** output) const {
-  // We only serialize the optimized dataset to avoid re-running optimizations
-  // when the input pipeline is restored from a checkpoint.
-  TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, optimized_input_, output));
-  return Status::OK();
-}
-
-namespace {
-void AddFakeSinks(FunctionDef* function_def) {
-  int counter = 0;
-  for (const auto& output : function_def->signature().output_arg()) {
-    NodeDef* node = function_def->add_node_def();
-    tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
-        strings::StrCat("FakeSink", counter++), function_def, node);
-    node->set_op("Identity");
-    node->add_input(function_def->ret().at(output.name()));
-    (*node->mutable_attr())["T"].set_type(output.type());
-
-    (*function_def->mutable_ret())[output.name()] =
-        strings::StrCat(node->name(), ":output:0");
-  }
-}
-
-void RemoveFakeSinks(FunctionDef* function_def) {
-  // Map from identity node names to their input tensor strings
-  std::map<string, string> identity_map;
-  for (const auto& node : function_def->node_def()) {
-    if (node.op() == "Identity" && node.input_size() == 1) {
-      identity_map[node.name()] = node.input(0);
-    }
-  }
-  for (const auto& output_arg : function_def->signature().output_arg()) {
-    const string& tensor = function_def->ret().at(output_arg.name());
-    const string& output_node = tensor.substr(0, tensor.find(':'));
-    if (identity_map.find(output_node) != identity_map.end()) {
-      (*function_def->mutable_ret())[output_arg.name()] =
-          identity_map.at(output_node);
-    }
-  }
-}
-}  // anonymous namespace
-
-Status GraphRewriteDataset::ApplyOptimizations(OpKernelContext* ctx,
-                                               GraphDef* graph_def,
-                                               string* output_node) {
-  // Add an identity node as the fetch node, otherwise we might get 'placeholder
-  // is both fed and fetched' errors in some cases when using input list with
-  // placeholder dataset nodes.
-  NodeDef* node = graph_def->mutable_node()->Add();
-  tensorflow::grappler::graph_utils::SetUniqueGraphNodeName("Sink", graph_def,
-                                                            node);
-  node->set_op("Identity");
-  node->add_input(*output_node);
-  (*node->mutable_attr())["T"].set_type(DT_VARIANT);
-  *output_node = node->name();
-
-  // Add fake sink node to graph and functions to allow rewriting the actual
-  // sink nodes.
-  //
-  // TODO(b/118820916): When MetaOptimizer adds provisions for function retvals
-  // to be optimizable, we will no longer need this.
-  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
-    AddFakeSinks(&function_def);
-  }
-
-  // Create metagraph.
-  MetaGraphDef meta_graph_def;
-  (*meta_graph_def.mutable_graph_def()) = *graph_def;
-
-  // Grappler determines fetch ops from collection 'train_op'.
-  CollectionDef collection_def;
-  auto node_list = collection_def.mutable_node_list();
-  node_list->add_value(*output_node);
-  (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
-
-  // Create Grappler item.
-  tensorflow::grappler::ItemConfig item_config;
-  item_config.apply_optimizations = true;
-  std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
-      tensorflow::grappler::GrapplerItemFromMetaGraphDef(
-          "graph", meta_graph_def, item_config);
-  grappler_item->optimization_options().optimize_function_library =
-      ShouldOptimizeFunctions();
-  std::unordered_map<string, tensorflow::DeviceProperties> device_map;
-  tensorflow::grappler::VirtualCluster cluster(device_map);
-
-  // Run data optimizer using grappler's meta optimizer.
-  tensorflow::ConfigProto config;
-  *config.mutable_graph_options()->mutable_rewrite_options() =
-      CreateGrapplerRewriteConfig();
-  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-      *grappler_item, config, ctx->device(), &cluster, graph_def));
-
-  // Remove fake sinks after optimizations are done.
-  //
-  // TODO(b/118820916): When MetaOptimizer adds provisions for function retvals
-  // to be optimizable, we will no longer need this.
-  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
-    RemoveFakeSinks(&function_def);
-  }
-
-  return Status::OK();
-}
-
-class GraphRewriteDataset::Iterator
-    : public DatasetIterator<GraphRewriteDataset> {
- public:
-  explicit Iterator(const Params& params)
-      : DatasetIterator<GraphRewriteDataset>(params) {}
-
-  Status Initialize(IteratorContext* ctx) override {
-    IteratorContext::Params params(ctx);
-    params.flr = dataset()->flr_;
-    params.function_handle_cache = dataset()->function_handle_cache_.get();
-    return dataset()->optimized_input_->MakeIterator(
-        IteratorContext(std::move(params)), prefix(), &input_impl_);
-  }
-
-  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                         bool* end_of_sequence) override {
-    IteratorContext::Params params(ctx);
-    params.flr = dataset()->flr_;
-    params.function_handle_cache = dataset()->function_handle_cache_.get();
-    return input_impl_->GetNext(IteratorContext(std::move(params)), out_tensors,
-                                end_of_sequence);
-  }
-
- protected:
-  std::shared_ptr<model::Node> CreateNode(
-      IteratorContext* ctx, model::Node::Args args) const override {
-    return model::MakeKnownRatioNode(std::move(args),
-                                     /*ratio=*/1);
-  }
-
-  Status SaveInternal(IteratorStateWriter* writer) override {
-    TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-    return Status::OK();
-  }
-
-  Status RestoreInternal(IteratorContext* ctx,
-                         IteratorStateReader* reader) override {
-    TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-    return Status::OK();
-  }
-
- private:
-  std::unique_ptr<IteratorBase> input_impl_;
-};
-
-std::unique_ptr<IteratorBase> GraphRewriteDataset::MakeIteratorInternal(
-    const string& prefix) const {
-  // We do not add a token for this dataset to the prefix. The
-  // prefix is used to identify checkpoint elements and since this
-  // dataset is excluded from the checkpoint, adding a token
-  // here would result in invalid checkpoint identifiers.
-  return absl::make_unique<Iterator>(Iterator::Params{this, prefix});
-}
-
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.h b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
deleted file mode 100644
index 24f09b4..0000000
--- a/tensorflow/core/kernels/data/graph_rewrite_dataset.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
-
-#include "tensorflow/core/common_runtime/graph_runner.h"
-#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/function_handle_cache.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/grappler_item_builder.h"
-#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-
-namespace tensorflow {
-namespace data {
-
-class GraphRewriteDataset : public DatasetBase {
- public:
-  GraphRewriteDataset(OpKernelContext* ctx, const DatasetBase* input,
-                      const DataTypeVector& output_types,
-                      const std::vector<PartialTensorShape>& output_shapes)
-      : DatasetBase(DatasetContext(ctx)),
-        optimized_input_(nullptr),
-        input_(input),
-        output_types_(output_types),
-        output_shapes_(output_shapes) {
-    input_->Ref();
-  }
-
-  ~GraphRewriteDataset() override;
-
-  // Runs Grappler to transform the input dataset into optimized_input_
-  // dataset.
-  Status Optimize(OpKernelContext* ctx);
-
-  std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override;
-
-  const DataTypeVector& output_dtypes() const override { return output_types_; }
-
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return output_shapes_;
-  }
-
-  int64 Cardinality() const override { return input_->Cardinality(); }
-
- protected:
-  Status AsGraphDefInternal(SerializationContext* ctx,
-                            DatasetGraphDefBuilder* b,
-                            Node** output) const override;
-
- private:
-  class Iterator;
-
-  // Create a Grappler RewriteConfig proto that defines the list of
-  // optimizations to be run by the Grappler Meta Optimizer.
-  virtual RewriterConfig CreateGrapplerRewriteConfig() = 0;
-
-  // Option specifying whether we want to optimize the function library as well.
-  virtual bool ShouldOptimizeFunctions() { return true; }
-
-  Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
-                            string* output_node);
-
-  DatasetBase* optimized_input_;
-  FunctionLibraryRuntime* flr_ = nullptr;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
-  std::unique_ptr<FunctionLibraryDefinition> lib_def_ = nullptr;
-  std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
-  const DatasetBase* input_;
-  const DataTypeVector output_types_;
-  const std::vector<PartialTensorShape> output_shapes_;
-};
-
-}  // namespace data
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 01f3410..0dcdf19 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -32,13 +32,10 @@
   explicit InterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", /*params=*/{},
+                                                 &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -62,28 +59,23 @@
         errors::InvalidArgument("block_length must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
-    *output =
-        new Dataset(ctx, input, func_, std::move(captured_func), cycle_length,
-                    block_length, output_types_, output_shapes_);
+    *output = new Dataset(ctx, input, std::move(captured_func), cycle_length,
+                          block_length, output_types_, output_shapes_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -126,7 +118,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -311,7 +303,6 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
@@ -322,8 +313,7 @@
   const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
 };
 
 REGISTER_KERNEL_BUILDER(Name("InterleaveDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index deeab5c..8a37dac 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -237,6 +237,8 @@
   // destroyed, essentially triggering the iterator deletion.
   class Deleter {
    public:
+    Deleter() : deleter_() {}
+
     Deleter(ResourceHandle handle, ResourceMgr* resource_manager)
         : deleter_(std::make_shared<Helper>(handle, resource_manager)) {}
 
@@ -248,6 +250,10 @@
       VLOG(3) << "IteratorResource::Deleter copy constructor called.";
     }
 
+    Deleter& operator=(const Deleter& rhs) = delete;
+
+    Deleter& operator=(Deleter&& rhs) = default;
+
     virtual ~Deleter() {
       VLOG(3) << "IteratorResource::Deleter destructor called.";
     }
@@ -358,6 +364,9 @@
       Decode(*other.data_);
     }
   }
+  IteratorStateVariant& operator=(IteratorStateVariant&& other) = default;
+  IteratorStateVariant& operator=(const IteratorStateVariant& other) = delete;
+
   // Initializes this object with the current state of the iterator so
   // that it can be written on the next call to Encode().
   Status InitializeFromIterator(OpKernelContext* ctx,
@@ -681,15 +690,16 @@
   explicit ReduceDatasetOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
         background_worker_(ctx->env(), "tf_data_reduce_dataset") {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &reduce_func_));
+    bool use_inter_op_parallelism;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
+                                     &use_inter_op_parallelism));
+    FunctionMetadata::Params params;
+    params.is_multi_device_function = true;
+    params.use_inter_op_parallelism = use_inter_op_parallelism;
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "f", params, &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
-                                     &use_inter_op_parallelism_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       reduce_func_.name(), &lib_def_));
   }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
@@ -706,14 +716,10 @@
       std::vector<Tensor> state(inputs.begin(), inputs.end());
 
       std::unique_ptr<CapturedFunction> captured_func;
-      CapturedFunction::Params fn_params;
-      fn_params.use_inter_op_parallelism = use_inter_op_parallelism_;
-      fn_params.is_multi_device_function = true;
-      fn_params.lib_def = lib_def_;
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          CapturedFunction::Create(reduce_func_, ctx, "other_arguments",
-                                   fn_params, &captured_func),
+          CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                   &captured_func),
           done);
 
       IteratorContext::Params params(ctx);
@@ -795,12 +801,10 @@
   }
 
  private:
-  NameAttrList reduce_func_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  bool use_inter_op_parallelism_;
   BackgroundWorker background_worker_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 class OneShotIteratorOp : public AsyncOpKernel {
@@ -921,12 +925,6 @@
         &f_handle));
     FunctionLibraryRuntime::Options opts;
     opts.cancellation_manager = ctx->cancellation_manager();
-    // Choose a step ID that is guaranteed not to clash with any
-    // Session-generated step ID. DirectSession only generates
-    // non-negative step IDs (contiguous, starting from 0), and
-    // MasterSession generates 56-bit random step IDs whose MSB is
-    // always 0, so a negative random step ID should suffice.
-    opts.step_id = -std::abs(static_cast<int64>(random::New64()));
     ScopedStepContainer step_container(opts.step_id, [ctx](const string& name) {
       ctx->resource_manager()->Cleanup(name).IgnoreError();
     });
@@ -1226,8 +1224,9 @@
     MakeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_CPU).Priority(2),
                         DeleteIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_GPU).Priority(1),
-                        DeleteIteratorOp);
+REGISTER_KERNEL_BUILDER(
+    Name("DeleteIterator").Device(DEVICE_GPU).HostMemory("deleter").Priority(1),
+    DeleteIteratorOp);
 REGISTER_KERNEL_BUILDER(
     Name("AnonymousIterator").Device(DEVICE_CPU).Priority(2),
     AnonymousIteratorHandleOp);
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 7acea9a..68b1565 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -34,93 +34,42 @@
                            std::vector<Tensor>, std::vector<Tensor>*)>;
 
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    FunctionMetadata::Params params;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
+                                     &params.use_inter_op_parallelism));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "f", params, &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
-                                     &use_inter_op_parallelism_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
-
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
-
-    OP_REQUIRES_OK(
-        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
     std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.use_inter_op_parallelism = use_inter_op_parallelism_;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
-    MapIteratorFunction map_func;
-    CapturedFunction* raw_captured_func = captured_func.get();
-    if (short_circuit_indices_.empty()) {
-      map_func = [](IteratorContext* ctx,
-                    InstantiatedCapturedFunction* inst_captured_func,
-                    std::vector<Tensor> args,
-                    std::vector<Tensor>* out_tensors) {
-        return inst_captured_func->Run(ctx, std::move(args), out_tensors);
-      };
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(short_circuit_indices_);
-      const auto& indices = short_circuit_indices_;
-      map_func = [raw_captured_func, indices, can_move](
-                     IteratorContext* ctx,
-                     InstantiatedCapturedFunction* inst_captured_func,
-                     std::vector<Tensor> args,
-                     std::vector<Tensor>* out_tensors) {
-        const std::vector<Tensor>& captured_inputs =
-            raw_captured_func->captured_inputs();
-        size_t num_args = args.size();
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (indices[i] < num_args) {
-            if (can_move[i]) {
-              out_tensors->push_back(std::move(args[indices[i]]));
-            } else {
-              out_tensors->push_back(args[indices[i]]);
-            }
-          } else {
-            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
-          }
-        }
-        return Status::OK();
-      };
-    }
-
-    *output =
-        new Dataset(ctx, input, func_, std::move(captured_func), output_types_,
-                    output_shapes_, use_inter_op_parallelism_,
-                    std::move(map_func), preserve_cardinality_);
+    *output = new Dataset(ctx, input, std::move(captured_func), output_types_,
+                          output_shapes_, preserve_cardinality_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            bool use_inter_op_parallelism, MapIteratorFunction map_func,
             bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
-          use_inter_op_parallelism_(use_inter_op_parallelism),
           preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
-          output_shapes_(output_shapes),
-          map_func_(std::move(map_func)) {
+          output_shapes_(output_shapes) {
       input_->Ref();
     }
 
@@ -129,7 +78,7 @@
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
+          Iterator::Params{this, strings::StrCat(prefix, "::Map")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -157,7 +106,7 @@
 
       // Attr: f
       AttrValue f_attr;
-      b->BuildAttrValue(func_, &f_attr);
+      b->BuildAttrValue(captured_func_->func(), &f_attr);
 
       // Attr: Targuments
       AttrValue other_arguments_types_attr;
@@ -165,7 +114,7 @@
 
       // Attr: use_inter_op_parallelism
       AttrValue use_inter_op_parallelism_attr;
-      b->BuildAttrValue(use_inter_op_parallelism_,
+      b->BuildAttrValue(captured_func_->use_inter_op_parallelism(),
                         &use_inter_op_parallelism_attr);
 
       // Attr: preserve_cardinality
@@ -188,8 +137,8 @@
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params, MapIteratorFunction map_func)
-          : DatasetIterator<Dataset>(params), map_func_(std::move(map_func)) {}
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
@@ -212,8 +161,8 @@
           return Status::OK();
         }
 
-        Status s = map_func_(ctx, instantiated_captured_func_.get(), args,
-                             out_tensors);
+        Status s =
+            instantiated_captured_func_->Run(ctx, std::move(args), out_tensors);
         if (errors::IsOutOfRange(s)) {
           if (dataset()->preserve_cardinality_) {
             // To guarantee that the transformation preserves the cardinality of
@@ -253,27 +202,20 @@
 
      private:
       std::unique_ptr<IteratorBase> input_impl_;
-      const MapIteratorFunction map_func_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
-    const bool use_inter_op_parallelism_;
     const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
-    const MapIteratorFunction map_func_;
   };
 
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-  bool use_inter_op_parallelism_;
   bool preserve_cardinality_;
-  std::vector<int> short_circuit_indices_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 1577e77..cae0fac 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -250,7 +250,6 @@
   void SetRunOptions(OpKernelContext* ctx,
                      FunctionLibraryRuntime::Options* opts,
                      ComputeOptions* compute_opts, bool always_collect_stats) {
-    opts->step_id = ctx->step_id();
     opts->rendezvous = ctx->rendezvous();
     if (always_collect_stats) {
       opts->stats_collector = ctx->stats_collector();
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 5064940..896e080 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -17,7 +17,7 @@
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
@@ -32,12 +32,9 @@
 class OptimizeDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit OptimizeDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetAttr("optimization_configs", &optimizer_configs_));
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("optimization_configs", &optimization_configs_));
   }
 
  protected:
@@ -46,62 +43,41 @@
     std::vector<string> optimizations;
     OP_REQUIRES_OK(
         ctx, ParseVectorArgument<string>(ctx, "optimizations", &optimizations));
-    Dataset* dataset = new Dataset(ctx, input, optimizations, output_types_,
-                                   output_shapes_, optimizer_configs_);
-    Status s = dataset->Optimize(ctx);
-    if (s.ok()) {
-      *output = dataset;
-    } else {
-      dataset->Unref();
-      OP_REQUIRES_OK(ctx, s);
-    }
+
+    auto config_factory = [this, &optimizations]() {
+      return CreateConfig(optimizations, optimization_configs_);
+    };
+    OP_REQUIRES_OK(ctx,
+                   RewriteDataset(ctx, input, std::move(config_factory),
+                                  /*optimize_function_library=*/true, output));
   }
 
  private:
-  class Dataset : public GraphRewriteDataset {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const std::vector<string>& optimizations,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            const std::vector<string>& optimizer_configs)
-        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
-          optimizations_(optimizations),
-          optimizer_configs_(optimizer_configs) {}
-
-    string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
-
-   private:
-    RewriterConfig CreateGrapplerRewriteConfig() override {
-      RewriterConfig rewriter_config;
-      rewriter_config.add_optimizers(kOptimizerName);
-      rewriter_config.set_meta_optimizer_iterations(
-          RewriterConfig_NumIterationsType_ONE);
-      auto custom_optimizer = rewriter_config.add_custom_optimizers();
-      custom_optimizer->set_name(kOptimizerName);
-      auto* custom_optimizations_list =
-          (*custom_optimizer->mutable_parameter_map())["optimizers"]
-              .mutable_list();
-      for (const auto& opt : optimizations_) {
-        custom_optimizations_list->add_s(opt);
-      }
-      auto* config_list =
-          (*custom_optimizer->mutable_parameter_map())["optimizer_configs"]
-              .mutable_list();
-      for (const auto& config : optimizer_configs_) {
-        config_list->add_s(config);
-      }
-      return rewriter_config;
+  // Returns a RewriterConfig that enables `kOptimizerName` for one meta-optimizer
+  // iteration, passing `optimizations` as its "optimizers" parameter and
+  // `optimizations_configs` as its "optimizer_configs" parameter.
+  static RewriterConfig CreateConfig(
+      const std::vector<string>& optimizations,
+      const std::vector<string>& optimizations_configs) {
+    RewriterConfig rewriter_config;
+    rewriter_config.add_optimizers(kOptimizerName);
+    rewriter_config.set_meta_optimizer_iterations(
+        RewriterConfig_NumIterationsType_ONE);
+    auto custom_optimizer = rewriter_config.add_custom_optimizers();
+    custom_optimizer->set_name(kOptimizerName);
+    auto& parameter_map = *custom_optimizer->mutable_parameter_map();
+    auto* optimizers_list = parameter_map["optimizers"].mutable_list();
+    for (const auto& opt : optimizations) {
+      optimizers_list->add_s(opt);
     }
+    auto* config_list = parameter_map["optimizer_configs"].mutable_list();
+    for (const auto& config : optimizations_configs) {
+      config_list->add_s(config);
+    }
+    return rewriter_config;
+  }
 
-    const std::vector<string> optimizations_;
-    const std::vector<string> optimizer_configs_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  std::vector<string> optimizer_configs_;
+  std::vector<string> optimization_configs_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("OptimizeDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 41ea362..8086253 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -13,9 +13,14 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -29,7 +34,11 @@
  public:
   explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
-        op_version_(ctx->def().op() == "PaddedBatchDataset" ? 1 : 2) {}
+        op_version_(ctx->def().op() == "PaddedBatchDataset" ? 1 : 2) {
+    if (ctx->HasAttr("parallel_copy")) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("parallel_copy", &parallel_copy_));
+    }
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
@@ -93,31 +102,32 @@
     }
 
     *output =
-        new Dataset(ctx, batch_size, drop_remainder, std::move(padded_shapes),
-                    std::move(padding_values), input);
+        new Dataset(ctx, batch_size, drop_remainder, parallel_copy_,
+                    std::move(padded_shapes), std::move(padding_values), input);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
-            std::vector<PartialTensorShape> padded_shapes,
+            bool parallel_copy, std::vector<PartialTensorShape> padded_shapes,
             std::vector<Tensor> padding_values, const DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)),
           batch_size_(batch_size),
           drop_remainder_(drop_remainder),
+          parallel_copy_(parallel_copy),
           padded_shapes_(std::move(padded_shapes)),
           padding_values_(std::move(padding_values)),
           input_(input) {
       input_->Ref();
 
-      // NOTE(mrry): Currently we implement "batch up to"
-      // semantics. If we could tell statically that the input dataset
-      // is infinite, then we could always report `batch_size` as the
-      // 0th dimension.
-      // TODO(mrry): Need to validate that the input shape and the
-      // padded shape are "compatible" (i.e. that padded shape is >=
-      // input shape, with both static and dynamic checks as appropriate).
+      // NOTE(mrry): Currently we implement "batch up to" semantics. If we could
+      // tell statically that the input dataset is infinite, then we could
+      // always report `batch_size` as the 0th dimension.
+      //
+      // TODO(mrry): Need to validate that the input shape and the padded shape
+      // are "compatible" (i.e. that padded shape is >= input shape, with both
+      // static and dynamic checks as appropriate).
       const auto& input_shapes = input_->output_shapes();
       output_shapes_.reserve(input_shapes.size());
       for (size_t i = 0; i < input_shapes.size(); ++i) {
@@ -193,6 +203,9 @@
       Node* drop_remainder = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder));
 
+      AttrValue parallel_copy;
+      b->BuildAttrValue(parallel_copy_, &parallel_copy);
+
       AttrValue output_types;
       b->BuildAttrValue(output_dtypes(), &output_types);
 
@@ -202,14 +215,14 @@
       TF_RETURN_IF_ERROR(b->AddDataset(
           this, {{0, input_graph_node}, {1, batch_size}, {4, drop_remainder}},
           {{2, padded_shapes}, {3, padding_values}},
-          {{"Toutput_types", output_types}, {"N", N}}, output));
+          {{"parallel_copy", parallel_copy},
+           {"Toutput_types", output_types},
+           {"N", N}},
+          output));
       return Status::OK();
     }
 
    private:
-    // Copies element into the index^th slice of parent (in the 0th dimension).
-    //
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -259,13 +272,14 @@
           return Status::OK();
         }
 
-        // Copy the retrieved batch elements into one output tensor
-        // per tuple component.
-        // NOTE(mrry): If the input or output sizes are statically
-        // known, we could potentially read the input values in-place
-        // into their respective slice locations. This would require a
-        // different GetNext() overload that supports zero-copy, and might
-        // make sense in an optimization pass.
+        // Copy the retrieved batch elements into one output tensor per tuple
+        // component.
+        //
+        // NOTE(mrry): If the input or output sizes are statically known, we
+        // could potentially read the input values in-place into their
+        // respective slice locations. This would require a different GetNext()
+        // overload that supports zero-copy, and might make sense in an
+        // optimization pass.
         const size_t num_tuple_components = batch_elements[0].size();
         const int64 num_batch_elements = batch_elements.size();
         for (size_t component_index = 0; component_index < num_tuple_components;
@@ -330,16 +344,43 @@
           for (int i = 1; i < batch_component_shape.dims(); ++i) {
             component_shape.AddDim(batch_component_shape.dim_size(i));
           }
-          for (int64 i = 0; i < num_batch_elements; ++i) {
+          auto copy_element_fn = [component_index, &batch_elements,
+                                  &batch_component,
+                                  &component_shape](int64 index) {
             // Take the fast path if possible.
-            if (batch_elements[i][component_index].shape() == component_shape) {
+            if (batch_elements[index][component_index].shape() ==
+                component_shape) {
               TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
-                  batch_elements[i][component_index], &batch_component, i));
+                  batch_elements[index][component_index], &batch_component,
+                  index));
             } else {
               TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
-                  batch_elements[i][component_index], &batch_component, i));
+                  batch_elements[index][component_index], &batch_component,
+                  index));
+            }
+            return Status::OK();
+          };
+          BlockingCounter counter(num_batch_elements);  // one per element
+          Status status;  // written under status_mu by scheduled copies
+          mutex status_mu;
+          for (int64 i = 0; i < num_batch_elements; ++i) {
+            if (TF_PREDICT_FALSE(dataset()->parallel_copy_)) {
+              (*ctx->runner())(
+                  [i, &status, &status_mu, &counter, &copy_element_fn]() {
+                    Status s = copy_element_fn(i);
+                    {
+                      mutex_lock l(status_mu);
+                      status.Update(s);
+                    }
+                    counter.DecrementCount();
+                  });
+            } else {
+              status.Update(copy_element_fn(i));
+              counter.DecrementCount();
             }
           }
+          counter.Wait();  // captured locals must outlive scheduled copies
+          TF_RETURN_IF_ERROR(status);
         }
         *end_of_sequence = false;
         return Status::OK();
@@ -381,6 +422,7 @@
 
     const int64 batch_size_;
     const bool drop_remainder_;
+    const bool parallel_copy_;
     const std::vector<PartialTensorShape> padded_shapes_;
     const std::vector<Tensor> padding_values_;
     const DatasetBase* const input_;
@@ -388,6 +430,7 @@
   };
 
   const int op_version_;
+  bool parallel_copy_ = false;
 };
 
 REGISTER_KERNEL_BUILDER(Name("PaddedBatchDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op_test.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op_test.cc
new file mode 100644
index 0000000..89d42d6
--- /dev/null
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op_test.cc
@@ -0,0 +1,1246 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "padded_batch_datasetv2";
+constexpr char kOpName[] = "PaddedBatchDatasetV2";
+
+class PaddedBatchDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a `ConcatenateDataset` variant tensor from the input vector of
+  // tensor vectors.
+  Status CreateConcatenateDatasetTensor(
+      const std::vector<std::vector<Tensor>> &tensor_vectors,
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      Tensor *concatenate_dataset_tensor) {
+    // Create one `TensorSliceDataset` tensor per input vector; they become the
+    // inputs of `ConcatenateDataset` (whose node below declares two inputs).
+    std::vector<Tensor> tensor_slice_dataset_tensors;
+    for (size_t i = 0; i < tensor_vectors.size(); ++i) {
+      std::vector<Tensor> tensors = tensor_vectors[i];
+      DatasetBase *tensor_slice_dataset;
+      TF_RETURN_IF_ERROR(
+          CreateTensorSliceDataset(strings::StrCat("tensor_slice_node_", i),
+                                   &tensors, &tensor_slice_dataset));
+      Tensor dataset_tensor(DT_VARIANT, TensorShape({}));
+      TF_RETURN_IF_ERROR(
+          StoreDatasetInVariantTensor(tensor_slice_dataset, &dataset_tensor));
+      tensor_slice_dataset_tensors.emplace_back(std::move(dataset_tensor));
+    }
+
+    // Create a `ConcatenateDataset` dataset.
+    std::unique_ptr<OpKernel> concatenate_dataset_op_kernel;
+    NodeDef concatenate_node_def = test::function::NDef(
+        "concatenate_dataset", "ConcatenateDataset",
+        {"input_dataset", "another_dataset"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(
+        CreateOpKernel(concatenate_node_def, &concatenate_dataset_op_kernel));
+
+    gtl::InlinedVector<TensorValue, 4> concatenate_dataset_inputs;
+    for (auto &tensor : tensor_slice_dataset_tensors) {
+      concatenate_dataset_inputs.emplace_back(&tensor);
+    }
+
+    std::unique_ptr<OpKernelContext> concatenate_dataset_op_context;
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*concatenate_dataset_op_kernel,
+                                          concatenate_dataset_inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(
+        concatenate_dataset_op_kernel.get(), &concatenate_dataset_inputs,
+        &concatenate_dataset_op_context));
+    DatasetBase *concatenate_dataset;
+    TF_RETURN_IF_ERROR(CreateDataset(concatenate_dataset_op_kernel.get(),
+                                     concatenate_dataset_op_context.get(),
+                                     &concatenate_dataset));
+
+    // Store the `ConcatenateDataset` dataset in a tensor.
+    TF_RETURN_IF_ERROR(StoreDatasetInVariantTensor(concatenate_dataset,
+                                                   concatenate_dataset_tensor));
+    return Status::OK();
+  }
+
+  // Creates a `PaddedBatchDatasetV2` op kernel with `n` padded_shapes inputs.
+  Status CreatePaddedBatchDatasetKernel(
+      bool parallel_copy, int n, const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    std::vector<string> inputs({"input_dataset", "batch_size"});
+    // Create the placeholder names for the input padded_shapes.
+    for (int i = 0; i < n; ++i) {
+      inputs.emplace_back(strings::StrCat("padded_shapes_", i));
+    }
+    // Create the placeholder names for the input padding_values.
+    for (size_t j = 0; j < output_types.size(); ++j) {
+      inputs.emplace_back(strings::StrCat("padding_values_", j));
+    }
+    inputs.emplace_back("drop_remainder");
+
+    NodeDef node_def = test::function::NDef(kNodeName, kOpName, inputs,
+                                            {{"parallel_copy", parallel_copy},
+                                             {"Toutput_types", output_types},
+                                             {"output_shapes", output_shapes},
+                                             {"N", n}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new `PaddedBatchDataset` op kernel context.
+  Status CreatePaddedBatchDatasetContext(
+      OpKernel *const op_kernel,
+      gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+};
+
+struct TestCase {  // field order matters: test cases use positional init
+  // One tensor list per `TensorSliceDataset`; those datasets are the inputs of
+  // `ConcatenateDataset`, whose output in turn is the input of
+  // `PaddedBatchDataset`.
+  std::vector<std::vector<Tensor>> input_tensors;
+  DataTypeVector concatenate_output_dtypes;
+  std::vector<PartialTensorShape> concatenate_output_shapes;
+  Tensor batch_size;
+  std::vector<Tensor> padded_shapes;
+  std::vector<Tensor> padding_values;
+  Tensor drop_remainder;
+  bool parallel_copy;
+  int64 n;  // presumably the op's `N` attr (padded_shapes count) -- confirm
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+// Wraps each element of `values` in its own tensor of shape {1}.
+template <typename T>
+std::vector<Tensor> ConvertToTensorVec(const std::vector<T> &values) {
+  std::vector<Tensor> tensors;
+  tensors.reserve(values.size());
+  for (const auto &v : values) {
+    tensors.push_back(DatasetOpsTestBase::CreateTensor<T>(TensorShape{1}, {v}));
+  }
+  return tensors;
+}
+
+// Test case 1: input elements with the same shape, drop_remainder = true.
+TestCase TestCase1() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(
+               TensorShape{4, 2}, {6, 7, 8, 9, 10, 11, 12, 13})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {true}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {0, 1, 1, 2, 3, 1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {4, 5, 1, 6, 7, 1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {8, 9, 1, 10, 11, 1})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({2, 3})},
+          /*expected_cardinality*/ 3,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 2: input elements with different shapes, drop_remainder = true.
+TestCase TestCase2() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{4, 1},
+                                                    {6, 7, 8, 9})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {true}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {0, 1, 1, 2, 3, 1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {4, 5, 1, 6, 1, 1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                   {7, 1, 1, 8, 1, 1})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({2, 3})},
+          /*expected_cardinality*/ 3,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 3: like test case 2 but drop_remainder and parallel_copy are false.
+TestCase TestCase3() {
+  return {
+      /*input_tensors*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                {0, 1, 2, 3, 4, 5})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{4, 1},
+                                                {6, 7, 8, 9})}},
+      /*concatenate_output_dtypes*/ {DT_INT64},
+      /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+      /*padded_shapes*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+      /*padding_values*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+      /*parallel_copy*/ false,
+      /*n*/ 1,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {0, 1, 1, 2, 3, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {4, 5, 1, 6, 1, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {7, 1, 1, 8, 1, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1, 3}, {9, 1, 1})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({-1, 3})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 4: similar to test case 3, but the number of input elements (six)
+// is evenly divisible by the batch size (two), so exactly three full batches
+// are expected. Because drop_remainder = false, the expected output shape is
+// still {-1, 3} rather than {2, 3}.
+TestCase TestCase4() {
+  return {
+      /*input_tensors*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                {0, 1, 2, 3, 4, 5})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 1}, {6, 7, 8})}},
+      /*concatenate_output_dtypes*/ {DT_INT64},
+      /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+      /*padded_shapes*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+      /*padding_values*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+      /*parallel_copy*/ false,
+      /*n*/ 1,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {0, 1, 1, 2, 3, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {4, 5, 1, 6, 1, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                               {7, 1, 1, 8, 1, 1})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({-1, 3})},
+      /*expected_cardinality*/ 3,
+      /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 5: similar to test case 3, but with padded_shapes = {-1}.
+// As the expected outputs show, each batch is then padded only up to the
+// longest element inside that batch, so the batches have shapes {2, 2},
+// {2, 2}, {2, 1} and {1, 1}.
+TestCase TestCase5() {
+  return {
+      /*input_tensors*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                {0, 1, 2, 3, 4, 5})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{4, 1},
+                                                {6, 7, 8, 9})}},
+      /*concatenate_output_dtypes*/ {DT_INT64},
+      /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+      /*padded_shapes*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {-1})},
+      /*padding_values*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+      /*parallel_copy*/ false,
+      /*n*/ 1,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {0, 1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {4, 5, 6, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 1}, {7, 8}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1, 1}, {9})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 6: similar to test case 5, but with parallel_copy = true.
+// Inputs and expected outputs are otherwise identical to test case 5.
+TestCase TestCase6() {
+  return {
+      /*input_tensors*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                {0, 1, 2, 3, 4, 5})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{4, 1},
+                                                {6, 7, 8, 9})}},
+      /*concatenate_output_dtypes*/ {DT_INT64},
+      /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+      /*batch_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+      /*padded_shapes*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {-1})},
+      /*padding_values*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+      /*parallel_copy*/ true,
+      /*n*/ 1,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {0, 1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {4, 5, 6, 1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 1}, {7, 8}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1, 1}, {9})},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 7: empty input elements. Both input tensors have shape {0}, so no
+// output batches are expected and the expected cardinality is 0.
+TestCase TestCase7() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({-1})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {-1})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Short-padding case: the input elements have length 2
+// (concatenate_output_shapes = {2}) but the requested padded shape is {1},
+// i.e. shorter than the elements themselves. No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase ShortPaddingTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Invalid padded-shape rank: the input elements are rank 1
+// (concatenate_output_shapes = {2}) while the padded shape tensor holds two
+// dimensions ({1, 2}). No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidPaddingShapesTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Invalid batch size: batch_size is set to -1; all other arguments are
+// well-formed. No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidBatchSizeTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {-1}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Mismatched number of padded shapes: two padded-shape tensors are supplied
+// (with n = 2) although the input has a single component and only one padding
+// value is given. No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidPaddedShapesSizeTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 2,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Mismatched number of padding values: two padding-value tensors are supplied
+// (and two expected output dtypes are listed) although the input has a single
+// component and only one padded shape is given. No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidPaddedValuesSizeTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64, DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Mismatched padding-value dtype: the input elements are int64 but the padding
+// value is a string tensor ("a"). No outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidPaddedValuesDTypeTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<string>(TensorShape{}, {"a"})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Non-scalar padding value: the padding value tensor has shape {1} here,
+// whereas the other test cases in this file pass a scalar (shape {}). No
+// outputs are listed.
+// NOTE(review): presumably consumed by a test that expects an error status;
+// the consumer is not visible in this part of the file.
+TestCase InvalidPaddedValuesShapeTestCase() {
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {0, 1, 2, 3, 4, 5})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2},
+                                                    {6, 7, 8, 9, 10, 11})}},
+          /*concatenate_output_dtypes*/ {DT_INT64},
+          /*concatenate_output_shapes*/ {PartialTensorShape({2})},
+          /*batch_size*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+          /*padded_shapes*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*padding_values*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1})},
+          /*drop_remainder*/
+          DatasetOpsTestBase::CreateTensor<bool>(TensorShape{}, {false}),
+          /*parallel_copy*/ true,
+          /*n*/ 1,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({-1, -1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Value-parameterized fixture: extends PaddedBatchDatasetOpTest with a
+// TestCase parameter, which the TEST_P bodies below read through GetParam().
+class ParameterizedPaddedBatchDatasetOpTest
+    : public PaddedBatchDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+// Builds the padded-batch dataset described by the test parameter, drains its
+// iterator, and checks that the produced tensors equal `expected_outputs` in
+// order.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+
+  // The upstream (concatenated) dataset is passed to the kernel as a variant
+  // tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+
+  // Drain the iterator, collecting every produced tensor in order.
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    std::vector<Tensor> next;
+    // Fatal on error: a failing GetNext() may never set `end_of_sequence`, and
+    // a non-fatal expectation would then leave this loop spinning forever.
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+    out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+// Checks that the dataset built from TestCase1 reports kNodeName as its node
+// name.
+TEST_F(PaddedBatchDatasetOpTest, DatasetNodeName) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  EXPECT_EQ(dataset->node_name(), kNodeName);
+}
+
+// Checks that the dataset built from TestCase1 reports kOpName as its type
+// string.
+TEST_F(PaddedBatchDatasetOpTest, DatasetTypeString) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  EXPECT_EQ(dataset->type_string(), kOpName);
+}
+
+// Checks that the dataset's output dtypes match the parameter's
+// `expected_output_dtypes`.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, DatasetOutputDtypes) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = GetParam();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  TF_EXPECT_OK(VerifyTypesMatch(dataset->output_dtypes(),
+                                params.expected_output_dtypes));
+}
+
+// Checks that the dataset's output shapes are compatible with the parameter's
+// `expected_output_shapes`.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, DatasetOutputShapes) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = GetParam();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset->output_shapes(),
+                                      params.expected_output_shapes));
+}
+
+// Checks that the dataset's Cardinality() equals the parameter's
+// `expected_cardinality`.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, Cardinality) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = GetParam();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  EXPECT_EQ(dataset->Cardinality(), params.expected_cardinality);
+}
+
+// Checks that the dataset can be saved through a VariantTensorDataWriter and
+// that the writer flushes without error.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, DatasetSave) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = GetParam();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  // Serialize the dataset into an in-memory variant and flush the writer.
+  std::unique_ptr<SerializationContext> save_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&save_ctx));
+  VariantTensorData serialized;
+  VariantTensorDataWriter data_writer(&serialized);
+  TF_ASSERT_OK(dataset->Save(save_ctx.get(), &data_writer));
+  TF_ASSERT_OK(data_writer.Flush());
+}
+
+// Checks that an iterator created from the dataset reports output dtypes that
+// match the parameter's `expected_output_dtypes`.
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, IteratorOutputDtypes) {
+  int num_threads = 2;
+  int num_cpus = 2;
+  const TestCase &params = GetParam();
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> op_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      params.parallel_copy, params.n, params.expected_output_dtypes,
+      params.expected_output_shapes, &op_kernel));
+
+  // The upstream (concatenated) dataset travels as a variant tensor.
+  Tensor input_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      params.input_tensors, params.concatenate_output_dtypes,
+      params.concatenate_output_shapes, &input_dataset_tensor));
+
+  // Local copies whose addresses are handed to the kernel inputs below.
+  Tensor batch_size_tensor = params.batch_size;
+  std::vector<Tensor> shape_tensors = params.padded_shapes;
+  std::vector<Tensor> value_tensors = params.padding_values;
+  Tensor drop_remainder_tensor = params.drop_remainder;
+
+  // Input order: dataset, batch size, padded shapes, padding values,
+  // drop_remainder.
+  gtl::InlinedVector<TensorValue, 4> kernel_inputs;
+  kernel_inputs.emplace_back(&input_dataset_tensor);
+  kernel_inputs.emplace_back(&batch_size_tensor);
+  for (Tensor &shape_tensor : shape_tensors) {
+    kernel_inputs.emplace_back(&shape_tensor);
+  }
+  for (Tensor &value_tensor : value_tensors) {
+    kernel_inputs.emplace_back(&value_tensor);
+  }
+  kernel_inputs.emplace_back(&drop_remainder_tensor);
+
+  std::unique_ptr<OpKernelContext> op_context;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetContext(op_kernel.get(), &kernel_inputs,
+                                               &op_context));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(CreateDataset(op_kernel.get(), op_context.get(), &dataset));
+  core::ScopedUnref dataset_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iter_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(op_context.get(), &iter_ctx));
+  std::unique_ptr<IteratorBase> dataset_iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iter_ctx.get(), "Iterator", &dataset_iterator));
+
+  TF_EXPECT_OK(VerifyTypesMatch(dataset_iterator->output_dtypes(),
+                                params.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks the iterator's output shapes. First, create the kernel under test.
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+  // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+  // Create the op context from `inputs`, then the dataset from that context.
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+  // Make an iterator over the dataset, passing "Iterator" as its prefix.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+  // The iterator's reported shapes must be compatible with the expected ones.
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_F(PaddedBatchDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  const TestCase &test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks the iterator's prefix string. First, create the kernel under test.
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+  // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+  // Create the op context from `inputs`, then the dataset from that context.
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+  // Make an iterator over the dataset, passing "Iterator" as its prefix.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+  // Expected prefix: the "Iterator" argument followed by "::PaddedBatch".
+  EXPECT_EQ(iterator->prefix(), "Iterator::PaddedBatch");
+}
+
+TEST_P(ParameterizedPaddedBatchDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  const TestCase &test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Save/restore round trip of the iterator. First, create the kernel.
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+  // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+  // Create the op context from `inputs`, then the dataset from that context.
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+  // Make an iterator over the dataset, passing "Iterator" as its prefix.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+  // Serialization context handed to every Save call below.
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  // Per breakpoint: save + restore the iterator, then resume calling GetNext.
+  bool end_of_sequence = false;  // passed to GetNext; never asserted on here
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  const std::vector<int> &breakpoints = test_case.breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *padded_batch_dataset, &iterator));
+    // Advance until cur_iteration passes this breakpoint, collecting outputs.
+    while (cur_iteration <= breakpoint) {
+      std::vector<Tensor> next;
+      TF_EXPECT_OK(
+          iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+      out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+      cur_iteration++;
+    }
+  }
+  // All collected tensors must equal the expected outputs, in order.
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+INSTANTIATE_TEST_SUITE_P(PaddedBatchDatasetOpTest,
+                         ParameterizedPaddedBatchDatasetOpTest,
+                         ::testing::Values(TestCase1(), TestCase2(),
+                                           TestCase3(), TestCase4(),
+                                           TestCase5(), TestCase6(),
+                                           TestCase7()));
+
+TEST_F(PaddedBatchDatasetOpTest, ShortPadding) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // ShortPaddingTestCase(): the first GetNext must fail with DATA_LOSS.
+  TestCase test_case = ShortPaddingTestCase();
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+  // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+  // Dataset creation is asserted OK; the failure is expected only at GetNext.
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+  // Make an iterator over the dataset, passing "Iterator" as its prefix.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+  bool end_of_sequence = false;  // passed to GetNext; never asserted on here
+  std::vector<Tensor> out_tensors;  // contents are not inspected
+  EXPECT_EQ(
+      iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)
+          .code(),
+      tensorflow::error::DATA_LOSS);
+}
+
+TEST_F(PaddedBatchDatasetOpTest, InvalidPaddedShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // InvalidPaddingShapesTestCase(): first GetNext must fail, INVALID_ARGUMENT.
+  TestCase test_case = InvalidPaddingShapesTestCase();
+  std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+  TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+      test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+  // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+  Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+      test_case.input_tensors, test_case.concatenate_output_dtypes,
+      test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+  Tensor batch_size = test_case.batch_size;
+  std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+  std::vector<Tensor> padding_values = test_case.padding_values;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&concatenate_dataset_tensor, &batch_size});
+  for (auto &padded_shape : padded_shapes) {
+    inputs.emplace_back(&padded_shape);
+  }
+  for (auto &padding_value : padding_values) {
+    inputs.emplace_back(&padding_value);
+  }
+  inputs.emplace_back(&drop_remainder);
+  // Dataset creation is asserted OK; the failure is expected only at GetNext.
+  std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+  TF_ASSERT_OK(
+      CreatePaddedBatchDatasetContext(padded_batch_dataset_kernel.get(),
+                                      &inputs, &padded_batch_dataset_context));
+  DatasetBase *padded_batch_dataset;
+  TF_ASSERT_OK(CreateDataset(padded_batch_dataset_kernel.get(),
+                             padded_batch_dataset_context.get(),
+                             &padded_batch_dataset));
+  core::ScopedUnref scoped_unref(padded_batch_dataset);
+  // Make an iterator over the dataset, passing "Iterator" as its prefix.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(padded_batch_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(padded_batch_dataset->MakeIterator(iterator_ctx.get(),
+                                                  "Iterator", &iterator));
+  bool end_of_sequence = false;  // passed to GetNext; never asserted on here
+  std::vector<Tensor> out_tensors;  // contents are not inspected
+  EXPECT_EQ(
+      iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)
+          .code(),
+      tensorflow::error::INVALID_ARGUMENT);
+}
+
+TEST_F(PaddedBatchDatasetOpTest, InvalidArguments) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Every case below must make CreateDataset return INVALID_ARGUMENT.
+  std::vector<TestCase> test_cases = {
+      InvalidBatchSizeTestCase(), InvalidPaddedShapesSizeTestCase(),
+      InvalidPaddedValuesSizeTestCase(), InvalidPaddedValuesDTypeTestCase(),
+      InvalidPaddedValuesShapeTestCase()};
+  for (const TestCase &test_case : test_cases) {
+    std::unique_ptr<OpKernel> padded_batch_dataset_kernel;
+    TF_ASSERT_OK(CreatePaddedBatchDatasetKernel(
+        test_case.parallel_copy, test_case.n, test_case.expected_output_dtypes,
+        test_case.expected_output_shapes, &padded_batch_dataset_kernel));
+    // Assemble the op inputs; the input dataset is a DT_VARIANT scalar tensor.
+    Tensor concatenate_dataset_tensor(DT_VARIANT, TensorShape({}));
+    TF_ASSERT_OK(CreateConcatenateDatasetTensor(
+        test_case.input_tensors, test_case.concatenate_output_dtypes,
+        test_case.concatenate_output_shapes, &concatenate_dataset_tensor));
+    Tensor batch_size = test_case.batch_size;
+    std::vector<Tensor> padded_shapes = test_case.padded_shapes;
+    std::vector<Tensor> padding_values = test_case.padding_values;
+    Tensor drop_remainder = test_case.drop_remainder;
+    gtl::InlinedVector<TensorValue, 4> inputs(
+        {&concatenate_dataset_tensor, &batch_size});
+    for (auto &padded_shape : padded_shapes) {
+      inputs.emplace_back(&padded_shape);
+    }
+    for (auto &padding_value : padding_values) {
+      inputs.emplace_back(&padding_value);
+    }
+    inputs.emplace_back(&drop_remainder);
+    // Kernel and context creation must succeed; CreateDataset must fail.
+    std::unique_ptr<OpKernelContext> padded_batch_dataset_context;
+    TF_ASSERT_OK(CreatePaddedBatchDatasetContext(
+        padded_batch_dataset_kernel.get(), &inputs,
+        &padded_batch_dataset_context));
+    DatasetBase *padded_batch_dataset;
+    EXPECT_EQ(
+        CreateDataset(padded_batch_dataset_kernel.get(),
+                      padded_batch_dataset_context.get(), &padded_batch_dataset)
+            .code(),
+        tensorflow::error::INVALID_ARGUMENT);
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 473757f..666eac3 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -54,14 +54,11 @@
  public:
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
+    OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", /*params=*/{},
+                                                 &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       interleave_func_.name(), &lib_def_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -91,34 +88,29 @@
             "num_parallel_calls must less than or equal to cycle_length."));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.lib_def = lib_def_;
     OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
-                                      std::move(params), &captured_func));
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
     if (num_parallel_calls == model::kAutoTune) {
       metrics::RecordTFDataAutotune(kDatasetName);
     }
 
-    *output =
-        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
-                    cycle_length, block_length, num_parallel_calls, sloppy_,
-                    output_types_, output_shapes_);
+    *output = new Dataset(ctx, input, std::move(captured_func), cycle_length,
+                          block_length, num_parallel_calls, sloppy_,
+                          output_types_, output_shapes_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, int64 num_parallel_calls, bool sloppy,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          interleave_func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -169,7 +161,7 @@
       TF_RETURN_IF_ERROR(captured_func_->AddToGraph(ctx, b, &other_arguments,
                                                     &other_arguments_types));
       AttrValue f;
-      b->BuildAttrValue(interleave_func_, &f);
+      b->BuildAttrValue(captured_func_->func(), &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
       AttrValue sloppy_attr;
@@ -213,7 +205,7 @@
         cancelled_ = true;
         cond_var_->notify_all();
         // Wait for all in-flight calls to complete.
-        while (num_calls_ > 0) {
+        while (current_num_calls_ > 0 || future_num_calls_ > 0) {
           cond_var_->wait(l);
         }
       }
@@ -260,16 +252,17 @@
         return model::MakeAsyncInterleaveManyNode(
             std::move(args),
             {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/port::NumSchedulableCPUs())});
+                                  /*max=*/dataset()->cycle_length_)});
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(*mu_);
         // Wait for all in-flight calls to complete.
-        while (num_calls_ > 0) {
+        while (current_num_calls_ > 0 || future_num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        DCHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(current_num_calls_, 0);
+        DCHECK_EQ(future_num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("block_index"), block_index_));
@@ -446,7 +439,8 @@
               }
             }
           }
-          return all_elements_busy || num_calls_ >= num_parallel_calls_->value;
+          return all_elements_busy ||
+                 current_num_calls_ >= num_parallel_calls_->value;
         };
         while (true) {
           mutex_lock l(*mu_);
@@ -470,6 +464,10 @@
               if (!future_elements_.empty()) {
                 current_elements_[idx] = std::move(future_elements_.back());
                 future_elements_.pop_back();
+                if (current_elements_[idx]->iterator) {
+                  EnableAutotune(ctx.get(),
+                                 current_elements_[idx]->iterator.get());
+                }
               } else {
                 current_elements_[idx] = MakeElement(ctx);
                 if (!current_elements_[idx]) {
@@ -486,11 +484,23 @@
                     dataset()->block_length_ - element->results.size();
               }
               if (num_results > 0) {
-                num_calls_++;
+                current_num_calls_++;
                 element->in_use = true;
-                thread_pool_->Schedule(
-                    std::bind(&ParallelInterleaveIterator::FetchResults, this,
-                              ctx, std::move(element), num_results));
+                thread_pool_->Schedule(std::bind(
+                    &ParallelInterleaveIterator::FetchResults, this, ctx,
+                    std::move(element), num_results,
+                    [this, ctx]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+                      --current_num_calls_;
+                      const auto& stats_aggregator = ctx->stats_aggregator();
+                      if (stats_aggregator) {
+                        stats_aggregator->AddScalar(
+                            stats_utils::ThreadUtilizationScalarName(
+                                dataset()->node_name()),
+                            static_cast<float>(current_num_calls_) /
+                                static_cast<float>(num_parallel_calls_->value),
+                            num_elements());
+                      }
+                    }));
               }
             }
           }
@@ -499,7 +509,7 @@
             stats_aggregator->AddScalar(
                 stats_utils::ThreadUtilizationScalarName(
                     dataset()->node_name()),
-                static_cast<float>(num_calls_) /
+                static_cast<float>(current_num_calls_) /
                     static_cast<float>(num_parallel_calls_->value),
                 num_elements());
           }
@@ -526,7 +536,8 @@
       // Fetches up to `dataset()->block_length_` results from `element`.
       void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
                         const std::shared_ptr<Element>& element,
-                        int64 num_results) LOCKS_EXCLUDED(*mu_) {
+                        int64 num_results, std::function<void()> done)
+          LOCKS_EXCLUDED(*mu_) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         bool end_of_input = false;
@@ -554,15 +565,7 @@
           element->inputs.clear();
           --num_open_;
         }
-        --num_calls_;
-        const auto& stats_aggregator = ctx->stats_aggregator();
-        if (stats_aggregator) {
-          stats_aggregator->AddScalar(
-              stats_utils::ThreadUtilizationScalarName(dataset()->node_name()),
-              static_cast<float>(num_calls_) /
-                  static_cast<float>(num_parallel_calls_->value),
-              num_elements());
-        }
+        done();
         cond_var_->notify_all();
       }
 
@@ -574,9 +577,8 @@
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
-          // TODO(jsimsa): Autotune the buffer size.
-          return num_calls_ >= num_parallel_calls_->value ||
-                 future_elements_.size() >= 2 * dataset()->cycle_length_;
+          // TODO(jsimsa): Autotune the number of iterators to prefetch.
+          return future_elements_.size() >= 2 * dataset()->cycle_length_;
         };
         while (true) {
           mutex_lock l(*mu_);
@@ -603,20 +605,14 @@
             if (!element->iterator) {
               continue;
             }
-            ++num_calls_;
+            DisableAutotune(ctx.get(), element->iterator.get());
+            ++future_num_calls_;
             element->in_use = true;
-            thread_pool_->Schedule(
-                std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
-                          std::move(element), dataset()->block_length_));
-          }
-          const auto& stats_aggregator = ctx->stats_aggregator();
-          if (stats_aggregator) {
-            stats_aggregator->AddScalar(
-                stats_utils::ThreadUtilizationScalarName(
-                    dataset()->node_name()),
-                static_cast<float>(num_calls_) /
-                    static_cast<float>(num_parallel_calls_->value),
-                num_elements());
+            thread_pool_->Schedule(std::bind(
+                &ParallelInterleaveIterator::FetchResults, this, ctx,
+                std::move(element), dataset()->block_length_,
+                [this]()
+                    EXCLUSIVE_LOCKS_REQUIRED(*mu_) { --future_num_calls_; }));
           }
           cond_var_->notify_all();
         }
@@ -901,8 +897,10 @@
       // Identifies the number of open iterators.
       int64 num_open_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies the number of outstanding calls.
-      int64 num_calls_ GUARDED_BY(*mu_) = 0;
+      // Identifies the number of outstanding calls for CurrentElementsManager.
+      int64 current_num_calls_ GUARDED_BY(*mu_) = 0;
+      // Identifies the number of outstanding calls for FutureElementsManager.
+      int64 future_num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
       std::unique_ptr<Thread> current_elements_manager_ GUARDED_BY(*mu_);
@@ -915,7 +913,6 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList interleave_func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
@@ -925,11 +922,10 @@
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
-  bool sloppy_;
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList interleave_func_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
+  bool sloppy_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDatasetV2").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc
index 3c3d1de..6f30cce 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc
@@ -494,7 +494,7 @@
   }
 
   TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
-                           /*expect_items_equal*/ test_case.sloppy));
+                           /*compare_order*/ !test_case.sloppy));
 }
 
 TEST_F(ParallelInterleaveDatasetOpTest, InvalidArguments) {
@@ -949,7 +949,7 @@
   }
 
   TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
-                           /*expect_items_equal*/ test_case.sloppy));
+                           /*compare_order*/ !test_case.sloppy));
 }
 
 INSTANTIATE_TEST_SUITE_P(
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index defd4be..e9a648b 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -38,20 +38,16 @@
  public:
   explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    FunctionMetadata::Params params;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
+                                     &params.use_inter_op_parallelism));
+    OP_REQUIRES_OK(ctx,
+                   FunctionMetadata::Create(ctx, "f", params, &func_metadata_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
-                                     &use_inter_op_parallelism_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
-    OP_REQUIRES_OK(ctx,
-                   CreateFunctionLibraryDefinition(
-                       ctx->function_library()->GetFunctionLibraryDefinition(),
-                       func_.name(), &lib_def_));
-    OP_REQUIRES_OK(
-        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
  protected:
@@ -66,20 +62,16 @@
             "num_parallel_calls must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    CapturedFunction::Params params;
-    params.use_inter_op_parallelism = use_inter_op_parallelism_;
-    params.lib_def = lib_def_;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(func_, ctx, "other_arguments",
-                                            std::move(params), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, func_metadata_, "other_arguments",
+                                      &captured_func));
 
     if (num_parallel_calls == model::kAutoTune) {
       metrics::RecordTFDataAutotune(kDatasetName);
     }
 
-    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
-                          output_shapes_, use_inter_op_parallelism_, sloppy_,
-                          std::move(captured_func), short_circuit_indices_,
+    *output = new Dataset(ctx, input, num_parallel_calls, output_types_,
+                          output_shapes_, sloppy_, std::move(captured_func),
                           preserve_cardinality_);
   }
 
@@ -87,25 +79,18 @@
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func, int32 num_parallel_calls,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            bool use_inter_op_parallelism, bool sloppy,
+            int32 num_parallel_calls, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes, bool sloppy,
             std::unique_ptr<CapturedFunction> captured_func,
-            const std::vector<int> indices, bool preserve_cardinality)
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           num_parallel_calls_(num_parallel_calls),
           output_types_(output_types),
           output_shapes_(output_shapes),
-          use_inter_op_parallelism_(use_inter_op_parallelism),
           sloppy_(sloppy),
           preserve_cardinality_(preserve_cardinality),
-          captured_func_(std::move(captured_func)),
-          short_circuit_indices_(indices),
-          can_move_(indices.empty() ? std::vector<bool>()
-                                    : ComputeMoveVector(indices)) {
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -113,13 +98,8 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
-      if (short_circuit_indices_.empty()) {
-        parallel_map_functor =
-            absl::make_unique<ParallelMapDatasetFunctor>(this);
-      } else {
-        parallel_map_functor = absl::make_unique<ShortCircuitFunctor>(this);
-      }
+      std::unique_ptr<ParallelMapFunctor> parallel_map_functor =
+          absl::make_unique<ParallelMapDatasetFunctor>(this);
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::", kDatasetName)}, input_,
           std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
@@ -161,7 +141,7 @@
 
       // Attr: f
       AttrValue f_attr;
-      b->BuildAttrValue(func_, &f_attr);
+      b->BuildAttrValue(captured_func_->func(), &f_attr);
 
       // Attr: Targuments
       AttrValue other_arguments_types_attr;
@@ -169,7 +149,7 @@
 
       // Attr: use_inter_op_parallelism
       AttrValue use_inter_op_parallelism_attr;
-      b->BuildAttrValue(use_inter_op_parallelism_,
+      b->BuildAttrValue(captured_func_->use_inter_op_parallelism(),
                         &use_inter_op_parallelism_attr);
 
       // Attr: sloppy
@@ -197,38 +177,6 @@
     }
 
    private:
-    class ShortCircuitFunctor : public ParallelMapFunctor {
-     public:
-      explicit ShortCircuitFunctor(const Dataset* dataset)
-          : dataset_(dataset) {}
-
-      void MapFunc(IteratorContext* ctx, const string& prefix,
-                   std::vector<Tensor> input_element,
-                   std::vector<Tensor>* result, StatusCallback done) override {
-        const std::vector<Tensor>& captured_inputs =
-            dataset_->captured_func_->captured_inputs();
-        size_t num_args = input_element.size();
-        for (size_t i = 0; i < dataset_->short_circuit_indices_.size(); ++i) {
-          if (dataset_->short_circuit_indices_[i] < num_args) {
-            if (dataset_->can_move_[i]) {
-              result->push_back(std::move(
-                  input_element[dataset_->short_circuit_indices_[i]]));
-            } else {
-              result->push_back(
-                  input_element[dataset_->short_circuit_indices_[i]]);
-            }
-          } else {
-            result->push_back(
-                captured_inputs[dataset_->short_circuit_indices_[i] -
-                                num_args]);
-          }
-        }
-        done(Status::OK());
-      }
-
-      const Dataset* const dataset_;
-    };
-
     class ParallelMapDatasetFunctor : public ParallelMapFunctor {
      public:
       explicit ParallelMapDatasetFunctor(const Dataset* dataset)
@@ -249,7 +197,7 @@
           instantiated_captured_func_->RunAsync(
               ctx, std::move(input_element), result, std::move(done), prefix);
         };
-        if (!dataset_->use_inter_op_parallelism_) {
+        if (!dataset_->captured_func_->use_inter_op_parallelism()) {
           (*ctx->runner())(std::bind(map_func, ctx, prefix,
                                      std::move(input_element), result,
                                      std::move(done)));
@@ -265,26 +213,19 @@
     };
 
     const DatasetBase* const input_;
-    const NameAttrList func_;
     const int32 num_parallel_calls_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
-    const bool use_inter_op_parallelism_;
     const bool sloppy_;
     const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const std::vector<int> short_circuit_indices_;
-    const std::vector<bool> can_move_;
   };
 
+  std::shared_ptr<FunctionMetadata> func_metadata_ = nullptr;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  bool use_inter_op_parallelism_;
   bool sloppy_;
   bool preserve_cardinality_;
-  NameAttrList func_;
-  std::vector<int> short_circuit_indices_;
-  std::shared_ptr<FunctionLibraryDefinition> lib_def_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelMapDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op_test.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op_test.cc
index dc1ff9f..abb6e81 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op_test.cc
@@ -334,7 +334,7 @@
   }
 
   TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
-                           /*expect_items_equal*/ test_case.sloppy));
+                           /*compare_order*/ !test_case.sloppy));
 }
 
 TEST_F(ParallelMapDatasetOpTest, DatasetNodeName) {
@@ -769,7 +769,7 @@
   }
 
   TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
-                           /*expect_items_equal*/ test_case.sloppy));
+                           /*compare_order*/ !test_case.sloppy));
 }
 
 TEST_F(ParallelMapDatasetOpTest, InvalidNumParallelCalls) {
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index eeb6d30..8ca0d20 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -31,14 +31,18 @@
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+// Determines the fraction of slack time by which to delay prefetching of data.
+constexpr double kSleepFactor = 0.2;
 constexpr char kDatasetName[] = "Prefetch";
 
 class PrefetchDatasetOp::Dataset : public DatasetBase {
  public:
-  Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
+  Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size,
+          int64 slack_period)
       : DatasetBase(DatasetContext(ctx)),
         input_(input),
-        buffer_size_(buffer_size) {
+        buffer_size_(buffer_size),
+        slack_period_(slack_period) {
     input_->Ref();
   }
 
@@ -70,8 +74,11 @@
     TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
     Node* buffer_size = nullptr;
     TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
-    TF_RETURN_IF_ERROR(
-        b->AddDataset(this, {input_graph_node, buffer_size}, output));
+    AttrValue slack_period_attr;
+    b->BuildAttrValue(slack_period_, &slack_period_attr);
+    TF_RETURN_IF_ERROR(b->AddDataset(
+        this, {input_graph_node, buffer_size},
+        {std::make_pair("slack_period", slack_period_attr)}, output));
     return Status::OK();
   }
 
@@ -80,7 +87,9 @@
    public:
     explicit Iterator(const Params& params)
         : DatasetIterator<Dataset>(params),
-          auto_tuner_(params.dataset->buffer_size_) {}
+          auto_tuner_(params.dataset->buffer_size_) {
+      slack_us_ = 0;
+    }
 
     ~Iterator() override {
       // Signal the prefetch thread to terminate it. We will then
@@ -226,6 +235,7 @@
       Status status;
       // The buffered data element.
       std::vector<Tensor> value;
+      int64 created_us;
     };
 
     Status Consume(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
@@ -248,6 +258,20 @@
       // (if we successfully got an element) the output values.
       Status s = buffer_.front().status;
       if (s.ok()) {
+        if (dataset()->slack_period_ > 0 &&
+            (num_elements() + 1) % dataset()->slack_period_ == 0) {
+          // TODO(rachelim): Consider doing something more sophisticated
+          // to decide how long to sleep for; e.g. using a kalman filter.
+          int64 slack_us =
+              Env::Default()->NowMicros() - buffer_.front().created_us;
+          // Every slack_period_-th element, update the most recent slack time,
+          // measured by the duration between when the element is prefetched
+          // and when it is consumed. We add kSleepFactor * slack_us_ to the
+          // measurement because we slept for that duration before prefetching
+          // the element.
+          slack_us_ = kSleepFactor * slack_us_ + slack_us;
+          VLOG(2) << "Setting slack_us_: " << slack_us_;
+        }
         *out_tensors = std::move(buffer_.front().value);
         RecordBufferDequeue(ctx, *out_tensors);
       }
@@ -282,6 +306,8 @@
     void PrefetchThread(const std::shared_ptr<IteratorContext>& ctx) {
       RecordStart(ctx.get());
       auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
+      // Keep track of where we are in an iteration "burst".
+      int num_produced = 0;
       while (true) {
         // 1. Wait for a slot in the buffer.
         {
@@ -297,6 +323,14 @@
           }
         }
 
+        if (dataset()->slack_period_ > 0 &&
+            num_produced % dataset()->slack_period_ == 0) {
+          // For the first element in the "burst", sleep for a bit if there is
+          // slack.
+          VLOG(2) << "Sleeping for: " << slack_us_ * kSleepFactor;
+          ctx->env()->SleepForMicroseconds(slack_us_ * kSleepFactor);
+        }
+
         // 2. Read the next element.
         // Acquire the parent lock since we will be reading an element
         // from the input iterator. Note that we do not wish to release
@@ -319,9 +353,11 @@
         {
           mutex_lock l(mu_);
           RecordBufferEnqueue(ctx.get(), buffer_element.value);
+          buffer_element.created_us = ctx->env()->NowMicros();
           buffer_.push_back(std::move(buffer_element));
           cond_var_.notify_all();
         }
+        ++num_produced;
       }
     }
 
@@ -375,9 +411,15 @@
     std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
     bool cancelled_ GUARDED_BY(mu_) = false;
     bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
+
+    std::atomic<int64> slack_us_;
   };
   const DatasetBase* const input_;
   const int64 buffer_size_;
+
+  // If non-zero, determines the period between injecting "slack" into the
+  // execution.
+  const int64 slack_period_;
 };
 
 void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -396,7 +438,7 @@
     metrics::RecordTFDataAutotune(kDatasetName);
   }
 
-  *output = new Dataset(ctx, input, buffer_size);
+  *output = new Dataset(ctx, input, buffer_size, slack_period_);
 }
 
 namespace {
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.h b/tensorflow/core/kernels/data/prefetch_dataset_op.h
index 8320637..d42e143 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.h
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.h
@@ -25,7 +25,11 @@
 class PrefetchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit PrefetchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+      : UnaryDatasetOpKernel(ctx) {
+    if (ctx->HasAttr("slack_period")) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("slack_period", &slack_period_));
+    }
+  }
 
  protected:
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -33,6 +37,7 @@
 
  private:
   class Dataset;
+  int64 slack_period_ = 0;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc b/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc
index 4beb306..56dfbc5 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc
@@ -38,9 +38,11 @@
       const DataTypeVector &output_types,
       const std::vector<PartialTensorShape> &output_shapes,
       std::unique_ptr<OpKernel> *op_kernel) {
-    NodeDef node_def = test::function::NDef(
-        kNodeName, kOpName, {"input_dataset", "buffer_size"},
-        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    NodeDef node_def = test::function::NDef(kNodeName, kOpName,
+                                            {"input_dataset", "buffer_size"},
+                                            {{"output_types", output_types},
+                                             {"output_shapes", output_shapes},
+                                             {"slack_period", 0}});
     TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel));
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
index 9bb6491..59825b4 100644
--- a/tensorflow/core/kernels/data/shard_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -15,6 +15,7 @@
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -27,7 +28,9 @@
 class ShardDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ShardDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("require_non_empty", &require_non_empty_));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
@@ -48,18 +51,19 @@
         errors::InvalidArgument("Index must be between 0 and ", num_shards - 1,
                                 " (currently index = ", index, ")."));
 
-    *output = new Dataset(ctx, num_shards, index, input);
+    *output = new Dataset(ctx, num_shards, index, require_non_empty_, input);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 num_shards, int64 index,
-            const DatasetBase* input)
+            bool require_non_empty, const DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)),
           num_shards_(num_shards),
           index_(index),
-          input_(input) {
+          input_(input),
+          require_non_empty_(require_non_empty) {
       input_->Ref();
     }
 
@@ -102,8 +106,13 @@
       TF_RETURN_IF_ERROR(b->AddScalar(num_shards_, &num_shards));
       Node* index = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(index_, &index));
-      TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {input_graph_node, num_shards, index}, output));
+
+      AttrValue require_non_empty_attr;
+      b->BuildAttrValue(require_non_empty_, &require_non_empty_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, num_shards, index},
+          {{"require_non_empty", require_non_empty_attr}}, output));
       return Status::OK();
     }
 
@@ -138,6 +147,26 @@
           }
         } while ((next_index_++ % dataset()->num_shards_) != dataset()->index_);
 
+        while (dataset()->require_non_empty_ &&
+               next_index_ < dataset()->num_shards_) {
+          std::vector<Tensor> unused_result;
+
+          Status s = input_impl_->GetNext(ctx, &unused_result, end_of_sequence);
+          if (*end_of_sequence || errors::IsOutOfRange(s)) {
+            return errors::InvalidArgument(
+                "There aren't enough elements in this dataset for each shard "
+                "to have at least one element (# elems = ",
+                next_index_, ", ", "# shards = ", dataset()->num_shards_,
+                "). If you are using ",
+                "datasets with distribution strategy, consider turning ",
+                "dataset autosharding off with `tf.data.Options`.");
+          } else if (!s.ok()) {
+            return s;
+          }
+
+          next_index_++;
+        }
+
         *out_tensors = std::move(result);
         return Status::OK();
       }
@@ -184,7 +213,10 @@
     const int64 num_shards_;
     const int64 index_;
     const DatasetBase* const input_;
+    const bool require_non_empty_;
   };
+
+  bool require_non_empty_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ShardDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/shard_dataset_op_test.cc b/tensorflow/core/kernels/data/shard_dataset_op_test.cc
new file mode 100644
index 0000000..6da1ff3
--- /dev/null
+++ b/tensorflow/core/kernels/data/shard_dataset_op_test.cc
@@ -0,0 +1,821 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "shard_dataset";
+constexpr char kOpName[] = "ShardDataset";
+
+class ShardDatasetOpTest : public DatasetOpsTestBase {  // Shared fixture.
+ protected:
+  // Builds a `ShardDataset` op kernel from a NodeDef carrying the given attrs.
+  Status CreateShardDatasetOpKernel(
+      bool require_non_empty, const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      std::unique_ptr<OpKernel>* op_kernel) {
+    NodeDef node_def = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "num_shards", "index"},
+        {{"require_non_empty", require_non_empty},
+         {"output_types", output_types},
+         {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel));
+    return Status::OK();
+  }
+
+  // Checks `inputs` against `op_kernel`, then creates the kernel context.
+  Status CreateShardDatasetContext(
+      OpKernel* const op_kernel,
+      gtl::InlinedVector<TensorValue, 4>* const inputs,
+      std::unique_ptr<OpKernelContext>* context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+};
+
+struct RangeDatasetParam {  // Arguments passed to CreateRangeDataset().
+  int64 start;  // First value of the range.
+  int64 end;    // Stop value; the {0, 10, 1} cases expect nothing above 9.
+  int64 step;   // Increment between consecutive values.
+};
+
+struct TestCase {  // One parameterization of the ShardDataset tests.
+  RangeDatasetParam range_dataset_param;  // Defines the input range dataset.
+  Tensor num_shards;                      // Scalar `num_shards` op input.
+  Tensor index;                           // Scalar `index` op input.
+  bool require_non_empty;                 // `require_non_empty` attr value.
+  std::vector<Tensor> expected_outputs;   // Elements GetNext should yield.
+  DataTypeVector expected_output_dtypes;  // Also used as `output_types` attr.
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;  // NOTE(review): consumer not visible here.
+};
+
+// Test Case 1: simple case; shard 2 of 5 over range [0, 10) yields {2, 7}.
+TestCase TestCase1() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 2,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 2: zero offset; shard 0 of 5 over range [0, 10) yields {0, 5}.
+TestCase TestCase2() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 2,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 3: the one-element input ends before shard 2's first element.
+TestCase TestCase3() {
+  return {/*range_dataset_param*/ {0, 1, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 1}};
+}
+
+// Test Case 4: larger num_shards (7); shard 5 gets a single element, {5}.
+TestCase TestCase4() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 1,
+          /*breakpoints*/ {0, 5}};
+}
+
+// Test Case 5: index == num_shards - 1 (the last shard).
+TestCase TestCase5() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {9})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 2,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 6: like Test Case 5 (last shard), but the input size (10) is not
+// evenly divisible by num_shards (4).
+TestCase TestCase6() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 2,
+          /*breakpoints*/ {0, 1, 5}};
+}
+
+// Test Case 7: num_shards (20) exceeds the cardinality of the input dataset
+// (10); require_non_empty = false.
+TestCase TestCase7() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {20}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*require_non_empty*/ false,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 1,
+          /*breakpoints*/ {0, 5}};
+}
+
+// Test Case 8: same as Test Case 7, but with require_non_empty = true.
+TestCase NoElemForEachShardTestCase() {
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {20}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 1,
+          /*breakpoints*/ {0, 5}};
+}
+
+TestCase IndexGreaterNumShardsCase() {  // index (7) > num_shards (5).
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {}};
+}
+
+TestCase NegativeIndexTestCase() {  // index is negative (-3).
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-3}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {}};
+}
+
+TestCase NegativeNumShardsTestCase() {  // num_shards is negative (-3).
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-3}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {}};
+}
+
+TestCase ZeroNumShardsTestCase() {  // num_shards is zero.
+  return {/*range_dataset_param*/ {0, 10, 1},
+          /*num_shards*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+          /*index*/
+          DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+          /*require_non_empty*/ true,
+          /*expected_outputs*/ {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {}};
+}
+
+class ParameterizedShardDatasetOpTest  // Fixture for the TEST_P cases.
+    : public ShardDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParameterizedShardDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Build the ShardDataset kernel from the parameterized attrs.
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+  // Wrap a range dataset in a variant tensor to use as `input_dataset`.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Inputs follow the NodeDef order: input_dataset, num_shards, index.
+  Tensor num_shards = test_case.num_shards;
+  Tensor index = test_case.index;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+  // Run the kernel to obtain the dataset under test.
+  DatasetBase* shard_dataset;
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_shard_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Drain the iterator; fail fast instead of looping or reading past `end()`.
+  bool end_of_sequence = false;
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_ASSERT_OK(  // Fatal: after an error the sequence may never end.
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      ASSERT_LT(expected_outputs_it, test_case.expected_outputs.end());
+      TF_EXPECT_OK(ExpectEqual(out_tensors.back(), *expected_outputs_it));
+      expected_outputs_it++;
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_F(ShardDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Build the ShardDataset kernel from the test case's attrs.
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+  // Wrap a range dataset in a variant tensor to use as `input_dataset`.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Inputs follow the NodeDef order: input_dataset, num_shards, index.
+  Tensor num_shards = test_case.num_shards;
+  Tensor index = test_case.index;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+  // Run the kernel to obtain the dataset under test.
+  DatasetBase* shard_dataset;
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // The dataset should report the node name given in the NodeDef.
+  EXPECT_EQ(shard_dataset->node_name(), kNodeName);
+}
+
+TEST_F(ShardDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Build the ShardDataset kernel from the test case's attrs.
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+  // Wrap a range dataset in a variant tensor to use as `input_dataset`.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Inputs follow the NodeDef order: input_dataset, num_shards, index.
+  Tensor num_shards = test_case.num_shards;
+  Tensor index = test_case.index;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+  // Run the kernel to obtain the dataset under test.
+  DatasetBase* shard_dataset;
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // The dataset's type string should equal the op name.
+  EXPECT_EQ(shard_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Build the ShardDataset kernel from the parameterized attrs.
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+  // Wrap a range dataset in a variant tensor to use as `input_dataset`.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  // Inputs follow the NodeDef order: input_dataset, num_shards, index.
+  Tensor num_shards = test_case.num_shards;
+  Tensor index = test_case.index;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+  // Run the kernel to obtain the dataset under test.
+  DatasetBase* shard_dataset;
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // Reported output dtypes should match the `output_types` attr passed in.
+  TF_EXPECT_OK(VerifyTypesMatch(shard_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // The dataset's output shapes must be compatible with the expected ones.
+  TF_EXPECT_OK(VerifyShapesCompatible(shard_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // Cardinality() must equal the test case's expected_cardinality.
+  EXPECT_EQ(shard_dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+  // Saving the dataset and flushing the writer must both succeed.
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;  // Backing store that `writer` serializes into.
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(shard_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;  // Created with prefix "Iterator".
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The iterator's output dtypes must match the test case's expectation.
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;  // Created with prefix "Iterator".
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The iterator's output shapes must be compatible with the expected ones.
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_F(ShardDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = TestCase1();  // Single fixed case (TEST_F).
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;  // Created with prefix "Iterator".
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Expected prefix: the MakeIterator() prefix followed by "::Shard".
+  EXPECT_EQ(iterator->prefix(), "Iterator::Shard");
+}
+
+TEST_P(ParameterizedShardDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TestCase test_case = GetParam();  // Runs once per instantiated TestCase.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;  // Created with prefix "Iterator".
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;  // Everything read across all restores.
+  int cur_iteration = 0;  // Number of GetNext() calls made so far.
+  const std::vector<int>& breakpoints = test_case.breakpoints;
+  for (int breakpoint : breakpoints) {  // Save + restore, then read on.
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);  // Reads back what was just saved.
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *shard_dataset, &iterator));
+
+    while (cur_iteration <= breakpoint) {  // Read through index `breakpoint`.
+      std::vector<Tensor> next;  // Output of a single GetNext() call.
+      TF_EXPECT_OK(
+          iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+      out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+      cur_iteration++;
+    }
+  }
+  // All elements read across the restores must match the expectation in order.
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs,
+                           /*compare_order*/ true));
+}
+
+INSTANTIATE_TEST_SUITE_P(ShardDatasetOpTest, ParameterizedShardDatasetOpTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {TestCase1(), TestCase2(), TestCase3(),
+                              TestCase4(), TestCase5(), TestCase6(),
+                              TestCase7()})));  // Error cases: TEST_Fs below.
+
+TEST_F(ShardDatasetOpTest, InvalidArguments) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::vector<TestCase> test_cases = {
+      IndexGreaterNumShardsCase(), NegativeIndexTestCase(),
+      NegativeNumShardsTestCase(), ZeroNumShardsTestCase()};
+  for (const auto& test_case : test_cases) {  // Fresh kernel/inputs per case.
+    std::unique_ptr<OpKernel> shard_dataset_kernel;
+    TF_ASSERT_OK(CreateShardDatasetOpKernel(
+        test_case.require_non_empty, test_case.expected_output_dtypes,
+        test_case.expected_output_shapes, &shard_dataset_kernel));
+
+    DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+    TF_ASSERT_OK(CreateRangeDataset<int64>(
+        test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+        test_case.range_dataset_param.step, "range", &range_dataset));
+    Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+    TF_ASSERT_OK(
+        StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+    Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+    Tensor index = test_case.index;            // are passed through `inputs`.
+    gtl::InlinedVector<TensorValue, 4> inputs(
+        {&range_dataset_tensor, &num_shards, &index});
+    std::unique_ptr<OpKernelContext> shard_dataset_context;
+    TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                           &shard_dataset_context));
+    // CreateDataset must reject every case with INVALID_ARGUMENT.
+    DatasetBase* shard_dataset;  // No ScopedUnref: creation should fail.
+    EXPECT_EQ(CreateDataset(shard_dataset_kernel.get(),
+                            shard_dataset_context.get(), &shard_dataset)
+                  .code(),
+              tensorflow::error::INVALID_ARGUMENT);
+  }
+}
+
+TEST_F(ShardDatasetOpTest, NoElemForEachShard) {
+  int thread_num = 2, cpu_num = 2;  // Arguments for the two Init* calls below.
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TestCase test_case = NoElemForEachShardTestCase();  // Single fixed case.
+
+  std::unique_ptr<OpKernel> shard_dataset_kernel;
+  TF_ASSERT_OK(CreateShardDatasetOpKernel(
+      test_case.require_non_empty, test_case.expected_output_dtypes,
+      test_case.expected_output_shapes, &shard_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Input dataset fed to the shard op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_dataset_param.start, test_case.range_dataset_param.end,
+      test_case.range_dataset_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+
+  Tensor num_shards = test_case.num_shards;  // Local copies whose addresses
+  Tensor index = test_case.index;            // are passed through `inputs`.
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &num_shards, &index});
+  std::unique_ptr<OpKernelContext> shard_dataset_context;
+  TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs,
+                                         &shard_dataset_context));
+
+  DatasetBase* shard_dataset;  // NOTE(review): guard below is misnamed "batch".
+  TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(),
+                             shard_dataset_context.get(), &shard_dataset));
+  core::ScopedUnref scoped_unref_batch_dataset(shard_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;  // Created with prefix "Iterator".
+  TF_ASSERT_OK(
+      shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  // Dataset creation succeeds; the first GetNext() must fail instead.
+  EXPECT_EQ(
+      iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)
+          .code(),
+      tensorflow::error::INVALID_ARGUMENT);
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index f426e3c..287a7c9 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -63,7 +63,15 @@
       return input_->output_shapes();
     }
 
-    int64 Cardinality() const override { return input_->Cardinality(); }
+    int64 Cardinality() const override {
+      if (count_ == -1 || input_->Cardinality() == kInfiniteCardinality) {
+        return kInfiniteCardinality;  // count_ of -1 or an infinite input.
+      }
+      if (input_->Cardinality() == kUnknownCardinality) {
+        return kUnknownCardinality;  // An unknown input size stays unknown.
+      }
+      return count_ * input_->Cardinality();  // Input size scaled by count_.
+    }
 
    protected:
     template <class T>
@@ -645,6 +653,10 @@
     int64 count;
     OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
 
+    OP_REQUIRES(ctx, count > 0 || count == -1,
+                errors::InvalidArgument(
+                    "count must be greater than zero or equal to -1."));
+
     // By TensorFlow convention, if both seeds are 0, then shuffling should be
     // seeded non-deterministically.
     if (seed == 0 && seed2 == 0) {
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc b/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc
new file mode 100644
index 0000000..38b93f1
--- /dev/null
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc
@@ -0,0 +1,915 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kShuffleNodeName[] = "shuffle_dataset";  // NDef() node name.
+constexpr char kShuffleOpName[] = "ShuffleDataset";     // NDef() op name.
+constexpr char kShuffleAndRepeatNodeName[] = "shuffle_and_repeat_dataset";
+constexpr char kShuffleAndRepeatOpName[] = "ShuffleAndRepeatDataset";
+
+class ShuffleDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Builds the kernel under test: `ShuffleDataset` when `count` is 1 (the only
+  // variant given `reshuffle_each_iteration`), `ShuffleAndRepeatDataset`
+  // otherwise (the only variant given a "count" input).
+  Status CreateDatasetOpKernel(
+      int64 count, bool reshuffle_each_iteration,
+      const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      std::unique_ptr<OpKernel>* shuffle_dataset_kernel) {
+    if (count != 1) {
+      NodeDef repeat_def = test::function::NDef(
+          kShuffleAndRepeatNodeName, kShuffleAndRepeatOpName,
+          {"input_dataset", "buffer_size", "seed", "seed2", "count"},
+          {{"output_types", output_types}, {"output_shapes", output_shapes}});
+      return CreateOpKernel(repeat_def, shuffle_dataset_kernel);
+    }
+    NodeDef shuffle_def = test::function::NDef(
+        kShuffleNodeName, kShuffleOpName,
+        {"input_dataset", "buffer_size", "seed", "seed2"},
+        {{"reshuffle_each_iteration", reshuffle_each_iteration},
+         {"output_types", output_types},
+         {"output_shapes", output_shapes}});
+    return CreateOpKernel(shuffle_def, shuffle_dataset_kernel);
+  }
+
+  // Validates `inputs` against `op_kernel`, then creates the kernel context.
+  Status CreateDatasetContext(OpKernel* const op_kernel,
+                              gtl::InlinedVector<TensorValue, 4>* const inputs,
+                              std::unique_ptr<OpKernelContext>* context) {
+    Status input_check = CheckOpKernelInput(*op_kernel, *inputs);
+    if (!input_check.ok()) return input_check;
+    return CreateOpKernelContext(op_kernel, inputs, context);
+  }
+};
+
+struct RangeDatasetParam {  // Arguments forwarded to CreateRangeDataset().
+  int64 start;
+  int64 end;
+  int64 step;
+};
+
+struct TestCase {  // Initialized positionally; keep the field order stable.
+  RangeDatasetParam range_data_param;  // Defines the input range dataset.
+  Tensor buffer_size;
+  Tensor seed;
+  Tensor seed2;
+  Tensor count;  // 1 selects ShuffleDataset; else ShuffleAndRepeatDataset.
+  bool reshuffle_each_iteration;  // Kernel attr; used only when count is 1.
+  std::vector<Tensor> expected_shuffle_outputs;
+  std::vector<Tensor> expected_reshuffle_outputs;
+  DataTypeVector expected_output_dtypes;  // Also passed as a kernel attr.
+  std::vector<PartialTensorShape> expected_output_shapes;  // Likewise.
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+template <typename T>  // One scalar (shape `{}`) tensor per input element.
+std::vector<Tensor> ConvertToTensorVec(std::vector<T> values) {
+  std::vector<Tensor> scalars;
+  scalars.reserve(values.size());  // Output length is known up front.
+  for (const T& element : values) {
+    scalars.push_back(
+        DatasetOpsTestBase::CreateTensor<T>(TensorShape({}), {element}));
+  }
+  return scalars;
+}
+
+// Test case 1: shuffle_dataset (count = 1), reshuffle_each_iteration = false.
+TestCase TestCase1() {
+  return {  // The shuffle and reshuffle expectations below are identical.
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ false,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({2, 3, 0, 5, 6, 4, 7, 8, 9, 1}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({2, 3, 0, 5, 6, 4, 7, 8, 9, 1}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 10,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 2: shuffle_dataset (count = 1), reshuffle_each_iteration = true.
+TestCase TestCase2() {
+  return {  // buffer_size (10) equals the expected cardinality (10).
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({2, 6, 1, 3, 9, 5, 0, 8, 7, 4}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({1, 6, 0, 5, 2, 7, 4, 3, 9, 8}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 10,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 3: similar to test case 2, but with a buffer size (2) smaller than
+// the input dataset.
+TestCase TestCase3() {
+  return {
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({0, 2, 1, 3, 5, 6, 4, 7, 8, 9}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({1, 0, 2, 3, 4, 5, 6, 7, 9, 8}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 10,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 4: similar to test case 2, but with different seeds (2 and 2).
+TestCase TestCase4() {
+  return {
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({3, 0, 8, 1, 5, 4, 7, 2, 6, 9}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({4, 6, 9, 0, 1, 8, 2, 7, 3, 5}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 10,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 5: shuffle_dataset with buffer_size = 1 and
+// reshuffle_each_iteration = true; the expected output is the input order.
+TestCase TestCase5() {
+  return {
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 10,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 6: test shuffle_dataset with an empty input dataset (start == end).
+TestCase TestCase6() {
+  return {  // No outputs and a cardinality of 0 are expected.
+      /*range_data_param*/ {0, 0, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>({}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>({}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 7: shuffle_and_repeat_dataset with buffer_size = 10 and
+// count = 2, so the expected cardinality is 2 * 10 = 20.
+TestCase TestCase7() {
+  return {
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*reshuffle_each_iteration*/ false,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>(
+          {9, 0, 8, 6, 1, 3, 7, 2, 4, 5, 4, 3, 0, 5, 8, 2, 6, 9, 7, 1}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>(
+          {9, 0, 8, 6, 1, 3, 7, 2, 4, 5, 4, 3, 0, 5, 8, 2, 6, 9, 7, 1}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 20,
+      /*breakpoints*/ {0, 5, 22}};
+}
+
+// Test case 8: shuffle_and_repeat_dataset with buffer_size = 10 and
+// count = -1, for which the expected cardinality is kInfiniteCardinality.
+TestCase TestCase8() {
+  return {
+      /*range_data_param*/ {0, 3, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-1}),
+      /*reshuffle_each_iteration*/ false,
+      /*expected_shuffle_outputs*/
+      ConvertToTensorVec<int64>(
+          {2, 0, 1, 2, 0, 1, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1, 0, 2, 2, 1, 0}),
+      /*expected_reshuffle_outputs*/
+      ConvertToTensorVec<int64>(
+          {2, 0, 1, 2, 0, 1, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1, 0, 2, 2, 1, 0}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ kInfiniteCardinality,
+      /*breakpoints*/ {0, 5, 20}};
+}
+
+TestCase InvalidBufferSizeTestCaseForShuffleDataset() {
+  return {  // buffer_size = -1; count = 1 selects the ShuffleDataset kernel.
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-1}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/ ConvertToTensorVec<int64>({}),
+      /*expected_reshuffle_outputs*/ ConvertToTensorVec<int64>({}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+TestCase InvalidBufferSizeTestCaseForShuffleAndRepeatDataset() {
+  return {  // buffer_size = -1; count = 2 selects ShuffleAndRepeatDataset.
+      /*range_data_param*/ {0, 10, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-1}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*reshuffle_each_iteration*/ true,
+      /*expected_shuffle_outputs*/ ConvertToTensorVec<int64>({}),
+      /*expected_reshuffle_outputs*/ ConvertToTensorVec<int64>({}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+TestCase InvalidCountTestCaseForShuffleAndRepeatDataset() {
+  return {  // count == 0 is the invalid input here; entries are positional.
+      /*range_data_param*/ {0, 3, 1},
+      /*buffer_size*/
+      DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {10}),
+      /*seed*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*seed2*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*count*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+      /*reshuffle_each_iteration*/ false,
+      /*expected_shuffle_outputs (none: dataset creation should fail)*/
+      ConvertToTensorVec<int64>({}),
+      /*expected_reshuffle_outputs (none: dataset creation should fail)*/
+      ConvertToTensorVec<int64>({}),
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 5, 20}};
+}
+
+class ParameterizedShuffleDatasetOpTest
+    : public ShuffleDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};  // TEST_P fixture
+
+TEST_P(ParameterizedShuffleDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Drains two iterators made from one dataset and checks each output order.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // First pass over the dataset.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // GetNext must ASSERT: after an error end_of_sequence may never become true.
+  bool end_of_sequence = false;
+  std::vector<Tensor> shuffled_out_tensors;
+  while (!end_of_sequence) {
+    std::vector<Tensor> next;
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+    shuffled_out_tensors.insert(shuffled_out_tensors.end(), next.begin(),
+                                next.end());
+    // For the forever-repeat case, we test only a finite number of steps of
+    // the infinite sequence.
+    if (count_value == -1 && shuffled_out_tensors.size() ==
+                                 test_case.expected_shuffle_outputs.size()) {
+      break;
+    }
+  }
+
+  // Reshuffle the dataset.
+  end_of_sequence = false;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  std::vector<Tensor> reshuffled_out_tensors;
+  while (!end_of_sequence) {
+    std::vector<Tensor> next;
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+    reshuffled_out_tensors.insert(reshuffled_out_tensors.end(), next.begin(),
+                                  next.end());
+    // For the forever-repeat case, we test only a finite number of steps of
+    // the infinite sequence; stop at the size of the reshuffle expectation.
+    if (count_value == -1 && reshuffled_out_tensors.size() ==
+                                 test_case.expected_reshuffle_outputs.size()) {
+      break;
+    }
+  }
+  // Element order is significant for both passes, hence compare_order.
+  TF_EXPECT_OK(ExpectEqual(shuffled_out_tensors,
+                           test_case.expected_shuffle_outputs,
+                           /*compare_order*/ true));
+  TF_EXPECT_OK(ExpectEqual(reshuffled_out_tensors,
+                           test_case.expected_reshuffle_outputs,
+                           /*compare_order*/ true));
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks node_name(): count 1 -> shuffle node, else shuffle-and-repeat.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // A count of exactly 1 is expected to yield the plain shuffle dataset.
+  if (count_value == 1) {
+    EXPECT_EQ(dataset->node_name(), kShuffleNodeName);
+  } else {
+    EXPECT_EQ(dataset->node_name(), kShuffleAndRepeatNodeName);
+  }
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks type_string(): count 1 -> shuffle op, else shuffle-and-repeat.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // A count of exactly 1 is expected to yield the plain shuffle dataset.
+  if (count_value == 1) {
+    EXPECT_EQ(dataset->type_string(), kShuffleOpName);
+  } else {
+    EXPECT_EQ(dataset->type_string(), kShuffleAndRepeatOpName);
+  }
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks that the dataset reports the expected output dtypes.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  TF_EXPECT_OK(VerifyTypesMatch(dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks dataset output shapes are compatible with the expected ones.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks Cardinality() against the test case's expected_cardinality.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  EXPECT_EQ(dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks that Save() on the dataset and the writer Flush() both succeed.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Only success of the save is checked; the written data is not inspected.
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks that the iterator reports the expected output dtypes.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Make the iterator whose metadata is checked below.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks iterator output shapes are compatible with the expected ones.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Make the iterator whose metadata is checked below.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Checks the iterator prefix for the shuffle / shuffle-and-repeat cases.
+  Tensor count = test_case.count;
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // The iterator is made with the parent prefix "Iterator".
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // A count of exactly 1 is expected to yield the plain shuffle iterator.
+  if (count_value == 1) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::Shuffle");
+  } else {
+    EXPECT_EQ(iterator->prefix(), "Iterator::ShuffleAndRepeat");
+  }
+}
+
+TEST_P(ParameterizedShuffleDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Saves and restores the iterator before each breakpoint, then checks
+  Tensor count = test_case.count;  // the collected outputs are unchanged.
+  int64 count_value = count.flat<int64>()(0);
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(
+      CreateDatasetOpKernel(count_value, test_case.reshuffle_each_iteration,
+                            test_case.expected_output_dtypes,
+                            test_case.expected_output_shapes, &dataset_kernel));
+  // Wrap a range dataset in a variant tensor; it is the first kernel input.
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor buffer_size = test_case.buffer_size;
+  Tensor seed = test_case.seed;
+  Tensor seed2 = test_case.seed2;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+  if (count_value != 1) inputs.push_back(&count);
+  // Create the dataset under test; ScopedUnref drops our reference at exit.
+  std::unique_ptr<OpKernelContext> dataset_context;
+  TF_ASSERT_OK(
+      CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_context.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // This iterator is replaced by a restored one at every breakpoint below.
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  // cur_iteration counts GetNext calls across all breakpoints.
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  const std::vector<int>& breakpoints = test_case.breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *dataset, &iterator));
+    // Advance until GetNext has been called breakpoint + 1 times in total.
+    while (cur_iteration <= breakpoint) {
+      std::vector<Tensor> next;
+      TF_EXPECT_OK(
+          iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence));
+      out_tensors.insert(out_tensors.end(), next.begin(), next.end());
+      cur_iteration++;
+    }
+  }
+
+  TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_shuffle_outputs,
+                           /*compare_order*/ true));
+}
+
+INSTANTIATE_TEST_SUITE_P(ShuffleDatasetOpTest,
+                         ParameterizedShuffleDatasetOpTest,
+                         ::testing::Values(TestCase1(), TestCase2(),
+                                           TestCase3(), TestCase4(),
+                                           TestCase5(), TestCase6(),
+                                           TestCase7(), TestCase8()));
+
+TEST_F(ShuffleDatasetOpTest, InvalidArguments) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<TestCase> test_cases = {
+      InvalidBufferSizeTestCaseForShuffleDataset(),
+      InvalidBufferSizeTestCaseForShuffleAndRepeatDataset(),
+      InvalidCountTestCaseForShuffleAndRepeatDataset()};
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Every case carries one invalid input, so CreateDataset must fail for all.
+  for (const auto& test_case : test_cases) {
+    Tensor count = test_case.count;
+    int64 count_value = count.flat<int64>()(0);
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(CreateDatasetOpKernel(
+        count_value, test_case.reshuffle_each_iteration,
+        test_case.expected_output_dtypes, test_case.expected_output_shapes,
+        &dataset_kernel));
+    // Wrap a range dataset in a variant tensor; it is the first kernel input.
+    DatasetBase* range_dataset;
+    TF_ASSERT_OK(CreateRangeDataset<int64>(
+        test_case.range_data_param.start, test_case.range_data_param.end,
+        test_case.range_data_param.step, "range", &range_dataset));
+    Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+    TF_ASSERT_OK(
+        StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+    Tensor buffer_size = test_case.buffer_size;
+    Tensor seed = test_case.seed;
+    Tensor seed2 = test_case.seed2;
+    gtl::InlinedVector<TensorValue, 4> inputs(
+        {&range_dataset_tensor, &buffer_size, &seed, &seed2});
+    if (count_value != 1) inputs.push_back(&count);
+    // Context creation must succeed; only dataset creation is rejected.
+    std::unique_ptr<OpKernelContext> dataset_context;
+    TF_ASSERT_OK(
+        CreateDatasetContext(dataset_kernel.get(), &inputs, &dataset_context));
+    DatasetBase* shuffle_dataset;
+    EXPECT_EQ(CreateDataset(dataset_kernel.get(), dataset_context.get(),
+                            &shuffle_dataset)
+                  .code(),
+              tensorflow::error::INVALID_ARGUMENT);
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 97a1ec4..bae1530 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -129,23 +129,28 @@
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (i_ < n_) {
-          out_tensors->clear();
-          out_tensors->reserve(dataset()->tensors_.size());
-          for (int i = 0; i < dataset()->tensors_.size(); ++i) {
-            const Tensor& t = dataset()->tensors_[i];
-            out_tensors->emplace_back(
-                ctx->allocator({}), t.dtype(),
-                TensorShape(dataset()->shapes_[i].dim_sizes()));
-            TF_RETURN_IF_ERROR(
-                batch_util::CopySliceToElement(t, &out_tensors->back(), i_));
+        int64 index = 0;  // slice position claimed by this call under mu_
+        {
+          mutex_lock l(mu_);  // held only while reading and advancing i_
+          if (i_ < n_) {
+            index = i_;
+            ++i_;
+          } else {
+            *end_of_sequence = true;
+            return Status::OK();
           }
-          ++i_;
-          *end_of_sequence = false;
-        } else {
-          *end_of_sequence = true;
         }
+        out_tensors->clear();  // lock released: slice `index` is copied below
+        out_tensors->reserve(dataset()->tensors_.size());
+        for (int i = 0; i < dataset()->tensors_.size(); ++i) {
+          const Tensor& t = dataset()->tensors_[i];
+          out_tensors->emplace_back(
+              ctx->allocator({}), t.dtype(),
+              TensorShape(dataset()->shapes_[i].dim_sizes()));
+          TF_RETURN_IF_ERROR(
+              batch_util::CopySliceToElement(t, &out_tensors->back(), index));
+        }
+        *end_of_sequence = false;
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index 0b24c11..150385c 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -103,8 +103,18 @@
       if (n == kInfiniteCardinality || n == kUnknownCardinality) {
         return n;
       }
-      return n / window_shift_ +
-             (n % window_shift_ == 0 || drop_remainder_ ? 0 : 1);
+      int64 cardinality = 0;
+      if (drop_remainder_) {
+        // Compute rest_elements, the number of elements after the last element
+        // of the initial window. If it is negative, we know that the
+        // cardinality is 0. Otherwise, it will be the number of valid shifts
+        // over the rest_elements.
+        int64 rest_elements = n - ((window_size_ - 1) * window_stride_ + 1);
+        cardinality = rest_elements < 0 ? 0 : rest_elements / window_shift_ + 1;
+      } else {
+        cardinality = n / window_shift_ + (n % window_shift_ == 0 ? 0 : 1);
+      }
+      return cardinality;
     }
 
    protected:
diff --git a/tensorflow/core/kernels/data/window_dataset_op_test.cc b/tensorflow/core/kernels/data/window_dataset_op_test.cc
new file mode 100644
index 0000000..97debfd
--- /dev/null
+++ b/tensorflow/core/kernels/data/window_dataset_op_test.cc
@@ -0,0 +1,883 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "window_dataset";
+constexpr char kOpName[] = "WindowDataset";
+
+class WindowDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Builds a `WindowDataset` op kernel from a NodeDef carrying the given attrs.
+  Status CreateWindowDatasetKernel(
+      const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      std::unique_ptr<OpKernel>* op_kernel) {
+    NodeDef window_node = test::function::NDef(
+        kNodeName, kOpName,
+        {"input_dataset", "size", "shift", "stride", "drop_remainder"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    // Hand the creation status (OK or error) directly back to the caller.
+    return CreateOpKernel(window_node, op_kernel);
+  }
+
+  // Runs CheckOpKernelInput on `inputs`, then builds the op kernel context.
+  Status CreateWindowDatasetContext(
+      OpKernel* const op_kernel,
+      gtl::InlinedVector<TensorValue, 4>* const inputs,
+      std::unique_ptr<OpKernelContext>* context) {
+    const Status input_check = CheckOpKernelInput(*op_kernel, *inputs);
+    if (!input_check.ok()) return input_check;
+    return CreateOpKernelContext(op_kernel, inputs, context);
+  }
+};
+
+struct RangeDatasetParam {  // Arguments forwarded to CreateRangeDataset<int64>.
+  int64 start;
+  int64 end;
+  int64 step;
+};
+
+struct TestCase {
+  RangeDatasetParam range_data_param;  // Describes the input range dataset.
+  Tensor size;            // Op input `size`.
+  Tensor shift;           // Op input `shift`.
+  Tensor stride;          // Op input `stride`.
+  Tensor drop_remainder;  // Op input `drop_remainder`.
+  std::vector<std::vector<Tensor>> expected_outputs;  // One vector per window.
+  DataTypeVector expected_output_dtypes;  // Doubles as the `output_types` attr.
+  std::vector<PartialTensorShape> expected_output_shapes;  // Ditto for shapes.
+  int64 expected_cardinality;  // Compared against dataset->Cardinality().
+  std::vector<int> breakpoints;  // Presumably Roundtrip save points; confirm.
+};
+
+// Test case 1: size=2, shift=2, stride=1, drop_remainder=false.
+TestCase TestCase1() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 4,  // 7 / 2 + 1, since 7 % 2 != 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 2: size=2, shift=2, stride=2, drop_remainder=true.
+TestCase TestCase2() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 3,  // (7 - ((2 - 1) * 2 + 1)) / 2 + 1.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 3: size=8, shift=3, stride=1, drop_remainder=false.
+TestCase TestCase3() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 3,  // 7 / 3 + 1, since 7 % 3 != 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 4: size=8, shift=3, stride=1, drop_remainder=true.
+TestCase TestCase4() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // 7 - ((8 - 1) * 1 + 1) < 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 5: size=2, shift=8, stride=1, drop_remainder=false.
+TestCase TestCase5() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 1,  // 7 / 8 + 1, since 7 % 8 != 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 6: size=2, shift=8, stride=1, drop_remainder=true.
+TestCase TestCase6() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 1,  // (7 - ((2 - 1) * 1 + 1)) / 8 + 1.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 7: size=2, shift=2, stride=8, drop_remainder=false.
+TestCase TestCase7() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {false}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 4,  // 7 / 2 + 1, since 7 % 2 != 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 8: size=2, shift=2, stride=8, drop_remainder=true.
+TestCase TestCase8() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {8}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // 7 - ((2 - 1) * 8 + 1) < 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 9: size=4, shift=2, stride=2, drop_remainder=true.
+TestCase TestCase9() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {6})}},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 1,  // (7 - ((4 - 1) * 2 + 1)) / 2 + 1.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 10: size=5, shift=2, stride=2, drop_remainder=true.
+TestCase TestCase10() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // 7 - ((5 - 1) * 2 + 1) < 0.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 11: invalid size. size=0, shift=2, stride=2, drop_remainder=true.
+TestCase InvalidWindowSizeTestCase() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // Presumably unchecked for invalid args.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 12: invalid shift. size=2, shift=0, stride=2, drop_remainder=true.
+TestCase InvalidWindowShiftTestCase() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // Presumably unchecked for invalid args.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+// Test case 13: invalid stride. size=2, shift=2, stride=0, drop_remainder=true.
+TestCase InvalidWindowStrideTestCase() {
+  return {
+      /*range_data_param*/ {0, 7, 1},
+      /*size*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*shift*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+      /*stride*/ DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}),
+      /*drop_remainder*/
+      DatasetOpsTestBase::CreateTensor<bool>(TensorShape({}), {true}),
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_VARIANT},
+      /*expected_output_shapes*/ {PartialTensorShape({})},
+      /*expected_cardinality*/ 0,  // Presumably unchecked for invalid args.
+      /*breakpoints*/ {0, 1, 9}};
+}
+
+class ParameterizedWindowDatasetOpTest
+    : public WindowDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};  // Enables GetParam().
+
+TEST_P(ParameterizedWindowDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  // Verifies that every produced window matches expected_outputs, in order.
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(window_dataset_op_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  bool end_of_sequence = false;
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  while (!end_of_sequence) {
+    // Owns the window datasets, stored as variant tensors. A GetNext failure
+    // aborts the test: end_of_sequence might otherwise never become true.
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      for (const auto& window_dataset_tensor : out_tensors) {
+        // Not owned.
+        DatasetBase* window_dataset;
+        TF_ASSERT_OK(GetDatasetFromVariantTensor(window_dataset_tensor,
+                                                 &window_dataset));
+        std::unique_ptr<IteratorBase> window_dataset_iterator;
+        TF_ASSERT_OK(window_dataset->MakeIterator(
+            iterator_ctx.get(), "Iterator", &window_dataset_iterator));
+        bool end_of_window_dataset = false;
+        std::vector<Tensor> window_elements;
+        // Fetches all elements of window_dataset; stop the test on any error.
+        while (!end_of_window_dataset) {
+          std::vector<Tensor> next_element;
+          TF_ASSERT_OK(window_dataset_iterator->GetNext(
+              iterator_ctx.get(), &next_element, &end_of_window_dataset));
+          window_elements.insert(window_elements.end(), next_element.begin(),
+                                 next_element.end());
+        }
+        ASSERT_LT(expected_outputs_it, test_case.expected_outputs.end());
+        TF_EXPECT_OK(ExpectEqual(window_elements, *expected_outputs_it, false));
+        expected_outputs_it++;  // Never past end(): guarded by ASSERT_LT.
+      }
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_F(WindowDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  EXPECT_EQ(dataset->node_name(), kNodeName);  // Name passed to NDef().
+}
+
+TEST_F(WindowDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  EXPECT_EQ(dataset->type_string(), kOpName);  // Op name passed to NDef().
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Dataset-level output dtypes are checked against the expected ones.
+  TF_EXPECT_OK(VerifyTypesMatch(dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Dataset-level output shapes must be compatible with the expected ones.
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Cardinality() must agree with the per-case expectation.
+  EXPECT_EQ(dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+  // Save into a VariantTensorData; only the returned statuses are checked.
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(window_dataset_op_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Iterator-level output dtypes are checked against the expected ones.
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = GetParam();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(window_dataset_op_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // Iterator-level output shapes must be compatible with the expected ones.
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_F(WindowDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TestCase test_case = TestCase1();
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> window_dataset_kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &window_dataset_kernel));
+
+  DatasetBase* range_dataset;  // Upstream dataset fed to the window op.
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      test_case.range_data_param.start, test_case.range_data_param.end,
+      test_case.range_data_param.step, "range", &range_dataset));
+  Tensor range_dataset_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(
+      StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor));
+  Tensor size = test_case.size;
+  Tensor shift = test_case.shift;
+  Tensor stride = test_case.stride;
+  Tensor drop_remainder = test_case.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(  // In NodeDef input order.
+      {&range_dataset_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> window_dataset_op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(window_dataset_kernel.get(), &inputs,
+                                          &window_dataset_op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(window_dataset_kernel.get(),
+                             window_dataset_op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(window_dataset_op_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  // The prefix handed to MakeIterator ("Iterator") gains a "::Window" suffix.
+  EXPECT_EQ(iterator->prefix(), "Iterator::Window");
+}
+
+TEST_P(ParameterizedWindowDatasetOpTest, Roundtrip) {
+  // Saves and restores the iterator at each breakpoint of the test case and
+  // checks that the windows it produces match the expected outputs.
+  TestCase params = GetParam();
+  int num_threads = 2, num_cpus = 2;
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  std::unique_ptr<OpKernel> kernel;
+  TF_ASSERT_OK(CreateWindowDatasetKernel(
+      params.expected_output_dtypes, params.expected_output_shapes, &kernel));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(
+      params.range_data_param.start, params.range_data_param.end,
+      params.range_data_param.step, "range", &range_dataset));
+  Tensor input_tensor(DT_VARIANT, TensorShape({}));
+  TF_ASSERT_OK(StoreDatasetInVariantTensor(range_dataset, &input_tensor));
+  Tensor size = params.size;
+  Tensor shift = params.shift;
+  Tensor stride = params.stride;
+  Tensor drop_remainder = params.drop_remainder;
+  gtl::InlinedVector<TensorValue, 4> inputs(
+      {&input_tensor, &size, &shift, &stride, &drop_remainder});
+
+  std::unique_ptr<OpKernelContext> op_ctx;
+  TF_ASSERT_OK(CreateWindowDatasetContext(kernel.get(), &inputs, &op_ctx));
+  DatasetBase* dataset;
+  TF_ASSERT_OK(CreateDataset(kernel.get(), op_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref_dataset(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(op_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  // Fatal ASSERTs guard every step after which continuing is unsafe: a failed
+  // GetNext may leave its end flag unset (the loop would spin forever), and an
+  // exhausted expected-output iterator must not be dereferenced.
+  bool end_of_sequence = false;
+  auto expected_outputs_it = params.expected_outputs.begin();
+  int cur_iteration = 0;
+  for (int breakpoint : params.breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_ASSERT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_ASSERT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_ASSERT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator",
+                                 *dataset, &iterator));
+    while (cur_iteration <= breakpoint) {
+      while (!end_of_sequence) {
+        // Owns the datasets, which are stored as the variant tensors in the
+        // vector.
+        std::vector<Tensor> out_tensors;
+        TF_ASSERT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                       &end_of_sequence));
+        if (!end_of_sequence) {
+          for (const auto& window_dataset_tensor : out_tensors) {
+            // Not owned.
+            DatasetBase* window_dataset;
+            TF_ASSERT_OK(GetDatasetFromVariantTensor(window_dataset_tensor,
+                                                     &window_dataset));
+            std::unique_ptr<IteratorBase> window_dataset_iterator;
+            TF_ASSERT_OK(window_dataset->MakeIterator(
+                iterator_ctx.get(), "Iterator", &window_dataset_iterator));
+            bool end_of_window_dataset = false;
+            std::vector<Tensor> window_elements;
+            while (!end_of_window_dataset) {
+              std::vector<Tensor> next_element;
+              TF_ASSERT_OK(window_dataset_iterator->GetNext(
+                  iterator_ctx.get(), &next_element, &end_of_window_dataset));
+              window_elements.insert(window_elements.end(),
+                                     next_element.begin(), next_element.end());
+            }
+            ASSERT_LT(expected_outputs_it, params.expected_outputs.end());
+            TF_EXPECT_OK(
+                ExpectEqual(window_elements, *expected_outputs_it, false));
+            expected_outputs_it++;
+          }
+        }
+      }
+      cur_iteration++;
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, params.expected_outputs.end());
+}
+
+// Runs every ParameterizedWindowDatasetOpTest test once per test case.
+INSTANTIATE_TEST_SUITE_P(WindowDatasetOpTest, ParameterizedWindowDatasetOpTest,
+    ::testing::Values(TestCase1(), TestCase2(), TestCase3(), TestCase4(),
+                      TestCase5(), TestCase6(), TestCase7(), TestCase8(),
+                      TestCase9(), TestCase10()));
+
+TEST_F(WindowDatasetOpTest, InvalidArguments) {
+  // Creating the dataset must fail with INVALID_ARGUMENT for every case below.
+  int num_threads = 2, num_cpus = 2;
+  TF_ASSERT_OK(InitThreadPool(num_threads));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, num_cpus));
+
+  const std::vector<TestCase> invalid_cases = {InvalidWindowSizeTestCase(),
+                                               InvalidWindowShiftTestCase(),
+                                               InvalidWindowStrideTestCase()};
+  for (const TestCase& params : invalid_cases) {
+    std::unique_ptr<OpKernel> kernel;
+    TF_ASSERT_OK(CreateWindowDatasetKernel(params.expected_output_dtypes,
+                                           params.expected_output_shapes,
+                                           &kernel));
+    DatasetBase* input_dataset;
+    TF_ASSERT_OK(CreateRangeDataset<int64>(
+        params.range_data_param.start, params.range_data_param.end,
+        params.range_data_param.step, "range", &input_dataset));
+    Tensor input_tensor(DT_VARIANT, TensorShape({}));
+    TF_ASSERT_OK(StoreDatasetInVariantTensor(input_dataset, &input_tensor));
+    Tensor window_size = params.size;
+    Tensor window_shift = params.shift;
+    Tensor window_stride = params.stride;
+    Tensor drop_remainder = params.drop_remainder;
+    gtl::InlinedVector<TensorValue, 4> inputs(
+        {&input_tensor, &window_size, &window_shift, &window_stride,
+         &drop_remainder});
+
+    std::unique_ptr<OpKernelContext> op_ctx;
+    TF_ASSERT_OK(CreateWindowDatasetContext(kernel.get(), &inputs, &op_ctx));
+    DatasetBase* window_dataset;
+    const auto creation_status =
+        CreateDataset(kernel.get(), op_ctx.get(), &window_dataset);
+    EXPECT_EQ(creation_status.code(), tensorflow::error::INVALID_ARGUMENT);
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_padded_raw_op.cc b/tensorflow/core/kernels/decode_padded_raw_op.cc
new file mode 100644
index 0000000..1e6a0cb
--- /dev/null
+++ b/tensorflow/core/kernels/decode_padded_raw_op.cc
@@ -0,0 +1,139 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cmath>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// Kernel for the "DecodePaddedRaw" op. Each input string is reinterpreted as
+// a row of `T` values that is exactly fixed_length bytes long: longer strings
+// are truncated and shorter ones are padded with zero bytes. Bytes of every
+// element are reversed when the data and host byte orders differ.
+template <typename T>
+class DecodePaddedRawOp : public OpKernel {
+ public:
+  explicit DecodePaddedRawOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("out_type", &out_type_));
+
+    const bool host_is_little_endian = port::kLittleEndian;
+    bool data_is_little_endian;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("little_endian", &data_is_little_endian));
+    convert_data_endianness_ = host_is_little_endian != data_is_little_endian;
+  }
+
+  // Inputs: 0 is the string tensor to decode, 1 is the int32 scalar
+  // fixed_length. The output has the input's shape plus one trailing
+  // dimension of fixed_length / sizeof(T) elements.
+  void Compute(OpKernelContext* context) override {
+    const auto& input = context->input(0);
+    auto flat_in = input.flat<string>();
+
+    const auto& length_input = context->input(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(length_input.shape()),
+                errors::InvalidArgument(
+                    "fixed_length must be scalar, got shape ",
+                    length_input.shape().DebugString()));
+    const int fixed_length = length_input.scalar<int32>()();
+
+    // Validate the sign first so that the modulo below never operates on a
+    // negative value converted to an unsigned type.
+    OP_REQUIRES(context, fixed_length > 0,
+                errors::InvalidArgument("fixed_length (", fixed_length,
+                                        ") must be greater than zero."));
+    OP_REQUIRES(
+        context, fixed_length % sizeof(T) == 0,
+        errors::InvalidArgument(
+            "fixed_length (", fixed_length,
+            ") must be a multiple of the size of out_type (", sizeof(T), ")"));
+
+    // Number of T elements in each output row.
+    const int width = fixed_length / sizeof(T);
+
+    TensorShape out_shape = input.shape();
+    out_shape.AddDim(width);
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("output", out_shape, &output_tensor));
+
+    if (flat_in.size() == 0) {  // Empty input
+      return;
+    }
+
+    auto out = output_tensor->flat_inner_dims<T>();
+    T* out_data = out.data();
+
+    // Forcibly clear memory - we're going to copy variable length strings in,
+    // and need to ensure that if we don't write to byte N when we copy, that
+    // we're not getting random data.
+    memset(out_data, 0, fixed_length * flat_in.size());
+
+    for (int64 i = 0; i < flat_in.size(); ++i) {
+      // Copy at most fixed_length bytes, and never more than the string holds,
+      // so that neither the input nor the output row is overrun.
+      const size_t num_bytes =
+          std::min<size_t>(flat_in(i).size(), fixed_length);
+      char* row_bytes = reinterpret_cast<char*>(out_data);
+      memcpy(row_bytes, flat_in(i).data(), num_bytes);
+
+      // If the data is not in the host's byte order and elements are wider
+      // than one byte, reverse the bytes of each element that received data.
+      // A trailing partial element is reversed together with its zero padding.
+      if (convert_data_endianness_ && sizeof(T) > 1) {
+        char* const copied_end = row_bytes + num_bytes;
+        for (char* p = row_bytes; p < copied_end; p += sizeof(T)) {
+          std::reverse(p, p + sizeof(T));
+        }
+      }
+
+      // out_data is a T*, so step by elements (width) rather than by bytes.
+      out_data += width;
+    }
+  }
+
+ private:
+  // True if the endianness of the data and the endianness of the host are
+  // different, and the data needs conversion.
+  bool convert_data_endianness_;
+
+  // Data type of the output tensor.
+  DataType out_type_;
+};
+
+// Registers the CPU "DecodePaddedRaw" kernel for a single output type.
+#define REGISTER_DECODE_PADDED_RAW(out_t)                         \
+  REGISTER_KERNEL_BUILDER(Name("DecodePaddedRaw")                 \
+                              .Device(DEVICE_CPU)                 \
+                              .TypeConstraint<out_t>("out_type"), \
+                          DecodePaddedRawOp<out_t>)
+
+REGISTER_DECODE_PADDED_RAW(float);
+REGISTER_DECODE_PADDED_RAW(double);
+REGISTER_DECODE_PADDED_RAW(int32);
+REGISTER_DECODE_PADDED_RAW(uint16);
+REGISTER_DECODE_PADDED_RAW(uint8);
+REGISTER_DECODE_PADDED_RAW(int16);
+REGISTER_DECODE_PADDED_RAW(int8);
+REGISTER_DECODE_PADDED_RAW(int64);
+#undef REGISTER_DECODE_PADDED_RAW
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index d58fe39..e68fa40 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -29,8 +29,13 @@
 class DecodeRawOp : public OpKernel {
  public:
   explicit DecodeRawOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("little_endian", &little_endian_));
     OP_REQUIRES_OK(context, context->GetAttr("out_type", &out_type_));
+
+    const bool host_is_little_endian = port::kLittleEndian;
+    bool data_is_little_endian;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("little_endian", &data_is_little_endian));
+    convert_data_endianness_ = host_is_little_endian != data_is_little_endian;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -70,13 +75,18 @@
     auto out = output_tensor->flat_inner_dims<T>();
     DCHECK_EQ(flat_in.size(), out.dimensions()[0]);
     T* out_data = out.data();
-    if (port::kLittleEndian == little_endian_ || sizeof(T) == 1) {
+
+    // If the data is already in the host's byte order, or if the width of the
+    // output type is a single byte, we can copy the memory directly.
+    if (!convert_data_endianness_ || sizeof(T) == 1) {
       for (int64 i = 0; i < flat_in.size(); ++i) {
         const T* in_data = reinterpret_cast<const T*>(flat_in(i).data());
         memcpy(out_data, in_data, str_size);
         out_data += added_dim;
       }
     } else {
+      // Otherwise, the data is not in the host's byte order, and rather than a
+      // direct copy, we need to reverse the byte ordering of each element.
       for (int64 i = 0; i < flat_in.size(); ++i) {
         const char* in_data_bytes =
             reinterpret_cast<const char*>(flat_in(i).data());
@@ -92,7 +102,12 @@
   }
 
  private:
-  bool little_endian_;
+  // True if the endianness of the data and the endianness of the host are
+  // different, and the data needs conversion.
+  bool convert_data_endianness_;
+
+  // True if the input data is in little endian format.
+  bool data_is_little_endian_;
   DataType out_type_;
 };
 
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
index 3ed3794..4d7eafd 100644
--- a/tensorflow/core/kernels/dense_update_functor.cc
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -105,7 +105,7 @@
 
 INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define GPU_DENSE_COPY(T)                                                \
   case DataTypeToEnum<T>::value: {                                       \
     functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
@@ -121,7 +121,7 @@
                                 GPU_DENSE_COPY);
 #undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
 #undef GPU_DENSE_COPY
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #undef CPU_DENSE_COPY
 #undef INSTANTIATE_GET_VARIANT_COPY_FN
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 25c5738..daf8a73 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -72,4 +72,4 @@
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index f942b1a..c68f189 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -15,7 +15,7 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
 #endif
 
@@ -102,7 +102,7 @@
 TF_CALL_quint16(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Only register 'Assign' on GPU for the subset of types also supported by
 // 'Variable' (see variable_ops.cc.)
 #define REGISTER_GPU_KERNELS(type)                                 \
@@ -113,7 +113,7 @@
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_int64(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type)                                 \
@@ -136,7 +136,7 @@
 TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
@@ -147,7 +147,7 @@
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_int64(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-#endif  // end GOOGLE_CUDA
+#endif  // end GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type)                                    \
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
index a2afab4..324e7ac 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -21,6 +21,61 @@
 
 namespace internal {
 
+// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
+// provides `value` that is true if TensorEvaluatorType has `PacketType
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
+// const` and if the PacketType supports masked load.
+//
+// Partial packets are used to:
+//
+// 1) Split the packet over two columns and use partial loads for each
+//    individual part before combining them to get the required packet. This
+//    class is used to pick the correct implementation of the
+//    loadPacketStandard function below.
+//
+// 2) Finalize packing of columns in gemm_pack_colmajor after processing
+//    vectorized part with full packets (see eigen_spatial_convolutions.h).
+template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
+class TensorEvaluatorHasPartialPacket {
+ public:
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(
+      typename std::enable_if<
+          unpacket_traits<PacketT>::masked_load_available &&
+          std::is_same<PacketT,
+                       decltype(std::declval<const TensorEvaluatorT>()
+                                    .template partialPacket<PacketT>(
+                                        std::declval<IndexT>(),
+                                        std::declval<typename unpacket_traits<
+                                            PacketT>::mask_t>()))>::value>::
+          type*) -> std::true_type;
+  // Fallback overload, selected when the overload above is not viable.
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(...) -> std::false_type;
+  // Resolves to std::true_type or std::false_type via overload resolution.
+  typedef decltype(
+      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
+          nullptr)) status;
+
+  static const bool value = status::value;
+};
+
+// Computes a mask for loading/storing the coefficients of a packet that lie
+// in the [from, to) range. A coefficient is loaded/stored iff its bit is 1.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+                            typename unpacket_traits<Packet>::mask_t>::type
+    mask(int from, int to) {
+  const Index packet_size = internal::unpacket_traits<Packet>::size;
+  // `to` may not exceed packet_size: the shift count below would go negative.
+  eigen_assert(0 <= from && to <= packet_size && from < to);
+  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
+  const Mask mask_max = std::numeric_limits<Mask>::max();
+  // Selects bits [from, to); assumes mask_t has packet_size bits - confirm.
+  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
+}
+
 // WARNING: Most of the code here implicitly assumes that the matrix is in
 // ColMajor layout. This is guaranteed by the tensor contraction (see
 // TensorContraction.h).
@@ -91,6 +146,8 @@
   typedef SubMapper LinearMapper;
   typedef typename packet_traits<Scalar>::type Packet;
 
+  typedef TensorEvaluator<ArgType, Device> TensorEvaluatorT;
+
   EIGEN_DEVICE_FUNC
   TensorContractionInputMapper(
       const TensorEvaluator<
@@ -347,13 +404,137 @@
     if (nonStandardPatches()) {
       return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
     }
-    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
+    typedef decltype(m_impl) TensorEvaluatorT;
+    return loadPacketStandard<Packet, TensorEvaluatorT>(patchId, rowIndex,
+                                                        colIndex, otherIndex);
   }
 
+  // Helper function to load a 'partial' packet - this is the single column
+  // part of a packet that is split across two columns. In the 'partial' packet,
+  // the elements corresponding to the column (specified through colOffset) are
+  // loaded and the rest of the elements are zero-filled into the 'partial'
+  // packet. This function is called from loadPacketStandardFromTwoColumns().
+  // This code path is exercised only when the packet type supports masked load
+  // and when the partial packet load is available in the TensorEvaluator.
   EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
-                                                Index colIndex,
-                                                Index otherIndex) const {
+  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
+      Index rowIndex, Index colIndex, Index otherIndex, Index patchId,
+      const Index span[], const Index patchOffsets[], Index colOffset) const {
+    const Index inputCol = colIndex + colOffset;
+    const Index rowOffsets[2] = {patchOffsets[0] - colOffset * m_colStride,
+                                 patchOffsets[1] - colOffset * m_colStride};
+    const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                rowIndex + rowOffsets[1]};
+
+    if (inputRows[0] >= m_inputRows || inputRows[1] < 0 ||
+        inputCol >= m_inputCols || inputCol < 0) {
+      // Partial packet is all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    } else if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+      // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
+                               inputCol * m_colInputStride + otherIndex;
+      return m_impl.template partialPacket<Packet>(
+          inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
+    } else {
+      // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      // 0 : span[0]-1 - Zeros will be loaded for these indices
+      // span[0] : span[1] - Elements will be loaded here for these indices
+      // span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+      const Index packetSize = internal::unpacket_traits<Packet>::size;
+      EIGEN_ALIGN_MAX
+      typename internal::remove_const<Scalar>::type values[packetSize];
+      for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
+      for (int i = span[0]; i < span[1] + 1; ++i)
+        values[i] =
+            loadCoeff(patchId - span[0] + i, rowIndex, colIndex, otherIndex);
+      for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
+      return internal::pload<Packet>(values);
+    }
+  }
+
+  // Helper function to load a packet that is split across two adjacent
+  // columns (colOffsets[1] == colOffsets[0] + 1). If required, it is called
+  // from loadPacketStandard() when the packet type supports masked load and
+  // when the partial packet load is available in the TensorEvaluator.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromTwoColumns(
+      Index patchId, Index rowIndex, Index colIndex, Index otherIndex,
+      const Index patchOffsets[], const Index colOffsets[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] + 1);
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+    // Packet to load will be split into 2 parts where each part spans a single
+    // column. First determine where to split.
+    const Index patchIdSplit =
+        ((colOffsets[1] * m_colStride) * m_rowInputStride) - 1;
+    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+    // patchIds[i]:          patchId corresponding to partial packet i
+    // spans[i]:             Start and end indices (both inclusive) of the
+    //                       elements to be loaded for partial packet i
+    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+    const Index patchIds[2] = {patchId, patchIdSplit + 1};
+    const Index spans[2][2] = {{0, patchIdSplit - patchId},
+                               {patchIdSplit - patchId + 1, packetSize - 1}};
+    const Index patchOffsets2Cols[2][2] = {
+        {patchOffsets[0], patchOffsetSplit},
+        {patchOffsetSplit + 1, patchOffsets[1]}};
+
+    // Load partial packets and do bit-wise OR to generate required packet
+    return internal::por<Packet>(
+        loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0],
+                                  spans[0], patchOffsets2Cols[0],
+                                  colOffsets[0]),
+        loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1],
+                                  spans[1], patchOffsets2Cols[1],
+                                  colOffsets[1]));
+  }
+
+  // Helper function to load a packet that is present in a single column.
+  // If required, this function is called from loadPacketStandard().
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumn(
+      Index patchId, Index rowIndex, Index colIndex, Index otherIndex,
+      const Index patchOffsets[], const Index colOffsets[],
+      const Index inputCols[]) const {
+    eigen_assert(colOffsets[0] == colOffsets[1]);
+    const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride,
+                                 patchOffsets[1] - colOffsets[1] * m_colStride};
+    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+    const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                rowIndex + rowOffsets[1]};
+
+    if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+      // Both row bounds fall outside the input rows: the packet is all zeros.
+      return internal::pset1<Packet>(Scalar(0));
+    }
+
+    if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+      // Both row bounds are inside the input rows: load directly, no padding.
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
+                               inputCols[0] * m_colInputStride + otherIndex;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is not available
+  // for the TensorEvaluator or if the packet type does not support masked
+  // load.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index rowIndex, Index colIndex,
+                     Index otherIndex) const {
     const Index packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
@@ -362,43 +543,77 @@
 
     if ((patchDepth() % packetSize) == 0) {
       return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
-    } else {
-      // Offsets and input calculation here are identical to
-      // loadCoeffStandard(...), but repeated twice.
+    }
 
-      const Index patchOffsets[2] = {
-          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+    // Offsets and input calculation here are identical to
+    // loadCoeffStandard(...), but repeated twice.
+    const Index patchOffsets[2] = {patchId / m_fastDimZero,
+                                   (patchId + packetSize - 1) / m_fastDimZero};
+    const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                 patchOffsets[1] / m_fastColStride};
+    const Index inputCols[2] = {colIndex + colOffsets[0],
+                                colIndex + colOffsets[1]};
 
-      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
-                                   patchOffsets[1] / m_fastColStride};
-      const Index inputCols[2] = {colIndex + colOffsets[0],
-                                  colIndex + colOffsets[1]};
-      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
-        // all zeros
-        return internal::pset1<Packet>(Scalar(0));
-      }
+    if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+      // all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    if (inputCols[0] == inputCols[1]) {
+      return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex,
+                                                otherIndex, patchOffsets,
+                                                colOffsets, inputCols);
+    }
+    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+  }
 
-      if (inputCols[0] == inputCols[1]) {
-        const Index rowOffsets[2] = {
-            patchOffsets[0] - colOffsets[0] * m_colStride,
-            patchOffsets[1] - colOffsets[1] * m_colStride};
-        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-        const Index inputRows[2] = {rowIndex + rowOffsets[0],
-                                    rowIndex + rowOffsets[1]};
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is available for
+  // the TensorEvaluator and if the packet type supports masked load.
+  // The only difference between this and the other case is that if the packet
+  // to load is split across two columns, then in this case instead of going to
+  // the slow (element-by-element) load, we load two packets - each containing
+  // elements from one of the columns (rest of the elements of the packets are
+  // zeroes), and then combine these two packets to generate the required
+  // packet. The idea is to enable fast load (if possible) of these 'partial'
+  // packets.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index rowIndex, Index colIndex,
+                     Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<PacketT>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
 
-        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
-          // all zeros
-          return internal::pset1<Packet>(Scalar(0));
-        }
+    eigen_assert(!nonStandardPatches());
 
-        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
-          // no padding
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
-                                   inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
-      }
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+    }
+
+    // Offsets and input calculation here are identical to
+    // loadCoeffStandard(...), but repeated twice.
+    const Index patchOffsets[2] = {patchId / m_fastDimZero,
+                                   (patchId + packetSize - 1) / m_fastDimZero};
+    const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                 patchOffsets[1] / m_fastColStride};
+    const Index inputCols[2] = {colIndex + colOffsets[0],
+                                colIndex + colOffsets[1]};
+
+    if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+      // all zeros
+      return internal::pset1<PacketT>(Scalar(0));
+    }
+    if (inputCols[0] == inputCols[1]) {
+      return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex,
+                                                otherIndex, patchOffsets,
+                                                colOffsets, inputCols);
+    }
+    if (inputCols[1] == inputCols[0] + 1) {
+      return loadPacketStandardFromTwoColumns(
+          patchId, rowIndex, colIndex, otherIndex, patchOffsets, colOffsets);
     }
     return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
   }
@@ -545,6 +760,8 @@
 
   typedef Self LinearMapper;
 
+  typedef typename ParentMapper::TensorEvaluatorT TensorEvaluatorT;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
       const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
       : m_depth_offset(vert_offset),
@@ -591,8 +808,9 @@
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
   loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
-                                            m_colIndex, m_otherIndex);
+    typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
+    return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
+        i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC bool aligned(Index) const {
@@ -696,7 +914,16 @@
     const Index inputIndex = depth + baseIndex;
     return m_base_mapper.m_impl.coeff(inputIndex);
   }
-
+  template <typename PacketT = Packet>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  partialPacketNoPadding(const Index depth, const Index baseIndex,
+                         Index num_coeffs) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.template partialPacket<PacketT>(
+        inputIndex, mask<PacketT>(0, num_coeffs));
+  }
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
     const Index r = m_rowIndex + row;
@@ -1313,6 +1540,10 @@
  * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
  * pixels.
  *
+ * If padding_top, padding_bottom, padding_left, or padding_right is specified,
+ * then those paddings will be used to pad the input, and padding_type must be
+ * PADDING_VALID.
+ *
  * The result can be assigned to a tensor of rank equal to the rank of the
  * input. The dimensions of the result will be filters, height, width (and
  * others if applicable).
@@ -1360,7 +1591,9 @@
                        const PaddingType padding_type = PADDING_SAME,
                        const Index row_in_stride = 1,
                        const Index col_in_stride = 1,
-                       const OutputKernel& output_kernel = OutputKernel()) {
+                       const OutputKernel& output_kernel = OutputKernel(),
+                       Index padding_top = 0, Index padding_bottom = 0,
+                       Index padding_left = 0, Index padding_right = 0) {
   typedef typename internal::traits<Input>::Index TensorIndex;
   TensorRef<Tensor<typename internal::traits<Input>::Scalar,
                    internal::traits<Input>::NumDimensions,
@@ -1402,25 +1635,33 @@
       isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
   const TensorIndex InputCols =
       isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+  const bool padding_explicit =
+      (padding_top || padding_bottom || padding_left || padding_right);
 
   TensorIndex out_height;
   TensorIndex out_width;
   switch (padding_type) {
-    case PADDING_VALID:
-      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
+    case PADDING_VALID: {
+      const TensorIndex InputRowsEff = InputRows + padding_top + padding_bottom;
+      const TensorIndex InputColsEff = InputCols + padding_left + padding_right;
+      out_height = numext::ceil((InputRowsEff - kernelRowsEff + 1.f) /
                                 static_cast<float>(row_stride));
-      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
+      out_width = numext::ceil((InputColsEff - kernelColsEff + 1.f) /
                                static_cast<float>(col_stride));
       break;
-    case PADDING_SAME:
+    }
+    case PADDING_SAME: {
+      eigen_assert(!padding_explicit);
       out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
       out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
       break;
-    default:
+    }
+    default: {
       // Initialize unused variables to avoid a compiler warning
       out_height = 0;
       out_width = 0;
       eigen_assert(false && "unexpected padding");
+    }
   }
 
   // Molds the output of the patch extraction code into a 2d tensor:
@@ -1473,22 +1714,50 @@
     kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
     kernel_dims[1] = kernelFilters;
   }
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      kernel.reshape(kernel_dims)
-          .contract(input
-                        .extract_image_patches(
-                            kernelRows, kernelCols, row_stride, col_stride,
-                            row_in_stride, col_in_stride, padding_type)
-                        .reshape(pre_contract_dims),
-                    contract_dims, output_kernel)
-          .reshape(post_contract_dims),
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
-          .reshape(post_contract_dims));
+  if (padding_explicit) {
+    return choose(
+        Cond<internal::traits<Input>::Layout == ColMajor>(),
+        kernel.reshape(kernel_dims)
+            .contract(input
+                          .extract_image_patches(
+                              kernelRows, kernelCols, row_stride, col_stride,
+                              row_in_stride, col_in_stride,
+                              /*row_inflate_stride=*/1,
+                              /*col_inflate_stride=*/1, padding_top,
+                              padding_bottom, padding_left, padding_right,
+                              /*padding_value=*/0)
+                          .reshape(pre_contract_dims),
+                      contract_dims, output_kernel)
+            .reshape(post_contract_dims),
+        input
+            .extract_image_patches(kernelRows, kernelCols, row_stride,
+                                   col_stride, row_in_stride, col_in_stride,
+                                   /*row_inflate_stride=*/1,
+                                   /*col_inflate_stride=*/1, padding_top,
+                                   padding_bottom, padding_left, padding_right,
+                                   /*padding_value=*/0)
+            .reshape(pre_contract_dims)
+            .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+            .reshape(post_contract_dims));
+  } else {
+    return choose(
+        Cond<internal::traits<Input>::Layout == ColMajor>(),
+        kernel.reshape(kernel_dims)
+            .contract(input
+                          .extract_image_patches(
+                              kernelRows, kernelCols, row_stride, col_stride,
+                              row_in_stride, col_in_stride, padding_type)
+                          .reshape(pre_contract_dims),
+                      contract_dims, output_kernel)
+            .reshape(post_contract_dims),
+        input
+            .extract_image_patches(kernelRows, kernelCols, row_stride,
+                                   col_stride, row_in_stride, col_in_stride,
+                                   padding_type)
+            .reshape(pre_contract_dims)
+            .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+            .reshape(post_contract_dims));
+  }
 }
 
 }  // end namespace Eigen
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index f955bc7..0127b65 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -18,11 +18,57 @@
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+// Note the following header is used in both TF and TFLite. Particularly, it's
+// used for float TFLite Conv2D.
+#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"
+
 #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 
 namespace Eigen {
 namespace internal {
+
+// After we vectorized all loads from the underlying tensor using Packet ops, we
+// have to finalize coefficients that do not fit into a packet.
+template <typename Scalar, typename DataMapper, int packet_size,
+          bool masked_load_store>
+struct FinalizeDataMapperCoeffs {
+  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
+                                            const DataMapper& rhs,
+                                            Index base_idx, Index depth,
+                                            Index max_depth, bool pad = false) {
+    const Index num_coeffs = max_depth - depth;
+    eigen_assert(num_coeffs <= packet_size);
+
+    for (; depth < max_depth; ++depth) {
+      *block = pad ? Scalar(0) : rhs.coeffNoPadding(depth, base_idx);
+      ++block;
+    }
+
+    return num_coeffs;
+  }
+};
+
+template <typename Scalar, typename DataMapper, int packet_size>
+struct FinalizeDataMapperCoeffs<Scalar, DataMapper, packet_size,
+                                /*masked_load_store=*/true> {
+  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
+                                            const DataMapper& rhs,
+                                            Index base_idx, Index depth,
+                                            Index max_depth, bool pad = false) {
+    Index num_coeffs = max_depth - depth;
+    eigen_assert(num_coeffs <= packet_size);
+    if (num_coeffs == 0) return 0;
+
+    using Packet = typename packet_traits<Scalar>::type;
+    Packet p = pad ? pset1<Packet>(Scalar(0))
+                   : rhs.partialPacketNoPadding(depth, base_idx, num_coeffs);
+    internal::pstoreu(block, p, mask<Packet>(0, num_coeffs));
+
+    return num_coeffs;
+  }
+};
+
 // Pack a block of the right input matrix (in our case it's always a
 // "virtual matrix" constructed from extracted image patches) in contiguous
 // block in column-major storage order. Knowing the properties of the
@@ -56,6 +102,12 @@
   typedef SubMapper DataMapper;
   typedef typename packet_traits<Scalar>::type Packet;
 
+  using CoeffFinalizer = FinalizeDataMapperCoeffs<
+      Scalar, DataMapper, packet_size,
+      TensorEvaluatorHasPartialPacket<typename DataMapper::TensorEvaluatorT,
+                                      Packet, Index>::value &&
+          unpacket_traits<Packet>::masked_store_available>;
+
   EIGEN_DONT_INLINE
   void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
                   StorageIndex cols) {
@@ -149,12 +201,14 @@
               block += packet_size;
               k += packet_size;
             }
-            for (; d < max_depth; d++) {
-              eigen_assert(k < peeled_k);
-              *block = rhs.coeffNoPadding(d, base_idx);
-              ++block;
-              ++k;
-            }
+
+            eigen_assert(k <= peeled_k);
+            const Index num_coeffs =
+                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);
+
+            k += num_coeffs;
+            block += num_coeffs;
+            eigen_assert(k <= peeled_k);
           }
 
           // Go to the next column.
@@ -190,9 +244,9 @@
             }
 
           } else {
-            const StorageIndex max_vectorized_depth = max_depth - packet_size;
+            const StorageIndex vectorized_depth = max_depth - packet_size;
             StorageIndex d = start_depth;
-            for (; d < max_vectorized_depth; d += packet_size) {
+            for (; d <= vectorized_depth; d += packet_size) {
               eigen_assert(k < peeled_k);
               const Packet p = pad ? pset1<Packet>(Scalar(0))
                                    : rhs.packetNoPadding(d, base_idx);
@@ -200,12 +254,14 @@
               block += packet_size;
               k += packet_size;
             }
-            for (; d < max_depth; d++) {
-              eigen_assert(k < peeled_k);
-              *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
-              ++block;
-              ++k;
-            }
+
+            eigen_assert(k <= peeled_k);
+            const Index num_coeffs = CoeffFinalizer::finalize(
+                block, rhs, base_idx, d, max_depth, pad);
+
+            k += num_coeffs;
+            block += num_coeffs;
+            eigen_assert(k <= peeled_k);
           }
         }
       }
@@ -221,12 +277,7 @@
     }
   }
 };
-}  // end namespace internal
-}  // end namespace Eigen
+}  // namespace internal
+}  // namespace Eigen
 #endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
-
-// Note the following header is used in both TF and TFLite. Particularly, it's
-// used for float TFLite Conv2D.
-#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"
-
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 9aba7b6..9b215c5 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -1848,4 +1848,14 @@
            /*filter channels*/ 1024,  //
            /*filter dims*/ 3, 3,      //
            /*block*/ 56, 256);
+
+BM_PackLhs(/*input channels*/ 30,   //
+           /*filter channels*/ 64,  //
+           /*filter dims*/ 3, 3,    //
+           /*block*/ 256, 56);
+
+BM_PackLhs(/*input channels*/ 50,   //
+           /*filter channels*/ 64,  //
+           /*filter dims*/ 3, 3,    //
+           /*block*/ 56, 256);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/extract_image_patches_op.cc b/tensorflow/core/kernels/extract_image_patches_op.cc
index 7192fec..0fc1f56 100644
--- a/tensorflow/core/kernels/extract_image_patches_op.cc
+++ b/tensorflow/core/kernels/extract_image_patches_op.cc
@@ -130,7 +130,8 @@
 
 #undef REGISTER
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
diff --git a/tensorflow/core/kernels/extract_image_patches_op_gpu.cu.cc b/tensorflow/core/kernels/extract_image_patches_op_gpu.cu.cc
index 465b7ac..650c51f 100644
--- a/tensorflow/core/kernels/extract_image_patches_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/extract_image_patches_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc
index 5a3c571..01e3468 100644
--- a/tensorflow/core/kernels/fake_quant_ops.cc
+++ b/tensorflow/core/kernels/fake_quant_ops.cc
@@ -15,7 +15,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -28,7 +29,8 @@
 
 using tensorflow::BinaryElementWiseOp;
 using tensorflow::DEVICE_CPU;
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 using tensorflow::DEVICE_GPU;
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 using tensorflow::OpKernel;
@@ -143,7 +145,8 @@
     Name("FakeQuantWithMinMaxArgsGradient").Device(DEVICE_CPU),
     FakeQuantWithMinMaxArgsGradientOp<CPUDevice>);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 typedef Eigen::GpuDevice GPUDevice;
 
 // Forward declarations for functor specializations for GPU.
@@ -265,7 +268,8 @@
     Name("FakeQuantWithMinMaxVarsGradient").Device(DEVICE_CPU),
     FakeQuantWithMinMaxVarsGradientOp<CPUDevice>);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 template <>
 void FakeQuantWithMinMaxVarsFunctor<GPUDevice>::operator()(
     const GPUDevice& d, typename TTypes<float>::ConstFlat inputs,
@@ -411,7 +415,8 @@
     Name("FakeQuantWithMinMaxVarsPerChannelGradient").Device(DEVICE_CPU),
     FakeQuantWithMinMaxVarsPerChannelGradientOp<CPUDevice>);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 template <>
 void FakeQuantWithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
     const GPUDevice& d, typename TTypes<float>::ConstMatrix inputs,
diff --git a/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc b/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc
index dc258f4..b3bd440 100644
--- a/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define FAKE_QUANT_NO_DEBUG
 
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index 29b5e89..e0f326d 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -28,7 +28,8 @@
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -286,7 +287,8 @@
 
 #undef FFT_LABEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 namespace {
 template <typename T>
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index d4c9258..4e47de4 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
@@ -116,4 +117,4 @@
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/fingerprint_op.cc b/tensorflow/core/kernels/fingerprint_op.cc
new file mode 100644
index 0000000..2052932
--- /dev/null
+++ b/tensorflow/core/kernels/fingerprint_op.cc
@@ -0,0 +1,136 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstddef>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/fingerprint.h"
+
+namespace tensorflow {
+namespace {
+template <typename T>
+inline void CopyToBuffer(const T& value, uint8* output) {
+  // Memcpy to string is endian-dependent. We choose little-endian as
+  // standard. On big-endian machines, bytes should be reversed.
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  static_assert(port::kLittleEndian, "");
+  std::memcpy(output, &value, sizeof(value));
+#else
+  static_assert(!port::kLittleEndian, "");
+  std::reverse_copy(reinterpret_cast<const uint8*>(&value),
+                    reinterpret_cast<const uint8*>(&value + 1), output);
+#endif
+}
+
+void FarmhashFingerprint64(TTypes<uint8, 2>::ConstTensor input,
+                           TTypes<uint8, 2>::Matrix output) {
+  DCHECK_EQ(output.dimension(0), input.dimension(0));
+  DCHECK_EQ(output.dimension(1), sizeof(uint64));
+  for (int64 i = 0; i < output.dimension(0); ++i) {
+    const uint64 fingerprint =
+        Fingerprint64({reinterpret_cast<const char*>(&input(i, 0)),
+                       static_cast<std::size_t>(input.dimension(1))});
+    CopyToBuffer(fingerprint, &output(i, 0));
+  }
+}
+
+void FarmhashFingerprint64(TTypes<string>::ConstFlat input,
+                           TTypes<uint8, 2>::Matrix output) {
+  DCHECK_EQ(output.dimension(0), input.dimension(0));
+  DCHECK_EQ(output.dimension(1), sizeof(uint64));
+  for (int64 i = 0; i < input.dimension(0); ++i) {
+    const uint64 fingerprint =
+        Fingerprint64({input(i).data(), input(i).size()});
+    CopyToBuffer(fingerprint, &output(i, 0));
+  }
+}
+
+class FingerprintOp : public OpKernel {
+ public:
+  explicit FingerprintOp(OpKernelConstruction* context) : OpKernel(context) {
+    DataType dtype;
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype));
+    OP_REQUIRES(context, DataTypeCanUseMemcpy(dtype) || dtype == DT_STRING,
+                errors::InvalidArgument("Data type not supported: ",
+                                        DataTypeString(dtype)));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    const Tensor& method_tensor = context->input(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(method_tensor.shape()),
+                errors::InvalidArgument("`method` should be a scalar string: ",
+                                        method_tensor.shape()));
+    // For now, farmhash64 is the only function supported.
+    const string& method = method_tensor.scalar<string>()();
+    OP_REQUIRES(
+        context, method == "farmhash64",
+        errors::InvalidArgument("Unsupported fingerprint method: ", method));
+
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+        errors::InvalidArgument("`data` should have at least one dimension: ",
+                                input.shape()));
+
+    const int64 dim0 = input.shape().dim_size(0);
+    const int64 dim1 = input.shape().num_elements() / dim0;
+
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape{dim0, kFingerprintSize}, &output));
+
+    if (input.dtype() == DT_STRING) {
+      if (dim1 > 1) {
+        Tensor temp;
+        OP_REQUIRES_OK(context, context->allocate_temp(
+                                    DT_UINT8,
+                                    TensorShape{input.shape().num_elements(),
+                                                kFingerprintSize},
+                                    &temp));
+        // `temp` is a matrix of shape {input.num_elements, fingerprint_size},
+        // and each row contains the fingerprint value of corresponding string.
+        // To compute fingerprints of multiple strings, this op fingerprints the
+        // buffer containing the string fingerprints.
+        FarmhashFingerprint64(input.flat<string>(), temp.tensor<uint8, 2>());
+        FarmhashFingerprint64(static_cast<const Tensor&>(temp).shaped<uint8, 2>(
+                                  {dim0, dim1 * kFingerprintSize}),
+                              output->matrix<uint8>());
+      } else {
+        // In case dim1 == 1, each string computes into its own fingerprint
+        // value. There is no need to fingerprint twice.
+        FarmhashFingerprint64(input.flat<string>(), output->matrix<uint8>());
+      }
+    } else {
+      auto data = input.bit_casted_shaped<uint8, 2>(
+          {dim0, dim1 * DataTypeSize(input.dtype())});
+      FarmhashFingerprint64(data, output->matrix<uint8>());
+    }
+  }
+
+ private:
+  static constexpr int kFingerprintSize = sizeof(uint64);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Fingerprint").Device(tensorflow::DEVICE_CPU),
+                        FingerprintOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fingerprint_op_test.cc b/tensorflow/core/kernels/fingerprint_op_test.cc
new file mode 100644
index 0000000..febfafb
--- /dev/null
+++ b/tensorflow/core/kernels/fingerprint_op_test.cc
@@ -0,0 +1,242 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+Status MakeNodeDef(DataType dtype, NodeDef* node_def) {
+  // Input 0 is the data tensor of `dtype`; input 1 is a string tensor.
+  NodeDefBuilder builder("fingerprint", "Fingerprint");
+  builder.Input(FakeInput(dtype)).Input(FakeInput(DT_STRING));
+  return builder.Finalize(node_def);
+}
+
+class FingerprintOpTest : public OpsTestBase {
+ protected:
+  // Builds the op for `tensor` using the "farmhash64" method.
+  Status MakeFingerprintOp(Tensor* tensor) {
+    return MakeFingerprintOp(tensor, "farmhash64");
+  }
+
+  // Initializes the kernel and registers `data` and `method` as its inputs.
+  Status MakeFingerprintOp(Tensor* data, const string& method) {
+    TF_RETURN_IF_ERROR(MakeNodeDef(data->dtype(), node_def()));
+    TF_RETURN_IF_ERROR(InitOp());
+    method_ = Tensor(DT_STRING, TensorShape{});
+    method_.scalar<string>()() = method;
+    inputs_.clear();
+    inputs_.push_back(data);
+    inputs_.push_back(&method_);
+    return Status::OK();
+  }
+
+  Tensor batch_dims_;
+  Tensor method_;
+};
+
+// Golden test: detects any change in the farmhash64 fingerprint method.
+TEST_F(FingerprintOpTest, GoldenValue) {
+  Tensor input(DT_UINT8, {1, 3, 4, 5, 6, 7});
+  auto bytes = input.flat<uint8>();
+  uint8* const first = bytes.data();
+  std::iota(first, first + bytes.size(), static_cast<uint8>(47));
+  TF_ASSERT_OK(MakeFingerprintOp(&input));
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& fingerprint = *GetOutput(0);
+  EXPECT_EQ(fingerprint.shape(), (TensorShape{1, 8}));
+  EXPECT_EQ(fingerprint.tensor_data(), "\x2d\x90\xdf\x03\x79\x36\x3c\x43");
+}
+
+// String types have a different compute path. This test detects changes in this
+// special-case handling.
+TEST_F(FingerprintOpTest, StringGoldenValue) {
+  Tensor data(DT_STRING, {1, 2, 2});  // One batch item holding four strings.
+  auto buffer = data.flat<string>();
+  buffer(0).resize(10);
+  buffer(1).resize(7);
+  buffer(2).resize(0);  // Empty string; the iota call on it below is a no-op.
+  buffer(3).resize(19);
+  std::iota(buffer(0).begin(), buffer(0).end(), 0);
+  std::iota(buffer(1).begin(), buffer(1).end(), 7);
+  std::iota(buffer(2).begin(), buffer(2).end(), 71);
+  std::iota(buffer(3).begin(), buffer(3).end(), 41);
+
+  TF_ASSERT_OK(MakeFingerprintOp(&data));
+  TF_ASSERT_OK(RunOpKernel());
+  ASSERT_EQ(GetOutput(0)->shape(), (TensorShape{1, 8}));
+  EXPECT_EQ(GetOutput(0)->tensor_data(), "\x92\x43\x28\x52\xa3\x7c\x48\x18");
+
+  // Reshape to four batch items of one string each. In that case the op avoids
+  // double-fingerprinting; this detects any change in that logic.
+  ASSERT_TRUE(data.CopyFrom(data, TensorShape{4}));
+  TF_ASSERT_OK(MakeFingerprintOp(&data));
+  TF_ASSERT_OK(RunOpKernel());
+  ASSERT_EQ(GetOutput(0)->shape(), (TensorShape{4, 8}));
+  EXPECT_EQ(GetOutput(0)->tensor_data(),
+            "\xea\xff\xd6\xb2\xb2\x4d\x70\x9b"
+            "\x6e\x9d\xed\x21\xc6\x4a\x61\x52"
+            "\x4f\x40\x90\x2f\x3b\x6a\xe1\x9a"
+            "\x0d\x9b\x7f\x63\x23\x14\x1c\xb8");
+}
+
+// Changing one input byte must change the fingerprint, for each real dtype.
+TEST_F(FingerprintOpTest, Collision) {
+  const TensorShape shape = {1, 2, 4, 6};
+  for (DataType dtype : kRealNumberTypes) {
+    Tensor input(dtype, shape);
+    const int64 num_bytes = shape.num_elements() * DataTypeSize(dtype);
+    auto bytes = input.bit_casted_shaped<uint8, 1>({num_bytes});
+    bytes.setRandom();
+
+    TF_ASSERT_OK(MakeFingerprintOp(&input));
+    TF_ASSERT_OK(RunOpKernel());
+    const Tensor before = *GetOutput(0);
+
+    // Invert every bit of one byte, at a position derived from the data.
+    const int victim = bytes(0) % bytes.size();
+    bytes(victim) = ~bytes(victim);
+
+    TF_ASSERT_OK(MakeFingerprintOp(&input));
+    TF_ASSERT_OK(RunOpKernel());
+    const Tensor after = *GetOutput(0);
+
+    EXPECT_NE(before.tensor_data(), after.tensor_data());
+  }
+}
+
+// Changing one byte of a single string input must change its fingerprint.
+TEST_F(FingerprintOpTest, CollisionString) {
+  constexpr int64 kNumBytes = 256;
+  Tensor input(DT_STRING, {1});
+  auto& text = input.vec<string>()(0);
+  text.resize(kNumBytes);
+
+  // View the string's characters as a flat uint8 buffer and randomize it.
+  uint8* const raw = reinterpret_cast<uint8*>(&*text.begin());
+  TTypes<uint8>::UnalignedFlat bytes(raw, text.size());
+  bytes.setRandom();
+
+  TF_ASSERT_OK(MakeFingerprintOp(&input));
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor before = *GetOutput(0);
+
+  // Invert every bit of one byte, at a position derived from the data.
+  const int victim = bytes(0) % bytes.size();
+  bytes(victim) = ~bytes(victim);
+
+  TF_ASSERT_OK(MakeFingerprintOp(&input));
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor after = *GetOutput(0);
+  EXPECT_NE(before.tensor_data(), after.tensor_data());
+}
+
+// A row of float values and a string holding the same bytes must fingerprint
+// identically.
+TEST_F(FingerprintOpTest, CompareBytesAndString) {
+  Tensor floats_tensor(DT_FLOAT, {4, 64});
+  auto floats = floats_tensor.matrix<float>();
+  floats.setRandom();
+
+  // Copy the raw bytes of each float row into the matching string element.
+  Tensor strings_tensor(DT_STRING, {4});
+  auto strings = strings_tensor.vec<string>();
+  const size_t row_bytes = floats.dimension(1) * sizeof(floats(0, 0));
+  for (int64 row = 0; row < strings.size(); ++row) {
+    const char* row_start = reinterpret_cast<const char*>(&floats(row, 0));
+    strings(row).assign(row_start, row_bytes);
+  }
+
+  TF_ASSERT_OK(MakeFingerprintOp(&floats_tensor));
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor from_floats = *GetOutput(0);
+  TF_ASSERT_OK(MakeFingerprintOp(&strings_tensor));
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor from_strings = *GetOutput(0);
+  EXPECT_EQ(from_floats.tensor_data(), from_strings.tensor_data());
+}
+
+// Running with an unknown method must fail with an error naming that method.
+TEST_F(FingerprintOpTest, SupportedMethods) {
+  Tensor input(DT_STRING, TensorShape{1});
+  TF_ASSERT_OK(MakeFingerprintOp(&input, "unsupported_method"));
+  const Status result = RunOpKernel();
+  EXPECT_FALSE(result.ok());
+  EXPECT_NE(result.error_message().find("unsupported_method"), string::npos);
+}
+
+TEST_F(FingerprintOpTest, SupportedTypes) {
+  Tensor resource_tensor(DT_RESOURCE, TensorShape{1});  // Must be rejected.
+  EXPECT_FALSE(MakeFingerprintOp(&resource_tensor).ok());
+}
+
+// With a statically known "farmhash64" method, the last output dim is 8.
+TEST(FingerprintOpShapeFnTest, MethodKnownStatically) {
+  ShapeInferenceTestOp op("Fingerprint");
+  TF_ASSERT_OK(MakeNodeDef(DT_UINT8, &op.node_def));
+
+  Tensor method(DT_STRING, TensorShape{});
+  method.scalar<string>()() = "farmhash64";
+  op.input_tensors.assign({nullptr, &method});
+  INFER_OK(op, "?;?", "[?,8]");
+  INFER_ERROR("must be at least rank 1", op, "[];?");
+  INFER_OK(op, "[?];?", "[d0_0,8]");
+  INFER_OK(op, "[1,?];?", "[d0_0,8]");
+  INFER_OK(op, "[?,2,3];?", "[d0_0,8]");
+}
+
+TEST(FingerprintOpShapeFnTest, MethodUnknownStatically) {
+  ShapeInferenceTestOp op("Fingerprint");
+  // No `method` tensor is supplied, so the last output dim stays unknown.
+  TF_ASSERT_OK(MakeNodeDef(DT_FLOAT, &op.node_def));
+  INFER_OK(op, "?;?", "[?,?]");
+  INFER_ERROR("must be at least rank 1", op, "[];?");
+  INFER_OK(op, "[?];?", "[d0_0,?]");
+  INFER_OK(op, "[1,?];?", "[d0_0,?]");
+  INFER_OK(op, "[?,2,3];?", "[d0_0,?]");
+}
+
+TEST(FingerprintOpShapeFnTest, InvalidMethod) {
+  ShapeInferenceTestOp op("Fingerprint");
+
+  // A rank-1 `method` shape that is known statically must be rejected.
+  INFER_ERROR("must be rank 0", op, "[1];[1]");
+
+  // A rank-1 `method` supplied only as a tensor value must be rejected too.
+  Tensor method(DT_STRING, TensorShape{1});
+  method.vec<string>()(0) = "farmhash64";
+  op.input_tensors.assign({nullptr, &method});
+  INFER_ERROR("must be rank 0", op, "?;?");
+  // A scalar `method` holding an unrecognized name must also be rejected.
+  method = Tensor(DT_STRING, TensorShape{});
+  method.scalar<string>()() = "unsupported_method";
+  op.input_tensors.assign({nullptr, &method});
+  INFER_ERROR("unsupported_method", op, "?;?");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 88a8a52..37f2bd3 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -13,11 +13,11 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/function_ops.h"
+
 #include <deque>
 #include <vector>
 
-#include "tensorflow/core/kernels/function_ops.h"
-
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -29,6 +29,7 @@
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -249,7 +250,6 @@
         ctx, lib->Instantiate(kGradientOp, AttrSlice(def()), &handle), done);
 
     FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
     opts.rendezvous = ctx->rendezvous();
     opts.cancellation_manager = ctx->cancellation_manager();
     opts.runner = ctx->runner();
@@ -329,8 +329,12 @@
       handle = cached_entry->second;
     } else {
       VLOG(1) << "Instantiating " << func_.name() << " on " << target_device;
-      tracing::ScopedActivity activity(strings::StrCat(
-          "RemoteCall: Instantiate: ", func_.name(), " on ", target_device));
+      profiler::TraceMe activity(
+          [&] {
+            return strings::StrCat("RemoteCall: Instantiate: ", func_.name(),
+                                   " on ", target_device);
+          },
+          profiler::TraceMeLevel::kInfo);
       OP_REQUIRES_OK_ASYNC(
           ctx,
           lib->Instantiate(func_.name(), AttrSlice(&attr_values),
@@ -347,7 +351,6 @@
   OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
 
   FunctionLibraryRuntime::Options opts;
-  opts.step_id = ctx->step_id();
   opts.runner = ctx->runner();
   opts.source_device = source_device;
   if (opts.source_device != target_device) {
@@ -374,8 +377,12 @@
     opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
   }
   auto* rets = new std::vector<Tensor>;
-  auto* activity = new tracing::ScopedActivity(strings::StrCat(
-      "RemoteCall: Run: ", func_.name(), " on ", target_device));
+  auto* activity = new profiler::TraceMe(
+      [&] {
+        return strings::StrCat("RemoteCall: Run: ", func_.name(), " on ",
+                               target_device);
+      },
+      profiler::TraceMeLevel::kInfo);
   VLOG(1) << "Running " << func_.name() << " on " << target_device
           << " with handle: " << handle;
   lib->Run(opts, handle, args, rets,
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 246a6ce..8792387 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -114,7 +114,6 @@
 
 void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
                    bool always_collect_stats) {
-  opts->step_id = ctx->step_id();
   opts->rendezvous = ctx->rendezvous();
   opts->cancellation_manager = ctx->cancellation_manager();
   if (always_collect_stats) {
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 48b3395..40a58de 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -253,13 +253,22 @@
     const int64 channels = GetTensorDim(x, tensor_format, 'C');
     const int64 height = GetTensorDim(x, tensor_format, 'H');
     const int64 width = GetTensorDim(x, tensor_format, 'W');
+
+    // If input tensor is in NHWC format, and we are running in inference mode,
+    // there is no need to convert to NCHW format, performance is the same.
+    // However in training mode, performance in NCHW format is much better.
+    TensorFormat compute_format = !is_training && tensor_format == FORMAT_NHWC
+                                      ? FORMAT_NHWC
+                                      : FORMAT_NCHW;
+
     VLOG(2) << "FusedBatchNorm:"
             << " batch_size: " << batch_size << " channels: " << channels
             << " height: " << height << " width:" << width
             << " x shape: " << x.shape().DebugString()
             << " scale shape: " << scale.shape().DebugString()
             << " offset shape: " << offset.shape().DebugString()
-            << " tensor format: " << tensor_format;
+            << " tensor format: " << ToString(tensor_format)
+            << " compute format: " << ToString(compute_format);
 
     // If input is empty, return NaN mean/variance
     if (x.shape().num_elements() == 0) {
@@ -274,12 +283,12 @@
     Tensor y_transformed;
     se::DeviceMemory<T> y_ptr;
 
-    if (tensor_format == FORMAT_NCHW) {
+    if (tensor_format == compute_format) {
       y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y);
-    } else if (tensor_format == FORMAT_NHWC) {
+    } else if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) {
       OP_REQUIRES_OK(context, context->allocate_temp(
                                   DataTypeToEnum<T>::value,
-                                  ShapeFromFormat(FORMAT_NCHW, batch_size,
+                                  ShapeFromFormat(compute_format, batch_size,
                                                   height, width, channels),
                                   &x_transformed));
       functor::NHWCToNCHW<GPUDevice, T, 4>()(
@@ -290,22 +299,27 @@
 
       OP_REQUIRES_OK(context, context->allocate_temp(
                                   DataTypeToEnum<T>::value,
-                                  ShapeFromFormat(FORMAT_NCHW, batch_size,
+                                  ShapeFromFormat(compute_format, batch_size,
                                                   height, width, channels),
                                   &y_transformed));
       y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(y_transformed);
     } else {
-      context->SetStatus(
-          errors::Internal("Unsupported tensor format: ", tensor_format));
+      context->SetStatus(errors::Internal(
+          "Unsupported tensor format: ", ToString(tensor_format),
+          " and compute format: ", ToString(compute_format)));
       return;
     }
 
+    const se::dnn::DataLayout data_layout =
+        compute_format == FORMAT_NHWC ? se::dnn::DataLayout::kBatchYXDepth
+                                      : se::dnn::DataLayout::kBatchDepthYX;
+
     se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(data_layout);
 
     se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
@@ -371,7 +385,8 @@
           errors::Internal("cuDNN launch failure : input shape (",
                            x.shape().DebugString(), ")"));
     }
-    if (tensor_format == FORMAT_NHWC) {
+
+    if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) {
       functor::NCHWToNHWC<GPUDevice, T, 4>()(
           context->eigen_device<GPUDevice>(),
           const_cast<const Tensor&>(y_transformed).tensor<T, 4>(),
diff --git a/tensorflow/core/kernels/fused_batch_norm_op_test.cc b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
index a3f760b..1b348a6 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
@@ -14,6 +14,8 @@
 ==============================================================================*/
 
 #include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -21,10 +23,12 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 class FusedBatchNormOpTest : public OpsTestBase {};
@@ -124,4 +128,82 @@
   test::FillValues<float>(&expected_offset, {27, 27});
   test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01);
 }
+
+//----------------------------------------------------------------------------//
+// Performance benchmarks are below.                                          //
+//----------------------------------------------------------------------------//
+
+using fp32 = float;
+using fp16 = Eigen::half;
+
+// Builds a graph holding one FusedBatchNormV2 node fed by random constants.
+template <typename T>
+static Graph* FusedBatchNormInference(int n, int h, int w, int c,
+                                      bool is_training,
+                                      TensorFormat data_format) {
+  const DataType dtype = DataTypeToEnum<T>::value;
+  const TensorShape x_shape = data_format == FORMAT_NHWC
+                                  ? TensorShape({n, h, w, c})
+                                  : TensorShape({n, c, h, w});
+  Tensor x_t(dtype, x_shape);
+  x_t.flat<T>().setRandom();
+  Tensor other_t(DT_FLOAT, TensorShape({c}));
+  other_t.flat<float>().setRandom();
+  Tensor empty_t(DT_FLOAT, TensorShape({0}));
+
+  Graph* g = new Graph(OpRegistry::Global());
+  Node* x = test::graph::Constant(g, x_t, "x");
+  Node* other = test::graph::Constant(g, other_t, "other");
+  Node* empty = test::graph::Constant(g, empty_t, "empty");
+  // Training mode feeds empty mean/variance; inference reuses `other`.
+  Node* mean_and_variance = is_training ? empty : other;
+  Node* fused_batch_norm;
+  TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV2")
+                  .Input(x)
+                  .Input(other)              // scale
+                  .Input(other)              // offset
+                  .Input(mean_and_variance)  // mean
+                  .Input(mean_and_variance)  // variance
+                  .Attr("T", dtype)
+                  .Attr("U", DT_FLOAT)
+                  .Attr("epsilon", 0.001)
+                  .Attr("is_training", is_training)
+                  .Attr("data_format", ToString(data_format))
+                  .Finalize(g, &fused_batch_norm));
+  return g;
+}
+
+#define BM_NAME(N, H, W, C, T, IT, FORMAT, DEVICE) \
+  BM_FusedBatchNorm##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE
+
+#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)          \
+  static void BM_NAME(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(int iters) { \
+    testing::UseRealTime();                                                    \
+    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);        \
+    test::Benchmark(#DEVICE, FusedBatchNormInference<T>(                       \
+                                 N, H, W, C, IS_TRAINING, FORMAT_##FORMAT))    \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(BM_NAME(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
+
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
+
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu);
+
+#ifdef GOOGLE_CUDA
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu);
+
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu);
+
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu);
+
+BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu);
+BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index b5b6f14..0b82b72 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -71,6 +71,7 @@
 //
 // Same for the GPU kernel.
 TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_ND_CPU);
 
 #undef REGISTER_GATHER_ND_CPU
 
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
index cf9817d..c3d2f70 100644
--- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -152,6 +152,7 @@
   REGISTER_GATHER_ND_FULL(type, int64)
 
 TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_ND_CPU);
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/gather_nd_op_test.cc b/tensorflow/core/kernels/gather_nd_op_test.cc
index 9f8658e..b0b5c95 100644
--- a/tensorflow/core/kernels/gather_nd_op_test.cc
+++ b/tensorflow/core/kernels/gather_nd_op_test.cc
@@ -57,9 +57,9 @@
 
 class GatherNdOpTest : public OpsTestBase {
  protected:
-  void MakeOp(DataType index_type) {
+  void MakeOp(DataType param_type, DataType index_type) {
     TF_ASSERT_OK(NodeDefBuilder("myop", "GatherNd")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(param_type))
                      .Input(FakeInput(index_type))
                      .Finalize(node_def()));
     TF_ASSERT_OK(InitOp());
@@ -67,7 +67,7 @@
 };
 
 TEST_F(GatherNdOpTest, Simple) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 8, 4});
@@ -80,6 +80,32 @@
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(GatherNdOpTest, Quantized_UINT8) {
+  MakeOp(DT_QUINT8, DT_INT32);
+
+  // Feed params {0, 1, 2, 8, 4} and indices {3} and {4}, then run.
+  AddInputFromArray<quint8>(TensorShape({5}), {0, 1, 2, 8, 4});
+  AddInputFromArray<int32>(TensorShape({2, 1}), {3, 4});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output: params[3] and params[4].
+  Tensor expected(allocator(), DT_QUINT8, TensorShape({2}));
+  test::FillValues<quint8>(&expected, {8, 4});
+  test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherNdOpTest, Quantized_INT8) {
+  MakeOp(DT_QINT8, DT_INT32);
+
+  AddInputFromArray<qint8>(TensorShape({5}), {0, 1, 2, 8, 4});  // params
+  AddInputFromArray<int32>(TensorShape({2, 1}), {3, 4});  // indices
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_QINT8, TensorShape({2}));
+  test::FillValues<qint8>(&expected, {8, 4});  // params[3], params[4]
+  test::ExpectTensorEqual<qint8>(expected, *GetOutput(0));
+}
+
 constexpr int kLookups = 2000;
 
 template <typename Index>
diff --git a/tensorflow/core/kernels/gpu_device_array.h b/tensorflow/core/kernels/gpu_device_array.h
index 3961cee..51eb8bb 100644
--- a/tensorflow/core/kernels/gpu_device_array.h
+++ b/tensorflow/core/kernels/gpu_device_array.h
@@ -15,7 +15,8 @@
 #ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 #define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/core/kernels/gpu_device_array_gpu.h b/tensorflow/core/kernels/gpu_device_array_gpu.h
index ca2051c..3d81712 100644
--- a/tensorflow/core/kernels/gpu_device_array_gpu.h
+++ b/tensorflow/core/kernels/gpu_device_array_gpu.h
@@ -18,7 +18,8 @@
 #ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/gpu_utils.cc b/tensorflow/core/kernels/gpu_utils.cc
index 305dac4..a6a1334 100644
--- a/tensorflow/core/kernels/gpu_utils.cc
+++ b/tensorflow/core/kernels/gpu_utils.cc
@@ -70,6 +70,8 @@
     *instr.mutable_output() = output_desc.ToProto(element_type);
     *instr.mutable_conv_desc() = conv_desc.ToProto();
     log.mutable_instr()->PackFrom(std::move(instr));
+    instr.set_conv_scale(1);
+    instr.set_side_value_scale(0);
   }
   *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
   *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
@@ -80,9 +82,8 @@
   Logger::Singleton()->LogProto(log);
 }
 
-void LogFusedConvAutotuneResults(
-    se::dnn::ConvolutionKind kind, se::dnn::DataType element_type,
-    const se::dnn::BatchDescriptor& input_desc,
+void LogFusedConvForwardAutotuneResults(
+    se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc,
     const se::dnn::FilterDescriptor& filter_desc,
     const se::dnn::BatchDescriptor& output_desc,
     const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
@@ -91,7 +92,7 @@
   AutotuningLog log;
   {
     ConvolutionProto instr;
-    instr.set_kind(kind);
+    instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION);
     *instr.mutable_input() = input_desc.ToProto(element_type);
     *instr.mutable_filter() = filter_desc.ToProto(element_type);
     *instr.mutable_output() = output_desc.ToProto(element_type);
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index 7455c21..14cd639 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -96,7 +96,28 @@
     }
     if (new_score >= min_score_threshold_) {
       VLOG(1) << GetActionSummary("accepts", params, config);
+    } else if (autotune_global_count_ >= max_autotune_global_count_) {
+      // The autotuning exceeds the max iteration threshold, so we accept the
+      // winner if it exists in the map; otherwise we accept the current
+      // config as the winner.
+      auto winner = params_config_map_.find(params);
+      if (winner == params_config_map_.end()) {
+        VLOG(1) << GetActionSummary("creates", params, config);
+        for (int i = 0; i < min_score_threshold_; ++i) {
+          VLOG(1) << GetActionSummary("promotes", params, config);
+        }
+        params_config_map_.insert(
+            std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
+      } else {
+        int promotes_times = min_score_threshold_ - winner->second.score;
+        for (int i = 0; i < promotes_times; ++i) {
+          VLOG(1) << GetActionSummary("promotes", params, config);
+        }
+        winner->second.score = min_score_threshold_;
+      }
+      VLOG(1) << GetActionSummary("accepts", params, config);
     }
+    autotune_global_count_++;
   }
 
  private:
@@ -115,6 +136,8 @@
     min_score_threshold_ = std::max(min_score_threshold_, 1);
     max_autotune_count_ = std::max(
         5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
+    max_autotune_global_count_ = 2 * max_autotune_count_;
+    autotune_global_count_ = 0;
   }
 
   template <class Group, class Params, class Cfg>
@@ -144,6 +167,8 @@
   string name_;
   int32 min_score_threshold_;
   int32 max_autotune_count_;
+  int32 max_autotune_global_count_;
+  int32 autotune_global_count_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
 };
@@ -173,9 +198,8 @@
                             absl::Span<const AutotuneResult> results);
 
 // Logs fused convolution results to customized back-storage.
-void LogFusedConvAutotuneResults(
-    se::dnn::ConvolutionKind kind, se::dnn::DataType element_type,
-    const se::dnn::BatchDescriptor& input_desc,
+void LogFusedConvForwardAutotuneResults(
+    se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc,
     const se::dnn::FilterDescriptor& filter_desc,
     const se::dnn::BatchDescriptor& output_desc,
     const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index 5410997..cf63a97 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -112,7 +112,8 @@
 
 #undef REGISTER_GPU_KERNEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // A special GPU kernel for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
index 692f916..11ecf7d 100644
--- a/tensorflow/core/kernels/linalg_ops_common.h
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -113,6 +113,8 @@
       Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
   using ConstMatrixMap = Eigen::Map<const Matrix>;
   using MatrixMap = Eigen::Map<Matrix>;
+  using ConstVectorMap =
+      Eigen::Map<const Eigen::Matrix<Scalar, 1, Eigen::Dynamic>>;
   using ConstMatrixMaps = gtl::InlinedVector<ConstMatrixMap, 4>;
   using MatrixMaps = gtl::InlinedVector<MatrixMap, 4>;
   using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
@@ -180,6 +182,7 @@
   using MatrixMaps = typename Base::MatrixMaps;               \
   using ConstMatrixMap = typename Base::ConstMatrixMap;       \
   using ConstMatrixMaps = typename Base::ConstMatrixMaps;     \
+  using ConstVectorMap = typename Base::ConstVectorMap;       \
   using TensorShapes = typename Base::TensorShapes;
 
 #define REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar) \
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index e611ae2..c0ec46a 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,16 +13,13 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/logging_ops.h"
-
 #include <iostream>
 
 #include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
+#include "tensorflow/core/framework/logging.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 
@@ -44,29 +41,13 @@
   mutex_lock l(*file_mutex);
   std::unique_ptr<WritableFile> file;
   TF_RETURN_IF_ERROR(env->NewAppendableFile(fname, &file));
-  Status a = file->Append(absl::StrCat(data, "\n"));
+  Status a = file->Append(data);
   Status c = file->Close();
   return a.ok() ? c : a;
 }
 
 }  // namespace
 
-namespace logging {
-
-typedef std::vector<void (*)(const char*)> Listeners;
-
-Listeners* GetListeners() {
-  static Listeners* listeners = new Listeners;
-  return listeners;
-}
-
-bool RegisterListener(void (*listener)(const char*)) {
-  GetListeners()->push_back(listener);
-  return true;
-}
-
-}  // end namespace logging
-
 class AssertOp : public OpKernel {
  public:
   explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -148,6 +129,7 @@
  public:
   explicit PrintV2Op(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_stream", &output_stream_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("end", &end_));
 
     SetFilePathIfAny();
     if (!file_path_.empty()) return;
@@ -171,26 +153,29 @@
     OP_REQUIRES_OK(ctx, ctx->input("input", &input_));
     const string& msg = input_->scalar<string>()();
 
+    string ended_msg = strings::StrCat(msg, end_);
+
     if (!file_path_.empty()) {
       // Outputs to a file at the specified path.
-      OP_REQUIRES_OK(ctx, AppendStringToFile(file_path_, msg, ctx->env()));
+      OP_REQUIRES_OK(ctx,
+                     AppendStringToFile(file_path_, ended_msg, ctx->env()));
       return;
     }
-    auto listeners = logging::GetListeners();
-    if (!listeners->empty()) {
-      for (auto& listener : *listeners) {
-        listener(msg.c_str());
-      }
-    } else if (output_stream_ == "stdout") {
-      std::cout << msg << std::endl;
+
+    if (logging::LogToListeners(ended_msg, "")) {
+      return;
+    }
+
+    if (output_stream_ == "stdout") {
+      std::cout << ended_msg << std::flush;
     } else if (output_stream_ == "stderr") {
-      std::cerr << msg << std::endl;
+      std::cerr << ended_msg << std::flush;
     } else if (output_stream_ == "log(info)") {
-      LOG(INFO) << msg << std::endl;
+      LOG(INFO) << ended_msg << std::flush;
     } else if (output_stream_ == "log(warning)") {
-      LOG(WARNING) << msg << std::endl;
+      LOG(WARNING) << ended_msg << std::flush;
     } else if (output_stream_ == "log(error)") {
-      LOG(ERROR) << msg << std::endl;
+      LOG(ERROR) << ended_msg << std::flush;
     } else {
       string error_msg = strings::StrCat(
           "Unknown output stream: ", output_stream_, ", Valid streams are:");
@@ -206,6 +191,7 @@
                                           "log(warning)", "log(error)"};
 
  private:
+  string end_;
   // Either output_stream_ or file_path_ (but not both) will be non-empty.
   string output_stream_;
   string file_path_;
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index b046401..28a3d94 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -57,19 +57,21 @@
                                       use_node_name_sharing_));
     }
 
-    auto creator = [ctx, this](lookup::LookupInterface** ret) {
-      lookup::LookupInterface* container = new Container(ctx, this);
-      if (!ctx->status().ok()) {
-        container->Unref();
-        return ctx->status();
-      }
-      if (ctx->track_allocations()) {
-        ctx->record_persistent_memory_allocation(
-            container->MemoryUsed() + table_handle_.AllocatedBytes());
-      }
-      *ret = container;
-      return Status::OK();
-    };
+    auto creator =
+        [ctx, this](lookup::LookupInterface** ret)
+            EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+              lookup::LookupInterface* container = new Container(ctx, this);
+              if (!ctx->status().ok()) {
+                container->Unref();
+                return ctx->status();
+              }
+              if (ctx->track_allocations()) {
+                ctx->record_persistent_memory_allocation(
+                    container->MemoryUsed() + table_handle_.AllocatedBytes());
+              }
+              *ret = container;
+              return Status::OK();
+            };
 
     lookup::LookupInterface* table = nullptr;
     OP_REQUIRES_OK(ctx,
diff --git a/tensorflow/core/kernels/matrix_diag_op.cc b/tensorflow/core/kernels/matrix_diag_op.cc
index 75c49ba..7779525 100644
--- a/tensorflow/core/kernels/matrix_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_diag_op.cc
@@ -62,8 +62,8 @@
     for (int i = 0; i < rank - 2; ++i) {
       output_shape.AddDim(input_shape.dim_size(i));
     }
-    const int64 min_dim = std::min(input_shape.dim_size(rank - 2),
-                                   input_shape.dim_size(rank - 1));
+    const Eigen::Index min_dim = std::min(input_shape.dim_size(rank - 2),
+                                          input_shape.dim_size(rank - 1));
     output_shape.AddDim(min_dim);
 
     Tensor* output = nullptr;
@@ -97,7 +97,7 @@
                     "input must be at least 1-dim, received shape: ",
                     input.shape().DebugString()));
 
-    const int64 k = input_shape.dim_size(rank - 1);
+    const Eigen::Index k = input_shape.dim_size(rank - 1);
     auto input_reshaped = input.flat_inner_dims<T, 2>();
 
     TensorShape output_shape = input_shape;
@@ -147,8 +147,8 @@
                       typename TTypes<T, 2>::ConstTensor input,
                       typename TTypes<T, 3>::Tensor output) {
     output.device(d) = output.constant(T());
-    for (int64 r = 0; r < output.dimension(0); ++r) {
-      for (int64 d = 0; d < output.dimension(1); ++d) {
+    for (Eigen::Index r = 0; r < output.dimension(0); ++r) {
+      for (Eigen::Index d = 0; d < output.dimension(1); ++d) {
         output(r, d, d) = input(r, d);
       }
     }
@@ -160,8 +160,8 @@
   static void Compute(const CPUDevice& d,
                       typename TTypes<T, 3>::ConstTensor input,
                       typename TTypes<T, 2>::Tensor output) {
-    for (int64 r = 0; r < output.dimension(0); ++r) {
-      for (int64 d = 0; d < output.dimension(1); ++d) {
+    for (Eigen::Index r = 0; r < output.dimension(0); ++r) {
+      for (Eigen::Index d = 0; d < output.dimension(1); ++d) {
         output(r, d) = input(r, d, d);
       }
     }
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 502d593..78b1df2 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -121,16 +121,17 @@
     if (input.data() != output.data()) {
       output.device(device) = input;
     }
-    auto compute_shard = [&output, &diag](int64 begin, int64 end) {
-      for (int64 batch = begin; batch < end; ++batch) {
-        for (int64 col = 0; col < diag.dimension(1); ++col) {
+    auto compute_shard = [&output, &diag](Eigen::Index begin,
+                                          Eigen::Index end) {
+      for (Eigen::Index batch = begin; batch < end; ++batch) {
+        for (Eigen::Index col = 0; col < diag.dimension(1); ++col) {
           output(batch, col, col) = diag(batch, col);
         }
       }
     };
     auto thread_pool =
         context->device()->tensorflow_cpu_worker_threads()->workers;
-    int64 cost_per_batch = 10 * output.dimension(1);  // Heuristic.
+    Eigen::Index cost_per_batch = 10 * output.dimension(1);  // Heuristic.
     thread_pool->ParallelFor(output.dimension(0), cost_per_batch,
                              std::move(compute_shard));
   }
diff --git a/tensorflow/core/kernels/matrix_solve_op.cc b/tensorflow/core/kernels/matrix_solve_op.cc
index f3919a1..3a75054 100644
--- a/tensorflow/core/kernels/matrix_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_solve_op.cc
@@ -76,7 +76,7 @@
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& matrix = inputs[0];
     const ConstMatrixMap& rhs = inputs[1];
-    if (matrix.rows() == 0 || rhs.cols() == 0) {
+    if (matrix.rows() == 0 || matrix.cols() == 0 || rhs.cols() == 0) {
       // To be consistent with the MatrixInverse op, we define the solution for
       // an empty set of equation as the empty matrix.
       return;
@@ -162,7 +162,7 @@
 
     // To be consistent with the MatrixInverse op, we define the solution for
     // an empty set of equations as the empty matrix.
-    if (rhs.NumElements() == 0) {
+    if (input.NumElements() == 0 || rhs.NumElements() == 0) {
       done();
       return;
     }
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 3fc683b..566ab79 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -90,7 +90,7 @@
     MklDnnShape output_mkl_shape;
     Tensor* dst_tensor = nullptr;
 
-    T sum = 0;
+    T sum = static_cast<T>(0);
     for (int src_idx = 0; src_idx < num_inputs; ++src_idx) {
       const Tensor& src_tensor = MklGetInput(ctx, src_idx);
       T* src_i = const_cast<T*>(src_tensor.flat<T>().data());
@@ -249,6 +249,7 @@
                           MklAddNOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU);
+TF_CALL_bfloat16(REGISTER_MKL_CPU);
 #undef REGISTER_MKL_CPU
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index fa33818..fcc7248 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -919,10 +919,6 @@
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
 
-      // Data from persistent (cached) filter tensor
-      const Tensor& cached_filter_data_tensor =
-          *cached_filter_data_ptensor_.AccessTensor(context);
-
       MklDnnShape src_mkl_shape, filter_mkl_shape;
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
@@ -1187,8 +1183,8 @@
     // Similarly, if the data format is NCHW, indices 0, 1, 2 and 3 of
     // paddings(_tf) will be zero.
     // i.e. for the above example, paddings = {0, 0, 0, 0, 1, 2, 3, 4}.
-    int64 pad_top, pad_left;
-    int64 pad_bottom, pad_right;
+    int64 pad_top = 0, pad_left = 0;
+    int64 pad_bottom = 0, pad_right = 0;
     string data_format = ToString(data_format_);
     if (data_format == "NHWC") {
       pad_top = paddings[2];
@@ -1672,9 +1668,11 @@
       } else {
         bias_attr.set_output_scales(1, scales);
       }
-      auto bias_pd = memory::primitive_desc(
-          {{bias_tensor.NumElements()}, MklDnnType<Tbias>(), memory::format::x},
-          this->cpu_engine_);
+      auto bias_pd =
+          memory::primitive_desc({{static_cast<int>(bias_tensor.NumElements())},
+                                  MklDnnType<Tbias>(),
+                                  memory::format::x},
+                                 this->cpu_engine_);
 
       void* bias_buf = static_cast<void*>(
           const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
@@ -1785,7 +1783,6 @@
                             memory::format output_tf_format,
                             Tensor** output_tensor) override {
     int summand_idx = context->num_inputs() / 2 - 1;
-    float reorder_sum_scale = 1.0;
     if (std::is_same<Toutput, quint8>::value) {
       summand_idx -= 2;
       DataType summand_type = this->input_type(summand_idx);
diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc
index 7008b47..4c9dbf4 100644
--- a/tensorflow/core/kernels/mkl_dequantize_op.cc
+++ b/tensorflow/core/kernels/mkl_dequantize_op.cc
@@ -55,8 +55,10 @@
 
       // Get the inputs
       const Tensor& src_tensor = MklGetInput(ctx, kSrcIndex);
-      const float min_range = MklGetInput(ctx, kMinIndex).flat<float>()(0);
-      const float max_range = MklGetInput(ctx, kMaxIndex).flat<float>()(0);
+      const float min_range =
+          MklGetInput(ctx, kMinIndex).template flat<float>()(0);
+      const float max_range =
+          MklGetInput(ctx, kMaxIndex).template flat<float>()(0);
 
       // Get MklShape
       MklDnnShape src_mkl_shape;
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 94b0c02..6b6eaac 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -1127,6 +1127,7 @@
                           MklFusedBatchNormOp<CPUDevice, T, T>);
 
 TF_CALL_float(REGISTER_MKL_FUSED_BATCHNORM_CPU);
+TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_CPU);
 #undef REGISTER_MKL_FUSED_BATCHNORM_CPU
 
 #define REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(T, U)                   \
@@ -1138,6 +1139,7 @@
                           MklFusedBatchNormOp<CPUDevice, T, U>);
 
 REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(float, float);
+REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(bfloat16, float);
 #undef REGISTER_MKL_FUSED_BATCHNORM_V2_CPU
 
 #define REGISTER_MKL_FUSED_BATCHNORM_GRAD_CPU(T)                    \
@@ -1148,6 +1150,7 @@
                           MklFusedBatchNormGradOp<CPUDevice, T, T>);
 
 TF_CALL_float(REGISTER_MKL_FUSED_BATCHNORM_GRAD_CPU);
+TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_GRAD_CPU);
 #undef REGISTER_MKL_FUSED_BATCHNORM_GRAD_CPU
 
 #define REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU(T, U)              \
@@ -1159,6 +1162,7 @@
                           MklFusedBatchNormGradOp<CPUDevice, T, U>);
 
 REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU(float, float);
+REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU(bfloat16, float);
 #undef REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_quantize_op.cc b/tensorflow/core/kernels/mkl_quantize_op.cc
index f8504ed..1c7e6ff 100644
--- a/tensorflow/core/kernels/mkl_quantize_op.cc
+++ b/tensorflow/core/kernels/mkl_quantize_op.cc
@@ -26,7 +26,7 @@
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::primitive_attr;
@@ -70,7 +70,6 @@
   ~MklQuantizeV2Op() {}
 
   void Compute(OpKernelContext* ctx) override {
-    const Tensor& input = ctx->input(0);
     const float input_min_range = ctx->input(1).flat<float>()(0);
     const float input_max_range = ctx->input(2).flat<float>()(0);
     float min_range = std::min(0.0f, input_min_range);
diff --git a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
index b5c1a01..8fbb16c 100644
--- a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
+++ b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
@@ -42,7 +42,7 @@
     OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_type_));
     OP_REQUIRES(ctx, out_type_ == DT_QINT8 || out_type_ == DT_QUINT8,
                 errors::InvalidArgument(
-                    "out_type must be qint8 or quint8, but got: " + out_type_));
+                    "out_type must be qint8 or quint8, but got: ", out_type_));
   }
   virtual ~MklRequantizePerChannelOp() {}
   void Compute(OpKernelContext* ctx) override {
@@ -162,11 +162,18 @@
   engine cpu_engine_ = engine(engine::cpu, 0);
 };
 
+// Registration for out_type: qint8
 REGISTER_KERNEL_BUILDER(Name("RequantizePerChannel")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<qint32>("T")
                             .TypeConstraint<qint8>("out_type"),
                         MklRequantizePerChannelOp<CPUDevice, qint8>);
+// Registration for out_type: quint8
+REGISTER_KERNEL_BUILDER(Name("RequantizePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T")
+                            .TypeConstraint<quint8>("out_type"),
+                        MklRequantizePerChannelOp<CPUDevice, quint8>);
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 342e226..9f3fa09 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -263,6 +263,7 @@
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklReshapeOp<CPUDevice, T>);
 TF_CALL_float(REGISTER_MKL_CPU);
+TF_CALL_bfloat16(REGISTER_MKL_CPU);
 #undef REGISTER_MKL_CPU
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc
index e2cbeec..5d238a2 100644
--- a/tensorflow/core/kernels/mkl_slice_op.cc
+++ b/tensorflow/core/kernels/mkl_slice_op.cc
@@ -16,8 +16,9 @@
 // See docs in ../ops/array_ops.cc.
 
 #ifdef INTEL_MKL
-#ifndef INTEL_MKL_ML_ONLY
 
+#include "mkldnn.hpp"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -25,9 +26,6 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/prefetch.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
-#include "mkldnn.hpp"
 #include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::stream;
@@ -485,9 +483,9 @@
                           MklSliceOp<CPUDevice, type>);
 
 TF_CALL_float(REGISTER_MKL_SLICE);
+TF_CALL_bfloat16(REGISTER_MKL_SLICE);
 #undef REGISTER_MKL_SLICE
 
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL_DNN
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index edc7156..e89aa1e 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -184,6 +184,9 @@
       case DT_FLOAT:
         return MKLTransposeND<float>(ctx, in, out, perm);
         break;
+      case DT_BFLOAT16:
+        return MKLTransposeND<bfloat16>(ctx, in, out, perm);
+        break;
       // TODO(nhasabni): support other types such as INT8.
       default:
         break;
@@ -228,6 +231,9 @@
       case DT_FLOAT:
         return MKLTransposeND<float>(ctx, in, out, perm);
         break;
+      case DT_BFLOAT16:
+        return MKLTransposeND<bfloat16>(ctx, in, out, perm);
+        break;
       // TODO(nhasabni): support other types such as INT8.
       default:
         break;
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index 2f4a5e9..0cc29b4 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -74,6 +74,8 @@
   struct SharedLockReleaser {
     std::shared_ptr<LockReleaser> shared_lock;
 
+    SharedLockReleaser() : shared_lock() {}
+
     explicit SharedLockReleaser(std::shared_ptr<LockReleaser>&& lock)
         : shared_lock(std::forward<decltype(lock)>(lock)) {
       VLOG(3) << "Creating shared_ptr of " << shared_lock.get()
@@ -86,6 +88,16 @@
               << " count is: " << shared_lock.use_count();
     }
 
+    SharedLockReleaser& operator=(const SharedLockReleaser& rhs) = delete;
+
+    SharedLockReleaser& operator=(SharedLockReleaser&& rhs) {
+      if (&rhs == this) return *this;
+      std::swap(shared_lock, rhs.shared_lock);
+      VLOG(3) << "Move-assign of SharedLockReleaser of " << shared_lock.get()
+              << " count is: " << shared_lock.use_count();
+      return *this;
+    }
+
     SharedLockReleaser(const SharedLockReleaser& rhs)
         : shared_lock(rhs.shared_lock) {
       VLOG(3) << "Copying SharedLockReleaser of " << shared_lock.get()
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index c506af9..e977aa5 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -15,7 +15,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc
index 1cc81355..0548e38 100644
--- a/tensorflow/core/kernels/one_hot_op.cc
+++ b/tensorflow/core/kernels/one_hot_op.cc
@@ -17,7 +17,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -103,7 +104,7 @@
       for (int i = 0; i < axis; ++i) {
         prefix_dim_size *= indices_shape.dim_size(i);
       }
-      TI suffix_dim_size = indices_shape.num_elements() / prefix_dim_size;
+      int64 suffix_dim_size = indices_shape.num_elements() / prefix_dim_size;
 
       // Split indices into matrix of size prefix_dim_size x suffix_dim_size
       auto indices_t =
@@ -140,7 +141,8 @@
 
 TF_CALL_ALL_TYPES(REGISTER_ONE_HOT);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index fc97962..83ba272 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -15,7 +15,8 @@
 
 // See docs in ../ops/array_ops.cc
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index d215756..a55b4af 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -294,7 +294,8 @@
 TF_CALL_string(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T, Dims)                                         \
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index bd4b0f0..ddc1241 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 03f7249..2c3a8f7 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -23,6 +23,7 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -218,8 +219,12 @@
                                     FunctionLibraryRuntime* lib,
                                     OpKernelContext* ctx, DoneCallback done) {
   FunctionLibraryRuntime::Options run_opts;
-  run_opts.step_id = ctx->step_id();
-  run_opts.step_container = ctx->step_container();
+  ResourceMgr* resource_mgr = lib->device()->resource_manager();
+  ScopedStepContainer* step_container = new ScopedStepContainer(
+      run_opts.step_id, [resource_mgr](const string& name) {
+        resource_mgr->Cleanup(name).IgnoreError();
+      });
+  run_opts.step_container = step_container;
   run_opts.cancellation_manager = ctx->cancellation_manager();
   run_opts.stats_collector = ctx->stats_collector();
   run_opts.collective_executor = ctx->collective_executor();
@@ -229,15 +234,20 @@
   run_opts.source_device =
       lib->device() == nullptr ? "" : lib->device()->name();
   run_opts.allow_dead_tensors = true;
-  // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
-  // constructed rendezvous to a rendezvous manager.
-  Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
+
+  Rendezvous* rendez;
+  OP_REQUIRES_OK_ASYNC(
+      ctx,
+      ctx->create_rendezvous(run_opts.step_id,
+                             ctx->function_library()->device_mgr(), &rendez),
+      done);
   run_opts.rendezvous = rendez;
 
   std::vector<Tensor>* rets = new std::vector<Tensor>;
   const string& func_name = func_->name();
   lib->Run(run_opts, handle, inputs, rets,
-           [rets, rendez, done, ctx, func_name](const Status& status) {
+           [rets, rendez, done, ctx, func_name,
+            step_container](const Status& status) {
              if (!status.ok()) {
                const string function_and_msg =
                    strings::StrCat(errors::FormatFunctionForError(func_name),
@@ -249,6 +259,7 @@
                }
              }
              delete rets;
+             delete step_container;
              rendez->Unref();
              done();
            });
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index d39331c..43f1c6e 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -15,7 +15,8 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -241,7 +242,8 @@
 TF_CALL_double(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 #define REGISTER_GPU_KERNEL(T)                                                 \
   REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV2")                      \
                               .Device(DEVICE_GPU)                              \
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
index 290b639..00d2a3b 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 272aa3b..1a5aa8d 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -32,10 +32,6 @@
 
 namespace tensorflow {
 
-namespace barrier {
-class Barrier;
-}  // namespace barrier
-
 // Functionality common to asynchronous QueueInterface implementations.
 class QueueBase : public QueueInterface {
  public:
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
index 903a97a..730694e 100644
--- a/tensorflow/core/kernels/ragged_gather_op.cc
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -30,10 +30,11 @@
 // For each slice in `(start, limit)` in `value_slices`, append
 // `params_dense_values_in[start:limit] to `values_out`.  `value_size` indicates
 // the number of scalars contained in each value params_dense_values_in[i].
-template <typename VALUE_TYPE>
-void WriteValueSlices(const Tensor& params_dense_values_in,
-                      const std::vector<std::pair<int64, int64>>& value_slices,
-                      int64 value_size, Tensor* values_out) {
+template <typename VALUE_TYPE, typename SPLITS_TYPE>
+void WriteValueSlices(
+    const Tensor& params_dense_values_in,
+    const std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>>& value_slices,
+    SPLITS_TYPE value_size, Tensor* values_out) {
   const auto& params_dense_values =
       params_dense_values_in.flat_outer_dims<VALUE_TYPE, 2>();
   auto values = values_out->flat_outer_dims<VALUE_TYPE, 2>();
@@ -50,7 +51,7 @@
 
 }  // namespace
 
-template <typename INDEX_TYPE>
+template <typename INDEX_TYPE, typename SPLITS_TYPE>
 class RaggedGatherOpBase : public OpKernel {
  public:
   using OpKernel::OpKernel;
@@ -66,18 +67,18 @@
         context->input(params_nested_splits_in.size() + 1);
 
     DCHECK_GT(params_nested_splits_in.size(), 0);  // Enforced by REGISTER_OP.
-    int64 num_params = params_nested_splits_in[0].dim_size(0) - 1;
+    SPLITS_TYPE num_params = params_nested_splits_in[0].dim_size(0) - 1;
     OP_REQUIRES_OK(context, ValidateIndices(indices_in, num_params));
 
     OP_REQUIRES(context, params_dense_values_in.dims() > 0,
                 errors::InvalidArgument("params.rank must be nonzero"));
-    int64 num_params_dense_values = params_dense_values_in.dim_size(0);
+    SPLITS_TYPE num_params_dense_values = params_dense_values_in.dim_size(0);
 
     // Calculate the `splits`, and store the value slices that we need to
     // copy in `value_slices`.
-    std::vector<std::pair<int64, int64>> value_slices;
-    int64 num_values = 0;
-    std::vector<std::vector<int64>> out_splits;
+    std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>> value_slices;
+    SPLITS_TYPE num_values = 0;
+    std::vector<std::vector<SPLITS_TYPE>> out_splits;
     OP_REQUIRES_OK(context, MakeSplits(indices_in, params_nested_splits_in,
                                        num_params_dense_values, &out_splits,
                                        &value_slices, &num_values));
@@ -90,12 +91,14 @@
   }
 
  private:
+  using ConstFlatType = typename TTypes<SPLITS_TYPE>::ConstFlat;
+
   // Check if any indices are out-of-bounds.
   ::tensorflow::Status ValidateIndices(const Tensor& indices_in,
-                                       int64 num_params) {
+                                       SPLITS_TYPE num_params) {
     const auto& indices = indices_in.flat<INDEX_TYPE>();
-    for (int64 i = 0; i < indices.size(); ++i) {
-      int64 index = indices(i);
+    for (SPLITS_TYPE i = 0; i < indices.size(); ++i) {
+      SPLITS_TYPE index = indices(i);
       if (index < 0 || index >= num_params) {
         return errors::InvalidArgument(
             "indices", SliceDebugString(indices_in.shape(), i), " = ", index,
@@ -111,9 +114,10 @@
   // we need for allocating the output values tensor) is stored in `num_values`.
   ::tensorflow::Status MakeSplits(
       const Tensor& indices_in, const OpInputList& params_nested_splits_in,
-      int64 num_params_dense_values,
-      std::vector<std::vector<int64>>* out_splits,
-      std::vector<std::pair<int64, int64>>* value_slices, int64* num_values) {
+      SPLITS_TYPE num_params_dense_values,
+      std::vector<std::vector<SPLITS_TYPE>>* out_splits,
+      std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>>* value_slices,
+      SPLITS_TYPE* num_values) {
     *num_values = 0;
     value_slices->clear();
 
@@ -122,10 +126,10 @@
 
     // Get Eigen tensors.
     const auto& indices = indices_in.flat<INDEX_TYPE>();
-    std::vector<TTypes<int64>::ConstFlat> params_nested_splits;
+    std::vector<ConstFlatType> params_nested_splits;
     params_nested_splits.reserve(params_nested_splits_in.size());
     for (const auto& splits_in : params_nested_splits_in) {
-      params_nested_splits.push_back(splits_in.flat<int64>());
+      params_nested_splits.push_back(splits_in.flat<SPLITS_TYPE>());
     }
 
     TF_RETURN_IF_ERROR(
@@ -165,7 +169,7 @@
         const auto& splits = params_nested_splits[dim];
         int out_dim = dim + indices_in.dims() - 1;
         if (out_dim >= 0) {
-          int64 delta = out_splits->at(out_dim).back() - splits(start);
+          SPLITS_TYPE delta = out_splits->at(out_dim).back() - splits(start);
           for (int j = start; j < limit; ++j) {
             out_splits->at(out_dim).push_back(splits(j + 1) + delta);
           }
@@ -182,14 +186,14 @@
   }
 
   ::tensorflow::Status ValidateSplits(
-      const std::vector<TTypes<int64>::ConstFlat>& params_nested_splits,
-      int64 num_params_dense_values) {
+      const std::vector<ConstFlatType>& params_nested_splits,
+      SPLITS_TYPE num_params_dense_values) {
     // Validate
     for (int dim = 0; dim < params_nested_splits.size(); ++dim) {
       const auto& splits = params_nested_splits[dim];
-      int64 last_split = (dim == params_nested_splits.size() - 1)
-                             ? num_params_dense_values
-                             : params_nested_splits[dim + 1].size();
+      SPLITS_TYPE last_split = (dim == params_nested_splits.size() - 1)
+                                   ? num_params_dense_values
+                                   : params_nested_splits[dim + 1].size();
       if (splits.size() == 0) {
         return errors::InvalidArgument("Ragged splits may not be empty");
       }
@@ -210,17 +214,17 @@
   }
 
   ::tensorflow::Status WriteSplits(
-      const std::vector<std::vector<int64>>& out_splits,
+      const std::vector<std::vector<SPLITS_TYPE>>& out_splits,
       OpKernelContext* context) {
     OpOutputList splits_out;
     TF_RETURN_IF_ERROR(
         context->output_list("output_nested_splits", &splits_out));
     for (int i = 0; i < out_splits.size(); ++i) {
       Tensor* splits;
-      int64 num_splits = out_splits[i].size();
+      SPLITS_TYPE num_splits = out_splits[i].size();
       TF_RETURN_IF_ERROR(
           splits_out.allocate(i, TensorShape({num_splits}), &splits));
-      auto splits_flat = splits->flat<int64>();
+      auto splits_flat = splits->flat<SPLITS_TYPE>();
       std::copy_n(out_splits[i].data(), out_splits[i].size(),
                   splits_flat.data());
     }
@@ -229,15 +233,16 @@
 
   ::tensorflow::Status WriteValues(
       const Tensor& params_dense_values_in,
-      const std::vector<std::pair<int64, int64>>& value_slices,
-      int values_index, int64 num_values, OpKernelContext* context) const {
+      const std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>>& value_slices,
+      int values_index, SPLITS_TYPE num_values,
+      OpKernelContext* context) const {
     Tensor* values_out = nullptr;
     TensorShape values_shape = params_dense_values_in.shape();
     values_shape.set_dim(0, num_values);
     TF_RETURN_IF_ERROR(
         context->allocate_output(values_index, values_shape, &values_out));
-    const int64 num_elements = params_dense_values_in.NumElements();
-    const int64 value_size =
+    const SPLITS_TYPE num_elements = params_dense_values_in.NumElements();
+    const SPLITS_TYPE value_size =
         num_elements == 0 ? 0
                           : (num_elements / params_dense_values_in.dim_size(0));
     CallWriteValueSlices(params_dense_values_in, value_slices, value_size,
@@ -253,34 +258,39 @@
   // which cuts the binary size of this op from ~300k to <90k.
   virtual void CallWriteValueSlices(
       const Tensor& params_dense_values_in,
-      const std::vector<std::pair<int64, int64>>& value_slices,
-      int64 value_size, Tensor* values_out) const = 0;
+      const std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>>& value_slices,
+      SPLITS_TYPE value_size, Tensor* values_out) const = 0;
 };
 
-template <typename INDEX_TYPE, typename VALUE_TYPE>
-class RaggedGatherOp : public RaggedGatherOpBase<INDEX_TYPE> {
+template <typename INDEX_TYPE, typename VALUE_TYPE, typename SPLITS_TYPE>
+class RaggedGatherOp : public RaggedGatherOpBase<INDEX_TYPE, SPLITS_TYPE> {
  public:
-  using RaggedGatherOpBase<INDEX_TYPE>::RaggedGatherOpBase;
+  using RaggedGatherOpBase<INDEX_TYPE, SPLITS_TYPE>::RaggedGatherOpBase;
 
  private:
   void CallWriteValueSlices(
       const Tensor& params_dense_values_in,
-      const std::vector<std::pair<int64, int64>>& value_slices,
-      int64 value_size, Tensor* values_out) const override {
+      const std::vector<std::pair<SPLITS_TYPE, SPLITS_TYPE>>& value_slices,
+      SPLITS_TYPE value_size, Tensor* values_out) const override {
     WriteValueSlices<VALUE_TYPE>(params_dense_values_in, value_slices,
                                  value_size, values_out);
   }
 };
 
-#define REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(index_type, value_type)   \
-  REGISTER_KERNEL_BUILDER(Name("RaggedGather")                        \
-                              .Device(DEVICE_CPU)                     \
-                              .TypeConstraint<index_type>("Tindices") \
-                              .TypeConstraint<value_type>("Tvalues"), \
-                          RaggedGatherOp<index_type, value_type>);
-#define REGISTER_CPU_KERNEL(value_type)                  \
-  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int32, value_type) \
-  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int64, value_type)
+#define REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(index_type, value_type, \
+                                            splits_type)            \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("RaggedGather")                                          \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<index_type>("Tindices")                   \
+          .TypeConstraint<value_type>("Tvalues")                    \
+          .TypeConstraint<splits_type>("Tsplits"),                  \
+      RaggedGatherOp<index_type, value_type, splits_type>);
+#define REGISTER_CPU_KERNEL(value_type)                         \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int32, value_type, int32) \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int64, value_type, int32) \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int32, value_type, int64) \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int64, value_type, int64)
 TF_CALL_POD_TYPES(REGISTER_CPU_KERNEL);
 TF_CALL_string(REGISTER_CPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
diff --git a/tensorflow/core/kernels/ragged_range_op.cc b/tensorflow/core/kernels/ragged_range_op.cc
index cb7546c..024b16f 100644
--- a/tensorflow/core/kernels/ragged_range_op.cc
+++ b/tensorflow/core/kernels/ragged_range_op.cc
@@ -26,7 +26,7 @@
 
 using errors::InvalidArgument;
 
-template <typename T>
+template <typename T, typename SPLITS_TYPE>
 class RaggedRangeOp : public OpKernel {
  public:
   using OpKernel::OpKernel;
@@ -60,7 +60,7 @@
                   InvalidArgument("starts, limits, and deltas must have the "
                                   "same shape"));
     }
-    int64 nrows = in_sizes.empty() ? 1 : in_sizes[0];
+    SPLITS_TYPE nrows = in_sizes.empty() ? 1 : in_sizes[0];
 
     const auto& starts = starts_in.flat<T>();
     const auto& limits = limits_in.flat<T>();
@@ -71,7 +71,7 @@
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({nrows + 1}),
                                             &rt_nested_splits_out));
-    auto rt_nested_splits = rt_nested_splits_out->flat<int64>();
+    auto rt_nested_splits = rt_nested_splits_out->flat<SPLITS_TYPE>();
     rt_nested_splits(0) = 0;
     for (int row = 0; row < nrows; ++row) {
       T start = broadcast_starts ? starts(0) : starts(row);
@@ -81,7 +81,7 @@
       rt_nested_splits(row + 1) =
           rt_nested_splits(row) + RangeSize(start, limit, delta);
     }
-    int64 nvals = rt_nested_splits(nrows);
+    SPLITS_TYPE nvals = rt_nested_splits(nrows);
 
     // Construct the rt_dense_values tensor.
     Tensor* rt_dense_values_out = nullptr;
@@ -90,10 +90,10 @@
     auto rt_dense_values = rt_dense_values_out->flat<T>();
     int value_index = 0;
     for (int row = 0; row < nrows; ++row) {
-      int64 row_size = rt_nested_splits(row + 1) - rt_nested_splits(row);
+      SPLITS_TYPE row_size = rt_nested_splits(row + 1) - rt_nested_splits(row);
       T value = broadcast_starts ? starts(0) : starts(row);
       T delta = broadcast_deltas ? deltas(0) : deltas(row);
-      for (int64 i = 0; i < row_size; ++i) {
+      for (SPLITS_TYPE i = 0; i < row_size; ++i) {
         rt_dense_values(value_index++) = T(value);
         value += delta;
       }
@@ -102,7 +102,7 @@
 
  private:
   // Returns the number of elements in the specified range.
-  int64 RangeSize(T start, T limit, T delta) {
+  SPLITS_TYPE RangeSize(T start, T limit, T delta) {
     if (((delta > 0) && (limit < start)) || ((delta < 0) && (limit > start))) {
       return 0;
     }
@@ -114,10 +114,17 @@
   }
 };
 
-#define REGISTER_CPU_KERNEL(TYPE)                                       \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("RaggedRange").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      RaggedRangeOp<TYPE>);
+#define REGISTER_CPU_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(Name("RaggedRange")                    \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<TYPE>("T")         \
+                              .TypeConstraint<int32>("Tsplits"), \
+                          RaggedRangeOp<TYPE, int32>);           \
+  REGISTER_KERNEL_BUILDER(Name("RaggedRange")                    \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<TYPE>("T")         \
+                              .TypeConstraint<int64>("Tsplits"), \
+                          RaggedRangeOp<TYPE, int64>);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 TF_CALL_int32(REGISTER_CPU_KERNEL);
diff --git a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
new file mode 100644
index 0000000..3ba266b
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
@@ -0,0 +1,314 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace {
+
+struct RaggedTensor {
+  Tensor values;                      // Values Tensor; rank >= 1 once decoded.
+  std::vector<Tensor> nested_splits;  // Splits Tensors, one per ragged dim.
+};
+
+// Decodes each Variant in `encoded_variant` (a 1-D DT_VARIANT list of splits
+// Tensors followed by one values Tensor) into `decoded_ragged`, validating it.
+Status RaggedComponentsFromVariant(const Tensor& encoded_variant,
+                                   int ragged_rank, DataType value_dtype,
+                                   DataType split_dtype,
+                                   std::vector<RaggedTensor>* decoded_ragged) {
+  const auto& flat_variants = encoded_variant.flat<Variant>();
+  decoded_ragged->resize(flat_variants.size());
+  // Step 1: Extract the 1-D DT_VARIANT Tensor from each input Variant element.
+  for (int i = 0; i < flat_variants.size(); i++) {
+    const auto& flat_variant = flat_variants(i);
+    const Tensor* encoded_list = flat_variant.get<Tensor>();
+    if (encoded_list == nullptr) {
+      return errors::InvalidArgument(
+          "Input Variant element at index ", i,
+          " doesn't hold a Tensor: ", flat_variant.DebugString());
+    }
+    if (encoded_list->dims() != 1) {
+      return errors::InvalidArgument(
+          "Encoded input Variant must have rank 1, but found rank: ",
+          encoded_list->dims(),
+          ". encoded input Variant: ", encoded_list->DebugString());
+    }
+    if (encoded_list->NumElements() != (ragged_rank + 1) &&
+        encoded_list->NumElements() != 1) {
+      return errors::InvalidArgument(
+          "Encoded input Variant must hold either input_ragged_rank + 1 "
+          "Tensors or an empty Tensor (zero splits Tensors, 1 values Tensor), "
+          "input_ragged_rank: ",
+          ragged_rank,
+          ", encoded input Variant: ", encoded_list->DebugString());
+    }
+    const auto& input_vec = encoded_list->vec<Variant>();
+    // Step 2: Get the splits and values Tensors. A 1-element list has only a
+    // values Tensor, so count the splits from the list, not from `ragged_rank`.
+    const int num_splits = encoded_list->NumElements() - 1;
+    (*decoded_ragged)[i].nested_splits.reserve(num_splits);
+    for (int j = 0; j < num_splits; j++) {
+      const Tensor* split_tensor = input_vec(j).get<Tensor>();
+      if (split_tensor == nullptr) {
+        return errors::InvalidArgument(
+            "Encoded scalar element at index ", i,
+            " doesn't have a splits Tensor at split_index ", j, ": ",
+            input_vec(j).DebugString());
+      }
+      if (split_tensor->dtype() != split_dtype) {
+        return errors::InvalidArgument(
+            "Expected splits Tensor dtype: ", split_dtype,
+            ", found: ", split_tensor->dtype());
+      }
+      if (split_tensor->dims() != 1) {
+        return errors::InvalidArgument(
+            "Ragged splits must have rank 1; encoded scalar element at index ",
+            i, " has splits Tensor at split_index ", j, ": ",
+            split_tensor->DebugString());
+      }
+      (*decoded_ragged)[i].nested_splits.push_back(*split_tensor);
+    }
+    const Tensor* values_tensor = input_vec(num_splits).get<Tensor>();
+    if (values_tensor == nullptr) {
+      return errors::InvalidArgument("Encoded scalar element at index ", i,
+                                     " doesn't have a values Tensor: ",
+                                     input_vec(num_splits).DebugString());
+    }
+    if (values_tensor->dtype() != value_dtype) {
+      return errors::InvalidArgument(
+          "Expected values Tensor dtype: ", value_dtype,
+          ", found: ", values_tensor->dtype());
+    }
+    if (values_tensor->dims() < 1) {
+      return errors::InvalidArgument(
+          "Ragged values must have rank >= 1; encoded scalar element at index ",
+          i, " has values Tensor: ", values_tensor->DebugString());
+    }
+    (*decoded_ragged)[i].values = *values_tensor;
+  }
+  return Status::OK();
+}
+
+// Stacks `ragged_components` (batch shape `nested_dim_sizes`) into one
+// RaggedTensor: batch splits, merged component splits, concatenated values.
+template <typename VALUE_TYPE, typename SPLIT_TYPE>
+Status NestedStackRaggedTensors(
+    const std::vector<RaggedTensor>& ragged_components,
+    const std::vector<int>& nested_dim_sizes, const int input_ragged_rank,
+    const int output_ragged_rank, RaggedTensor* output_ragged) {
+  output_ragged->nested_splits.reserve(output_ragged_rank);
+  const int dims = nested_dim_sizes.size();
+  // Populate first `dims - 1` splits.
+  for (int i = 0; i < dims - 1; i++) {
+    int dims_splits_size = nested_dim_sizes[i] + 1;
+    output_ragged->nested_splits.push_back(Tensor(
+        DataTypeToEnum<SPLIT_TYPE>::value, TensorShape({dims_splits_size})));
+    auto splits_vec = output_ragged->nested_splits.back().vec<SPLIT_TYPE>();
+    for (int j = 0; j < dims_splits_size; j++) {
+      splits_vec(j) = j * nested_dim_sizes[i + 1];
+    }
+  }
+
+  // Populate `dims`-th split: each component contributes its number of rows.
+  int splits_size = ragged_components.size() + 1;
+  output_ragged->nested_splits.push_back(
+      Tensor(DataTypeToEnum<SPLIT_TYPE>::value, TensorShape({splits_size})));
+  auto dims_splits_vec = output_ragged->nested_splits.back().vec<SPLIT_TYPE>();
+  dims_splits_vec(0) = 0;
+  for (int i = 0; i < ragged_components.size(); i++) {
+    int split_val = ragged_components[i].values.dim_size(0);  // row count
+    if (input_ragged_rank != 0 && !ragged_components[i].nested_splits.empty()) {
+      split_val = ragged_components[i].nested_splits[0].NumElements() - 1;
+    }
+    dims_splits_vec(i + 1) = dims_splits_vec(i) + split_val;
+  }
+
+  // Populate last `input_ragged_rank` splits.
+  for (int i = 0; i < input_ragged_rank; i++) {
+    int split_size = 1;
+    for (int j = 0; j < ragged_components.size(); j++) {
+      if (!ragged_components[j].nested_splits.empty()) {
+        split_size += ragged_components[j].nested_splits[i].NumElements() - 1;
+      }
+    }
+    output_ragged->nested_splits.push_back(
+        Tensor(DataTypeToEnum<SPLIT_TYPE>::value, TensorShape({split_size})));
+    auto splits_vec = output_ragged->nested_splits.back().vec<SPLIT_TYPE>();
+    splits_vec(0) = 0;
+    SPLIT_TYPE last_split_value = 0;
+    int index = 1;
+    for (int j = 0; j < ragged_components.size(); j++) {
+      // Corner case: empty row. e.g [ [[x], [x]], [] ]
+      if (ragged_components[j].nested_splits.empty()) continue;
+      auto component_splits_vec =
+          ragged_components[j].nested_splits[i].vec<SPLIT_TYPE>();
+      for (int k = 1; k < component_splits_vec.size(); k++, index++) {
+        splits_vec(index) = component_splits_vec(k) + last_split_value;
+      }
+      last_split_value = splits_vec(index - 1);
+    }
+  }
+
+  // Populate values. With no components, emit an empty 1-D values Tensor.
+  TensorShape component_values_shape =
+      ragged_components.empty() ? TensorShape({0})
+                                : ragged_components[0].values.shape();
+  int values_size = component_values_shape.dim_size(0);
+  for (int i = 1; i < ragged_components.size(); i++) {
+    const TensorShape& shape_i = ragged_components[i].values.shape();
+    if (shape_i.dims() != component_values_shape.dims()) {
+      return errors::InvalidArgument(
+          "Rank of values must match for all "
+          "components; values shape at index 0: ",
+          component_values_shape.DebugString(), ", values shape at index ", i,
+          ": ", shape_i.DebugString());
+    }
+    for (int d = 1; d < shape_i.dims(); d++) {  // guards the row copy below
+      if (shape_i.dim_size(d) != component_values_shape.dim_size(d)) {
+        return errors::InvalidArgument(
+            "Inner dimensions of values must match for all components; "
+            "values shape at index 0: ", component_values_shape.DebugString(),
+            ", values shape at index ", i, ": ", shape_i.DebugString());
+      }
+    }
+    values_size += shape_i.dim_size(0);
+  }
+  component_values_shape.set_dim(0, values_size);
+  output_ragged->values =
+      Tensor(DataTypeToEnum<VALUE_TYPE>::value, component_values_shape);
+  auto output_values_flat =
+      output_ragged->values.flat_outer_dims<VALUE_TYPE, 2>();
+  int values_index = 0;
+  for (int i = 0; i < ragged_components.size(); i++) {
+    auto src = ragged_components[i].values.flat_outer_dims<VALUE_TYPE, 2>();
+    for (int j = 0; j < src.dimension(0); j++, values_index++) {
+      for (int k = 0; k < src.dimension(1); k++) {
+        output_values_flat(values_index, k) = src(j, k);
+      }
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+// Kernel for RaggedTensorFromVariant: decodes the DT_VARIANT input into ragged
+// components and outputs the resulting nested splits followed by the values.
+template <typename VALUE_TYPE, typename SPLIT_TYPE>
+class RaggedTensorFromVariantOp : public OpKernel {
+ public:
+  explicit RaggedTensorFromVariantOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("input_ragged_rank", &input_ragged_rank_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("output_ragged_rank", &output_ragged_rank_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& encoded_variant = context->input(0);
+    // Keep the inferred rank in a local: the kernel object is reused across
+    // Compute() calls, so writing it back would leak into later invocations.
+    int input_ragged_rank = input_ragged_rank_;
+    if (input_ragged_rank == -1) {  // Infer input_ragged_rank.
+      input_ragged_rank = output_ragged_rank_ - encoded_variant.dims();
+      OP_REQUIRES(context, input_ragged_rank >= 0,
+                  errors::InvalidArgument(
+                      "Inferred input_ragged_rank (output_ragged_rank - "
+                      "encoded_variant.dims()) must be >= 0, found "
+                      "output_ragged_rank: ",
+                      output_ragged_rank_,
+                      ", encoded_variant.dims(): ", encoded_variant.dims(),
+                      ", inferred input_ragged_rank: ", input_ragged_rank));
+    }
+    OP_REQUIRES(
+        context,
+        output_ragged_rank_ == encoded_variant.dims() + input_ragged_rank,
+        errors::InvalidArgument(
+            "output_ragged_rank must be equal to input_ragged_rank + "
+            "encoded_ragged.dims(); output_ragged_rank: ",
+            output_ragged_rank_, ", input_ragged_rank: ", input_ragged_rank,
+            ", encoded_variant.dims(): ", encoded_variant.dims(), "."));
+
+    // Decode all variants.
+    const auto value_dtype = DataTypeToEnum<VALUE_TYPE>::v();
+    const auto split_dtype = DataTypeToEnum<SPLIT_TYPE>::v();
+    std::vector<RaggedTensor> decoded_components;
+    OP_REQUIRES_OK(context, RaggedComponentsFromVariant(
+                                encoded_variant, input_ragged_rank,
+                                value_dtype, split_dtype, &decoded_components));
+
+    // Corner case: input is a scalar.
+    if (encoded_variant.dims() == 0) {
+      ReturnRaggedTensor(context, decoded_components[0]);
+      return;
+    }
+    // Nested-Stack Ragged components into a batched RaggedTensor.
+    std::vector<int> encoded_dim_sizes(encoded_variant.dims(), 0);
+    for (int i = 0; i < encoded_variant.dims(); i++) {
+      encoded_dim_sizes[i] = encoded_variant.dim_size(i);
+    }
+    RaggedTensor output_ragged;
+    OP_REQUIRES_OK(
+        context, NestedStackRaggedTensors<VALUE_TYPE, SPLIT_TYPE>(
+                     decoded_components, encoded_dim_sizes, input_ragged_rank,
+                     output_ragged_rank_, &output_ragged));
+    ReturnRaggedTensor(context, output_ragged);
+  }
+
+ private:
+  int input_ragged_rank_;   // Attr value; -1 means "infer it in Compute()".
+  int output_ragged_rank_;  // Attr value.
+
+  void ReturnRaggedTensor(OpKernelContext* context,
+                          const RaggedTensor& ragged_tensor) {
+    int ragged_rank = ragged_tensor.nested_splits.size();
+    OpOutputList splits_out;
+    OP_REQUIRES_OK(context,
+                   context->output_list("output_nested_splits", &splits_out));
+    for (int i = 0; i < ragged_rank; i++) {
+      splits_out.set(i, ragged_tensor.nested_splits[i]);
+    }
+    context->set_output(ragged_rank, ragged_tensor.values);
+  }
+};
+
+#define REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, split_type)      \
+  REGISTER_KERNEL_BUILDER(Name("RaggedTensorFromVariant")             \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<value_type>("Tvalues")  \
+                              .TypeConstraint<split_type>("Tsplits"), \
+                          RaggedTensorFromVariantOp<value_type, split_type>);
+#define REGISTER_KERNELS(value_type)                  \
+  REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, int32) \
+  REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, int64)
+TF_CALL_POD_TYPES(REGISTER_KERNELS);  // int32 and int64 splits per value type.
+TF_CALL_string(REGISTER_KERNELS);
+TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
+TF_CALL_quint16(REGISTER_KERNELS);
+TF_CALL_qint16(REGISTER_KERNELS);
+TF_CALL_uint32(REGISTER_KERNELS);
+TF_CALL_uint64(REGISTER_KERNELS);
+#undef REGISTER_KERNELS  // Helper macros are not needed past this point.
+#undef REGISTER_KERNELS_WITH_SPLIT_TYPE
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_from_variant_op_test.cc b/tensorflow/core/kernels/ragged_tensor_from_variant_op_test.cc
new file mode 100644
index 0000000..cb51be2
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_from_variant_op_test.cc
@@ -0,0 +1,695 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <utility>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class RaggedTensorFromVariantKernelTest : public ::tensorflow::OpsTestBase {
+ protected:
+  // Builds the tensorflow test graph for the RaggedTensorFromVariant op, and
+  // populates the variant input with the given values.
+  template <typename VALUE_TYPE, typename SPLIT_TYPE>
+  void BuildDecodeRaggedTensorGraph(
+      const int input_ragged_rank, const int output_ragged_rank,
+      const TensorShape& variant_shape,
+      const std::vector<Variant>& variant_values) {
+    const auto value_dtype = DataTypeToEnum<VALUE_TYPE>::v();
+    const auto split_dtype = DataTypeToEnum<SPLIT_TYPE>::v();
+    TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorFromVariant")
+                     .Input(FakeInput(DT_VARIANT))
+                     .Attr("input_ragged_rank", input_ragged_rank)
+                     .Attr("output_ragged_rank", output_ragged_rank)
+                     .Attr("Tvalues", value_dtype)
+                     .Attr("Tsplits", split_dtype)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+    AddInputFromArray<Variant>(variant_shape, variant_values);
+  }
+
+  // Encodes a ragged tensor as a 1-D DT_VARIANT Tensor [splits..., values].
+  template <typename VALUE_TYPE, typename SPLIT_TYPE>
+  Tensor CreateVariantFromRagged(
+      const std::vector<std::vector<SPLIT_TYPE>>& ragged_splits,
+      const TensorShape& ragged_values_shape,
+      const std::vector<VALUE_TYPE>& ragged_values) {
+    // Step 1: Create Tensors out of ragged splits and values.
+    std::vector<Variant> ragged_components;
+    for (const auto& ragged_split : ragged_splits) {  // No per-split copy.
+      int size = ragged_split.size();
+      Tensor splits(DataTypeToEnum<SPLIT_TYPE>::v(), TensorShape({size}));
+      test::FillValues<SPLIT_TYPE>(&splits, ragged_split);
+      ragged_components.push_back(splits);
+    }
+    Tensor values(DataTypeToEnum<VALUE_TYPE>::v(), ragged_values_shape);
+    test::FillValues<VALUE_TYPE>(&values, ragged_values);
+    ragged_components.push_back(values);
+
+    // Step 2: Encode into a 1-D Variant Tensor.
+    int num_splits = ragged_splits.size();
+    Tensor encoded_list(DT_VARIANT, TensorShape({num_splits + 1}));
+    test::FillValues<Variant>(&encoded_list, ragged_components);
+    return encoded_list;
+  }
+};
+
+TEST_F(RaggedTensorFromVariantKernelTest, ScalarInput) {
+  const std::vector<int64> split_1 = {0, 1, 2, 3, 4, 5};
+  const std::vector<int64> split_2 = {0, 1, 2, 5, 6, 7};
+  const std::vector<int> values = {0, 1, 1, 2, 2, 3, 4};
+
+  Tensor encoded_variant = CreateVariantFromRagged<int, int64>(
+      {split_1, split_2}, TensorShape({7}), values);
+  Tensor expected_splits_1(DT_INT64, TensorShape({6}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({6}));
+  Tensor expected_values(DT_INT32, TensorShape({7}));
+
+  test::FillValues<int64>(&expected_splits_1, split_1);
+  test::FillValues<int64>(&expected_splits_2, split_2);
+  test::FillValues<int>(&expected_values, values);
+
+  int input_ragged_rank = 2;   // The encoded component has two splits.
+  int output_ragged_rank = 2;  // A scalar input adds no batch dimensions.
+  BuildDecodeRaggedTensorGraph<int, int64>(input_ragged_rank,
+                                           output_ragged_rank, TensorShape({}),
+                                           {encoded_variant});
+  TF_ASSERT_OK(RunOpKernel());  // Expect the encoded parts back unchanged.
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int>(*GetOutput(2), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, OneInputElement) {
+  const std::vector<int64> split_1 = {0, 1, 2, 3, 4, 5};
+  const std::vector<int64> split_2 = {0, 1, 2, 5, 6, 7};
+  const std::vector<int> values = {0, 1, 1, 2, 2, 3, 4};
+  const std::vector<int64> batched_splits_1 = {0, 5};  // One row of length 5.
+
+  Tensor encoded_variant = CreateVariantFromRagged<int, int64>(
+      {split_1, split_2}, TensorShape({7}), values);
+  Tensor expected_splits_1(DT_INT64, TensorShape({2}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({6}));
+  Tensor expected_splits_3(DT_INT64, TensorShape({6}));
+  Tensor expected_values(DT_INT32, TensorShape({7}));
+
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, split_1);
+  test::FillValues<int64>(&expected_splits_3, split_2);
+  test::FillValues<int>(&expected_values, values);
+
+  int input_ragged_rank = 2;   // The encoded component has two splits.
+  int output_ragged_rank = 3;  // The [1]-shaped input adds one batch dim.
+  BuildDecodeRaggedTensorGraph<int, int64>(input_ragged_rank,
+                                           output_ragged_rank, TensorShape({1}),
+                                           {encoded_variant});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int64>(*GetOutput(2), expected_splits_3);
+  test::ExpectTensorEqual<int>(*GetOutput(3), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, TensorIn2DOut) {
+  // component_1 = [x, x, x]
+  // component_2 = []
+  // component_3 = [x, x]
+  // component_4 = [x]
+  // batched_ragged =
+  // [[component_1, component_2], [component_3, component_4]]
+  // [
+  //   [ [x, x, x], []  ],
+  //   [ [x, x],    [x] ]
+  // ]
+  const std::vector<int> values_1 = {1, 2, 3};
+  const std::vector<int> values_2 = {};
+  const std::vector<int> values_3 = {4, 5};
+  const std::vector<int> values_4 = {6};
+  const std::vector<int64> batched_splits_1 = {0, 2, 4};
+  const std::vector<int64> batched_splits_2 = {0, 3, 3, 5, 6};
+  const std::vector<int> batched_values = {1, 2, 3, 4, 5, 6};
+
+  Tensor component_variant_1 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({3}), values_1);
+  Tensor component_variant_2 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({0}), values_2);
+  Tensor component_variant_3 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({2}), values_3);
+  Tensor component_variant_4 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({1}), values_4);
+
+  Tensor expected_splits_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({5}));
+  Tensor expected_values(DT_INT32, TensorShape({6}));
+
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  int input_ragged_rank = 0;   // Components are encoded without splits.
+  int output_ragged_rank = 2;  // Two batch dims from the [2, 2] input.
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2, 2}),
+      {component_variant_1, component_variant_2, component_variant_3,
+       component_variant_4});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int>(*GetOutput(2), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, NonEmpty1DIn3DOut) {
+  // ragged_component_1 = [[x]]
+  // ragged_component_2 = [[x], [x]]
+  // ragged_component_3 = [[x, x]]
+  // ragged_component_4 = [[x, x], [x]]
+  // ragged_component_5 = [[x], [x, x]]
+  // batched_ragged = [[rc1, rc2, rc3, rc4, rc5], [rc4, rc5, rc1, rc3, rc2]]
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int64> component_split_2_1 = {0, 1, 2};
+  const std::vector<int64> component_split_3_1 = {0, 2};
+  const std::vector<int64> component_split_4_1 = {0, 2, 3};
+  const std::vector<int64> component_split_5_1 = {0, 1, 3};
+  const std::vector<int> component_values_1 = {0};
+  const std::vector<int> component_values_2 = {0, 1};
+  const std::vector<int> component_values_3 = {0, 1};
+  const std::vector<int> component_values_4 = {0, 1, 2};
+  const std::vector<int> component_values_5 = {0, 1, 2};
+
+  const std::vector<int64> batched_splits_1 = {0, 5, 10};
+  const std::vector<int64> batched_splits_2 = {0,  1,  3,  4,  6, 8,
+                                               10, 12, 13, 14, 16};
+  const std::vector<int64> batched_splits_3 = {
+      0, 1, 2, 3, 5, 7, 8, 9, 11, 13, 14, 15, 17, 18, 20, 21, 22};
+  const std::vector<int> batched_values = {0, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2,
+                                           0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1};
+
+  Tensor expected_splits_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({11}));
+  Tensor expected_splits_3(DT_INT64, TensorShape({17}));
+  Tensor expected_values(DT_INT32, TensorShape({22}));
+
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int64>(&expected_splits_3, batched_splits_3);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1}, TensorShape({2}), component_values_2);
+  Tensor variant_component_3 = CreateVariantFromRagged<int, int64>(
+      {component_split_3_1}, TensorShape({2}), component_values_3);
+  Tensor variant_component_4 = CreateVariantFromRagged<int, int64>(
+      {component_split_4_1}, TensorShape({3}), component_values_4);
+  Tensor variant_component_5 = CreateVariantFromRagged<int, int64>(
+      {component_split_5_1}, TensorShape({3}), component_values_5);
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 3;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2, 5}),
+      {variant_component_1, variant_component_2, variant_component_3,
+       variant_component_4, variant_component_5, variant_component_4,
+       variant_component_5, variant_component_1, variant_component_3,
+       variant_component_2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int64>(*GetOutput(2), expected_splits_3);
+  test::ExpectTensorEqual<int>(*GetOutput(3), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest,
+       NonEmpty2DIn4DOutInferredInputRaggedRank) {
+  // ragged_component_1 =
+  // [
+  //   [ [x]            ],
+  //   [ [x],    [x]    ],
+  //   [ [x, x]         ],
+  //   [ [x, x], [x]    ],
+  //   [ [x],    [x, x] ]
+  // ]
+  // ragged_component_2 =
+  // [
+  //   [ [x, x], [x]    ],
+  //   [ [x],    [x, x] ],
+  //   [ [x]            ],
+  //   [ [x, x]         ],
+  //   [ [x],    [x]    ]
+  // ]
+  // batched_ragged = [[rc1, rc2], [rc2, rc1]]
+  const std::vector<int64> component_split_1_1 = {0, 1, 3, 4, 6, 8};
+  const std::vector<int64> component_split_1_2 = {0, 1, 2, 3, 5, 7, 8, 9, 11};
+  const std::vector<int64> component_split_2_1 = {0, 2, 4, 5, 6, 8};
+  const std::vector<int64> component_split_2_2 = {0, 2, 3, 4, 6, 7, 9, 10, 11};
+  const std::vector<int> component_values_1 = {0, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2};
+  const std::vector<int> component_values_2 = {0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1};
+  const std::vector<int64> batched_splits_1 = {0, 2, 4};
+  const std::vector<int64> batched_splits_2 = {0, 5, 10, 15, 20};
+  const std::vector<int64> batched_splits_3 = {0,  1,  3,  4,  6,  8,  10,
+                                               12, 13, 14, 16, 18, 20, 21,
+                                               22, 24, 25, 27, 28, 30, 32};
+  const std::vector<int64> batched_splits_4 = {
+      0,  1,  2,  3,  5,  7,  8,  9,  11, 13, 14, 15, 17, 18, 20, 21, 22,
+      24, 25, 26, 28, 29, 31, 32, 33, 34, 35, 36, 38, 40, 41, 42, 44};
+  const std::vector<int> batched_values = {
+      0, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1,
+      0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2};
+
+  Tensor expected_splits_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({5}));
+  Tensor expected_splits_3(DT_INT64, TensorShape({21}));
+  Tensor expected_splits_4(DT_INT64, TensorShape({33}));
+  Tensor expected_values(DT_INT32, TensorShape({44}));
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int64>(&expected_splits_3, batched_splits_3);
+  test::FillValues<int64>(&expected_splits_4, batched_splits_4);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1, component_split_1_2}, TensorShape({11}),
+      component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1, component_split_2_2}, TensorShape({11}),
+      component_values_2);
+  int input_ragged_rank = -1;
+  int output_ragged_rank = 4;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2, 2}),
+      {variant_component_1, variant_component_2, variant_component_2,
+       variant_component_1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int64>(*GetOutput(2), expected_splits_3);
+  test::ExpectTensorEqual<int64>(*GetOutput(3), expected_splits_4);
+  test::ExpectTensorEqual<int>(*GetOutput(4), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, EmptyRow1DIn2DOut) {
+  // ragged_component_1 = [[x, x, x], []]
+  // ragged_component_2 = []
+  // batched_ragged = [rc1, rc2] = [[[x, x, x], []], []]
+  const std::vector<int64> component_split_1_1 = {0, 3, 3};
+  const std::vector<int> component_values_1 = {1, 2, 3};
+  const std::vector<int64> component_split_2_1 = {0};
+  const std::vector<int64> batched_splits_1 = {0, 2, 2};
+  const std::vector<int64> batched_splits_2 = {0, 3, 3};
+  const std::vector<int> batched_values = {1, 2, 3};
+
+  Tensor expected_splits_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({3}));
+  Tensor expected_values(DT_INT32, TensorShape({3}));
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({3}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1}, TensorShape({0}), {});  // Empty row.
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2}),
+      {variant_component_1, variant_component_2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int>(*GetOutput(2), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, NDValues1DIn2DOut) {
+  // ragged_component_1 = [[x]]
+  // ragged_component_2 = [[x], [x]]
+  // batched_ragged = [rc1, rc2] = [[[x]], [[x], [x]]]
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int> component_values_1 = {1, 2};
+  const std::vector<int64> component_split_2_1 = {0, 1, 2};
+  const std::vector<int> component_values_2 = {1, 2, 3, 4};
+  const std::vector<int64> batched_splits_1 = {0, 1, 3};
+  const std::vector<int64> batched_splits_2 = {0, 1, 2, 3};
+  const std::vector<int> batched_values = {1, 2, 1, 2, 3, 4};
+
+  Tensor expected_splits_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT64, TensorShape({4}));
+  Tensor expected_values(DT_INT32, TensorShape({3, 2}));
+  test::FillValues<int64>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int64>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1, 2}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1}, TensorShape({2, 2}), component_values_2);
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2}),
+      {variant_component_1, variant_component_2});
+
+  TF_ASSERT_OK(RunOpKernel());
+  test::ExpectTensorEqual<int64>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int64>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int>(*GetOutput(2), expected_values);
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, NonEmpty1DIn3DOutInt32Splits) {
+  // ragged_component_1 = [[x]]
+  // ragged_component_2 = [[x], [x]]
+  // ragged_component_3 = [[x, x]]
+  // ragged_component_4 = [[x, x], [x]]
+  // ragged_component_5 = [[x], [x, x]]
+  // batched_ragged = [[rc1, rc2, rc3, rc4, rc5], [rc4, rc5, rc1, rc3, rc2]]
+  const std::vector<int> component_split_1_1 = {0, 1};
+  const std::vector<int> component_split_2_1 = {0, 1, 2};
+  const std::vector<int> component_split_3_1 = {0, 2};
+  const std::vector<int> component_split_4_1 = {0, 2, 3};
+  const std::vector<int> component_split_5_1 = {0, 1, 3};
+  const std::vector<int> component_values_1 = {0};
+  const std::vector<int> component_values_2 = {0, 1};
+  const std::vector<int> component_values_3 = {0, 1};
+  const std::vector<int> component_values_4 = {0, 1, 2};
+  const std::vector<int> component_values_5 = {0, 1, 2};
+
+  const std::vector<int> batched_splits_1 = {0, 5, 10};
+  const std::vector<int> batched_splits_2 = {0,  1,  3,  4,  6, 8,
+                                             10, 12, 13, 14, 16};
+  const std::vector<int> batched_splits_3 = {0,  1,  2,  3,  5,  7,  8,  9, 11,
+                                             13, 14, 15, 17, 18, 20, 21, 22};
+  const std::vector<int> batched_values = {0, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2,
+                                           0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1};
+
+  Tensor expected_splits_1(DT_INT32, TensorShape({3}));
+  Tensor expected_splits_2(DT_INT32, TensorShape({11}));
+  Tensor expected_splits_3(DT_INT32, TensorShape({17}));
+  Tensor expected_values(DT_INT32, TensorShape({22}));
+
+  test::FillValues<int>(&expected_splits_1, batched_splits_1);
+  test::FillValues<int>(&expected_splits_2, batched_splits_2);
+  test::FillValues<int>(&expected_splits_3, batched_splits_3);
+  test::FillValues<int>(&expected_values, batched_values);
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int>(
+      {component_split_2_1}, TensorShape({2}), component_values_2);
+  Tensor variant_component_3 = CreateVariantFromRagged<int, int>(
+      {component_split_3_1}, TensorShape({2}), component_values_3);
+  Tensor variant_component_4 = CreateVariantFromRagged<int, int>(
+      {component_split_4_1}, TensorShape({3}), component_values_4);
+  Tensor variant_component_5 = CreateVariantFromRagged<int, int>(
+      {component_split_5_1}, TensorShape({3}), component_values_5);
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 3;
+  BuildDecodeRaggedTensorGraph<int, int>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2, 5}),
+      {variant_component_1, variant_component_2, variant_component_3,
+       variant_component_4, variant_component_5, variant_component_4,
+       variant_component_5, variant_component_1, variant_component_3,
+       variant_component_2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  test::ExpectTensorEqual<int>(*GetOutput(0), expected_splits_1);
+  test::ExpectTensorEqual<int>(*GetOutput(1), expected_splits_2);
+  test::ExpectTensorEqual<int>(*GetOutput(2), expected_splits_3);
+  test::ExpectTensorEqual<int>(*GetOutput(3), expected_values);
+}
+
+// Tests for invalid inputs.
+TEST_F(RaggedTensorFromVariantKernelTest, InvalidInferredInputRaggedRank) {
+  Tensor component_variant_1 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({3}), {1, 2, 3});
+  Tensor component_variant_2 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({0}), {});
+  Tensor component_variant_3 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({2}), {1, 2});
+  Tensor component_variant_4 =
+      CreateVariantFromRagged<int, int64>({}, TensorShape({1}), {1});
+
+  int input_ragged_rank = -1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({1, 1, 1, 4}),
+      {component_variant_1, component_variant_2, component_variant_3,
+       component_variant_4});
+  EXPECT_TRUE(
+      absl::StartsWith(RunOpKernel().error_message(),
+                       "Inferred input_ragged_rank (output_ragged_rank - "
+                       "encoded_variant.dims()) must be >= 0"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, InputDimsAndRaggedRankAttrsMismatch) {
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int64> component_split_2_1 = {0, 1, 2};
+  const std::vector<int> component_values_1 = {0};
+  const std::vector<int> component_values_2 = {0, 1};
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1}, TensorShape({2}), component_values_2);
+
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 4;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2}),
+      {variant_component_1, variant_component_2});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "output_ragged_rank must be equal to "
+                               "input_ragged_rank + encoded_ragged.dims()"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, InputDoesNotHoldTensors) {
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2}), {1, 2});
+  EXPECT_TRUE(absl::StartsWith(
+      RunOpKernel().error_message(),
+      "Input Variant element at index 0 doesn't hold a Tensor"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, InputVariantTensorRankNotOne) {
+  Tensor variant_list(DT_VARIANT, TensorShape({2, 1}));
+  test::FillValues<Variant>(&variant_list, {1, 2});
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({1}), {variant_list});
+  EXPECT_TRUE(absl::StartsWith(
+      RunOpKernel().error_message(),
+      "Encoded input Variant must have rank 1, but found rank: 2"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest,
+       InputScalarElementDoesNotMatchInputRaggedRank) {
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int> component_values_1 = {1, 2};
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1, 2}), component_values_1);
+
+  int input_ragged_rank = 2;
+  int output_ragged_rank = 3;
+  BuildDecodeRaggedTensorGraph<int, int64>(input_ragged_rank,
+                                           output_ragged_rank, TensorShape({1}),
+                                           {variant_component_1});
+  EXPECT_TRUE(absl::StartsWith(
+      RunOpKernel().error_message(),
+      "Encoded input Variant must hold either input_ragged_rank + 1 "
+      "Tensors or an empty Tensor"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedSplitNotATensor) {
+  Tensor variant_list(DT_VARIANT, TensorShape({2}));
+  test::FillValues<Variant>(&variant_list, {1, 2});
+
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int>(input_ragged_rank, output_ragged_rank,
+                                         TensorShape({1}), {variant_list});
+  EXPECT_TRUE(
+      absl::StartsWith(RunOpKernel().error_message(),
+                       "Encoded scalar element at index 0 doesn't have a "
+                       "splits Tensor at split_index 0"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedSplitTypeMismatch) {
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int> component_values_1 = {0};
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int>(input_ragged_rank, output_ragged_rank,
+                                         TensorShape({1}),
+                                         {variant_component_1});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "Expected splits Tensor dtype: 3, found: 9"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedSplitRankNotOne) {
+  Tensor splits(DT_INT64, TensorShape({2, 1}));
+  test::FillValues<int64>(&splits, {1, 2});
+  Tensor values(DT_INT32, {2});
+  test::FillValues<int>(&values, {1, 2});
+  Tensor encoded_list(DT_VARIANT, TensorShape({2}));
+  test::FillValues<Variant>(&encoded_list, {splits, values});
+
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({1}), {encoded_list});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "Ragged splits must have rank 1"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedValuesNotATensor) {
+  Tensor splits(DT_INT64, TensorShape({3}));
+  test::FillValues<int64>(&splits, {0, 2, 3});
+  Tensor variant_list(DT_VARIANT, TensorShape({2}));
+  test::FillValues<Variant>(&variant_list, {splits, 2});
+
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({1}), {variant_list});
+  EXPECT_TRUE(
+      absl::StartsWith(RunOpKernel().error_message(),
+                       "Encoded scalar element at index 0 doesn't have a "
+                       "values Tensor"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedValuesTypeMismatch) {
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int> component_values_1 = {0};
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<string, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({1}),
+      {variant_component_1});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "Expected values Tensor dtype: 7, found: 3"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedValuesRankNotGreaterThanOne) {
+  Tensor variant_component_1 =
+      CreateVariantFromRagged<int, int64>({{0, 1}}, TensorShape({}), {1});
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(input_ragged_rank,
+                                           output_ragged_rank, TensorShape({1}),
+                                           {variant_component_1});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "Ragged values must have rank >= 1"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, RaggedValuesRankMismatch) {
+  const std::vector<int64> component_split_1_1 = {0, 1};
+  const std::vector<int64> component_split_2_1 = {0, 1, 2};
+  const std::vector<int> component_values_1 = {0};
+  const std::vector<int> component_values_2 = {0, 1, 2, 3};
+
+  Tensor variant_component_1 = CreateVariantFromRagged<int, int64>(
+      {component_split_1_1}, TensorShape({1}), component_values_1);
+  Tensor variant_component_2 = CreateVariantFromRagged<int, int64>(
+      {component_split_2_1}, TensorShape({2, 2}), component_values_2);
+  int input_ragged_rank = 1;
+  int output_ragged_rank = 2;
+  BuildDecodeRaggedTensorGraph<int, int64>(
+      input_ragged_rank, output_ragged_rank, TensorShape({2}),
+      {variant_component_1, variant_component_2});
+  EXPECT_TRUE(absl::StartsWith(RunOpKernel().error_message(),
+                               "Rank of values must match for all components"));
+}
+
+TEST_F(RaggedTensorFromVariantKernelTest, ShapeFnTest) {
+  ShapeInferenceTestOp op("RaggedTensorFromVariant");
+
+  // Tests with input_ragged_rank == 0.
+  (*op.node_def.mutable_attr())["input_ragged_rank"].set_i(0);
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(1);
+  INFER_OK(op, "?", "[?];?");
+  INFER_OK(op, "[?]", "[?];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[?,?]");
+
+  // Tests with input_ragged_rank == 1.
+  (*op.node_def.mutable_attr())["input_ragged_rank"].set_i(1);
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(1);
+  INFER_OK(op, "?", "[?];?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[?]");
+  INFER_ERROR("Shape must be rank 0 but is rank 2", op, "[?,?]");
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(2);
+  INFER_OK(op, "?", "[?];[?];?");
+  INFER_OK(op, "[?]", "[?];[?];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[?,?]");
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(3);
+  INFER_OK(op, "?", "[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[?]");
+  INFER_OK(op, "[?,?]", "[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[?,?,?]");
+
+  // Tests with input_ragged_rank == 3.
+  (*op.node_def.mutable_attr())["input_ragged_rank"].set_i(3);
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(3);
+  INFER_OK(op, "?", "[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[?]");
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(4);
+  INFER_OK(op, "?", "[?];[?];[?];[?];?");
+  INFER_OK(op, "[?]", "[?];[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[?,?]");
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(5);
+  INFER_OK(op, "?", "[?];[?];[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[?]");
+  INFER_OK(op, "[?,?]", "[?];[?];[?];[?];[?];?");
+
+  (*op.node_def.mutable_attr())["output_ragged_rank"].set_i(6);
+  INFER_OK(op, "?", "[?];[?];[?];[?];[?];[?];?");
+  INFER_ERROR("Shape must be rank 3 but is rank 1", op, "[?]");
+  INFER_ERROR("Shape must be rank 3 but is rank 2", op, "[?,?]");
+  INFER_OK(op, "[?,?,?]", "[?];[?];[?];[?];[?];[?];?");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
index 8cd4b8d..39b530f 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
@@ -26,21 +26,23 @@
 
 using errors::InvalidArgument;
 
+template <typename SPLITS_TYPE>
 class RaggedTensorToSparseOp : public OpKernel {
  public:
   using OpKernel::OpKernel;
+  using ConstFlatSplits = typename TTypes<SPLITS_TYPE>::ConstFlat;
 
   void Compute(OpKernelContext* context) override {
     // Read the `rt_nested_splits` input & convert to Eigen tensors.
     OpInputList rt_nested_splits_in;
     OP_REQUIRES_OK(
         context, context->input_list("rt_nested_splits", &rt_nested_splits_in));
-    const int64 rt_nested_splits_len = rt_nested_splits_in.size();
+    const int rt_nested_splits_len = rt_nested_splits_in.size();
     DCHECK_GT(rt_nested_splits_len, 0);  // Enforced by REGISTER_OP.
-    std::vector<TTypes<int64>::ConstFlat> rt_nested_splits;
+    std::vector<ConstFlatSplits> rt_nested_splits;
     rt_nested_splits.reserve(rt_nested_splits_len);
     for (int i = 0; i < rt_nested_splits_len; ++i) {
-      rt_nested_splits.push_back(rt_nested_splits_in[i].flat<int64>());
+      rt_nested_splits.push_back(rt_nested_splits_in[i].flat<SPLITS_TYPE>());
     }
 
     // Read the `rt_dense_values` input.
@@ -135,7 +137,7 @@
     sparse_dense_shape(0) = rt_nested_splits_in[0].dim_size(0) - 1;
     for (int dim = 0; dim < rt_nested_splits_len; ++dim) {
       const auto& splits = rt_nested_splits[dim];
-      int64 max_width = 0;
+      SPLITS_TYPE max_width = 0;
       for (int i = 1; i < splits.size(); ++i) {
         max_width = std::max(max_width, splits(i) - splits(i - 1));
       }
@@ -150,7 +152,7 @@
  private:
   // Validate `rt_nested_splits` to ensure we don't get any segfaults.
   static ::tensorflow::Status ValidateInputs(
-      std::vector<TTypes<int64>::ConstFlat> rt_nested_splits,
+      std::vector<ConstFlatSplits> rt_nested_splits,
       const Tensor& rt_dense_values_in) {
     for (int i = 0; i < rt_nested_splits.size(); ++i) {
       if (rt_nested_splits[i].size() == 0) {
@@ -160,7 +162,7 @@
         return InvalidArgument("First value of ragged splits must be 0.");
       }
       if (i > 0) {
-        int64 last_split =
+        SPLITS_TYPE last_split =
             rt_nested_splits[i - 1](rt_nested_splits[i - 1].size() - 1);
         if (rt_nested_splits[i].size() != last_split + 1) {
           return InvalidArgument(
@@ -206,14 +208,21 @@
   // values.
   static bool IsCompleted(
       const std::vector<int64>& pos, int dim,
-      const std::vector<TTypes<int64>::ConstFlat>& rt_nested_splits) {
+      const std::vector<ConstFlatSplits>& rt_nested_splits) {
     int64 current_child = pos[dim + 1];
     int64 limit_child = rt_nested_splits[dim](pos[dim] + 1);
     return current_child >= limit_child;
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse").Device(DEVICE_CPU),
-                        RaggedTensorToSparseOp);
+REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int32>("Tsplits"),
+                        RaggedTensorToSparseOp<int32>);
+
+REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("Tsplits"),
+                        RaggedTensorToSparseOp<int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
new file mode 100644
index 0000000..6923fd4
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
@@ -0,0 +1,221 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace {
+
+struct RaggedTensor {  // In-memory pieces of a single ragged tensor.
+  Tensor values;  // Values tensor (may have inner dense dimensions).
+  std::vector<Tensor> nested_splits;  // Splits; size() is the ragged rank.
+};
+
+// Packs `ragged` into `*encoded_list`, a rank-1 Variant tensor laid out as
+// [nested_splits[0], ..., nested_splits[ragged_rank - 1], values].
+Status RaggedToVariant(const RaggedTensor& ragged, Tensor* encoded_list) {
+  const int num_splits = ragged.nested_splits.size();
+  *encoded_list = Tensor(DT_VARIANT, TensorShape({num_splits + 1}));
+  auto slots = encoded_list->vec<Variant>();
+  // The values tensor always occupies the final slot.
+  slots(num_splits) = ragged.values;
+  for (int s = 0; s < num_splits; ++s) slots(s) = ragged.nested_splits[s];
+  return Status::OK();
+}
+
+// Unbatches `batched_ragged` along its outermost ragged dimension: one
+// RaggedTensor per row of nested_splits[0], with splits rebased to start at 0.
+template <typename VALUE_TYPE, typename SPLIT_TYPE>
+Status UnbatchRaggedZerothDim(const RaggedTensor& batched_ragged,
+                              std::vector<RaggedTensor>* ragged_components) {
+  // Set up the component Ragged Tensors.
+  int ragged_rank = batched_ragged.nested_splits.size();
+  auto batched_splits_top_vec =
+      batched_ragged.nested_splits[0].vec<SPLIT_TYPE>();
+  int num_components = batched_splits_top_vec.size() - 1;
+  int num_splits = ragged_rank - 1;
+  ragged_components->resize(num_components);
+  // Bind by reference: iterating by value would only reserve on copies.
+  for (auto& rt : *ragged_components) rt.nested_splits.reserve(num_splits);
+  const auto& batched_flat = batched_ragged.values.flat<VALUE_TYPE>();
+  // Sizes and offsets are 64-bit: element counts may not fit in an `int`.
+  int64 num_inner_elems = batched_ragged.values.NumElements();
+  if (batched_ragged.values.dim_size(0) > 1) {
+    num_inner_elems /= batched_ragged.values.dim_size(0);
+  }
+  TensorShape values_shape = batched_ragged.values.shape();
+
+  // Corner case: ragged_rank == 1, e.g. [[1, 2, 3], [4, 5]]
+  if (num_splits == 0) {
+    for (int i = 0; i < num_components; i++) {
+      const int64 start = batched_splits_top_vec(i);
+      const int64 num_values = batched_splits_top_vec(i + 1) - start;
+      values_shape.set_dim(0, num_values);
+      (*ragged_components)[i].values =
+          Tensor(DataTypeToEnum<VALUE_TYPE>::value, values_shape);
+      auto ragged_component_values_flat =
+          (*ragged_components)[i].values.flat<VALUE_TYPE>();
+      for (int64 j = 0; j < num_values * num_inner_elems; j++) {
+        ragged_component_values_flat(j) =
+            batched_flat(j + start * num_inner_elems);
+      }
+    }
+    return Status::OK();
+  }
+
+  // Unbatch nested splits.
+  std::vector<typename TTypes<SPLIT_TYPE>::ConstVec> batched_splits_vec;
+  batched_splits_vec.reserve(ragged_rank);
+  for (int i = 0; i < ragged_rank; i++) {
+    batched_splits_vec.push_back(
+        batched_ragged.nested_splits[i].vec<SPLIT_TYPE>());
+  }
+  std::vector<int64> index(num_splits, 1);
+  std::vector<int64> ragged_component_values_size(num_components, 0);
+  for (int i = 0; i < num_components; i++) {
+    std::vector<typename TTypes<SPLIT_TYPE>::Vec> ragged_component_splits_vec;
+    ragged_component_splits_vec.reserve(num_splits);
+    int64 split_size = -1;
+    for (int j = 0; j < num_splits; j++) {
+      if (j == 0) {
+        split_size =
+            batched_splits_top_vec(i + 1) - batched_splits_top_vec(i) + 1;
+      } else {
+        // Update split size based on previous split.
+        int64 last_index = ragged_component_splits_vec[j - 1].size() - 1;
+        split_size = ragged_component_splits_vec[j - 1](last_index) + 1;
+      }
+      (*ragged_components)[i].nested_splits.push_back(
+          Tensor(DataTypeToEnum<SPLIT_TYPE>::value, TensorShape({split_size})));
+      ragged_component_splits_vec.push_back(
+          (*ragged_components)[i].nested_splits[j].vec<SPLIT_TYPE>());
+      SPLIT_TYPE last_split_value = batched_splits_vec[j + 1](index[j] - 1);
+      ragged_component_splits_vec[j](0) = 0;
+      for (int64 k = 1; k < split_size; k++, index[j]++) {
+        ragged_component_splits_vec[j](k) =
+            batched_splits_vec[j + 1](index[j]) - last_split_value;
+      }
+    }
+    int64 last_split_size = ragged_component_splits_vec[num_splits - 1].size();
+    ragged_component_values_size[i] =
+        ragged_component_splits_vec[num_splits - 1](last_split_size - 1);
+  }
+
+  // Unbatch values.
+  int64 value_index = 0;
+  for (int i = 0; i < num_components; i++) {
+    int64 num_values = ragged_component_values_size[i];
+    values_shape.set_dim(0, num_values);
+    (*ragged_components)[i].values =
+        Tensor(DataTypeToEnum<VALUE_TYPE>::value, values_shape);
+    auto ragged_component_values_flat =
+        (*ragged_components)[i].values.flat<VALUE_TYPE>();
+    for (int64 j = 0; j < num_values * num_inner_elems; j++, value_index++) {
+      ragged_component_values_flat(j) = batched_flat(value_index);
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+// Kernel that encodes a ragged tensor (nested splits + values) as Variant(s):
+// a scalar Variant when `batched_input` is false, otherwise a rank-1 Variant
+// tensor with one encoded component per row of the outermost dimension.
+template <typename VALUE_TYPE, typename SPLIT_TYPE>
+class RaggedTensorToVariantOp : public OpKernel {
+ public:
+  explicit RaggedTensorToVariantOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("batched_input", &batched_input_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Gather the splits inputs and the values input into one RaggedTensor.
+    OpInputList splits_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("rt_nested_splits", &splits_list));
+    const int num_splits = splits_list.size();
+    DCHECK_GT(num_splits, 0);  // Enforced by REGISTER_OP.
+    RaggedTensor input_ragged;
+    // The values tensor is the input right after the splits list.
+    input_ragged.values = context->input(num_splits);
+    input_ragged.nested_splits.reserve(num_splits);
+    for (int i = 0; i < num_splits; i++) {
+      input_ragged.nested_splits.push_back(splits_list[i]);
+    }
+
+    if (!batched_input_) {
+      // Unbatched: wrap the whole ragged tensor in one scalar Variant.
+      Tensor whole_encoding;
+      OP_REQUIRES_OK(context, RaggedToVariant(input_ragged, &whole_encoding));
+      Tensor* scalar_out;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, TensorShape({}), &scalar_out));
+      // Move the rank-1 encoding into the scalar output.
+      scalar_out->scalar<Variant>()() = std::move(whole_encoding);
+      return;
+    }
+
+    // Batched: split along the outermost dimension, then encode each row.
+    std::vector<RaggedTensor> rows;
+    OP_REQUIRES_OK(context, UnbatchRaggedZerothDim<VALUE_TYPE, SPLIT_TYPE>(
+                                input_ragged, &rows));
+    const int num_rows = rows.size();
+    std::vector<Tensor> row_encodings(rows.size());
+    for (int row = 0; row < num_rows; row++) {
+      OP_REQUIRES_OK(context, RaggedToVariant(rows[row], &row_encodings[row]));
+    }
+
+    // Emit a rank-1 Variant tensor holding one encoded component per row.
+    Tensor* vector_out;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, TensorShape({num_rows}), &vector_out));
+    auto out_vec = vector_out->vec<Variant>();
+    for (int row = 0; row < num_rows; row++) {
+      out_vec(row) = row_encodings[row];
+    }
+  }
+
+ private:
+  bool batched_input_;
+};
+
+#define REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, split_type)      \
+  REGISTER_KERNEL_BUILDER(Name("RaggedTensorToVariant")               \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<value_type>("Tvalues")  \
+                              .TypeConstraint<split_type>("Tsplits"), \
+                          RaggedTensorToVariantOp<value_type, split_type>);
+#define REGISTER_KERNELS(value_type)                  \
+  REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, int32) \
+  REGISTER_KERNELS_WITH_SPLIT_TYPE(value_type, int64)  // Both Tsplits types.
+TF_CALL_POD_TYPES(REGISTER_KERNELS);  // Each call covers int32 + int64 splits.
+TF_CALL_string(REGISTER_KERNELS);
+TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
+TF_CALL_quint16(REGISTER_KERNELS);
+TF_CALL_qint16(REGISTER_KERNELS);
+TF_CALL_uint32(REGISTER_KERNELS);
+TF_CALL_uint64(REGISTER_KERNELS);
+#undef REGISTER_KERNELS  // Helper macros are local to this file.
+#undef REGISTER_KERNELS_WITH_SPLIT_TYPE
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc
new file mode 100644
index 0000000..2854044
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc
@@ -0,0 +1,610 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class RaggedTensorToVariantKernelTest : public ::tensorflow::OpsTestBase {
+ protected:
+  // Builds the test graph for the RaggedTensorToVariant op, then feeds it the
+  // given `ragged_splits` tensors followed by the `ragged_values` tensor.
+  template <typename VALUE_TYPE, typename SPLIT_TYPE>
+  void BuildEncodeRaggedTensorGraph(
+      const std::vector<std::vector<SPLIT_TYPE>>& ragged_splits,
+      const TensorShape& ragged_values_shape,
+      const std::vector<VALUE_TYPE>& ragged_values, const bool batched) {
+    const auto values_dtype = DataTypeToEnum<VALUE_TYPE>::v();
+    const auto splits_dtype = DataTypeToEnum<SPLIT_TYPE>::v();
+    int64 num_splits = ragged_splits.size();
+    TF_ASSERT_OK(
+        NodeDefBuilder("tested_op", "RaggedTensorToVariant")
+            .Input(FakeInput(num_splits, splits_dtype))  // ragged_splits
+            .Input(FakeInput(values_dtype))              // ragged_values
+            .Attr("RAGGED_RANK", num_splits)
+            .Attr("Tvalues", values_dtype)
+            .Attr("Tsplits", splits_dtype)
+            .Attr("batched_input", batched)
+            .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+    for (const auto& splits : ragged_splits) {
+      int64 splits_size = splits.size();
+      AddInputFromArray<SPLIT_TYPE>(TensorShape({splits_size}), splits);
+    }
+    AddInputFromArray<VALUE_TYPE>(ragged_values_shape, ragged_values);
+  }
+};
+
+TEST_F(RaggedTensorToVariantKernelTest, NoValuesInput) {
+  // ragged_tensor=[[[], []], [[]], []] (two splits levels, zero values).
+  const std::vector<int64> batched_splits_1 = {0, 2, 3, 3};
+  const std::vector<int64> batched_splits_2 = {0, 0, 0, 0};
+
+  const std::vector<int64> component_splits_1_1 = {0, 0, 0};
+  const std::vector<int64> component_splits_2_1 = {0, 0};
+  const std::vector<int64> component_splits_3_1 = {0};
+
+  Tensor expected_splits_1_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_2_1(DT_INT64, TensorShape({2}));
+  Tensor expected_splits_3_1(DT_INT64, TensorShape({1}));
+
+  test::FillValues<int64>(&expected_splits_1_1, component_splits_1_1);
+  test::FillValues<int64>(&expected_splits_2_1, component_splits_2_1);
+  test::FillValues<int64>(&expected_splits_3_1, component_splits_3_1);
+
+  BuildEncodeRaggedTensorGraph<int, int64>({batched_splits_1, batched_splits_2},
+                                           TensorShape({0}), {}, true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 3);  // One encoded component per outer row.
+
+  const Variant& encoded_splits_1_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_2_1 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_3_1 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_3 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(1);
+
+  test::ExpectTensorEqual<int64>(*encoded_splits_1_1.get<Tensor>(),
+                                 expected_splits_1_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_2_1.get<Tensor>(),
+                                 expected_splits_2_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_3_1.get<Tensor>(),
+                                 expected_splits_3_1);
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               Tensor(DT_INT32, TensorShape({0})));
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               Tensor(DT_INT32, TensorShape({0})));
+  test::ExpectTensorEqual<int>(*encoded_values_3.get<Tensor>(),
+                               Tensor(DT_INT32, TensorShape({0})));
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, 1DValuesRaggedRankOneInput) {
+  // ragged_tensor= (one splits level, so each component holds only values)
+  // [ [x, x, x],
+  //   [       ],
+  //   [x, x   ],
+  //   [x      ]]
+  const std::vector<int64> batched_splits = {0, 3, 3, 5, 6};
+  const std::vector<int> batched_values = {1, 2, 3, 4, 5, 6};
+
+  const std::vector<int> component_values_1 = {1, 2, 3};
+  const std::vector<int> component_values_3 = {4, 5};
+  const std::vector<int> component_values_4 = {6};
+
+  Tensor expected_values_1(DT_INT32, TensorShape({3}));
+  Tensor expected_values_2(DT_INT32, TensorShape({0}));  // Empty second row.
+  Tensor expected_values_3(DT_INT32, TensorShape({2}));
+  Tensor expected_values_4(DT_INT32, TensorShape({1}));
+
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_3, component_values_3);
+  test::FillValues<int>(&expected_values_4, component_values_4);
+
+  BuildEncodeRaggedTensorGraph<int, int64>({batched_splits}, TensorShape({6}),
+                                           batched_values, true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 4);
+
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_3 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_4 =
+      encoded_list(3).get<Tensor>()->vec<Variant>()(0);
+
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+  test::ExpectTensorEqual<int>(*encoded_values_3.get<Tensor>(),
+                               expected_values_3);
+  test::ExpectTensorEqual<int>(*encoded_values_4.get<Tensor>(),
+                               expected_values_4);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, 2DBatchedValuesRankOneInput) {
+  // ragged_tensor= (values have shape {3, 2}; each row holds one [x, x])
+  // [[[x, x]],
+  //  [[x, x]],
+  //  [[x, x]]]
+  const std::vector<int64> batched_splits = {0, 1, 2, 3};
+  const std::vector<int> batched_values = {1, 2, 4, 5, 6, 7};
+
+  const std::vector<int> component_values_1 = {1, 2};
+  const std::vector<int> component_values_2 = {4, 5};
+  const std::vector<int> component_values_3 = {6, 7};
+
+  Tensor expected_values_1(DT_INT32, TensorShape({1, 2}));
+  Tensor expected_values_2(DT_INT32, TensorShape({1, 2}));
+  Tensor expected_values_3(DT_INT32, TensorShape({1, 2}));
+
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_2, component_values_2);
+  test::FillValues<int>(&expected_values_3, component_values_3);
+
+  BuildEncodeRaggedTensorGraph<int, int64>(
+      {batched_splits}, TensorShape({3, 2}), batched_values, true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 3);
+
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_3 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(0);
+
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+  test::ExpectTensorEqual<int>(*encoded_values_3.get<Tensor>(),
+                               expected_values_3);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, 2DBatchedValuesRankTwoInput) {
+  // ragged_tensor= (values have shape {3, 2})
+  // [ [ [[x, x], [x, x]] ],
+  //   [ [[x, x]]         ] ]
+  const std::vector<int64> batched_splits_1 = {0, 1, 2};
+  const std::vector<int64> batched_splits_2 = {0, 2, 3};
+  const std::vector<int> batched_values = {1, 2, 4, 5, 6, 7};
+
+  const std::vector<int64> component_splits_1_1 = {0, 2};
+  const std::vector<int64> component_splits_2_1 = {0, 1};
+  const std::vector<int> component_values_1 = {1, 2, 4, 5};
+  const std::vector<int> component_values_2 = {6, 7};
+
+  Tensor expected_splits_1_1(DT_INT64, TensorShape({2}));
+  Tensor expected_splits_2_1(DT_INT64, TensorShape({2}));
+  Tensor expected_values_1(DT_INT32, TensorShape({2, 2}));
+  Tensor expected_values_2(DT_INT32, TensorShape({1, 2}));
+
+  test::FillValues<int64>(&expected_splits_1_1, component_splits_1_1);
+  test::FillValues<int64>(&expected_splits_2_1, component_splits_2_1);
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_2, component_values_2);
+
+  BuildEncodeRaggedTensorGraph<int, int64>({batched_splits_1, batched_splits_2},
+                                           TensorShape({3, 2}), batched_values,
+                                           true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 2);
+
+  const Variant& encoded_splits_1_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_2_1 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(1);
+
+  test::ExpectTensorEqual<int64>(*encoded_splits_1_1.get<Tensor>(),
+                                 expected_splits_1_1);
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_2_1.get<Tensor>(),
+                                 expected_splits_2_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, EmptyRowInBatchedInput) {
+  // ragged_tensor = (two splits levels; the second outer row is empty)
+  // [[ [x],         [x x],       [] ],
+  //  [                              ],
+  //  [ [x x x x x], [x x x]         ],
+  //  [ [],          [x x x x]       ]]
+  const std::vector<int64> batched_splits_1 = {0, 3, 3, 5, 7};
+  const std::vector<int64> batched_splits_2 = {0, 1, 3, 3, 8, 11, 11, 15};
+  const std::vector<int> batched_values = {1, 2,  3,  4,  5,  6,  7, 8,
+                                           9, 10, 11, 12, 13, 14, 15};
+  const std::vector<int64> component_splits_1_1 = {0, 1, 3, 3};
+  const std::vector<int64> component_splits_2_1 = {0};
+  const std::vector<int64> component_splits_3_1 = {0, 5, 8};
+  const std::vector<int64> component_splits_4_1 = {0, 0, 4};
+  const std::vector<int> component_values_1 = {1, 2, 3};
+  const std::vector<int> component_values_3 = {4, 5, 6, 7, 8, 9, 10, 11};
+  const std::vector<int> component_values_4 = {12, 13, 14, 15};
+
+  Tensor expected_splits_1_1(DT_INT64, TensorShape({4}));
+  Tensor expected_splits_2_1(DT_INT64, TensorShape({1}));
+  Tensor expected_splits_3_1(DT_INT64, TensorShape({3}));
+  Tensor expected_splits_4_1(DT_INT64, TensorShape({3}));
+  Tensor expected_values_1(DT_INT32, TensorShape({3}));
+  Tensor expected_values_2(DT_INT32, TensorShape({0}));  // Empty second row.
+  Tensor expected_values_3(DT_INT32, TensorShape({8}));
+  Tensor expected_values_4(DT_INT32, TensorShape({4}));
+
+  test::FillValues<int64>(&expected_splits_1_1, component_splits_1_1);
+  test::FillValues<int64>(&expected_splits_2_1, component_splits_2_1);
+  test::FillValues<int64>(&expected_splits_3_1, component_splits_3_1);
+  test::FillValues<int64>(&expected_splits_4_1, component_splits_4_1);
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_3, component_values_3);
+  test::FillValues<int>(&expected_values_4, component_values_4);
+
+  BuildEncodeRaggedTensorGraph<int, int64>({batched_splits_1, batched_splits_2},
+                                           TensorShape({15}), batched_values,
+                                           true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 4);
+
+  const Variant& encoded_splits_1_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_2_1 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_3_1 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_3 =
+      encoded_list(2).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_splits_4_1 =
+      encoded_list(3).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_values_4 =
+      encoded_list(3).get<Tensor>()->vec<Variant>()(1);
+
+  test::ExpectTensorEqual<int64>(*encoded_splits_1_1.get<Tensor>(),
+                                 expected_splits_1_1);
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_2_1.get<Tensor>(),
+                                 expected_splits_2_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+  test::ExpectTensorEqual<int64>(*encoded_splits_3_1.get<Tensor>(),
+                                 expected_splits_3_1);
+  test::ExpectTensorEqual<int>(*encoded_values_3.get<Tensor>(),
+                               expected_values_3);
+  test::ExpectTensorEqual<int64>(*encoded_splits_4_1.get<Tensor>(),
+                                 expected_splits_4_1);
+  test::ExpectTensorEqual<int>(*encoded_values_4.get<Tensor>(),
+                               expected_values_4);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInput) {
+  // ragged_tensor = (three splits levels, int64 splits)
+  // [[     [ [x, x]        ],
+  //        [ [x],      [x] ],
+  //        [ [x]           ],
+  //        [ [x]           ],
+  //        [ [x]           ]],
+  //  [     [ [x]           ],
+  //        [ [x]           ],
+  //        [ [x, x, x]     ],
+  //        [ [x]           ],
+  //        [ [x]           ] ]]
+  const std::vector<int64> batched_splits_1 = {0, 5, 10};
+  const std::vector<int64> batched_splits_2 = {0, 1, 3, 4,  5, 6,
+                                               7, 8, 9, 10, 11};
+  const std::vector<int64> batched_splits_3 = {0, 2, 3, 4,  5,  6,
+                                               7, 8, 9, 12, 13, 14};
+  const std::vector<int> batched_values = {0, 1, 1, 2, 2, 3, 4,
+                                           5, 6, 7, 8, 9, 8, 9};
+  const std::vector<int64> component_split_1_1 = {0, 1, 3, 4, 5, 6};
+  const std::vector<int64> component_split_1_2 = {0, 2, 3, 4, 5, 6, 7};
+  const std::vector<int64> component_split_2_1 = {0, 1, 2, 3, 4, 5};
+  const std::vector<int64> component_split_2_2 = {0, 1, 2, 5, 6, 7};
+  const std::vector<int> component_values_1 = {0, 1, 1, 2, 2, 3, 4};
+  const std::vector<int> component_values_2 = {5, 6, 7, 8, 9, 8, 9};
+
+  Tensor expected_splits_1_1(DT_INT64, TensorShape({6}));
+  Tensor expected_splits_1_2(DT_INT64, TensorShape({7}));
+  Tensor expected_splits_2_1(DT_INT64, TensorShape({6}));
+  Tensor expected_splits_2_2(DT_INT64, TensorShape({6}));
+  Tensor expected_values_1(DT_INT32, TensorShape({7}));
+  Tensor expected_values_2(DT_INT32, TensorShape({7}));
+
+  test::FillValues<int64>(&expected_splits_1_1, component_split_1_1);
+  test::FillValues<int64>(&expected_splits_1_2, component_split_1_2);
+  test::FillValues<int64>(&expected_splits_2_1, component_split_2_1);
+  test::FillValues<int64>(&expected_splits_2_2, component_split_2_2);
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_2, component_values_2);
+
+  BuildEncodeRaggedTensorGraph<int, int64>(
+      {batched_splits_1, batched_splits_2, batched_splits_3}, TensorShape({14}),
+      batched_values, true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 2);
+
+  const Variant& encoded_splits_1_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_splits_1_2 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(2);
+  const Variant& encoded_splits_2_1 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_splits_2_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(2);
+
+  test::ExpectTensorEqual<int64>(*encoded_splits_1_1.get<Tensor>(),
+                                 expected_splits_1_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_1_2.get<Tensor>(),
+                                 expected_splits_1_2);
+  test::ExpectTensorEqual<int64>(*encoded_splits_2_1.get<Tensor>(),
+                                 expected_splits_2_1);
+  test::ExpectTensorEqual<int64>(*encoded_splits_2_2.get<Tensor>(),
+                                 expected_splits_2_2);
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInputInt32Splits) {
+  // ragged_tensor = (three splits levels, exercised with int32 splits)
+  // [[     [ [x, x]        ],
+  //        [ [x],      [x] ],
+  //        [ [x]           ],
+  //        [ [x]           ],
+  //        [ [x]           ]],
+  //  [     [ [x]           ],
+  //        [ [x]           ],
+  //        [ [x, x, x]     ],
+  //        [ [x]           ],
+  //        [ [x]           ] ]]
+  const std::vector<int> batched_splits_1 = {0, 5, 10};
+  const std::vector<int> batched_splits_2 = {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  const std::vector<int> batched_splits_3 = {0, 2, 3, 4,  5,  6,
+                                             7, 8, 9, 12, 13, 14};
+  const std::vector<int> batched_values = {0, 1, 1, 2, 2, 3, 4,
+                                           5, 6, 7, 8, 9, 8, 9};
+  const std::vector<int> component_split_1_1 = {0, 1, 3, 4, 5, 6};
+  const std::vector<int> component_split_1_2 = {0, 2, 3, 4, 5, 6, 7};
+  const std::vector<int> component_split_2_1 = {0, 1, 2, 3, 4, 5};
+  const std::vector<int> component_split_2_2 = {0, 1, 2, 5, 6, 7};
+  const std::vector<int> component_values_1 = {0, 1, 1, 2, 2, 3, 4};
+  const std::vector<int> component_values_2 = {5, 6, 7, 8, 9, 8, 9};
+
+  Tensor expected_splits_1_1(DT_INT32, TensorShape({6}));
+  Tensor expected_splits_1_2(DT_INT32, TensorShape({7}));
+  Tensor expected_splits_2_1(DT_INT32, TensorShape({6}));
+  Tensor expected_splits_2_2(DT_INT32, TensorShape({6}));
+  Tensor expected_values_1(DT_INT32, TensorShape({7}));
+  Tensor expected_values_2(DT_INT32, TensorShape({7}));
+
+  test::FillValues<int>(&expected_splits_1_1, component_split_1_1);
+  test::FillValues<int>(&expected_splits_1_2, component_split_1_2);
+  test::FillValues<int>(&expected_splits_2_1, component_split_2_1);
+  test::FillValues<int>(&expected_splits_2_2, component_split_2_2);
+  test::FillValues<int>(&expected_values_1, component_values_1);
+  test::FillValues<int>(&expected_values_2, component_values_2);
+
+  BuildEncodeRaggedTensorGraph<int, int>(
+      {batched_splits_1, batched_splits_2, batched_splits_3}, TensorShape({14}),
+      batched_values, true);
+  TF_ASSERT_OK(RunOpKernel());
+
+  const auto& encoded_list = GetOutput(0)->vec<Variant>();
+  EXPECT_EQ(encoded_list.size(), 2);
+
+  const Variant& encoded_splits_1_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_splits_1_2 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_values_1 =
+      encoded_list(0).get<Tensor>()->vec<Variant>()(2);
+  const Variant& encoded_splits_2_1 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(0);
+  const Variant& encoded_splits_2_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(1);
+  const Variant& encoded_values_2 =
+      encoded_list(1).get<Tensor>()->vec<Variant>()(2);
+
+  test::ExpectTensorEqual<int>(*encoded_splits_1_1.get<Tensor>(),
+                               expected_splits_1_1);
+  test::ExpectTensorEqual<int>(*encoded_splits_1_2.get<Tensor>(),
+                               expected_splits_1_2);
+  test::ExpectTensorEqual<int>(*encoded_splits_2_1.get<Tensor>(),
+                               expected_splits_2_1);
+  test::ExpectTensorEqual<int>(*encoded_splits_2_2.get<Tensor>(),
+                               expected_splits_2_2);
+  test::ExpectTensorEqual<int>(*encoded_values_1.get<Tensor>(),
+                               expected_values_1);
+  test::ExpectTensorEqual<int>(*encoded_values_2.get<Tensor>(),
+                               expected_values_2);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, NonBatchInput) {
+  // ragged_tensor (two levels of row splits) =
+  // [[ [x],         [x x],       [] ],
+  //  [                              ],
+  //  [ [x x x x x], [x x x]         ],
+  //  [ [],          [x x x x]       ]]
+  const std::vector<int64> outer_splits = {0, 3, 3, 5, 7};
+  const std::vector<int64> inner_splits = {0, 1, 3, 3, 8, 11, 11, 15};
+  const std::vector<int> flat_values = {1, 2,  3,  4,  5,  6,  7, 8,
+                                        9, 10, 11, 12, 13, 14, 15};
+
+  BuildEncodeRaggedTensorGraph<int, int64>({outer_splits, inner_splits},
+                                           TensorShape({15}), flat_values,
+                                           false);
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected components: in this test the splits and values are expected to
+  // come back exactly as they were fed in.
+  Tensor expected_outer_splits(DT_INT64, TensorShape({5}));
+  Tensor expected_inner_splits(DT_INT64, TensorShape({8}));
+  Tensor expected_values(DT_INT32, TensorShape({15}));
+
+  test::FillValues<int64>(&expected_outer_splits, outer_splits);
+  test::FillValues<int64>(&expected_inner_splits, inner_splits);
+  test::FillValues<int>(&expected_values, flat_values);
+
+  // The scalar output holds a Tensor whose variant vector stores, in order,
+  // the outer splits, the inner splits and the flat values.
+  const auto& encoded_scalar = GetOutput(0)->scalar<Variant>()();
+  const auto components = encoded_scalar.get<Tensor>()->vec<Variant>();
+  const Tensor* actual_outer_splits = components(0).get<Tensor>();
+  const Tensor* actual_inner_splits = components(1).get<Tensor>();
+  const Tensor* actual_values = components(2).get<Tensor>();
+
+  // Compare every decoded component against its expected tensor.
+  test::ExpectTensorEqual<int64>(*actual_outer_splits, expected_outer_splits);
+  test::ExpectTensorEqual<int64>(*actual_inner_splits, expected_inner_splits);
+  test::ExpectTensorEqual<int>(*actual_values, expected_values);
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, ShapeFnTestBatched) {
+  ShapeInferenceTestOp op("RaggedTensorToVariant");
+  (*op.node_def.mutable_attr())["batched_input"].set_b(true);
+
+  // RAGGED_RANK=0: a single input shape ("?"), i.e. no splits inputs.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(0);
+  INFER_ERROR("Shape inference should have returned error", op, "?");
+
+  // RAGGED_RANK=1 ("splits;values"): output is [len(splits)-1], or [?].
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(1);
+  INFER_OK(op, "?;?", "[?]");
+  INFER_OK(op, "?;[?]", "[?]");
+  INFER_OK(op, "?;[?,?]", "[?]");
+  INFER_OK(op, "[?];[5]", "[?]");
+  INFER_OK(op, "[?];[5,2]", "[?]");
+  INFER_OK(op, "[5];[5,2]", "[4]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[5,5];?");
+  INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "?;[]");
+
+  // RAGGED_RANK=2: only the first splits shape determines the output size.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(2);
+  INFER_OK(op, "?;?;?", "[?]");
+  INFER_OK(op, "?;?;[?]", "[?]");
+  INFER_OK(op, "?;?;[?,?]", "[?]");
+  INFER_OK(op, "[?];[?];[5]", "[?]");
+  INFER_OK(op, "[?];[?];[5,2]", "[?]");
+  INFER_OK(op, "[6];[?];[5,2]", "[5]");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[5,5];?");
+
+  // RAGGED_RANK=3: same expectations with three splits inputs.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(3);
+  INFER_OK(op, "?;?;?;?", "[?]");
+  INFER_OK(op, "?;?;?;[?]", "[?]");
+  INFER_OK(op, "?;?;?;[5]", "[?]");
+  INFER_OK(op, "[4];?;?;[5]", "[3]");
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, ShapeFnTestNotBatched) {
+  ShapeInferenceTestOp op("RaggedTensorToVariant");
+  (*op.node_def.mutable_attr())["batched_input"].set_b(false);
+
+  // RAGGED_RANK=0: a single input shape ("?"), i.e. no splits inputs.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(0);
+  INFER_ERROR("Shape inference should have returned error", op, "?");
+
+  // RAGGED_RANK=1 ("splits;values"): the output is always a scalar ("[]").
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(1);
+  INFER_OK(op, "?;?", "[]");
+  INFER_OK(op, "?;[?]", "[]");
+  INFER_OK(op, "?;[?,?]", "[]");
+  INFER_OK(op, "[?];[5]", "[]");
+  INFER_OK(op, "[?];[5,2]", "[]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[5,5];?");
+  INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "?;[]");
+
+  // RAGGED_RANK=2: still a scalar output; each splits input must be rank 1.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(2);
+  INFER_OK(op, "?;?;?", "[]");
+  INFER_OK(op, "?;?;[?]", "[]");
+  INFER_OK(op, "?;?;[?,?]", "[]");
+  INFER_OK(op, "[?];[?];[5]", "[]");
+  INFER_OK(op, "[?];[?];[5,2]", "[]");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[5,5];?");
+
+  // RAGGED_RANK=3: scalar output with three splits inputs.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(3);
+  INFER_OK(op, "?;?;?;?", "[]");
+  INFER_OK(op, "?;?;?;[?]", "[]");
+  INFER_OK(op, "?;?;?;[5]", "[]");
+}
+
+TEST_F(RaggedTensorToVariantKernelTest, NoSplits) {
+  // A node with RAGGED_RANK=0 finalizes, but InitOp() must report the attr.
+  TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorToVariant")
+                   .Input(FakeInput(0))
+                   .Input(FakeInput(DT_INT32))
+                   .Attr("RAGGED_RANK", 0)
+                   .Attr("Tvalues", DT_INT32)
+                   .Attr("Tsplits", DT_INT64)
+                   .Attr("batched_input", true)
+                   .Finalize(node_def()));
+  const std::string error = InitOp().error_message();
+  EXPECT_TRUE(absl::StartsWith(
+      error, "Value for attr 'RAGGED_RANK' of 0 must be at least minimum 1"));
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index c60aaf1..b38bf1c 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -70,7 +70,7 @@
     return p * batch_size;
   }
   // numerically stable version of (1 - (1-p)^num_tries)
-  return -expm1(num_tries * std::log1p(-p));
+  return -std::expm1(num_tries * std::log1p(-p));
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 07c5df9..b7d87b2 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -42,9 +42,9 @@
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-struct Sqrt {
+struct SqrtOfReal {  // Functor: sqrt of the real part of `a`, returned as T.
   __host__ __device__ T operator()(const T& a) const {
-    return Eigen::numext::sqrt(a);
+    return T(Eigen::numext::sqrt(Eigen::numext::real(a)));
   }
 };
 
@@ -875,8 +875,8 @@
                      const functor::EuclideanNormReducer<T>& reducer) {
     typedef cub::TransformInputIterator<T, Square<T>, T*> inputIterType;
     inputIterType input_itr((T*)in.data(), Square<T>());
-    typedef TransformOutputIterator<T, T, Sqrt<T>> outputIterType;
-    outputIterType output_itr((T*)out.data(), Sqrt<T>());
+    typedef TransformOutputIterator<T, T, SqrtOfReal<T>> outputIterType;
+    outputIterType output_itr((T*)out.data(), SqrtOfReal<T>());
     ReduceImpl<T, Sum<T>, outputIterType, inputIterType, ReductionAxes>(
         ctx, output_itr, input_itr, in.rank(), in.dimension(0),
         in.rank() >= 2 ? in.dimension(1) : 1,
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index 0a1568b..164359f 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -117,8 +117,6 @@
 FIX_MEAN_IDENTITY(Eigen::half)
 FIX_MEAN_IDENTITY(float)
 FIX_MEAN_IDENTITY(double)
-FIX_MEAN_IDENTITY(complex64)
-FIX_MEAN_IDENTITY(complex128)
 #undef FIX_MEAN_IDENTITY
 
 template <typename Device, typename OUT_T, typename Reducer>
diff --git a/tensorflow/core/kernels/redux_functor.h b/tensorflow/core/kernels/redux_functor.h
index fb2b489..05a867a 100644
--- a/tensorflow/core/kernels/redux_functor.h
+++ b/tensorflow/core/kernels/redux_functor.h
@@ -16,7 +16,11 @@
 #ifndef TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_
 #define TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_
 
+#define EIGEN_USE_THREADS
+
 #include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -41,7 +45,7 @@
     const int num_output_dims = output->dims();
     auto output_dims = output->template flat<T>().dimensions();
 
-    int64 inner_dim = 1, outer_dim = 1;
+    Eigen::Index inner_dim = 1, outer_dim = 1;
     for (int i = 0; i < num_dims - num_output_dims; ++i)
       outer_dim *= input_dims[i];
     for (int i = num_dims - num_output_dims; i < num_dims; ++i)
@@ -55,15 +59,15 @@
     }
 
     // Get device thread num.
-    const int64 num_threads = device.numThreads();
+    const Eigen::Index num_threads = device.numThreads();
 
     // If the inner dim parallelism is large enough
     if (inner_dim > num_threads * 16) {
       // Do not create more blocks than there are threads in a pool.
-      const int64 num_blocks = num_threads;
+      const Eigen::Index num_blocks = num_threads;
 
       // Block size along the outer dimension.
-      const int64 inner_block_size = Eigen::divup(inner_dim, num_blocks);
+      const Eigen::Index inner_block_size = Eigen::divup(inner_dim, num_blocks);
       const T* input_data = input.template flat<T>().data();
 
       // Allocate temporary buffer for partial reductions.
@@ -84,15 +88,15 @@
                             input_data, buffer_data](
                                Eigen::Index start, Eigen::Index limit) -> void {
         DCHECK(start >= 0 && limit <= num_blocks);
-        int64 inner_dim_start = start * inner_block_size;
-        int64 inner_dim_limit = limit * inner_block_size;
+        Eigen::Index inner_dim_start = start * inner_block_size;
+        Eigen::Index inner_dim_limit = limit * inner_block_size;
         inner_dim_limit = std::min(inner_dim, inner_dim_limit);
-        int64 my_job_len = inner_dim_limit - inner_dim_start;
+        Eigen::Index my_job_len = inner_dim_limit - inner_dim_start;
 
         const T* my_job_start = input_data + inner_dim_start;
         Buffer buf(buffer_data + inner_dim_start, my_job_len);
 
-        for (int64 i = 0; i < outer_dim; ++i) {
+        for (Eigen::Index i = 0; i < outer_dim; ++i) {
           auto in = Input(my_job_start + i * inner_dim, my_job_len);
           auto cast = in.template cast<AccumT>();
           buf = Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf),
@@ -101,8 +105,8 @@
       };
 
       // Compute cost of reducing a single block.
-      const int64 compute_size = outer_dim * inner_block_size;
-      const int64 compute_input_bytes = compute_size * sizeof(T);
+      const Eigen::Index compute_size = outer_dim * inner_block_size;
+      const Eigen::Index compute_input_bytes = compute_size * sizeof(T);
       const Eigen::TensorOpCost cost(
           compute_input_bytes,
           0,  // We'll be mostly writing to L1, assume store cost is 0
@@ -115,21 +119,21 @@
           buffer.template cast<T>().reshape(output_dims);
     } else {
       // Compute block size along the outer dimension for efficiency.
-      const int64 parallel_cell_size = inner_dim;
-      const int64 total_workload = outer_dim * inner_dim;
-      const int64 max_parallelism = total_workload / parallel_cell_size;
+      const Eigen::Index parallel_cell_size = inner_dim;
+      const Eigen::Index total_workload = outer_dim * inner_dim;
+      const Eigen::Index max_parallelism = total_workload / parallel_cell_size;
 
-      const int64 min_block_workload = 2000;
-      const int64 min_block_size =
+      const Eigen::Index min_block_workload = 2000;
+      const Eigen::Index min_block_size =
           Eigen::divup(min_block_workload, parallel_cell_size);
-      const int64 max_num_blocks = std::min(
+      const Eigen::Index max_num_blocks = std::min(
           max_parallelism, Eigen::divup(total_workload, min_block_size));
 
       // Do not create more blocks than there are threads in a pool.
-      const int64 num_blocks = std::min(max_num_blocks, num_threads);
+      const Eigen::Index num_blocks = std::min(max_num_blocks, num_threads);
 
       // Block size along the outer dimension.
-      const int64 outer_block_size = Eigen::divup(outer_dim, num_blocks);
+      const Eigen::Index outer_block_size = Eigen::divup(outer_dim, num_blocks);
 
       const T* input_data = input.template flat<T>().data();
 
@@ -150,12 +154,12 @@
                             buffer_data, input_data, outer_dim](
                                Eigen::Index start, Eigen::Index limit) -> void {
         DCHECK(start >= 0 && limit <= num_blocks);
-        int64 outer_dim_start = start * outer_block_size;
-        int64 outer_dim_limit = limit * outer_block_size;
+        Eigen::Index outer_dim_start = start * outer_block_size;
+        Eigen::Index outer_dim_limit = limit * outer_block_size;
         outer_dim_limit = std::min(outer_dim, outer_dim_limit);
 
         Buffer buf(buffer_data + start * inner_dim, inner_dim);
-        for (int64 i = outer_dim_start; i < outer_dim_limit; ++i) {
+        for (Eigen::Index i = outer_dim_start; i < outer_dim_limit; ++i) {
           auto in = Input(input_data + i * inner_dim, inner_dim);
           auto cast = in.template cast<AccumT>();
           buf = Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf),
@@ -164,8 +168,8 @@
       };
 
       // Compute cost of reducing a single block.
-      const int64 compute_size = outer_block_size * inner_dim;
-      const int64 compute_input_bytes = compute_size * sizeof(T);
+      const Eigen::Index compute_size = outer_block_size * inner_dim;
+      const Eigen::Index compute_input_bytes = compute_size * sizeof(T);
       const Eigen::TensorOpCost cost(
           compute_input_bytes,
           0,  // We'll be mostly writing to L1, assume store cost is 0
@@ -203,7 +207,7 @@
     const int num_output_dims = output->dims();
     auto output_dims = output->template flat<T>().dimensions();
 
-    int64 inner_dim = 1, middle_dim = 1, outer_dim = 1;
+    Eigen::Index inner_dim = 1, middle_dim = 1, outer_dim = 1;
     for (int i = 0; i < axis_begin_dim; ++i) outer_dim *= input_dims[i];
     for (int i = axis_begin_dim; i < axis_begin_dim + num_output_dims; ++i)
       middle_dim *= input_dims[i];
@@ -223,22 +227,23 @@
     }
 
     // Compute block size along the outer dimension for efficiency.
-    const int64 parallel_cell_size = inner_dim;
-    const int64 max_parallelism = outer_dim * middle_dim;
-    const int64 total_workload = max_parallelism * inner_dim;
+    const Eigen::Index parallel_cell_size = inner_dim;
+    const Eigen::Index max_parallelism = outer_dim * middle_dim;
+    const Eigen::Index total_workload = max_parallelism * inner_dim;
 
-    const int64 min_block_workload = 2000;
-    const int64 min_block_size =
+    const Eigen::Index min_block_workload = 2000;
+    const Eigen::Index min_block_size =
         Eigen::divup(min_block_workload, parallel_cell_size);
-    const int64 max_num_blocks =
+    const Eigen::Index max_num_blocks =
         std::min(max_parallelism, Eigen::divup(total_workload, min_block_size));
 
     // Do not create more blocks than there are threads in a pool.
-    const int64 num_threads = device.numThreads();
-    const int64 num_blocks = std::min(max_num_blocks, num_threads);
+    const Eigen::Index num_threads = device.numThreads();
+    const Eigen::Index num_blocks = std::min(max_num_blocks, num_threads);
 
     // Block size along the outer dimension.
-    const int64 outer_block_size = Eigen::divup(total_workload, num_blocks);
+    const Eigen::Index outer_block_size =
+        Eigen::divup(total_workload, num_blocks);
 
     const T* input_data = input.template flat<T>().data();
 
@@ -259,8 +264,8 @@
                           reduction_axis, reducer, binary_op](
                              Eigen::Index start, Eigen::Index limit) -> void {
       DCHECK(start >= 0 && limit <= num_blocks);
-      int64 block_start = start * outer_block_size;
-      int64 block_limit = limit * outer_block_size;
+      Eigen::Index block_start = start * outer_block_size;
+      Eigen::Index block_limit = limit * outer_block_size;
       block_limit = std::min(total_workload, block_limit);
       Buffer buf(buffer_data + start * middle_dim, middle_dim);
 
@@ -268,7 +273,7 @@
           ((block_start + inner_dim - 1) / inner_dim) * inner_dim;
       const int align_end = (block_limit / inner_dim) * inner_dim;
 
-      int64 coordinate = block_start / inner_dim % middle_dim;
+      Eigen::Index coordinate = block_start / inner_dim % middle_dim;
       Eigen::Tensor<AccumT, 0> reduced =
           Input(&input_data[block_start], align_start - block_start)
               .reduce(reduction_axis, reducer)
@@ -293,8 +298,8 @@
     };
 
     // Compute cost of reducing a single block.
-    const int64 compute_size = outer_block_size * inner_dim;
-    const int64 compute_input_bytes = compute_size * sizeof(T);
+    const Eigen::Index compute_size = outer_block_size * inner_dim;
+    const Eigen::Index compute_input_bytes = compute_size * sizeof(T);
     const Eigen::TensorOpCost cost(
         compute_input_bytes,
         0,  // We'll be mostly writing to L1, assume store cost is 0
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index ead95a3..9860448 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -86,7 +86,8 @@
 #undef REGISTER_SYCL_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index aa2434d..c60ab60 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -317,7 +317,7 @@
 TF_CALL_string(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
@@ -407,7 +407,7 @@
                             .HostMemory("axis")
                             .HostMemory("output"),
                         ReverseV2Op<CPUDevice, int32, int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T)                             \
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index 3ee49db..2917a0d 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -51,4 +51,4 @@
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index cded417..0e11213 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -17,9 +17,9 @@
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/kernels/reverse_sequence_op.h"
 
@@ -177,7 +177,7 @@
 TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);
 TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
@@ -222,6 +222,6 @@
 
 #undef REGISTER_REVERSE_SEQUENCE_GPU
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
index 4a2136a..948a99a 100644
--- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -43,4 +43,4 @@
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 6e1a0d5..60c6a7d 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -20,12 +20,12 @@
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
-#include "third_party/eigen3/Eigen/Core"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
+
 #include <vector>
 
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 6521dcf..91d6e9b 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -116,7 +116,6 @@
     Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostSend").Device(DEVICE_GPU).HostMemory("tensor"), SendOp);
 
@@ -200,7 +199,6 @@
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp);
 
@@ -209,4 +207,16 @@
     Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp);
 #endif  // TENSORFLOW_USE_SYCL
 
+// Registers the CPU `_HostRecv`/`_HostSend` kernels during static init. Mock
+// environments opt out by setting `DISABLE_HOST_SEND_RECV_REGISTRATION`.
+static bool InitModule() {
+  const char* opt_out = std::getenv("DISABLE_HOST_SEND_RECV_REGISTRATION");
+  if (opt_out != nullptr) return true;
+  REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
+  REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
+  return true;
+}
+
+static bool module_initialized = InitModule();
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 21c3b89..02dcc1e 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -103,14 +103,14 @@
 TF_CALL_int32(REGISTER_CPU_KERNEL);
 TF_CALL_int64(REGISTER_CPU_KERNEL);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
 TF_CALL_int32(REGISTER_GPU_KERNEL);
 TF_CALL_int64(REGISTER_GPU_KERNEL);
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #undef REGISTER_KERNEL
 #undef REGISTER_CPU_KERNEL
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index db7357c..86ccde9 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -68,7 +68,7 @@
                         ShapeOp<int64>);
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                                \
   REGISTER_KERNEL_BUILDER(Name("Shape")                          \
                               .Device(DEVICE_GPU)                \
@@ -106,7 +106,7 @@
                             .TypeConstraint<int64>("out_type"),
                         ShapeOp<int64>);
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // ShapeN ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ShapeN")
@@ -120,7 +120,7 @@
                             .TypeConstraint<int64>("out_type"),
                         ShapeNOp<int64>);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                                \
   REGISTER_KERNEL_BUILDER(Name("ShapeN")                         \
                               .Device(DEVICE_GPU)                \
@@ -156,7 +156,7 @@
                             .TypeConstraint<int32>("T")
                             .TypeConstraint<int64>("out_type"),
                         ShapeNOp<int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
@@ -222,7 +222,7 @@
                         RankOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                        \
   REGISTER_KERNEL_BUILDER(Name("Rank")                   \
                               .Device(DEVICE_GPU)        \
@@ -250,7 +250,7 @@
                             .HostMemory("output"),
                         RankOp);
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // Size ------------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Size")
@@ -264,7 +264,7 @@
                             .TypeConstraint<int64>("out_type"),
                         SizeOp<int64>);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                                \
   REGISTER_KERNEL_BUILDER(Name("Size")                           \
                               .Device(DEVICE_GPU)                \
@@ -301,7 +301,7 @@
                             .HostMemory("output"),
                         SizeOp<int64>);
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
@@ -349,7 +349,7 @@
                             .TypeConstraint<int64>("Tdim"),
                         ExpandDimsOp<int64>);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                            \
   REGISTER_KERNEL_BUILDER(Name("ExpandDims")                 \
                               .Device(DEVICE_GPU)            \
@@ -383,7 +383,7 @@
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp<int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                           \
@@ -424,7 +424,7 @@
 // Squeeze ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNEL(type)                                   \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
@@ -442,7 +442,7 @@
                             .HostMemory("input")
                             .HostMemory("output"),
                         SqueezeOp);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                                   \
@@ -532,7 +532,7 @@
 
 #undef REGISTER_GPU_KERNEL
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // A special GPU kernel for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
index fe04dcf..95bcfd6 100644
--- a/tensorflow/core/kernels/snapshot_op.cc
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -51,7 +51,7 @@
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_KERNEL(TYPE)                                        \
   REGISTER_KERNEL_BUILDER(                                           \
       Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
index 02d4929..a35233b 100644
--- a/tensorflow/core/kernels/snapshot_op.h
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_
 #define TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
 #endif
 
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
index e4e3bd5..d4fee5b 100644
--- a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -12,7 +12,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // See docs in ../ops/array_ops.cc.
 #include "tensorflow/core/kernels/snapshot_op.h"
@@ -31,4 +31,4 @@
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index fb00e1b..0c0f330 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -87,7 +87,8 @@
 TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                          \
diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
index 900df27..0cf169d 100644
--- a/tensorflow/core/kernels/softplus_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/softsign_op.cc b/tensorflow/core/kernels/softsign_op.cc
index d691f15..df1c61f 100644
--- a/tensorflow/core/kernels/softsign_op.cc
+++ b/tensorflow/core/kernels/softsign_op.cc
@@ -88,7 +88,7 @@
 TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                          \
@@ -120,6 +120,6 @@
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softsign_op_gpu.cu.cc b/tensorflow/core/kernels/softsign_op_gpu.cu.cc
index b80cdf0..679f743 100644
--- a/tensorflow/core/kernels/softsign_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softsign_op_gpu.cu.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -37,4 +37,4 @@
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 925c926..9c0f370 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -216,7 +216,8 @@
 };
 
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
@@ -249,7 +250,8 @@
 };
 
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
@@ -284,7 +286,8 @@
 };
 
 REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(
     Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
@@ -314,7 +317,8 @@
 };
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
                         StageSizeOp);
 #endif
@@ -339,7 +343,8 @@
 };
 
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_CPU), StageClearOp);
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
index b7fd469..cbbce24 100644
--- a/tensorflow/core/kernels/stateful_random_ops.cc
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -27,7 +27,8 @@
   void operator()(OpKernelContext* ctx, const CPUDevice& device,
                   Distribution dist, int64 output_size, int64 alg_tag_skip,
                   ScopedUnlockUnrefVar* state_var_guard, Tensor* state_tensor,
-                  typename Distribution::ResultElementType* output_data) {
+                  typename Distribution::ResultElementType* output_data)
+      UNLOCK_FUNCTION() {
     auto state_tensor_flat = state_tensor->flat<StateElementType>();
     auto state_data = state_tensor_flat.data();
     // Delegates to PhiloxRandom to do the actual increasing.
@@ -40,6 +41,33 @@
   }
 };
 
+Status CheckState(const Tensor& state) {  // Generic RNG-state sanity checks.
+  if (state.dtype() != STATE_ELEMENT_DTYPE) {  // Element type must match.
+    return errors::InvalidArgument("dtype of RNG state variable must be ",
+                                   DataTypeString(STATE_ELEMENT_DTYPE),
+                                   ", not ", DataTypeString(state.dtype()));
+  }
+  if (state.dims() != 1) {  // The state must be a rank-1 tensor.
+    return errors::InvalidArgument(
+        "RNG state must have one and only one dimension, not ", state.dims());
+  }
+  return Status::OK();  // Both the dtype and the rank are acceptable.
+}
+
+Status CheckPhiloxState(const Tensor& state, int64 alg_tag_skip = 0) {
+  static_assert(std::is_same<StateElementType, int64>::value,
+                "StateElementType must be int64");  // Compile-time type guard.
+  static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
+                "PhiloxRandom::ResultElementType must be uint32");
+  if (state.NumElements() < alg_tag_skip + PHILOX_MIN_STATE_SIZE) {  // Too small.
+    return errors::InvalidArgument(
+        "For the Philox algorithm, the size of state"
+        " must be at least ",
+        alg_tag_skip + PHILOX_MIN_STATE_SIZE, "; got ", state.NumElements());
+  }
+  return Status::OK();  // Holds alg_tag_skip + PHILOX_MIN_STATE_SIZE or more.
+}
+
 template <typename Device, typename Distribution>
 Status UpdateVariableAndFill(
     OpKernelContext* ctx, Distribution dist, int state_input_idx,
@@ -54,17 +82,7 @@
   // filling.
   ScopedUnlockUnrefVar state_var_guard(var);
   Tensor* var_tensor = var->tensor();
-  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
-    return errors::InvalidArgument("dtype of RNG state variable must be ",
-                                   DataTypeString(STATE_ELEMENT_DTYPE),
-                                   ", not ",
-                                   DataTypeString(var_tensor->dtype()));
-  }
-  if (var_tensor->dims() != 1) {
-    return errors::InvalidArgument(
-        "RNG state must have one and only one dimension, not ",
-        var_tensor->dims());
-  }
+  TF_RETURN_IF_ERROR(CheckState(*var_tensor));
   auto var_tensor_flat = var_tensor->flat<StateElementType>();
   int64 alg_tag_skip = 0;
   if (read_alg_from_state) {
@@ -75,17 +93,7 @@
     alg = var_tensor_flat(0);
   }
   if (alg == RNG_ALG_PHILOX) {
-    static_assert(std::is_same<StateElementType, int64>::value,
-                  "StateElementType must be int64");
-    static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
-                  "PhiloxRandom::ResultElementType must be uint32");
-    if (var_tensor_flat.size() < alg_tag_skip + PHILOX_MIN_STATE_SIZE) {
-      return errors::InvalidArgument(
-          "For the Philox algorithm, the size of state"
-          " must be at least ",
-          alg_tag_skip + PHILOX_MIN_STATE_SIZE, "; got ",
-          var_tensor_flat.size());
-    }
+    TF_RETURN_IF_ERROR(CheckPhiloxState(*var_tensor, alg_tag_skip));
     TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
         ctx, var_tensor, var->copy_on_read_mode.load()));
     UpdateVariableAndFill_Philox<Device, Distribution>()(
@@ -124,18 +132,20 @@
   }
 };
 
-Status GetAlgorithm(OpKernelContext* ctx, int alg_input_idx, Algorithm* alg) {
-  const Tensor& alg_tensor = ctx->input(alg_input_idx);
-  if (alg_tensor.dims() != 0) {
-    return errors::InvalidArgument("algorithm must be of shape [], not ",
-                                   alg_tensor.shape().DebugString());
+template <typename T>  // T is the C++ type of the scalar; it fixes the dtype.
+Status GetScalar(const Tensor& tensor, int input_idx, T* result) {
+  auto dtype = DataTypeToEnum<T>::v();  // input_idx only labels the errors.
+  if (tensor.dims() != 0) {  // Only rank-0 tensors are accepted.
+    return errors::InvalidArgument("input ", std::to_string(input_idx),
+                                   " (0-based) must have shape [], not ",
+                                   tensor.shape().DebugString());
   }
-  if (alg_tensor.dtype() != ALGORITHM_DTYPE) {
-    return errors::InvalidArgument("algorithm's dtype must be ",
-                                   DataTypeString(ALGORITHM_DTYPE), ", not ",
-                                   DataTypeString(alg_tensor.dtype()));
+  if (tensor.dtype() != dtype) {  // The element type must correspond to T.
+    return errors::InvalidArgument("dtype of input ", std::to_string(input_idx),
+                                   " (0-based) must be ", DataTypeString(dtype),
+                                   ", not ", DataTypeString(tensor.dtype()));
   }
-  *alg = alg_tensor.flat<Algorithm>()(0);
+  *result = tensor.flat<T>()(0);  // Copy the single element to the caller.
   return Status::OK();
 }
 
@@ -146,7 +156,7 @@
 
   void Compute(OpKernelContext* ctx) override {
     Algorithm alg;
-    OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, /*alg_input_idx=*/1, &alg));
+    OP_REQUIRES_OK(ctx, GetScalar(ctx->input(1), 1, &alg));
     StatefulRandomCompute<Device>(ctx, Distribution(), /*state_input_idx=*/0,
                                   /*shape_input_idx=*/2,
                                   /*read_alg_from_state=*/false, alg);
@@ -160,7 +170,7 @@
 
   void Compute(OpKernelContext* ctx) override {
     Algorithm alg;
-    OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, /*alg_input_idx=*/1, &alg));
+    OP_REQUIRES_OK(ctx, GetScalar(ctx->input(1), 1, &alg));
     const Tensor& minval = ctx->input(3);
     const Tensor& maxval = ctx->input(4);
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval.shape()),
@@ -197,7 +207,7 @@
 
   void Compute(OpKernelContext* ctx) override {
     Algorithm alg;
-    OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, /*alg_input_idx=*/1, &alg));
+    OP_REQUIRES_OK(ctx, GetScalar(ctx->input(1), 1, &alg));
     StatefulRandomCompute<Device>(
         ctx,
         random::UniformFullIntDistribution<random::PhiloxRandom, IntType>(),
@@ -206,6 +216,45 @@
   }
 };
 
+template <>  // CPU specialization of the Philox skip functor.
+struct RngSkip_Philox<CPUDevice> {
+  void operator()(const CPUDevice& device, int64 delta, Tensor* state_tensor) {
+    auto state_data = state_tensor->flat<StateElementType>().data();
+    // Delegates to the PhiloxRandom helpers; `device` is not used here.
+    auto philox = GetPhiloxRandomFromMem(state_data);  // Load from the state.
+    UpdateMemWithPhiloxRandom(philox, delta, state_data);  // Updates in place.
+  }
+};
+
+template <typename Device>  // Kernel registered under the op name "RngSkip".
+class RngSkipOp : public OpKernel {
+ public:
+  explicit RngSkipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto state_input_idx = 0;  // Input 0 is the handle of the state variable.
+    Algorithm alg;
+    OP_REQUIRES_OK(ctx, GetScalar(ctx->input(1), 1, &alg));  // Input 1.
+    int64 delta;
+    OP_REQUIRES_OK(ctx, GetScalar(ctx->input(2), 2, &delta));  // Input 2.
+    Var* var = nullptr;
+    OP_REQUIRES_OK(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, state_input_idx), &var));
+    ScopedUnlockUnrefVar state_var_guard(var);  // NOTE(review): presumably releases var at scope exit -- confirm.
+    Tensor* var_tensor = var->tensor();
+    OP_REQUIRES_OK(ctx, CheckState(*var_tensor));  // dtype and rank checks.
+    if (alg == RNG_ALG_PHILOX) {
+      OP_REQUIRES_OK(ctx, CheckPhiloxState(*var_tensor));  // Minimum size.
+      OP_REQUIRES_OK(ctx, PrepareToUpdateVariable<Device, StateElementType>(
+                              ctx, var_tensor, var->copy_on_read_mode.load()));
+      RngSkip_Philox<Device>()(ctx->eigen_device<Device>(), delta, var_tensor);
+    } else {  // Every other algorithm id is rejected.
+      OP_REQUIRES(ctx, false,
+                  errors::InvalidArgument("Unsupported algorithm id: ", alg));
+    }
+  }
+};
+
 template <typename T>
 class NonDeterministicIntsOp : public OpKernel {
  public:
@@ -334,6 +383,16 @@
 TF_CALL_uint32(REGISTER_StatefulUniformFullInt_CPU);
 TF_CALL_uint64(REGISTER_StatefulUniformFullInt_CPU);
 
+#define REGISTER_RngSkip(DEVICE)                       \
+  REGISTER_KERNEL_BUILDER(Name("RngSkip")              \
+                              .Device(DEVICE_##DEVICE) \
+                              .HostMemory("resource")  \
+                              .HostMemory("algorithm") \
+                              .HostMemory("delta"),    \
+                          RngSkipOp<DEVICE##Device>);  // DEVICE is CPU or GPU; the three named args stay in host memory.
+
+REGISTER_RngSkip(CPU);
+
 #if GOOGLE_CUDA
 
 TF_CALL_half(REGISTER_FloatOps_GPU);
@@ -345,6 +404,7 @@
 TF_CALL_int64(REGISTER_StatefulUniformFullInt_GPU);
 TF_CALL_uint32(REGISTER_StatefulUniformFullInt_GPU);
 TF_CALL_uint64(REGISTER_StatefulUniformFullInt_GPU);
+REGISTER_RngSkip(GPU);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/stateful_random_ops.h b/tensorflow/core/kernels/stateful_random_ops.h
index 25d0ce7..58ab414 100644
--- a/tensorflow/core/kernels/stateful_random_ops.h
+++ b/tensorflow/core/kernels/stateful_random_ops.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
 
-// #include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h b/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h
index 63d746f..f3d966b 100644
--- a/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h
+++ b/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h
@@ -77,13 +77,16 @@
 template <typename Device, typename Distribution>
 struct UpdateVariableAndFill_Philox;
 
+template <typename Device>
+struct RngSkip_Philox;
+
 using CPUDevice = Eigen::ThreadPoolDevice;
 
 #if GOOGLE_CUDA
 
 using GPUDevice = Eigen::GpuDevice;
 
-// Declares the partially GPU-specialized functor struct.
+// Declares the partially GPU-specialized functor structs.
 template <typename Distribution>
 struct UpdateVariableAndFill_Philox<GPUDevice, Distribution> {
   void operator()(OpKernelContext* ctx, const GPUDevice& device,
@@ -92,6 +95,11 @@
                   typename Distribution::ResultElementType* output_data);
 };
 
+template <>  // GPU specialization; declared here, defined in the .cu.cc file.
+struct RngSkip_Philox<GPUDevice> {
+  void operator()(const GPUDevice& device, int64 delta, Tensor* state_tensor);
+};
+
 #endif  // GOOGLE_CUDA
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
index d246b8b..9aa7318 100644
--- a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
@@ -80,6 +80,18 @@
       d.stream(), dist, state_size, output_size, state_data, output_data));
 }
 
+// Precondition: there is only 1 block and 1 thread (launched as <<<1, 1>>>).
+__global__ void SkipKernel(int64 delta, StateElementType* state_data) {
+  auto philox = GetPhiloxRandomFromMem(state_data);  // Load from the state.
+  UpdateMemWithPhiloxRandom(philox, delta, state_data);  // NOTE(review): presumably skips delta and stores the state back -- confirm.
+}
+
+void RngSkip_Philox<GPUDevice>::operator()(const GPUDevice& d, int64 delta,
+                                           Tensor* state_tensor) {
+  SkipKernel<<<1, 1, 0, d.stream()>>>(  // 1 block, 1 thread, on d's stream.
+      delta, state_tensor->flat<StateElementType>().data());
+}
+
 // Explicit instantiation of the GPU distributions functors.
 
 // clang-format off
diff --git a/tensorflow/core/kernels/string_lower_op.cc b/tensorflow/core/kernels/string_lower_op.cc
new file mode 100644
index 0000000..e24eedc
--- /dev/null
+++ b/tensorflow/core/kernels/string_lower_op.cc
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "unicode/unistr.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// Lowercases each string: ASCII-only for '' encoding, ICU mapping for utf-8.
+class StringLowerOp : public OpKernel {
+ public:
+  explicit StringLowerOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_));
+    OP_REQUIRES(context, encoding_.empty() || encoding_ == "utf-8",
+                errors::InvalidArgument(
+                    "only utf-8 or '' (no encoding) is supported, received ",
+                    encoding_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+
+    if (encoding_.empty()) {
+      for (int64 i = 0; i < input.size(); ++i) {
+        output(i) = absl::AsciiStrToLower(input(i));
+      }
+    } else {
+      // encoding_ was validated in the constructor, so it is "utf-8" here.
+      for (int64 i = 0; i < input.size(); ++i) {
+        // fromUTF8 uses the full length, so embedded NUL bytes are kept.
+        auto us = icu::UnicodeString::fromUTF8(input(i));
+        us.toLower();
+        us.toUTF8String(output(i));
+      }
+    }
+  }
+
+ private:
+  string encoding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringLower").Device(DEVICE_CPU), StringLowerOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_upper_op.cc b/tensorflow/core/kernels/string_upper_op.cc
new file mode 100644
index 0000000..f2a1d33
--- /dev/null
+++ b/tensorflow/core/kernels/string_upper_op.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "unicode/unistr.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// Uppercases each string: ASCII-only for '' encoding, ICU mapping for utf-8.
+class StringUpperOp : public OpKernel {
+ public:
+  explicit StringUpperOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_));
+    OP_REQUIRES(context, encoding_.empty() || encoding_ == "utf-8",
+                errors::InvalidArgument(
+                    "only utf-8 or '' (no encoding) is supported, received ",
+                    encoding_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+    if (encoding_.empty()) {
+      for (int64 i = 0; i < input.size(); ++i) {
+        output(i) = absl::AsciiStrToUpper(input(i));
+      }
+    } else {
+      // encoding_ was validated in the constructor, so it is "utf-8" here.
+      for (int64 i = 0; i < input.size(); ++i) {
+        // fromUTF8 uses the full length, so embedded NUL bytes are kept.
+        auto us = icu::UnicodeString::fromUTF8(input(i));
+        us.toUpper();
+        us.toUTF8String(output(i));
+      }
+    }
+  }
+
+ private:
+  string encoding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringUpper").Device(DEVICE_CPU), StringUpperOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tridiagonal_matmul_op.cc b/tensorflow/core/kernels/tridiagonal_matmul_op.cc
new file mode 100644
index 0000000..6058688
--- /dev/null
+++ b/tensorflow/core/kernels/tridiagonal_matmul_op.cc
@@ -0,0 +1,134 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// TODO(b/131583008): add broadcast support (for batch dimensions).
+template <class Scalar>  // Multiplies a tridiagonal matrix by a dense matrix.
+class TridiagonalMatMulOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit TridiagonalMatMulOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    auto num_inputs = input_matrix_shapes.size();
+    OP_REQUIRES(
+        context, num_inputs == 4,
+        errors::InvalidArgument("Expected 4 inputs, got ", num_inputs, "."));
+
+    auto n = input_matrix_shapes[3].dim_size(0);  // Row count of the rhs.
+
+    OP_REQUIRES(context,
+                input_matrix_shapes[0].dim_size(0) == 1 &&
+                    input_matrix_shapes[0].dim_size(1) == n,
+                errors::InvalidArgument("Invalid superdiagonal shape."));
+
+    OP_REQUIRES(context,
+                input_matrix_shapes[1].dim_size(0) == 1 &&
+                    input_matrix_shapes[1].dim_size(1) == n,
+                errors::InvalidArgument("Invalid main diagonal shape."));
+
+    OP_REQUIRES(context,
+                input_matrix_shapes[2].dim_size(0) == 1 &&
+                    input_matrix_shapes[2].dim_size(1) == n,
+                errors::InvalidArgument("Invalid subdiagonal shape."));
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    return TensorShapes({input_matrix_shapes[3]});
+  }
+
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    const int num_eqs = static_cast<int>(input_matrix_shapes[0].dim_size(1));
+    const int num_rhss = static_cast<int>(input_matrix_shapes[3].dim_size(1));
+    // num_rhss is the column count of the rhs; dim 0 would repeat num_eqs.
+    const double add_cost = Eigen::TensorOpCost::AddCost<Scalar>();
+    const double mult_cost = Eigen::TensorOpCost::MulCost<Scalar>();
+
+    const double cost = num_rhss * ((3 * num_eqs - 2) * mult_cost +
+                                    (2 * num_eqs - 2) * add_cost);
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  // Needed to prevent writing result to the same location where input is.
+  bool EnableInputForwarding() const final { return false; }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    // Superdiagonal elements. Must have length m.
+    // Last element is ignored.
+    const auto& superdiag = inputs[0].row(0);
+
+    // Main diagonal elements. Must have length m.
+    const auto& maindiag = inputs[1].row(0);
+
+    // Subdiagonal elements. Must have length m.
+    // First element is ignored.
+    const auto& subdiag = inputs[2].row(0);
+
+    // Right-hand matrix. Size m x n.
+    const auto& rhs = inputs[3];
+
+    MatrixMap& result = outputs->at(0);
+
+    const int m = rhs.rows();
+    const int n = rhs.cols();
+    // Shifted views; the `+ n` pointer step assumes row-major storage.
+    ConstVectorMap subdiag_map(subdiag.data() + 1, m - 1);
+    ConstVectorMap superdiag_map(superdiag.data(), m - 1);
+    ConstMatrixMap rhs_except_first_row(rhs.data() + n, m - 1, n);
+    ConstMatrixMap rhs_except_last_row(rhs.data(), m - 1, n);
+    // Main-diagonal product first, then add the two off-diagonal terms.
+    MatrixMap result_except_first_row(result.data() + n, m - 1, n);
+    MatrixMap result_except_last_row(result.data(), m - 1, n);
+    result.array() = rhs.array().colwise() * maindiag.transpose().array();
+    result_except_first_row.noalias() +=
+        (rhs_except_last_row.array().colwise() *
+         subdiag_map.transpose().array())
+            .matrix();
+    result_except_last_row.noalias() +=
+        (rhs_except_first_row.array().colwise() *
+         superdiag_map.transpose().array())
+            .matrix();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalMatMulOp);
+};
+
+REGISTER_LINALG_OP_CPU("TridiagonalMatMul", (TridiagonalMatMulOp<float>),
+                       float);
+REGISTER_LINALG_OP_CPU("TridiagonalMatMul", (TridiagonalMatMulOp<double>),
+                       double);
+REGISTER_LINALG_OP_CPU("TridiagonalMatMul", (TridiagonalMatMulOp<complex64>),
+                       complex64);
+REGISTER_LINALG_OP_CPU("TridiagonalMatMul", (TridiagonalMatMulOp<complex128>),
+                       complex128);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op.cc b/tensorflow/core/kernels/tridiagonal_solve_op.cc
index 35c1d5a..59ee073 100644
--- a/tensorflow/core/kernels/tridiagonal_solve_op.cc
+++ b/tensorflow/core/kernels/tridiagonal_solve_op.cc
@@ -30,12 +30,20 @@
 static const char kNotInvertibleScalarMsg[] =
     "The matrix is not invertible: it is a scalar with value zero.";
 
+static const char kThomasFailedMsg[] =
+    "The matrix is either not invertible, or requires pivoting. "
+    "Try setting partial_pivoting = True.";
+
 template <class Scalar>
 class TridiagonalSolveOp : public LinearAlgebraOp<Scalar> {
  public:
   INHERIT_LINALG_TYPEDEFS(Scalar);
+  using MatrixMapRow =
+      decltype(std::declval<const ConstMatrixMaps>()[0].row(0));
 
-  explicit TridiagonalSolveOp(OpKernelConstruction* context) : Base(context) {}
+  explicit TridiagonalSolveOp(OpKernelConstruction* context) : Base(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("partial_pivoting", &pivoting_));
+  }
 
   void ValidateInputMatrixShapes(
       OpKernelContext* context,
@@ -74,10 +82,15 @@
     const double mult_cost = Eigen::TensorOpCost::MulCost<Scalar>();
     const double div_cost = Eigen::TensorOpCost::DivCost<Scalar>();
 
-    // Assuming cases with and without row interchange are equiprobable.
-    const double cost =
-        num_eqs * (div_cost * (num_rhss + 1) +
-                   (add_cost + mult_cost) * (2.5 * num_rhss + 1.5));
+    double cost;
+    if (pivoting_) {
+      // Assuming cases with and without row interchange are equiprobable.
+      cost = num_eqs * (div_cost * (num_rhss + 1) +
+                        (add_cost + mult_cost) * (2.5 * num_rhss + 1.5));
+    } else {
+      cost = num_eqs * (div_cost * (num_rhss + 1) +
+                        (add_cost + mult_cost) * (2 * num_rhss + 1));
+    }
     return cost >= static_cast<double>(kint64max) ? kint64max
                                                   : static_cast<int64>(cost);
   }
@@ -111,6 +124,26 @@
       return;
     }
 
+    if (pivoting_) {
+      SolveWithGaussianEliminationWithPivoting(context, superdiag, diag,
+                                               subdiag, rhs, x);
+    } else {
+      SolveWithThomasAlgorithm(context, superdiag, diag, subdiag, rhs, x);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+
+  void SolveWithGaussianEliminationWithPivoting(OpKernelContext* context,
+                                                const MatrixMapRow& superdiag,
+                                                const MatrixMapRow& diag,
+                                                const MatrixMapRow& subdiag,
+                                                const ConstMatrixMap& rhs,
+                                                MatrixMap& x) {
+    const int n = diag.size();
+    const Scalar zero(0);
+
     // The three columns in u are the diagonal, superdiagonal, and second
     // superdiagonal, respectively, of the U matrix in the LU decomposition of
     // the input matrix (subject to row exchanges due to pivoting). For pivoted
@@ -158,8 +191,36 @@
     }
   }
 
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+  void SolveWithThomasAlgorithm(OpKernelContext* context,  // No pivoting.
+                                const MatrixMapRow& superdiag,
+                                const MatrixMapRow& diag,
+                                const MatrixMapRow& subdiag,
+                                const ConstMatrixMap& rhs, MatrixMap& x) {
+    const int n = diag.size();  // Number of equations.
+    const Scalar zero(0);
+
+    // The superdiagonal of the U matrix in the LU decomposition of the input
+    // matrix (in the Thomas algorithm, the U matrix has ones on the diagonal
+    // and one superdiagonal). The solution is written into x.
+    Eigen::Matrix<Scalar, Eigen::Dynamic, 1> u(n);
+
+    OP_REQUIRES(context, diag(0) != zero,  // A zero pivot aborts the solve.
+                errors::InvalidArgument(kThomasFailedMsg));
+    u(0) = superdiag(0) / diag(0);
+    x.row(0) = rhs.row(0) / diag(0);
+    for (int i = 1; i < n; ++i) {  // Forward elimination.
+      auto denom = diag(i) - subdiag(i) * u(i - 1);  // Pivot of row i.
+      OP_REQUIRES(context, denom != zero,
+                  errors::InvalidArgument(kThomasFailedMsg));
+      u(i) = superdiag(i) / denom;  // u(n - 1) is computed but never read.
+      x.row(i) = (rhs.row(i) - subdiag(i) * x.row(i - 1)) / denom;
+    }
+    for (int i = n - 2; i >= 0; --i) {  // Back substitution.
+      x.row(i) -= u(i) * x.row(i + 1);
+    }
+  }
+
+  bool pivoting_;
 };
 
 REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<float>), float);
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
index c5bcfd5..6c6b9ab 100644
--- a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
@@ -19,13 +19,13 @@
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/kernels/cuda_sparse.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/gpu_device_functions.h"
 #include "tensorflow/core/util/gpu_launch_config.h"
@@ -63,12 +63,39 @@
 }
 
 template <typename Scalar>
-class TridiagonalSolveOpGpu : public LinearAlgebraOp<Scalar> {
+se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));  // Drops const.
+  se::DeviceMemory<Scalar> typed(wrapped);  // Typed handle over the same pointer.
+  return typed;
+}
+
+template <typename Scalar>  // Copies num_elements Scalars from src to dst.
+void CopyDeviceToDevice(OpKernelContext* context, const Scalar* src,
+                        Scalar* dst, const int num_elements) {
+  auto src_device_mem = AsDeviceMemory(src);
+  auto dst_device_mem = AsDeviceMemory(dst);
+  auto* stream = context->op_device_context()->stream();  // The op's stream.
+  bool copy_status = stream
+                         ->ThenMemcpyD2D(&dst_device_mem, src_device_mem,
+                                         sizeof(Scalar) * num_elements)
+                         .ok();  // The size argument is in bytes.
+
+  if (!copy_status) {  // Failure is reported through the context status.
+    context->SetStatus(errors::Internal("Copying device-to-device failed."));
+  }
+}
+
+// This implementation is used in cases when the batching mechanism of
+// LinearAlgebraOp is suitable. See TridiagonalSolveOpGpu below.
+template <class Scalar>
+class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> {
  public:
   INHERIT_LINALG_TYPEDEFS(Scalar);
 
-  explicit TridiagonalSolveOpGpu(OpKernelConstruction* context)
-      : Base(context) {}
+  explicit TridiagonalSolveOpGpuLinalg(OpKernelConstruction* context)
+      : Base(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("partial_pivoting", &pivoting_));
+  }
 
   void ValidateInputMatrixShapes(
       OpKernelContext* context,
@@ -151,8 +178,6 @@
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOpGpu);
-
   void TransposeWithGeam(OpKernelContext* context,
                          const std::unique_ptr<CudaSolver>& cublas_solver,
                          const Scalar* src, Scalar* dst, const int src_rows,
@@ -170,29 +195,11 @@
                      const Scalar* superdiag, const Scalar* diag,
                      const Scalar* subdiag, Scalar* rhs, const int num_eqs,
                      const int num_rhs) const {
-    OP_REQUIRES_OK(context,
-                   cusparse_solver->Gtsv(num_eqs, num_rhs, subdiag, diag,
-                                         superdiag, rhs, num_eqs));
-  }
-
-  void CopyDeviceToDevice(OpKernelContext* context, const Scalar* src,
-                          Scalar* dst, const int num_elements) const {
-    auto src_device_mem = AsDeviceMemory(src);
-    auto dst_device_mem = AsDeviceMemory(dst);
-    auto* stream = context->op_device_context()->stream();
-    bool copy_status = stream
-                           ->ThenMemcpyD2D(&dst_device_mem, src_device_mem,
-                                           sizeof(Scalar) * num_elements)
-                           .ok();
-    if (!copy_status) {
-      context->SetStatus(errors::Internal("Copying device-to-device failed."));
-    }
-  }
-
-  se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) const {
-    se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
-    se::DeviceMemory<Scalar> typed(wrapped);
-    return typed;
+    auto function = pivoting_ ? &CudaSparse::Gtsv<Scalar>
+                              : &CudaSparse::GtsvNoPivot<Scalar>;
+    OP_REQUIRES_OK(
+        context, (cusparse_solver.get()->*function)(
+                     num_eqs, num_rhs, subdiag, diag, superdiag, rhs, num_eqs));
   }
 
   void SolveForSizeOneOrTwo(OpKernelContext* context, const Scalar* diagonals,
@@ -213,6 +220,116 @@
                 errors::InvalidArgument(m == 1 ? kNotInvertibleScalarMsg
                                                : kNotInvertibleMsg));
   }
+
+  bool pivoting_;
+};
+
+// Dispatches between cuSPARSE's GtsvStridedBatch (batched, single rhs, no
+// pivoting) and the LinearAlgebraOp-based TridiagonalSolveOpGpuLinalg.
+template <class Scalar>
+class TridiagonalSolveOpGpu : public OpKernel {
+ public:
+  explicit TridiagonalSolveOpGpu(OpKernelConstruction* context)
+      : OpKernel(context), linalgOp_(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("partial_pivoting", &pivoting_));
+  }
+
+  void Compute(OpKernelContext* context) final {
+    const Tensor& lhs = context->input(0);
+    const Tensor& rhs = context->input(1);
+    const int ndims = lhs.dims();
+    const int64 num_rhs = rhs.dim_size(rhs.dims() - 1);
+    const int64 matrix_size = lhs.dim_size(ndims - 1);
+    int64 batch_size = 1;
+    for (int i = 0; i < ndims - 2; i++) {
+      batch_size *= lhs.dim_size(i);
+    }
+
+    // The batching mechanism of LinearAlgebraOp is used when it's not
+    // possible or desirable to use GtsvBatched.
+    const bool use_linalg_op =
+        pivoting_            // GtsvBatched doesn't do pivoting
+        || num_rhs > 1       // GtsvBatched doesn't support multiple rhs
+        || matrix_size < 3   // Not supported in cuSparse, use the custom kernel
+        || batch_size == 1;  // No point to use GtsvBatched
+
+    if (use_linalg_op) {
+      linalgOp_.Compute(context);
+    } else {
+      ComputeWithGtsvBatched(context, lhs, rhs, batch_size);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOpGpu);
+
+  // Solves all batch components with one GtsvStridedBatch call, writing the
+  // solution into output 0. Stops as soon as any step records an error.
+  void ComputeWithGtsvBatched(OpKernelContext* context, const Tensor& lhs,
+                              const Tensor& rhs, const int batch_size) {
+    const Scalar* rhs_data = rhs.flat<Scalar>().data();
+    const int ndims = lhs.dims();
+
+    // To use GtsvBatched we need to transpose the left-hand side from shape
+    // [..., 3, M] into shape [3, ..., M]. With shape [..., 3, M] the stride
+    // between corresponding diagonal elements of consecutive batch components
+    // is 3 * M, while for the right-hand side the stride is M. Unfortunately,
+    // GtsvBatched requires the strides to be the same. For this reason we
+    // transpose into [3, ..., M], so that superdiagonals, diagonals, and
+    // subdiagonals are separated from each other, and have stride M.
+    Tensor lhs_transposed;
+    TransposeLhsForGtsvBatched(context, lhs, lhs_transposed);
+    if (!context->status().ok()) return;  // lhs_transposed is unallocated.
+    int matrix_size = lhs.dim_size(ndims - 1);
+    const Scalar* lhs_data = lhs_transposed.flat<Scalar>().data();
+    const Scalar* superdiag = lhs_data;
+    const Scalar* diag = lhs_data + matrix_size * batch_size;
+    const Scalar* subdiag = lhs_data + 2 * matrix_size * batch_size;
+
+    // Copy right-hand side into the output. GtsvBatched will replace it with
+    // the solution.
+    Tensor* output;
+    OP_REQUIRES_OK(context, context->allocate_output(0, rhs.shape(), &output));
+    CopyDeviceToDevice(context, rhs_data, output->flat<Scalar>().data(),
+                       rhs.flat<Scalar>().size());
+    if (!context->status().ok()) return;  // The copy failed; nothing to solve.
+    Scalar* x = output->flat<Scalar>().data();
+
+    std::unique_ptr<CudaSparse> cusparse_solver(new CudaSparse(context));
+    OP_REQUIRES_OK(context, cusparse_solver->Initialize());
+    OP_REQUIRES_OK(context, cusparse_solver->GtsvStridedBatch(
+                                matrix_size, subdiag, diag, superdiag, x,
+                                batch_size, matrix_size));
+  }
+
+  // Allocates lhs_transposed and fills it with lhs permuted from
+  // [..., 3, M] to [3, ..., M]. Errors are reported through context's status.
+  void TransposeLhsForGtsvBatched(OpKernelContext* context, const Tensor& lhs,
+                                  Tensor& lhs_transposed) {
+    const int ndims = lhs.dims();
+    // Permutation of indices, transforming [..., 3, M] into [3, ..., M].
+    // E.g. for ndims = 6, it is [4, 0, 1, 2, 3, 5].
+    std::vector<int> perm(ndims);
+    perm[0] = ndims - 2;
+    for (int i = 0; i < ndims - 2; ++i) perm[i + 1] = i;
+    perm[ndims - 1] = ndims - 1;
+
+    TensorShape lhs_transposed_shape;
+    for (int index : perm) lhs_transposed_shape.AddDim(lhs.dim_size(index));
+
+    std::unique_ptr<CudaSolver> cublas_solver(new CudaSolver(context));
+    OP_REQUIRES_OK(context, cublas_solver->allocate_scoped_tensor(
+                                DataTypeToEnum<Scalar>::value,
+                                lhs_transposed_shape, &lhs_transposed));
+    auto device = context->eigen_device<Eigen::GpuDevice>();
+    OP_REQUIRES_OK(
+        context,
+        DoTranspose(device, lhs, gtl::ArraySlice<int>(perm.data(), ndims),
+                    &lhs_transposed));
+  }
+
+  TridiagonalSolveOpGpuLinalg<Scalar> linalgOp_;
+  bool pivoting_;
 };
 
 REGISTER_LINALG_OP_GPU("TridiagonalSolve", (TridiagonalSolveOpGpu<float>),
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index c071db6..59ebbed 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -350,6 +350,7 @@
 REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
                         UnicodeTranscodeOp);
 
+template <typename SPLITS_TYPE>
 class UnicodeDecodeBaseOp : public OpKernel {
  public:
   explicit UnicodeDecodeBaseOp(OpKernelConstruction* ctx, bool generate_offsets)
@@ -369,8 +370,8 @@
   }
 
   void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
-              std::vector<int64>* offset_values, int* current_offset,
-              int64* next_row_split, UChar32 char_value, int char_length,
+              std::vector<SPLITS_TYPE>* offset_values, int* current_offset,
+              SPLITS_TYPE* next_row_split, UChar32 char_value, int char_length,
               bool found_any_format_error) {
     if (error_options_.error_on_malformatting && found_any_format_error) {
       ctx->CtxFailure(
@@ -414,16 +415,16 @@
                     input_encoding_));
 
     std::vector<UChar32> char_values;
-    std::vector<int64> offset_values;
+    std::vector<SPLITS_TYPE> offset_values;
 
     Tensor* output_row_splits;
     OP_REQUIRES_OK(ctx, ctx->allocate_output("row_splits",
                                              {input_tensor->NumElements() + 1},
                                              &output_row_splits));
-    auto out_row_splits = output_row_splits->vec<int64>();
+    auto out_row_splits = output_row_splits->vec<SPLITS_TYPE>();
 
     int row_split_index = 0;
-    int64 next_row_split = 0;
+    SPLITS_TYPE next_row_split = 0;
     for (int i = 0; i < input_vec.size(); ++i) {
       const string& input = input_vec(i);
       // Convert input strings into unicode values. Output to a list of
@@ -443,18 +444,18 @@
 
     Tensor* output_char_values;
     OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("char_values",
-                                  {static_cast<int64>(char_values.size())},
-                                  &output_char_values));
+        ctx, ctx->allocate_output(
+                 "char_values", {static_cast<SPLITS_TYPE>(char_values.size())},
+                 &output_char_values));
     auto out_char_values = output_char_values->vec<int32>();
     if (generate_offsets_) {
       DCHECK(offset_values.size() == char_values.size());
       Tensor* output_offset_values;
-      OP_REQUIRES_OK(
-          ctx, ctx->allocate_output("char_to_byte_starts",
-                                    {static_cast<int64>(offset_values.size())},
-                                    &output_offset_values));
-      auto out_offset_values = output_offset_values->vec<int64>();
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                              "char_to_byte_starts",
+                              {static_cast<SPLITS_TYPE>(offset_values.size())},
+                              &output_offset_values));
+      auto out_offset_values = output_offset_values->vec<SPLITS_TYPE>();
 
       // Load output tensors from intermediate value arrays.
       for (int i = 0; i < char_values.size(); ++i) {
@@ -474,23 +475,36 @@
   bool generate_offsets_ = false;
 };
 
-class UnicodeDecodeOp : public UnicodeDecodeBaseOp {
+template <typename SPLITS_TYPE>
+class UnicodeDecodeOp : public UnicodeDecodeBaseOp<SPLITS_TYPE> {
  public:
   explicit UnicodeDecodeOp(OpKernelConstruction* ctx)
-      : UnicodeDecodeBaseOp(ctx, false) {}
+      : UnicodeDecodeBaseOp<SPLITS_TYPE>(ctx, false) {}
 };
 
-class UnicodeDecodeWithOffsetsOp : public UnicodeDecodeBaseOp {
+template <typename SPLITS_TYPE>
+class UnicodeDecodeWithOffsetsOp : public UnicodeDecodeBaseOp<SPLITS_TYPE> {
  public:
   explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
-      : UnicodeDecodeBaseOp(ctx, true) {}
+      : UnicodeDecodeBaseOp<SPLITS_TYPE>(ctx, true) {}
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnicodeDecode").Device(DEVICE_CPU),
-                        UnicodeDecodeOp);
-REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
-                        UnicodeDecodeWithOffsetsOp);
+REGISTER_KERNEL_BUILDER(
+    Name("UnicodeDecode").Device(DEVICE_CPU).TypeConstraint<int64>("Tsplits"),
+    UnicodeDecodeOp<int64>);
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("Tsplits"),
+                        UnicodeDecodeWithOffsetsOp<int64>);
+REGISTER_KERNEL_BUILDER(
+    Name("UnicodeDecode").Device(DEVICE_CPU).TypeConstraint<int32>("Tsplits"),
+    UnicodeDecodeOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int32>("Tsplits"),
+                        UnicodeDecodeWithOffsetsOp<int32>);
 
+template <typename SPLITS_TYPE>
 class UnicodeEncodeOp : public OpKernel {
  public:
   explicit UnicodeEncodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -515,7 +529,7 @@
     const Tensor& input_tensor = context->input(0);
     const auto input_tensor_flat = input_tensor.flat<int32>();
     const Tensor& input_splits = context->input(1);
-    const auto input_splits_flat = input_splits.flat<int64>();
+    const auto input_splits_flat = input_splits.flat<SPLITS_TYPE>();
 
     // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
     // tensor), our output dimension will be 1 with it's size equal to the
@@ -558,7 +572,11 @@
   ErrorOptions error_options_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnicodeEncode").Device(DEVICE_CPU),
-                        UnicodeEncodeOp);
+REGISTER_KERNEL_BUILDER(
+    Name("UnicodeEncode").Device(DEVICE_CPU).TypeConstraint<int64>("Tsplits"),
+    UnicodeEncodeOp<int64>);
+REGISTER_KERNEL_BUILDER(
+    Name("UnicodeEncode").Device(DEVICE_CPU).TypeConstraint<int32>("Tsplits"),
+    UnicodeEncodeOp<int32>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 19eb320..8a7c163 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -134,7 +134,8 @@
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<Eigen::half>("T"),
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
index 6c7a9d7..2b1ac45 100644
--- a/tensorflow/core/kernels/xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
 #define EIGEN_USE_GPU
 
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index f8c0698..941e2bd 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -34,7 +34,7 @@
 
 #include "include/libxsmm_cpuid.h"
 #include "include/libxsmm_malloc.h"
-#include "third_party/libxsmm/src/libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
+#include "src/libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index 0b63f66..c8b24df 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -140,6 +140,23 @@
   return new string(r);
 }
 
+// kDerivedMarker is prepended to the Status message string to indicate that a
+// Status object is not the root cause of an error but was triggered by
+// cancelling/aborting a step.
+static const char* kDerivedMarker = "[_Derived_]";
+
+Status StatusGroup::MakeDerived(const Status& s) {
+  // A status that already carries the marker is returned unchanged, so the
+  // marker is never added twice.
+  if (IsDerived(s)) return s;
+  const string marked = strings::StrCat(kDerivedMarker, s.error_message());
+  return Status(s.code(), marked);
+}
+
+bool StatusGroup::IsDerived(const Status& s) {  // Marker may appear anywhere.
+  return std::string::npos != s.error_message().find(kDerivedMarker);
+}
+
 void StatusGroup::Update(const Status& s) {
   if (s.ok()) {
     ++num_ok_;
@@ -149,91 +166,88 @@
   }
 }
 
-const int kMaxChildMessageSize = 2048;
+static std::vector<Status> GetNonDerivedStatuses(
+    const std::vector<Status>& status) {
+  // Collects, in order, the statuses that are not marked as derived.
+  std::vector<Status> roots;
+  for (const Status& candidate : status) {
+    if (StatusGroup::IsDerived(candidate)) continue;
+    roots.push_back(candidate);
+  }
+  return roots;
+}
 
-Status StatusGroup::as_status() const {
+static constexpr int kMaxAggregatedStatusMessageSize = 8 * 1024;
+
+// Summarize all the status objects in the StatusGroup. This is used when
+// individual Status objects in the StatusGroup are not already summarized.
+Status StatusGroup::as_summary_status() const {
   if (ok_) {
     return Status::OK();
   }
 
-  // Reduce verbosity when handling duplicate messages. If there is only a
-  // single message, or all messages have similar content, then return the
-  // longest status message.
-  std::vector<Status> sorted_children(children_);
-  std::sort(sorted_children.begin(), sorted_children.end(),
-            [](const Status& a, const Status& b) {
-              return a.error_message().length() > b.error_message().length();
-            });
-  bool single_status = true;
-  for (const auto& s : sorted_children) {
-    if (s.code() != sorted_children[0].code() ||
-        sorted_children[0].error_message().find(s.error_message()) ==
-            string::npos) {
-      single_status = false;
-      break;
-    }
+  std::vector<Status> nonderived_statuses = GetNonDerivedStatuses(children_);
+
+  // If only one root status is found, return it directly.
+  if (nonderived_statuses.size() == 1) {
+    return nonderived_statuses[0];
   }
 
-  if (single_status) {
-    return sorted_children[0];
-  }
+  if (!nonderived_statuses.empty()) {
+    std::vector<string> fmt;
 
-  std::vector<string> fmt;
+    fmt.push_back(strings::Printf("%zu root error(s) found.",
+                                  nonderived_statuses.size()));
 
-  // Compute a final output string with status codes sorted by frequency in
-  // increasing order.  This prefers more "interesting" messages over child
-  // messages that may come from cancellation.
-  std::map<error::Code, std::vector<Status>> code_to_status;
-  for (const Status& s : children_) {
-    code_to_status[s.code()].push_back(s);
-  }
-
-  std::vector<std::pair<error::Code, int>> count_vec;
-  count_vec.reserve(code_to_status.size());
-  for (auto& p : code_to_status) {
-    count_vec.push_back(std::make_pair(p.first, p.second.size()));
-  }
-
-  std::sort(
-      count_vec.begin(), count_vec.end(),
-      [](const std::pair<error::Code, int>& a,
-         const std::pair<error::Code, int>& b) { return a.second < b.second; });
-
-  fmt.push_back(
-      strings::Printf("Combined status information from %zu operations:\n",
-                      num_ok_ + children_.size()));
-
-  for (const auto& p : count_vec) {
-    // Deduplicate error messages
-    std::map<string, int> child_errors;
-    for (const Status& s : code_to_status[p.first]) {
-      ++child_errors[s.error_message()];
+    int index = 0;
+    for (auto& s : nonderived_statuses) {
+      fmt.emplace_back(strings::StrCat("  (", index, ") ", s.ToString()));
+      ++index;
     }
 
-    string child_fmt;
-    for (auto& m : child_errors) {
-      child_fmt.append(strings::Printf(
-          "  %s [%dx]",
-          str_util::StringReplace(m.first, "\n", "\n  ", true).c_str(),
-          m.second));
-      child_fmt.append("\n");
-    }
-    // Strip last newline.
-    child_fmt = child_fmt.substr(0, child_fmt.size() - 1);
+    fmt.push_back(strings::Printf("%zu successful operations.", num_ok_));
+    fmt.push_back(
+        strings::Printf("%zu derived errors ignored.",
+                        children_.size() - nonderived_statuses.size()));
 
-    if (child_fmt.size() > kMaxChildMessageSize) {
-      child_fmt =
-          strings::StrCat(child_fmt.substr(0, kMaxChildMessageSize), "...");
-    }
-    fmt.push_back(strings::Printf("Status code: %s [%dx]\n%s",
-                                  error_name(p.first).c_str(), p.second,
-                                  child_fmt.c_str()));
+    return Status(
+        nonderived_statuses[0].code(),
+        absl::StrJoin(fmt, "\n").substr(0, kMaxAggregatedStatusMessageSize));
+  } else {
+    // All statuses are derived. Pick the first available status to return.
+    return children_[0];
+  }
+}
+
+// Concatenate all the status objects in the StatusGroup. This is used when
+// individual Status objects in the StatusGroup are already summarized Status.
+Status StatusGroup::as_concatenated_status() const {
+  if (ok_) {
+    return Status::OK();
   }
 
-  fmt.push_back(strings::Printf("(%zd successful operations.)", num_ok_));
+  std::vector<Status> nonderived_statuses = GetNonDerivedStatuses(children_);
 
-  // TODO(power): use the least-frequently occurring status for the return code
-  return Status(children_[0].code(), str_util::Join(fmt, "\n"));
+  // If only one root status is found, return it directly.
+  if (nonderived_statuses.size() == 1) {
+    return nonderived_statuses[0];
+  }
+
+  if (!nonderived_statuses.empty()) {
+    std::vector<string> fmt;
+    fmt.emplace_back("\n=====================");
+    for (auto& s : nonderived_statuses) {
+      fmt.emplace_back(s.ToString());
+    }
+    fmt.emplace_back("=====================\n");
+    return Status(
+        nonderived_statuses[0].code(),
+        absl::StrJoin(fmt, "\n").substr(0, kMaxAggregatedStatusMessageSize));
+  } else {
+    // All statuses are derived. Pick the first available status to return.
+    // This should not happen in normal execution.
+    return children_[0];
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index fe3eec1..48174cb 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -100,11 +100,16 @@
 // Helper class to manage multiple child status values.
 class StatusGroup {
  public:
-  // Return a merged status with combined child status messages.
-  //
-  // The status code returned is OK if all children were successful, otherwise
-  // the first non-OK child status code is reported.
-  Status as_status() const;
+  // Utility function to mark a Status as derived. Statuses marked as derived
+  // are ignored when reporting errors to end users.
+  static Status MakeDerived(const Status& s);
+  static bool IsDerived(const Status& s);
+
+  // Return a merged status with combined child status messages with a summary.
+  Status as_summary_status() const;
+  // Return a merged status with combined child status messages with
+  // concatenation.
+  Status as_concatenated_status() const;
 
   bool ok() const { return ok_; }
 
diff --git a/tensorflow/core/lib/core/status_test.cc b/tensorflow/core/lib/core/status_test.cc
index 7c28184..c932458 100644
--- a/tensorflow/core/lib/core/status_test.cc
+++ b/tensorflow/core/lib/core/status_test.cc
@@ -14,9 +14,9 @@
 ==============================================================================*/
 
 #include "tensorflow/core/lib/core/status.h"
+
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -98,72 +98,67 @@
   ASSERT_NE(a, b);
 }
 
-TEST(StatusGroup, AcceptsFirstCode) {
+TEST(StatusGroup, OKStatusGroup) {
   StatusGroup c;
-  const Status internal(errors::Internal("Original error."));
-  c.Update(internal);
   c.Update(Status::OK());
   c.Update(Status::OK());
-  c.Update(Status::OK());
-  ASSERT_EQ(c.as_status().code(), internal.code());
-  ASSERT_EQ(c.ok(), false);
+  ASSERT_EQ(c.as_summary_status(), Status::OK());
+  ASSERT_EQ(c.as_concatenated_status(), Status::OK());
 }
 
-TEST(StatusGroup, ContainsChildMessages) {
+TEST(StatusGroup, AggregateWithSingleErrorStatus) {
+  const Status root_error(errors::Internal("Original error."));
+  StatusGroup group;
+
+  // A lone root error is returned as-is by the summary.
+  group.Update(root_error);
+  ASSERT_EQ(group.as_summary_status(), root_error);
+
+  Status concatenated = group.as_concatenated_status();
+  ASSERT_EQ(concatenated.code(), root_error.code());
+  ASSERT_TRUE(absl::StrContains(concatenated.error_message(),
+                                root_error.error_message()));
+
+  // Adding a derived error must not change what is reported.
+  const Status derived =
+      StatusGroup::MakeDerived(errors::Internal("Derived error."));
+  group.Update(derived);
+  ASSERT_EQ(group.as_summary_status(), root_error);
+
+  concatenated = group.as_concatenated_status();
+  ASSERT_EQ(concatenated.code(), root_error.code());
+  ASSERT_TRUE(absl::StrContains(concatenated.error_message(),
+                                root_error.error_message()));
+}
+
+TEST(StatusGroup, AggregateWithMultipleErrorStatus) {
   StatusGroup c;
   const Status internal(errors::Internal("Original error."));
   const Status cancelled(errors::Cancelled("Cancelled after 10 steps."));
   const Status aborted(errors::Aborted("Aborted after 10 steps."));
+
   c.Update(internal);
-  for (size_t i = 0; i < 5; ++i) {
-    c.Update(cancelled);
-  }
-  for (size_t i = 0; i < 10; ++i) {
-    c.Update(aborted);
-  }
-  for (size_t i = 0; i < 100; ++i) {
-    c.Update(Status::OK());
-  }
+  c.Update(cancelled);
+  c.Update(aborted);
 
-  ASSERT_EQ(c.as_status().code(), internal.code());
-  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
-                                    internal.error_message()));
-  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
-                                    cancelled.error_message()));
-  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
-                                    aborted.error_message()));
-  StatusGroup d;
-  d.Update(c.as_status());
-  c.Update(errors::FailedPrecondition("Failed!"));
-  d.Update(c.as_status());
-  c.Update(errors::DataLoss("Data loss!"));
-  d.Update(c.as_status());
-  LOG(INFO) << d.as_status();
-}
+  Status summary = c.as_summary_status();
 
-TEST(StatusGroup, ContainsIdenticalMessage) {
-  StatusGroup sg;
-  const Status internal(errors::Internal("Original error"));
-  for (size_t i = 0; i < 10; i++) {
-    sg.Update(internal);
-  }
-  EXPECT_EQ(sg.as_status(), internal);
-}
+  ASSERT_EQ(summary.code(), internal.code());
+  ASSERT_TRUE(
+      absl::StrContains(summary.error_message(), internal.error_message()));
+  ASSERT_TRUE(
+      absl::StrContains(summary.error_message(), cancelled.error_message()));
+  ASSERT_TRUE(
+      absl::StrContains(summary.error_message(), aborted.error_message()));
 
-TEST(StatusGroup, ContainsCommonPrefix) {
-  StatusGroup sg;
-  const Status a(errors::Internal("Original error"));
-  const Status b(errors::Internal("Original error is"));
-  const Status c(errors::Internal("Original error is invalid"));
-  sg.Update(a);
-  sg.Update(c);
-  sg.Update(c);
-  sg.Update(b);
-  sg.Update(c);
-  sg.Update(b);
-  sg.Update(a);
-  sg.Update(b);
-  EXPECT_EQ(sg.as_status(), c);
+  Status concat_status = c.as_concatenated_status();
+  ASSERT_EQ(concat_status.code(), internal.code());
+  ASSERT_TRUE(absl::StrContains(concat_status.error_message(),
+                                internal.error_message()));
+  ASSERT_TRUE(absl::StrContains(concat_status.error_message(),
+                                cancelled.error_message()));
+  ASSERT_TRUE(absl::StrContains(concat_status.error_message(),
+                                aborted.error_message()));
 }
 
 static void BM_TF_CHECK_OK(int iters) {
diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc
index db996b7..0e16d5b 100644
--- a/tensorflow/core/lib/core/threadpool_test.cc
+++ b/tensorflow/core/lib/core/threadpool_test.cc
@@ -40,7 +40,7 @@
   for (int num_threads = 1; num_threads < kNumThreads; num_threads++) {
     fprintf(stderr, "Testing with %d threads\n", num_threads);
     const int kWorkItems = 15;
-    bool work[kWorkItems];
+    std::atomic<bool> work[kWorkItems];
     for (int i = 0; i < kWorkItems; i++) {
       work[i] = false;
     }
@@ -50,8 +50,7 @@
         pool.Schedule([&outer_context, &work, i]() {
           Context inner_context(ContextKind::kThread);
           ASSERT_EQ(outer_context, inner_context);
-          ASSERT_FALSE(work[i]);
-          work[i] = true;
+          ASSERT_FALSE(work[i].exchange(true));
         });
       }
     }
@@ -65,7 +64,10 @@
   mutex mu;
   int64 num_shards = 0;
   int64 num_done_work = 0;
-  std::vector<bool> work(total, false);
+  std::vector<std::atomic<bool>> work(total);
+  for (int i = 0; i < total; i++) {
+    work[i] = false;
+  }
   threads->TransformRangeConcurrently(
       block_size, total,
       [=, &mu, &num_shards, &num_done_work, &work](int64 start, int64 end) {
@@ -75,14 +77,16 @@
         mutex_lock l(mu);
         ++num_shards;
         for (; start < end; ++start) {
-          EXPECT_FALSE(work[start]);  // No duplicate
+          EXPECT_FALSE(work[start].exchange(true));  // No duplicate
           ++num_done_work;
-          work[start] = true;
         }
       });
   LOG(INFO) << block_size << " " << total;
-  const int64 num_workers = (total + block_size - 1) / block_size;
   EXPECT_EQ(num_done_work, total);
+  for (int i = 0; i < total; i++) {
+    ASSERT_TRUE(work[i]);
+  }
+  const int64 num_workers = (total + block_size - 1) / block_size;
   if (num_workers < threads->NumThreads()) {
     // If the intention is to limit the parallelism explicitly, we'd
     // better honor it. Ideally, even if per_thread_max_parallelism >
@@ -129,7 +133,7 @@
   for (int num_threads = 1; num_threads < kNumThreads; num_threads++) {
     fprintf(stderr, "Testing with %d threads\n", num_threads);
     const int kWorkItems = 15;
-    bool work[kWorkItems];
+    std::atomic<bool> work[kWorkItems];
     ThreadPool pool(Env::Default(), "test", num_threads);
     for (int i = 0; i < kWorkItems; i++) {
       work[i] = false;
@@ -139,8 +143,7 @@
                        Context inner_context(ContextKind::kThread);
                        ASSERT_EQ(outer_context, inner_context);
                        for (int64 i = begin; i < end; ++i) {
-                         ASSERT_FALSE(work[i]);
-                         work[i] = true;
+                         ASSERT_FALSE(work[i].exchange(true));
                        }
                      });
     for (int i = 0; i < kWorkItems; i++) {
@@ -155,19 +158,18 @@
   for (int num_threads = 1; num_threads < kNumThreads; num_threads++) {
     fprintf(stderr, "Testing with %d threads\n", num_threads);
     const int kWorkItems = 15;
-    volatile std::atomic<bool> work[kWorkItems];
+    std::atomic<bool> work[kWorkItems];
     ThreadPool pool(Env::Default(), "test", num_threads);
     for (int i = 0; i < kWorkItems; i++) {
       work[i] = false;
     }
-    volatile std::atomic<bool> threads_running[kNumThreads + 1];
+    std::atomic<bool> threads_running[kNumThreads + 1];
     for (int i = 0; i < num_threads + 1; i++) {
       threads_running[i] = false;
     }
     pool.ParallelForWithWorkerId(
         kWorkItems, kHugeCost,
-        [&threads_running, &work, num_threads](int64 begin, int64 end,
-                                               int64 id) {
+        [&threads_running, &work](int64 begin, int64 end, int64 id) {
           // Store true for the current thread, and assert that another thread
           // is not running with the same id.
           ASSERT_LE(0, id);
diff --git a/tensorflow/core/lib/io/table.h b/tensorflow/core/lib/io/table.h
index b9c6b8d..788d192 100644
--- a/tensorflow/core/lib/io/table.h
+++ b/tensorflow/core/lib/io/table.h
@@ -17,16 +17,15 @@
 #define TENSORFLOW_CORE_LIB_IO_TABLE_H_
 
 #include <stdint.h>
+
 #include "tensorflow/core/lib/io/iterator.h"
 
 namespace tensorflow {
+
 class RandomAccessFile;
 
 namespace table {
 
-class Block;
-class BlockHandle;
-class Footer;
 struct Options;
 
 // A Table is a sorted map from strings to strings.  Tables are
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index d069db6..a489d2e 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -197,24 +197,21 @@
 
     // Now that the cache is empty we need to inflate more data.
 
-    // Step 1. Fill up input buffer.
-    // We read from stream only after the previously read contents have been
-    // completely consumed. This is an optimization and can be removed if
-    // it causes problems. `ReadFromStream` is capable of handling partially
-    // filled up buffers.
-    if (z_stream_def_->stream->avail_in == 0) {
-      TF_RETURN_IF_ERROR(ReadFromStream());
-    }
-
-    // Step 2. Setup output stream.
+    // Step 1. Setup output stream.
     z_stream_def_->stream->next_out = z_stream_def_->output.get();
     next_unread_byte_ = reinterpret_cast<char*>(z_stream_def_->output.get());
     z_stream_def_->stream->avail_out = output_buffer_capacity_;
 
-    // Step 3. Inflate Inflate Inflate!
+    // Step 2. Try to inflate some input data.
     TF_RETURN_IF_ERROR(Inflate());
 
-    bytes_to_read -= ReadBytesFromCache(bytes_to_read, result);
+    // Step 3. Read any data produced by inflate. If no progress was made by
+    // inflate, read more compressed data from the input stream.
+    if (NumUnreadBytes() == 0) {
+      TF_RETURN_IF_ERROR(ReadFromStream());
+    } else {
+      bytes_to_read -= ReadBytesFromCache(bytes_to_read, result);
+    }
   }
 
   return Status::OK();
@@ -224,7 +221,11 @@
 
 Status ZlibInputStream::Inflate() {
   int error = inflate(z_stream_def_->stream.get(), zlib_options_.flush_mode);
-  if (error != Z_OK && error != Z_STREAM_END) {
+  // Source: http://zlib.net/manual.html
+  // Z_BUF_ERROR: `inflate` returns Z_BUF_ERROR if no progress was made. This is
+  // not fatal and `inflate` can be called again with more input and output
+  // space to continue inflating.
+  if (error != Z_OK && error != Z_STREAM_END && error != Z_BUF_ERROR) {
     string error_string =
         strings::StrCat("inflate() failed with error ", error);
     if (z_stream_def_->stream->msg != nullptr) {
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 0f04897..2052219 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -16,9 +16,13 @@
 #ifndef TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
 #define TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 // We replace this implementation with a null implementation for mobile
 // platforms.
-#include "tensorflow/core/platform/platform.h"
 #ifdef IS_MOBILE_PLATFORM
 #include "tensorflow/core/lib/monitoring/mobile_counter.h"
 #else
@@ -49,7 +53,7 @@
 // This class is thread-safe.
 class CounterCell {
  public:
-  CounterCell(int64 value) : value_(value) {}
+  explicit CounterCell(int64 value) : value_(value) {}
   ~CounterCell() {}
 
   // Atomically increments the value by step.
diff --git a/tensorflow/core/lib/monitoring/gauge.h b/tensorflow/core/lib/monitoring/gauge.h
index 280b3d4..83edf68 100644
--- a/tensorflow/core/lib/monitoring/gauge.h
+++ b/tensorflow/core/lib/monitoring/gauge.h
@@ -16,9 +16,13 @@
 #ifndef TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
 #define TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 // We replace this implementation with a null implementation for mobile
 // platforms.
-#include "tensorflow/core/platform/platform.h"
 #ifdef IS_MOBILE_PLATFORM
 #include "tensorflow/core/lib/monitoring/mobile_gauge.h"
 #else
diff --git a/tensorflow/core/lib/monitoring/sampler.cc b/tensorflow/core/lib/monitoring/sampler.cc
index 23d3668..20c5f1a 100644
--- a/tensorflow/core/lib/monitoring/sampler.cc
+++ b/tensorflow/core/lib/monitoring/sampler.cc
@@ -15,9 +15,13 @@
 
 #include "tensorflow/core/lib/monitoring/sampler.h"
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 // We replace this implementation with a null implementation for mobile
 // platforms.
-#include "tensorflow/core/platform/platform.h"
 #ifdef IS_MOBILE_PLATFORM
 // Do nothing.
 #else
@@ -93,6 +97,12 @@
 }  // namespace
 
 // static
+std::unique_ptr<Buckets> Buckets::Explicit(std::vector<double> bucket_limits) {
+  Buckets* explicit_buckets = new ExplicitBuckets(std::move(bucket_limits));
+  return std::unique_ptr<Buckets>(explicit_buckets);
+}
+
+// static
 std::unique_ptr<Buckets> Buckets::Explicit(
     std::initializer_list<double> bucket_limits) {
   return std::unique_ptr<Buckets>(new ExplicitBuckets(bucket_limits));
diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h
index 7707e0a..c6f32d4 100644
--- a/tensorflow/core/lib/monitoring/sampler.h
+++ b/tensorflow/core/lib/monitoring/sampler.h
@@ -16,9 +16,13 @@
 #ifndef TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
 #define TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
 
+// clang-format off
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 // We replace this implementation with a null implementation for mobile
 // platforms.
-#include "tensorflow/core/platform/platform.h"
 #ifdef IS_MOBILE_PLATFORM
 #include "tensorflow/core/lib/monitoring/mobile_sampler.h"
 #else
@@ -92,6 +96,11 @@
   static std::unique_ptr<Buckets> Explicit(
       std::initializer_list<double> bucket_limits);
 
+  // This alternative Explicit Buckets factory method is primarily meant to be
+  // used by the CLIF layer code paths that are incompatible with
+  // initializer_lists.
+  static std::unique_ptr<Buckets> Explicit(std::vector<double> bucket_limits);
+
   virtual const std::vector<double>& explicit_bounds() const = 0;
 };
 
diff --git a/tensorflow/core/lib/monitoring/sampler_test.cc b/tensorflow/core/lib/monitoring/sampler_test.cc
index 0557835..8be15f9 100644
--- a/tensorflow/core/lib/monitoring/sampler_test.cc
+++ b/tensorflow/core/lib/monitoring/sampler_test.cc
@@ -61,7 +61,7 @@
 auto* init_sampler_without_labels =
     Sampler<0>::New({"/tensorflow/test/init_sampler_without_labels",
                      "Sampler without labels initialized as empty."},
-                    Buckets::Explicit({1.5, 2.8}));
+                    Buckets::Explicit(std::vector<double>{1.5, 2.8}));
 
 TEST(UnlabeledSamplerTest, InitializedEmpty) {
   Histogram empty;
@@ -112,7 +112,7 @@
   EqHistograms(expected, cell->value());
 }
 
-TEST(ExponentialSamplerTest, SameName) {
+TEST(ExplicitSamplerTest, SameName) {
   auto* same_sampler = Sampler<1>::New({"/tensorflow/test/sampler_with_labels",
                                         "Sampler with one label.", "MyLabel"},
                                        Buckets::Explicit({10.0, 20.0}));
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index a620f59..ef30805 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -52,7 +52,7 @@
 // You can convert to Hexadecimal output rather than Decimal output using Hex.
 // To do this, pass strings::Hex(my_int) as a parameter to StrCat. You may
 // specify a minimum field width using a separate parameter, so the equivalent
-// of Printf("%04x", my_int) is StrCat(Hex(my_int, strings::ZERO_PAD_4))
+// of Printf("%04x", my_int) is StrCat(Hex(my_int, strings::kZeroPad4))
 //
 // This class has implicit constructors.
 namespace tensorflow {
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index 0323a08..ca6e64c 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -18,6 +18,7 @@
 
 #ifdef GOOGLE_CUDA
 
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/cuda.h"
 #include "tensorflow/core/platform/env.h"
@@ -44,14 +45,10 @@
 
 // Contains data for a single stream used for nccl communication; this includes
 // a background thread that calls NcclManager::LoopKernelLaunches.
-struct NcclManager::NcclStream {
+struct NcclManager::NcclStream : public core::RefCounted {
  public:
-  NcclStream() {}
-  ~NcclStream() {
-    mutex_lock l(mu);
-    shutdown_requested = true;
-    cv.notify_all();
-  }
+  NcclStream() = default;
+  ~NcclStream() = default;
 
   se::StreamExecutor* executor = nullptr;
 
@@ -59,11 +56,13 @@
   // This is a different stream than the tensorflow compute stream.
   std::unique_ptr<se::Stream> stream;
 
-  // See NcclManager::LoopKernelLaunches for information on these.
-  std::unique_ptr<Thread> thread;
+  // `mu` protects access to `pending_launches_`, which is the list of
+  // collectives ready but whose kernels are yet to be launched.  When the
+  // NcclManager object that owns this NcclStream object is destroyed, it
+  // signals `cv` to unblock the thread waiting on more collectives.
   mutex mu;
   condition_variable cv;
-  // Has collective,participant_idx pairs.
+  // Has (collective, participant_idx) pairs.
   std::deque<std::pair<Collective*, int>> pending_launches_ GUARDED_BY(mu);
   bool shutdown_requested GUARDED_BY(mu) = false;
 };
@@ -76,7 +75,7 @@
   }
 
   ncclComm_t nccl_comm = nullptr;
-  // Owned by NcclManager::device_to_comm_streams_.
+  // Owned by NcclManager::device_to_comm_streams_ and LoopKernelLaunches.
   NcclStream* nccl_stream = nullptr;
 };
 
@@ -127,7 +126,7 @@
 // have a single `Collective` per step.  However, a collective that executes on
 // 3 nodes with 4 GPUs each would have a `Collective` per node, each of which is
 // tracking the 4 GPUs local to that node.
-struct NcclManager::Collective {
+struct NcclManager::Collective : public core::RefCounted {
   Collective(DataType data_type_in, CollectiveType type_in,
              ncclRedOp_t reduction_op_in, int num_local_devices_in,
              int num_global_devices_in, const string& communicator_key_in)
@@ -137,8 +136,7 @@
         num_local_devices(num_local_devices_in),
         num_global_devices(num_global_devices_in),
         single_node(num_local_devices_in == num_global_devices_in),
-        communicator_key(communicator_key_in),
-        remaining_participants(num_local_devices_in) {
+        communicator_key(communicator_key_in) {
     participants.reserve(num_local_devices_in);
   }
 
@@ -174,13 +172,23 @@
   int available_participants = 0;
   bool multi_node_ready = false;
 
-  mutable std::atomic_int_fast32_t remaining_participants;
-
   Status status;
 };
 
-NcclManager::NcclManager() {}
-NcclManager::~NcclManager() {}
+NcclManager::NcclManager() { VLOG(2) << "New NcclManager " << this; }
+NcclManager::~NcclManager() {
+  VLOG(2) << "~NcclManager " << this;
+  for (auto& device_streams : device_to_comm_streams_) {
+    for (NcclStream* stream : device_streams.second) {
+      {  // Flag shutdown and wake any waiter while holding the stream lock.
+        mutex_lock stream_lock(stream->mu);
+        stream->shutdown_requested = true;
+        stream->cv.notify_all();
+      }
+      stream->Unref();  // Release the reference held via the stream map.
+    }
+  }
+}
 NcclManager* NcclManager::instance() {
   static NcclManager* instance = new NcclManager();
   return instance;
@@ -276,8 +284,8 @@
     auto& streams = device_to_comm_streams_[executor];
     NcclStream* nccl_stream = nullptr;
     for (const auto& s : streams) {
-      if (used_streams.insert(s.get()).second) {
-        nccl_stream = s.get();
+      if (used_streams.insert(s).second) {
+        nccl_stream = s;
         break;
       }
     }
@@ -290,9 +298,11 @@
       streams.emplace_back(nccl_stream);
       used_streams.insert(nccl_stream);
 
-      nccl_stream->thread.reset(env->StartThread(
-          ThreadOptions(), "nccl_kernel_launch",
-          [this, nccl_stream] { LoopKernelLaunches(nccl_stream); }));
+      nccl_stream->Ref();
+      env->SchedClosure([this, nccl_stream]() {
+        LoopKernelLaunches(nccl_stream);
+        nccl_stream->Unref();
+      });
     }
 
     members[i].nccl_stream = nccl_stream;
@@ -384,9 +394,11 @@
     mutex_lock l(mu_);
     auto collective_it = collectives_.find(collective_key);
     if (collective_it != collectives_.end()) {
-      Collective* collective = collective_it->second.get();
+      Collective* collective = collective_it->second;
       collective->multi_node_ready = true;
-      to_run = CheckReady(collective_key, collective);
+      if (CheckReady(collective_key, collective)) {
+        to_run = collective;
+      }
     }
   }
 
@@ -404,14 +416,12 @@
     auto collective_it = collectives_.find(context.collective_key);
     Collective* collective = nullptr;
     if (collective_it == collectives_.end()) {
-      auto collective_unique_ptr = absl::make_unique<Collective>(
+      collective = new Collective(
           data_type, collective_type, reduction_op, context.num_local_devices,
           context.num_global_devices, context.communicator_key);
-      collective = collective_unique_ptr.get();
-      collectives_.emplace(context.collective_key,
-                           std::move(collective_unique_ptr));
+      collectives_.emplace(context.collective_key, collective);
     } else {
-      collective = collective_it->second.get();
+      collective = collective_it->second;
     }
 
     // Check `collective` is correct and consistent.
@@ -465,26 +475,25 @@
     collective->participants.emplace_back(std::move(participant));
     ++collective->available_participants;
 
-    to_run = CheckReady(context.collective_key, collective);
+    if (CheckReady(context.collective_key, collective)) {
+      to_run = collective;
+    }
   }
 
   if (to_run != nullptr) RunCollective(to_run);
 }
 
-NcclManager::Collective* NcclManager::CheckReady(const string& collective_key,
-                                                 Collective* collective) {
-  Collective* to_run = nullptr;
+bool NcclManager::CheckReady(const string& collective_key,
+                             Collective* collective) {
   if (collective->available_participants == collective->num_local_devices) {
     if (collective->num_global_devices == collective->num_local_devices ||
         collective->multi_node_ready) {
       // Ownership transferred to callee.
-      to_run = collective;
-      auto collectives_it = collectives_.find(collective_key);
-      collectives_it->second.release();
-      collectives_.erase(collectives_it);
+      collectives_.erase(collective_key);
+      return true;
     }
   }
-  return to_run;
+  return false;
 }
 
 void NcclManager::RunCollective(Collective* collective) {
@@ -498,7 +507,7 @@
     for (int i = 0; i < collective->num_local_devices; ++i) {
       collective->participants[i]->done_callback(s);
     }
-    delete collective;
+    collective->Unref();
     return;
   }
 
@@ -535,9 +544,13 @@
           collective->communicator->members[i].nccl_stream;
       mutex_lock l(nccl_stream->mu);
       nccl_stream->pending_launches_.push_front(std::make_pair(collective, i));
+      // Ownership is shared between LoopKernelLaunches for each stream in this
+      // collective.
+      collective->Ref();
       nccl_stream->cv.notify_all();
     }
   }
+  collective->Unref();
 }
 
 void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
@@ -550,6 +563,7 @@
     // Find collective to run.
     std::pair<Collective*, int> next_launch;
     {
+      VLOG(2) << "Locking mutex nccl_stream " << nccl_stream;
       mutex_lock l(nccl_stream->mu);
       while (nccl_stream->pending_launches_.empty()) {
         if (nccl_stream->shutdown_requested) {
@@ -626,15 +640,7 @@
         collective->participants[p_idx]->done_callback(errors::Unknown(
             "Error invoking NCCL: ", ncclGetErrorString(nccl_result)));
       }
-
-      // TODO(cwhipkey): use RefCounted after figuring out how to use in a
-      // custom op library.
-      // See tensorflow/core/lib/core/refcount.h for details on this locking.
-      if (collective->remaining_participants.load(std::memory_order_acquire) ==
-              1 ||
-          collective->remaining_participants.fetch_sub(1) == 1) {
-        delete collective;
-      }
+      collective->Unref();
     };
     p->event_mgr->ThenExecute(comm_stream, done_callback);
   }
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index f2f15f8..d968fac 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -198,13 +198,13 @@
                       ncclRedOp_t reduction_op);
 
   // If `collective` is ready to run, removes it from the `collectives_` map and
-  // returns the pointer.  Otherwise returns `nullptr`.
+  // returns true.  Otherwise returns false.
   // Assumes `collective_key` corresponds to `collective`.
   //
   // A collective is ready to run when all local participants have called Add*
   // function, and the collective is signalled globally ready via
   // `SetMultiNodeReady`.
-  Collective* CheckReady(const string& collective_key, Collective* collective)
+  bool CheckReady(const string& collective_key, Collective* collective)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Run <collective>.  This calls takes ownership of <collective>.
@@ -214,13 +214,12 @@
   mutex mu_;
 
   // Maps key to collectives currently being assembled or run.
-  std::unordered_map<string, std::unique_ptr<Collective>> collectives_
-      GUARDED_BY(mu_);
+  std::unordered_map<string, Collective*> collectives_ GUARDED_BY(mu_);
 
   // Maps a device to the communication streams that make up its collective.
   // This is used to share the stream across different communicators that
   // include the same device.
-  std::map<se::StreamExecutor*, std::vector<std::unique_ptr<NcclStream>>>
+  std::map<se::StreamExecutor*, std::vector<NcclStream*>>
       device_to_comm_streams_ GUARDED_BY(mu_);
 
   std::vector<std::unique_ptr<Communicator>> communicators_;
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index 6bf24ff..06564ee 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -70,6 +70,8 @@
     LOG(INFO) << "Running test with " << devices_->size() << " gpus";
   }
 
+  void SetUp() override { ASSERT_GT(devices_->size(), 0) << "No GPUs found"; }
+
   static int32 NumGPUs() { return static_cast<int32>(devices_->size()); }
 
   static void TearDownTestCase() { delete devices_; }
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index af9ad33..ccbf417 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -17,6 +17,9 @@
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mirror_pad_mode.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/strided_slice_op.h"
@@ -3123,6 +3126,37 @@
       return Status::OK();
     });
 
+REGISTER_OP("Fingerprint")
+    .Input("data: T")
+    .Input("method: string")
+    .Output("fingerprint: uint8")
+    .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      // `data` needs at least one dimension; `method` must be a scalar.
+      ShapeHandle ignored;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &ignored));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &ignored));
+      // Second output dim: unknown when the `method` tensor is unavailable.
+      DimensionHandle width;
+      const Tensor* method_tensor = c->input_tensor(1);
+      if (method_tensor != nullptr) {
+        if (method_tensor->dims() != 0) {
+          return errors::InvalidArgument("`method` must be rank 0: ",
+                                         method_tensor->shape());
+        }
+        const string& name = method_tensor->scalar<string>()();
+        if (name != "farmhash64") {
+          return errors::InvalidArgument("Unsupported method: ", name);
+        }
+        width = c->MakeDim(sizeof(uint64));
+      } else {
+        width = c->UnknownDim();
+      }
+      // Output shape is [dim 0 of `data`, width].
+      c->set_output(0, c->MakeShape({c->Dim(c->input(0), 0), width}));
+      return Status::OK();
+    });
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklConcat")
     .Input("concat_dim: int32")
diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc
index 39acf5f..8d04d97 100644
--- a/tensorflow/core/ops/bitwise_ops.cc
+++ b/tensorflow/core/ops/bitwise_ops.cc
@@ -30,6 +30,13 @@
   Input("x: T")                                                              \
       .Input("y: T")                                                         \
       .Output("z: T")                                                        \
+      .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \
+      .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+
+#define BINARY_BITWISE_COMMUTATIVE()                                         \
+  Input("x: T")                                                              \
+      .Input("y: T")                                                         \
+      .Output("z: T")                                                        \
       .SetIsCommutative()                                                    \
       .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -40,11 +47,11 @@
     .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
-REGISTER_OP("BitwiseAnd").BINARY_BITWISE();
+REGISTER_OP("BitwiseAnd").BINARY_BITWISE_COMMUTATIVE();
 
-REGISTER_OP("BitwiseOr").BINARY_BITWISE();
+REGISTER_OP("BitwiseOr").BINARY_BITWISE_COMMUTATIVE();
 
-REGISTER_OP("BitwiseXor").BINARY_BITWISE();
+REGISTER_OP("BitwiseXor").BINARY_BITWISE_COMMUTATIVE();
 
 REGISTER_OP("LeftShift").BINARY_BITWISE();
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index a1ca77f..f3a9d10 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -9075,6 +9075,44 @@
   }
 }
 op {
+  name: "BatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
   name: "BatchFFT"
   input_arg {
     name: "input"
@@ -19861,6 +19899,45 @@
   }
 }
 op {
+  name: "DecodePaddedRaw"
+  input_arg {
+    name: "input_bytes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "fixed_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
   name: "DecodePng"
   input_arg {
     name: "contents"
@@ -32864,6 +32941,37 @@
   is_commutative: true
 }
 op {
+  name: "LeftShift"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
   name: "Less"
   input_arg {
     name: "x"
@@ -41335,6 +41443,34 @@
   is_commutative: true
 }
 op {
+  name: "MulNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
   name: "MultiDeviceIterator"
   output_arg {
     name: "handle"
@@ -43788,6 +43924,59 @@
   }
 }
 op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
   name: "PaddingFIFOQueue"
   output_arg {
     name: "handle"
@@ -47239,6 +47428,87 @@
   is_commutative: true
 }
 op {
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
   name: "QuantizedAvgPool"
   input_arg {
     name: "input"
@@ -51859,6 +52129,87 @@
   is_commutative: true
 }
 op {
+  name: "QuantizedMul"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
   name: "QuantizedRelu"
   input_arg {
     name: "features"
@@ -65785,6 +66136,37 @@
   is_commutative: true
 }
 op {
+  name: "RightShift"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
   name: "Rint"
   input_arg {
     name: "x"
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 1c10eae..e98827a 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -167,6 +167,7 @@
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .Attr("slack_period: int = 0")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size should be a scalar.
@@ -264,6 +265,7 @@
     .Input("batch_size: int64")
     .Input("drop_remainder: bool")
     .Output("handle: variant")
+    .Attr("parallel_copy: bool = false")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -280,6 +282,7 @@
     .Input("num_shards: int64")
     .Input("index: int64")
     .Output("handle: variant")
+    .Attr("require_non_empty: bool = false")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -318,6 +321,7 @@
     .Input("padding_values: Toutput_types")
     .Input("drop_remainder: bool")
     .Output("handle: variant")
+    .Attr("parallel_copy: bool = false")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 5a125f1..239732a 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -364,6 +364,19 @@
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("SnapshotDataset")
+    .Input("input_dataset: variant")
+    .Input("path: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // The `path` input (index 1) must be a scalar string.
+      shape_inference::ShapeHandle scalar_path;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &scalar_path));
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("ExperimentalSqlDataset")
     .Input("driver_name: string")
     .Input("data_source_name: string")
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 66594b3..51ab3e2 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -208,6 +208,54 @@
   return Status::OK();
 }
 
+// Inputs: superdiag, maindiag and subdiag are [...,1,M]; rhs is [...,M,N].
+// Output is [...,M,N].
+Status TridiagonalMatMulShapeFn(InferenceContext* c) {
+  ShapeHandle superdiag;
+  ShapeHandle maindiag;
+  ShapeHandle subdiag;
+  ShapeHandle rhs;
+
+  // Check that rank is at least 2.
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &superdiag));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 2, &maindiag));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 2, &subdiag));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 2, &rhs));
+
+  // Extract batch dimensions; every diagonal's must be compatible with rhs's.
+  ShapeHandle superdiag_batch_shape;
+  ShapeHandle maindiag_batch_shape;
+  ShapeHandle subdiag_batch_shape;
+  ShapeHandle rhs_batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(superdiag, 0, -2, &superdiag_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(maindiag, 0, -2, &maindiag_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(subdiag, 0, -2, &subdiag_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(rhs, 0, -2, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(superdiag_batch_shape, rhs_batch_shape, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(maindiag_batch_shape, rhs_batch_shape, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(subdiag_batch_shape, rhs_batch_shape, &rhs_batch_shape));
+
+  // Check that diagonals have the same shape.
+  TF_RETURN_IF_ERROR(c->Merge(superdiag, maindiag, &maindiag));
+  TF_RETURN_IF_ERROR(c->Merge(subdiag, maindiag, &maindiag));
+
+  // Check that the tri-diagonal matrix size equals the height of rhs.
+  DimensionHandle m_lhs = c->Dim(maindiag, -1);
+  DimensionHandle m_rhs = c->Dim(rhs, -2);
+  TF_RETURN_IF_ERROR(c->Merge(m_lhs, m_rhs, &m_lhs));
+
+  // Check that next-to-last dimension of diagonals is 1.
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(maindiag, -2), 1, &unused));
+
+  // The output shape is the same as rhs shape.
+  c->set_output(0, rhs);
+  return Status::OK();
+}
+
 // The first input is [...,3,M] and second input is [...,M,K].
 // Output is [...,M,K].
 Status TridiagonalSolveShapeFn(InferenceContext* c) {
@@ -409,10 +457,20 @@
     .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SvdShapeFn);
 
+REGISTER_OP("TridiagonalMatMul")
+    .Input("superdiag: T")
+    .Input("maindiag: T")
+    .Input("subdiag: T")
+    .Input("rhs: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(TridiagonalMatMulShapeFn);
+
 REGISTER_OP("TridiagonalSolve")
     .Input("diagonals: T")
     .Input("rhs: T")
     .Output("output: T")
+    .Attr("partial_pivoting: bool = True")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn(TridiagonalSolveShapeFn);
 
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index 93732f9..682a994 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -314,6 +314,43 @@
            "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
 }
 
+TEST(LinalgOpsTest, TridiagonalMatMul_ShapeFn) {
+  ShapeInferenceTestOp op("TridiagonalMatMul");
+  INFER_OK(op, "?;?;?;?", "in3");
+  INFER_OK(op, "[1,5];[1,5];[1,5];[?,1]", "in3");
+  INFER_OK(op, "[1,5];[1,5];[1,5];[5,1]", "in3");
+
+  INFER_OK(op, "[?,1,?];[?,1,?];[?,1,?];[?,?,?]", "in3");
+  INFER_OK(op, "[?,1,5];[?,1,5];[?,1,5];[7,5,2]", "in3");
+  INFER_OK(op, "[7,1,5];[7,1,5];[7,1,5];[?,5,2]", "in3");
+  INFER_OK(op, "[7,1,5];[7,1,5];[7,1,5];[7,5,2]", "in3");
+
+  INFER_OK(op, "[7,?,1,5];[7,?,1,5];[7,?,1,5];[7,8,5,2]", "in3");
+  INFER_OK(op, "[7,8,1,5];[7,8,1,5];[7,8,1,5];[7,8,5,2]", "in3");
+
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op,
+              "[3];[3];[3];[5,1]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op,
+              "[3,5];[3,5];[3,5];[5]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [6,4] and [6,8].",
+      op, "[6,4,3,5];[6,4,3,5];[6,4,3,5];[6,8,5,2]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [?,4] and [6,8].",
+      op, "[?,4,3,5];[?,4,3,5];[?,4,3,5];[6,8,5,2]");
+
+  // Diagonals must have the same length.
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 5 and 6. "
+      "Shapes are [1,5] and [1,6]",
+      op, "[1,5];[1,6];[1,5];[6,2]");
+
+  // Diagonals must be 1-row matrices.
+  INFER_ERROR("Dimension must be 1 but is 3", op, "[3,5];[3,5];[3,5];[5,2]");
+}
+
 TEST(LinalgOpsTest, TridiagonalSolve_ShapeFn) {
   ShapeInferenceTestOp op("TridiagonalSolve");
   INFER_OK(op, "?;?", "in1");
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index 42a1b1d..da8b7d8 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -50,6 +50,7 @@
     .Input("input: string")
     .SetIsStateful()
     .Attr("output_stream: string = 'stderr'")
+    .Attr("end: string = '\n'")
     .SetShapeFn([](InferenceContext* c) {
       // Make sure that the input is a scalar.
       if (c->Rank(c->input(0)) != 0) {
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 99070b6..c63e3be 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -14,8 +14,11 @@
 ==============================================================================*/
 
 #include <vector>
+
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
@@ -789,7 +792,47 @@
                                const string& attr_adj_y, const string& x0,
                                bool ax0, const string& x1, bool ax1,
                                const string& y0, bool ay0, const string& y1,
-                               bool ay1) {
+                               bool ay1, bool enable_broadcasting) {
+  // The final outputs are "dx" and "dy". If we're broadcasting compute
+  // intermediate nodes for now.
+  std::vector<FDH::Node> nodes = {
+      {{(enable_broadcasting ? "gx" : "dx")},
+       opname,
+       {x0, x1},
+       {{"T", "$T"}, {attr_adj_x, ax0}, {attr_adj_y, ax1}}},
+      {{(enable_broadcasting ? "gy" : "dy")},
+       opname,
+       {y0, y1},
+       {{"T", "$T"}, {attr_adj_x, ay0}, {attr_adj_y, ay1}}},
+  };
+  // TODO(anudhyan): Figure out a way to inspect the static shapes of "x" and
+  // "y". If they have the same batch dimensions, then we can omit adding the
+  // broadcasting-specific ops.
+  if (enable_broadcasting) {
+    std::vector<FDH::Node> unbroadcast_gradients = {
+        FDH::Const<int32>("zero", gtl::ArraySlice<int32>{0}),
+        FDH::Const<int32>("one", gtl::ArraySlice<int32>{1}),
+        FDH::Const<int32>("minustwo", gtl::ArraySlice<int32>{-2}),
+        // Compute the batch shapes of the inputs (all but last two dims).
+        {{"sx"}, "Shape", {"x"}, {{"T", "$T"}}},
+        {{"sy"}, "Shape", {"y"}, {{"T", "$T"}}},
+        {{"batch_sx"},
+         "StridedSlice",
+         {"sx", "zero", "minustwo", "one"},
+         {{"T", DT_INT32}, {"Index", DT_INT32}}},
+        {{"batch_sy"},
+         "StridedSlice",
+         {"sy", "zero", "minustwo", "one"},
+         {{"T", DT_INT32}, {"Index", DT_INT32}}},
+        // Sum along dimensions that the inputs were broadcasted across.
+        {{"rx", "ry"}, "BroadcastGradientArgs", {"batch_sx", "batch_sy"}},
+        {{"sum_gx"}, "Sum", {"gx", "rx"}, {{"T", "$T"}}},
+        {{"sum_gy"}, "Sum", {"gy", "ry"}, {{"T", "$T"}}},
+        {{"dx"}, "Reshape", {"sum_gx", "sx"}, {{"T", "$T"}}},
+        {{"dy"}, "Reshape", {"sum_gy", "sy"}, {{"T", "$T"}}}};
+    nodes.insert(nodes.end(), unbroadcast_gradients.begin(),
+                 unbroadcast_gradients.end());
+  }
   *g = FDH::Define(
       // Arg defs
       {"x: T", "y: T", "dz: T"},
@@ -798,22 +841,13 @@
       // Attr defs
       {{"T: {half, float, double}"}},
       // Nodes
-      {
-          {{"dx"},
-           opname,
-           {x0, x1},
-           {{"T", "$T"}, {attr_adj_x, ax0}, {attr_adj_y, ax1}}},
-          {{"dy"},
-           opname,
-           {y0, y1},
-           {{"T", "$T"}, {attr_adj_x, ay0}, {attr_adj_y, ay1}}},
-      });
+      nodes);
   return Status::OK();
 }
 
 Status MatMulGradCommon(const string& opname, const string& attr_adj_x,
                         const string& attr_adj_y, const AttrSlice& attrs,
-                        FunctionDef* g) {
+                        FunctionDef* g, bool enable_broadcasting) {
   DataType T;
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "T", &T));
   if (T == DT_COMPLEX64 || T == DT_COMPLEX128) {
@@ -826,31 +860,39 @@
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, attr_adj_y, &tb));
   if (!ta && !tb) {
     return MatMulGradHelper(g, opname, attr_adj_x, attr_adj_y, "dz", false, "y",
-                            true, "x", true, "dz", false);
+                            true, "x", true, "dz", false, enable_broadcasting);
   }
   if (!ta && tb) {
     return MatMulGradHelper(g, opname, attr_adj_x, attr_adj_y, "dz", false, "y",
-                            false, "dz", true, "x", false);
+                            false, "dz", true, "x", false, enable_broadcasting);
   }
   if (ta && !tb) {
     return MatMulGradHelper(g, opname, attr_adj_x, attr_adj_y, "y", false, "dz",
-                            true, "x", false, "dz", false);
+                            true, "x", false, "dz", false, enable_broadcasting);
   }
   CHECK(ta && tb);
   return MatMulGradHelper(g, opname, attr_adj_x, attr_adj_y, "y", true, "dz",
-                          true, "dz", true, "x", true);
+                          true, "dz", true, "x", true, enable_broadcasting);
 }
 
 Status MatMulGrad(const AttrSlice& attrs, FunctionDef* g) {
-  return MatMulGradCommon("MatMul", "transpose_a", "transpose_b", attrs, g);
+  return MatMulGradCommon("MatMul", "transpose_a", "transpose_b", attrs, g,
+                          false /* enable_broadcasting */);
 }
 REGISTER_OP_GRADIENT("MatMul", MatMulGrad);
 
 Status BatchMatMulGrad(const AttrSlice& attrs, FunctionDef* g) {
-  return MatMulGradCommon("BatchMatMul", "adj_x", "adj_y", attrs, g);
+  return MatMulGradCommon("BatchMatMul", "adj_x", "adj_y", attrs, g,
+                          false /* enable_broadcasting */);
 }
 REGISTER_OP_GRADIENT("BatchMatMul", BatchMatMulGrad);
 
+Status BatchMatMulV2Grad(const AttrSlice& attrs, FunctionDef* g) {
+  return MatMulGradCommon("BatchMatMulV2", "adj_x", "adj_y", attrs, g,
+                          true /* enable_broadcasting */);
+}
+REGISTER_OP_GRADIENT("BatchMatMulV2", BatchMatMulV2Grad);
+
 // REGISTER_OP_GRADIENT("SparseMatMul", SparseMatMulGrad);
 
 // Comparison ops.
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 129d924..115dbd2 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -224,6 +225,29 @@
     *di = outputs[1];
   }
 
+  // Returns the sum of `x` over the dimensions listed in `axes`, computed by
+  // running a one-off graph (Placeholder -> Sum) in a fresh session.
+  Tensor ReduceSum(const Tensor& x, gtl::ArraySlice<int32> axes) {
+    // 1-D int32 tensor holding the reduction indices fed to "Sum".
+    const Tensor reduction_indices = test::AsTensor<int32>(axes);
+    const DataType dtype = x.dtype();
+    auto graph = test::function::GDef(
+        {
+            f::NDef("x", "Placeholder", {}, {{"dtype", dtype}}),
+            f::NDef("y", "Const", {},
+                    {{"dtype", DT_INT32}, {"value", reduction_indices}}),
+            f::NDef("z", "Sum", {"x", "y"}, {{"T", dtype}}),
+        },
+        {});
+    auto session = NewSession();
+    TF_CHECK_OK(session->Create(graph));
+    std::vector<Tensor> results;
+    TF_CHECK_OK(session->Run({{"x:0", x}}, {"z:0"}, {}, &results));
+    CHECK_EQ(results.size(), 1);
+    TF_CHECK_OK(session->Close());
+    return results[0];
+  }
+
   Tensor MatMulCommon(const string& opname, const string& attr_adj_x,
                       const string& attr_adj_y, const Tensor& x, bool ax,
                       const Tensor& y, bool ay) {
@@ -253,6 +277,10 @@
     return MatMulCommon("BatchMatMul", "adj_x", "adj_y", x, ax, y, ay);
   }
 
+  Tensor BatchMatMulV2(const Tensor& x, bool ax, const Tensor& y, bool ay) {
+    return MatMulCommon("BatchMatMulV2", "adj_x", "adj_y", x, ax, y, ay);
+  }
+
   void MatMulGradCommon(const string& opname, const string& attr_adj_x,
                         const string& attr_adj_y, const Tensor& x, bool ax,
                         const Tensor& y, bool ay, Tensor* dx, Tensor* dy) {
@@ -325,6 +353,12 @@
                             dy);
   }
 
+  void BatchMatMulV2Grad(const Tensor& x, bool ax, const Tensor& y, bool ay,
+                         Tensor* dx, Tensor* dy) {
+    return MatMulGradCommon("BatchMatMulV2", "adj_x", "adj_y", x, ax, y, ay, dx,
+                            dy);
+  }
+
   void SelectGrad(const Tensor& c, const Tensor& x, const Tensor& y, Tensor* dc,
                   Tensor* dx, Tensor* dy) {
     auto T = DT_FLOAT;
@@ -1179,6 +1213,139 @@
 }
 #endif  // TENSORFLOW_USE_SYCL
 
+TEST_F(MathGradTest, BatchMatMulV2_00) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({1, 2, 3}));
+  auto y = test::AsTensor<float>({-1.f, .5f, 2.f}, TensorShape({1, 3, 1}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, false, y, false, &dx, &dy);
+  auto dz = test::AsTensor<float>({1.f, 1.f}, TensorShape({1, 2, 1}));
+  test::ExpectClose(dx, BatchMatMulV2(dz, false, y, true));
+  test::ExpectClose(dy, BatchMatMulV2(x, true, dz, false));
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_01) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({1, 2, 3}));
+  auto y = test::AsTensor<float>({-1.f, .5f, 2.f}, TensorShape({1, 1, 3}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, false, y, true, &dx, &dy);
+  auto dz = test::AsTensor<float>({1.f, 1.f}, TensorShape({1, 2, 1}));
+  test::ExpectClose(dx, BatchMatMulV2(dz, false, y, false));
+  test::ExpectClose(dy, BatchMatMulV2(dz, true, x, false));
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_10) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({1, 3, 2}));
+  auto y = test::AsTensor<float>({-1.f, .5f, 2.f}, TensorShape({1, 3, 1}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, true, y, false, &dx, &dy);
+  auto dz = test::AsTensor<float>({1.f, 1.f}, TensorShape({1, 2, 1}));
+  test::ExpectClose(dx, BatchMatMulV2(y, false, dz, true));
+  test::ExpectClose(dy, BatchMatMulV2(x, false, dz, false));
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_11) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({1, 3, 2}));
+  auto y = test::AsTensor<float>({-1.f, .5f, 2.f}, TensorShape({1, 1, 3}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, true, y, true, &dx, &dy);
+  auto dz = test::AsTensor<float>({1.f, 1.f}, TensorShape({1, 2, 1}));
+  test::ExpectClose(dx, BatchMatMulV2(y, true, dz, true));
+  test::ExpectClose(dy, BatchMatMulV2(dz, true, x, true));
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_LhsBroadcasts) {
+  // x ([2,3]) has no batch dimension; y ([2,3,2]) has a batch of size 2.
+  const auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                       TensorShape({2, 3}));
+  const auto y = test::AsTensor<float>(
+      {1.f, 2.4f, 3.f, -1.f, .5f, 2.f, 3.f, 1.f, -1.f, 2.f, -.1f, 0.f},
+      TensorShape({2, 3, 2}));
+  Tensor dx, dy;
+  BatchMatMulV2Grad(x, false, y, false, &dx, &dy);
+  EXPECT_TRUE(dx.shape().IsSameSize(x.shape()));
+  EXPECT_TRUE(dy.shape().IsSameSize(y.shape()));
+  const auto dz = test::AsTensor<float>(
+      {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}, TensorShape({2, 2, 2}));
+  Tensor expected_dx;
+  CHECK(expected_dx.CopyFrom(
+      ReduceSum(BatchMatMulV2(dz, false, y, true), {0}), dx.shape()));
+  const Tensor expected_dy = BatchMatMulV2(x, true, dz, false);
+  test::ExpectClose(dx, expected_dx);
+  test::ExpectClose(dy, expected_dy);
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_RhsBroadcasts) {
+  // x ([2,2,3]) has a batch of size 2; y ([3,2]) has no batch dimension.
+  const auto x = test::AsTensor<float>(
+      {1.f, 2.4f, 3.f, -1.f, .5f, 2.f, 3.f, 1.f, -1.f, 2.f, -.1f, 0.f},
+      TensorShape({2, 2, 3}));
+  const auto y = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                       TensorShape({3, 2}));
+  Tensor dx, dy;
+  BatchMatMulV2Grad(x, false, y, false, &dx, &dy);
+  const auto dz = test::AsTensor<float>(
+      {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}, TensorShape({2, 2, 2}));
+  const Tensor expected_dx = BatchMatMulV2(dz, false, y, true);
+  Tensor expected_dy;
+  CHECK(expected_dy.CopyFrom(
+      ReduceSum(BatchMatMulV2(x, true, dz, false), {0}), dy.shape()));
+  test::ExpectClose(dx, expected_dx);
+  test::ExpectClose(dy, expected_dy);
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_BothLhsAndRhsBroadcast) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({2, 1, 1, 3}));
+  auto y = test::AsTensor<float>({3.f, 1.f, -1.f, 2.f, -.1f, 0},
+                                 TensorShape({1, 2, 3, 1}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, false, y, false, &dx, &dy);
+  EXPECT_TRUE(dx.shape().IsSameSize(x.shape()));
+  EXPECT_TRUE(dy.shape().IsSameSize(y.shape()));
+  auto dz =
+      test::AsTensor<float>({1.f, 1.f, 1.f, 1.f}, TensorShape({2, 2, 1, 1}));
+  Tensor ans_dx;
+  Tensor ans_dy;
+  CHECK(ans_dx.CopyFrom(ReduceSum(BatchMatMulV2(dz, false, y, true), {1}),
+                        dx.shape()));
+  CHECK(ans_dy.CopyFrom(ReduceSum(BatchMatMulV2(x, true, dz, false), {0}),
+                        dy.shape()));
+  test::ExpectClose(dx, ans_dx);
+  test::ExpectClose(dy, ans_dy);
+}
+
+TEST_F(MathGradTest, BatchMatMulV2_BroadcastWhileAdjointed) {
+  auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
+                                 TensorShape({2, 1, 3, 1}));
+  auto y = test::AsTensor<float>({3.f, 1.f, -1.f, 2.f, -.1f, 0},
+                                 TensorShape({1, 2, 1, 3}));
+  Tensor dx;
+  Tensor dy;
+  BatchMatMulV2Grad(x, true, y, true, &dx, &dy);
+  EXPECT_TRUE(dx.shape().IsSameSize(x.shape()));
+  EXPECT_TRUE(dy.shape().IsSameSize(y.shape()));
+
+  auto dz =
+      test::AsTensor<float>({1.f, 1.f, 1.f, 1.f}, TensorShape({2, 2, 1, 1}));
+  Tensor ans_dx;
+  Tensor ans_dy;
+  CHECK(ans_dx.CopyFrom(ReduceSum(BatchMatMulV2(y, true, dz, true), {1}),
+                        dx.shape()));
+  CHECK(ans_dy.CopyFrom(ReduceSum(BatchMatMulV2(dz, true, x, true), {0}),
+                        dy.shape()));
+  test::ExpectClose(dx, ans_dx);
+  test::ExpectClose(dy, ans_dy);
+}
+
 TEST_F(MathGradTest, Sum_dim0) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index dd94e66..29a9375 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -447,15 +447,14 @@
     .Input("y: T")
     .Output("z: T")
     .Attr("T: {half, float, double, complex64, complex128}")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("_MklMul")
     .BINARY_MORE()
     .Input("mkl_x: uint8")
     .Input("mkl_y: uint8")
     .Output("mkl_z: uint8")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns x * y element-wise.
@@ -490,12 +489,12 @@
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("_MklSquaredDifference")
     .BINARY_FEWER()
     .Input("mkl_x: uint8")
     .Input("mkl_y: uint8")
     .Output("mkl_z: uint8")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns (x - y)(x - y) element-wise.
@@ -526,9 +525,9 @@
     .Input("y: T")
     .Output("z: T")
     .Attr("T: {bfloat16, half, float, double, int32, int64}")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("_MklMaximum")
     .Input("x: T")
     .Input("y: T")
@@ -537,7 +536,6 @@
     .Output("z: T")
     .Output("mkl_z: uint8")
     .Attr("T: {half, float, double, int32, int64, bfloat16}")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns the max of x and y (i.e. x > y ? x : y) element-wise.
@@ -551,7 +549,6 @@
     .Input("y: T")
     .Output("z: T")
     .Attr("T: {bfloat16, half, float, double, int32, int64}")
-    .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Mod")
@@ -1619,6 +1616,7 @@
       return Status::OK();
     });
 
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("QuantizedMul")
     .Input("x: T1")
     .Input("y: T2")
@@ -1632,7 +1630,6 @@
     .Attr("T1: quantizedtype")
     .Attr("T2: quantizedtype")
     .Attr("Toutput: quantizedtype = DT_QINT32")
-    .SetIsCommutative()
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
       c->set_output(1, c->Scalar());
@@ -1640,6 +1637,7 @@
       return Status::OK();
     });
 
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("QuantizedAdd")
     .Input("x: T1")
     .Input("y: T2")
@@ -1653,7 +1651,6 @@
     .Attr("T1: quantizedtype")
     .Attr("T2: quantizedtype")
     .Attr("Toutput: quantizedtype = DT_QINT32")
-    .SetIsCommutative()
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
       // min_x, max_x, min_y, max_y should be scalar.
@@ -1770,6 +1767,7 @@
     .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
+// Note: This op is not commutative w.r.t. all its inputs.
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
     .Input("mkl_input: N * uint8")
@@ -1777,8 +1775,6 @@
     .Output("mkl_sum: uint8")
     .Attr("N: int >= 1")
     .Attr("T: numbertype")
-    .SetIsCommutative()
-    .SetIsAggregate()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle cur = c->input(c->num_inputs() - 1);
       for (int i = c->num_inputs() - 2; i >= 0; --i) {
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index ff85c4d..4d248b9 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -327,7 +327,8 @@
     .Attr("T: {float, double}")
     .Attr("num_args: int >= 0")
     .Attr("strides: list(int)")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .Attr("use_cudnn_on_gpu: bool = true")
@@ -335,7 +336,7 @@
     // Attributes for the FusedBatchNorm ------------------------------------ //
     .Attr("epsilon: float = 0.0001")
     // ---------------------------------------------------------------------- //
-    .SetShapeFn(shape_inference::Conv2DShape)
+    .SetShapeFn(shape_inference::Conv2DShapeWithExplicitPadding)
     .Doc(R"doc(
 *NOTE*: Do not invoke this operator directly in Python. Grappler is
 expected to create these operators.
@@ -2528,7 +2529,7 @@
     .Output("mkl_batch_variance: uint8")
     .Output("mkl_reserve_space_1: uint8")
     .Output("mkl_reserve_space_2: uint8")
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr(GetConvnetDataFormatAttrString())
@@ -2556,7 +2557,7 @@
     .Output("mkl_offset_backprop: uint8")
     .Output("mkl_reserve_space_3: uint8")
     .Output("mkl_reserve_space_4: uint8")
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr(GetConvnetDataFormatAttrString())
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 18fce62..c1cc30d 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3141,6 +3141,13 @@
     type: DT_VARIANT
   }
   attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
     name: "output_types"
     type: "list(type)"
     has_minimum: true
@@ -9225,6 +9232,45 @@
   }
 }
 op {
+  name: "DecodePaddedRaw"
+  input_arg {
+    name: "input_bytes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "fixed_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
   name: "DecodePng"
   input_arg {
     name: "contents"
@@ -16591,7 +16637,6 @@
       }
     }
   }
-  is_commutative: true
 }
 op {
   name: "Less"
@@ -20280,7 +20325,6 @@
       }
     }
   }
-  is_commutative: true
 }
 op {
   name: "MultiDeviceIterator"
@@ -22130,6 +22174,13 @@
     type: DT_VARIANT
   }
   attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
     name: "Toutput_types"
     type: "list(type)"
     has_minimum: true
@@ -23990,7 +24041,6 @@
       }
     }
   }
-  is_commutative: true
 }
 op {
   name: "QuantizedAvgPool"
@@ -26597,7 +26647,6 @@
       }
     }
   }
-  is_commutative: true
 }
 op {
   name: "QuantizedRelu"
@@ -33077,7 +33126,6 @@
       }
     }
   }
-  is_commutative: true
 }
 op {
   name: "Rint"
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index cd294a0..ff87544 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -20,6 +20,7 @@
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
@@ -40,6 +41,31 @@
       return Status::OK();
     });
 
+REGISTER_OP("DecodePaddedRaw")
+    .Input("input_bytes: string")
+    .Input("fixed_length: int32")
+    .Output("output: out_type")
+    .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}")
+    .Attr("little_endian: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      // Output: input_bytes.shape + [fixed_length / DataTypeSize(out_type)].
+      DimensionHandle num_bytes;
+      TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_bytes));
+
+      DataType out_type;
+      TF_RETURN_IF_ERROR(c->GetAttr("out_type", &out_type));
+      const int32 element_bytes = DataTypeSize(out_type);
+      // `fixed_length` must be an exact multiple of the element size.
+      DimensionHandle num_elements;
+      TF_RETURN_IF_ERROR(c->Divide(num_bytes, element_bytes,
+                                   /*evenly_divisible=*/true, &num_elements));
+      ShapeHandle result;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(c->input(0), c->Vector(num_elements), &result));
+      c->set_output(0, result);
+      return Status::OK();
+    });
+
 REGISTER_OP("DecodeCompressed")
     .Input("bytes: string")
     .Output("output: string")
diff --git a/tensorflow/core/ops/ragged_array_ops.cc b/tensorflow/core/ops/ragged_array_ops.cc
index 4642579..1e88890 100644
--- a/tensorflow/core/ops/ragged_array_ops.cc
+++ b/tensorflow/core/ops/ragged_array_ops.cc
@@ -29,13 +29,14 @@
 //==============================================================================
 
 REGISTER_OP("RaggedGather")
-    .Input("params_nested_splits: PARAMS_RAGGED_RANK * int64")
+    .Input("params_nested_splits: PARAMS_RAGGED_RANK * Tsplits")
     .Input("params_dense_values: Tvalues")
     .Input("indices: Tindices")
-    .Output("output_nested_splits: OUTPUT_RAGGED_RANK * int64")
+    .Output("output_nested_splits: OUTPUT_RAGGED_RANK * Tsplits")
     .Output("output_dense_values: Tvalues")
     .Attr("Tvalues: type")
     .Attr("Tindices: {int32, int64}")
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .Attr("PARAMS_RAGGED_RANK: int >= 1")
     .Attr("OUTPUT_RAGGED_RANK: int >= 0")
     .SetShapeFn(RaggedGatherShapeFn);
diff --git a/tensorflow/core/ops/ragged_conversion_ops.cc b/tensorflow/core/ops/ragged_conversion_ops.cc
index 90fd517..5794b89 100644
--- a/tensorflow/core/ops/ragged_conversion_ops.cc
+++ b/tensorflow/core/ops/ragged_conversion_ops.cc
@@ -23,21 +23,44 @@
 using shape_inference::ShapeHandle;
 
 Status RaggedTensorToSparseShapeFn(InferenceContext* c);
+Status RaggedTensorToVariantShapeFn(InferenceContext* c);
+Status RaggedTensorFromVariantShapeFn(InferenceContext* c);
 
 //==============================================================================
 // Registered Ops
 //==============================================================================
 
 REGISTER_OP("RaggedTensorToSparse")
-    .Input("rt_nested_splits: RAGGED_RANK * int64")
+    .Input("rt_nested_splits: RAGGED_RANK * Tsplits")
     .Input("rt_dense_values: T")
     .Output("sparse_indices: int64")
     .Output("sparse_values: T")
     .Output("sparse_dense_shape: int64")
     .Attr("RAGGED_RANK: int >= 1")
     .Attr("T: type")
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .SetShapeFn(RaggedTensorToSparseShapeFn);
 
+REGISTER_OP("RaggedTensorToVariant")
+    .Input("rt_nested_splits: RAGGED_RANK * Tsplits")
+    .Input("rt_dense_values: Tvalues")
+    .Output("encoded_ragged: variant")
+    .Attr("RAGGED_RANK: int >= 1")
+    .Attr("Tvalues: type")
+    .Attr("Tsplits: {int32, int64}")
+    .Attr("batched_input: bool")
+    .SetShapeFn(RaggedTensorToVariantShapeFn);
+
+REGISTER_OP("RaggedTensorFromVariant")
+    .Input("encoded_ragged: variant")
+    .Output("output_nested_splits: output_ragged_rank * Tsplits")
+    .Output("output_dense_values: Tvalues")
+    .Attr("input_ragged_rank: int >= -1")
+    .Attr("output_ragged_rank: int >= 1")
+    .Attr("Tvalues: type")
+    .Attr("Tsplits: {int32, int64}")
+    .SetShapeFn(RaggedTensorFromVariantShapeFn);
+
 //==============================================================================
 // Shape Functions
 //==============================================================================
@@ -71,4 +94,46 @@
   return Status::OK();
 }
 
+Status RaggedTensorToVariantShapeFn(InferenceContext* c) {
+  int64 ragged_rank;
+  TF_RETURN_IF_ERROR(c->GetAttr<int64>("RAGGED_RANK", &ragged_rank));
+  bool batched_input;
+  TF_RETURN_IF_ERROR(c->GetAttr<bool>("batched_input", &batched_input));
+  // Inputs are RAGGED_RANK splits (rank 1) followed by values (rank >= 1).
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(ragged_rank), 1, &unused));
+  for (int64 i = 0; i < ragged_rank; ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &unused));
+  }
+  if (!batched_input) {
+    c->set_output(0, c->Scalar());
+    return Status::OK();
+  }
+  // Batched: output is a vector with one fewer element than the first splits.
+  shape_inference::DimensionHandle num_rows;
+  TF_RETURN_IF_ERROR(c->Subtract(c->Dim(c->input(0), 0), 1, &num_rows));
+  c->set_output(0, c->Vector(num_rows));
+  return Status::OK();
+}
+
+Status RaggedTensorFromVariantShapeFn(InferenceContext* c) {
+  int64 in_rank;
+  TF_RETURN_IF_ERROR(c->GetAttr<int64>("input_ragged_rank", &in_rank));
+  int64 out_rank;
+  TF_RETURN_IF_ERROR(c->GetAttr<int64>("output_ragged_rank", &out_rank));
+  // Input 0 must have rank (out_rank - in_rank) when that can be checked.
+  ShapeHandle encoded = c->input(0);
+  if (c->RankKnown(encoded) && in_rank >= 0) {
+    ShapeHandle unused;
+    TF_RETURN_IF_ERROR(c->WithRank(encoded, out_rank - in_rank, &unused));
+  }
+  // Outputs: out_rank splits vectors of unknown length, then the values.
+  int64 output_index = 0;
+  while (output_index < out_rank) {
+    c->set_output(output_index++, c->UnknownShapeOfRank(1));
+  }
+  c->set_output(out_rank, c->UnknownShape());
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ragged_math_ops.cc b/tensorflow/core/ops/ragged_math_ops.cc
index d739c69..5ceb31b 100644
--- a/tensorflow/core/ops/ragged_math_ops.cc
+++ b/tensorflow/core/ops/ragged_math_ops.cc
@@ -32,9 +32,10 @@
     .Input("starts: T")
     .Input("limits: T")
     .Input("deltas: T")
-    .Output("rt_nested_splits: int64")
+    .Output("rt_nested_splits: Tsplits")
     .Output("rt_dense_values: T")
     .Attr("T: {bfloat16, float, double, int32, int64} = DT_INT32")
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .SetShapeFn(RaggedRangeShapeFn);
 
 //==============================================================================
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
index 80e766c..9537e61 100644
--- a/tensorflow/core/ops/stateful_random_ops.cc
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -69,6 +69,17 @@
       return Status::OK();
     });
 
+REGISTER_OP("RngSkip")  // Three inputs; this registration declares no outputs.
+    .Input("resource: resource")
+    .Input("algorithm: int64")
+    .Input("delta: int64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));  // `algorithm` must be a scalar
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));  // `delta` must be a scalar
+      return Status::OK();
+    });
+
 REGISTER_OP("NonDeterministicInts")
     .Input("shape: shape_dtype")
     .SetIsStateful()
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index d012ce6..4aefaad 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -206,6 +206,18 @@
       return Status::OK();
     });
 
+REGISTER_OP("StringLower")  // One string input, one string output.
+    .Input("input: string")
+    .Output("output: string")
+    .Attr("encoding: string =''")  // `encoding` defaults to the empty string
+    .SetShapeFn(shape_inference::UnchangedShape);  // presumably output shape == input shape; confirm in shape_inference
+
+REGISTER_OP("StringUpper")  // One string input, one string output.
+    .Input("input: string")
+    .Output("output: string")
+    .Attr("encoding: string =''")  // `encoding` defaults to the empty string
+    .SetShapeFn(shape_inference::UnchangedShape);  // presumably output shape == input shape; confirm in shape_inference
+
 REGISTER_OP("StringStrip")
     .Input("input: string")
     .Output("output: string")
@@ -263,10 +275,11 @@
 
 REGISTER_OP("UnicodeEncode")
     .Input("input_values: int32")
-    .Input("input_splits: int64")
+    .Input("input_splits: Tsplits")
     .Attr("errors: {'ignore', 'replace', 'strict'} = 'replace'")
     .Attr("output_encoding: {'UTF-8', 'UTF-16-BE', 'UTF-32-BE'}")
     .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .Output("output: string")
     .SetShapeFn([](InferenceContext* c) {
       // Check rank of inner values
@@ -298,12 +311,13 @@
 
 REGISTER_OP("UnicodeDecode")
     .Input("input: string")
-    .Output("row_splits: int64")
+    .Output("row_splits: Tsplits")
     .Output("char_values: int32")
     .Attr("input_encoding: string")
     .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
     .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
     .Attr("replace_control_characters: bool = false")
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       // row_splits.shape == [input.size() + 1]
       DimensionHandle num_row_splits;
@@ -319,13 +333,14 @@
 
 REGISTER_OP("UnicodeDecodeWithOffsets")
     .Input("input: string")
-    .Output("row_splits: int64")
+    .Output("row_splits: Tsplits")
     .Output("char_values: int32")
     .Output("char_to_byte_starts: int64")
     .Attr("input_encoding: string")
     .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
     .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
     .Attr("replace_control_characters: bool = false")
+    .Attr("Tsplits: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       // row_splits.shape == [input.size() + 1]
       DimensionHandle num_row_splits;
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache.cc b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
index 82b692a..5d92468 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
@@ -104,7 +104,9 @@
           mutex_lock l(mu_);
           // Do not update state if the block is already to be evicted.
           if (block->timestamp != 0) {
-            cache_size_ += block->data.size();
+            // Use capacity() instead of size() to account for all memory
+            // used by the cache.
+            cache_size_ += block->data.capacity();
             // Put to beginning of LRA list.
             lra_list_.erase(block->lra_iterator);
             lra_list_.push_front(key);
@@ -132,7 +134,9 @@
         block->mu.lock();  // Reacquire the lock immediately afterwards
         if (status.ok()) {
           block->data.resize(bytes_transferred, 0);
-          block->data.shrink_to_fit();
+          // Shrink the data capacity to the actual size used.
+          // NOLINTNEXTLINE: shrink_to_fit() may not shrink the capacity.
+          std::vector<char>(block->data).swap(block->data);
           downloaded_block = true;
           block->state = FetchState::FINISHED;
         } else {
@@ -285,7 +289,7 @@
   entry->second->timestamp = 0;
   lru_list_.erase(entry->second->lru_iterator);
   lra_list_.erase(entry->second->lra_iterator);
-  cache_size_ -= entry->second->data.size();
+  cache_size_ -= entry->second->data.capacity();
   block_map_.erase(entry);
 }
 
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index c9208cc..fd76047 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -36,6 +36,16 @@
 // value (e.g. `4`) may be returned.
 int NumSchedulableCPUs();
 
+// Returns an estimate for the maximum parallelism for this process.
+// Applications should avoid running more than this number of threads with
+// intensive workloads concurrently to avoid performance degradation and
+// contention.
+// This value is either the number of schedulable CPUs, or a value specific to
+// the underlying cluster management. Applications should assume this value can
+// change throughout the lifetime of the process. This function must not be
+// called during initialization, i.e., before main() has started.
+int MaxParallelism();
+
 // Returns the total number of CPUs on the system.  This number should
 // not change even if the underlying cluster management software may
 // change the number of schedulable CPUs.  Unlike `NumSchedulableCPUs`, if the
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 311daff..ba7fccd 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -520,6 +520,14 @@
         ], exclude = exclude),
     })
 
+def tf_additional_monitoring_hdrs():  # default platform: no extra monitoring headers
+    return []
+
+def tf_additional_monitoring_srcs():  # default platform: a single monitoring source file
+    return [
+        "platform/default/monitoring.cc",
+    ]
+
 def tf_additional_minimal_lib_srcs():
     return [
         "platform/default/integral_types.h",
@@ -601,6 +609,13 @@
 def tf_additional_device_tracer_test_flags():
     return []
 
+def tf_additional_profiler_lib_deps():  # host tracer always; device tracer only through if_cuda(...)
+    return [
+        "//tensorflow/core/profiler/internal/cpu:host_tracer",
+    ] + if_cuda([
+        "//tensorflow/core/profiler/internal/gpu:device_tracer",
+    ])
+
 def tf_additional_libdevice_data():
     return []
 
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 8a4ca40..d917d44 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -290,21 +290,6 @@
     ],
 )
 
-# Check that libtensorflow_framework.so does not depend on cuda shared libraries.
-check_deps(
-    name = "libtensorflow_cuda_check_deps",
-    disallowed_deps = [
-        ":cuda",
-        "@local_config_cuda//cuda:cublas",
-        "@local_config_cuda//cuda:cuda_driver",
-        "@local_config_cuda//cuda:cudnn",
-        "@local_config_cuda//cuda:curand",
-        "@local_config_cuda//cuda:cusolver",
-        "@local_config_cuda//cuda:cusparse",
-    ],
-    deps = ["//tensorflow:libtensorflow_framework.so"],
-)
-
 cc_library(
     name = "rocm",
     data = [],
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 162b339..2f13f12 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -29,6 +29,7 @@
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -38,7 +39,7 @@
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/profiler/internal/profiler_interface.h"
-#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
 namespace {
@@ -175,18 +176,15 @@
   Status Enable(CudaEventRecorder* recorder) {
     TF_RETURN_IF_ERROR(
         ToStatus(cuptiSubscribe(&subscriber_, &CuptiCallback, recorder)));
-    for (auto cbid :
-         {CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
-          CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice,
-          CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
-          CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2}) {
+    for (auto cbid : {CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2}) {
       TF_RETURN_IF_ERROR(ToStatus(cuptiEnableCallback(
           /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid)));
     }
@@ -244,11 +242,7 @@
                                      const CUpti_CallbackData& cbdata,
                                      CudaEventRecorder* recorder) {
     switch (cbid) {
-      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
-        TF_FALLTHROUGH_INTENDED;
-      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
-        TF_FALLTHROUGH_INTENDED;
-      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
+      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
         DCHECK_NE(cbdata.symbolName, nullptr);
         auto params =
             static_cast<const cuLaunchKernel_params*>(cbdata.functionParams);
@@ -326,14 +320,6 @@
 
 class TraceCollectorImpl : public tracing::TraceCollector {
  public:
-  class ActivityHandle : public Handle {
-   public:
-    ActivityHandle(std::string&& name, int level)
-        : trace_me_(std::move(name), level) {}
-
-   private:
-    profiler::TraceMe trace_me_;
-  };
   TraceCollectorImpl() : active_trace_session_(false) {
     tracing::SetTraceCollector(this);
   }
@@ -358,23 +344,10 @@
     return absl::make_unique<Impl>(ConcatenateNames(name_part1, name_part2));
   }
 
-  virtual std::unique_ptr<Handle> CreateActivityHandle(
-      StringPiece name_part1, StringPiece name_part2, bool is_expensive) const {
-    if (!IsEnabledForActivities(is_expensive)) {
-      return nullptr;
-    }
-    return absl::make_unique<ActivityHandle>(
-        ConcatenateNames(name_part1, name_part2), GetLevel(is_expensive));
-  }
-
   bool IsEnabledForAnnotations() const override {
     return active_trace_session_.load(std::memory_order_relaxed);
   }
 
-  bool IsEnabledForActivities(bool is_expensive) const override {
-    return profiler::TraceMeRecorder::Active(GetLevel(is_expensive));
-  }
-
   void Start() {
     DCHECK(!active_trace_session_)
         << "Unexpected active trace session detected.";
@@ -387,10 +360,6 @@
   }
 
  private:
-  static int GetLevel(bool is_expensive) {
-    return profiler::GetTFTraceMeLevel(is_expensive);
-  }
-
   std::atomic<bool> active_trace_session_;
 };
 
@@ -566,7 +535,7 @@
   }
 
   // Returns time in microseconds between events recorded on the GPU.
-  static uint64_t GetElasedTimeUs(CUevent start, CUevent stop) {
+  static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
     float elapsed_ms = 0.0f;
     LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
     return static_cast<uint64>(
@@ -613,8 +582,8 @@
     const auto& stream_info =
         stream_infos_.at(StreamKey(record.context, record.stream));
     auto start_us =
-        GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event);
-    auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event);
+        GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
+    auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
 
     auto stats = absl::make_unique<NodeExecStats>();
     std::string node_name = record.kernel_name;
@@ -642,8 +611,8 @@
     const auto& stream_info =
         stream_infos_.at(StreamKey(record.context, record.stream));
     auto start_us =
-        GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event);
-    auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event);
+        GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
+    auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
 
     auto stats = absl::make_unique<NodeExecStats>();
     std::string node_name = GetMemcpyName(record);
@@ -748,7 +717,11 @@
 }
 
 auto register_device_tracer_factory = [] {
-  RegisterProfilerFactory(&CreateDeviceTracer);
+  bool enable;  // filled in from the TF_ENABLE_OSS_GPU_PROFILER environment variable below
+  TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_OSS_GPU_PROFILER", true, &enable));
+  if (enable) {  // the device tracer factory is registered only when the flag reads true
+    RegisterProfilerFactory(&CreateDeviceTracer);
+  }
   return 0;
 }();
 
diff --git a/tensorflow/core/platform/default/fingerprint.h b/tensorflow/core/platform/default/fingerprint.h
deleted file mode 100644
index f901bef..0000000
--- a/tensorflow/core/platform/default/fingerprint.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_FINGERPRINT_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_FINGERPRINT_H_
-
-#include <farmhash.h>
-
-#include "tensorflow/core/lib/core/stringpiece.h"
-
-namespace tensorflow {
-
-inline uint64 Fingerprint64(StringPiece s) {
-  return ::util::Fingerprint64(s.data(), s.size());
-}
-
-inline Fprint128 Fingerprint128(StringPiece s) {
-  const auto fingerprint = ::util::Fingerprint128(s.data(), s.size());
-  return {::util::Uint128Low64(fingerprint),
-          ::util::Uint128High64(fingerprint)};
-}
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_FINGERPRINT_H_
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java b/tensorflow/core/platform/default/monitoring.cc
similarity index 76%
rename from tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java
rename to tensorflow/core/platform/default/monitoring.cc
index 211d7e6..71ece3e 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java
+++ b/tensorflow/core/platform/default/monitoring.cc
@@ -13,11 +13,14 @@
 limitations under the License.
 ==============================================================================*/
 
-package org.tensorflow.demo;
+#include "tensorflow/core/platform/monitoring.h"
 
-import java.util.List;
-import org.tensorflow.demo.Classifier.Recognition;
+namespace tensorflow {
+namespace monitoring {
 
-public interface ResultsView {
-  public void setResults(final List<Recognition> results);
-}
+void StartExporter() {}  // empty body: no-op in the default platform
+
+void ExportMetrics() {}  // empty body: no-op in the default platform
+
+}  // namespace monitoring
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/stacktrace.h b/tensorflow/core/platform/default/stacktrace.h
index b64bc15..808ef25 100644
--- a/tensorflow/core/platform/default/stacktrace.h
+++ b/tensorflow/core/platform/default/stacktrace.h
@@ -16,7 +16,10 @@
 #ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 
+// clang-format off
 #include "tensorflow/core/platform/platform.h"
+// clang-format on
+
 #if !defined(IS_MOBILE_PLATFORM) && !defined(PLATFORM_WINDOWS) && \
     defined(PLATFORM_POSIX) && (defined(__clang__) || defined(__GNUC__))
 #define TF_GENERATE_BACKTRACE
diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
index 696c774..d90e126 100644
--- a/tensorflow/core/platform/device_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -40,8 +40,17 @@
 
 namespace tensorflow {
 struct ProfilerContext;
+
+#if GOOGLE_CUDA
 std::unique_ptr<profiler::ProfilerInterface> CreateDeviceTracer(
     const ProfilerContext*);
+#else
+// Non-CUDA builds have no device tracer, so this stub always returns nullptr.
+std::unique_ptr<profiler::ProfilerInterface> CreateDeviceTracer(
+    const ProfilerContext*) {
+  return nullptr;
+}
+#endif
 
 namespace {
 
diff --git a/tensorflow/core/platform/fingerprint.h b/tensorflow/core/platform/fingerprint.h
index 720dc4c..ae41a8e 100644
--- a/tensorflow/core/platform/fingerprint.h
+++ b/tensorflow/core/platform/fingerprint.h
@@ -19,6 +19,16 @@
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 
+// The following line is used by copybara to set or unset the USE_OSS_FARMHASH
+// preprocessor symbol as needed. Please do not remove.
+#define USE_OSS_FARMHASH
+
+#ifdef USE_OSS_FARMHASH
+#include <farmhash.h>
+#else
+#include "util/hash/farmhash_fingerprint.h"
+#endif
+
 namespace tensorflow {
 
 struct Fprint128 {
@@ -37,13 +47,6 @@
   }
 };
 
-// This is a portable fingerprint interface for strings that will never change.
-// However, it is not suitable for cryptography.
-uint64 Fingerprint64(StringPiece s);
-
-// 128-bit variant of Fingerprint64 above (same properties and caveats apply).
-Fprint128 Fingerprint128(StringPiece s);
-
 namespace internal {
 // Mixes some of the bits that got propagated to the high bits back into the
 // low bits.
@@ -72,12 +75,33 @@
   return result;
 }
 
-}  // namespace tensorflow
-
-#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID)
-#include "tensorflow/core/platform/google/fingerprint.h"
+// This is a portable fingerprint interface for strings that will never change.
+// However, it is not suitable for cryptography.
+inline uint64 Fingerprint64(StringPiece s) {  // 64-bit fingerprint of the bytes in s
+#ifdef USE_OSS_FARMHASH
+  return ::util::Fingerprint64(s.data(), s.size());  // branch taken when USE_OSS_FARMHASH is defined
 #else
-#include "tensorflow/core/platform/default/fingerprint.h"
+  // Fingerprint op depends on the fact that Fingerprint64() is implemented by
+  // Farmhash. If the implementation ever changes, Fingerprint op should be
+  // modified to keep using Farmhash.
+  // LINT.IfChange
+  return farmhash::Fingerprint64(s.data(), s.size());
+  // LINT.ThenChange(//third_party/tensorflow/core/kernels/fingerprint_op.cc)
 #endif
+}
+
+// 128-bit variant of Fingerprint64 above (same properties and caveats apply).
+inline Fprint128 Fingerprint128(StringPiece s) {  // 128-bit fingerprint of the bytes in s
+#ifdef USE_OSS_FARMHASH
+  const auto fingerprint = ::util::Fingerprint128(s.data(), s.size());
+  return {::util::Uint128Low64(fingerprint),  // low 64 bits first, then high 64 bits
+          ::util::Uint128High64(fingerprint)};
+#else
+  const auto fingerprint = farmhash::Fingerprint128(s.data(), s.size());
+  return {absl::Uint128Low64(fingerprint), absl::Uint128High64(fingerprint)};
+#endif
+}
+
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_
diff --git a/tensorflow/core/platform/monitoring.h b/tensorflow/core/platform/monitoring.h
new file mode 100644
index 0000000..f012339
--- /dev/null
+++ b/tensorflow/core/platform/monitoring.h
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_MONITORING_H_
+#define TENSORFLOW_CORE_PLATFORM_MONITORING_H_
+
+namespace tensorflow {
+namespace monitoring {
+
+// Starts exporting metrics through a platform-specific monitoring API (if
+// provided). For builds using "tensorflow/core/platform/default", this is
+// currently a no-op. This function is idempotent.
+//
+// The TensorFlow runtime will call this the first time a new session is created
+// using the NewSession() method or an Eager Context is created.
+void StartExporter();
+
+// Manually invokes a one-time metrics export through a platform-specific
+// monitoring API (if provided). For builds using
+// "tensorflow/core/platform/default", this is currently a no-op.
+void ExportMetrics();
+
+}  // namespace monitoring
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_MONITORING_H_
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 1561632..13a9042 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -80,6 +80,8 @@
   return kDefaultCores;
 }
 
+int MaxParallelism() { return NumSchedulableCPUs(); }  // here: just the schedulable CPU count
+
 int NumTotalCPUs() {
   int count = absl::base_internal::NumCPUs();
   return (count <= 0) ? kUnknownCPU : count;
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 083284c..435dbc5 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -16,6 +16,7 @@
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <sys/mman.h>
 #if defined(__linux__)
@@ -62,7 +63,16 @@
     Status s;
     char* dst = scratch;
     while (n > 0 && s.ok()) {
-      ssize_t r = pread(fd_, dst, n, static_cast<off_t>(offset));
+      // Some platforms, notably macs, throw EINVAL if pread is asked to read
+      // more than fits in a 32-bit integer.
+      size_t requested_read_length;
+      if (n > INT32_MAX) {
+        requested_read_length = INT32_MAX;
+      } else {
+        requested_read_length = n;
+      }
+      ssize_t r =
+          pread(fd_, dst, requested_read_length, static_cast<off_t>(offset));
       if (r > 0) {
         dst += r;
         n -= r;
@@ -325,10 +335,9 @@
   string translated_target = TranslateName(target);
   // O_WRONLY | O_CREAT:
   //   Open file for write and if file does not exist, create the file.
-  // S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH:
-  //   Create the file with permission of 0644
-  int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT,
-                       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  // When creating file, use the same permissions as original
+  mode_t mode = sbuf.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
+  int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT, mode);
   if (target_fd < 0) {
     close(src_fd);
     return IOError(target, errno);
diff --git a/tensorflow/core/platform/protobuf_util.cc b/tensorflow/core/platform/protobuf_util.cc
index 5eccddf..e46a77f 100644
--- a/tensorflow/core/platform/protobuf_util.cc
+++ b/tensorflow/core/platform/protobuf_util.cc
@@ -19,15 +19,12 @@
 
 bool ParseProtoUnlimited(protobuf::MessageLite* proto,
                          const string& serialized) {
-  return ParseProtoUnlimited(proto, serialized.data(), serialized.size());
+  return proto->ParseFromString(serialized);  // returns whatever proto->ParseFromString reports
 }
 
 bool ParseProtoUnlimited(protobuf::MessageLite* proto, const void* serialized,
                          size_t size) {
-  protobuf::io::CodedInputStream coded_stream(
-      reinterpret_cast<const uint8*>(serialized), size);
-  coded_stream.SetTotalBytesLimit(INT_MAX, INT_MAX);
-  return proto->ParseFromCodedStream(&coded_stream);
+  return proto->ParseFromArray(serialized, size);  // parses `size` bytes starting at `serialized`
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index aefbe64..9b2886f 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -23,6 +23,7 @@
 #include <map>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
@@ -151,18 +152,10 @@
   virtual ~TraceCollector() {}
   virtual std::unique_ptr<Handle> CreateAnnotationHandle(
       StringPiece name_part1, StringPiece name_part2) const = 0;
-  virtual std::unique_ptr<Handle> CreateActivityHandle(
-      StringPiece name_part1, StringPiece name_part2,
-      bool is_expensive) const = 0;
 
   // Returns true if this annotation tracing is enabled for any op.
   virtual bool IsEnabledForAnnotations() const = 0;
 
-  // Returns true if this activity handle tracking is enabled for an op of the
-  // given expensiveness.
-  virtual bool IsEnabledForActivities(bool is_expensive) const = 0;
-
- protected:
   static string ConcatenateNames(StringPiece first, StringPiece second);
 
  private:
@@ -200,34 +193,10 @@
                                  : nullptr;
         }()) {}
 
-  bool IsEnabled() const { return static_cast<bool>(handle_); }
-
- private:
-  std::unique_ptr<TraceCollector::Handle> handle_;
-};
-
-// Adds an activity through the currently registered TraceCollector.
-// The activity starts when an object of this class is created and stops when
-// the object is destroyed.
-class ScopedActivity {
- public:
-  explicit ScopedActivity(StringPiece name, bool is_expensive = true)
-      : ScopedActivity(name, StringPiece(), is_expensive) {}
-
-  // If tracing is enabled, set up an activity with a label of
-  // "<name_part1>:<name_part2>".  This can be cheaper than the
-  // single-argument constructor because the concatenation of the
-  // label string is only done if tracing is enabled.
-  ScopedActivity(StringPiece name_part1, StringPiece name_part2,
-                 bool is_expensive = true)
-      : handle_([&] {
-          auto trace_collector = GetTraceCollector();
-          return trace_collector ? trace_collector->CreateActivityHandle(
-                                       name_part1, name_part2, is_expensive)
-                                 : nullptr;
-        }()) {}
-
-  bool IsEnabled() const { return static_cast<bool>(handle_); }
+  static bool IsEnabled() {  // true iff a collector exists and reports annotations enabled
+    auto* trace_collector = GetTraceCollector();
+    return trace_collector && trace_collector->IsEnabledForAnnotations();  // null collector => false
+  }
 
  private:
   std::unique_ptr<TraceCollector::Handle> handle_;
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index b902c85..08d0fad 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -55,6 +55,8 @@
   return system_info.dwNumberOfProcessors;
 }
 
+int MaxParallelism() { return NumSchedulableCPUs(); }  // here: just the schedulable CPU count
+
 int NumTotalCPUs() {
   // TODO(ebrevdo): Make this more accurate.
   //
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 9d972fb..eb98c8d 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -375,12 +375,12 @@
     visibility = [
         "//learning/brain/runtime:__pkg__",  # xprof_bridge
         "//perftools/accelerators/xprof/xprofilez:__pkg__",  # alias xprof::TraceMeRecorder
+        "//tensorflow/core:__pkg__",  # executor.cc
         "//tensorflow/core/profiler/internal/cpu:__pkg__",  # host_tracer
         "//tensorflow/core/profiler/lib:__pkg__",  # traceme
     ],
     deps = [
         "//tensorflow/core:lib",
-        "//tensorflow/stream_executor/lib",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
     ],
@@ -417,6 +417,8 @@
     srcs = [
         "profiler_interface.cc",
         "profiler_interface.h",
+        "traceme_recorder.cc",
+        "traceme_recorder.h",
     ],
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD
index 9a326c2..a07e51f 100644
--- a/tensorflow/core/profiler/internal/cpu/BUILD
+++ b/tensorflow/core/profiler/internal/cpu/BUILD
@@ -16,14 +16,14 @@
         "host_tracer.cc",
     ],
     deps = [
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/profiler/internal:profiler_interface",
         "//tensorflow/core/profiler/internal:traceme_recorder",
-        # "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = True,
 )
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
index 039b5a0..6fddd58 100644
--- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
@@ -22,6 +22,7 @@
 #include "tensorflow/core/profiler/internal/profiler_interface.h"
 #include "tensorflow/core/profiler/internal/traceme_recorder.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
 namespace profiler {
@@ -85,8 +86,8 @@
 constexpr char kUserMetadataMarker = '#';
 
 Status HostTracer::CollectData(RunMetadata* run_metadata) {
-  if (events_.empty() && recording_) {
-    events_ = TraceMeRecorder::Collect();
+  if (recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder not stopped");
   }
   // Pair up start and end events, and add complete events to trace_entries.
   absl::flat_hash_map<uint64, uint64> end_times;
@@ -146,7 +147,12 @@
 }
 
 auto register_host_tracer_factory = [] {
-  RegisterProfilerFactory(&CreateHostTracer);
+  bool enable;
+
+  TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_OSS_CPU_PROFILER", true, &enable));
+  if (enable) {
+    RegisterProfilerFactory(&CreateHostTracer);
+  }
   return 0;
 }();
 
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
index a63cd7b..8b0e027 100644
--- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
@@ -109,30 +109,6 @@
               MakeNodeStats("incomplete", thread_id, "key1=value1,key2"))));
 }
 
-void ValidateResult(const RunMetadata& run_metadata, const string& trace_name) {
-  uint32 thread_id = Env::Default()->GetCurrentThreadId();
-
-  EXPECT_THAT(
-      run_metadata.step_stats().dev_stats(0).node_stats(),
-      ElementsAre(EqualsNodeStats(MakeNodeStats(trace_name, thread_id))));
-}
-
-TEST(HostTracerTest, CollectsTraceMeEventsBetweenTracing) {
-  auto tracer = CreateHostTracer(nullptr);
-  RunMetadata run_metadata;
-  RunMetadata run_metadata2;
-
-  TF_ASSERT_OK(tracer->Start());
-  { TraceMe traceme("hello"); }
-  TF_ASSERT_OK(CollectData(tracer.get(), &run_metadata));
-  { TraceMe traceme("world"); }
-  TF_ASSERT_OK(CollectData(tracer.get(), &run_metadata2));
-  TF_ASSERT_OK(tracer->Stop());
-
-  ValidateResult(run_metadata, "hello");
-  ValidateResult(run_metadata2, "world");
-}
-
 }  // namespace
 }  // namespace cpu
 }  // namespace profiler
diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h
index b408969..4754f4f 100644
--- a/tensorflow/core/profiler/internal/profiler_interface.h
+++ b/tensorflow/core/profiler/internal/profiler_interface.h
@@ -19,8 +19,9 @@
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
+class EagerContext;
 struct ProfilerContext {
-  class EagerContext* eager_context = nullptr;
+  EagerContext* eager_context = nullptr;
 };
 
 namespace profiler {
diff --git a/tensorflow/core/profiler/internal/runtime/BUILD b/tensorflow/core/profiler/internal/runtime/BUILD
index aa670f8..085fed8 100644
--- a/tensorflow/core/profiler/internal/runtime/BUILD
+++ b/tensorflow/core/profiler/internal/runtime/BUILD
@@ -15,9 +15,10 @@
         "eager_profiler.cc",
     ],
     deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/profiler/internal:profiler_interface",
-        # "//tensorflow/stream_executor/lib",
     ],
     alwayslink = True,
 )
diff --git a/tensorflow/core/profiler/internal/runtime/eager_profiler.cc b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
index 8f09438..30182da 100644
--- a/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
+++ b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
@@ -13,7 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/profiler/internal/profiler_interface.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
 namespace profiler {
@@ -89,7 +91,12 @@
 }
 
 auto register_eager_profiler_factory = [] {
-  RegisterProfilerFactory(&CreateEagerProfiler);
+  bool enable;
+  TF_CHECK_OK(
+      ReadBoolFromEnvVar("TF_ENABLE_EAGER_RUNTIME_PROFILER", true, &enable));
+  if (enable) {
+    RegisterProfilerFactory(&CreateEagerProfiler);
+  }
   return 0;
 }();
 
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc
index c5979ea..b2a20c7 100644
--- a/tensorflow/core/profiler/internal/traceme_recorder.cc
+++ b/tensorflow/core/profiler/internal/traceme_recorder.cc
@@ -14,84 +14,70 @@
 ==============================================================================*/
 #include "tensorflow/core/profiler/internal/traceme_recorder.h"
 
-// To avoid unneccesary synchronization between threads, each thread has a
-// ThreadLocalRecorder that independently records its events.
-//
-// Events are stored in an EventQueue implemented as a linked-list of blocks,
-// with start and end pointers:
-//  [ events........ | next-]--> [ events......... | next ]
-//  ^start_block  ^start         ^end_block  ^end
-//
-// Record() writes at end, and then advances it, allocating a block if needed.
-// Clear() takes ownership of events in the range [start, end).
-// The end pointer is atomic so these can be concurrent.
-//
-// If a thread dies, the ThreadLocalRecorder's destructor hands its data off to
-// the orphaned_events list.
+#include <cstddef>
 
-#include <string>
-#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/stream_executor/lib/initialize.h"
 
 namespace tensorflow {
 namespace profiler {
 
-// Default value for g_trace_level when tracing is disabled
-constexpr static int kTracingDisabled = -1;
+std::atomic<int> TraceMeRecorder::trace_level_ =
+    ATOMIC_VAR_INIT(TraceMeRecorder::kTracingDisabled);
 
-namespace internal {
-std::atomic<int> g_trace_level = ATOMIC_VAR_INIT(kTracingDisabled);
-}  // namespace internal
+// TraceMeRecorder::trace_level_ must be lock-free for fast execution of the
+// TraceMe() public API. This assertion can be commented out if compilation
+// fails, but execution might be slow (even when host tracing is disabled).
+static_assert(ATOMIC_INT_LOCK_FREE == 2, "Assumed atomic<int> was lock free");
 
 namespace {
 
-class ThreadLocalRecorder;
-
-struct TraceMeContext {
-  // Lock for only rare events - start/stop, thread death.
-  mutex global_lock;
-  // Map of the static container instances (thread_local storage) for each
-  // thread, that store the trace events.
-  absl::flat_hash_map<uint64, ThreadLocalRecorder*> threads
-      GUARDED_BY(global_lock);
-  // Events traced from threads that died during tracing.
-  TraceMeRecorder::Events orphaned_events GUARDED_BY(global_lock);
-};
-
-static TraceMeContext* GetTraceMeContext() {
-  static TraceMeContext* singleton = new TraceMeContext();
-  return singleton;
-}
-
 // A single-producer single-consumer queue of Events.
-// Only the owner thread can write events, writing is lock-free.
-// Consume is also lock-free in this class.
 //
-// Internally, we have a linked list of blocks containing numbered slots.
-// start is the first occupied slot, end is the first unoccupied slot.
+// Implemented as a linked-list of blocks containing numbered slots, with start
+// and end pointers:
+//
+//  [ events........ | next-]--> [ events......... | next ]
+//  ^start_block_ ^start_         ^end_block_ ^end_
+//
+// start_ is the first occupied slot, end_ is the first unoccupied slot.
+//
+// Push writes at end_, and then advances it, allocating a block if needed.
+// PopAll takes ownership of events in the range [start_, end_).
+// The end_ pointer is atomic so Push and PopAll can be concurrent.
+//
+// Push and PopAll are lock-free and each may be called from at most one thread
+// at a time. Push is only called by the owner thread. PopAll is called by the
+// owner thread when it shuts down, or by the tracing control thread.
+//
+// Thus, PopAll might race with Push, so PopAll only removes events that were
+// in the queue when it was invoked. If Push is called while PopAll is active,
+// the new event remains in the queue. Thus, the tracing control thread should
+// call PopAll when tracing stops to remove events created during tracing, but
+// also when tracing starts again to clear any remaining events.
 class EventQueue {
  public:
   EventQueue()
-      : start_block_(new Block{0, nullptr}), end_block_(start_block_) {}
+      : start_block_(new Block{/*start=*/0, /*next=*/nullptr}),
+        start_(start_block_->start),
+        end_block_(start_block_),
+        end_(start_) {}
 
-  // REQUIRES: Consume() was called since the last Push().
+  // REQUIRES: PopAll() was called since the last Push().
   // Memory should be deallocated and trace events destroyed on destruction.
   // This doesn't require global lock as this discards all the stored trace
-  // events and we assume of destruction of this class only after the last
+  // events, and we assume this instance is destroyed only after the last
   // Push() has been called.
   ~EventQueue() {
-    DCHECK_EQ(start_, end_.load()) << "EventQueue destroyed without Consume()";
+    DCHECK(Empty()) << "EventQueue destroyed without PopAll()";
     delete end_block_;
   }
 
   // Add a new event to the back of the queue. Fast and lock-free.
   void Push(TraceMeRecorder::Event&& event) {
-    uint64 end = end_.load(std::memory_order_relaxed);
+    size_t end = end_.load(std::memory_order_relaxed);
     new (&end_block_->events[end++ - end_block_->start].event)
         TraceMeRecorder::Event(std::move(event));
-    if (ABSL_PREDICT_FALSE(end - end_block_->start == Block::kLength)) {
+    if (ABSL_PREDICT_FALSE(end - end_block_->start == Block::kNumSlots)) {
       auto* new_block = new Block{end, nullptr};
       end_block_->next = new_block;
       end_block_ = new_block;
@@ -99,41 +85,53 @@
     end_.store(end, std::memory_order_release);  // Write index after contents.
   }
 
-  // Retrieve and remove all events in the queue.
-  std::vector<TraceMeRecorder::Event> Consume() {
+  // Retrieve and remove all events in the queue at the time of invocation.
+  // If Push is called while PopAll is active, the new event will not be
+  // removed from the queue.
+  std::vector<TraceMeRecorder::Event> PopAll() {
     // Read index before contents.
-    uint64 end = end_.load(std::memory_order_acquire);
+    size_t end = end_.load(std::memory_order_acquire);
     std::vector<TraceMeRecorder::Event> result;
     result.reserve(end - start_);
     while (start_ != end) {
-      Shift(&result);
+      result.emplace_back(Pop());
     }
     return result;
   }
 
  private:
-  // Shift one event off the front of the queue into *out.
-  void Shift(std::vector<TraceMeRecorder::Event>* out) {
+  // Returns true if the queue is empty at the time of invocation.
+  bool Empty() const {
+    return (start_ == end_.load(std::memory_order_acquire));
+  }
+
+  // Remove one event off the front of the queue and return it.
+  // REQUIRES: The queue must not be empty.
+  TraceMeRecorder::Event Pop() {
+    DCHECK(!Empty());
     // Move the next event into the output.
     auto& event = start_block_->events[start_++ - start_block_->start].event;
-    out->push_back(std::move(event));
+    TraceMeRecorder::Event out = std::move(event);
     event.~Event();  // Events must be individually destroyed.
     // If we reach the end of a block, we own it and should delete it.
     // The next block is present: end always points to something.
-    if (start_ - start_block_->start == Block::kLength) {
+    if (ABSL_PREDICT_FALSE(start_ - start_block_->start == Block::kNumSlots)) {
       auto* next_block = start_block_->next;
       delete start_block_;
       start_block_ = next_block;
+      DCHECK_EQ(start_, start_block_->start);
     }
+    return out;
   }
 
-  // The number of slots in a block. Chosen so that the block fits in 64k.
   struct Block {
-    static constexpr size_t kLength =
-        ((1 << 16) - (sizeof(uint64) + sizeof(std::atomic<Block*>))) /
+    // The number of slots in a block is chosen so the block fits in 64 KiB.
+    static constexpr size_t kSize = 1 << 16;
+    static constexpr size_t kNumSlots =
+        (kSize - (sizeof(size_t) + sizeof(Block*))) /
         sizeof(TraceMeRecorder::Event);
 
-    const uint64 start;  // The number of the first slot.
+    size_t start;  // The number of the first slot.
     Block* next;
     // Defer construction of Event until the data is available.
     // Must also destroy manually, as the block may not fill entirely.
@@ -141,104 +139,107 @@
       MaybeEvent() {}
       ~MaybeEvent() {}
       TraceMeRecorder::Event event;
-    } events[kLength];
+    } events[kNumSlots];
   };
 
+  static_assert(sizeof(Block) <= Block::kSize, "");
+
   // Head of list for reading. Only accessed by consumer thread.
   Block* start_block_;
-  uint64 start_ = 0;
+  size_t start_;
   // Tail of list for writing. Accessed by producer thread.
   Block* end_block_;
-  std::atomic<uint64> end_ = {0};  // Atomic: also read by consumer thread.
+  std::atomic<size_t> end_;  // Atomic: also read by consumer thread.
 };
 
-class ThreadLocalRecorder {
+}  // namespace
+
+// To avoid unnecessary synchronization between threads, each thread has a
+// ThreadLocalRecorder that independently records its events.
+class TraceMeRecorder::ThreadLocalRecorder {
  public:
-  // The recorder is created the first time Record() is called on a thread.
+  // The recorder is created the first time TraceMeRecorder::Record() is called
+  // on a thread.
   ThreadLocalRecorder() {
     auto* env = Env::Default();
     info_.tid = env->GetCurrentThreadId();
     env->GetCurrentThreadName(&info_.name);
-    mutex_lock lock(GetTraceMeContext()->global_lock);
-    GetTraceMeContext()->threads.emplace(info_.tid, this);
+    TraceMeRecorder::Get()->RegisterThread(info_.tid, this);
   }
 
   // The destructor is called when the thread shuts down early.
-  // We unregister this thread, and move its events to orphaned_events.
-  ~ThreadLocalRecorder() {
-    mutex_lock lock(GetTraceMeContext()->global_lock);
-    GetTraceMeContext()->threads.erase(info_.tid);
-    GetTraceMeContext()->orphaned_events.push_back(Clear());
-  }
+  ~ThreadLocalRecorder() { TraceMeRecorder::Get()->UnregisterThread(Clear()); }
 
-  // This is the performance-critical part!
+  // Record is only called from the owner thread.
   void Record(TraceMeRecorder::Event&& event) { queue_.Push(std::move(event)); }
 
-  TraceMeRecorder::ThreadEvents Clear()
-      EXCLUSIVE_LOCKS_REQUIRED(GetTraceMeContext()->global_lock) {
-    return {info_, queue_.Consume()};
-  }
+  // Clear is called from the control thread when tracing starts/stops, or from
+  // the owner thread when it shuts down (see destructor).
+  TraceMeRecorder::ThreadEvents Clear() { return {info_, queue_.PopAll()}; }
 
  private:
   TraceMeRecorder::ThreadInfo info_;
   EventQueue queue_;
 };
 
-// Gather events from all active threads, and clear their buffers. The global
-// lock is held, so no threads can be added/removed for the duration while we
-// consume the collected trace entries. This will block any new thread and also
-// the starting and stopping of TraceMeRecorder, hence, this is performance
-// critical and should be kept fast.
-TraceMeRecorder::Events Clear()
-    EXCLUSIVE_LOCKS_REQUIRED(GetTraceMeContext()->global_lock) {
+/*static*/ TraceMeRecorder* TraceMeRecorder::Get() {
+  static TraceMeRecorder* singleton = new TraceMeRecorder;
+  return singleton;
+}
+
+void TraceMeRecorder::RegisterThread(int32 tid, ThreadLocalRecorder* thread) {
+  mutex_lock lock(mutex_);
+  threads_.emplace(tid, thread);
+}
+
+void TraceMeRecorder::UnregisterThread(TraceMeRecorder::ThreadEvents&& events) {
+  mutex_lock lock(mutex_);
+  threads_.erase(events.thread.tid);
+  orphaned_events_.push_back(std::move(events));
+}
+
+// This method is performance critical and should be kept fast. It is called
+// when tracing starts/stops. The mutex is held, so no threads can be
+// registered/unregistered. This prevents calling ThreadLocalRecorder::Clear
+// from two different threads.
+TraceMeRecorder::Events TraceMeRecorder::Clear() {
   TraceMeRecorder::Events result;
-  std::swap(GetTraceMeContext()->orphaned_events, result);
-  for (const auto& entry : GetTraceMeContext()->threads) {
+  std::swap(orphaned_events_, result);
+  for (const auto& entry : threads_) {
     auto* recorder = entry.second;
     result.push_back(recorder->Clear());
   }
   return result;
 }
 
-}  // namespace
-
-bool TraceMeRecorder::Start(int level) {
+bool TraceMeRecorder::StartRecording(int level) {
   level = std::max(0, level);
-  mutex_lock lock(GetTraceMeContext()->global_lock);
+  mutex_lock lock(mutex_);
+  // Change trace_level_ while holding mutex_.
   int expected = kTracingDisabled;
-  if (!internal::g_trace_level.compare_exchange_strong(
-          expected, level, std::memory_order_acq_rel)) {
-    return false;
+  bool started = trace_level_.compare_exchange_strong(
+      expected, level, std::memory_order_acq_rel);
+  if (started) {
+    // We may have old events in buffers because Record() raced with Stop().
+    Clear();
   }
-  // We may have old events in buffers because Record() raced with Stop().
-  Clear();
-  return true;
+  return started;
 }
 
-
 void TraceMeRecorder::Record(Event event) {
   static thread_local ThreadLocalRecorder thread_local_recorder;
   thread_local_recorder.Record(std::move(event));
 }
 
-// Only one thread is expected to call Stop() as first instance of XprofSession
-// prevents another XprofSession from doing any profiling.
-TraceMeRecorder::Events TraceMeRecorder::Stop() {
-  mutex_lock lock(GetTraceMeContext()->global_lock);
-  if (internal::g_trace_level.exchange(
-          kTracingDisabled, std::memory_order_acq_rel) == kTracingDisabled) {
-    return {};
-  }
-  return Clear();
-}
-
-TraceMeRecorder::Events TraceMeRecorder::Collect() {
-  mutex_lock lock(GetTraceMeContext()->global_lock);
-  if (internal::g_trace_level.load(std::memory_order_acquire) ==
+TraceMeRecorder::Events TraceMeRecorder::StopRecording() {
+  TraceMeRecorder::Events events;
+  mutex_lock lock(mutex_);
+  // Change trace_level_ while holding mutex_.
+  if (trace_level_.exchange(kTracingDisabled, std::memory_order_acq_rel) !=
       kTracingDisabled) {
-    return {};
+    events = Clear();
   }
-  return Clear();
+  return events;
 }
 
 }  // namespace profiler
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h
index 1e66b1e..3740297 100644
--- a/tensorflow/core/profiler/internal/traceme_recorder.h
+++ b/tensorflow/core/profiler/internal/traceme_recorder.h
@@ -16,17 +16,18 @@
 #define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
 
 #include <atomic>
+#include <cstddef>
+#include <string>
 #include <vector>
+
 #include "absl/base/optimization.h"
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace profiler {
 
-namespace internal {
-extern std::atomic<int> g_trace_level;
-}  // namespace internal
-
 // TraceMeRecorder is a singleton repository of TraceMe events.
 // It can be safely and cheaply appended to by multiple threads.
 //
@@ -49,45 +50,68 @@
     uint64 end_time;    // 0 = missing
   };
   struct ThreadInfo {
-    int64 tid;
+    int32 tid;
     string name;
   };
   struct ThreadEvents {
-    const ThreadInfo thread;
+    ThreadInfo thread;
     std::vector<Event> events;
   };
   using Events = std::vector<ThreadEvents>;
 
   // Starts recording of TraceMe().
   // Only traces <= level will be recorded.
-  // Level must be >= 0.
-  // If level is 0, no traces will be recorded.
-  static bool Start(int level);
+  // Level must be >= 0. If level is 0, no traces will be recorded.
+  static bool Start(int level) { return Get()->StartRecording(level); }
 
   // Stops recording and returns events recorded since Start().
-  static Events Stop();
-
-  // Returns events recorded till now without stopping the recording. Empty
-  // container is returned if the recorder was already stopped.
-  static Events Collect();
+  // Events passed to Record after Stop has started will be dropped.
+  static Events Stop() { return Get()->StopRecording(); }
 
   // Returns whether we're currently recording. Racy, but cheap!
   static inline bool Active(int level = 1) {
-    return ABSL_PREDICT_FALSE(
-        internal::g_trace_level.load(std::memory_order_acquire) >= level);
+    return ABSL_PREDICT_FALSE(trace_level_.load(std::memory_order_acquire) >=
+                              level);
   }
 
-  static void Record(Event);
+  // Records an event. Non-blocking.
+  static void Record(Event event);
 
  private:
+  // Default value for trace_level_ when tracing is disabled.
+  static constexpr int kTracingDisabled = -1;
+
+  class ThreadLocalRecorder;
+
+  // Returns singleton.
+  static TraceMeRecorder* Get();
+
+  TraceMeRecorder() = default;
+
   // No copy and assignment
   TraceMeRecorder(const TraceMeRecorder&) = delete;
   TraceMeRecorder& operator=(const TraceMeRecorder&) = delete;
 
-  // Implementation of g_trace_level must be lock-free for faster execution
-  // of the TraceMe() public API. This can be commented (if compilation is
-  // failing) but execution might be slow (even when host tracing is disabled).
-  static_assert(ATOMIC_INT_LOCK_FREE == 2, "Assumed atomic<int> was lock free");
+  void RegisterThread(int32 tid, ThreadLocalRecorder* thread);
+  void UnregisterThread(ThreadEvents&& events);
+
+  bool StartRecording(int level);
+  Events StopRecording();
+
+  // Gathers events from all active threads, and clears their buffers.
+  Events Clear() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Current trace level.
+  // Static atomic so TraceMeRecorder::Active can be fast and non-blocking.
+  // Modified by TraceMeRecorder singleton when tracing starts/stops.
+  static std::atomic<int> trace_level_;
+
+  mutex mutex_;
+  // Map of the static container instances (thread_local storage) for each
+  // thread. While active, a ThreadLocalRecorder stores trace events.
+  absl::flat_hash_map<int32, ThreadLocalRecorder*> threads_ GUARDED_BY(mutex_);
+  // Events from threads that died during recording.
+  TraceMeRecorder::Events orphaned_events_ GUARDED_BY(mutex_);
 };
 
 }  // namespace profiler
diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
index ec588af..6899658 100644
--- a/tensorflow/core/profiler/internal/traceme_recorder_test.cc
+++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
@@ -46,31 +46,6 @@
               ::testing::ElementsAre(Named("during1"), Named("during2")));
 }
 
-TEST(RecorderTest, CollectionBeforeStop) {
-  uint64 start_time = Env::Default()->NowNanos();
-  uint64 end_time = start_time + kNanosInSec;
-
-  TraceMeRecorder::Record({1, "ignored", start_time, end_time});
-  TraceMeRecorder::Start(/*level=*/1);
-  TraceMeRecorder::Record({2, "during1", start_time, end_time});
-  TraceMeRecorder::Record({3, "during2", start_time, end_time});
-  auto collected_results = TraceMeRecorder::Collect();
-  TraceMeRecorder::Record({4, "after_collect", start_time, end_time});
-  auto stopped_results = TraceMeRecorder::Stop();
-  TraceMeRecorder::Record({5, "after_stop", start_time, end_time});
-  auto results_after_stop = TraceMeRecorder::Collect();
-
-  ASSERT_EQ(collected_results.size(), 1);
-  EXPECT_THAT(collected_results[0].events,
-              ::testing::ElementsAre(Named("during1"), Named("during2")));
-
-  ASSERT_EQ(stopped_results.size(), 1);
-  EXPECT_THAT(stopped_results[0].events,
-              ::testing::ElementsAre(Named("after_collect")));
-
-  ASSERT_EQ(results_after_stop.size(), 0);
-}
-
 void SpinNanos(int nanos) {
   uint64 deadline = Env::Default()->NowNanos() + nanos;
   while (Env::Default()->NowNanos() < deadline) {
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
index 9f6c28b..1404034 100644
--- a/tensorflow/core/profiler/lib/BUILD
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -11,6 +11,10 @@
     "//tensorflow:tensorflow.bzl",
     "tf_cuda_library",
 )
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_additional_profiler_lib_deps",
+)
 
 tf_cuda_library(
     name = "profiler_session",
@@ -22,8 +26,6 @@
     ],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/core/profiler/internal/gpu:device_tracer",
-        "//tensorflow/core/profiler/internal/cpu:host_tracer",
         "//tensorflow/core/profiler/internal:profiler_interface",
         "//tensorflow/core/profiler:protos_all_cc",
     ] + select({
@@ -36,7 +38,6 @@
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:session_options",
-            "//tensorflow/core:device_tracer",
         ],
     }),
 )
@@ -44,10 +45,7 @@
 tf_cuda_library(
     name = "profiler_graph_lib",
     visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core/profiler/internal/cpu:host_tracer",
-        "//tensorflow/core/profiler/internal/gpu:device_tracer",
-    ],
+    deps = tf_additional_profiler_lib_deps(),
     alwayslink = 1,
 )
 
@@ -55,10 +53,8 @@
     name = "profiler_eager_lib",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/core/profiler/internal/cpu:host_tracer",
-        "//tensorflow/core/profiler/internal/gpu:device_tracer",
         "//tensorflow/core/profiler/internal/runtime:eager_profiler",
-    ],
+    ] + tf_additional_profiler_lib_deps(),
     alwayslink = 1,
 )
 
@@ -71,15 +67,11 @@
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/internal:traceme_recorder",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:optional",
     ],
 )
 
 filegroup(
     name = "mobile_srcs",
-    srcs = [
-        "profiler_session.cc",
-        "profiler_session.h",
-    ],
+    srcs = glob(["*"]),
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
index 53b4338..3913260 100644
--- a/tensorflow/core/profiler/lib/profiler_session.cc
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -177,7 +177,11 @@
   status_ = Status::OK();
 
   for (auto& profiler : profilers_) {
-    profiler->Start().IgnoreError();
+    auto start_status = profiler->Start();
+    if (!start_status.ok()) {
+      LOG(WARNING) << "Encountered error while starting profiler: "
+                   << start_status.ToString();
+    }
   }
 }
 
diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h
index b9fae3d..5a5ba52 100644
--- a/tensorflow/core/profiler/lib/traceme.h
+++ b/tensorflow/core/profiler/lib/traceme.h
@@ -18,7 +18,6 @@
 #include <string>
 
 #include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -27,13 +26,25 @@
 namespace tensorflow {
 namespace profiler {
 
-// This is specifically used in xprof_bridge for instrumenting Tensorflow ops.
+// This is specifically used for instrumenting Tensorflow ops.
 // Takes input as whether a TF op is expensive or not and returns the TraceMe
 // level to be assigned to trace that particular op. Assigns level 2 for
-// expensive ops (these are high-level details and shown by default in xprof
+// expensive ops (these are high-level details and shown by default in profiler
 // UI). Assigns level 3 for cheap ops (low-level details not shown by default).
 inline int GetTFTraceMeLevel(bool is_expensive) { return is_expensive ? 2 : 3; }
 
+// Predefined levels:
+// - Level 1 (kCritical) is the default and used only for user instrumentation.
+// - Level 2 (kInfo) is used by profiler for instrumenting high level program
+//   execution details (expensive TF ops, XLA ops, etc).
+// - Level 3 (kVerbose) is also used by profiler to instrument more verbose
+//   (low-level) program execution details (cheap TF ops, etc).
+enum TraceMeLevel {
+  kCritical = 1,
+  kInfo = 2,
+  kVerbose = 3,
+};
+
 // This class permits user-specified (CPU) tracing activities. A trace activity
 // is started when an object of this class is created and stopped when the
 // object is destroyed.
@@ -63,12 +74,8 @@
   // in the UI. Level defines the trace priority, used for filtering TraceMe
   // events. By default, traces with TraceMe level <= 2 are recorded. Levels:
   // - Must be a positive integer.
-  // - Level 1 is the default and used only for user instrumentation.
-  // - Level 2 is used by xprof for instrumenting high level program execution
-  //   details (expensive TF ops, XLA ops, etc).
-  // - Level 3 is also used by xprof to instrument more verbose (low-level)
-  //   program execution details (cheap TF ops, etc).
-  // Users are welcome to use level >= 2 in their code, if they wish to filter
+  // - Can be a value in enum TraceMeLevel.
+  // Users are welcome to use level > 3 in their code, if they wish to filter
   // out their host traces based on verbosity.
   explicit TraceMe(absl::string_view activity_name, int level = 1) {
     DCHECK_GE(level, 1);
@@ -113,7 +120,7 @@
   // type that the string() constructor can take.
   // name_generator is templated, rather than a std::function to avoid
   // allocations std::function might make even if never called.
-  // Usage: xprof::TraceMe([&]{ return StrCat(prefix, ":", postfix); });
+  // Usage: profiler::TraceMe([&]{ return StrCat(prefix, ":", postfix); });
   template <typename NameGeneratorT>
   explicit TraceMe(NameGeneratorT name_generator, int level = 1) {
     DCHECK_GE(level, 1);
@@ -125,7 +132,10 @@
     }
   }
 
-  ~TraceMe() {
+  // Stop tracing the activity. Called by the destructor, but exposed to allow
+  // stopping tracing before the object goes out of scope. Only has an effect
+  // the first time it is called.
+  void Stop() {
     // We do not need to check the trace level again here.
     // - If tracing wasn't active to start with, we have kUntracedActivity.
     // - If tracing was active and was stopped, we have
@@ -133,16 +143,19 @@
     // - If tracing was active and was restarted at a lower level, we may
     //   spuriously record the event. This is extremely rare, and acceptable as
     //   event will be discarded when its start timestamp fall outside of the
-    //   start/stop session timestamp (recorded in XprofResponse).
+    //   start/stop session timestamp.
     if (start_time_ != kUntracedActivity) {
       if (TraceMeRecorder::Active()) {
         TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name),
                                  start_time_, Env::Default()->NowNanos()});
       }
       no_init_.name.~string();
+      start_time_ = kUntracedActivity;
     }
   }
 
+  ~TraceMe() { Stop(); }
+
   // TraceMe is not movable or copyable.
   TraceMe(const TraceMe &) = delete;
   TraceMe &operator=(const TraceMe &) = delete;
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
index 56bdc39..cb6c145 100644
--- a/tensorflow/core/profiler/rpc/BUILD
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -31,7 +31,6 @@
         "//tensorflow:grpc++",
         "//tensorflow/core:framework",
         "//tensorflow/core:grpc_services",
-        # "//tensorflow/core:platform_env",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/profiler:protos_all_cc",
         "//tensorflow/core/profiler/lib:profiler_eager_lib",
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 6afca6c..4d62124 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -165,9 +165,26 @@
     // is really not subject to pending use.
     bool timestamped_allocator = 5;
 
-    // If > 0 limit the number of pending kernels on any compute
-    // stream to this number.
-    int32 pending_cap = 6;
+    // reserved id: 6
+
+    // Parameters for GPUKernelTracker.  By default no kernel tracking is done.
+    // Note that timestamped_allocator is only effective if some tracking is
+    // specified.
+    //
+    // If kernel_tracker_max_interval = n > 0, then a tracking event
+    // is inserted after every n kernels without an event.
+    int32 kernel_tracker_max_interval = 7;
+    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
+    // inserted after every series of kernels allocating a sum of
+    // memory >= n.  If one kernel allocates b * n bytes, then one
+    // event will be inserted after it, but it will count as b against
+    // the pending limit.
+    int32 kernel_tracker_max_bytes = 8;
+    // If kernel_tracker_max_pending > 0 then no more than this many
+    // tracking events can be outstanding at a time.  An attempt to
+    // launch an additional kernel will stall until an event
+    // completes.
+    int32 kernel_tracker_max_pending = 9;
   }
 
   // Everything inside experimental is subject to change and is not subject
@@ -330,6 +347,7 @@
   // inter_op_parallelism_threads available in each process.
   //
   // 0 means the system picks an appropriate number.
+  // Negative means all operations are performed in caller's thread.
   //
   // Note that the first Session created in the process sets the
   // number of threads for all future sessions unless use_per_session_threads is
@@ -474,6 +492,12 @@
     // but in the case where there is a lot of spinning may result in lower
     // CPU usage.
     bool disable_thread_spinning = 9;
+
+    // When true, WorkerSessions are created with device attributes from the
+    // full cluster.
+    // This is helpful when a worker wants to partition a graph
+    // (for example during a PartitionedCallOp).
+    bool share_cluster_devices_in_session = 10;
   };
 
   Experimental experimental = 16;
diff --git a/tensorflow/core/protobuf/data/experimental/snapshot.proto b/tensorflow/core/protobuf/data/experimental/snapshot.proto
new file mode 100644
index 0000000..dde0ade
--- /dev/null
+++ b/tensorflow/core/protobuf/data/experimental/snapshot.proto
@@ -0,0 +1,21 @@
+syntax = "proto3";
+
+package tensorflow.data.experimental;
+
+import "tensorflow/core/framework/tensor.proto";
+
+// Each SnapshotRecord represents one batch of pre-processed input data. A batch
+// consists of a list of tensors that we encode as TensorProtos. This message
+// doesn't store the structure of the batch.
+message SnapshotRecord {
+  repeated .tensorflow.TensorProto tensor = 1;
+}
+
+// This stores the metadata information present in each snapshot record.
+message SnapshotMetadataRecord {
+  string graph_fingerprint = 1;
+  string run_id = 2;
+  int64 creation_timestamp = 3;
+
+  bool finalized = 1000;
+}
diff --git a/tensorflow/core/protobuf/saved_object_graph.proto b/tensorflow/core/protobuf/saved_object_graph.proto
index 5acc879..720f54c 100644
--- a/tensorflow/core/protobuf/saved_object_graph.proto
+++ b/tensorflow/core/protobuf/saved_object_graph.proto
@@ -152,7 +152,11 @@
 }
 
 // A SavedResource represents a TF object that holds state during its lifetime.
+// An object of this type can have a reference to a:
+// create_resource() and an initialize() function.
 message SavedResource {
-  // An object of this type can have a reference to a:
-  // create_resource() and an initialize() function.
+  // A device specification indicating a required placement for the resource
+  // creation function, e.g. "CPU". An empty string allows the user to select a
+  // device.
+  string device = 1;
 }
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e7685d5..0bea9aa 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -67,6 +67,9 @@
   // If true, any resources such as Variables used in the session will not be
   // shared with other sessions.
   bool isolate_session_state = 3;
+
+  // The device attributes of all the devices in the cluster.
+  repeated DeviceAttributes cluster_device_attributes = 4;
 }
 
 message CreateWorkerSessionResponse {}
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 95b443e..24dab89 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -36,8 +36,6 @@
   (TF_STR(TF_MAJOR_VERSION) "." TF_STR(TF_MINOR_VERSION) "." TF_STR( \
       TF_PATCH_VERSION) TF_VERSION_SUFFIX)
 
-// TODO(josh11b): Public API functions for exporting the above.
-
 // GraphDef compatibility versions (the versions field in graph.proto).
 //
 // Each graph has producer and min_consumer versions, and each
@@ -100,12 +98,17 @@
 //     deprecated in favor of V2 ops. (2018/01/23)
 // 28. Deprecate MatrixExponential op in favor of Python implementation.
 //     (2018/08/21).
+// (2019/02/15). Added `control_ret` field to FunctionDef proto, and
+//     `control_output` field to OpDef proto.
 // 29. Deprecate StatefulStandardNormal op in favor of StatefulStandardNormalV2.
 //     (2019/03/25).
+// (2019/04/17). Added `arg_attr` field to FunctionDefProto.
+// 30. (2019/05/09) First date based GraphDef version. GraphDef
+//     versions advance by 1 each day after this point.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 29
+#define TF_GRAPH_DEF_VERSION 30  // Updated: 2019/05/09
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
deleted file mode 100644
index 3499db7..0000000
--- a/tensorflow/core/util/cuda_device_functions.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
-#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
-
-// Forward to new header.
-#include "tensorflow/core/util/gpu_device_functions.h"
-
-#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
deleted file mode 100644
index f0c77d6..0000000
--- a/tensorflow/core/util/cuda_launch_config.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
-#define TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
-
-// Forward to new header.
-#include "tensorflow/core/util/gpu_launch_config.h"
-
-#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/gpu_cuda_alias.h b/tensorflow/core/util/gpu_cuda_alias.h
new file mode 100644
index 0000000..5a05700
--- /dev/null
+++ b/tensorflow/core/util/gpu_cuda_alias.h
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
+#define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
+
+#include <utility>  // std::forward, used by the alias macro expansions.
+
+// Forwarding macros for backward compatibility while migrating from Cuda* to
+// Gpu* function names; CUDA and ROCm both unify under the Gpu prefix. During
+// migration both spellings exist; over time all Cuda* names will be deprecated.
+
+namespace tensorflow {
+
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forwards the host function to its Cuda alias.
+#ifndef TENSORFLOW_USE_ROCM
+#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
+  template <typename... Args>                             \
+  auto cuda_alias(Args&&... args)                         \
+      ->decltype(func(std::forward<Args>(args)...)) {     \
+    return func(std::forward<Args>(args)...);             \
+  }
+#else
+#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
+#endif
+
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forwards the device function to its Cuda
+// alias. Like the other macros here, it expands to nothing under ROCm.
+#ifndef TENSORFLOW_USE_ROCM
+#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
+  template <typename... Args>                               \
+  __device__ auto cuda_alias(Args&&... args)                \
+      ->decltype(func(std::forward<Args>(args)...)) {       \
+    return func(std::forward<Args>(args)...);               \
+  }
+#else
+#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
+#endif
+
+// CREATE_CUDA_TYPE_ALIAS forwards the type to its Cuda alias.
+#ifndef TENSORFLOW_USE_ROCM
+#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
+#else
+#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias)
+#endif
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h
index ddcded9..7e8742b 100644
--- a/tensorflow/core/util/gpu_device_functions.h
+++ b/tensorflow/core/util/gpu_device_functions.h
@@ -24,14 +24,16 @@
  * Provides atomic operations on types that aren't natively supported.
  */
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include <algorithm>
 #include <complex>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#if GOOGLE_CUDA
 #include "cuda/include/cuComplex.h"
 #include "cuda/include/cuda.h"
+#endif
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -115,7 +117,15 @@
 // Returns the warp lane ID of the calling thread
 __device__ inline unsigned CudaLaneId() {
   unsigned int lane_id;
+#if GOOGLE_CUDA
+#if __clang__
+  return __nvvm_read_ptx_sreg_laneid();
+#else   // __clang__
   asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+#endif  // __clang__
+#elif TENSORFLOW_USE_ROCM
+  lane_id = __lane_id();
+#endif
   return lane_id;
 }
 
@@ -137,7 +147,12 @@
 #if CUDA_VERSION >= 9000
   unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
 #else
+#if GOOGLE_CUDA
   unsigned src_lane_mask = __shfl(mask, src_lane);
+#elif TENSORFLOW_USE_ROCM
+  unsigned src_lane_mask =
+      __shfl(static_cast<int>(mask), static_cast<int>(src_lane));
+#endif
 #endif
   return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
 }
@@ -250,12 +265,22 @@
 // See b/69446944.
 __device__ inline double CudaShuffleSync(unsigned mask, double value,
                                          int src_lane, int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+#if GOOGLE_CUDA
+  auto tmp = __double_as_longlong(value);
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
   hi = CudaShuffleSync(mask, hi, src_lane, width);
   lo = CudaShuffleSync(mask, lo, src_lane, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
+  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
+#elif TENSORFLOW_USE_ROCM
+  auto tmp = *reinterpret_cast<uint64_t*>(&value);  // raw bits, not a cast
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
+  hi = __shfl(static_cast<int>(hi), src_lane, width);
+  lo = __shfl(static_cast<int>(lo), src_lane, width);
+  tmp = static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
+  return *reinterpret_cast<double*>(&tmp);  // reassemble the bit pattern
+#endif
 }
 
 // Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
@@ -279,12 +304,22 @@
 __device__ inline double CudaShuffleUpSync(unsigned mask, double value,
                                            unsigned delta,
                                            int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+#if GOOGLE_CUDA
+  auto tmp = __double_as_longlong(value);
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
   hi = CudaShuffleUpSync(mask, hi, delta, width);
   lo = CudaShuffleUpSync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
+  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
+#elif TENSORFLOW_USE_ROCM
+  auto tmp = *reinterpret_cast<uint64_t*>(&value);  // raw bits, not a cast
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
+  hi = __shfl_up(static_cast<int>(hi), delta, width);
+  lo = __shfl_up(static_cast<int>(lo), delta, width);
+  tmp = static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
+  return *reinterpret_cast<double*>(&tmp);  // reassemble the bit pattern
+#endif
 }
 
 // Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
@@ -308,12 +343,22 @@
 __device__ inline double CudaShuffleDownSync(unsigned mask, double value,
                                              unsigned delta,
                                              int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+#if GOOGLE_CUDA
+  auto tmp = __double_as_longlong(value);
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
   hi = CudaShuffleDownSync(mask, hi, delta, width);
   lo = CudaShuffleDownSync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
+  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
+#elif TENSORFLOW_USE_ROCM
+  auto tmp = *reinterpret_cast<uint64_t*>(&value);  // raw bits, not a cast
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
+  hi = __shfl_down(static_cast<int>(hi), delta, width);
+  lo = __shfl_down(static_cast<int>(lo), delta, width);
+  tmp = static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
+  return *reinterpret_cast<double*>(&tmp);  // reassemble the bit pattern
+#endif
 }
 
 // Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
@@ -324,25 +369,54 @@
   assert(!(width & width - 1));
   assert(detail::CudaValidateShuffleSyncMask(
       mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
+#if GOOGLE_CUDA
 #if CUDA_VERSION >= 9000
   return __shfl_xor_sync(mask, value, lane_mask, width);
 #else
   return __shfl_xor(value, lane_mask, width);
 #endif
+#elif TENSORFLOW_USE_ROCM
+  return __shfl_xor(static_cast<int>(value), lane_mask, width);
+#endif
 }
 
+#if TENSORFLOW_USE_ROCM
+__device__ inline Eigen::half GpuShuffleXorSync(unsigned mask,
+                                                Eigen::half value,
+                                                int lane_mask,
+                                                int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
+  // TODO(rocm): This doesn't preserve NaN payload and flushes denorms to zero,
+  // maybe this should be implemented differently?
+  return static_cast<Eigen::half>(
+      __shfl_xor(static_cast<float>(value), lane_mask, width));
+}
+#endif
+
 // Variant of the (undocumented) version from the CUDA SDK, but using unsigned
 // instead of float for lo and hi (which is incorrect with ftz, for example).
 // See b/69446944.
 __device__ inline double CudaShuffleXorSync(unsigned mask, double value,
                                             int lane_mask,
                                             int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+#if GOOGLE_CUDA
+  auto tmp = __double_as_longlong(value);
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
   hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
   lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
+  return __longlong_as_double(static_cast<uint64_t>(hi) << 32 | lo);
+#elif TENSORFLOW_USE_ROCM
+  auto tmp = *reinterpret_cast<uint64_t*>(&value);  // raw bits, not a cast
+  auto lo = static_cast<unsigned>(tmp);
+  auto hi = static_cast<unsigned>(tmp >> 32);
+  hi = __shfl_xor(static_cast<int>(hi), lane_mask, width);
+  lo = __shfl_xor(static_cast<int>(lo), lane_mask, width);
+  tmp = static_cast<uint64_t>(hi) << 32 | static_cast<uint64_t>(lo);
+  return *reinterpret_cast<double*>(&tmp);  // reassemble the bit pattern
+#endif
 }
 
 // Wrapper for __ldg.
@@ -427,11 +501,24 @@
 }
 template <typename F>
 __device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
+#if TENSORFLOW_USE_ROCM
+  // FIXME: remove the workaround below once the HIP bug is fixed.
+  // HIP's implementation of __longlong_as_double is buggy, so the 64-bit
+  // pattern is read back as a double through reinterpret_cast<double*>.
+  uint64_t result =
+      CudaAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
+                          [accumulate](tensorflow::uint64 a) {
+                            return __double_as_longlong(
+                                accumulate(*(reinterpret_cast<double*>(&a))));
+                          });
+  return *(reinterpret_cast<double*>(&result));
+#else
   return __longlong_as_double(CudaAtomicCasHelper(
       reinterpret_cast<tensorflow::uint64*>(ptr),
       [accumulate](tensorflow::uint64 a) {
         return __double_as_longlong(accumulate(__longlong_as_double(a)));
       }));
+#endif
 }
 
 // Overload of above function for half. Note that we don't have
@@ -500,24 +587,15 @@
   return detail::CudaAtomicCasHelper(ptr,
                                      [value](double a) { return a + value; });
 }
-#elif __clang__
-// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
-// see https://reviews.llvm.org/D39638
-__device__ inline double CudaAtomicAdd(double* ptr, double value) {
-  double result;
-  asm volatile("atom.add.f64 %0, [%1], %2;"
-               : "=d"(result)
-               : "l"(ptr), "d"(value)
-               : "memory");
-  return result;
-}
 #endif
+
 // CudaAtomicAdd
 // Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
 // not support. We treat a std::complex<T>* as a T* (the C++ standard section
 // 26.4.4 allows this explicitly) and atomic add the real and imaginary
 // components individually. The operation as a whole is not atomic, but we can
 // safely treat the components independently for the purpose of accumulating.
+#if GOOGLE_CUDA
 __device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
                                                     std::complex<float> value) {
   auto ptr_scalar = reinterpret_cast<float*>(ptr);
@@ -531,6 +609,7 @@
   return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
                               CudaAtomicAdd(ptr_scalar + 1, value.imag()));
 }
+#endif
 
 // CudaAtomicSub
 template <typename T, typename U>
@@ -564,6 +643,33 @@
   return atomicMax(ptr, value);
 }
 
+#if TENSORFLOW_USE_ROCM
+
+/*
+ * CUDA runtime headers have the following defined
+ *   __device__  int max(int, int)
+ *   __device__  float max(float, float)
+ *   __device__  double max(double, double)
+ *
+ * and many others, whereas HIP runtime headers only have the "int" version
+ *
+ * Therefore need to special case ROCm version to call the correct underlying
+ * routines for float and double types.
+ *
+ */
+
+__device__ inline float CudaAtomicMax(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return fmaxf(a, value); });
+}
+
+__device__ inline double CudaAtomicMax(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return fmax(a, value); });
+}
+
+#else
+
 __device__ inline float CudaAtomicMax(float* ptr, float value) {
   return detail::CudaAtomicCasHelper(
       ptr, [value](float a) { return max(a, value); });
@@ -574,6 +680,8 @@
       ptr, [value](double a) { return max(a, value); });
 }
 
+#endif
+
 __device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
                                             Eigen::half value) {
   return detail::CudaAtomicCasHelper(
@@ -594,6 +702,33 @@
   return atomicMin(ptr, value);
 }
 
+#if TENSORFLOW_USE_ROCM
+
+/*
+ * CUDA runtime headers have the following defined
+ *   __device__  int min(int, int)
+ *   __device__  float min(float, float)
+ *   __device__  double min(double, double)
+ *
+ * and many others, whereas HIP runtime headers only have the "int" version
+ *
+ * Therefore need to special case ROCm version to call the correct underlying
+ * routines for float and double types.
+ *
+ */
+
+__device__ inline float CudaAtomicMin(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return fminf(a, value); });
+}
+
+__device__ inline double CudaAtomicMin(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return fmin(a, value); });
+}
+
+#else
+
 __device__ inline float CudaAtomicMin(float* ptr, float value) {
   return detail::CudaAtomicCasHelper(
       ptr, [value](float a) { return min(a, value); });
@@ -604,6 +739,8 @@
       ptr, [value](double a) { return min(a, value); });
 }
 
+#endif
+
 __device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
                                             Eigen::half value) {
   return detail::CudaAtomicCasHelper(
@@ -631,7 +768,7 @@
 }
 
 // Operator overloads for complex numbers.
-
+#if GOOGLE_CUDA
 __device__ inline std::complex<float> operator+(const std::complex<float>& a,
                                                 const std::complex<float>& b) {
   auto result = cuCaddf(make_cuComplex(a.real(), a.imag()),
@@ -687,8 +824,9 @@
                        make_cuDoubleComplex(b.real(), b.imag()));
   return std::complex<double>(result.x, result.y);
 }
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #endif  // TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_
diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 8368960..0b30a3a 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -16,12 +16,20 @@
 #ifndef TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
 #define TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
+#if GOOGLE_CUDA
 #include "cuda/include/cuda_fp16.h"
+#endif
 #include "tensorflow/core/util/gpu_device_functions.h"
 #include "tensorflow/core/util/gpu_launch_config.h"
 
+#if GOOGLE_CUDA
+#define TF_RED_WARPSIZE 32
+#elif TENSORFLOW_USE_ROCM
+#define TF_RED_WARPSIZE 64
+#endif
+
 // Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
 #define CUDA_1D_KERNEL_LOOP(i, n) \
   for (int i : ::tensorflow::CudaGridRangeX<int>(n))
@@ -29,6 +37,19 @@
 #define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
   for (int i : ::tensorflow::CudaGridRange##axis<int>(n))
 
+#if GOOGLE_CUDA
+#define gpuSuccess cudaSuccess
+using gpuStream_t = cudaStream_t;
+using gpuError_t = cudaError_t;
+
+#elif TENSORFLOW_USE_ROCM
+#define gpuSuccess hipSuccess
+using gpuStream_t = hipStream_t;
+using gpuError_t = hipError_t;
+#endif
+
+#define GetGPUStream(context) (context)->eigen_gpu_device().stream()
+
 namespace tensorflow {
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
@@ -135,5 +156,5 @@
 }  // namespace cuda_helper
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #endif  // TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index c91cc95..e4a20e2 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_CORE_UTIL_GPU_LAUNCH_CONFIG_H_
 #define TENSORFLOW_CORE_UTIL_GPU_LAUNCH_CONFIG_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include <algorithm>
 
@@ -27,33 +27,33 @@
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
 
-// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
-// GetCuda3DLaunchConfig:
+// Usage of GetGpuLaunchConfig, GetGpu2DLaunchConfig, and
+// GetGpu3DLaunchConfig:
 //
-// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
+// There are two versions of GetGpuLaunchConfig and GetGpu2DLaunchConfig, one
 // version uses heuristics without any knowledge of the device kernel, the other
 // version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
 // launch parameters that maximize occupancy. Currently, only the maximum
-// occupancy version of GetCuda3DLaunchConfig is available.
+// occupancy version of GetGpu3DLaunchConfig is available.
 //
 // For large number of work elements, the convention is that each kernel would
-// iterate through its assigned range. The return value of GetCudaLaunchConfig
-// is struct CudaLaunchConfig, which contains all the information needed for the
+// iterate through its assigned range. The return value of GetGpuLaunchConfig
+// is struct GpuLaunchConfig, which contains all the information needed for the
 // kernel launch, including: virtual number of threads, the number of threads
 // per block and number of threads per block used inside <<< >>> of a kernel
-// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
-// as CudaLaunchConfig. The only difference is the dimension. The macros
+// launch. GetGpu2DLaunchConfig and GetGpu3DLaunchConfig do the same thing
+// as GetGpuLaunchConfig. The only difference is the dimension. The macros
 // CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
 //
 /* Sample code:
 
-__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
+__global__ void MyKernel1D(GpuLaunchConfig config, other_args...) {
   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
     do_your_job_here;
   }
 }
 
-__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
+__global__ void MyKernel2D(Gpu2DLaunchConfig config, other_args...) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
       do_your_job_here;
@@ -61,7 +61,7 @@
   }
 }
 
-__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
+__global__ void MyKernel3D(Gpu3DLaunchConfig config, other_args...) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
       CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
@@ -73,25 +73,25 @@
 
 void MyDriverFunc(const Eigen::GpuDevice &d) {
   // use heuristics
-  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
+  GpuLaunchConfig cfg1 = GetGpuLaunchConfig(10240, d);
   MyKernel1D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
-  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
+  Gpu2DLaunchConfig cfg2 = GetGpu2DLaunchConfig(10240, 10240, d);
   MyKernel2D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
-  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
+  Gpu3DLaunchConfig cfg3 = GetGpu3DLaunchConfig(4096, 4096, 100, d);
   MyKernel3D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
 
   // maximize occupancy
-  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
+  GpuLaunchConfig cfg4 = GetGpuLaunchConfig(10240, d, MyKernel1D, 0, 0 );
   MyKernel1D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
-  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
+  Gpu2DLaunchConfig cfg5 = GetGpu2DLaunchConfig(10240, 10240, d,
                                                   MyKernel1D, 0, 0);
   MyKernel2D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
-  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
+  Gpu3DLaunchConfig cfg6 = GetGpu3DLaunchConfig(4096, 4096, 100, d,
                                                   MyKernel1D, 0, 0);
   MyKernel3D <<<config.block_count,
                 config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
@@ -107,7 +107,7 @@
 
 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
 
-struct CudaLaunchConfig {
+struct GpuLaunchConfig {
   // Logical number of thread that works on the elements. If each logical
   // thread works on exactly a single element, this is the same as the working
   // element count.
@@ -117,15 +117,16 @@
   // Number of blocks for Cuda kernel launch.
   int block_count = -1;
 };
+using CudaLaunchConfig = GpuLaunchConfig;
 
 // Calculate the Cuda launch config we should use for a kernel launch.
 // This is assuming the kernel is quite simple and will largely be
 // memory-limited.
 // REQUIRES: work_element_count > 0.
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const Eigen::GpuDevice& d) {
+inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
+                                          const Eigen::GpuDevice& d) {
   CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
+  GpuLaunchConfig config;
   const int virtual_thread_count = work_element_count;
   const int physical_thread_count = std::min(
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
@@ -140,25 +141,48 @@
   config.block_count = block_count;
   return config;
 }
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d) {
+  return GetGpuLaunchConfig(work_element_count, d);
+}
 
 // Calculate the Cuda launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
 // REQUIRES: work_element_count > 0.
 template <typename DeviceFunc>
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const Eigen::GpuDevice& d,
-                                            DeviceFunc func,
-                                            size_t dynamic_shared_memory_size,
-                                            int block_size_limit) {
+GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
+                                   const Eigen::GpuDevice& d, DeviceFunc func,
+                                   size_t dynamic_shared_memory_size,
+                                   int block_size_limit) {
   CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
+  GpuLaunchConfig config;
   int block_count = 0;
   int thread_per_block = 0;
 
+#if GOOGLE_CUDA
   cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
+#elif TENSORFLOW_USE_ROCM
+  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
+  // implemented
+  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
+  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+  //    block_size_limit);
+  // CHECK_EQ(err, hipSuccess);
+
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that the kernel is quite simple and will largely be memory-limited.
+  const int physical_thread_count = std::min(
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+      work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
+  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
+#endif
 
   block_count =
       std::min(block_count, DivUp(work_element_count, thread_per_block));
@@ -168,40 +192,77 @@
   config.block_count = block_count;
   return config;
 }
+template <typename DeviceFunc>
+CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                     const Eigen::GpuDevice& d, DeviceFunc func,
+                                     size_t dynamic_shared_memory_size,
+                                     int block_size_limit) {
+  return GetGpuLaunchConfig(work_element_count, d, func,
+                            dynamic_shared_memory_size, block_size_limit);
+}
 
 // Calculate the Cuda launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
 // The returned launch config has thread_per_block set to fixed_block_size.
 // REQUIRES: work_element_count > 0.
 template <typename DeviceFunc>
-inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
+GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
     int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
     size_t dynamic_shared_memory_size, int fixed_block_size) {
   CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
+  GpuLaunchConfig config;
   int block_count = 0;
 
+#if GOOGLE_CUDA
   cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
+#elif TENSORFLOW_USE_ROCM
+  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
+  // is implemented
+  // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+  //    &block_count, func, fixed_block_size,
+  //    dynamic_shared_memory_size);
+  // CHECK_EQ(err, hipSuccess);
+
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that the kernel is quite simple and will largely be memory-limited.
+  const int physical_thread_count = std::min(
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+      work_element_count);
+  // Assume the kernel is simple enough for 1024 threads per workgroup. NOTE:
+  // block_count is sized for that, but the config uses fixed_block_size.
+  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
+#endif
 
   config.virtual_thread_count = work_element_count;
   config.thread_per_block = fixed_block_size;
   config.block_count = block_count;
   return config;
 }
+template <typename DeviceFunc>
+CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
+    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int fixed_block_size) {
+  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
+                                          dynamic_shared_memory_size,
+                                          fixed_block_size);
+}
 
-struct Cuda2DLaunchConfig {
+struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
   dim3 thread_per_block = dim3(0, 0, 0);
   dim3 block_count = dim3(0, 0, 0);
 };
+using Cuda2DLaunchConfig = Gpu2DLaunchConfig;
 
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
-                                                const Eigen::GpuDevice& d) {
-  Cuda2DLaunchConfig config;
+inline Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
+                                              const Eigen::GpuDevice& d) {
+  Gpu2DLaunchConfig config;
 
   if (xdim <= 0 || ydim <= 0) {
     return config;
@@ -226,26 +287,39 @@
       grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
   return config;
 }
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
+                                                const Eigen::GpuDevice& d) {
+  return GetGpu2DLaunchConfig(xdim, ydim, d);
+}
 
 // Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
 // This variant takes the resource limits of func into account to maximize
 // occupancy.
-using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+using Gpu3DLaunchConfig = Gpu2DLaunchConfig;
+using Cuda3DLaunchConfig = Gpu2DLaunchConfig;
 
 template <typename DeviceFunc>
-inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
-    int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  Cuda3DLaunchConfig config;
+Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
+                                        const Eigen::GpuDevice& d,
+                                        DeviceFunc func,
+                                        size_t dynamic_shared_memory_size,
+                                        int block_size_limit) {
+  Gpu3DLaunchConfig config;
 
   if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
     return config;
   }
 
   int dev;
+#if GOOGLE_CUDA
   cudaGetDevice(&dev);
   cudaDeviceProp deviceProp;
   cudaGetDeviceProperties(&deviceProp, dev);
+#elif TENSORFLOW_USE_ROCM
+  hipGetDevice(&dev);
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, dev);
+#endif
   int xthreadlimit = deviceProp.maxThreadsDim[0];
   int ythreadlimit = deviceProp.maxThreadsDim[1];
   int zthreadlimit = deviceProp.maxThreadsDim[2];
@@ -255,10 +329,26 @@
 
   int block_count = 0;
   int thread_per_block = 0;
+
+#if GOOGLE_CUDA
   cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
+#elif TENSORFLOW_USE_ROCM
+  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
+  // implemented
+  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
+  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+  //    block_size_limit);
+  // CHECK_EQ(err, hipSuccess);
+
+  const int physical_thread_count =
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
+  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
+#endif
 
   int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
   int threadsy =
@@ -278,15 +368,27 @@
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-
 template <typename DeviceFunc>
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
-    int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
-                               dynamic_shared_memory_size, block_size_limit);
+Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
+                                         const Eigen::GpuDevice& d,
+                                         DeviceFunc func,
+                                         size_t dynamic_shared_memory_size,
+                                         int block_size_limit) {
+  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
+                              dynamic_shared_memory_size, block_size_limit);
 }
 
+template <typename DeviceFunc>
+Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
+                                       const Eigen::GpuDevice& d,
+                                       DeviceFunc func,
+                                       size_t dynamic_shared_memory_size,
+                                       int block_size_limit) {
+  return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
+                              dynamic_shared_memory_size, block_size_limit);
+}
+
+#if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream.  Required by a
 // number of kernel calls (for which StreamInterface* does not work), i.e.
 // CUB and certain cublas primitives.
@@ -298,6 +400,16 @@
                                                 ->GpuStreamMemberHack()));
   return *ptr;
 }
+template <typename DeviceFunc>
+Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
+                                         const Eigen::GpuDevice& d,
+                                         DeviceFunc func,
+                                         size_t dynamic_shared_memory_size,
+                                         int block_size_limit) {
+  return GetGpu2DLaunchConfig(xdim, ydim, d, func, dynamic_shared_memory_size,
+                              block_size_limit);
+}
+#endif  // GOOGLE_CUDA
 
 namespace detail {
 template <typename... Ts, size_t... Is>
@@ -324,6 +436,7 @@
 }
 }  // namespace detail
 
+#if GOOGLE_CUDA
 // Launches a CUDA kernel through cudaLaunchKernel with the given arguments.
 //
 // The kernel parameters 'Ts' must be constructible from the arguments 'Args'.
@@ -344,9 +457,10 @@
   }
   return Status::OK();
 }
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif  // TENSORFLOW_CORE_UTIL_GPU_LAUNCH_CONFIG_H_
diff --git a/tensorflow/core/kernels/batch_matmul_op_common.cc b/tensorflow/core/util/matmul_bcast.cc
similarity index 97%
rename from tensorflow/core/kernels/batch_matmul_op_common.cc
rename to tensorflow/core/util/matmul_bcast.cc
index 27963f3..3e5c5cf 100644
--- a/tensorflow/core/kernels/batch_matmul_op_common.cc
+++ b/tensorflow/core/util/matmul_bcast.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/batch_matmul_op_common.h"
+#include "tensorflow/core/util/matmul_bcast.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/batch_matmul_op_common.h b/tensorflow/core/util/matmul_bcast.h
similarity index 93%
rename from tensorflow/core/kernels/batch_matmul_op_common.h
rename to tensorflow/core/util/matmul_bcast.h
index 99e6d93..611ef23 100644
--- a/tensorflow/core/kernels/batch_matmul_op_common.h
+++ b/tensorflow/core/util/matmul_bcast.h
@@ -13,8 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_COMMON_H_
-#define TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_COMMON_H_
+#ifndef TENSORFLOW_CORE_UTIL_MATMUL_BCAST_H_
+#define TENSORFLOW_CORE_UTIL_MATMUL_BCAST_H_
 
 #include <vector>
 
@@ -67,4 +67,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_COMMON_H_
+#endif  // TENSORFLOW_CORE_UTIL_MATMUL_BCAST_H_
diff --git a/tensorflow/core/kernels/batch_matmul_op_common_test.cc b/tensorflow/core/util/matmul_bcast_test.cc
similarity index 97%
rename from tensorflow/core/kernels/batch_matmul_op_common_test.cc
rename to tensorflow/core/util/matmul_bcast_test.cc
index d6334b7..1de6229 100644
--- a/tensorflow/core/kernels/batch_matmul_op_common_test.cc
+++ b/tensorflow/core/util/matmul_bcast_test.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/batch_matmul_op_common.h"
+#include "tensorflow/core/util/matmul_bcast.h"
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
index 7dc8ddd..0ec7815 100644
--- a/tensorflow/core/util/port.cc
+++ b/tensorflow/core/util/port.cc
@@ -26,8 +26,17 @@
 #endif
 }
 
-bool CudaSupportsHalfMatMulAndConv() {
-#if GOOGLE_CUDA
+bool IsBuiltWithROCm() {
+#if TENSORFLOW_USE_ROCM
+  return true;
+#else
+  return false;
+#endif
+}
+
+bool GpuSupportsHalfMatMulAndConv() {
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   return true;
 #else
   return false;
diff --git a/tensorflow/core/util/port.h b/tensorflow/core/util/port.h
index e9b9cb1..bfdede7 100644
--- a/tensorflow/core/util/port.h
+++ b/tensorflow/core/util/port.h
@@ -21,9 +21,19 @@
 // Returns true if GOOGLE_CUDA is defined.
 bool IsGoogleCudaEnabled();
 
-// Returns true if GOOGLE_CUDA is defined, and the given CUDA version supports
-// half-precision matrix multiplications and convolution operations.
-bool CudaSupportsHalfMatMulAndConv();
+// Returns true if TENSORFLOW_USE_ROCM is defined (i.e. TF is built with ROCm).
+bool IsBuiltWithROCm();
+
+// Returns true if either
+//
+//   GOOGLE_CUDA is defined, and the given CUDA version supports
+//   half-precision matrix multiplications and convolution operations.
+//
+//     OR
+//
+//   TENSORFLOW_USE_ROCM is defined
+//
+bool GpuSupportsHalfMatMulAndConv();
 
 // Returns true if INTEL_MKL is defined
 bool IsMklEnabled();
diff --git a/tensorflow/core/util/reffed_status_callback.h b/tensorflow/core/util/reffed_status_callback.h
index 4d9a851..1c552d4 100644
--- a/tensorflow/core/util/reffed_status_callback.h
+++ b/tensorflow/core/util/reffed_status_callback.h
@@ -16,6 +16,7 @@
 #ifndef TENSORFLOW_CORE_UTIL_REFFED_STATUS_CALLBACK_H_
 #define TENSORFLOW_CORE_UTIL_REFFED_STATUS_CALLBACK_H_
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -28,33 +29,30 @@
 // UpdateStatus(), or Status::OK() if no non-OK status was set.
 class ReffedStatusCallback : public core::RefCounted {
  public:
-  explicit ReffedStatusCallback(StatusCallback done)
-      : done_(std::move(done)), status_(Status::OK()) {}
+  explicit ReffedStatusCallback(StatusCallback done) : done_(std::move(done)) {}
 
   void UpdateStatus(const Status& s) {
-    if (!s.ok()) {
-      mutex_lock lock(mu_);
-      if (status_.ok()) status_.Update(s);
-    }
+    mutex_lock lock(mu_);
+    status_group_.Update(s);
   }
 
   bool ok() {
-    mutex_lock lock(mu_);
-    return status_.ok();
+    tf_shared_lock lock(mu_);
+    return status_group_.ok();
   }
 
   // Returns a copy of the current status.
   Status status() {
-    mutex_lock lock(mu_);
-    return status_;
+    tf_shared_lock lock(mu_);
+    return status_group_.as_summary_status();
   }
 
-  ~ReffedStatusCallback() { done_(status_); }
+  ~ReffedStatusCallback() { done_(status_group_.as_summary_status()); }
 
  private:
   StatusCallback done_;
   mutex mu_;
-  Status status_ GUARDED_BY(mu_);
+  StatusGroup status_group_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/reffed_status_callback_test.cc b/tensorflow/core/util/reffed_status_callback_test.cc
index 7e776be..6799183 100644
--- a/tensorflow/core/util/reffed_status_callback_test.cc
+++ b/tensorflow/core/util/reffed_status_callback_test.cc
@@ -13,10 +13,11 @@
 limitations under the License.
 ==============================================================================*/
 
-#include <atomic>
-
 #include "tensorflow/core/util/reffed_status_callback.h"
 
+#include <atomic>
+
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -49,11 +50,16 @@
   };
   auto* cb = new ReffedStatusCallback(std::move(done));
   cb->UpdateStatus(errors::Internal("1"));
-  cb->UpdateStatus(errors::Internal("2"));  // Will be ignored.
+  cb->UpdateStatus(errors::InvalidArgument("2"));
   EXPECT_FALSE(called);
   cb->Unref();
   EXPECT_TRUE(called);
-  EXPECT_EQ(status.error_message(), "1");
+  // Equal to the first error.
+  EXPECT_EQ(status.code(), error::INTERNAL);
+  // Both errors are reported.
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Internal: 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "Invalid argument: 2"));
 }
 
 TEST(TestReffedStatusCallback, RefMulti) {
@@ -67,13 +73,15 @@
   cb->Ref();
   cb->UpdateStatus(errors::Internal("1"));
   cb->Ref();
-  cb->UpdateStatus(errors::Internal("2"));  // Will be ignored.
+  cb->UpdateStatus(errors::Internal("2"));
   cb->Unref();
   cb->Unref();
   EXPECT_FALSE(called);
   cb->Unref();  // Created by constructor.
   EXPECT_TRUE(called);
-  EXPECT_EQ(status.error_message(), "1");
+  // Both errors are reported.
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Internal: 1"));
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Internal: 2"));
 }
 
 TEST(TestReffedStatusCallback, MultiThreaded) {
@@ -104,7 +112,9 @@
   n.WaitForNotification();
 
   EXPECT_EQ(num_called.load(), 1);
-  EXPECT_EQ(status.error_message(), "err");
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "Invalid argument: err"));
 }
 
 }  // namespace
diff --git a/tensorflow/core/util/tensor_format.cc b/tensorflow/core/util/tensor_format.cc
index f331973..5dbd8ef 100644
--- a/tensorflow/core/util/tensor_format.cc
+++ b/tensorflow/core/util/tensor_format.cc
@@ -63,6 +63,8 @@
       return "HWIO";
     case FORMAT_OIHW:
       return "OIHW";
+    case FORMAT_OHWI:
+      return "OHWI";
     case FORMAT_OIHW_VECT_I:
       return "OIHW_VECT_I";
     default:
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 643e14e..82af5c5 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -80,6 +80,9 @@
   // FORMAT_OIHW often improves performance on GPUs.
   FORMAT_OIHW = 1,
 
+  // FORMAT_OHWI is used by cuDNN for NHWC convolutions.
+  FORMAT_OHWI = 2,
+
   // OIHW_VECT_I is the most performant tensor format for cudnn6's quantized
   // int8 convolution and fused convolution. It is analogous to the NCHW_VECT_C
   // data format. It is laid out in the same order as OIHW, except that the size
@@ -88,7 +91,7 @@
   // int32. Thus an OIHW format filter with dimensions [O, I, H, W] would have
   // dimensions [O, I/4, H, W, 4] in OIHW_VECT_I format.
   // A pre-condition of this format is that I must be a multiple of 4.
-  FORMAT_OIHW_VECT_I = 2,
+  FORMAT_OIHW_VECT_I = 3,
 };
 
 // Parse tensor format from the given string.
diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto
index 823c580..ddb0599 100644
--- a/tensorflow/core/util/test_log.proto
+++ b/tensorflow/core/util/test_log.proto
@@ -212,4 +212,8 @@
   // * presubmit: results from oneshot requests.
   // * culprit: results from culprit finder rerun.
   string run_mode = 11;
+
+  // TensorFlow version this benchmark runs against.
+  // This can be either set to full version or just the major version.
+  string tf_version = 12;
 };
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index a4d6f20..e3ee520 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -69,7 +69,10 @@
     size = "small",
     srcs = ["zero_out_1_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notap"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
     deps = [
         ":zero_out_op_1",
         "//tensorflow:tensorflow_py",
@@ -81,7 +84,10 @@
     size = "small",
     srcs = ["zero_out_2_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notap"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
     deps = [
         ":zero_out_grad_2",
         ":zero_out_op_2",
@@ -94,7 +100,10 @@
     size = "small",
     srcs = ["zero_out_3_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notap"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
     deps = [
         ":zero_out_op_3",
         "//tensorflow:tensorflow_py",
@@ -121,7 +130,10 @@
     srcs = ["cuda_op_test.py"],
     exec_compatible_with = tf_exec_compatible_with({"tags": tf_cuda_tests_tags()}),
     srcs_version = "PY2AND3",
-    tags = tf_cuda_tests_tags() + ["notap"],
+    tags = tf_cuda_tests_tags() + [
+        "notap",
+        "no_pip",
+    ],
     deps = [
         ":cuda_op",
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/examples/android/jni/object_tracking/jni_utils.h b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
index 06048ec..5f622a2 100644
--- a/tensorflow/examples/android/jni/object_tracking/jni_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
@@ -16,6 +16,7 @@
 #ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
 #define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
 
+#include <jni.h>
 #include <stdint.h>
 
 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
diff --git a/tensorflow/examples/android/jni/object_tracking/sprite.h b/tensorflow/examples/android/jni/object_tracking/sprite.h
index b54a684..964f1c3 100755
--- a/tensorflow/examples/android/jni/object_tracking/sprite.h
+++ b/tensorflow/examples/android/jni/object_tracking/sprite.h
@@ -16,16 +16,14 @@
 #ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
 #define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
 
+#ifdef __RENDER_OPENGL__
+
 #include <GLES/gl.h>
 #include <GLES/glext.h>
 
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
 #include "tensorflow/examples/android/jni/object_tracking/image.h"
 
-#ifndef __RENDER_OPENGL__
-#error sprite.h should not included if OpenGL is not enabled by platform.h
-#endif
-
 namespace tf_tracking {
 
 // This class encapsulates the logic necessary to load an render image data
@@ -199,4 +197,6 @@
 
 }  // namespace tf_tracking
 
+#endif  // __RENDER_OPENGL__
+
 #endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 7882d87..f778f3d 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -157,7 +157,7 @@
             getAssets(), TF_OD_API_MODEL_FILE, TF_OD_API_LABELS_FILE, TF_OD_API_INPUT_SIZE);
         cropSize = TF_OD_API_INPUT_SIZE;
       } catch (final IOException e) {
-        LOGGER.e("Exception initializing classifier!", e);
+        LOGGER.e(e, "Exception initializing classifier!");
         Toast toast =
             Toast.makeText(
                 getApplicationContext(), "Classifier could not be initialized", Toast.LENGTH_SHORT);
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index c50fd93..cc73163 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -57,6 +57,7 @@
     name = "label_image_py",
     srcs = ["label_image.py"],
     main = "label_image.py",
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
index 168241e..5ade3c2 100644
--- a/tensorflow/examples/saved_model/integration_tests/BUILD
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -2,18 +2,16 @@
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-# This target bundles many scripts into a single py_binary so they can be
-# executed by saved_model_test without exploding the data dependencies.
-py_binary(
-    name = "run_script",
+py_library(
+    name = "integration_scripts",
     srcs = [
         "export_mnist_cnn.py",
         "export_rnn_cell.py",
         "export_simple_text_embedding.py",
         "export_text_rnn_model.py",
-        "run_script.py",
+        "integration_scripts.py",
         "use_mnist_cnn.py",
         "use_model_in_sequential_keras.py",
         "use_rnn_cell.py",
@@ -23,15 +21,6 @@
     visibility = ["//tensorflow:internal"],
     deps = [
         ":mnist_util",
-        ":util",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "util",
-    srcs = ["util.py"],
-    deps = [
         "//tensorflow:tensorflow_py",
     ],
 )
@@ -39,29 +28,38 @@
 py_library(
     name = "mnist_util",
     srcs = ["mnist_util.py"],
+    visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "saved_model_test",
     srcs = [
         "saved_model_test.py",
     ],
-    data = [
-        ":run_script",
+    additional_deps = [
+        ":integration_scripts",
+        "//tensorflow:tensorflow_py",
     ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
-        # NOTE: Split SavedModelTest due to Forge input size limit.
-        "no_cuda_on_cpu_tap",  # forge input size exceeded
+        "no_pip",  # b/131697937 and b/132196869
         "noasan",  # forge input size exceeded
         "nomsan",  # forge input size exceeded
         "notsan",  # forge input size exceeded
     ],
-    deps = [
-        "//tensorflow:tensorflow_py",
+)
+
+# b/132234211: Target added to support internal test target that runs the test
+# in an environment that has the extra dependencies required to test integration
+# with non-core TensorFlow packages.
+py_library(
+    name = "saved_model_test_lib",
+    srcs = [
+        "saved_model_test.py",
     ],
+    visibility = ["//tensorflow:internal"],
+    deps = [":integration_scripts"],
 )
diff --git a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py
new file mode 100644
index 0000000..0db91fa
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to write SavedModel integration tests.
+
+SavedModel testing requires isolating the process that creates a SavedModel
+from the one that consumes it. This file helps by relaunching the same binary
+that calls `assertCommandSucceeded`, with an environment variable naming the
+module to execute. That binary must start by calling `MaybeRunScriptInstead`.
+
+This makes it possible to wire the tests into existing build systems without
+relying on data dependencies, which keeps the binary size fixed and allows
+interop with GPU tests.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import os
+import subprocess
+import sys
+
+from absl import app
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.platform import tf_logging as logging
+
+
+class TestCase(tf.test.TestCase):
+  """Base class to write SavedModel integration tests."""
+
+  def assertCommandSucceeded(self, script_name, **flags):
+    """Runs an integration test script with given flags."""
+    run_script = sys.argv[0]
+    if run_script.endswith(".py"):
+      command_parts = [sys.executable, run_script]
+    else:
+      command_parts = [run_script]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+    env = dict(TF2_BEHAVIOR="enabled", SCRIPT_NAME=script_name)
+    logging.info("Running: %s with environment flags %s" % (command_parts, env))
+    subprocess.check_call(command_parts, env=dict(os.environ, **env))
+
+
+def MaybeRunScriptInstead():
+  if "SCRIPT_NAME" in os.environ:
+    # Append current path to import path and execute `SCRIPT_NAME` main.
+    sys.path.extend([os.path.dirname(__file__)])
+    module_name = os.environ["SCRIPT_NAME"]
+    retval = app.run(importlib.import_module(module_name).main)
+    sys.exit(retval)
diff --git a/tensorflow/examples/saved_model/integration_tests/run_script.py b/tensorflow/examples/saved_model/integration_tests/run_script.py
deleted file mode 100644
index 438df40..0000000
--- a/tensorflow/examples/saved_model/integration_tests/run_script.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility to create a single py_binary that can call multiple py_binaries.
-
-This simulates executing a python script by importing a module name by the
-environment 'SCRIPT_NAME' and executing its main via `app.run`.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import importlib
-import os
-import sys
-
-from absl import app
-
-
-if __name__ == '__main__':
-  # Append current path to import path and execute `SCRIPT_NAME` main.
-  sys.path.extend([os.path.dirname(__file__)])
-  module_name = os.environ['SCRIPT_NAME']
-  app.run(importlib.import_module(module_name).main)
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
index b8e2019..7cc8fde 100644
--- a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
@@ -18,26 +18,27 @@
 from __future__ import division
 from __future__ import print_function
 
-import os
-import subprocess
-
 import tensorflow.compat.v2 as tf
 
-from tensorflow.python.platform import resource_loader
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.examples.saved_model.integration_tests import integration_scripts
 
 
-class SavedModelTest(tf.test.TestCase):
+class SavedModelTest(integration_scripts.TestCase):
 
-  def assertCommandSucceeded(self, script_name, **flags):
-    """Runs a test script via run_script."""
-    run_script = resource_loader.get_path_to_datafile("run_script")
-    command_parts = [run_script]
-    for flag_key, flag_value in flags.items():
-      command_parts.append("--%s=%s" % (flag_key, flag_value))
-    env = dict(TF2_BEHAVIOR="enabled", SCRIPT_NAME=script_name)
-    logging.info("Running: %s with environment flags %s" % (command_parts, env))
-    subprocess.check_call(command_parts, env=dict(os.environ, **env))
+  def __init__(self, method_name="runTest", has_extra_deps=False):
+    super(SavedModelTest, self).__init__(method_name)
+    self.has_extra_deps = has_extra_deps
+
+  def skipIfMissingExtraDeps(self):
+    """Skip test if it requires extra dependencies.
+
+    b/132234211: The extra dependencies are not available in all environments
+    that run the tests, e.g. "tensorflow_hub" is not available from tests
+    within "tensorflow" alone. Those tests are instead run by another
+    internal test target.
+    """
+    if not self.has_extra_deps:
+      self.skipTest("Missing extra dependencies")
 
   def test_text_rnn(self):
     export_dir = self.get_temp_dir()
@@ -50,6 +51,7 @@
     self.assertCommandSucceeded("use_rnn_cell", model_dir=export_dir)
 
   def test_text_embedding_in_sequential_keras(self):
+    self.skipIfMissingExtraDeps()
     export_dir = self.get_temp_dir()
     self.assertCommandSucceeded(
         "export_simple_text_embedding", export_dir=export_dir)
@@ -57,6 +59,9 @@
         "use_model_in_sequential_keras", model_dir=export_dir)
 
   def test_text_embedding_in_dataset(self):
+    if tf.test.is_gpu_available():
+      self.skipTest("b/132156097 - fails if there is a gpu available")
+
     export_dir = self.get_temp_dir()
     self.assertCommandSucceeded(
         "export_simple_text_embedding", export_dir=export_dir)
@@ -64,6 +69,7 @@
         "use_text_embedding_in_dataset", model_dir=export_dir)
 
   def test_mnist_cnn(self):
+    self.skipIfMissingExtraDeps()
     export_dir = self.get_temp_dir()
     self.assertCommandSucceeded(
         "export_mnist_cnn", export_dir=export_dir, fast_test_mode="true")
@@ -71,6 +77,7 @@
         "use_mnist_cnn", export_dir=export_dir, fast_test_mode="true")
 
   def test_mnist_cnn_with_mirrored_strategy(self):
+    self.skipIfMissingExtraDeps()
     self.skipTest(
         "b/129134185 - saved model and distribution strategy integration")
     export_dir = self.get_temp_dir()
@@ -85,5 +92,7 @@
         use_mirrored_strategy=True,
     )
 
+
 if __name__ == "__main__":
+  integration_scripts.MaybeRunScriptInstead()
   tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
index c08b548..957091f 100644
--- a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
+++ b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
@@ -29,9 +29,9 @@
 from absl import app
 from absl import flags
 import tensorflow.compat.v2 as tf
+import tensorflow_hub as hub
 
 from tensorflow.examples.saved_model.integration_tests import mnist_util
-from tensorflow.examples.saved_model.integration_tests import util
 
 FLAGS = flags.FLAGS
 
@@ -80,9 +80,7 @@
   if FLAGS.dropout_rate is not None:
     arguments['dropout_rate'] = FLAGS.dropout_rate
 
-  # CustomLayer mimics hub.KerasLayer because the tests are not able to depend
-  # on Hub at the moment.
-  return util.CustomLayer(obj, trainable=trainable, arguments=arguments)
+  return hub.KerasLayer(obj, trainable=trainable, arguments=arguments)
 
 
 def make_classifier(feature_extractor, l2_strength=0.01, dropout_rate=0.5):
diff --git a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
index c7a1a90..2446ff9 100644
--- a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
+++ b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
@@ -23,7 +23,7 @@
 
 import numpy as np
 import tensorflow.compat.v2 as tf
-from tensorflow.examples.saved_model.integration_tests import util
+import tensorflow_hub as hub
 
 FLAGS = flags.FLAGS
 
@@ -42,7 +42,8 @@
   l = tf.keras.layers
   model = tf.keras.Sequential()
   model.add(l.Reshape((), batch_input_shape=[None, 1], dtype=tf.string))
-  model.add(util.CustomLayer(module, output_shape=[10], trainable=fine_tuning))
+  # TODO(b/124219898): output_shape should be optional.
+  model.add(hub.KerasLayer(module, output_shape=[10], trainable=fine_tuning))
   model.add(l.Dense(100, activation="relu"))
   model.add(l.Dense(50, activation="relu"))
   model.add(l.Dense(1, activation="sigmoid"))
diff --git a/tensorflow/examples/saved_model/integration_tests/util.py b/tensorflow/examples/saved_model/integration_tests/util.py
deleted file mode 100644
index 1b709fd..0000000
--- a/tensorflow/examples/saved_model/integration_tests/util.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for integration tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-import tensorflow.compat.v2 as tf
-
-from tensorflow.python.framework import smart_cond
-from tensorflow.python.util import tf_inspect
-
-
-# TODO(vbardiovsky): We should just reuse Keras's Lambda layer, when that
-# enables to get trainable variables.
-class CustomLayer(tf.keras.layers.Layer):
-  """Wraps callable object as a `Layer` object.
-
-  Args:
-    func: The callable object to wrap. Layer inputs are passed as the first
-      positional argument. If `func` accepts a `training` argument, a Python
-      boolean is passed for it.
-      If present, the following attributes of `func` have a special meaning:
-        * variables: a list of all tf.Variable objects that `func` depends on.
-        * trainable_variables: those elements of `variables` that are reported
-          as trainable variables of this Keras Layer.
-        * regularization_losses: a list of callables to be added as losses
-          of this Keras layer. Each one must accept zero arguments and return
-          a scalare tensor.
-    trainable: Boolean controlling whether the trainable variables of `func`
-      are reported as trainable variables of this layer.
-    arguments: optionally, a dict with additional keyword arguments passed
-      to `func`.
-    **kwargs: 'output_shape': A tuple with the (possibly partial) output
-      shape of the callable *without* leading batch size. Other arguments
-      are pass into the Layer constructor.
-  """
-
-  def __init__(self, func, trainable=False, arguments=None, **kwargs):
-    # Set self._{non,}_trainable_weights before calling Layer.__init__.
-    if hasattr(func, 'trainable_variables'):
-      self._trainable_weights = [v for v in func.trainable_variables]
-      trainable_variables_set = set(func.trainable_variables)
-    else:
-      self._trainable_weights = []
-      trainable_variables_set = set()
-    if hasattr(func, 'variables'):
-      self._non_trainable_weights = [v for v in func.variables
-                                     if v not in trainable_variables_set]
-    else:
-      self._non_trainable_weights = []  # TODO(arnoegw): Infer from `func`.
-
-    # TODO(b/124219898): We should be able to get the embedding dimension from
-    # the restored model.
-    if 'output_shape' in kwargs:
-      self._output_shape = tuple(kwargs.pop('output_shape'))
-
-    super(CustomLayer, self).__init__(trainable=trainable, **kwargs)
-    # Prepare to call `func`.
-    self._func = func
-    self._func_fullargspec = tf_inspect.getfullargspec(func.__call__)
-    self._func_wants_training = (
-        'training' in self._func_fullargspec.args or
-        'training' in self._func_fullargspec.kwonlyargs)
-    self._arguments = arguments or {}
-    # Forward the callable's regularization losses (if any).
-    if hasattr(func, 'regularization_losses'):
-      for l in func.regularization_losses:
-        if not callable(l):
-          raise ValueError(
-              'CustomLayer(func) expects func.regularization_losses to be an '
-              'iterable of callables, each returning a scalar loss term.')
-        self.add_loss(l)  # Supports callables.
-
-  def call(self, x, training=None):
-    # We basically want to call this...
-    f = functools.partial(self._func, x, **self._arguments)
-    # ...but we may also have to pass a Python boolean for `training`.
-    if not self._func_wants_training:
-      result = f()
-    else:
-      if training is None:
-        training = tf.keras.backend.learning_phase()  # Could be a tensor.
-      result = smart_cond.smart_cond(training,
-                                     lambda: f(training=True),
-                                     lambda: f(training=False))
-    # TODO(b/124219898): Polymorphic function should return shaped tensor.
-    if hasattr(self, '_output_shape'):
-      result.set_shape((x.shape[0],) + self._output_shape)
-    return result
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index e15497a..b290aa6 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -35,6 +35,9 @@
         ":models",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 py_library(
@@ -45,7 +48,6 @@
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/experimental/microfrontend:audio_microfrontend_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -60,6 +62,9 @@
         ":models",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 py_binary(
@@ -92,6 +97,9 @@
         ":train",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 py_binary(
@@ -114,6 +122,9 @@
         "freeze.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",  # b/131330719
+    ],
     deps = [
         ":input_data",
         ":models",
@@ -128,9 +139,12 @@
     size = "small",
     srcs = ["freeze_test.py"],
     additional_deps = [
-        ":freeze",
+        ":freeze_main_lib",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 py_binary(
@@ -167,9 +181,12 @@
     size = "small",
     srcs = ["wav_to_features_test.py"],
     additional_deps = [
-        ":wav_to_features",
+        ":wav_to_features_main_lib",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 py_binary(
@@ -206,9 +223,12 @@
     size = "small",
     srcs = ["generate_streaming_test_wav_test.py"],
     additional_deps = [
-        ":generate_streaming_test_wav",
+        ":generate_streaming_test_wav_main_lib",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 tf_cc_binary(
@@ -256,9 +276,12 @@
     size = "medium",
     srcs = ["label_wav_test.py"],
     additional_deps = [
-        ":label_wav",
+        ":label_wav_main_lib",
         "//tensorflow/python:client_testlib",
     ],
+    tags = [
+        "no_pip",  # b/131330719
+    ],
 )
 
 cc_library(
@@ -345,3 +368,17 @@
         "//tensorflow/core:protos_all_cc",
     ],
 )
+
+py_library(
+    name = "test_lib",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":freeze",
+        ":generate_streaming_test_wav",
+        ":input_data",
+        ":label_wav",
+        ":models",
+        ":train",
+        ":wav_to_features",
+    ],
+)
diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py
index 8a6716d..89e790d 100644
--- a/tensorflow/examples/speech_commands/freeze.py
+++ b/tensorflow/examples/speech_commands/freeze.py
@@ -49,14 +49,6 @@
 import models
 from tensorflow.python.framework import graph_util
 
-# If it's available, load the specialized feature generator. If this doesn't
-# work, try building with bazel instead of running the Python script directly.
-# bazel run tensorflow/examples/speech_commands:freeze_graph
-try:
-  from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
-except ImportError:
-  frontend_op = None
-
 FLAGS = None
 
 
@@ -78,7 +70,7 @@
     feature_bin_count: Number of frequency bands to analyze.
     model_architecture: Name of the kind of model to generate.
     preprocess: How the spectrogram is processed to produce features, for
-      example 'mfcc', 'average', or 'micro'.
+      example 'mfcc' or 'average'.
 
   Raises:
     Exception: If the preprocessing mode isn't recognized.
@@ -114,33 +106,9 @@
         spectrogram,
         sample_rate,
         dct_coefficient_count=model_settings['fingerprint_width'])
-  elif preprocess == 'micro':
-    if not frontend_op:
-      raise Exception(
-          'Micro frontend op is currently not available when running TensorFlow'
-          ' directly from Python, you need to build and run through Bazel, for'
-          ' example'
-          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
-      )
-    sample_rate = model_settings['sample_rate']
-    window_size_ms = (model_settings['window_size_samples'] *
-                      1000) / sample_rate
-    window_step_ms = (model_settings['window_stride_samples'] *
-                      1000) / sample_rate
-    int16_input = tf.cast(
-        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
-    micro_frontend = frontend_op.audio_microfrontend(
-        int16_input,
-        sample_rate=sample_rate,
-        window_size=window_size_ms,
-        window_step=window_step_ms,
-        num_channels=model_settings['fingerprint_width'],
-        out_scale=1,
-        out_type=tf.float32)
-    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
   else:
-    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
-                    ' "average", or "micro")' % (preprocess))
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
+                    ' "average")' % (preprocess))
 
   fingerprint_size = model_settings['fingerprint_size']
   reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
diff --git a/tensorflow/examples/speech_commands/freeze_test.py b/tensorflow/examples/speech_commands/freeze_test.py
index a242453..9ed9050 100644
--- a/tensorflow/examples/speech_commands/freeze_test.py
+++ b/tensorflow/examples/speech_commands/freeze_test.py
@@ -66,24 +66,6 @@
       self.assertEqual(0, ops.count('Mfcc'))
 
   @test_util.run_deprecated_v1
-  def testCreateInferenceGraphWithMicro(self):
-    with self.cached_session() as sess:
-      freeze.create_inference_graph(
-          wanted_words='a,b,c,d',
-          sample_rate=16000,
-          clip_duration_ms=1000.0,
-          clip_stride_ms=30.0,
-          window_size_ms=30.0,
-          window_stride_ms=10.0,
-          feature_bin_count=40,
-          model_architecture='conv',
-          preprocess='micro')
-      self.assertIsNotNone(sess.graph.get_tensor_by_name('wav_data:0'))
-      self.assertIsNotNone(
-          sess.graph.get_tensor_by_name('decoded_sample_data:0'))
-      self.assertIsNotNone(sess.graph.get_tensor_by_name('labels_softmax:0'))
-
-  @test_util.run_deprecated_v1
   def testFeatureBinCount(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 60e1b8c..1079a30 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -37,13 +37,6 @@
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import compat
 
-# If it's available, load the specialized feature generator. If this doesn't
-# work, try building with bazel instead of running the Python script directly.
-try:
-  from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
-except ImportError:
-  frontend_op = None
-
 MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
 SILENCE_LABEL = '_silence_'
 SILENCE_INDEX = 0
@@ -176,12 +169,9 @@
   elif model_settings['preprocess'] == 'mfcc':
     features_min = -247.0
     features_max = 30.0
-  elif model_settings['preprocess'] == 'micro':
-    features_min = 0.0
-    features_max = 26.0
   else:
-    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
-                    ' "average", or "micro")' % (model_settings['preprocess']))
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
+                    ' "average")' % (model_settings['preprocess']))
   return features_min, features_max
 
 
@@ -387,7 +377,6 @@
 
     Raises:
       ValueError: If the preprocessing mode isn't recognized.
-      Exception: If the preprocessor wasn't compiled in.
     """
     with tf.get_default_graph().name_scope('data'):
       desired_samples = model_settings['desired_samples']
@@ -453,36 +442,9 @@
             dct_coefficient_count=model_settings['fingerprint_width'])
         tf.summary.image(
             'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
-      elif model_settings['preprocess'] == 'micro':
-        if not frontend_op:
-          raise Exception(
-              'Micro frontend op is currently not available when running'
-              ' TensorFlow directly from Python, you need to build and run'
-              ' through Bazel'
-          )
-        sample_rate = model_settings['sample_rate']
-        window_size_ms = (model_settings['window_size_samples'] *
-                          1000) / sample_rate
-        window_step_ms = (model_settings['window_stride_samples'] *
-                          1000) / sample_rate
-        int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
-        micro_frontend = frontend_op.audio_microfrontend(
-            int16_input,
-            sample_rate=sample_rate,
-            window_size=window_size_ms,
-            window_step=window_step_ms,
-            num_channels=model_settings['fingerprint_width'],
-            out_scale=1,
-            out_type=tf.float32)
-        self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
-        tf.summary.image(
-            'micro',
-            tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
-            max_outputs=1)
       else:
-        raise ValueError(
-            'Unknown preprocess mode "%s" (should be "mfcc", '
-            ' "average", or "micro")' % (model_settings['preprocess']))
+        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
+                         ' "average")' % (model_settings['preprocess']))
 
       # Merge all the summaries and write them out to /tmp/retrain_logs (by
       # default)
diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py
index 031aa92..9269bb6 100644
--- a/tensorflow/examples/speech_commands/input_data_test.py
+++ b/tensorflow/examples/speech_commands/input_data_test.py
@@ -203,10 +203,6 @@
     self._runGetDataTest("mfcc", 30)
 
   @test_util.run_deprecated_v1
-  def testGetDataMicro(self):
-    self._runGetDataTest("micro", 20)
-
-  @test_util.run_deprecated_v1
   def testGetUnprocessedData(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py
index d368fec..1fd6a8e 100644
--- a/tensorflow/examples/speech_commands/models.py
+++ b/tensorflow/examples/speech_commands/models.py
@@ -71,12 +71,9 @@
   elif preprocess == 'mfcc':
     average_window_width = -1
     fingerprint_width = feature_bin_count
-  elif preprocess == 'micro':
-    average_window_width = -1
-    fingerprint_width = feature_bin_count
   else:
-    raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
-                     ' "average", or "micro")' % (preprocess))
+    raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
+                     ' "average")' % (preprocess))
   fingerprint_size = fingerprint_width * spectrogram_length
   return {
       'desired_samples': desired_samples,
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index 43a399b..f6e39b0 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -446,7 +446,7 @@
       '--preprocess',
       type=str,
       default='mfcc',
-      help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
+      help='Spectrogram processing mode. Can be "mfcc" or "average"')
 
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/wav_to_features.py b/tensorflow/examples/speech_commands/wav_to_features.py
index d7f2446..e6c8f45c 100644
--- a/tensorflow/examples/speech_commands/wav_to_features.py
+++ b/tensorflow/examples/speech_commands/wav_to_features.py
@@ -56,7 +56,7 @@
     window_stride_ms: How far to move in time between spectogram timeslices.
     feature_bin_count: How many bins to use for the feature fingerprint.
     quantize: Whether to train the model for eight-bit deployment.
-    preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
+    preprocess: Spectrogram processing mode. Can be "mfcc" or "average".
     input_wav: Path to the audio WAV file to read.
     output_c_file: Where to save the generated C source file.
   """
@@ -86,15 +86,14 @@
     f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
     f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
     if quantize:
-      f.write(' * --quantize=1 \\\n')
+      f.write(' * --quantize \\\n')
     f.write(' * --preprocess="%s" \\\n' % preprocess)
     f.write(' * --input_wav="%s" \\\n' % input_wav)
     f.write(' * --output_c_file="%s" \\\n' % output_c_file)
     f.write(' */\n\n')
-    f.write('const int g_%s_width = %d;\n' %
-            (variable_base, model_settings['fingerprint_width']))
-    f.write('const int g_%s_height = %d;\n' %
-            (variable_base, model_settings['spectrogram_length']))
+    f.write('const int g_%s_width = %d;\n' % (variable_base, features.shape[2]))
+    f.write(
+        'const int g_%s_height = %d;\n' % (variable_base, features.shape[1]))
     if quantize:
       features_min, features_max = input_data.get_features_range(model_settings)
       f.write('const unsigned char g_%s_data[] = {' % variable_base)
@@ -109,7 +108,7 @@
           quantized_value = 255
         if i == 0:
           f.write('\n  ')
-        f.write('%d, ' % (quantized_value))
+        f.write('%d, ' % quantized_value)
         i = (i + 1) % 10
     else:
       f.write('const float g_%s_data[] = {\n' % variable_base)
@@ -169,7 +168,7 @@
       '--preprocess',
       type=str,
       default='mfcc',
-      help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
+      help='Spectrogram processing mode. Can be "mfcc" or "average"')
   parser.add_argument(
       '--input_wav',
       type=str,
diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py
index 18e0e63..6234490 100644
--- a/tensorflow/examples/speech_commands/wav_to_features_test.py
+++ b/tensorflow/examples/speech_commands/wav_to_features_test.py
@@ -66,22 +66,6 @@
       content = f.read()
       self.assertTrue(b"const unsigned char g_input_data" in content)
 
-  @test_util.run_deprecated_v1
-  def testWavToFeaturesMicro(self):
-    tmp_dir = self.get_temp_dir()
-    wav_dir = os.path.join(tmp_dir, "wavs")
-    os.mkdir(wav_dir)
-    self._saveWavFolders(wav_dir, ["a", "b", "c"], 100)
-    input_file_path = os.path.join(tmp_dir, "input.wav")
-    output_file_path = os.path.join(tmp_dir, "output.c")
-    wav_data = self._getWavData()
-    self._saveTestWavFile(input_file_path, wav_data)
-    wav_to_features.wav_to_features(16000, 1000, 10, 10, 40, True, "micro",
-                                    input_file_path, output_file_path)
-    with open(output_file_path, "rb") as f:
-      content = f.read()
-      self.assertIn(b"const unsigned char g_input_data", content)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index aad78b1..e4383d1 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -13,6 +13,7 @@
     srcs = [
         "cnn_mnist.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 8e723f7..7508a9f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -38,6 +38,61 @@
 	return list, start + size, nil
 }
 
+// Generates fingerprint values.
+//
+// Generates fingerprint values of `data`.
+//
+// Fingerprint op considers the first dimension of `data` as the batch dimension,
+// and `output[i]` contains the fingerprint value generated from contents in
+// `data[i, ...]` for all `i`.
+//
+// Fingerprint op writes fingerprint values as byte arrays. For example, the
+// default method `farmhash64` generates a 64-bit fingerprint value at a time.
+// This 8-byte value is written out as an `uint8` array of size 8, in little-endian
+// order.
+//
+// For example, suppose that `data` has data type `DT_INT32` and shape (2, 3, 4),
+// and that the fingerprint method is `farmhash64`. In this case, the output shape
+// is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the size of
+// each fingerprint value in bytes. `output[0, :]` is generated from 12 integers in
+// `data[0, :, :]` and similarly `output[1, :]` is generated from other 12 integers
+// in `data[1, :, :]`.
+//
+// Note that this op fingerprints the raw underlying buffer, and it does not
+// fingerprint Tensor's metadata such as data type and/or shape. For example, the
+// fingerprint values are invariant under reshapes and bitcasts as long as the
+// batch dimension remains the same:
+//
+// ```
+// Fingerprint(data) == Fingerprint(Reshape(data, ...))
+// Fingerprint(data) == Fingerprint(Bitcast(data, ...))
+// ```
+//
+// For string data, one should expect `Fingerprint(data) !=
+// Fingerprint(ReduceJoin(data))` in general.
+//
+// Arguments:
+//	data: Must have rank 1 or higher.
+//	method: Fingerprint method used by this op. Currently available method is
+// `farmhash::fingerprint64`.
+//
+// Returns A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals
+// `data`'s first dimension, and the second dimension size depends on the
+// fingerprint algorithm.
+func Fingerprint(scope *Scope, data tf.Output, method tf.Output) (fingerprint tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fingerprint",
+		Input: []tf.Input{
+			data, method,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
 type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
 
@@ -96,109 +151,43 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
 // If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
 	return func(m optionalAttr) {
 		m["num_bits"] = value
 	}
 }
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
 // If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
 	return func(m optionalAttr) {
 		m["narrow_range"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
-//
-// and `max` to 'outputs' tensor of same shape as `inputs`.
-//
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
-//
-// Before quantization, `min` and `max` values are adjusted with the following
-// logic.
-// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values,
-// the behavior can be unexpected:
-// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`.
-// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`.
-// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `,
-// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`.
-//
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
-		Input: []tf.Input{
-			inputs, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`. Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`. Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -207,86 +196,14 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "FakeQuantWithMinMaxVarsGradient",
 		Input: []tf.Input{
-			gradients, inputs,
+			gradients, inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
-type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
-//
-// Attributes `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
-//
-// Before quantization, `min` and `max` values are adjusted with the following
-// logic.
-// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values,
-// the behavior can be unexpected:
-// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`.
-// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`.
-// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `,
-// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`.
-//
-// Quantization is called fake since the output is still in floating point.
-func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgs",
-		Input: []tf.Input{
-			inputs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Applies sparse addition to `input` using individual values or slices
@@ -348,93 +265,6 @@
 	return op.Output(0)
 }
 
-// Subtracts sparse `updates` from an existing tensor according to `indices`.
-//
-// This operation creates a new tensor by subtracting sparse `updates` from the
-// passed in `tensor`.
-// This operation is very similar to `tf.scatter_nd_sub`, except that the updates
-// are subtracted from an existing tensor (as opposed to a variable). If the memory
-// for the existing tensor cannot be re-used, a copy is made and updated.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of tensor_scatter_sub is to subtract individual elements
-// from a tensor by index. For example, say we want to insert 4 scattered elements
-// in a rank-1 tensor with 8 elements.
-//
-// In Python, this scatter subtract operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     tensor = tf.ones([8], dtype=tf.int32)
-//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [1, -10, 1, -9, -8, 1, 1, -11]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// In Python, this scatter add operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     tensor = tf.ones([4, 4, 4])
-//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
-//      [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, the index is ignored.
-//
-// Arguments:
-//	tensor: Tensor to copy/update.
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//
-// Returns A new tensor copied from tensor and updates subtracted according to the indices.
-func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorScatterSub",
-		Input: []tf.Input{
-			tensor, indices, updates,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Scatter `updates` into an existing tensor according to `indices`.
 //
 // This operation creates a new tensor by applying sparse `updates` to the passed
@@ -534,152 +364,70 @@
 	return op.Output(0)
 }
 
-// Scatter `updates` into a new tensor according to `indices`.
-//
-// Creates a new tensor by applying sparse `updates` to individual values or
-// slices within a tensor (initially zero for numeric, empty for string) of
-// the given `shape` according to indices.  This operator is the inverse of the
-// `tf.gather_nd` operator which extracts values or slices from a given tensor.
-//
-// This operation is similar to tensor_scatter_add, except that the tensor is
-// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
-// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
-//
-// If `indices` contains duplicates, then their updates are accumulated (summed).
-//
-// **WARNING**: The order in which updates are applied is nondeterministic, so the
-// output will be nondeterministic if `indices` contains duplicates -- because
-// of some numerical approximation issues, numbers summed in different order
-// may yield different results.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of scatter is to insert individual elements in a tensor by
-// index. For example, say we want to insert 4 scattered elements in a rank-1
-// tensor with 8 elements.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     shape = tf.constant([8])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [0, 11, 0, 10, 9, 0, 0, 12]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     shape = tf.constant([4, 4, 4])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, the index is ignored.
-//
-// Arguments:
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//	shape: 1-D. The shape of the resulting tensor.
-//
-// Returns A new tensor with the given shape and updates applied according
-// to the indices.
-func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ScatterNd",
-		Input: []tf.Input{
-			indices, updates, shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
+type QuantizedInstanceNormAttr func(optionalAttr)
 
-// LowerBoundAttr is an optional argument to LowerBound.
-type LowerBoundAttr func(optionalAttr)
-
-// LowerBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func LowerBoundOutType(value tf.DataType) LowerBoundAttr {
+// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+//
+// value: If True, `given_y_min`
+// and `given_y_max` are used as the output range. Otherwise,
+// the implementation computes the output range.
+// If not specified, defaults to false
+func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["output_range_given"] = value
 	}
 }
 
-// Applies lower_bound(sorted_search_values, values) along each row.
+// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
 //
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='left')`.
+// value: Output in `y_min` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_min"] = value
+	}
+}
+
+// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
 //
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
+// value: Output in `y_max` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_max"] = value
+	}
+}
+
+// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
 //
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
+// value: A small float number to avoid dividing by 0.
+// If not specified, defaults to 1e-05
+func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["variance_epsilon"] = value
+	}
+}
+
+// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
 //
-//   result = LowerBound(sorted_sequence, values)
-//
-//   result == [[1, 2, 2],
-//              [0, 1, 5]]
+// value: Minimum value of `y_max - y_min`
+// If not specified, defaults to 0.001
+func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["min_separation"] = value
+	}
+}
+
+// Quantized Instance normalization.
 //
 // Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
+//	x: A 4D input Tensor.
+//	x_min: The value represented by the lowest quantized input.
+//	x_max: The value represented by the highest quantized input.
 //
-// Returns A `Tensor` with the same shape as `values`.  It contains the first scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func LowerBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...LowerBoundAttr) (output tf.Output) {
+// Returns A 4D Tensor. The value represented by the lowest quantized output. The value represented by the highest quantized output.
+func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -688,62 +436,108 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LowerBound",
+		Type: "QuantizedInstanceNorm",
 		Input: []tf.Input{
-			sorted_inputs, values,
+			x, x_min, x_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// Concatenates quantized tensors along one dimension.
+//
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes. The float value that the minimum quantized output value represents. The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConcat",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
 
-// DequantizeMode sets the optional mode attribute to value.
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
+
+// QuantizeV2Mode sets the optional mode attribute to value.
 // If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
 		m["mode"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
 // [min_range, max_range] are scalar floats that specify the range for
 // the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
 // In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
 // ```
-// if T == qint8: in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
 // ```
+//
 // here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
 // *MIN_COMBINED Mode Example*
 //
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract 128 from each value prior to casting, so that the range of values aligns
+// with the range of qint8.
 //
 // If the mode is 'MIN_FIRST', then this approach is used:
 //
-// ```c++
+// ```
 // num_discrete_values = 1 << (# of bits in T)
 // range_adjust = num_discrete_values / (num_discrete_values - 1)
 // range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
 // ```
 //
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
 // *SCALED mode Example*
 //
 // `SCALED` mode matches the quantization approach used in
@@ -756,6 +550,7 @@
 //
 // We first find the range of values in our tensor. The
 // range we use is always centered on 0, so we find m such that
+//
 // ```c++
 //   m = max(abs(input_min), abs(input_max))
 // ```
@@ -764,6 +559,7 @@
 //
 // Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
 // If T is signed, this is
+//
 // ```
 //   num_bits = sizeof(T) * 8
 //   [min_fixed, max_fixed] =
@@ -771,41 +567,56 @@
 // ```
 //
 // Otherwise, if T is unsigned, the fixed-point range is
+//
 // ```
 //   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 // ```
 //
 // From this we compute our scaling factor, s:
+//
 // ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
+//   s = (max_fixed - min_fixed) / (2 * m)
 // ```
 //
-// Now we can dequantize the elements of our tensor:
+// Now we can quantize the elements of our tensor:
+//
 // ```c++
-// result = input * s
+// result = round(input * s)
 // ```
 //
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
 // Arguments:
 //
 //	min_range: The minimum scalar value possibly produced for the input.
 //	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+//
+//
+// Returns The quantized data produced from the float input. The actual minimum scalar value used for the output. The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
 			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
@@ -983,215 +794,6 @@
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
-
-// OneHotAxis sets the optional axis attribute to value.
-//
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
-//
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-// ```
-// output =
-//   [5.0 0.0 0.0]  // one_hot(0)
-//   [0.0 0.0 5.0]  // one_hot(2)
-//   [0.0 0.0 0.0]  // one_hot(-1)
-//   [0.0 5.0 0.0]  // one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-// ```
-// output =
-//   [0.0 3.0 3.0 3.0]
-//   [3.0 3.0 3.0 0.0]
-//   [3.0 3.0 3.0 3.0]
-//   [3.0 0.0 3.0 3.0]
-// //  ^                one_hot(0)
-// //      ^            one_hot(2)
-// //          ^        one_hot(-1)
-// //              ^    one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-// ```
-// output =
-//   [
-//     [1.0, 0.0, 0.0]  // one_hot(0)
-//     [0.0, 0.0, 1.0]  // one_hot(2)
-//   ][
-//     [0.0, 1.0, 0.0]  // one_hot(1)
-//     [0.0, 0.0, 0.0]  // one_hot(-1)
-//   ]
-// ```
-//
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Extract `patches` from `input` and put them in the "depth" output dimension. 3D extension of `extract_image_patches`.
-//
-// Arguments:
-//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `input`.
-//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
-// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
-//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
-// ```
-//
-// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
-// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
-// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
-// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
-// are the dimensions of the output patches.
-func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractVolumePatches",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
-//
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
-		Input: []tf.Input{
-			images,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -1317,104 +919,118 @@
 	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
+// SpaceToDepthAttr is an optional argument to SpaceToDepth.
+type SpaceToDepthAttr func(optionalAttr)
+
+// SpaceToDepthDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// SpaceToDepth for tensors of type T.
 //
-// This is a legacy version of the more general BatchToSpaceND.
+// Rearranges blocks of spatial data, into depth. More specifically,
+// this op outputs a copy of the input tensor where values from the `height`
+// and `width` dimensions are moved to the `depth` dimension.
+// The attr `block_size` indicates the input block size.
 //
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
+//   * Non-overlapping blocks of size `block_size x block_size` are rearranged
+//     into depth at each location.
+//   * The depth of the output tensor is `block_size * block_size * input_depth`.
+//   * The Y, X coordinates within each block of the input become the high order
+//     component of the output channel index.
+//   * The input tensor's height and width must be divisible by block_size.
 //
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
 //
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
+//                         within the output image, bX, bY means coordinates
+//                         within the input block, iC means input channels).
+//      The output would be a transpose to the following layout:
+//      n,oY,oX,bY,bX,iC
 //
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
 //
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
+// block_size = 2:
 //
 // ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// x = [[[[1], [2]],
+//       [[3], [4]]]]
 // ```
 //
-// The output tensor has shape `[1, 2, 2, 1]` and value:
+// This operation will output a tensor of shape `[1, 1, 1, 4]`:
 //
 // ```
-// x = [[[[1], [2]], [[3], [4]]]]
+// [[[[1, 2, 3, 4]]]]
 // ```
 //
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
+// the corresponding output will have a single element (i.e. width and height are
+// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
+// The output element shape is `[1, 1, 4]`.
 //
-// ```
-// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
 //
 // ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+// This operation, for block_size of 2, will return the following tensor of shape
+// `[1, 1, 1, 12]`
 //
 // ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
+// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 // ```
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
 //
 // ```
-// x = [[[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]]
+// x = [[[[1],   [2],  [5],  [6]],
+//       [[3],   [4],  [7],  [8]],
+//       [[9],  [10], [13],  [14]],
+//       [[11], [12], [15],  [16]]]]
 // ```
 //
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+// the operator will return the following tensor of shape `[1 2 2 4]`:
 //
 // ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// x = [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
 // ```
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+//	block_size: The size of the spatial block.
+func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"block_size": block_size}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "SpaceToDepth",
 		Input: []tf.Input{
-			input, crops,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -1617,233 +1233,61 @@
 	return op.Output(0), op.Output(1)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeAxis sets the optional axis attribute to value.
+// Pads a tensor with zeros.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// This operation pads `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
-	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
-	}
-}
-
-// Removes dimensions of size 1 from the shape of a tensor.
+// The padded size of each dimension D of the output is:
 //
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
 // For example:
 //
 // ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
 // ```
 //
-// Or, to remove specific size 1 dimensions:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
-//
-// Arguments:
-//	input: The `input` to squeeze.
-//
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Squeeze",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
-// zero; if you specify a negative number for `axis` it is counted backward from
-// the end.
-//
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
-//
-// Other examples:
-//
-// ```
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
-//
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
-//
-// This operation requires that:
-//
-// `-1-input.dims() <= dim <= input.dims()`
-//
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
-//
-// Arguments:
-//
-//	axis: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`. Must be in the range
-// `[-rank(input) - 1, rank(input)]`.
-//
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
-		Input: []tf.Input{
-			input, axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A placeholder op that passes through `input` when its output is not fed.
-//
-// Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
-//
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A placeholder op for a value that will be fed into the computation.
-//
-// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor. The shape can be any partially-specified
-// shape.  To be unconstrained, pass in a shape with unknown rank.
-//
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "PlaceholderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
-//
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
-//
-// Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
-//
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode}
-	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
+		Type: "Pad",
 		Input: []tf.Input{
 			input, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
+// Return the shape of s0 op s1 with broadcast.
 //
-// This is typically used by gradient computations for a broadcasting operation.
-func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastGradientArgs",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
 			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
 // Returns locations of nonzero / true values in a tensor.
@@ -1942,85 +1386,60 @@
 	return op.Output(0)
 }
 
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
-//
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tile",
-		Input: []tf.Input{
-			input, multiples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
 // If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
 		m["begin_mask"] = value
 	}
 }
 
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
 // If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
 		m["end_mask"] = value
 	}
 }
 
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
 // If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
 		m["ellipsis_mask"] = value
 	}
 }
 
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
 // If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
 		m["new_axis_mask"] = value
 	}
 }
 
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
 // If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
 		m["shrink_axis_mask"] = value
 	}
 }
 
-// Returns the gradient of `StridedSlice`.
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin`, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2029,9 +1448,201 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			shape, begin, end, strides, dy,
+			ref, begin, end, strides, value,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StridedSliceAttr is an optional argument to StridedSlice.
+type StridedSliceAttr func(optionalAttr)
+
+// StridedSliceBeginMask sets the optional begin_mask attribute to value.
+//
+// value: a bitmask where a bit i being 1 means to ignore the begin
+// value and instead use the largest interval possible. At runtime
+// begin[i] will be replaced with `[0, n-1)` if `stride[i] > 0` or
+// `[-1, n-1]` if `stride[i] < 0`
+// If not specified, defaults to 0
+func StridedSliceBeginMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceEndMask sets the optional end_mask attribute to value.
+//
+// value: analogous to `begin_mask`
+// If not specified, defaults to 0
+func StridedSliceEndMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
+//
+// value: a bitmask where bit `i` being 1 means the `i`th
+// position is actually an ellipsis. One bit at most can be 1.
+// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+// implicitly creates as many range specifications as necessary to fully
+// specify the sliced range for every dimension. For example for a 4-dimensional
+// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+// If not specified, defaults to 0
+func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` being 1 means the `i`th
+// specification creates a new shape 1 dimension. For example
+// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+// If not specified, defaults to 0
+func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` implies that the `i`th
+// specification should shrink the dimensionality. begin and end
+// must imply a slice of size 1 in the dimension. For example in
+// python one might do `foo[:, 3, :]` which would result in
+// `shrink_axis_mask` being 2.
+// If not specified, defaults to 0
+func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Return a strided slice from `input`.
+//
+// Note, most python users will want to use the Python `Tensor.__getitem__`
+// or `Variable.__getitem__` rather than this op directly.
+//
+// The goal of this op is to produce a new tensor with a subset of
+// the elements from the `n` dimensional `input` tensor. The subset is chosen using
+// a sequence of `m` sparse range specifications encoded into the arguments
+// of this function. Note, in some cases
+// `m` could be equal to `n`, but this need not be the case. Each
+// range specification entry can be one of the following:
+//
+// - An ellipsis (...). Ellipses are used to imply zero or more
+//   dimensions of full-dimension selection and are produced using
+//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
+//
+// - A new axis. This is used to insert a new shape=1 dimension and is
+//   produced using `new_axis_mask`. For example, `foo[tf.newaxis, ...]` where
+//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
+//
+//
+// - A range `begin:end:stride`. This is used to specify how much to choose from
+//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+//   which represents the index of the first value to select while `end` represents
+//   the index of the last value to select. The number of values selected in each
+//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
+//   the second to last. `begin_mask` controls whether to replace the explicitly
+//   given `begin` with an implicit effective value of `0` if `stride > 0` and
+//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+//   required to create the largest open interval. For example, given a shape
+//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+//   first dimension of a tensor while dropping the last element (in the original
+//   order). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[3,2,1]`.
+//
+// - A single index. This is used to keep only elements that have a given
+//   index. For example, `foo[2, :]` on a shape `(5,6)` tensor produces a
+//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
+//   `shrink_axis_mask`.
+//
+// Each conceptual range specification is encoded in the op's argument. This
+// encoding is best understood by considering a non-trivial example. In
+// particular,
+// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+//
+// ```
+// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+// end = [2, 4, x, x, -3, x]
+// strides = [1, 1, x, x, -1, 1]
+// begin_mask = 1<<4 | 1 << 5 = 48
+// end_mask = 1<<5 = 32
+// ellipsis_mask = 1<<3 = 8
+// new_axis_mask = 1<<2 = 4
+// shrink_axis_mask = 1<<0 = 1
+// ```
+//
+// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+// the slice becomes (2, 1, 5, 5, 2, 5).
+// Let us walk step by step through each argument specification.
+//
+// 1.  The first argument in the example slice is turned into `begin = 1` and
+// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+// also set the appropriate bit in `shrink_axis_mask`.
+//
+// 2. `2:4` contributes 2, 4, 1 to begin, end, and stride. All masks have
+// zero bits contributed.
+//
+// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+// in the final shape. Dummy values are contributed to begin,
+// end and stride, while the new_axis_mask bit is set.
+//
+// 4. `...` grabs the full ranges from as many dimensions as needed to
+// fully specify a slice for every dimension of the input shape.
+//
+// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+// with a dimension that has shape `s` is converted to a positive index
+// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+// is done internally so begin, end and strides receive x, -3, and -1.
+// The appropriate begin_mask bit is set to indicate the start range is the
+// full range (ignoring the x).
+//
+// 6. `:` indicates that the entire contents of the corresponding dimension
+// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+// `end_mask` are also set.
+//
+// *Requirements*:
+//   `0 != strides[i] for i in [0, m)`
+//   `ellipsis_mask must be a power of two (only one ellipsis)`
+//
+// Arguments:
+//
+//	begin: `begin[k]` specifies the offset into the `k`th range specification.
+// The exact dimension this corresponds to will be determined by context.
+// Out-of-bounds values will be silently clamped. If the `k`th bit of
+// `begin_mask` is set then `begin[k]` is ignored and the full range of the
+// appropriate dimension is used instead. Negative values cause indexing
+// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+//	end: `end[i]` is like `begin` with the exception that `end_mask` is
+// used to determine full ranges.
+//	strides: `strides[i]` specifies the increment in the `i`th specification
+// after extracting a given element. Negative indices will reverse
+// the original order. Out of range values are
+// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StridedSlice",
+		Input: []tf.Input{
+			input, begin, end, strides,
 		},
 		Attrs: attrs,
 	}
@@ -2039,37 +1650,131 @@
 	return op.Output(0)
 }
 
-// Return a slice from 'input'.
+// Returns the rank of a tensor.
 //
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
+// This operation returns an integer representing the rank of `input`.
 //
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+// For example:
 //
-// Arguments:
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
 //
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Slice",
+		Type: "Rank",
 		Input: []tf.Input{
-			input, begin, size,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+//
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
+	}
+}
+
+// Reverses variable length slices.
+//
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 5:, :, ...] = input[3, 5:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[5:, :, 3, :, ...] = input[5:, :, 3, :, ...]
+// ```
+//
+// Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
+//
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ReverseSequence",
+		Input: []tf.Input{
+			input, seq_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Ensures that the tensor's shape matches the expected shape.
 //
 // Raises an error if the input tensor's shape does not match the specified shape.
@@ -2183,6 +1888,60 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,size(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
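+// Returns three 1-D tensors: `y` (the unique elements), `idx` (the index of each value of `x` in `y`), and `count` (the count of each element of `y` in `x`).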
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCounts",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // UniqueV2Attr is an optional argument to UniqueV2.
 type UniqueV2Attr func(optionalAttr)
 
@@ -2339,59 +2098,459 @@
 	return op.Output(0)
 }
 
-// Checks a tensor for NaN and Inf values.
+// Reshapes a tensor.
 //
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+//
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
 //
 // Arguments:
 //
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
+		Type: "Reshape",
 		Input: []tf.Input{
-			tensor,
+			tensor, shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
+// SpaceToBatch for N-D tensors of type T.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+// grid of blocks of shape `block_shape`, and interleaves these blocks with the
+// "batch" dimension (0) such that in the output, the spatial dimensions
+// `[1, ..., M]` correspond to the position within the grid, and the batch
+// dimension combines both the position within a spatial block and the original
+// batch position.  Prior to division into blocks, the spatial dimensions of the
+// input are optionally zero padded according to `paddings`.  See below for a
+// precise description.
+//
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has `M` dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+//    input according to `paddings` to produce `padded` of shape `padded_shape`.
+//
+// 2. Reshape `padded` to `reshaped_padded` of shape:
+//
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//        block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1],
+//       block_shape[M-1]] +
+//      remaining_shape
+//
+// 3. Permute dimensions of `reshaped_padded` to produce
+//    `permuted_reshaped_padded` of shape:
+//
+//      block_shape +
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+//    dimension, producing an output tensor of shape:
+//
+//      [batch * prod(block_shape)] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 1]` and value:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+//     paddings = `[[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 3, 1]` and value:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SpaceToBatchND",
+		Input: []tf.Input{
+			input, block_shape, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a list of tensors with the same shapes and contents as the input
+//
+// tensors.
+//
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
+}
+
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
+//
+// `indices` is a K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
+//
+//     output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]]
+//
+// Whereas in `tf.gather` `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
+//
+//     indices.shape[-1] <= params.rank
+//
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] == params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
+//
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// Some examples below.
+//
+// Simple indexing into a matrix:
+//
+// ```python
+//     indices = [[0, 0], [1, 1]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = ['a', 'd']
+// ```
+//
+// Slice indexing into a matrix:
+//
+// ```python
+//     indices = [[1], [0]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['c', 'd'], ['a', 'b']]
+// ```
+//
+// Indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+//
+//
+//     indices = [[0, 1], [1, 0]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['c0', 'd0'], ['a1', 'b1']]
+//
+//
+//     indices = [[0, 0, 1], [1, 0, 1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = ['b0', 'b1']
+// ```
+//
+// Batched indexing into a matrix:
+//
+// ```python
+//     indices = [[[0, 0]], [[0, 1]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['a'], ['b']]
+// ```
+//
+// Batched slice indexing into a matrix:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [[['c', 'd']], [['a', 'b']]]
+// ```
+//
+// Batched indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
+//               [[['a0', 'b0'], ['c0', 'd0']]]]
+//
+//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['c0', 'd0'], ['a1', 'b1']],
+//               [['a0', 'b0'], ['c1', 'd1']]]
+//
+//
+//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['b0', 'b1'], ['d0', 'c1']]
+// ```
+//
+// See also `tf.gather` and `tf.batch_gather`.
+//
+// Arguments:
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GatherNd",
+		Input: []tf.Input{
+			params, indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
+type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["min"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// Attributes `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// Before quantization, `min` and `max` values are adjusted with the following
+// logic.
+// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values,
+// the behavior can be unexpected:
+// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`.
+// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`.
+// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `,
+// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`.
 //
-// Arguments:
-//	input: any tensor.
-//
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Quantization is called fake since the output is still in floating point.
+func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2400,9 +2559,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "FakeQuantWithMinMaxArgs",
 		Input: []tf.Input{
-			input,
+			inputs,
 		},
 		Attrs: attrs,
 	}
@@ -2410,41 +2569,6 @@
 	return op.Output(0)
 }
 
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
-//
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
-//
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StopGradient",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Gather slices from `params` axis `axis` according to `indices`.
 //
 // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
@@ -2555,89 +2679,92 @@
 	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
+
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// This operation creates a tensor of shape `dims` and fills it with `value`.
+// value: boolean (if true, edit distances are normalized by length of truth).
 //
-// For example:
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
+	return func(m optionalAttr) {
+		m["normalize"] = value
+	}
+}
+
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
 //
-// `tf.fill` differs from `tf.constant` in a few ways:
-//
-// *   `tf.fill` only supports scalar contents, whereas `tf.constant` supports
-//     Tensor values.
-// *   `tf.fill` creates an Op in the computation graph that constructs the actual
-//     Tensor value at runtime. This is in contrast to `tf.constant` which embeds
-//     the entire Tensor into the graph with a `Const` node.
-// *   Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes
-//     based on other runtime Tensors, unlike `tf.constant`.
+// The inputs are:
 //
 // Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: The shape of the truth list SparseTensor. This is an R-length vector.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Fill",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			dims, value,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
-//
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
-}
-
 // Reverses specific dimensions of a tensor.
 //
 // NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
@@ -2707,66 +2834,64 @@
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
+// to zero.
 //
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// `rank(tensor) = size(dims)`
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
 // For example:
 //
 // ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
 //
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
 //
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
 //
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
 // ```
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			tensor, dims,
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -2823,40 +2948,6 @@
 	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-//
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-//
-// Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
-		Input: []tf.Input{
-			input, diagonal,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns a batched diagonal tensor with a given batched diagonal values.
 //
 // Given a `diagonal`, this operation returns a tensor with the `diagonal` and
@@ -2943,18 +3034,100 @@
 	return op.Output(0)
 }
 
-// Returns a constant tensor on the host. Only for writing C++ tests.
+// Returns a diagonal tensor with the given diagonal values.
+//
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+//
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
 //
 // Arguments:
-//	value: Attr `value` is the tensor to return.
-//
-func HostConst(scope *Scope, value tf.Tensor, dtype tf.DataType) (output tf.Output) {
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value": value, "dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "HostConst",
+		Type: "Diag",
+		Input: []tf.Input{
+			diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OnesLike",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
+//
+// The runtime is then free to make optimizations based on this.
+//
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
+//
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GuaranteeConst",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
 
 		Attrs: attrs,
 	}
@@ -2965,6 +3138,45 @@
 // Splits a tensor into `num_split` tensors along one dimension.
 //
 // Arguments:
+//	value: The tensor to split.
+//	size_splits: list containing the sizes of each output tensor along the split
+// dimension. Must sum to the dimension of value along `axis`.
+// Can contain one -1 indicating that dimension is to be inferred.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//
+//
+// Returns Tensors whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `size_splits[i]`.
+func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SplitV",
+		Input: []tf.Input{
+			value, size_splits, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("SplitV", err)
+		return
+	}
+	return output
+}
+
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
 //	axis: 0-D.  The dimension along which to split.  Must be in the range
 // `[-rank(value), rank(value))`.
 //	value: The tensor to split.
@@ -2999,43 +3211,46 @@
 	return output
 }
 
-// Broadcast an array for a compatible shape.
+// Computes offsets of concat inputs within its output.
 //
-// Broadcasting is the process of making arrays to have compatible shapes
-// for arithmetic operations. Two shapes are compatible if for each
-// dimension pair they are either equal or one of them is one. When trying
-// to broadcast a Tensor to a shape, it starts with the trailing dimensions,
-// and works its way forward.
+// For example:
 //
-// For example,
 // ```
-// >>> x = tf.constant([1, 2, 3])
-// >>> y = tf.broadcast_to(x, [3, 3])
-// >>> sess.run(y)
-// array([[1, 2, 3],
-//        [1, 2, 3],
-//        [1, 2, 3]], dtype=int32)
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(1, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 // ```
-// In the above example, the input Tensor with the shape of `[1, 3]`
-// is broadcasted to output Tensor with shape of `[3, 3]`.
+//
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	input: A Tensor to broadcast.
-//	shape: An 1-D `int` Tensor. The shape of the desired output.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns A Tensor.
-func BroadcastTo(scope *Scope, input tf.Output, shape tf.Output) (output tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastTo",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			input, shape,
+			concat_dim, tf.OutputList(shape),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
 }
 
 // UnpackAttr is an optional argument to Unpack.
@@ -3100,46 +3315,117 @@
 	return output
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
 
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initialize the tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["init"] = value
 	}
 }
 
-// Returns shape of tensors.
+// Creates a tensor with the given shape.
 //
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+// This operation creates a tensor of `shape` and `dtype`.
+//
+// Arguments:
+//	shape: 1-D. Represents the shape of the output tensor.
+//
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShapeN",
+		Type: "Empty",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Subtracts `v` from specified rows of `x`.
+//
+//     Computes y = x; y[i, :] -= v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
+	opspec := tf.OpSpec{
+		Type: "InplaceSub",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	return output
+	opspec := tf.OpSpec{
+		Type: "InplaceAdd",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Updates specified rows with values in `v`.
+//
+//     Computes `x[i, :] = v; return x`.
+//
+// Arguments:
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceUpdate",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // PackAttr is an optional argument to Pack.
@@ -3201,221 +3487,6 @@
 	return op.Output(0)
 }
 
-// Concatenates a list of `N` tensors along the first dimension.
-//
-// The input tensors are all required to have size 1 in the first dimension.
-//
-// For example:
-//
-// ```
-// # 'x' is [[1, 4]]
-// # 'y' is [[2, 5]]
-// # 'z' is [[3, 6]]
-// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// ```
-//
-// The difference between concat and parallel_concat is that concat requires all
-// of the inputs be computed before the operation will begin but doesn't require
-// that the input shapes be known during graph construction.  Parallel concat
-// will copy pieces of the input into the output as they become available, in
-// some situations this can provide a performance benefit.
-//
-// Arguments:
-//	values: Tensors to be concatenated. All must have size 1 in the first dimension
-// and same shape.
-//	shape: the final shape of the result; should be equal to the shapes of any input
-// but with the number of input values in the first dimension.
-//
-// Returns The concatenated tensor.
-func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "ParallelConcat",
-		Input: []tf.Input{
-			tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
-
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
-//
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
-	return func(m optionalAttr) {
-		m["batch_dim"] = value
-	}
-}
-
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
-//
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
-//
-// Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
-//
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
-		Input: []tf.Input{
-			input, seq_lengths,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
-
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
-	}
-}
-
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
-//
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
-	}
-}
-
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
-//
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
-	}
-}
-
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
-//
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
-//
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
-//
-// Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Mfcc",
-		Input: []tf.Input{
-			spectrogram, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
 type AudioSpectrogramAttr func(optionalAttr)
 
@@ -3440,10 +3511,10 @@
 // This op expects to receive audio data as an input, stored as floats in the range
 // -1 to 1, together with a window width in samples, and a stride specifying how
 // far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
+// dimensional output. The first dimension is for the channels in the input, so a
+// stereo audio input would have two here for example. The second dimension is time,
+// with successive frequency slices. The third dimension has an amplitude value for
+// each frequency during that time slice.
 //
 // This means the layout when converted and saved as an image is rotated 90 degrees
 // clockwise from a typical spectrogram. Time is descending down the Y axis, and
@@ -3513,669 +3584,57 @@
 	return op.Output(0)
 }
 
-// DecodeWavAttr is an optional argument to DecodeWav.
-type DecodeWavAttr func(optionalAttr)
+// UnbatchAttr is an optional argument to Unbatch.
+type UnbatchAttr func(optionalAttr)
 
-// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
-//
-// value: Number of sample channels wanted.
-// If not specified, defaults to -1
-func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_channels"] = value
-	}
-}
-
-// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
-//
-// value: Length of audio requested.
-// If not specified, defaults to -1
-func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_samples"] = value
-	}
-}
-
-// Decode a 16-bit PCM WAV file to a float tensor.
-//
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-//
-// When desired_channels is set, if the input contains fewer channels than this
-// then the last channel will be duplicated to give the requested number, else if
-// the input has more channels than requested then the additional channels will be
-// ignored.
-//
-// If desired_samples is set, then the audio will be cropped or padded with zeroes
-// to the requested length.
-//
-// The first output contains a Tensor with the content of the audio samples. The
-// lowest dimension will be the number of channels, and the second will be the
-// number of samples. For example, a ten-sample-long stereo WAV file should give an
-// output shape of [10, 2].
-//
-// Arguments:
-//	contents: The WAV-encoded audio, usually from a file.
-//
-// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
-func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeWav",
-		Input: []tf.Input{
-			contents,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Converts a flat index or array of flat indices into a tuple of
-//
-// coordinate arrays.
-//
-// @compatibility(numpy)
-// Equivalent to np.unravel_index
-// @end_compatibility
-//
-// Arguments:
-//	indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
-// flattened version of an array of dimensions dims.
-//	dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
-// indices.
-//
-// Returns An 2-D (or 1-D if indices is 0-D) tensor where each row has the
-// same shape as the indices array.
-func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnravelIndex",
-		Input: []tf.Input{
-			indices, dims,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise right-shift of `x` and `y`.
-//
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
-//
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RightShift",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Flips all bits elementwise.
-//
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Invert",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generate the bucket boundaries for each feature based on accumulated summaries.
-//
-// An op that returns a list of float tensors for a quantile stream resource. Each
-// tensor is Rank 1 containing bucket boundaries for a single feature.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	num_features: inferred int; number of features to get bucket boundaries for.
-//
-// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
-func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_features": num_features}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries",
-		Input: []tf.Input{
-			quantile_stream_resource_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil {
-		scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err)
-		return
-	}
-	return bucket_boundaries
-}
-
-// BoostedTreesQuantileStreamResourceFlushAttr is an optional argument to BoostedTreesQuantileStreamResourceFlush.
-type BoostedTreesQuantileStreamResourceFlushAttr func(optionalAttr)
-
-// BoostedTreesQuantileStreamResourceFlushGenerateQuantiles sets the optional generate_quantiles attribute to value.
-//
-// value: bool; If True, the output will be the num_quantiles for each stream where the ith
-// entry is the ith quantile of the input with an approximation error of epsilon.
-// Duplicate values may be present.
-// If False, the output will be the points in the histogram that we got which roughly
-// translates to 1/epsilon boundaries and without any duplicates.
-// Default to False.
-// If not specified, defaults to false
-func BoostedTreesQuantileStreamResourceFlushGenerateQuantiles(value bool) BoostedTreesQuantileStreamResourceFlushAttr {
-	return func(m optionalAttr) {
-		m["generate_quantiles"] = value
-	}
-}
-
-// Flush the summaries for a quantile stream resource.
-//
-// An op that flushes the summaries for a quantile stream resource.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	num_buckets: int; approximate number of buckets unless using generate_quantiles.
-//
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resource_handle tf.Output, num_buckets tf.Output, optional ...BoostedTreesQuantileStreamResourceFlushAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceFlush",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, num_buckets,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Makes the summary of quantiles for the batch.
-//
-// An op that takes a list of tensors (one tensor per feature) and outputs the
-// quantile summaries for each tensor.
-//
-// Arguments:
-//	float_values: float; List of Rank 1 Tensors each containing values for a single feature.
-//	example_weights: float; Rank 1 Tensor with weights per instance.
-//	epsilon: float; The required maximum approximation error.
-//
-// Returns float; List of Rank 2 Tensors each containing the quantile summary
-// (value, weight, min_rank, max_rank) of a single feature.
-func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesMakeQuantileSummaries",
-		Input: []tf.Input{
-			tf.OutputList(float_values), example_weights, epsilon,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if summaries, idx, err = makeOutputList(op, idx, "summaries"); err != nil {
-		scope.UpdateErr("BoostedTreesMakeQuantileSummaries", err)
-		return
-	}
-	return summaries
-}
-
-// BoostedTreesCreateQuantileStreamResourceAttr is an optional argument to BoostedTreesCreateQuantileStreamResource.
-type BoostedTreesCreateQuantileStreamResourceAttr func(optionalAttr)
-
-// BoostedTreesCreateQuantileStreamResourceMaxElements sets the optional max_elements attribute to value.
-//
-// value: int; The maximum number of data points that can be fed to the stream.
-// If not specified, defaults to 1099511627776
-func BoostedTreesCreateQuantileStreamResourceMaxElements(value int64) BoostedTreesCreateQuantileStreamResourceAttr {
-	return func(m optionalAttr) {
-		m["max_elements"] = value
-	}
-}
-
-// Create the Resource for Quantile Streams.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource; Handle to quantile stream resource.
-//	epsilon: float; The required approximation error of the stream resource.
-//	num_streams: int; The number of streams managed by the resource that shares the same epsilon.
-//
-// Returns the created operation.
-func BoostedTreesCreateQuantileStreamResource(scope *Scope, quantile_stream_resource_handle tf.Output, epsilon tf.Output, num_streams tf.Output, optional ...BoostedTreesCreateQuantileStreamResourceAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesCreateQuantileStreamResource",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, epsilon, num_streams,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Calculates the prior from the training data (the bias) and fills in the first node with the logits' prior. Returns a boolean indicating whether to continue centering.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	mean_gradients: A tensor with shape=[logits_dimension] with mean of gradients for a first node.
-//	mean_hessians: A tensor with shape=[logits_dimension] mean of hessians for a first node.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//
-// Returns Bool, whether to continue bias centering.
-func BoostedTreesCenterBias(scope *Scope, tree_ensemble_handle tf.Output, mean_gradients tf.Output, mean_hessians tf.Output, l1 tf.Output, l2 tf.Output) (continue_centering tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesCenterBias",
-		Input: []tf.Input{
-			tree_ensemble_handle, mean_gradients, mean_hessians, l1, l2,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Updates the tree ensemble by either adding a layer to the last tree being grown
-//
-// or by starting a new tree.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
-//
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the update to cached logits. It is designed to be used during training.
-// It traverses the trees starting from cached tree id and cached node id and
-// calculates the updates to be pushed to the cache.
-//
-// Arguments:
-//
-//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
-// tree of prediction.
-//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
-// node of prediction.
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Rank 2 Tensor containing logits update (with respect to cached
-// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids.
-func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesTrainingPredict",
-		Input: []tf.Input{
-			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Aggregates the summary of accumulated stats for the batch.
-//
-// The summary stats contains gradients and hessians accumulated for each node, feature dimension id and bucket.
-//
-// Arguments:
-//	node_ids: int32; Rank 1 Tensor containing node ids for each example, shape [batch_size].
-//	gradients: float32; Rank 2 Tensor (shape=[batch_size, logits_dimension]) with gradients for each example.
-//	hessians: float32; Rank 2 Tensor (shape=[batch_size, hessian_dimension]) with hessians for each example.
-//	feature: int32; Rank 2 feature Tensors (shape=[batch_size, feature_dimension]).
-//	max_splits: int; the maximum number of splits possible in the whole tree.
-//	num_buckets: int; equals to the maximum possible value of bucketized feature.
-//
-// Returns output Rank 4 Tensor (shape=[splits, feature_dimension, buckets, logits_dimension + hessian_dimension])
-// containing accumulated stats for each node, feature dimension and bucket.
-func BoostedTreesAggregateStats(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, feature tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesAggregateStats",
-		Input: []tf.Input{
-			node_ids, gradients, hessians, feature,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Makes the summary of accumulated stats for the batch.
-//
-// The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
-//
-// Arguments:
-//	node_ids: int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
-//	gradients: float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
-//	hessians: float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
-//	bucketized_features_list: int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
-//	max_splits: int; the maximum number of splits possible in the whole tree.
-//	num_buckets: int; equals to the maximum possible value of bucketized feature.
-//
-// Returns output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
-func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, bucketized_features_list []tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesMakeStatsSummary",
-		Input: []tf.Input{
-			node_ids, gradients, hessians, tf.OutputList(bucketized_features_list),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserializes a serialized tree ensemble config and replaces current tree
-//
-// ensemble.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
-//
-// Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Calculates gains for each feature and returns the best possible split information for the feature.
-//
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
-//
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
-//
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
-//
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
-//
-// Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
-//
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
-		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
-}
-
-// Checks whether a tree ensemble has been initialized.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble resouce.
-//
-// Returns output boolean on whether it is initialized or not.
-func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsBoostedTreesEnsembleInitialized",
-		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BoostedTreesEnsembleResourceHandleOpAttr is an optional argument to BoostedTreesEnsembleResourceHandleOp.
-type BoostedTreesEnsembleResourceHandleOpAttr func(optionalAttr)
-
-// BoostedTreesEnsembleResourceHandleOpContainer sets the optional container attribute to value.
+// UnbatchContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func BoostedTreesEnsembleResourceHandleOpContainer(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+func UnbatchContainer(value string) UnbatchAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// BoostedTreesEnsembleResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// UnbatchSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func BoostedTreesEnsembleResourceHandleOpSharedName(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+func UnbatchSharedName(value string) UnbatchAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a handle to a BoostedTreesEnsembleResource
-func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTreesEnsembleResourceHandleOpAttr) (resource tf.Output) {
+// Reverses the operation of Batch for a single output Tensor.
+//
+// An instance of Unbatch either receives an empty batched_tensor, in which case it
+// asynchronously waits until the values become available from a concurrently
+// running instance of Unbatch with the same container and shared_name, or receives
+// a non-empty batched_tensor in which case it finalizes all other concurrently
+// running instances and outputs its own element from the batch.
+//
+// batched_tensor: The possibly transformed output of Batch. The size of the first
+//  dimension should remain unchanged by the transformations for the operation to
+//  work.
+// batch_index: The matching batch_index obtained from Batch.
+// id: The id scalar emitted by Batch.
+// unbatched_tensor: The Tensor corresponding to this execution.
+// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+//  batched input tensor associated with a given invocation of the op.
+// container: Container to control resource sharing.
+// shared_name: Instances of Unbatch with the same container and shared_name are
+//  assumed to possibly belong to the same batch. If left empty, the op name will
+//  be used as the shared name.
+func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"timeout_micros": timeout_micros}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesEnsembleResourceHandleOp",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
+		Type: "Unbatch",
 		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Output the logits for the given input data
-//
-// Arguments:
-//	tree_handle: Handle to the tree resource.
-//	dense_features: Rank 2 dense features tensor.
-//	logits_dimension: Scalar, dimension of the logits.
-//
-// Returns The logits predictions from the tree for each instance in the batch.
-func TensorForestTreePredict(scope *Scope, tree_handle tf.Output, dense_features tf.Output, logits_dimension int64) (logits tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreePredict",
-		Input: []tf.Input{
-			tree_handle, dense_features,
+			batched_tensor, batch_index, id,
 		},
 		Attrs: attrs,
 	}
@@ -4183,26 +3642,6 @@
 	return op.Output(0)
 }
 
-// Get the number of nodes in a tree
-//
-// Arguments:
-//	tree_handle: Handle to the tree resource.
-//
-// Returns The size of the tree.
-func TensorForestTreeSize(scope *Scope, tree_handle tf.Output) (tree_size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeSize",
-		Input: []tf.Input{
-			tree_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BatchAttr is an optional argument to Batch.
 type BatchAttr func(optionalAttr)
 
@@ -4314,20 +3753,175 @@
 	return batched_tensors, batch_index, id
 }
 
-// Serializes the tree handle to a proto
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// Arguments:
-//	tree_handle: Handle to the tree resource to be serialized.
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
-// Returns Serialied proto string of the tree resource.
-func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+// If `y` is negative, or greater than or equal to the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeSerialize",
+		Type: "RightShift",
 		Input: []tf.Input{
-			tree_handle,
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise left-shift of `x` and `y`.
+//
+// If `y` is negative, or greater than or equal to the width of `x` in bits the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LeftShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeWavAttr is an optional argument to DecodeWav.
+type DecodeWavAttr func(optionalAttr)
+
+// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
+//
+// value: Number of sample channels wanted.
+// If not specified, defaults to -1
+func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_channels"] = value
+	}
+}
+
+// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
+//
+// value: Length of audio requested.
+// If not specified, defaults to -1
+func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_samples"] = value
+	}
+}
+
+// Decode a 16-bit PCM WAV file to a float tensor.
+//
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+//
+// When desired_channels is set, if the input contains fewer channels than this
+// then the last channel will be duplicated to give the requested number, else if
+// the input has more channels than requested then the additional channels will be
+// ignored.
+//
+// If desired_samples is set, then the audio will be cropped or padded with zeroes
+// to the requested length.
+//
+// The first output contains a Tensor with the content of the audio samples. The
+// lowest dimension will be the number of channels, and the second will be the
+// number of samples. For example, a ten-sample-long stereo WAV file should give an
+// output shape of [10, 2].
+//
+// Arguments:
+//	contents: The WAV-encoded audio, usually from a file.
+//
+// Returns 2-D with shape `[length, channels]`. Scalar holding the sample rate found in the WAV header.
+func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeWav",
+		Input: []tf.Input{
+			contents,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
+//
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
+//
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "PopulationCount",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Flips all bits elementwise.
+//
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Invert",
+		Input: []tf.Input{
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -4358,27 +3952,49 @@
 	return scope.AddOperation(opspec)
 }
 
-// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
-type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+// Checks whether a quantile stream has been initialized.
+//
+// An Op that checks if quantile stream resource is initialized.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource; The reference to quantile stream resource handle.
+//
+// Returns bool; True if the resource is initialized, False otherwise.
+func IsBoostedTreesQuantileStreamResourceInitialized(scope *Scope, quantile_stream_resource_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsBoostedTreesQuantileStreamResourceInitialized",
+		Input: []tf.Input{
+			quantile_stream_resource_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
+// BoostedTreesQuantileStreamResourceHandleOpAttr is an optional argument to BoostedTreesQuantileStreamResourceHandleOp.
+type BoostedTreesQuantileStreamResourceHandleOpAttr func(optionalAttr)
+
+// BoostedTreesQuantileStreamResourceHandleOpContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+func BoostedTreesQuantileStreamResourceHandleOpContainer(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// BoostedTreesQuantileStreamResourceHandleOpSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
+func BoostedTreesQuantileStreamResourceHandleOpSharedName(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a handle to a TensorForestTreeResource
-func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+// Creates a handle to a BoostedTreesQuantileStreamResource.
+func BoostedTreesQuantileStreamResourceHandleOp(scope *Scope, optional ...BoostedTreesQuantileStreamResourceHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4387,7 +4003,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeResourceHandleOp",
+		Type: "BoostedTreesQuantileStreamResourceHandleOp",
 
 		Attrs: attrs,
 	}
@@ -4395,26 +4011,467 @@
 	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// Calculates the prior from the training data (the bias) and fills in the first node with the logits' prior. Returns a boolean indicating whether to continue centering.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	mean_gradients: A tensor with shape=[logits_dimension] holding the mean of the gradients for the first node.
+//	mean_hessians: A tensor with shape=[logits_dimension] holding the mean of the hessians for the first node.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//
+// Returns continue_centering: a Bool, whether to continue bias centering. If scope already holds an error, no operation is added and the zero-value output is returned.
+func BoostedTreesCenterBias(scope *Scope, tree_ensemble_handle tf.Output, mean_gradients tf.Output, mean_hessians tf.Output, l1 tf.Output, l2 tf.Output) (continue_centering tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCenterBias",
+		Input: []tf.Input{
+			tree_ensemble_handle, mean_gradients, mean_hessians, l1, l2,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the update to cached logits. It is designed to be used during training.
+// It traverses the trees starting from the cached tree id and cached node id and
+// calculates the updates to be pushed to the cache.
+//
+// Arguments:
+//
+//	cached_tree_ids: Rank 1 Tensor containing the cached tree ids, i.e. the starting
+// tree of prediction.
+//	cached_node_ids: Rank 1 Tensor containing the cached node ids, i.e. the starting
+// node of prediction.
+//	bucketized_features: A list of rank 1 Tensors containing the bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for the partial logits
+// shape.
+//
+// Returns partial_logits: Rank 2 Tensor containing the logits update (with respect to cached
+// values stored) for each example; tree_ids: Rank 1 Tensor containing new tree ids for each example; node_ids: Rank 1 Tensor containing new node ids in the new tree_ids.
+func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesTrainingPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
+//
+// Arguments:
+//
+//	bucketized_features: A list of rank 1 Tensors containing the bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for the partial logits
+// shape.
+//
+// Returns logits: a rank 2 Tensor containing the logits for each example. If scope already holds an error, no operation is added and the zero-value output is returned.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the reduction indices for computing gradients of s0 op s1 with broadcast.
+//
+// This is typically used by gradient computations for a broadcasting operation. The results r0 and r1 are outputs 0 and 1 of the added "BroadcastGradientArgs" op; if scope already holds an error, no operation is added and zero-value outputs are returned.
+func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BroadcastGradientArgs",
+		Input: []tf.Input{
+			s0, s1,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Aggregates the summary of accumulated stats for the batch.
+//
+// The summary stats contain gradients and hessians accumulated for each node, feature dimension id and bucket.
+//
+// Arguments:
+//	node_ids: int32; Rank 1 Tensor containing node ids for each example, shape [batch_size].
+//	gradients: float32; Rank 2 Tensor (shape=[batch_size, logits_dimension]) with gradients for each example.
+//	hessians: float32; Rank 2 Tensor (shape=[batch_size, hessian_dimension]) with hessians for each example.
+//	feature: int32; Rank 2 feature Tensors (shape=[batch_size, feature_dimension]).
+//	max_splits: int; the maximum number of splits possible in the whole tree.
+//	num_buckets: int; equal to the maximum possible value of a bucketized feature.
+//
+// Returns stats_summary: an output Rank 4 Tensor (shape=[splits, feature_dimension, buckets, logits_dimension + hessian_dimension])
+// containing accumulated stats for each node, feature dimension and bucket.
+func BoostedTreesAggregateStats(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, feature tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesAggregateStats",
+		Input: []tf.Input{
+			node_ids, gradients, hessians, feature,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns stamp_token: stamp token of the tree ensemble resource; num_trees: the number of trees in the tree ensemble resource; num_finalized_trees: the number of trees that were finished successfully; num_attempted_layers: the number of layers we attempted to build (but not necessarily succeeded); last_layer_nodes_range: Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Deserializes a serialized tree ensemble config and replaces the current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation, or nil if scope already holds an error (no operation is added in that case).
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Makes the summary of accumulated stats for the batch.
+//
+// The summary stats contain gradients and hessians accumulated into the corresponding node and bucket for each example.
+//
+// Arguments:
+//	node_ids: int32 Rank 1 Tensor containing the node ids that each example falls into for the requested layer.
+//	gradients: float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+//	hessians: float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+//	bucketized_features_list: int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+//	max_splits: int; the maximum number of splits possible in the whole tree.
+//	num_buckets: int; equal to the maximum possible value of a bucketized feature.
+//
+// Returns stats_summary: an output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of the 4th dimension refers to gradients, and the second to hessians.
+func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, bucketized_features_list []tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesMakeStatsSummary",
+		Input: []tf.Input{
+			node_ids, gradients, hessians, tf.OutputList(bucketized_features_list),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BoostedTreesCalculateBestFeatureSplitAttr is an optional argument to BoostedTreesCalculateBestFeatureSplit. Each value is a function that records one attribute in the optionalAttr it is given.
+type BoostedTreesCalculateBestFeatureSplitAttr func(optionalAttr)
+
+// BoostedTreesCalculateBestFeatureSplitSplitType sets the optional split_type attribute to value.
+//
+// value: A string indicating whether this Op should perform an inequality split or an equality split.
+// If not specified, defaults to "inequality"
+func BoostedTreesCalculateBestFeatureSplitSplitType(value string) BoostedTreesCalculateBestFeatureSplitAttr {
+	return func(m optionalAttr) {
+		m["split_type"] = value
+	}
+}
+
+// Calculates gains for each feature and returns the best possible split information for the feature.
+//
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per feature and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The output shapes are compatible in a way that the first dimension of all tensors is the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as in `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary: A Rank 4 tensor (shape=[max_splits, feature_dims, bucket, stats_dims]) for accumulated stats summary (gradient/hessian) per node, per dimension, per bucket for each feature.
+// The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: minimum average of hessians required in a node for the node to be considered for splitting.
+//	logits_dimension: The dimension of logit, i.e., number of classes.
+//
+// Returns node_ids: A Rank 1 tensor indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes. gains: A Rank 1 tensor indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes. feature_dimensions: A Rank 1 tensor indicating the best feature dimension for each feature to split for certain nodes if the feature is multi-dimension. See above for details like shapes and sizes. thresholds: A Rank 1 tensor indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes. left_node_contribs: A Rank 2 tensor indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes. right_node_contribs: A Rank 2 tensor, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node. split_with_default_directions: A Rank 1 tensor indicating which direction to go if data is missing. See above for details like shapes and sizes.
+func BoostedTreesCalculateBestFeatureSplit(scope *Scope, node_id_range tf.Output, stats_summary tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, logits_dimension int64, optional ...BoostedTreesCalculateBestFeatureSplitAttr) (node_ids tf.Output, gains tf.Output, feature_dimensions tf.Output, thresholds tf.Output, left_node_contribs tf.Output, right_node_contribs tf.Output, split_with_default_directions tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCalculateBestFeatureSplit",
+		Input: []tf.Input{
+			node_id_range, stats_summary, l1, l2, tree_complexity, min_node_weight,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// Checks whether a tree ensemble has been initialized.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resource.
+//
+// Returns is_initialized: an output boolean on whether it is initialized or not. If scope already holds an error, no operation is added and the zero-value output is returned.
+func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsBoostedTreesEnsembleInitialized",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BoostedTreesEnsembleResourceHandleOpAttr is an optional argument to BoostedTreesEnsembleResourceHandleOp. Each value is a function that records one attribute in the optionalAttr it is given.
+type BoostedTreesEnsembleResourceHandleOpAttr func(optionalAttr)
+
+// BoostedTreesEnsembleResourceHandleOpContainer sets the optional container attribute (key "container") to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpContainer(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BoostedTreesEnsembleResourceHandleOpSharedName sets the optional shared_name attribute (key "shared_name") to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpSharedName(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a BoostedTreesEnsembleResource. Each optional attribute function is applied, in order, to the op's attribute map; the result is output 0 of the added op, or the zero-value output if scope already holds an error.
+func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTreesEnsembleResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesEnsembleResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs the logits for the given input data.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//	dense_features: Rank 2 dense features tensor.
+//	logits_dimension: Scalar, dimension of the logits.
+//
+// Returns logits: the logits predictions from the tree for each instance in the batch. If scope already holds an error, no operation is added and the zero-value output is returned.
+func TensorForestTreePredict(scope *Scope, tree_handle tf.Output, dense_features tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreePredict",
+		Input: []tf.Input{
+			tree_handle, dense_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gets the number of nodes in a tree.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//
+// Returns tree_size: the size of the tree. If scope already holds an error, no operation is added and the zero-value output is returned.
+func TensorForestTreeSize(scope *Scope, tree_handle tf.Output) (tree_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Checks whether a tree has been initialized.
+//
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns is_initialized: whether the tree is initialized. If scope already holds an error, no operation is added and the zero-value output is returned.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeIsInitializedOp",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
+
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
 // value: If either seed or seed2 are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
 // value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
+//
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
+//
+// Arguments:
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
+//
+// Returns indices: a vector of indices corresponding to rows of true_candidates; ids: a vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices; weights: a vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComputeAccidentalHits",
+		Input: []tf.Input{
+			true_classes, sampled_candidates,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
@@ -4449,7 +4506,7 @@
 // candidate representing the number of times the candidate is expected
 // to occur in a batch of sampled candidates.  If unique=true, then this is a
 // probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4458,7 +4515,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
 			true_classes,
 		},
@@ -4468,103 +4525,6 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-//     Subtracts `v` into specified rows of `x`.
-//
-//     Computes y = x; y[i, :] -= v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceSub",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
 type UniformCandidateSamplerAttr func(optionalAttr)
 
@@ -4638,89 +4598,23 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
-
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// Checks a tensor for NaN and Inf values.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
-	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
-	}
-}
-
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
-//
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
-//
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
-//
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
-//
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -4728,30 +4622,76 @@
 	return op.Output(0)
 }
 
-// Selects the k nearest centers for each point.
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
+
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// Rows of points are assumed to be input points. Rows of centers are assumed to be
-// the list of candidate centers. For each point, the k centers that have least L2
-// distance to it are computed.
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+	return func(m optionalAttr) {
+		m["old_vocab_size"] = value
+	}
+}
+
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
+//
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `new_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
+//
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, `old_vocab_file = [f1, f0, f3]`,
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
+//
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping.
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	points: Matrix of shape (n, d). Rows are assumed to be input points.
-//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
-//	k: Number of nearest centers to return for each point. If k is larger than m, then
-// only m centers are returned.
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
-// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
-// corresponding center in nearest_center_indices.
-func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary. Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NearestNeighbors",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			points, centers, k,
+			new_vocab_file, old_vocab_file,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
@@ -4784,6 +4724,54 @@
 	return op.Output(0)
 }
 
+// Selects num_to_sample rows of input using the KMeans++ criterion.
+//
+// Rows of points are assumed to be input points. One row is selected at random.
+// Subsequent rows are sampled with probability proportional to the squared L2
+// distance from the nearest row selected thus far till num_to_sample rows have
+// been sampled.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	num_to_sample: Scalar. The number of rows to sample. This value must not be larger than n.
+//	seed: Scalar. Seed for initializing the random number generator.
+//	num_retries_per_sample: Scalar. For each row that is sampled, this parameter
+// specifies the number of additional points to draw from the current
+// distribution before selecting the best. If a negative value is specified, a
+// heuristic is used to sample O(log(num_to_sample)) additional points.
+//
+// Returns Matrix of shape (num_to_sample, d). The sampled rows.
+func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample tf.Output, seed tf.Output, num_retries_per_sample tf.Output) (samples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "KmeansPlusPlusInitialization",
+		Input: []tf.Input{
+			points, num_to_sample, seed, num_retries_per_sample,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Mutually accumulates multiple tensors of identical type and shape.
+func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveGather",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CollectiveReduceAttr is an optional argument to CollectiveReduce.
 type CollectiveReduceAttr func(optionalAttr)
 
@@ -4860,6 +4848,21 @@
 	return scope.AddOperation(opspec)
 }
 
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Forwards the input to the output.
 //
 // This operator represents the loop termination condition used by the
@@ -4903,109 +4906,6 @@
 	return op.Output(0)
 }
 
-// Exits the current frame to its parent frame.
-//
-// Exit makes its input `data` available to the parent frame.
-//
-// Arguments:
-//	data: The tensor to be made available to the parent frame.
-//
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exit",
-		Input: []tf.Input{
-			data,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
-//
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
-	}
-}
-
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
-//
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
-	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
-	}
-}
-
-// Creates or finds a child frame, and makes `data` available to the child frame.
-//
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
-//
-// Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
-//
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Enter",
-		Input: []tf.Input{
-			data,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
-//
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
-//
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
-//
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Merge",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
 type CTCGreedyDecoderAttr func(optionalAttr)
 
@@ -5059,78 +4959,6 @@
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
-
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
-//
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
-	}
-}
-
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
-//
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
-	}
-}
-
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
-//
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
-	}
-}
-
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
-//
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
-//
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
-//
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CTCLoss",
-		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
 type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
 
@@ -5245,247 +5073,6 @@
 	return weights, biases
 }
 
-// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3.
-type CudnnRNNBackpropV3Attr func(optionalAttr)
-
-// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropV3Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNBackpropV3Seed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// CudnnRNNBackpropV3TimeMajor sets the optional time_major attribute to value.
-// If not specified, defaults to true
-func CudnnRNNBackpropV3TimeMajor(value bool) CudnnRNNBackpropV3Attr {
-	return func(m optionalAttr) {
-		m["time_major"] = value
-	}
-}
-
-// Backprop step of CudnnRNNV3.
-//
-// Compute the backprop of both data and weights in a RNN. Takes an extra
-//     "sequence_lengths" input than CudnnRNNBackprop.
-//
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: If time_major is true, this is a 3-D tensor with the shape of
-//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
-//     [batch_size, seq_length, input_size].
-// input_h: If time_major is true, this is a 3-D tensor with the shape of
-//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
-//     is [batch_size, num_layer * dir, num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// sequence_lengths: a vector of lengths of each input sequence.
-// output: If time_major is true, this is a 3-D tensor with the shape of
-//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
-//     shape is [batch_size, seq_length, dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// time_major: Indicates whether the input/output format is time major or batch
-//     major.
-// reserve_space: The same reserve_space produced in the forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackpropV3",
-		Input: []tf.Input{
-			input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
-type CudnnRNNBackpropV2Attr func(optionalAttr)
-
-// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropV2Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNBackpropV2Seed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Backprop step of CudnnRNN.
-//
-// Compute the backprop of both data and weights in a RNN. Takes an extra
-//     "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN
-//     cudnnRNNAlgo_t and cudnnMathType_t.
-//
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in the forward operation.
-// host_reserved: The same host_reserved produced in the forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackpropV2",
-		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
 // CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
 type CudnnRNNBackpropAttr func(optionalAttr)
 
@@ -5597,77 +5184,69 @@
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
-type CudnnRNNV3Attr func(optionalAttr)
+// CudnnRNNV2Attr is an optional argument to CudnnRNNV2.
+type CudnnRNNV2Attr func(optionalAttr)
 
-// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// CudnnRNNV2RnnMode sets the optional rnn_mode attribute to value.
 // If not specified, defaults to "lstm"
-func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+func CudnnRNNV2RnnMode(value string) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["rnn_mode"] = value
 	}
 }
 
-// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// CudnnRNNV2InputMode sets the optional input_mode attribute to value.
 // If not specified, defaults to "linear_input"
-func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+func CudnnRNNV2InputMode(value string) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["input_mode"] = value
 	}
 }
 
-// CudnnRNNV3Direction sets the optional direction attribute to value.
+// CudnnRNNV2Direction sets the optional direction attribute to value.
 // If not specified, defaults to "unidirectional"
-func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+func CudnnRNNV2Direction(value string) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["direction"] = value
 	}
 }
 
-// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// CudnnRNNV2Dropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+func CudnnRNNV2Dropout(value float32) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["dropout"] = value
 	}
 }
 
-// CudnnRNNV3Seed sets the optional seed attribute to value.
+// CudnnRNNV2Seed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+func CudnnRNNV2Seed(value int64) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// CudnnRNNV2Seed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+func CudnnRNNV2Seed2(value int64) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// CudnnRNNV2IsTraining sets the optional is_training attribute to value.
 // If not specified, defaults to true
-func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+func CudnnRNNV2IsTraining(value bool) CudnnRNNV2Attr {
 	return func(m optionalAttr) {
 		m["is_training"] = value
 	}
 }
 
-// CudnnRNNV3TimeMajor sets the optional time_major attribute to value.
-// If not specified, defaults to true
-func CudnnRNNV3TimeMajor(value bool) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["time_major"] = value
-	}
-}
-
 // A RNN backed by cuDNN.
 //
 // Computes the RNN from the input and initial states, with respect to the params
-// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+// buffer. Produces one more output, "host_reserved", than CudnnRNN.
 //
 // rnn_mode: Indicates the type of the RNN model.
 // input_mode: Indicates whether there is a linear projection between the input and
@@ -5679,123 +5258,6 @@
 // dropout: Dropout probability. When set to 0., dropout is disabled.
 // seed: The 1st part of a seed to initialize dropout.
 // seed2: The 2nd part of a seed to initialize dropout.
-// input: If time_major is true, this is a 3-D tensor with the shape of
-//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
-//     [batch_size, seq_length, input_size].
-// input_h: If time_major is true, this is a 3-D tensor with the shape of
-//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
-//     is [batch_size, num_layer * dir, num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// sequence_lengths: a vector of lengths of each input sequence.
-// output: If time_major is true, this is a 3-D tensor with the shape of
-//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
-//     shape is [batch_size, seq_length, dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// time_major: Indicates whether the input/output format is time major or batch
-//     major.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is true.
-func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNV3",
-		Input: []tf.Input{
-			input, input_h, input_c, params, sequence_lengths,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
-
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// A RNN backed by cuDNN.
-//
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
-//
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
 // input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
 // input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
 //     num_units].
@@ -5812,8 +5274,11 @@
 // is_training: Indicates whether this operation is used for inferenece or
 //   training.
 // reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+//   is only produced if is_training is true.
+// host_reserved: An opaque tensor that can be used in backprop calculation. It is
+//   only produced if is_training is true. It is output on host memory rather than
+//   device memory.
+func CudnnRNNV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNV2Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5822,14 +5287,107 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
+		Type: "CudnnRNNV2",
 		Input: []tf.Input{
 			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
+//
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsSize",
+		Input: []tf.Input{
+			num_layers, num_units, input_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // RecordInputAttr is an optional argument to RecordInput.
@@ -5920,47 +5478,49 @@
 	return op.Output(0)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
 
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// OrderedMapClearContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5969,55 +5529,57 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
+		Type: "OrderedMapClear",
 
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
-type OrderedMapSizeAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// OrderedMapSizeContainer sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+// Op removes and returns the (key, value) element with the smallest
+//
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6026,8 +5588,90 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapSize",
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
+}
 
+// TensorStridedSliceUpdateAttr is an optional argument to TensorStridedSliceUpdate.
+type TensorStridedSliceUpdateAttr func(optionalAttr)
+
+// TensorStridedSliceUpdateBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func TensorStridedSliceUpdateBeginMask(value int64) TensorStridedSliceUpdateAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// TensorStridedSliceUpdateEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func TensorStridedSliceUpdateEndMask(value int64) TensorStridedSliceUpdateAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// TensorStridedSliceUpdateEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func TensorStridedSliceUpdateEllipsisMask(value int64) TensorStridedSliceUpdateAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// TensorStridedSliceUpdateNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func TensorStridedSliceUpdateNewAxisMask(value int64) TensorStridedSliceUpdateAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// TensorStridedSliceUpdateShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func TensorStridedSliceUpdateShrinkAxisMask(value int64) TensorStridedSliceUpdateAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `input`.
+//
+// The values of `value` are assigned to the positions in the tensor `input` that
+// are selected by the slice parameters. The slice parameters `begin` `end`
+// `strides` etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s shape
+// must be exactly the shape produced by the slice of `input`.
+func TensorStridedSliceUpdate(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...TensorStridedSliceUpdateAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorStridedSliceUpdate",
+		Input: []tf.Input{
+			input, begin, end, strides, value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -6105,111 +5749,40 @@
 	return values
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// MapUnstageCapacity sets the optional capacity attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapUnstageContainer sets the optional container attribute to value.
+// OrderedMapPeekContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapUnstageSharedName sets the optional shared_name attribute to value.
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
-
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
@@ -6218,8 +5791,9 @@
 // Op peeks at the values at the specified key.  If the
 //
 // underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6228,7 +5802,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
 			key, indices,
 		},
@@ -6241,128 +5815,53 @@
 	var idx int
 	var err error
 	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
+		scope.UpdateErr("OrderedMapPeek", err)
 		return
 	}
 	return values
 }
 
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
 
-// MapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
+// MapSizeCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
+func MapSizeCapacity(value int64) MapSizeAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
+// MapSizeContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
+func MapSizeContainer(value string) MapSizeAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
+// MapSizeSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a hashtable.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
-
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+func MapSizeSharedName(value string) MapSizeAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
 // Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6371,7 +5870,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageSize",
+		Type: "MapSize",
 
 		Attrs: attrs,
 	}
@@ -6451,103 +5950,6 @@
 	return values
 }
 
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
-
-// StageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageMemoryLimit sets the optional memory_limit attribute to value.
-//
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage values similar to a lightweight Enqueue.
-//
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
-//
-// Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-// Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Stage",
-		Input: []tf.Input{
-			tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Delete the tensor specified by its handle in the session.
-//
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Get the value of the tensor specified by its handle.
 //
 // Arguments:
@@ -6592,42 +5994,22 @@
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Deprecated. Use TensorArrayCloseV3
 //
-// Arguments:
-//	value: The tensor to be stored.
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "TensorArrayCloseV2",
 		Input: []tf.Input{
-			value,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArraySizeV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Deprecated. Use TensorArraySplitV3
@@ -6647,128 +6029,6 @@
 	return op.Output(0)
 }
 
-// EmptyAttr is an optional argument to Empty.
-type EmptyAttr func(optionalAttr)
-
-// EmptyInit sets the optional init attribute to value.
-//
-// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
-// If not specified, defaults to false
-func EmptyInit(value bool) EmptyAttr {
-	return func(m optionalAttr) {
-		m["init"] = value
-	}
-}
-
-// Creates a tensor with the given shape.
-//
-// This operation creates a tensor of `shape` and `dtype`.
-//
-// Arguments:
-//	shape: 1-D. Represents the shape of the output tensor.
-//
-//
-// Returns A `Tensor` of type `T`.
-func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Empty",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
-
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
-	}
-}
-
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Deprecated. Use TensorArrayScatterV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
-		Input: []tf.Input{
-			handle, indices, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
-
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// Deprecated. Use TensorArrayGatherV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
-		Input: []tf.Input{
-			handle, indices, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deprecated. Use TensorArrayReadV3
 //
 // DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
@@ -6788,6 +6048,114 @@
 	return op.Output(0)
 }
 
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
+//
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
+//
+// Arguments are the same as StridedSlice with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StridedSliceGrad",
+		Input: []tf.Input{
+			shape, begin, end, strides, dy,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArrayWriteV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV2",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // TensorArrayV2Attr is an optional argument to TensorArrayV2.
 type TensorArrayV2Attr func(optionalAttr)
 
@@ -6845,26 +6213,25 @@
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// Get the current size of the TensorArray.
 //
 // Arguments:
 //	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "TensorArraySizeV3",
 		Input: []tf.Input{
-			handle,
+			handle, flow_in,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Split the data from the input value into TensorArray elements.
@@ -6967,29 +6334,33 @@
 	return op.Output(0), op.Output(1)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
+// Selects the k nearest centers for each point.
 //
-// `indices` must be a vector, its length must match the first dim of `value`.
+// Rows of points are assumed to be input points. Rows of centers are assumed to be
+// the list of candidate centers. For each point, the k centers that have least L2
+// distance to it are computed.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
+//	k: Number of nearest centers to return for each point. If k is larger than m, then
+// only m centers are returned.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+// closest to the corresponding point, ordered by increasing distance. Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+// corresponding center in nearest_center_indices.
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "NearestNeighbors",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			points, centers, k,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
@@ -7038,35 +6409,29 @@
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing multiple gradients of values in the given handle.
-//
-// Similar to TensorArrayGradV3. However it creates an accumulator with an
-// expanded shape compared to the input TensorArray whose gradient is being
-// computed. This enables multiple gradients for the same TensorArray to be
-// calculated using the same accumulator.
+// Read an element from the TensorArray into output `value`.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
+//	handle: The handle to a TensorArray.
+//
 //	flow_in: A float scalar that enforces proper chaining of operations.
-//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
-// have shape which is this shape_to_prepend value concatenated with shape of the
-// elements in the TensorArray corresponding to the input handle.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradWithShape",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			handle, flow_in, shape_to_prepend,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
 // Creates a TensorArray for storing the gradients of values in the given handle.
@@ -7129,199 +6494,187 @@
 	return op.Output(0), op.Output(1)
 }
 
-// Pop the element at the top of the stack.
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+//
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+//
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+//
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
+//
+// Write data via Write and read via Read or Pack.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns The handle to the TensorArray. A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "TensorArrayV3",
+		Input: []tf.Input{
+			size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate. A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability. A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes the number of elements in the given queue.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
 			handle,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
-//
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackPushV2",
-		Input: []tf.Input{
-			handle, elem,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
-//
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
-}
-
-// A stack that produces elements in first-in last-out order.
-//
-// Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
-//
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackV2",
-		Input: []tf.Input{
-			max_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
-
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Applies upper_bound(sorted_search_values, values) along each row.
-//
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
-//
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
-//
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
-//
-//   result = UpperBound(sorted_sequence, values)
-//
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
-//
-// Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
-//
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UpperBound",
-		Input: []tf.Input{
-			sorted_inputs, values,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks whether a quantile stream has been initialized.
-//
-// An Op that checks if quantile stream resource is initialized.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource; The reference to quantile stream resource handle.
-//
-// Returns bool; True if the resource is initialized, False otherwise.
-func IsBoostedTreesQuantileStreamResourceInitialized(scope *Scope, quantile_stream_resource_handle tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsBoostedTreesQuantileStreamResourceInitialized",
-		Input: []tf.Input{
-			quantile_stream_resource_handle,
-		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -7348,6 +6701,76 @@
 	return op.Output(0)
 }
 
+// Forwards the value of an available tensor from `inputs` to `output`.
+//
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor. The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Merge",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+//
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+	return func(m optionalAttr) {
+		m["cancel_pending_enqueues"] = value
+	}
+}
+
+// Closes the given queue.
+//
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
 type QueueDequeueUpToV2Attr func(optionalAttr)
 
@@ -7417,6 +6840,31 @@
 	return components
 }
 
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
 type QueueDequeueManyV2Attr func(optionalAttr)
 
@@ -7747,115 +7195,16 @@
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
-
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the shape of a tensor.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "Snapshot",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
-
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// FIFOQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements in first-in first-out order.
-//
-// Arguments:
-//	component_types: The type of each component in a value.
-//
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -7939,130 +7288,6 @@
 	return op.Output(0)
 }
 
-// Bitcasts a tensor from one type to another without copying data.
-//
-// Given a tensor `input`, this operation returns a tensor that has the same buffer
-// data as `input` with datatype `type`.
-//
-// If the input datatype `T` is larger than the output datatype `type` then the
-// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
-//
-// If `T` is smaller than `type`, the operator requires that the rightmost
-// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-// [..., sizeof(`type`)/sizeof(`T`)] to [...].
-//
-// tf.bitcast() and tf.cast() work differently when real dtype is casted as a complex dtype
-// (e.g. tf.complex64 or tf.complex128) as tf.cast() make imaginary part 0 while tf.bitcast()
-// gives module error.
-// For example,
-//
-// Example 1:
-// ```python
-// >>> a = [1., 2., 3.]
-// >>> equality_bitcast = tf.bitcast(a,tf.complex128)
-// tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast]
-// >>> equality_cast = tf.cast(a,tf.complex128)
-// >>> print(equality_cast)
-// tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128)
-// ```
-// Example 2:
-// ```python
-// >>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8)
-// <tf.Tensor: ... shape=(4,), dtype=uint8, numpy=array([255, 255, 255, 255], dtype=uint8)>
-// ```
-// Example 3:
-// ```python
-// >>> x = [1., 2., 3.]
-// >>> y = [0., 2., 3.]
-// >>> equality= tf.equal(x,y)
-// >>> equality_cast = tf.cast(equality,tf.float32)
-// >>> equality_bitcast = tf.bitcast(equality_cast,tf.uint8)
-// >>> print(equality)
-// tf.Tensor([False True True], shape=(3,), dtype=bool)
-// >>> print(equality_cast)
-// tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32)
-// >>> print(equality_bitcast)
-// tf.Tensor(
-// [[ 0 0 0 0]
-//  [ 0 0 128 63]
-//  [ 0 0 128 63]], shape=(3, 4), dtype=uint8)
-// ```
-//
-// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-// endian orderings will give different results.
-func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"type": type_}
-	opspec := tf.OpSpec{
-		Type: "Bitcast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
 type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
 
@@ -8170,55 +7395,6 @@
 	return components
 }
 
-// Initializes the multi device iterator with the given dataset.
-//
-// Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
-//
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
-		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a MultiDeviceIterator resource.
-//
-// Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ModelDatasetAttr is an optional argument to ModelDataset.
 type ModelDatasetAttr func(optionalAttr)
 
@@ -8257,23 +7433,6 @@
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator as an Optional variant.
-func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IteratorGetNextAsOptional",
-		Input: []tf.Input{
-			iterator,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the value stored in an Optional variant or raises an error if none exists.
 func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
@@ -8300,6 +7459,57 @@
 	return components
 }
 
+// Creates an Optional variant with no value.
+func OptionalNone(scope *Scope) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalNone",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OptimizeDatasetAttr is an optional argument to OptimizeDataset.
+type OptimizeDatasetAttr func(optionalAttr)
+
+// OptimizeDatasetOptimizationConfigs sets the optional optimization_configs attribute to value.
+// If not specified, defaults to <>
+func OptimizeDatasetOptimizationConfigs(value []string) OptimizeDatasetAttr {
+	return func(m optionalAttr) {
+		m["optimization_configs"] = value
+	}
+}
+
+// Creates a dataset by applying optimizations to `input_dataset`.
+//
+// The optimizations to apply are identified by the `optimizations` input.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
+//
+//
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...OptimizeDatasetAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OptimizeDataset",
+		Input: []tf.Input{
+			input_dataset, optimizations,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a serialized GraphDef representing `input_dataset`.
 //
 // Returns a graph representation for `input_dataset`.
@@ -8322,71 +7532,6 @@
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_max"] = value
-	}
-}
-
-// Use QuantizeAndDequantizeV2 instead.
-//
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Converts the given variant tensor to an iterator and stores it in the given resource.
 //
 // Arguments:
@@ -8408,19 +7553,18 @@
 	return scope.AddOperation(opspec)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// Converts the given `resource_handle` representing an iterator to a string.
 //
 // Arguments:
 //	resource_handle: A handle to an iterator resource.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
 			resource_handle,
 		},
@@ -8429,68 +7573,19 @@
 	return op.Output(0)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
-
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+// Gets the next output from the given iterator.
 //
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
-	}
-}
-
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
-	}
-}
-
-// Converts the given string representing a handle to an iterator to a resource.
-//
-// Arguments:
-//	string_handle: A string representation of the given handle.
-//
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
-		Input: []tf.Input{
-			string_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets the next output from the given iterator .
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// This operation is a synchronous version of IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
 			iterator,
 		},
@@ -8503,7 +7598,7 @@
 	var idx int
 	var err error
 	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
+		scope.UpdateErr("IteratorGetNextSync", err)
 		return
 	}
 	return components
@@ -8530,6 +7625,26 @@
 
 // A container for an iterator resource.
 //
+// Arguments:
+//	handle: A handle to the iterator to delete.
+//	deleter: A variant deleter.
+//
+// Returns the created operation.
+func DeleteIterator(scope *Scope, handle tf.Output, deleter tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeleteIterator",
+		Input: []tf.Input{
+			handle, deleter,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// A container for an iterator resource.
+//
 // Returns A handle to the iterator that can be passed to a "MakeIterator" or
 // "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
 // resource sharing by name, and does not keep a reference to the resource
@@ -8568,6 +7683,39 @@
 	return op.Output(0)
 }
 
+// A container for an iterator resource.
+//
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns true if and only if the given Optional variant has a value.
+func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalHasValue",
+		Input: []tf.Input{
+			optional,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset that emits the records from one or more TFRecord files.
 //
 // Arguments:
@@ -8616,35 +7764,6 @@
 	return op.Output(0)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
-//
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
-//
-// Arguments:
-//
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
-//
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "CacheDataset",
-		Input: []tf.Input{
-			input_dataset, filename,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
 // pseudorandomly.
@@ -8678,274 +7797,6 @@
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// For example:
-//
-// ```
-// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
-// tf.segment_mean(c, tf.constant([0, 0, 1]))
-// # ==> [[2.5, 2.5, 2.5, 2.5],
-// #      [5, 6, 7, 8]]
-// ```
-//
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMean",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
-type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
-
-// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Converts CudnnRNN params from canonical form to usable form.
-//
-// Writes a set of weights into the opaque params buffer so they can be used in
-// upcoming training or inferences.
-//
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNCanonicalToParams",
-		Input: []tf.Input{
-			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
-//
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
-		Input: []tf.Input{
-			features, max_value, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
-
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the shape of the variable pointed to by `resource`.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "VariableShape",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
 type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
 
@@ -9006,344 +7857,48 @@
 	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
-//
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
-//
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
-//
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
-
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
 // If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			set1, set2,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
-//
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Elu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns which elements of x are Inf.
-//
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsInf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolGradGradWithArgmaxAttr is an optional argument to MaxPoolGradGradWithArgmax.
-type MaxPoolGradGradWithArgmaxAttr func(optionalAttr)
-
-// MaxPoolGradGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
-//
-// value: Whether to include batch dimension in flattened index of `argmax`.
-// If not specified, defaults to false
-func MaxPoolGradGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradGradWithArgmaxAttr {
-	return func(m optionalAttr) {
-		m["include_batch_in_index"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradWithArgmaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -9351,129 +7906,23 @@
 	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
-//
-// If `k` varies dynamically, use `TopKV2` below.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TopK",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that contains `rate` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	rate: A scalar representing the sample rate of elements from the `input_dataset`
-// that should be taken.
-//	seed: A scalar representing seed of random number generator.
-//	seed2: A scalar representing seed2 of random number generator.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
 //
-func SamplingDataset(scope *Scope, input_dataset tf.Output, rate tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SamplingDataset",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			input_dataset, rate, seed, seed2,
+			start, stop, step,
 		},
 		Attrs: attrs,
 	}
@@ -9481,231 +7930,82 @@
 	return op.Output(0)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// PaddedBatchDatasetV2Attr is an optional argument to PaddedBatchDatasetV2.
+type PaddedBatchDatasetV2Attr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters.
-type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve Adagrad embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.
-func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingAdagradParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// PaddedBatchDatasetV2ParallelCopy sets the optional parallel_copy attribute to value.
 // If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+func PaddedBatchDatasetV2ParallelCopy(value bool) PaddedBatchDatasetV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["parallel_copy"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape, optional ...PaddedBatchDatasetV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// A container for an iterator resource.
-//
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
-//
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
-//
-// This op reports an `InvalidArgument` error if any value is not finite.
-//
-// Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
-		Input: []tf.Input{
-			tag, values,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ShardDatasetAttr is an optional argument to ShardDataset.
+type ShardDatasetAttr func(optionalAttr)
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
-
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// ShardDatasetRequireNonEmpty sets the optional require_non_empty attribute to value.
+// If not specified, defaults to false
+func ShardDatasetRequireNonEmpty(value bool) ShardDatasetAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["require_non_empty"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+//	num_shards: An integer representing the number of shards operating in parallel.
+//	index: An integer representing the current worker index.
+//
+//
+func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShardDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "ShardDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, num_shards, index,
 		},
 		Attrs: attrs,
 	}
@@ -9713,239 +8013,183 @@
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Pads a tensor with mirrored values.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
+// This operation pads a `input` with mirrored values according to the `paddings`
+// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many values to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many values to add after the contents of `input`
+// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
+// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
+// (if false, respectively).
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6]].
+// # 'paddings' is [[1, 1], [2, 2]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
+//                       [2, 1, 1, 2, 3, 3, 2]
+//                       [5, 4, 4, 5, 6, 6, 5]
+//                       [5, 4, 4, 5, 6, 6, 5]]
 // ```
 //
 // Arguments:
+//	input: The input tensor to be padded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
+// do not include the borders, while in symmetric mode the padded regions
+// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
+// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
+// it is `[1, 2, 3, 3, 2]` in symmetric mode.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns The padded tensor.
+func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "MirrorPad",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
+// BatchDatasetV2Attr is an optional argument to BatchDatasetV2.
+type BatchDatasetV2Attr func(optionalAttr)
 
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// BatchDatasetV2ParallelCopy sets the optional parallel_copy attribute to value.
+// If not specified, defaults to false
+func BatchDatasetV2ParallelCopy(value bool) BatchDatasetV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["parallel_copy"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+//
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...BatchDatasetV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "BatchDatasetV2",
+		Input: []tf.Input{
+			input_dataset, batch_size, drop_remainder,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A dataset that creates window datasets from the input dataset.
+//
+// Arguments:
+//
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
+//
+//
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "WindowDataset",
+		Input: []tf.Input{
+			input_dataset, size, shift, stride, drop_remainder,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Connects outputs of an N-way replicated computation to N outputs.
+func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
+	opspec := tf.OpSpec{
+		Type: "TPUReplicatedOutput",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BoostedTreesCalculateBestFeatureSplitAttr is an optional argument to BoostedTreesCalculateBestFeatureSplit.
-type BoostedTreesCalculateBestFeatureSplitAttr func(optionalAttr)
-
-// BoostedTreesCalculateBestFeatureSplitSplitType sets the optional split_type attribute to value.
-//
-// value: A string indicating if this Op should perform inequality split or equality split.
-// If not specified, defaults to "inequality"
-func BoostedTreesCalculateBestFeatureSplitSplitType(value string) BoostedTreesCalculateBestFeatureSplitAttr {
-	return func(m optionalAttr) {
-		m["split_type"] = value
-	}
-}
-
-// Calculates gains for each feature and returns the best possible split information for the feature.
-//
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
-//
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
-//
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
-//
-// The output shapes are compatible in a way that the first dimension of all tensors are the same and equal to the number of possible split nodes for each feature.
-//
-// Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary: A Rank 4 tensor (#shape=[max_splits, feature_dims, bucket, stats_dims]) for accumulated stats summary (gradient/hessian) per node, per dimension, per buckets for each feature.
-// The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	logits_dimension: The dimension of logit, i.e., number of classes.
-//
-// Returns A Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.A Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.A Rank 1 tensors indicating the best feature dimension for each feature to split for certain nodes if the feature is multi-dimension. See above for details like shapes and sizes.A Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.A Rank 1 tensors indicating the which direction to go if data is missing. See above for details like shapes and sizes.
-func BoostedTreesCalculateBestFeatureSplit(scope *Scope, node_id_range tf.Output, stats_summary tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, logits_dimension int64, optional ...BoostedTreesCalculateBestFeatureSplitAttr) (node_ids tf.Output, gains tf.Output, feature_dimensions tf.Output, thresholds tf.Output, left_node_contribs tf.Output, right_node_contribs tf.Output, split_with_default_directions tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestFeatureSplit",
-		Input: []tf.Input{
-			node_id_range, stats_summary, l1, l2, tree_complexity, min_node_weight,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("TPUReplicatedOutput", err)
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
 // value: When set to True, it means when pooling, the values at the boundary
 // of adjacent pooling cells are used by both cells. For example:
@@ -9957,145 +8201,42 @@
 // If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
 // The result would be [41/3, 26/3] for fractional avg pooling.
 // If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
 		m["overlapping"] = value
 	}
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes gradient of the FractionalAvgPool function.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional average pooling on the input.
-//
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling_sequence.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -10103,32 +8244,43 @@
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// An op that deserializes bucket boundaries and the are-boundaries-ready flag into the current QuantileAccumulator.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
 		Input: []tf.Input{
-			x,
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Push an element onto the tensor_array.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
+		Type: "TensorArrayWriteV3",
 		Input: []tf.Input{
 			handle, index, value, flow_in,
 		},
@@ -10137,352 +8289,49 @@
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
-
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of average pooling function.
+// Initializes the multi device iterator with the given dataset.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// An Op to sum inputs across replicated TPU instances.
-//
-// Each instance supplies its own input.
-//
-// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
-// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
-// and `B, D, F, H` as group 1. Thus we get the outputs:
-// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
-//
-// Arguments:
-//	input: The local input to the sum.
-//	group_assignment: An int32 tensor with shape
-// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-// replica ids in the ith subgroup.
-//
-// Returns The sum of all the distributed inputs.
-func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) {
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CrossReplicaSum",
+		Type: "MultiDeviceIteratorInit",
 		Input: []tf.Input{
-			input, group_assignment,
+			dataset, multi_device_iterator, max_buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["out_type"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
-		Input: []tf.Input{
-			input, size, paddings, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug.
-type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Momentum embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Momentum optimization algorithm.
-//	momenta: Value of momenta used in the Momentum optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, momenta, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
 //
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
-		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateMod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns a random (key, value)
-//
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
-//
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
-//
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns Has the same output shape as "features". The float value that the lowest quantized value represents. The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10491,331 +8340,14 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			x,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
-//
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
-	}
-}
-
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
-
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
-//
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
-	}
-}
-
-// Conv2DDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
-//
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
-//
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv2D",
-		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
-type LeakyReluGradAttr func(optionalAttr)
-
-// LeakyReluGradAlpha sets the optional alpha attribute to value.
-// If not specified, defaults to 0.2
-func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// Computes rectified linear gradients for a LeakyRelu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
-//	features: The features passed as input to the corresponding LeakyRelu operation,
-// OR the outputs of that operation (both work equivalently).
-//
-// Returns `gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
-func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LeakyReluGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
-//
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
-		Input: []tf.Input{
-			value, bias,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // BiasAddGradAttr is an optional argument to BiasAddGrad.
@@ -10866,599 +8398,6 @@
 	return op.Output(0)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
-
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters.
-type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load SGD embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the stochastic gradient descent optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingStochasticGradientDescentParameters",
-		Input: []tf.Input{
-			parameters,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SpaceToBatch for 4-D tensors of type T.
-//
-// This is a legacy version of the more general SpaceToBatchND.
-//
-// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-// More specifically, this op outputs a copy of the input tensor where values from
-// the `height` and `width` dimensions are moved to the `batch` dimension. After
-// the zero-padding, both `height` and `width` of the input must be divisible by the
-// block size.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, depth]`.
-//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-//   the padding of the input with zeros across the spatial dimensions as follows:
-//
-//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-//
-//   The effective spatial dimensions of the zero-padded input tensor will be:
-//
-//       height_pad = pad_top + height + pad_bottom
-//       width_pad = pad_left + width + pad_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` in the height and
-//     width dimensions are rearranged into the batch dimension at each location.
-//   * The batch of the output tensor is `batch * block_size * block_size`.
-//   * Both height_pad and width_pad must be divisible by block_size.
-//
-// The shape of the output will be:
-//
-//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//      depth]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[8, 1, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-//
-func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	opspec := tf.OpSpec{
-		Type: "SpaceToBatch",
-		Input: []tf.Input{
-			input, paddings,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a tree ensemble model and returns a handle to it.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble resource to be created.
-//	stamp_token: Token to use as the initial value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the tree ensemble.
-//
-// Returns the created operation.
-func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesCreateEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
-//
-// See also `RefSwitch` and `Merge`.
-//
-// Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
-//
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Switch",
-		Input: []tf.Input{
-			data, pred,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
-//
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
-
-// AvgPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs average pooling on the input.
-//
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
-//
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// Computes natural logarithm of (1 + x) element-wise.
-//
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log1p",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
 type ParseSingleSequenceExampleAttr func(optionalAttr)
 
@@ -11623,6 +8562,2345 @@
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear: `max(features, 0)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features". The float value that the lowest quantized value represents. The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
+
+// SvdComputeUv sets the optional compute_uv attribute to value.
+//
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never be referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["compute_uv"] = value
+	}
+}
+
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the singular value decompositions of one or more matrices.
+//
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing the left singular vectors for each matrix.
+// # v is the tensor containing the right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`. Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`. Right singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Svd",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents. The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor. 1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMaximum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling; row pooling sequence, needed to calculate gradient; column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// BatchMatMulV2Attr is an optional argument to BatchMatMulV2.
+type BatchMatMulV2Attr func(optionalAttr)
+
+// BatchMatMulV2AdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulV2AdjX(value bool) BatchMatMulV2Attr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulV2AdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulV2AdjY(value bool) BatchMatMulV2Attr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// *NOTE*: `BatchMatMulV2` supports broadcasting in the batch dimensions. More
+// about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+//
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 2-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMulV2(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchMatMulV2",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `N` tensors along the first dimension.
+//
+// The input tensors are all required to have size 1 in the first dimension.
+//
+// For example:
+//
+// ```
+// # 'x' is [[1, 4]]
+// # 'y' is [[2, 5]]
+// # 'z' is [[3, 6]]
+// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// ```
+//
+// The difference between concat and parallel_concat is that concat requires all
+// of the inputs be computed before the operation will begin but doesn't require
+// that the input shapes be known during graph construction.  Parallel concat
+// will copy pieces of the input into the output as they become available; in
+// some situations this can provide a performance benefit.
+//
+// Arguments:
+//	values: Tensors to be concatenated. All must have size 1 in the first dimension
+// and same shape.
+//	shape: the final shape of the result; should be equal to the shapes of any input
+// but with the number of input values in the first dimension.
+//
+// Returns The concatenated tensor.
+func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "ParallelConcat",
+		Input: []tf.Input{
+			tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of morphological 2-D dilation with respect to the input.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// cepstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["upper_frequency_limit"] = value
+	}
+}
+
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// cepstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["lower_frequency_limit"] = value
+	}
+}
+
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+//
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+//
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
+//
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
+//
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Mfcc",
+		Input: []tf.Input{
+			spectrogram, sample_rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradGradWithArgmaxAttr is an optional argument to MaxPoolGradGradWithArgmax.
+type MaxPoolGradGradWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolGradGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
+//
+// value: Whether to include batch dimension in flattened index of `argmax`.
+// If not specified, defaults to false
+func MaxPoolGradGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradGradWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["include_batch_in_index"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradWithArgmaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGammaGrad",
+		Input: []tf.Input{
+			alpha, sample,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
+
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGradV2",
+		Input: []tf.Input{
+			orig_input, orig_output, grad, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that executes a SQL query and emits rows of the result set.
+//
+// Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
+//
+//
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalSqlDataset",
+		Input: []tf.Input{
+			driver_name, data_source_name, query,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
+
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNGradBias sets the optional bias attribute to value.
+//
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
+//
+// Arguments:
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRNGrad",
+		Input: []tf.Input{
+			input_grads, input_image, output_image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
+type LeakyReluGradAttr func(optionalAttr)
+
+// LeakyReluGradAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// Computes rectified linear gradients for a LeakyRelu operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
+//	features: The features passed as input to the corresponding LeakyRelu operation,
+// OR the outputs of that operation (both work equivalently).
+//
+// Returns `gradients * (features > 0) + alpha * gradients * (features <= 0)`.
+func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LeakyReluGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRN",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
+//
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
+//
+// The folded size of each dimension D of the output is:
+//
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
+// ```
+//
+// Arguments:
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
+//
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"mode": mode}
+	opspec := tf.OpSpec{
+		Type: "MirrorPadGrad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
+
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D max pooling on the input.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3D",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves the input tensors to disk.
+//
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
+//
+// See also `SaveSlices`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Save",
+		Input: []tf.Input{
+			filename, tensor_names, tf.OutputList(data),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUnbatchDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
+
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D average pooling on the input.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3D",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters.
+type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingAdadeltaParameters",
+		Input: []tf.Input{
+			parameters, accumulators, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xdivy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Bitcasts a tensor from one type to another without copying data.
+//
+// Given a tensor `input`, this operation returns a tensor that has the same buffer
+// data as `input` with datatype `type`.
+//
+// If the input datatype `T` is larger than the output datatype `type` then the
+// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+//
+// If `T` is smaller than `type`, the operator requires that the rightmost
+// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+// [..., sizeof(`type`)/sizeof(`T`)] to [...].
+//
+// tf.bitcast() and tf.cast() behave differently when a real dtype is cast to a complex dtype
+// (e.g. tf.complex64 or tf.complex128): tf.cast() sets the imaginary part to 0, while tf.bitcast()
+// raises an error (see Example 1).
+// For example,
+//
+// Example 1:
+// ```python
+// >>> a = [1., 2., 3.]
+// >>> equality_bitcast = tf.bitcast(a,tf.complex128)
+// tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast]
+// >>> equality_cast = tf.cast(a,tf.complex128)
+// >>> print(equality_cast)
+// tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128)
+// ```
+// Example 2:
+// ```python
+// >>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8)
+// <tf.Tensor: ... shape=(4,), dtype=uint8, numpy=array([255, 255, 255, 255], dtype=uint8)>
+// ```
+// Example 3:
+// ```python
+// >>> x = [1., 2., 3.]
+// >>> y = [0., 2., 3.]
+// >>> equality= tf.equal(x,y)
+// >>> equality_cast = tf.cast(equality,tf.float32)
+// >>> equality_bitcast = tf.bitcast(equality_cast,tf.uint8)
+// >>> print(equality)
+// tf.Tensor([False True True], shape=(3,), dtype=bool)
+// >>> print(equality_cast)
+// tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32)
+// >>> print(equality_bitcast)
+// tf.Tensor(
+// [[ 0 0 0 0]
+//  [ 0 0 128 63]
+//  [ 0 0 128 63]], shape=(3, 4), dtype=uint8)
+// ```
+//
+// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+// endian orderings will give different results.
+func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"type": type_}
+	opspec := tf.OpSpec{
+		Type: "Bitcast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
+
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradient of a 3-D convolution with respect to its input.
+//
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+//
+// Arguments:
+//	input: Tensor of shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Tensor of shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must be the same for `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5 holding the sliding-window stride for each
+// dimension of `input`. Requires `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "Conv3DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrMap,
+	})
+	output = node.Output(0)
+	return output
+}
+
+// ApproximateEqualAttr is a functional option accepted by ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance returns an option that sets the tolerance attribute to value.
+// When the option is omitted, the attribute defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return ApproximateEqualAttr(func(attrs optionalAttr) {
+		attrs["tolerance"] = value
+	})
+}
+
+// Returns, element-wise, the truth value of abs(x-y) < tolerance.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "ApproximateEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrMap,
+	})
+	z = node.Output(0)
+	return z
+}
+
+// DepthwiseConv2dNativeBackpropInputAttr is a functional option accepted by DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropInputDataFormat returns an option that sets the data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order:
+//     [batch, height, width, channels].
+// Alternatively, with the format "NCHW" the data storage order is:
+//     [batch, channels, height, width].
+// When the option is omitted, the attribute defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+	return DepthwiseConv2dNativeBackpropInputAttr(func(attrs optionalAttr) {
+		attrs["data_format"] = value
+	})
+}
+
+// DepthwiseConv2dNativeBackpropInputDilations returns an option that sets the dilations attribute to value.
+//
+// value: 1-D tensor of length 4 holding the dilation factor for each dimension
+// of `input`. If set to k > 1, k-1 cells are skipped between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// When the option is omitted, the attribute defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return DepthwiseConv2dNativeBackpropInputAttr(func(attrs optionalAttr) {
+		attrs["dilations"] = value
+	})
+}
+
+// Computes the gradient of a depthwise convolution with respect to its input.
+//
+// Arguments:
+//	input_sizes: An integer vector giving the shape of `input`, laid out
+// according to `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with a shape that depends on `data_format`.
+// For example, if `data_format` is 'NHWC' then the
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Holds the gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns a 4-D tensor shaped according to `data_format`.  For example, if
+// `data_format` is 'NHWC', the output shape is `[batch, in_height,
+// in_width, in_channels]`.  It is the gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrMap,
+	})
+	output = node.Output(0)
+	return output
+}
+
+// Computes the sigmoid of `x` element-wise.
+//
+// That is, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "Sigmoid",
+		Input: []tf.Input{
+			x,
+		},
+	})
+	y = node.Output(0)
+	return y
+}
+
+// MapUnstageAttr is a functional option accepted by MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity returns an option that sets the capacity attribute to value.
+// When the option is omitted, the attribute defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return MapUnstageAttr(func(attrs optionalAttr) {
+		attrs["capacity"] = value
+	})
+}
+
+// MapUnstageMemoryLimit returns an option that sets the memory_limit attribute to value.
+// When the option is omitted, the attribute defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return MapUnstageAttr(func(attrs optionalAttr) {
+		attrs["memory_limit"] = value
+	})
+}
+
+// MapUnstageContainer returns an option that sets the container attribute to value.
+// When the option is omitted, the attribute defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return MapUnstageAttr(func(attrs optionalAttr) {
+		attrs["container"] = value
+	})
+}
+
+// MapUnstageSharedName returns an option that sets the shared_name attribute to value.
+// When the option is omitted, the attribute defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return MapUnstageAttr(func(attrs optionalAttr) {
+		attrs["shared_name"] = value
+	})
+}
+
+// Op removes and returns the values associated with the key from the
+// underlying container.
+//
+// If the underlying container does not contain this key, the op blocks until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"dtypes": dtypes}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "MapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrMap,
+	})
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// "values" is the first output list read from the op, so it starts at output index 0.
+	var listErr error
+	values, _, listErr = makeOutputList(node, 0, "values")
+	if listErr != nil {
+		scope.UpdateErr("MapUnstage", listErr)
+		return
+	}
+	return values
+}
+
+// DepthwiseConv2dNativeAttr is a functional option accepted by DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeDataFormat returns an option that sets the data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order:
+//     [batch, height, width, channels].
+// Alternatively, with the format "NCHW" the data storage order is:
+//     [batch, channels, height, width].
+// When the option is omitted, the attribute defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+	return DepthwiseConv2dNativeAttr(func(attrs optionalAttr) {
+		attrs["data_format"] = value
+	})
+}
+
+// DepthwiseConv2dNativeDilations returns an option that sets the dilations attribute to value.
+//
+// value: 1-D tensor of length 4 holding the dilation factor for each dimension
+// of `input`. If set to k > 1, k-1 cells are skipped between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// When the option is omitted, the attribute defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
+	return DepthwiseConv2dNativeAttr(func(attrs optionalAttr) {
+		attrs["dilations"] = value
+	})
+}
+
+// Computes a 2-D depthwise convolution from 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. The output therefore has `in_channels * channel_multiplier` channels.
+//
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of equal
+// horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: 4-D tensor of shape `[batch, in_height, in_width, in_channels]` (see above).
+//	filter: 4-D tensor of shape `[filter_height, filter_width, in_channels, channel_multiplier]` (see above).
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "DepthwiseConv2dNative",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrMap,
+	})
+	output = node.Output(0)
+	return output
+}
+
+// Writes sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to assign into `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type: "ResourceScatterUpdate",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	})
+	return o
+}
+
+// Conv3DBackpropInputV2Attr is a functional option accepted by Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
+
+// Conv3DBackpropInputV2DataFormat returns an option that sets the data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, with the format "NCDHW" the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// When the option is omitted, the attribute defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+	return Conv3DBackpropInputV2Attr(func(attrs optionalAttr) {
+		attrs["data_format"] = value
+	})
+}
+
+// Conv3DBackpropInputV2Dilations returns an option that sets the dilations attribute to value.
+//
+// value: 1-D tensor of length 5 holding the dilation factor for each dimension
+// of `input`. If set to k > 1, k-1 cells are skipped between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// When the option is omitted, the attribute defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+	return Conv3DBackpropInputV2Attr(func(attrs optionalAttr) {
+		attrs["dilations"] = value
+	})
+}
+
+// Computes the gradient of a 3-D convolution with respect to its input.
+//
+// Arguments:
+//	input_sizes: An integer vector giving the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Tensor of shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must be the same for `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5 holding the sliding-window stride for each
+// dimension of `input`. Requires `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "Conv3DBackpropInputV2",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrMap,
+	})
+	output = node.Output(0)
+	return output
+}
+
+// DataFormatVecPermuteAttr is a functional option accepted by DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
+
+// DataFormatVecPermuteSrcFormat returns an option that sets the src_format attribute to value.
+//
+// value: the source data format.
+// When the option is omitted, the attribute defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+	return DataFormatVecPermuteAttr(func(attrs optionalAttr) {
+		attrs["src_format"] = value
+	})
+}
+
+// DataFormatVecPermuteDstFormat returns an option that sets the dst_format attribute to value.
+//
+// value: the destination data format.
+// When the option is omitted, the attribute defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return DataFormatVecPermuteAttr(func(attrs optionalAttr) {
+		attrs["dst_format"] = value
+	})
+}
+
+// Permutes a vector/tensor from the source data format to the destination one.
+//
+// Given the one in the source data format, returns the permuted vector/tensor.
+//
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "DataFormatVecPermute",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrMap,
+	})
+	y = node.Output(0)
+	return y
+}
+
+// NthElementAttr is a functional option accepted by NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse returns an option that sets the reverse attribute to value.
+//
+// value: When set to True, find the nth-largest value in the vector; when
+// False, find the nth-smallest.
+// When the option is omitted, the attribute defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return NthElementAttr(func(attrs optionalAttr) {
+		attrs["reverse"] = value
+	})
+}
+
+// Finds values of the `n`-th order statistic for the last dimension.
+//
+// If the input is a vector (rank-1), finds the entry that is the nth-smallest
+// value in the vector and outputs its value as a scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entry that is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[-1])`
+//
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "NthElement",
+		Input: []tf.Input{
+			input, n,
+		},
+		Attrs: attrMap,
+	})
+	values = node.Output(0)
+	return values
+}
+
+// Computes the softmax cross entropy cost and the gradients to backpropagate.
+//
+// The inputs are logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
+//
+// Returns the per-example loss (batch_size vector) and the backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	})
+	loss, backprop = node.Output(0), node.Output(1)
+	return loss, backprop
+}
+
+// L2 Loss.
+//
+// Computes half of the squared L2 norm of a tensor (i.e. without the `sqrt`):
+//
+//     output = sum(t ** 2) / 2
+//
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
+//
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "L2Loss",
+		Input: []tf.Input{
+			t,
+		},
+	})
+	output = node.Output(0)
+	return output
+}
+
+// Decodes a `variant` Tensor into a `RaggedTensor`.
+//
+// Decodes the given `variant` Tensor and returns a `RaggedTensor`. The input
+// may be a scalar, meaning it encodes a single `RaggedTensor` with ragged_rank
+// `output_ragged_rank`. It may also have an arbitrary rank, in which case each
+// element is decoded into a `RaggedTensor` with ragged_rank `input_ragged_rank`
+// and these are then stacked according to the input shape to output a single
+// `RaggedTensor` with ragged_rank `output_ragged_rank`. Each `variant` element in
+// the input Tensor is decoded by retrieving from the element a 1-D `variant`
+// Tensor with `input_ragged_rank + 1` Tensors, corresponding to the splits and
+// values of the decoded `RaggedTensor`. If `input_ragged_rank` is -1, then it is
+// inferred as `output_ragged_rank` - `rank(encoded_ragged)`. See
+// `RaggedTensorToVariant` for the corresponding encoding logic.
+//
+//
+// Arguments:
+//	encoded_ragged: A `variant` Tensor containing encoded `RaggedTensor`s.
+//	input_ragged_rank: The ragged rank of each encoded `RaggedTensor` component in the input. If set to
+// -1, this is inferred as `output_ragged_rank` - `rank(encoded_ragged)`
+//	output_ragged_rank: The expected ragged rank of the output `RaggedTensor`. The following must hold:
+// `output_ragged_rank = rank(encoded_ragged) + input_ragged_rank`.
+//
+//
+//
+// Returns A list of one or more Tensors representing the splits of the output
+// `RaggedTensor`, and a Tensor representing the values of the output `RaggedTensor`.
+func RaggedTensorFromVariant(scope *Scope, encoded_ragged tf.Output, input_ragged_rank int64, output_ragged_rank int64, Tvalues tf.DataType, Tsplits tf.DataType) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"input_ragged_rank": input_ragged_rank, "output_ragged_rank": output_ragged_rank, "Tvalues": Tvalues, "Tsplits": Tsplits}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "RaggedTensorFromVariant",
+		Input: []tf.Input{
+			encoded_ragged,
+		},
+		Attrs: attrMap,
+	})
+	if err := scope.Err(); err != nil {
+		return
+	}
+	var next int
+	var listErr error
+	output_nested_splits, next, listErr = makeOutputList(node, 0, "output_nested_splits")
+	if listErr != nil {
+		scope.UpdateErr("RaggedTensorFromVariant", listErr)
+		return
+	}
+	output_dense_values = node.Output(next)
+	return
+}
+
+// Creates a dataset that batches and pads `batch_size` elements from the input.
+//
+// Arguments:
+//	input_dataset: The input dataset whose elements are batched and padded.
+//	batch_size: A scalar giving the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors giving the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "PaddedBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+		},
+		Attrs: map[string]interface{}{"output_shapes": output_shapes},
+	}
+	node := scope.AddOperation(spec)
+	handle = node.Output(0)
+	return handle
+}
+
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes the upstream gradient w.r.t. the
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
+//
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A. 1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	})
+	a_val_grad, b_val_grad = node.Output(0), node.Output(1)
+	return a_val_grad, b_val_grad
+}
+
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will soon be removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
+	})
+	output = node.Output(0)
+	return output
+}
+
+// MaxPoolWithArgmaxAttr is a functional option accepted by MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax returns an option that sets the Targmax attribute to value.
+// When the option is omitted, the attribute defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return MaxPoolWithArgmaxAttr(func(attrs optionalAttr) {
+		attrs["Targmax"] = value
+	})
+}
+
+// MaxPoolWithArgmaxIncludeBatchInIndex returns an option that sets the include_batch_in_index attribute to value.
+//
+// value: Whether to include the batch dimension in the flattened index of `argmax`.
+// When the option is omitted, the attribute defaults to false
+func MaxPoolWithArgmaxIncludeBatchInIndex(value bool) MaxPoolWithArgmaxAttr {
+	return MaxPoolWithArgmaxAttr(func(attrs optionalAttr) {
+		attrs["include_batch_in_index"] = value
+	})
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
+//
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes the flattened index:
+// `(y * width + x) * channels + c` if `include_batch_in_index` is False;
+// `((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor. 4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "MaxPoolWithArgmax",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrMap,
+	})
+	output, argmax = node.Output(0), node.Output(1)
+	return output, argmax
+}
+
+// FusedBatchNormGradAttr is a functional option accepted by FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon returns an option that sets the epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// When the option is omitted, the attribute defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+	return FusedBatchNormGradAttr(func(attrs optionalAttr) {
+		attrs["epsilon"] = value
+	})
+}
+
+// FusedBatchNormGradDataFormat returns an option that sets the data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// When the option is omitted, the attribute defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return FusedBatchNormGradAttr(func(attrs optionalAttr) {
+		attrs["data_format"] = value
+	})
+}
+
+// FusedBatchNormGradIsTraining returns an option that sets the is_training attribute to value.
+//
+// value: A bool value indicating whether the operation is for training (default)
+// or inference.
+// When the option is omitted, the attribute defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+	return FusedBatchNormGradAttr(func(attrs optionalAttr) {
+		attrs["is_training"] = value
+	})
+}
+
+// Gradient for batch normalization.
+//
+// Note that the size of 4D Tensors is defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x. A 1D Tensor for the gradient with respect to scale. A 1D Tensor for the gradient with respect to offset.
+// Unused placeholder to match the mean input in FusedBatchNorm. Unused placeholder to match the variance input in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{}
+	for _, setAttr := range optional {
+		setAttr(attrMap)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "FusedBatchNormGrad",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
+		Attrs: attrMap,
+	})
+	x_backprop, scale_backprop, offset_backprop, reserve_space_3, reserve_space_4 = node.Output(0), node.Output(1), node.Output(2), node.Output(3), node.Output(4)
+	return
+}
+
+// Computes the gradient for the rsqrt of `x` with respect to its input.
+//
+// That is, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)` and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "RsqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	})
+	z = node.Output(0)
+	return z
+}
+
+// Gradients for batch normalization.
+//
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. See `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulting tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input. 1D backprop tensor for mean. 1D backprop tensor for variance. 1D backprop tensor for beta. 1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalizationGrad",
+		Input: []tf.Input{
+			t, m, v, gamma, backprop,
+		},
+		Attrs: map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization},
+	}
+	node := scope.AddOperation(spec)
+	dx, dm, dv, db, dg = node.Output(0), node.Output(1), node.Output(2), node.Output(3), node.Output(4)
+	return
+}
+
+// Receives a tensor value that was broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	attrMap := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrMap,
+	})
+	data = node.Output(0)
+	return data
+}
+
+// IteratorFromStringHandleAttr is a functional option accepted by IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
+
+// IteratorFromStringHandleOutputTypes returns an option that sets the output_types attribute to value.
+//
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// When the option is omitted, the attribute defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
+	return IteratorFromStringHandleAttr(func(attrs optionalAttr) {
+		attrs["output_types"] = value
+	})
+}
+
+// IteratorFromStringHandleOutputShapes returns an option that sets the output_shapes attribute to value.
+//
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// When the option is omitted, the attribute defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+	return IteratorFromStringHandleAttr(func(attrs optionalAttr) {
+		attrs["output_shapes"] = value
+	})
+}
+
+// Converts the given string representing a handle to an iterator to a resource.
+//
+// Arguments:
+//	string_handle: A string representation of the given handle.
+//
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorFromStringHandle",
+		Input: []tf.Input{
+			string_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// SetSize computes the number of unique elements along the last dimension of input `set`.
+//
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set; duplicates are
+// allowed but ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
+//
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; size stays zero-valued
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs) // each option stores its attribute value in attrs
+	}
+	opspec := tf.OpSpec{
+		Type: "SetSize",
+		Input: []tf.Input{
+			set_indices, set_values, set_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// AvgPoolGrad computes gradients of the average pooling function.
+//
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns output: 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; output stays zero-valued
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs) // options are applied after the required attributes are set
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeJSONExample converts JSON-encoded Example records to binary protocol buffer strings.
+//
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns binary_examples: each string is a binary Example protocol buffer
+// corresponding to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; binary_examples stays zero-valued
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseTensor transforms a serialized tensorflow.TensorProto proto into a Tensor.
+//
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns output: a Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; output stays zero-valued
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
 type ParseSequenceExampleAttr func(optionalAttr)
 
@@ -11831,121 +11109,6 @@
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LinSpace",
-		Input: []tf.Input{
-			start, stop, num,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Transforms a tf.Example proto (as a string) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
-		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
 // DecodeCompressedAttr is an optional argument to DecodeCompressed.
 type DecodeCompressedAttr func(optionalAttr)
 
@@ -11993,45 +11156,128 @@
 	return op.Output(0)
 }
 
-// OutfeedDequeueAttr is an optional argument to OutfeedDequeue.
-type OutfeedDequeueAttr func(optionalAttr)
-
-// OutfeedDequeueDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. This should be -1 when the Op
-// is running on a TPU device, and >= 0 when the Op is running on the CPU
-// device.
-// If not specified, defaults to -1
-func OutfeedDequeueDeviceOrdinal(value int64) OutfeedDequeueAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// Retrieves a single tensor from the computation outfeed.
-//
-// This operation will block indefinitely until data is available.
+// BoostedTreesCreateEnsemble creates the tree ensemble model resource identified by `tree_ensemble_handle`.
 //
 // Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor.
+//	tree_ensemble_handle: Handle to the tree ensemble resource to be created.
+//	stamp_token: Token to use as the initial value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the tree ensemble.
 //
-// Returns A tensor that will be read from the device outfeed.
-func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...OutfeedDequeueAttr) (output tf.Output) {
+// Returns the created operation.
+func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCreateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// TensorForestCreateTreeVariable creates the tree resource identified by `tree_handle`.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be created.
+//	tree_config: Serialized proto string of the boosted_trees.Tree.
+//
+// Returns the created operation.
+func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return // scope already holds an error; o stays nil
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestCreateTreeVariable",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Switch forwards `data` to the output port determined by `pred`.
+//
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
+//
+// See also `RefSwitch` and `Merge`.
+//
+// Arguments:
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
+//
+// Returns output_false (if `pred` is false, data will be forwarded to this output) and output_true (if `pred` is true, data will be forwarded to this output).
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; both outputs stay zero-valued
+	}
+	opspec := tf.OpSpec{
+		Type: "Switch",
+		Input: []tf.Input{
+			data, pred,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// RaggedRangeAttr is an optional argument to RaggedRange.
+type RaggedRangeAttr func(optionalAttr)
+
+// RaggedRangeTsplits sets the optional Tsplits attribute to value.
+// If not specified, defaults to DT_INT64
+func RaggedRangeTsplits(value tf.DataType) RaggedRangeAttr {
+	return func(m optionalAttr) {
+		m["Tsplits"] = value
+	}
+}
+
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
+//
+//
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
+//
+// ```python
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
+// ```
+//
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
+//
+// Arguments:
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
+//
+// Returns the `row_splits` for the returned `RaggedTensor` (rt_nested_splits) and the `flat_values` for the returned `RaggedTensor` (rt_dense_values).
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output, optional ...RaggedRangeAttr) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OutfeedDequeue",
-
+		Type: "RaggedRange",
+		Input: []tf.Input{
+			starts, limits, deltas,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // RandomPoissonAttr is an optional argument to RandomPoisson.
@@ -12075,127 +11321,239 @@
 	return op.Output(0)
 }
 
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
-		Input: []tf.Input{
-			alpha, sample,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// Computes the sum along segments of a tensor.
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
-//
-// For example:
-//
-// ```
-// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// # ==> [[5, 5, 5, 5],
-// #      [5, 6, 7, 8]]
-// ```
-//
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentSum",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["seed"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
 //
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
-//
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomStandardNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
+// zero; if you specify a negative number for `axis` it is counted backward from
+// the end.
+//
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
+//
+// Other examples:
+//
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
+//
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
+//
+// This operation requires that:
+//
+// `-1-input.dims() <= axis <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
+//
+// Arguments:
+//
+//	axis: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`. Must be in the range
+// `[-rank(input) - 1, rank(input)]`.
+//
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExpandDims",
+		Input: []tf.Input{
+			input, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// MaxPoolV2 performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns output: the max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; output stays zero-valued
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs) // options are applied after the required attribute is set
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolV2",
+		Input: []tf.Input{
+			input, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomUniform outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns output: a tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already holds an error; output stays zero-valued
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs) // options are applied after the required attribute is set
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniform",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
+type StatefulStandardNormalAttr func(optionalAttr)
+
+// StatefulStandardNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. This op is deprecated in favor of op 'StatefulStandardNormalV2'
+//
+// DEPRECATED at GraphDef version 29: Use StatefulStandardNormalV2 instead
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12204,9 +11562,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "StatefulStandardNormal",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			resource, shape,
 		},
 		Attrs: attrs,
 	}
@@ -12214,126 +11572,393 @@
 	return op.Output(0)
 }
 
-// Bucketize each feature based on bucket boundaries.
+// Returns the cardinality of `input_dataset`.
 //
-// An op that returns a list of float tensors, where each tensor represents the
-// bucketized values for a single feature.
+// Returns the cardinality of `input_dataset`.
 //
 // Arguments:
-//	float_values: float; List of Rank 1 Tensor each containing float values for a single feature.
-//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single
-// feature.
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
 //
-// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
-func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) {
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesBucketize",
+		Type: "ExperimentalDatasetCardinality",
 		Input: []tf.Input{
-			tf.OutputList(float_values), tf.OutputList(bucket_boundaries),
+			input_dataset,
 		},
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns three outputs, in order:
+//	sampled_candidates: A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.
+//	true_expected_count: A batch_size * num_true matrix, representing the number of times
+// each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
+//	sampled_expected_count: A vector of length num_sampled, for each sampled candidate representing the number
+// of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LogUniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatefulUniformInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulUniformInt",
+		Input: []tf.Input{
+			resource, algorithm, shape, minval, maxval,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters.
+type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adadelta embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns parameters: Parameter parameters updated by the Adadelta optimization algorithm; accumulators: Parameter accumulators updated by the Adadelta optimization algorithm; updates: Parameter updates updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdadeltaParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like an ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//	indices: Not described upstream; passed as the op's second input, between key and values.
+//	values: a list of tensors
+//	dtypes: A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3DGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple.
+type OutfeedDequeueTupleAttr func(optionalAttr)
+
+// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Retrieve multiple values from the computation outfeed.
+//
+// This operation will block indefinitely until data is available. Output `i`
+// corresponds to XLA tuple element `i`.
+//
+// Arguments:
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
+//
+// Returns A list of tensors that will be read from the outfeed.
+func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OutfeedDequeueTuple",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
 	var idx int
 	var err error
-	if buckets, idx, err = makeOutputList(op, idx, "buckets"); err != nil {
-		scope.UpdateErr("BoostedTreesBucketize", err)
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("OutfeedDequeueTuple", err)
 		return
 	}
-	return buckets
+	return outputs
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
+// DecodePaddedRawAttr is an optional argument to DecodePaddedRaw.
+type DecodePaddedRawAttr func(optionalAttr)
 
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes size of weights that can be used by a Cudnn RNN model.
+// DecodePaddedRawLittleEndian sets the optional little_endian attribute to value.
 //
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
+// value: Whether the input `input_bytes` is in little-endian order. Ignored for
+// `out_type` values that are stored in a single byte, like `uint8`
+// If not specified, defaults to true
+func DecodePaddedRawLittleEndian(value bool) DecodePaddedRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// Arguments:
+//	input_bytes: Tensor of string to be decoded.
+//	fixed_length: Length in bytes for each element of the decoded output. Must be a multiple
+// of the size of the output type.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`. The added dimension
+// will have size equal to the length of the elements of `bytes` divided by the
+// number of bytes to represent `out_type`.
+func DecodePaddedRaw(scope *Scope, input_bytes tf.Output, fixed_length tf.Output, out_type tf.DataType, optional ...DecodePaddedRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
+	attrs := map[string]interface{}{"out_type": out_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "DecodePaddedRaw",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			input_bytes, fixed_length,
 		},
 		Attrs: attrs,
 	}
@@ -12341,74 +11966,31 @@
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// StatefulUniformAttr is an optional argument to StatefulUniform.
+type StatefulUniformAttr func(optionalAttr)
+
+// StatefulUniformDtype sets the optional dtype attribute to value.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulUniformDtype(value tf.DataType) StatefulUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
-		Input: []tf.Input{
-			images,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatefulUniform(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12417,9 +11999,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "StatefulUniform",
 		Input: []tf.Input{
-			logits, num_samples,
+			resource, algorithm, shape,
 		},
 		Attrs: attrs,
 	}
@@ -12427,13 +12009,834 @@
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
-func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Produces the average pool of the input tensor for quantized types.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns output: (no description provided upstream); min_output: The float value that the lowest quantized output value represents; max_output: The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "QuantizedAvgPool",
+		Input: []tf.Input{
+			input, min_input, max_input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2.
+type StatefulStandardNormalV2Attr func(optionalAttr)
+
+// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulStandardNormalV2",
+		Input: []tf.Input{
+			resource, algorithm, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Execute a sub graph on a remote processor.
+//
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
+//
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
+//	Toutputs: Not described upstream; forwarded as the "Toutputs" attr (presumably the dtypes of `outputs` - TODO confirm).
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only op that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Xlogy",
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a TensorList by indexing into a Tensor.
+//
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// Arguments:
+//	tensor: The input tensor.
+//	indices: The indices used to index into the list.
+//	element_shape: The shape of the elements in the list (can be less specified than the shape of the tensor).
+// Returns The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatter",
+		Input: []tf.Input{
+			tensor, indices, element_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to subtract from `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// UnstageAttr is an optional argument to Unstage; each one stores a single attribute in the map it is handed.
+type UnstageAttr func(optionalAttr)
+
+// UnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0.
+//
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0.
+//
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to "".
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to "".
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
+//
+// The basic functionality is similar to dequeue with many fewer capabilities and
+// options; it is optimized for performance. Returns the op's "values" output list.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes} // required attr first; options are layered on top
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unstage",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil { // re-check: adding the op may have recorded an error on the scope
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err) // record the failure on the scope under this op's name
+		return
+	}
+	return values
+}
+
+// Computes exponential linear: `exp(features) - 1` if `features` < 0, `features` otherwise.
+//
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units
+// (ELUs)](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Elu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather; each one stores a single attribute in the map it is handed.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherBatchDims sets the optional batch_dims attribute to value.
+// If not specified, defaults to 0.
+func ResourceGatherBatchDims(value int64) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["batch_dims"] = value
+	}
+}
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true.
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` (`params` here refers to the variable pointed to by `resource`) where:
+//
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ..., :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ..., :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ..., :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype} // required attr first; options are layered on top
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Checks whether a resource handle-based variable has been initialized.
+//
+// Arguments:
+//	resource: the input resource handle.
+//
+// Returns a scalar boolean which is true if the variable has been
+// initialized (output 0 of the op).
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "VarIsInitializedOp",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
+//
+// Returns loss: per example loss (batch_size vector). backprop: backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // output 0 is loss, output 1 is backprop
+}
+
+// Assigns a new value to a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the tensor holding the new value for the variable.
+//
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec) // caller gets the operation itself, not an output tensor
+}
+
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2; each one stores a single attribute in the map it is handed.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <> (an empty list).
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1.
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0.
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0.
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0.
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to "".
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to "".
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns the handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types} // required attr first; options are layered on top
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffleQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp; each one stores a single attribute in the map it is handed.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+//
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true.
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
+//
+// All subsequent operations using the resource will result in a NotFound
+// error status.
+//
+// Arguments:
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DestroyResourceOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec) // caller gets the operation itself, not an output tensor
+}
+
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
+//
+// Arguments:
+//	resource: handle to the resource holding the variable to read.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype} // the only attribute; this op takes no options
+	opspec := tf.OpSpec{
+		Type: "ReadVariableOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Clips tensor values to a specified min and max.
+//
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
+//
+// Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
+//
+// Returns a clipped `Tensor` with the same shape as input `t`.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ClipByValue",
+		Input: []tf.Input{
+			t, clip_value_min, clip_value_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Quantized Batch normalization.
+//
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
+//	out_type: forwarded to the op as its "out_type" attribute (presumably the dtype of `result`; confirm against the op definition).
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulting tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Input: []tf.Input{
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2) // result, result_min, result_max in that order
+}
+
+// Deprecated. Use TensorArrayScatterV3
+//
+// Deprecated: at GraphDef version 26; use TensorArrayScatterV3.
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV2",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MutexV2Attr is an optional argument to MutexV2; each one stores a single attribute in the map it is handed.
+type MutexV2Attr func(optionalAttr)
+
+// MutexV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to "".
+func MutexV2Container(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutexV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to "".
+func MutexV2SharedName(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a Mutex resource that can be locked by `MutexLock`.
+//
+// Returns the mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatefulTruncatedNormalAttr is an optional argument to StatefulTruncatedNormal; each one stores a single attribute in the map it is handed.
+type StatefulTruncatedNormalAttr func(optionalAttr)
+
+// StatefulTruncatedNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT.
+func StatefulTruncatedNormalDtype(value tf.DataType) StatefulTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//
+// Returns random values with the specified shape.
+func StatefulTruncatedNormal(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulTruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulTruncatedNormal",
+		Input: []tf.Input{
+			resource, algorithm, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -12442,18 +12845,3427 @@
 	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2; each one stores a single attribute in the map it is handed.
+type FIFOQueueV2Attr func(optionalAttr)
+
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <> (an empty list).
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1.
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// FIFOQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to "".
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to "".
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements in first-in first-out order.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns the handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types} // required attr first; options are layered on top
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FIFOQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2; each one stores a single attribute in the map it is handed.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to "".
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to "".
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
+//
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
+//
+// Returns the handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TryRpcAttr is an optional argument to TryRpc; each one stores a single attribute in the map it is handed.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to "".
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true.
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0.
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns response: same shape as `request`; serialized proto strings: the rpc responses. status_code: same shape as `request`; values correspond to tensorflow Status enum codes.
+// status_message: same shape as `request`; values correspond to Status messages returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TryRpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2) // response, status_code, status_message in that order
+}
+
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns a complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The gradient of SparseFillEmptyRows.
+//
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
+//
+// Arguments:
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
+//
+// Returns d_values: 1-D.  The backprop into values. d_default_value: 0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseFillEmptyRowsGrad",
+		Input: []tf.Input{
+			reverse_index_map, grad_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // d_values, d_default_value in that order
+}
+
+// RequantizePerChannelAttr is an optional argument to RequantizePerChannel; each one stores a single attribute in the map it is handed.
+type RequantizePerChannelAttr func(optionalAttr)
+
+// RequantizePerChannelOutType sets the optional out_type attribute to value.
+//
+// value: The quantized type of output tensor that needs to be converted.
+// If not specified, defaults to DT_QUINT8.
+func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Requantizes input with min and max values known per channel.
+//
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor.
+//	input_max: The maximum value of the input tensor.
+//	requested_output_min: The minimum value of the output tensor requested.
+//	requested_output_max: The maximum value of the output tensor requested.
+//
+// Returns output: the output tensor. output_min: the minimum value of the final output tensor. output_max: the maximum value of the final output tensor.
+func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the options populate this
+	for _, a := range optional { // each option writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RequantizePerChannel",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2) // output, output_min, output_max in that order
+}
+
+// Returns the element-wise min of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns output_indices: 2-D.  The indices of the output SparseTensor. output_values: 1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil { // scope already holds an error; add nothing
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMinimum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // output_indices, output_values in that order
+}
+
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
+
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
+	}
+}
+
+// Performs beam search decoding on the logits given in input.
+//
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//
+// Returns:
+//	decoded_indices: A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].
+//	decoded_values: A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.
+//	decoded_shape: A list (length: top_paths) of shape vectors.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.
+//	log_probability: A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// The two required attributes are seeded first; options may then add or
+	// overwrite keys in the same map.
+	attrs := map[string]interface{}{
+		"beam_width": beam_width,
+		"top_paths":  top_paths,
+	}
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "CTCBeamSearchDecoder",
+		Input: []tf.Input{inputs, sequence_length},
+		Attrs: attrs,
+	})
+	if err := scope.Err(); err != nil {
+		return
+	}
+
+	// The three list outputs are unpacked in order; next carries the running
+	// output index from one makeOutputList call to the following one.
+	next := 0
+	var err error
+	decoded_indices, next, err = makeOutputList(node, next, "decoded_indices")
+	if err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	decoded_values, next, err = makeOutputList(node, next, "decoded_values")
+	if err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	decoded_shape, next, err = makeOutputList(node, next, "decoded_shape")
+	if err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	// Whatever index remains after the lists is the single log_probability output.
+	log_probability = node.Output(next)
+	return
+}
+
+// Applies softmax to a batched N-D `SparseTensor`.
+//
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+//
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
+//
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
+//
+// Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SparseSoftmax",
+		Input: []tf.Input{sp_indices, sp_values, sp_shape},
+	})
+	output = node.Output(0)
+	return
+}
+
+// Component-wise divides a SparseTensor by a dense Tensor.
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// The sparse operand's three components precede the dense operand.
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseDenseCwiseDiv",
+		Input: []tf.Input{
+			sp_indices,
+			sp_values,
+			sp_shape,
+			dense,
+		},
+	})
+	output = node.Output(0)
+	return
+}
+
+// SparseReduceSumSparseAttr is a functional option accepted by SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims returns an option that stores value under the
+// "keep_dims" attribute key.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	setKeepDims := func(attrs optionalAttr) {
+		attrs["keep_dims"] = value
+	}
+	return setKeepDims
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices,
+			input_values,
+			input_shape,
+			reduction_axes,
+		},
+		Attrs: attrs,
+	})
+	output_indices = node.Output(0)
+	output_values = node.Output(1)
+	output_shape = node.Output(2)
+	return
+}
+
+// Returns the complex conjugate of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
+//
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "Conj",
+		Input: []tf.Input{input},
+	})
+	output = node.Output(0)
+	return
+}
+
+// Computes the sign and the log of the absolute value of the determinant of
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
+//
+// Arguments:
+//	input: Shape is `[N, M, M]`.
+//
+// Returns:
+//	sign: The signs of the log determinants of the inputs. Shape is `[N]`.
+//	log_abs_determinant: The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "LogMatrixDeterminant",
+		Input: []tf.Input{input},
+	})
+	// Output 0 is the sign, output 1 the log of the absolute determinant.
+	sign = node.Output(0)
+	log_abs_determinant = node.Output(1)
+	return
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns:
+//	output_indices: 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.
+//	output_shape: 1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SparseReshape",
+		Input: []tf.Input{input_indices, input_shape, new_shape},
+	})
+	output_indices = node.Output(0)
+	output_shape = node.Output(1)
+	return
+}
+
+// Broadcast an array for a compatible shape.
+//
+// Broadcasting is the process of making arrays to have compatible shapes
+// for arithmetic operations. Two shapes are compatible if for each
+// dimension pair they are either equal or one of them is one. When trying
+// to broadcast a Tensor to a shape, it starts with the trailing dimensions,
+// and works its way forward.
+//
+// For example,
+//
+// ```python
+// >>> x = tf.constant([1, 2, 3])
+// >>> y = tf.broadcast_to(x, [3, 3])
+// >>> sess.run(y)
+// array([[1, 2, 3],
+//        [1, 2, 3],
+//        [1, 2, 3]], dtype=int32)
+// ```
+//
+// In the above example, the input Tensor with the shape of `[1, 3]`
+// is broadcasted to output Tensor with shape of `[3, 3]`.
+//
+// Arguments:
+//	input: A Tensor to broadcast.
+//	shape: A 1-D `int` Tensor. The shape of the desired output.
+//
+// Returns A Tensor.
+func BroadcastTo(scope *Scope, input tf.Output, shape tf.Output) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "BroadcastTo",
+		Input: []tf.Input{input, shape},
+	})
+	output = node.Output(0)
+	return
+}
+
+// Reorders a SparseTensor into the canonical, row-major ordering.
+//
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
+//
+// Reordering does not affect the shape of the SparseTensor.
+//
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns:
+//	output_indices: 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.
+//	output_values: 1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SparseReorder",
+		Input: []tf.Input{input_indices, input_values, input_shape},
+	})
+	output_indices = node.Output(0)
+	output_values = node.Output(1)
+	return
+}
+
+// The gradient operator for the SparseSlice op.
+//
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
+//
+// Arguments:
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. Tensor representing the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
+//
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseSliceGrad",
+		Input: []tf.Input{
+			backprop_val_grad,
+			input_indices,
+			input_start,
+			output_indices,
+		},
+	})
+	val_grad = node.Output(0)
+	return
+}
+
+// BiasAddAttr is a functional option accepted by BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat returns an option that stores value under the
+// "data_format" attribute key.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	setDataFormat := func(attrs optionalAttr) {
+		attrs["data_format"] = value
+	}
+	return setDataFormat
+}
+
+// Adds `bias` to `value`.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "BiasAdd",
+		Input: []tf.Input{value, bias},
+		Attrs: attrs,
+	})
+	output = node.Output(0)
+	return
+}
+
+// Fast Fourier transform.
+//
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "FFT",
+		Input: []tf.Input{input},
+	})
+	output = node.Output(0)
+	return
+}
+
+// SparseReduceSumAttr is a functional option accepted by SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims returns an option that stores value under the
+// "keep_dims" attribute key.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	setKeepDims := func(attrs optionalAttr) {
+		attrs["keep_dims"] = value
+	}
+	return setKeepDims
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseReduceSum",
+		Input: []tf.Input{
+			input_indices,
+			input_values,
+			input_shape,
+			reduction_axes,
+		},
+		Attrs: attrs,
+	})
+	output = node.Output(0)
+	return
+}
+
+// ResourceApplyRMSPropAttr is a functional option accepted by ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking returns an option that stores value under the
+// "use_locking" attribute key.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	setUseLocking := func(attrs optionalAttr) {
+		attrs["use_locking"] = value
+	}
+	return setUseLocking
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//	momentum: Multiplier applied to `mom_{t-1}` in the update rule above.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+	// A scope that already holds an error yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	// This wrapper hands back the operation itself rather than any of its outputs.
+	o = scope.AddOperation(tf.OpSpec{
+		Type: "ResourceApplyRMSProp",
+		Input: []tf.Input{
+			var_,
+			ms,
+			mom,
+			lr,
+			rho,
+			momentum,
+			epsilon,
+			grad,
+		},
+		Attrs: attrs,
+	})
+	return o
+}
+
+// Returns an element-wise indication of the sign of a number.
+//
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "Sign",
+		Input: []tf.Input{x},
+	})
+	y = node.Output(0)
+	return
+}
+
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This allows us to avoid string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns:
+//	output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
+//	output_values: 1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.
+//	output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	// A scope that already holds an error yields the zero-valued results.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// All five attributes are required; this op takes no optional ones.
+	attrs := map[string]interface{}{
+		"hashed_output": hashed_output,
+		"num_buckets":   num_buckets,
+		"hash_key":      hash_key,
+		"out_type":      out_type,
+		"internal_type": internal_type,
+	}
+	// Each slice argument is passed as a single list-valued input.
+	node := scope.AddOperation(tf.OpSpec{
+		Type: "SparseCross",
+		Input: []tf.Input{
+			tf.OutputList(indices),
+			tf.OutputList(values),
+			tf.OutputList(shapes),
+			tf.OutputList(dense_inputs),
+		},
+		Attrs: attrs,
+	})
+	output_indices = node.Output(0)
+	output_values = node.Output(1)
+	output_shape = node.Output(2)
+	return
+}
+
+// ResourceSparseApplyFtrlV2Attr is a functional option accepted by ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking returns an option that stores value
+// under the "use_locking" attribute key.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	setUseLocking := func(attrs optionalAttr) {
+		attrs["use_locking"] = value
+	}
+	return setUseLocking
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regularization. Must be a scalar.
+//	l2_shrinkage: Multiplies `2 * var` in the `grad_with_shrinkage` formula above.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+	// A scope that already holds an error yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	// This wrapper hands back the operation itself rather than any of its outputs.
+	o = scope.AddOperation(tf.OpSpec{
+		Type: "ResourceSparseApplyFtrlV2",
+		Input: []tf.Input{
+			var_,
+			accum,
+			linear,
+			grad,
+			indices,
+			lr,
+			l1,
+			l2,
+			l2_shrinkage,
+			lr_power,
+		},
+		Attrs: attrs,
+	})
+	return o
+}
+
+// RpcAttr is a functional option accepted by Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol returns an option that stores value under the "protocol"
+// attribute key.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	setProtocol := func(attrs optionalAttr) {
+		attrs["protocol"] = value
+	}
+	return setProtocol
+}
+
+// RpcFailFast returns an option that stores value under the "fail_fast"
+// attribute key.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	setFailFast := func(attrs optionalAttr) {
+		attrs["fail_fast"] = value
+	}
+	return setFailFast
+}
+
+// RpcTimeoutInMs returns an option that stores value under the
+// "timeout_in_ms" attribute key.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	setTimeoutInMs := func(attrs optionalAttr) {
+		attrs["timeout_in_ms"] = value
+	}
+	return setTimeoutInMs
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Start from an empty attribute set and let each option fill in its key.
+	attrs := make(map[string]interface{})
+	for _, applyAttr := range optional {
+		applyAttr(attrs)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "Rpc",
+		Input: []tf.Input{address, method, request},
+		Attrs: attrs,
+	})
+	response = node.Output(0)
+	return
+}
+
+// Rounds the values of a tensor to the nearest integer, element-wise.
+//
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::rint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
+	// Nothing is added once the scope has recorded an error.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "Round",
+		Input: []tf.Input{x},
+	})
+	y = node.Output(0)
+	return
+}
+
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyProximalGradientDescent",
+		Input: []tf.Input{
+			var_, alpha, l1, l2, delta,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
+
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
+	return func(m optionalAttr) {
+		m["Truncate"] = value
+	}
+}
+
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"DstT": DstT}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cast",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The parameters may each be a scalar which applies to the entire output, or a
+// vector of length shape[0] which stores the parameters for each batch.
+//
+// Arguments:
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
+//
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParameterizedTruncatedNormal",
+		Input: []tf.Input{
+			shape, means, stdevs, minvals, maxvals,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs gradient updates of embedding tables.
+//
+// Arguments:
+//	inputs: A TensorList of gradients with which to update embedding tables.
+// This argument has the same length and shapes as the return value of
+// RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+// with respect to the embedding activations. The embedding tables are updated
+// from these gradients via the optimizer specified in the TPU embedding
+// configuration given to tpu.initialize_system.
+//	learning_rates: A TensorList of float32 scalars, one for each dynamic learning
+// rate tag: see the comments in
+// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+// Multiple tables can share the same dynamic learning rate tag as specified
+// in the configuration. If the learning rates for all tables are constant,
+// this list should be empty.
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//
+// Returns the created operation.
+func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"config": config}
+	opspec := tf.OpSpec{
+		Type: "SendTPUEmbeddingGradients",
+		Input: []tf.Input{
+			tf.OutputList(inputs), tf.OutputList(learning_rates),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
+
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceMaxSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize.
+type QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr func(optionalAttr)
+
+// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType sets the optional out_type attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_QUINT8
+func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value.
+//
+// value: List of dilation values.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes quantized depthwise Conv2D with Bias, Relu and Requantize.
+//
+// Arguments:
+//	input: The original input tensor.
+//	filter: The original filter tensor.
+//	bias: The original bias tensor.
+//	min_input: The float value that the minimum quantized input value represents.
+//	max_input: The float value that the maximum quantized input value represents.
+//	min_filter: The float value that the minimum quantized filter value represents.
+//	max_filter: The float value that the maximum quantized filter value represents.
+//	min_freezed_output: The minimum float value of the output tensor.
+//	max_freezed_output: The maximum float value of the output tensor.
+//	strides: List of stride values.
+//
+//
+// Returns The output tensor. The float value that the minimum quantized output value represents. The float value that the maximum quantized output value represents.
+func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, min_freezed_output tf.Output, max_freezed_output tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize",
+		Input: []tf.Input{
+			input, filter, bias, min_input, max_input, min_filter, max_filter, min_freezed_output, max_freezed_output,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
+
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
+//
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
+//
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
+	}
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
+//
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `message_format` attribute.
+//
+// Arguments:
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names. An extension field can be decoded
+// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
+//	output_types: List of TF types to use for the respective field in field_names.
+//
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values. List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeProtoV2",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
+	}
+	return sizes, values
+}
+
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
+//
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Writes the given dataset to the given file using the TFRecord format.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//
+// Returns the created operation.
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetToTFRecord",
+		Input: []tf.Input{
+			input_dataset, filename, compression_type,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
+
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes scaled exponential linear.
+//
+// `scale * alpha * (exp(features) - 1)` if features < 0, `scale * features` otherwise.
+//
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Selu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Calculates gains for each feature and returns the best possible split information for the feature.
+//
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The output lists all have the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: minimum avg of hessians in a node required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes. An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes. An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes. A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes. A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	const opName = "BoostedTreesCalculateBestGainsPerFeature"
+	spec := tf.OpSpec{
+		Type: opName,
+		Input: []tf.Input{
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+		},
+		Attrs: map[string]interface{}{"max_splits": max_splits},
+	}
+	node := scope.AddOperation(spec)
+	if scope.Err() != nil {
+		return
+	}
+	var next int
+	var err error
+	if node_ids_list, next, err = makeOutputList(node, next, "node_ids_list"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if gains_list, next, err = makeOutputList(node, next, "gains_list"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if thresholds_list, next, err = makeOutputList(node, next, "thresholds_list"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if left_node_contribs_list, next, err = makeOutputList(node, next, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if right_node_contribs_list, next, err = makeOutputList(node, next, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	return
+}
+
+// DecodeBmpAttr is a functional option that sets one optional attribute of DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
+
+// DecodeBmpChannels stores value as the optional channels attribute.
+// When the option is omitted, the attribute defaults to 0.
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["channels"] = value }
+	return apply
+}
+
+// Decodes the first frame of a BMP-encoded image into a uint8 tensor.
+//
+// The optional `channels` attribute selects how many color channels the
+// decoded image should have.
+//
+// Accepted values are:
+//
+// *   0: Keep the number of channels found in the BMP-encoded image.
+// *   3: Produce an RGB image.
+// *   4: Produce an RGBA image.
+//
+// Arguments:
+//	contents: 0-D.  The BMP-encoded image.
+//
+// Returns a 3-D tensor with shape `[height, width, channels]`, in RGB order.
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+	if err := scope.Err(); err != nil {
+		return image
+	}
+	settings := map[string]interface{}{}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{contents}
+	spec := tf.OpSpec{
+		Type:  "DecodeBmp",
+		Input: inputs,
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	image = node.Output(0)
+	return image
+}
+
+// Scatters data from the input value into specific elements of a TensorArray.
+//
+// `indices` must be a vector whose length matches the first dim of `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns a float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if err := scope.Err(); err != nil {
+		return flow_out
+	}
+	inputs := []tf.Input{handle, indices, value, flow_in}
+	spec := tf.OpSpec{
+		Type:  "TensorArrayScatterV3",
+		Input: inputs,
+	}
+	node := scope.AddOperation(spec)
+	flow_out = node.Output(0)
+	return flow_out
+}
+
+// Computes how many elements the given table holds.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns a scalar that contains the number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+	if err := scope.Err(); err != nil {
+		return size
+	}
+	inputs := []tf.Input{table_handle}
+	spec := tf.OpSpec{
+		Type:  "LookupTableSizeV2",
+		Input: inputs,
+	}
+	node := scope.AddOperation(spec)
+	size = node.Output(0)
+	return size
+}
+
+// Removes keys and their associated values from a table.
+//
+// The tensor `keys` must be of the same type as the keys of the table. Keys
+// that are not already in the table are silently ignored.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
+//
+// Returns the created operation.
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	inputs := []tf.Input{table_handle, keys}
+	spec := tf.OpSpec{
+		Type:  "LookupTableRemoveV2",
+		Input: inputs,
+	}
+	o = scope.AddOperation(spec)
+	return o
+}
+
+// RealAttr is a functional option that sets one optional attribute of Real.
+type RealAttr func(optionalAttr)
+
+// RealTout stores value as the optional Tout attribute.
+// When the option is omitted, the attribute defaults to DT_FLOAT.
+func RealTout(value tf.DataType) RealAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["Tout"] = value }
+	return apply
+}
+
+// Extracts the real part of each complex number in a tensor.
+//
+// Given a tensor `input` of complex numbers, this operation produces a tensor
+// of type `float` holding the real part of each element of `input`. Every
+// element of `input` must be a complex number of the form \\(a + bj\\), where
+// *a* is the real part returned by this operation and *b* is the imaginary part.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	settings := map[string]interface{}{}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{input}
+	spec := tf.OpSpec{
+		Type:  "Real",
+		Input: inputs,
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	output = node.Output(0)
+	return output
+}
+
+// Splits a `SparseTensor` into `num_split` tensors along one dimension.
+//
+// If `shape[split_dim]` is not an integer multiple of `num_split`, slices
+// `[0 : shape[split_dim] % num_split]` get one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// then, graphically, the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor holding the indices of the sparse tensor.
+//	values: 1-D tensor holding the values of the sparse tensor.
+//	shape: 1-D tensor holding the shape of the sparse tensor.
+//	num_split: The number of ways to split.
+//
+// Returns three lists, in order:
+// output_indices, a list of 1-D tensors holding the indices of the output
+// sparse tensors; output_values, a list of 1-D tensors holding the values of
+// the output sparse tensors; and output_shape, a list of 1-D tensors holding
+// the shape of the output sparse tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	const opName = "SparseSplit"
+	spec := tf.OpSpec{
+		Type: opName,
+		Input: []tf.Input{
+			split_dim, indices, values, shape,
+		},
+		Attrs: map[string]interface{}{"num_split": num_split},
+	}
+	node := scope.AddOperation(spec)
+	if scope.Err() != nil {
+		return
+	}
+	var next int
+	var err error
+	if output_indices, next, err = makeOutputList(node, next, "output_indices"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if output_values, next, err = makeOutputList(node, next, "output_values"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	if output_shape, next, err = makeOutputList(node, next, "output_shape"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	return
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is a functional option that sets one optional attribute of RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId stores value as the optional table_id attribute.
+// When the option is omitted, the attribute defaults to -1.
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return apply
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName stores value as the optional table_name attribute.
+// When the option is omitted, the attribute defaults to "".
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return apply
+}
+
+// Retrieves proximal Adagrad embedding parameters with debug support.
+//
+// This op retrieves optimization parameters from embedding to host memory. It
+// must be preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, it is used to retrieve updated
+// parameters before saving a checkpoint.
+//
+// Returns the parameters, the accumulators and the gradient_accumulators, each as updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](settings)
+	}
+	spec := tf.OpSpec{
+		Type:  "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	parameters, accumulators, gradient_accumulators = node.Output(0), node.Output(1), node.Output(2)
+	return
+}
+
+// RandomPoissonV2Attr is a functional option that sets one optional attribute of RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
+
+// RandomPoissonV2Seed stores value as the optional seed attribute.
+//
+// value: If either `seed` or `seed2` is set to a non-zero value, the random
+// number generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// When the option is omitted, the attribute defaults to 0.
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["seed"] = value }
+	return apply
+}
+
+// RandomPoissonV2Seed2 stores value as the optional seed2 attribute.
+//
+// value: A second seed to avoid seed collision.
+// When the option is omitted, the attribute defaults to 0.
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["seed2"] = value }
+	return apply
+}
+
+// RandomPoissonV2Dtype stores value as the optional dtype attribute.
+// When the option is omitted, the attribute defaults to DT_INT64.
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["dtype"] = value }
+	return apply
+}
+
+// Draws random values from the Poisson distribution(s) described by rate.
+//
+// The op picks one of two algorithms based on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns a tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` holds the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	settings := map[string]interface{}{}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{shape, rate}
+	spec := tf.OpSpec{
+		Type:  "RandomPoissonV2",
+		Input: inputs,
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	output = node.Output(0)
+	return output
+}
+
+// Outputs a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if err := scope.Err(); err != nil {
+		return fact
+	}
+	// Fact is given neither inputs nor attributes, so only the type is set.
+	spec := tf.OpSpec{Type: "Fact"}
+	node := scope.AddOperation(spec)
+	fact = node.Output(0)
+	return fact
+}
+
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is a functional option that sets one optional attribute of LoadTPUEmbeddingRMSPropParametersGradAccumDebug.
+type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId stores value as the optional table_id attribute.
+// When the option is omitted, the attribute defaults to -1.
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return apply
+}
+
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName stores value as the optional table_name attribute.
+// When the option is omitted, the attribute defaults to "".
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return apply
+}
+
+// Loads RMSProp embedding parameters with debug support.
+//
+// This op loads optimization parameters into HBM for embedding. It must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, it is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm.
+//	num_shards: (no description provided)
+//	shard_id: (no description provided)
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{parameters, ms, mom, gradient_accumulators}
+	spec := tf.OpSpec{
+		Type:  "LoadTPUEmbeddingRMSPropParametersGradAccumDebug",
+		Input: inputs,
+		Attrs: settings,
+	}
+	o = scope.AddOperation(spec)
+	return o
+}
+
+// TopKAttr is a functional option that sets one optional attribute of TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted stores value as the optional sorted attribute.
+//
+// value: If true, the resulting `k` elements will be sorted by their values
+// in descending order.
+// When the option is omitted, the attribute defaults to true.
+func TopKSorted(value bool) TopKAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["sorted"] = value }
+	return apply
+}
+
+// Finds the values and indices of the `k` largest elements along the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// For a vector (rank-1) input, the op finds the `k` largest entries in the
+// vector and outputs their values and indices as vectors.  Thus `values[j]` is
+// the `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), it computes the top `k` entries in
+// each row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns the `k` largest elements along each last dimensional slice (values), and the indices of `values` within the last dimension of `input` (indices).
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	settings := map[string]interface{}{"k": k}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{input}
+	spec := tf.OpSpec{
+		Type:  "TopK",
+		Input: inputs,
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	values, indices = node.Output(0), node.Output(1)
+	return values, indices
+}
+
+// Splits `data` into `num_partitions` tensors using indices from `partitions`.
+//
+// Every index tuple `js` of size `partitions.ndim` selects a slice `data[js, ...]`
+// that becomes part of `outputs[partitions[js]]`.  Slices with `partitions[js] = i`
+// are stored in `outputs[i]` in lexicographic order of `js`, so the first
+// dimension of `outputs[i]` equals the number of entries in `partitions` equal
+// to `i`. In detail,
+//
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+//
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example of how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//	data: (no description provided)
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	const opName = "DynamicPartition"
+	spec := tf.OpSpec{
+		Type: opName,
+		Input: []tf.Input{
+			data, partitions,
+		},
+		Attrs: map[string]interface{}{"num_partitions": num_partitions},
+	}
+	node := scope.AddOperation(spec)
+	if scope.Err() != nil {
+		return
+	}
+	// The "outputs" list is the op's only output, so it is read from index 0.
+	var err error
+	if outputs, _, err = makeOutputList(node, 0, "outputs"); err != nil {
+		scope.UpdateErr(opName, err)
+		return
+	}
+	return outputs
+}
+
+// SdcaOptimizerV2Attr is a functional option that sets one optional attribute of SdcaOptimizerV2.
+type SdcaOptimizerV2Attr func(optionalAttr)
+
+// SdcaOptimizerV2Adaptive stores value as the optional adaptive attribute.
+//
+// value: Whether to use Adaptive SDCA for the inner loop.
+// When the option is omitted, the attribute defaults to true.
+func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["adaptive"] = value }
+	return apply
+}
+
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field may be omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state data; a list
+// of vectors where each value is the delta weights associated with a sparse
+// feature group; and a list of vectors where the values are the delta weights
+// associated with a dense feature group.
+func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaOptimizerV2",
+		Input: []tf.Input{
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	// Step past the single output so the list outputs do not start on its slot.
+	idx++
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+}
+
+// LoadTPUEmbeddingMomentumParametersAttr is a functional option that sets one optional attribute of LoadTPUEmbeddingMomentumParameters.
+type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingMomentumParametersTableId stores value as the optional table_id attribute.
+// When the option is omitted, the attribute defaults to -1.
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return apply
+}
+
+// LoadTPUEmbeddingMomentumParametersTableName stores value as the optional table_name attribute.
+// When the option is omitted, the attribute defaults to "".
+func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return apply
+}
+
+// Loads Momentum embedding parameters.
+//
+// This op loads optimization parameters into HBM for embedding. It must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, it is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//	num_shards: (no description provided)
+//	shard_id: (no description provided)
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{parameters, momenta}
+	spec := tf.OpSpec{
+		Type:  "LoadTPUEmbeddingMomentumParameters",
+		Input: inputs,
+		Attrs: settings,
+	}
+	o = scope.AddOperation(spec)
+	return o
+}
+
+// Computes the eigen decomposition of a batch of square self-adjoint matrices.
+//
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, subject to the same constraints as the single matrix
+// SelfAdjointEig.
+//
+// The result is a [..., M+1, M] matrix in which [..., 0,:] holds the
+// eigenvalues and the subsequent [...,1:, :] holds the eigenvectors. The
+// eigenvalues are sorted in non-decreasing order.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns a tensor whose shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	inputs := []tf.Input{input}
+	spec := tf.OpSpec{
+		Type:  "SelfAdjointEig",
+		Input: inputs,
+	}
+	node := scope.AddOperation(spec)
+	output = node.Output(0)
+	return output
+}
+
+// Converts a `RaggedTensor` into a `SparseTensor` holding the same values.
+//
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
+//
+// Arguments:
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
+//
+// Returns the indices for the `SparseTensor`, the values of the `SparseTensor`, and `sparse_dense_shape`, which is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	inputs := []tf.Input{tf.OutputList(rt_nested_splits), rt_dense_values}
+	spec := tf.OpSpec{
+		Type:  "RaggedTensorToSparse",
+		Input: inputs,
+	}
+	node := scope.AddOperation(spec)
+	sparse_indices, sparse_values, sparse_dense_shape = node.Output(0), node.Output(1), node.Output(2)
+	return
+}
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is a functional option that sets one optional attribute of LoadTPUEmbeddingAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId stores value as the optional table_id attribute.
+// When the option is omitted, the attribute defaults to -1.
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return apply
+}
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName stores value as the optional table_name attribute.
+// When the option is omitted, the attribute defaults to "".
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return apply
+}
+
+// Loads Adagrad embedding parameters with debug support.
+//
+// This op loads optimization parameters into HBM for embedding. It must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, it is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adagrad optimization algorithm.
+//	num_shards: (no description provided)
+//	shard_id: (no description provided)
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](settings)
+	}
+	inputs := []tf.Input{parameters, accumulators, gradient_accumulators}
+	spec := tf.OpSpec{
+		Type:  "LoadTPUEmbeddingAdagradParametersGradAccumDebug",
+		Input: inputs,
+		Attrs: settings,
+	}
+	o = scope.AddOperation(spec)
+	return o
+}
+
+// MapIncompleteSizeAttr is a functional option that sets one optional attribute of MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity stores value as the optional capacity attribute.
+// When the option is omitted, the attribute defaults to 0.
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["capacity"] = value }
+	return apply
+}
+
+// MapIncompleteSizeMemoryLimit stores value as the optional memory_limit attribute.
+// When the option is omitted, the attribute defaults to 0.
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["memory_limit"] = value }
+	return apply
+}
+
+// MapIncompleteSizeContainer stores value as the optional container attribute.
+// When the option is omitted, the attribute defaults to "".
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["container"] = value }
+	return apply
+}
+
+// MapIncompleteSizeSharedName stores value as the optional shared_name attribute.
+// When the option is omitted, the attribute defaults to "".
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["shared_name"] = value }
+	return apply
+}
+
+// Op that returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+	if err := scope.Err(); err != nil {
+		return size
+	}
+	settings := map[string]interface{}{"dtypes": dtypes}
+	for i := range optional {
+		optional[i](settings)
+	}
+	spec := tf.OpSpec{
+		Type:  "MapIncompleteSize",
+		Attrs: settings,
+	}
+	node := scope.AddOperation(spec)
+	size = node.Output(0)
+	return size
+}
+
+// Returns, element-wise, the largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+	if err := scope.Err(); err != nil {
+		return y
+	}
+	inputs := []tf.Input{x}
+	spec := tf.OpSpec{
+		Type:  "Floor",
+		Input: inputs,
+	}
+	node := scope.AddOperation(spec)
+	y = node.Output(0)
+	return y
+}
+
+// FractionalMaxPoolGradAttr is a functional option that sets one optional attribute of FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping stores value as the optional overlapping attribute.
+//
+// value: When set to True, the values at the boundary of adjacent pooling
+// cells are used by both cells when pooling. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2, will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// When the option is omitted, the attribute defaults to false.
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	// The returned option writes the captured value when it is applied.
+	apply := func(attrs optionalAttr) { attrs["overlapping"] = value }
+	return apply
+}
+
+// Computes gradient of the FractionalMaxPool function.
+//
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling_sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalLatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Delete the stack from its resource container.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StackCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a TensorList by indexing into a Tensor.
+//
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// num_elements: The size of the output list. Must be large enough to accommodate
+//   the largest index in indices. If -1, the list is just large enough to include
+//   the largest index in indices.
+// output_handle: The TensorList.
+func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatterV2",
+		Input: []tf.Input{
+			tensor, indices, element_shape, num_elements,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs deterministic pseudorandom integers from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+//
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniformInt",
+		Input: []tf.Input{
+			shape, seed, minval, maxval,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// PrelinearizeTupleAttr is an optional argument to PrelinearizeTuple.
+type PrelinearizeTupleAttr func(optionalAttr)
+
+// PrelinearizeTupleLayouts sets the optional layouts attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence for all the
+// tuple shapes in the order the shapes appear in the "shapes" input. The layout
+// elements for a sub-shape can be set to -1 in which case the corresponding layout
+// will be computed by the infeed operation.
+// If not specified, defaults to <>
+func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr {
+	return func(m optionalAttr) {
+		m["layouts"] = value
+	}
+}
+
+// An op which linearizes multiple Tensor values to an opaque variant tensor.
+//
+// Arguments:
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
+func PrelinearizeTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...PrelinearizeTupleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PrelinearizeTuple",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// CudnnRNNV3TimeMajor sets the optional time_major attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3TimeMajor(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["time_major"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one more input, "sequence_lengths", than CudnnRNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
+//     [batch_size, seq_length, input_size].
+// input_h: If time_major is true, this is a 3-D tensor with the shape of
+//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+//     is [batch_size, num_layer * dir, num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
+//     shape is [batch_size, seq_length, dir * num_units].
+// output_h: The same shape as input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inference or
+//   training.
+// time_major: Indicates whether the input/output format is time major or batch
+//     major.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters.
+type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingRMSPropParameters",
+		Input: []tf.Input{
+			parameters, ms, mom,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Fetches multiple values from infeed as an XLA tuple.
+//
+// Arguments:
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
+//
+// Returns A list of tensors that will be provided using the infeed mechanism.
+func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
+	opspec := tf.OpSpec{
+		Type: "InfeedDequeueTuple",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("InfeedDequeueTuple", err)
+		return
+	}
+	return outputs
+}
+
+// BatchToSpace for 4-D tensors of type T.
+//
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
+//
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
+//
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [2], [3], [4]],
+//       [[5], [6], [7], [8]]],
+//      [[[9], [10], [11], [12]],
+//       [[13], [14], [15], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"block_size": block_size}
+	opspec := tf.OpSpec{
+		Type: "BatchToSpace",
+		Input: []tf.Input{
+			input, crops,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
 //
 // Duplicate entries are handled correctly: if multiple `indices` reference
 // the same location, their contributions are combined.
@@ -12470,12 +16282,12 @@
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
+		Type: "ResourceScatterMax",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -12483,67 +16295,506 @@
 	return scope.AddOperation(opspec)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
+// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple.
+type InfeedEnqueueTupleAttr func(optionalAttr)
+
+// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value.
 //
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
+// value: A vector holding the requested layout in minor-to-major sequence for
+// all the tuple shapes, in the order the shapes appear in the "shapes" input.
+// The layout elements for a sub-shape can be set to -1, in which case the
+// corresponding layout will be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["layouts"] = value
+	}
+}
+
+// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
-//
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
-//
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Feeds multiple Tensor values into the computation as an XLA tuple.
 //
 // Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+// Returns the created operation.
+func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "InfeedEnqueueTuple",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that emits the lines of one or more text files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
+}
+
+// Worker heartbeat op.
+//
+// Heartbeats may be sent periodically to indicate the coordinator is still active,
+// to retrieve the current worker status and to expedite shutdown when necessary.
+//
+// Arguments:
+//	request: A string tensor containing a serialized WorkerHeartbeatRequest
+//
+// Returns A string tensor containing a serialized WorkerHeartbeatResponse
+func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "WorkerHeartbeat",
+		Input: []tf.Input{
+			request,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restores tensors from a V2 checkpoint.
+//
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found, proceeds to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
+//
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+//
+// Arguments:
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
+//
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	opspec := tf.OpSpec{
+		Type: "RestoreV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
+}
+
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Interleave the values from the `data` tensors into a single tensor.
+//
+// Builds a merged tensor such that
+//
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
+//
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices) + 1] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # applies (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ParallelDynamicStitch",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(data),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A placeholder op for a value that will be fed into the computation.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
+//
+// Returns A tensor that will be provided using the infeed mechanism.
+func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "InfeedDequeue",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
+
+// HashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// HashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
+//
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HashTableV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters.
+type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingFTRLParameters",
+		Input: []tf.Input{
+			parameters, accumulators, linears,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Scatters tensor at indices in an input list.
+//
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// input_handle: The list to scatter into.
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// output_handle: The TensorList.
+func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatterIntoExistingList",
+		Input: []tf.Input{
+			input_handle, tensor, indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs a padding as a preprocess during a convolution.
+//
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "FusedPadConv2D",
+		Input: []tf.Input{
+			input, paddings, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Rolls the elements of a tensor along an axis.
+//
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll past the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
+//
+// For example:
+//
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+//
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
+//
+// Arguments:
+//
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Roll",
+		Input: []tf.Input{
+			input, shift, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
@@ -12699,31 +16950,316 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StatefulUniformAttr is an optional argument to StatefulUniform.
-type StatefulUniformAttr func(optionalAttr)
+// OutfeedDequeueAttr is an optional argument to OutfeedDequeue.
+type OutfeedDequeueAttr func(optionalAttr)
 
-// StatefulUniformDtype sets the optional dtype attribute to value.
+// OutfeedDequeueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulUniformDtype(value tf.DataType) StatefulUniformAttr {
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueDeviceOrdinal(value int64) OutfeedDequeueAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Retrieves a single tensor from the computation outfeed.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// This operation will block indefinitely until data is available.
 //
 // Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	algorithm: The RNG algorithm.
-//	shape: The shape of the output tensor.
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
 //
-// Returns Random values with specified shape.
-func StatefulUniform(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformAttr) (output tf.Output) {
+// Returns A tensor that will be read from the device outfeed.
+func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...OutfeedDequeueAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OutfeedDequeue",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A TPU core selector Op.
+//
+// This Op produces a set of TPU cores (for warm-up) or a single TPU core
+// (for regular inference) to execute the TPU program on. The output is
+// consumed by TPUPartitionedCall.
+//
+// Returns A vector of 1 or more TPU cores.
+func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUOrdinalSelector",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug.
+type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, momenta, velocities, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the Bessel i0e function of `x` element-wise.
+//
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
+//
+// This function is faster and more numerically stable than `bessel_i0(x)`.
+func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BesselI0e",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `SparseTensor` along the specified dimension.
+//
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`. 1-D.  Non-empty values of the concatenated `SparseTensor`. 1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	opspec := tf.OpSpec{
+		Type: "SparseConcat",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Strip leading and trailing whitespaces from the Tensor.
+//
+// Arguments:
+//	input: A string `Tensor` of any shape.
+//
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StringStrip",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
+//
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
+
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
+//
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StaticRegexReplace",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
+
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+//
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Extract the shape information of a JPEG-encoded image.
+//
+// This op only parses the image header, so it is much faster than DecodeJpeg.
+//
+// Arguments:
+//	contents: 0-D. The JPEG-encoded image.
+//
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12732,9 +17268,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatefulUniform",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			resource, algorithm, shape,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -12742,94 +17278,215 @@
 	return op.Output(0)
 }
 
-// Returns a `RaggedTensor` containing the specified sequences of numbers.
-//
-//
-// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
-// `rt_nested_splits`, such that
-// `result[i] = range(starts[i], limits[i], deltas[i])`.
-//
-// ```python
-// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
-// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
-// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// >>> print result.eval().tolist()
-// [[2],               # result[0] = range(2, 3)
-//  [],                # result[1] = range(5, 5)
-//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
-// ```
-//
-// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
-// The vector inputs must all have the same size.  Scalar inputs are broadcast
-// to match the size of the vector inputs.
-//
-// Arguments:
-//	starts: The starts of each range.
-//	limits: The limits of each range.
-//	deltas: The deltas of each range.
-//
-// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
-func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
+// Connects N inputs to an N-way replicated TPU computation.
+func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RaggedRange",
+		Type: "TPUReplicatedInput",
 		Input: []tf.Input{
-			starts, limits, deltas,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
+// Creates a dataset that caches elements from `input_dataset`.
 //
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will be returned when used.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			resource, indices, updates,
+			input_dataset, filename,
 		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters.
+type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingADAMParameters",
+		Input: []tf.Input{
+			parameters, momenta, velocities,
+		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
+
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
+//
+// Note that the size of 4D Tensors is defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x. A 1D Tensor for the gradient with respect to scale. A 1D Tensor for the gradient with respect to offset. Unused placeholder to match the mean input in FusedBatchNorm. Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNormGradV2",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Returns a tensor of zeros with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ZerosLike",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Divides sparse updates into the variable referenced by `resource`.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//     ref[indices, ...] /= updates[...]
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//     ref[indices[i], ...] /= updates[i, ...]
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
 //
 // Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// the same location, their contributions multiply.
 //
 // Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
@@ -12843,12 +17500,12 @@
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "ResourceScatterDiv",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -12856,29 +17513,757 @@
 	return scope.AddOperation(opspec)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\)
-//
-// where \\(\psi(x)\\) is the digamma function.
-// The polygamma function is defined only for non-negative integer orders \\a\\.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			a, x,
+			filename,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to multiply into `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
+
+// UnicodeEncodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce an InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the standard unicode replacement character,
+// U+FFFD (decimal 65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// Encode a tensor of ints into unicode strings.
+//
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
+//
+// ---
+//
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
+//
+// Arguments:
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
+//
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeEncode",
+		Input: []tf.Input{
+			input_values, input_splits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
+
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceMax",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "AccumulateNV2",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Defaults to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
+
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
+//
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TridiagonalSolveAttr is an optional argument to TridiagonalSolve.
+type TridiagonalSolveAttr func(optionalAttr)
+
+// TridiagonalSolvePartialPivoting sets the optional partial_pivoting attribute to value.
+//
+// value: Whether to apply partial pivoting. Partial pivoting makes the procedure more
+// stable, but slower.
+// If not specified, defaults to true
+func TridiagonalSolvePartialPivoting(value bool) TridiagonalSolveAttr {
+	return func(m optionalAttr) {
+		m["partial_pivoting"] = value
+	}
+}
+
+// Solves tridiagonal systems of equations.
+//
+//   Solves tridiagonal systems of equations.
+//   Supports batch dimensions and multiple right-hand sides per each left-hand
+//   side.
+//   On CPU, solution is computed via Gaussian elimination with or without partial
+//   pivoting, depending on `partial_pivoting` attribute. On GPU, Nvidia's cuSPARSE
+//   library is used: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv
+//
+// Arguments:
+//	diagonals: Tensor of shape `[..., 3, M]` whose innermost 2 dimensions represent the
+// tridiagonal matrices with three rows being the superdiagonal, diagonals, and
+// subdiagonals, in order. The last element of the superdiagonal and the first
+// element of the subdiagonal are ignored.
+//	rhs: Tensor of shape `[..., M, K]`, representing K right-hand sides per each
+// left-hand side.
+//
+// Returns Tensor of shape `[..., M, K]` containing the solutions
+func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output, optional ...TridiagonalSolveAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TridiagonalSolve",
+		Input: []tf.Input{
+			diagonals, rhs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeAndCropJpeg",
+		Input: []tf.Input{
+			contents, crop_window,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x + y element-wise, working on quantized buffers.
+//
+// Arguments:
+//
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents. The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedAdd",
+		Input: []tf.Input{
+			x, y, min_x, max_x, min_y, max_y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
+//
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters.
+type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingProximalAdagradParameters",
+		Input: []tf.Input{
+			parameters, accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
+
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Gradient of Unbatch.
+//
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
+//
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnbatchGrad",
+		Input: []tf.Input{
+			original_input, batch_index, grad, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, accumulators, updates, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
 type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
@@ -12937,106 +18322,148 @@
 	return scope.AddOperation(opspec)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
+		Type: "FusedResizeAndPadConv2D",
+		Input: []tf.Input{
+			input, size, paddings, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reverses specific dimensions of a tensor.
+//
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]],
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]],
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reverse",
+		Input: []tf.Input{
+			tensor, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
+//
+// Arguments:
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
+//
+// Returns a "fake" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "FakeParam",
 
 		Attrs: attrs,
 	}
@@ -13044,16 +18471,4399 @@
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise left-shift of `x` and `y`.
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// An identity op that triggers an error if a gradient is requested.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to look up the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PreventGradient",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 2D fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LeftShift",
+		Type: "FFT2D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softplus",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InvGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by backsubstitution.
+//
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `False` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then the innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Example:
+// ```python
+//
+// a = tf.constant([[3,  0,  0,  0],
+//                  [2,  1,  0,  0],
+//                  [1,  0,  1,  0],
+//                  [1,  1,  1,  1]], dtype=tf.float32)
+//
+// b = tf.constant([[4],
+//                  [2],
+//                  [4],
+//                  [2]], dtype=tf.float32)
+//
+// x = tf.linalg.triangular_solve(a, b, lower=True)
+// x
+// # <tf.Tensor: id=257, shape=(4, 1), dtype=float32, numpy=
+// # array([[ 1.3333334 ],
+// #        [-0.66666675],
+// #        [ 2.6666665 ],
+// #        [-1.3333331 ]], dtype=float32)>
+//
+// # in python3 one can use `a@x`
+// tf.matmul(a, x)
+// # <tf.Tensor: id=263, shape=(4, 1), dtype=float32, numpy=
+// # array([[4.       ],
+// #        [2.       ],
+// #        [4.       ],
+// #        [1.9999999]], dtype=float32)>
+// ```
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixTriangularSolve",
+		Input: []tf.Input{
+			matrix, rhs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//	momentum: Scale applied to the previous value of mom in the update rule above.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
+type OrderedMapSizeAttr func(optionalAttr)
+
+// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Pads a tensor.
+//
+// This operation pads `input` according to the `paddings` and `constant_values`
+// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many padding values to add before the contents of `input` in that dimension,
+// and `paddings[D, 1]` indicates how many padding values to add after the contents
+// of `input` in that dimension. `constant_values` is a scalar tensor of the same
+// type as `input` that indicates the value to use for padding `input`.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # 'constant_values' is 0
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "PadV2",
+		Input: []tf.Input{
+			input, paddings, constant_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+//
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
+//
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 3-D of shape `[batch_size, frames, channels]` or 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AudioSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT3D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch.
+type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op that enqueues a list of input batch tensors to TPUEmbedding.
+//
+// Arguments:
+//	batch: A list of 1D tensors, one for each embedding table, containing the
+// indices into the tables.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EnqueueTPUEmbeddingIntegerBatch",
+		Input: []tf.Input{
+			tf.OutputList(batch), mode_override,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Locks a mutex resource.  The output is the lock.
+//
+// So long as the lock tensor is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
+
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Computes the inverse of one or more square invertible matrices or their adjoints.
+//
+// An adjoint here means the conjugate transpose.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixInverse",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LeakyReluAttr is an optional argument to LeakyRelu.
+type LeakyReluAttr func(optionalAttr)
+
+// LeakyReluAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluAlpha(value float32) LeakyReluAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// Computes rectified linear: `max(features, features * alpha)`.
+func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LeakyRelu",
+		Input: []tf.Input{
+			features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a Tensor into a serialized TensorProto proto.
+//
+// Arguments:
+//	tensor: A Tensor of type `T`.
+//
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeTensor",
+		Input: []tf.Input{
+			tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts each string in the input Tensor to its hash modulo the number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//	string_tensor: The input Tensor of strings to hash.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucket",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a tensor filled with a scalar value.
+//
+// This operation creates a tensor of shape `dims` and fills it with `value`.
+//
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
+//
+// `tf.fill` differs from `tf.constant` in a few ways:
+//
+// *   `tf.fill` only supports scalar contents, whereas `tf.constant` supports
+//     Tensor values.
+// *   `tf.fill` creates an Op in the computation graph that constructs the actual
+//     Tensor value at runtime. This is in contrast to `tf.constant` which embeds
+//     the entire Tensor into the graph with a `Const` node.
+// *   Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes
+//     based on other runtime Tensors, unlike `tf.constant`.
+//
+// Arguments:
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fill",
+		Input: []tf.Input{
+			dims, value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors is defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data. A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean. A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance. A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation. A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNorm",
+		Input: []tf.Input{
+			x, scale, offset, mean, variance,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
+
+// RandomUniformIntSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "RandomUniformInt",
+		Input: []tf.Input{shape, minval, maxval},
+		Attrs: settings,
+	})
+	output = node.Output(0)
+	return
+}
+
+// BoostedTreesCreateQuantileStreamResourceAttr is an optional argument to BoostedTreesCreateQuantileStreamResource: a function that receives the op's optionalAttr map.
+type BoostedTreesCreateQuantileStreamResourceAttr func(optionalAttr)
+
+// BoostedTreesCreateQuantileStreamResourceMaxElements sets the optional max_elements attribute to value.
+//
+// value: int; The maximum number of data points that can be fed to the stream.
+// If not specified, defaults to 1099511627776
+func BoostedTreesCreateQuantileStreamResourceMaxElements(value int64) BoostedTreesCreateQuantileStreamResourceAttr {
+	// Build the setter that stores value under the "max_elements" attribute key.
+	set := func(attrs optionalAttr) { attrs["max_elements"] = value }
+	return set
+}
+
+// Create the Resource for Quantile Streams.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource; Handle to quantile stream resource.
+//	epsilon: float; The required approximation error of the stream resource.
+//	num_streams: int; The number of streams managed by the resource that shares the same epsilon.
+//
+// Returns the created operation.
+func BoostedTreesCreateQuantileStreamResource(scope *Scope, quantile_stream_resource_handle tf.Output, epsilon tf.Output, num_streams tf.Output, optional ...BoostedTreesCreateQuantileStreamResourceAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "BoostedTreesCreateQuantileStreamResource",
+		Input: []tf.Input{quantile_stream_resource_handle, epsilon, num_streams},
+		Attrs: settings,
+	})
+	return
+}
+
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth: a function that receives the op's optionalAttr map.
+type HistogramFixedWidthAttr func(optionalAttr)
+
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+	// Build the setter that stores value under the "dtype" attribute key.
+	set := func(attrs optionalAttr) { attrs["dtype"] = value }
+	return set
+}
+
+// Return histogram of values.
+//
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
+//
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+//
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
+//
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "HistogramFixedWidth",
+		Input: []tf.Input{values, value_range, nbins},
+		Attrs: settings,
+	})
+	out = node.Output(0)
+	return
+}
+
+// Slice a `SparseTensor` based on the `start` and `size`.
+//
+// For example, if the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors. A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	// Bail out with zero-value results when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; only its type and inputs.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SparseSlice",
+		Input: []tf.Input{indices, values, shape, start, size},
+	})
+	// Outputs 0, 1 and 2 map to output_indices, output_values and output_shape.
+	return node.Output(0), node.Output(1), node.Output(2)
+}
+
+// Returns the item in the list with the given index.
+//
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
+//
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// element_dtype is passed as the op's only attribute.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "TensorListGetItem",
+		Input: []tf.Input{input_handle, index, element_shape},
+		Attrs: map[string]interface{}{"element_dtype": element_dtype},
+	})
+	// Output 0 of the new operation is returned as item.
+	item = node.Output(0)
+	return
+}
+
+// CompilationResultProto indicating the status of the TPU compilation.
+func TPUCompilationResult(scope *Scope) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// The spec carries a type only: no inputs and no attributes.
+	node := scope.AddOperation(tf.OpSpec{Type: "TPUCompilationResult"})
+	output = node.Output(0)
+	return
+}
+
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_prod(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 6, 6, 4],
+// #      [5, 6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; only its type and inputs.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SegmentProd",
+		Input: []tf.Input{data, segment_ids},
+	})
+	output = node.Output(0)
+	return
+}
+
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad: a function that receives the op's optionalAttr map.
+type ResourceApplyAdagradAttr func(optionalAttr)
+
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+	// Build the setter that stores value under the "use_locking" attribute key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	// Build the setter that stores value under the "update_slots" attribute key.
+	set := func(attrs optionalAttr) { attrs["update_slots"] = value }
+	return set
+}
+
+// Update '*var' according to the adagrad scheme.
+//
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "ResourceApplyAdagrad",
+		Input: []tf.Input{var_, accum, lr, grad},
+		Attrs: settings,
+	})
+	return
+}
+
+// DecodeCSVAttr is an optional argument to DecodeCSV: a function that receives the op's optionalAttr map.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	// Build the setter that stores value under the "field_delim" attribute key.
+	set := func(attrs optionalAttr) { attrs["field_delim"] = value }
+	return set
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	// Build the setter that stores value under the "use_quote_delim" attribute key.
+	set := func(attrs optionalAttr) { attrs["use_quote_delim"] = value }
+	return set
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
+//
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	// Build the setter that stores value under the "na_value" attribute key.
+	set := func(attrs optionalAttr) { attrs["na_value"] = value }
+	return set
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	// Build the setter that stores value under the "select_cols" attribute key.
+	set := func(attrs optionalAttr) { attrs["select_cols"] = value }
+	return set
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	// Bail out with a nil slice when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "DecodeCSV",
+		Input: []tf.Input{records, tf.OutputList(record_defaults)},
+		Attrs: settings,
+	})
+	// Re-check the scope after AddOperation before reading any outputs.
+	if scope.Err() != nil {
+		return
+	}
+	// Collect the list-valued "output" via makeOutputList, passing 0 as its index argument.
+	var listErr error
+	output, _, listErr = makeOutputList(node, 0, "output")
+	if listErr != nil {
+		scope.UpdateErr("DecodeCSV", listErr)
+	}
+	return
+}
+
+// Enqueue multiple Tensor values on the computation outfeed.
+//
+// Arguments:
+//	inputs: A list of tensors that will be inserted into the outfeed queue as an
+// XLA tuple.
+//
+// Returns the created operation.
+func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// The whole inputs slice is wrapped as a single tf.OutputList input.
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "OutfeedEnqueueTuple",
+		Input: []tf.Input{tf.OutputList(inputs)},
+	})
+	return
+}
+
+// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters: a function that receives the op's optionalAttr map.
+type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	// Build the setter that stores value under the "table_id" attribute key.
+	set := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return set
+}
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	// Build the setter that stores value under the "table_name" attribute key.
+	set := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return set
+}
+
+// Load centered RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the centered RMSProp optimization algorithm.
+//	ms: Value of ms used in the centered RMSProp optimization algorithm.
+//	mom: Value of mom used in the centered RMSProp optimization algorithm.
+//	mg: Value of mg used in the centered RMSProp optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// num_shards and shard_id are set first; optional setters then run on the same map.
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "LoadTPUEmbeddingCenteredRMSPropParameters",
+		Input: []tf.Input{parameters, ms, mom, mg},
+		Attrs: settings,
+	})
+	return
+}
+
+// Component-wise multiplies a SparseTensor by a dense Tensor.
+//
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; only its type and inputs.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SparseDenseCwiseMul",
+		Input: []tf.Input{sp_indices, sp_values, sp_shape, dense},
+	})
+	output = node.Output(0)
+	return
+}
+
+// Computes the minimum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_min(c, tf.constant([0, 0, 1]))
+// # ==> [[1, 2, 2, 1],
+// #      [5, 6, 7, 8]]
+// ```
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; only its type and inputs.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "SegmentMin",
+		Input: []tf.Input{data, segment_ids},
+	})
+	output = node.Output(0)
+	return
+}
+
+// Convert one or more images from HSV to RGB.
+//
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; images is its single input.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "HSVToRGB",
+		Input: []tf.Input{images},
+	})
+	output = node.Output(0)
+	return
+}
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug: a function that receives the op's optionalAttr map.
+type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	// Build the setter that stores value under the "table_id" attribute key.
+	set := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return set
+}
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	// Build the setter that stores value under the "table_name" attribute key.
+	set := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return set
+}
+
+// Retrieve FTRL embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the FTRL optimization algorithm. Parameter accumulators updated by the FTRL optimization algorithm. Parameter linears updated by the FTRL optimization algorithm. Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) {
+	// Bail out with zero-value results when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	settings := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	// No Input list is given for this op; only Type and Attrs are populated.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug",
+		Attrs: settings,
+	})
+	return node.Output(0), node.Output(1), node.Output(2), node.Output(3)
+}
+
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent: a function that receives the op's optionalAttr map.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	// Build the setter that stores value under the "use_locking" attribute key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "ResourceSparseApplyProximalGradientDescent",
+		Input: []tf.Input{var_, alpha, l1, l2, grad, indices},
+		Attrs: settings,
+	})
+	return
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  3, 3, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// No attributes are set for this op; only its type and inputs.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "UnsortedSegmentMax",
+		Input: []tf.Input{data, segment_ids, num_segments},
+	})
+	output = node.Output(0)
+	return
+}
+
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate: a function that receives the op's optionalAttr map.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	// Build the setter that stores value under the "use_locking" attribute key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// Applies sparse `updates` to individual values or slices within a given
+// variable according to `indices`.
+//
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements in a rank-1 tensor with
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to store in ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "ResourceScatterNdUpdate",
+		Input: []tf.Input{ref, indices, updates},
+		Attrs: settings,
+	})
+	return
+}
+
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+//
+// Arguments:
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
+//
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// data_input_datasets is wrapped as one tf.OutputList input after the selector.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "ExperimentalDirectedInterleaveDataset",
+		Input: []tf.Input{selector_input_dataset, tf.OutputList(data_input_datasets)},
+		Attrs: map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes},
+	})
+	// Output 0 of the new operation is returned as handle.
+	handle = node.Output(0)
+	return
+}
+
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad: a function that receives the op's optionalAttr map.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	// Build the setter that stores value under the "use_locking" attribute key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+	// Bail out with a nil operation when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	// Run every optional setter against a fresh attribute map.
+	settings := map[string]interface{}{}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	o = scope.AddOperation(tf.OpSpec{
+		Type:  "ResourceSparseApplyProximalAdagrad",
+		Input: []tf.Input{var_, accum, lr, l1, l2, grad, indices},
+		Attrs: settings,
+	})
+	return
+}
+
+// VarHandleOpAttr is an optional argument to VarHandleOp: a function that receives the op's optionalAttr map.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
+//
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	// Build the setter that stores value under the "container" attribute key.
+	set := func(attrs optionalAttr) { attrs["container"] = value }
+	return set
+}
+
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
+//
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	// Build the setter that stores value under the "shared_name" attribute key.
+	set := func(attrs optionalAttr) { attrs["shared_name"] = value }
+	return set
+}
+
+// Creates a handle to a Variable resource.
+//
+// Arguments:
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+	// Bail out with the zero-value result when scope.Err() is non-nil.
+	if scope.Err() != nil {
+		return
+	}
+	settings := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, apply := range optional {
+		apply(settings)
+	}
+	// No Input list is given for this op; only Type and Attrs are populated.
+	node := scope.AddOperation(tf.OpSpec{
+		Type:  "VarHandleOp",
+		Attrs: settings,
+	})
+	return node.Output(0)
+}
+
+// Adds sparse `updates` to an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(updated))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AssignAddVariableOp adds a value to the current value of a variable.
+//
+// Any ReadVariableOp that has a control dependency on this op is guaranteed
+// to observe the incremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "AssignAddVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(spec)
+}
+
+// ResourceApplyProximalAdagradAttr configures an optional attribute of ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceApplyProximalAdagradUseLocking returns an option that sets the use_locking attribute.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+	return func(attrs optionalAttr) {
+		attrs["use_locking"] = value
+	}
+}
+
+// ResourceApplyProximalAdagrad updates '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+//
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "ResourceApplyProximalAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, l1, l2, grad,
+		},
+		Attrs: opAttrs,
+	}
+	return scope.AddOperation(spec)
+}
+
+// MutableHashTableOfTensorsV2Attr configures an optional attribute of MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container returns an option that sets the container attribute.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(attrs optionalAttr) {
+		attrs["container"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2SharedName returns an option that sets the shared_name attribute.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(attrs optionalAttr) {
+		attrs["shared_name"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2UseNodeNameSharing returns an option that sets the use_node_name_sharing attribute.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(attrs optionalAttr) {
+		attrs["use_node_name_sharing"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2ValueShape returns an option that sets the value_shape attribute.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(attrs optionalAttr) {
+		attrs["value_shape"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2 creates an empty hash table.
+//
+// The op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "MutableHashTableOfTensorsV2",
+
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// BatchNormWithGlobalNormalization performs batch normalization.
+//
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated; `tf.nn.batch_normalization` is the preferred replacement.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulting tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	spec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalization",
+		Input: []tf.Input{
+			t, m, v, beta, gamma,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// ProdAttr configures an optional attribute of Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims returns an option that sets the keep_dims attribute.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(attrs optionalAttr) {
+		attrs["keep_dims"] = value
+	}
+}
+
+// Prod computes the product of elements across dimensions of a tensor.
+//
+// It reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "Prod",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr configures an optional attribute of RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId returns an option that sets the table_id attribute.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(attrs optionalAttr) {
+		attrs["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName returns an option that sets the table_name attribute.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(attrs optionalAttr) {
+		attrs["table_name"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebug retrieves Adagrad embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns the parameters, the accumulators, and the gradient_accumulators, each updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug",
+
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0), node.Output(1), node.Output(2)
+}
+
+// RandomShuffleAttr configures an optional attribute of RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed returns an option that sets the seed attribute.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(attrs optionalAttr) {
+		attrs["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 returns an option that sets the seed2 attribute.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(attrs optionalAttr) {
+		attrs["seed2"] = value
+	}
+}
+
+// RandomShuffle randomly shuffles a tensor along its first dimension.
+//
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, one mapping that might occur
+//   for a 3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
+//
+// Arguments:
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "RandomShuffle",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// MultiDeviceIterator creates a MultiDeviceIterator resource.
+//
+// Arguments:
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	spec := tf.OpSpec{
+		Type: "MultiDeviceIterator",
+
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// CompareAndBitpack compares values of `input` to `threshold` and packs the resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_bitpack(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+//
+// Arguments:
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
+//
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "CompareAndBitpack",
+		Input: []tf.Input{
+			input, threshold,
+		},
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// UpperBoundAttr configures an optional attribute of UpperBound.
+type UpperBoundAttr func(optionalAttr)
+
+// UpperBoundOutType returns an option that sets the out_type attribute.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+	return func(attrs optionalAttr) {
+		attrs["out_type"] = value
+	}
+}
+
+// UpperBound applies upper_bound(sorted_inputs, values) along each row.
+//
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
+//
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
+//
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
+//
+//   result = UpperBound(sorted_sequence, values)
+//
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
+//
+// Arguments:
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_inputs`. Contains
+// the values that will be searched for in `sorted_inputs`.
+//
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "UpperBound",
+		Input: []tf.Input{
+			sorted_inputs, values,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// QuantizedConv2DPerChannelAttr configures an optional attribute of QuantizedConv2DPerChannel.
+type QuantizedConv2DPerChannelAttr func(optionalAttr)
+
+// QuantizedConv2DPerChannelOutType returns an option that sets the out_type attribute.
+//
+// value: The quantized type of output tensor that needs to be converted.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChannelAttr {
+	return func(attrs optionalAttr) {
+		attrs["out_type"] = value
+	}
+}
+
+// QuantizedConv2DPerChannelDilations returns an option that sets the dilations attribute.
+//
+// value: list of dilation values.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr {
+	return func(attrs optionalAttr) {
+		attrs["dilations"] = value
+	}
+}
+
+// QuantizedConv2DPerChannel computes QuantizedConv2D per channel.
+//
+// Arguments:
+//	input: The original input tensor.
+//	filter: The original filter tensor.
+//	min_input: The minimum value of the input tensor.
+//	max_input: The maximum value of the input tensor.
+//	min_filter: The minimum value of the filter tensor.
+//	max_filter: The maximum value of the filter tensor.
+//	strides: list of stride values.
+//
+//
+// Returns the output tensor, the minimum value of the final output tensor, and the maximum value of the final output tensor.
+func QuantizedConv2DPerChannel(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DPerChannelAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "QuantizedConv2DPerChannel",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0), node.Output(1), node.Output(2)
+}
+
+// InfeedEnqueueAttr configures an optional attribute of InfeedEnqueue.
+type InfeedEnqueueAttr func(optionalAttr)
+
+// InfeedEnqueueShape returns an option that sets the shape attribute.
+//
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr {
+	return func(attrs optionalAttr) {
+		attrs["shape"] = value
+	}
+}
+
+// InfeedEnqueueLayout returns an option that sets the layout attribute.
+//
+// value: A vector holding the requested layout in minor-to-major sequence.
+// If a layout attribute is passed, but its values are all -1, the layout will
+// be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr {
+	return func(attrs optionalAttr) {
+		attrs["layout"] = value
+	}
+}
+
+// InfeedEnqueueDeviceOrdinal returns an option that sets the device_ordinal attribute.
+//
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr {
+	return func(attrs optionalAttr) {
+		attrs["device_ordinal"] = value
+	}
+}
+
+// InfeedEnqueue is an op which feeds a single Tensor value into the computation.
+//
+// Arguments:
+//	input: A tensor that will be provided using the infeed mechanism.
+//
+// Returns the created operation.
+func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "InfeedEnqueue",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: opAttrs,
+	}
+	return scope.AddOperation(spec)
+}
+
+// SqueezeAttr configures an optional attribute of Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis returns an option that sets the axis attribute (stored under the `squeeze_dims` key).
+//
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(attrs optionalAttr) {
+		attrs["squeeze_dims"] = value
+	}
+}
+
+// Squeeze removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
+//
+// Arguments:
+//	input: The `input` to squeeze.
+//
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "Squeeze",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// BesselI1e computes the Bessel i1e function of `x` element-wise.
+//
+// Exponentially scaled modified Bessel function of order 1 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+//
+// This function is faster and more numerically stable than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "BesselI1e",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// StatelessTruncatedNormalAttr configures an optional attribute of StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype returns an option that sets the dtype attribute.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+	return func(attrs optionalAttr) {
+		attrs["dtype"] = value
+	}
+}
+
+// StatelessTruncatedNormal outputs deterministic pseudorandom values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "StatelessTruncatedNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// LoadTPUEmbeddingAdagradParametersAttr configures an optional attribute of LoadTPUEmbeddingAdagradParameters.
+type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersTableId returns an option that sets the table_id attribute.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr {
+	return func(attrs optionalAttr) {
+		attrs["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdagradParametersTableName returns an option that sets the table_name attribute.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr {
+	return func(attrs optionalAttr) {
+		attrs["table_name"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdagradParameters loads Adagrad embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingAdagradParameters",
+		Input: []tf.Input{
+			parameters, accumulators,
+		},
+		Attrs: opAttrs,
+	}
+	return scope.AddOperation(spec)
+}
+
+// TruncatedNormalAttr configures an optional attribute of TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed returns an option that sets the seed attribute.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(attrs optionalAttr) {
+		attrs["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 returns an option that sets the seed2 attribute.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(attrs optionalAttr) {
+		attrs["seed2"] = value
+	}
+}
+
+// TruncatedNormal outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := map[string]interface{}{"dtype": dtype}
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// StopGradient stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph its inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	spec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// SerializeSparseAttr configures an optional attribute of SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType returns an option that sets the out_type attribute.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(attrs optionalAttr) {
+		attrs["out_type"] = value
+	}
+}
+
+// SerializeSparse serializes a `SparseTensor` into a `[3]` `Tensor` object.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	opAttrs := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opAttrs)
+	}
+	spec := tf.OpSpec{
+		Type: "SerializeSparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: opAttrs,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// ResourceScatterNdAddAttr configures an optional attribute of ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
+
+// ResourceScatterNdAddUseLocking returns an option that sets the use_locking attribute.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+	return func(attrs optionalAttr) {
+		attrs["use_locking"] = value
+	}
+}
+
+// ResourceScatterNdAdd applies sparse addition to individual values or slices in a Variable.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be an integer tensor containing indices into `ref`.
+// It must have shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is a `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor
+// with 8 elements. In Python, that addition would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+	// A scope that has already failed yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "ResourceScatterNdAdd",
+		Input: []tf.Input{ref, indices, updates},
+		Attrs: opts,
+	}
+	return scope.AddOperation(spec)
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is the option type accepted by LoadTPUEmbeddingStochasticGradientDescentParameters.
+type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(attrs optionalAttr)
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableId returns an option that sets the table_id attribute.
+// When this option is not supplied, table_id defaults to -1.
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	// The returned closure stores value under the table_id key.
+	set := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return set
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableName returns an option that sets the table_name attribute.
+// When this option is not supplied, table_name defaults to "".
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	// The returned closure stores value under the table_name key.
+	set := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return set
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParameters loads SGD embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the stochastic gradient descent optimization algorithm.
+//	num_shards: Forwarded as the op's num_shards attribute (no description is provided for it).
+//	shard_id: Forwarded as the op's shard_id attribute (no description is provided for it).
+//
+// Returns the created operation.
+func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) {
+	// A scope that has already failed yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	// Required attributes are seeded first; options are applied on top.
+	opts := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "LoadTPUEmbeddingStochasticGradientDescentParameters",
+		Input: []tf.Input{parameters},
+		Attrs: opts,
+	}
+	return scope.AddOperation(spec)
+}
+
+// CrossReplicaSum is an Op to sum inputs across replicated TPU instances.
+//
+// Each instance supplies its own input.
+//
+// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+// and `B, D, F, H` as group 1. Thus we get the outputs:
+// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
+//
+// Arguments:
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//
+// Returns the sum of all the distributed inputs.
+func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	spec := tf.OpSpec{
+		Type:  "CrossReplicaSum",
+		Input: []tf.Input{input, group_assignment},
+	}
+	// Register the node and hand back its first output.
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// ResourceApplyGradientDescentAttr is the option type accepted by ResourceApplyGradientDescent's variadic parameter.
+type ResourceApplyGradientDescentAttr func(attrs optionalAttr)
+
+// ResourceApplyGradientDescentUseLocking returns an option that sets the use_locking attribute.
+//
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// When this option is not supplied, use_locking defaults to false.
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	// The returned closure stores value under the use_locking key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// ResourceApplyGradientDescent updates '*var' by subtracting 'alpha' * 'delta' from it.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+	// A scope that has already failed yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "ResourceApplyGradientDescent",
+		Input: []tf.Input{var_, alpha, delta},
+		Attrs: opts,
+	}
+	return scope.AddOperation(spec)
+}
+
+// TPUEmbeddingActivations is an op enabling differentiation of TPU Embeddings.
+//
+// This op simply returns its first input, which is assumed to have been sliced
+// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+// this op, and its first argument being a trainable Variable, enables automatic
+// differentiation of graphs containing embeddings via the TPU Embedding Python
+// libraries.
+//
+// Arguments:
+//	embedding_variable: A trainable variable, enabling optimizers to find this op.
+//	sliced_activations: The embedding activations Tensor to return.
+//	table_id: The id of the table in the embedding layer configuration from which
+// these activations were computed.
+//	lookup_id: Identifier of the set of embedding indices which produced these
+// activations.
+func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) {
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	spec := tf.OpSpec{
+		Type:  "TPUEmbeddingActivations",
+		Input: []tf.Input{embedding_variable, sliced_activations},
+		Attrs: map[string]interface{}{
+			"table_id":  table_id,
+			"lookup_id": lookup_id,
+		},
+	}
+	output = scope.AddOperation(spec).Output(0)
+	return output
+}
+
+// ResourceApplyFtrlV2Attr is the option type accepted by ResourceApplyFtrlV2's variadic parameter.
+type ResourceApplyFtrlV2Attr func(attrs optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking returns an option that sets the use_locking attribute.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// When this option is not supplied, use_locking defaults to false.
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	// The returned closure stores value under the use_locking key.
+	set := func(attrs optionalAttr) { attrs["use_locking"] = value }
+	return set
+}
+
+// ResourceApplyFtrlV2 updates '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regularization. Must be a scalar.
+//	l2_shrinkage: Factor in the grad_with_shrinkage term above. NOTE(review): presumably a scalar; confirm.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	// A scope that has already failed yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "ResourceApplyFtrlV2",
+		Input: []tf.Input{var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power},
+		Attrs: opts,
+	}
+	return scope.AddOperation(spec)
+}
+
+// SkipgramAttr is the option type accepted by Skipgram's variadic parameter.
+type SkipgramAttr func(attrs optionalAttr)
+
+// SkipgramWindowSize returns an option that sets the window_size attribute.
+//
+// value: The number of words to predict to the left and right of the target.
+// When this option is not supplied, window_size defaults to 5.
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	// The returned closure stores value under the window_size key.
+	set := func(attrs optionalAttr) { attrs["window_size"] = value }
+	return set
+}
+
+// SkipgramMinCount returns an option that sets the min_count attribute.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// When this option is not supplied, min_count defaults to 5.
+func SkipgramMinCount(value int64) SkipgramAttr {
+	// The returned closure stores value under the min_count key.
+	set := func(attrs optionalAttr) { attrs["min_count"] = value }
+	return set
+}
+
+// SkipgramSubsample returns an option that sets the subsample attribute.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// When this option is not supplied, subsample defaults to 0.001.
+func SkipgramSubsample(value float32) SkipgramAttr {
+	// The returned closure stores value under the subsample key.
+	set := func(attrs optionalAttr) { attrs["subsample"] = value }
+	return set
+}
+
+// Skipgram parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns, in order: a vector of words in the corpus; frequencies of words, sorted in the non-ascending order; number of words per epoch in the data file; the current epoch number; the total number of words processed so far; a vector of word ids; a vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Required attributes are seeded first; options are applied on top.
+	opts := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for i := range optional {
+		optional[i](opts)
+	}
+	// The op spec carries no tensor inputs, only attributes.
+	node := scope.AddOperation(tf.OpSpec{Type: "Skipgram", Attrs: opts})
+	vocab_word, vocab_freq, words_per_epoch = node.Output(0), node.Output(1), node.Output(2)
+	current_epoch, total_words_processed = node.Output(3), node.Output(4)
+	examples, labels = node.Output(5), node.Output(6)
+	return
+}
+
+// SampleDistortedBoundingBoxV2Attr is the option type accepted by SampleDistortedBoundingBoxV2's variadic parameter.
+type SampleDistortedBoundingBoxV2Attr func(attrs optionalAttr)
+
+// SampleDistortedBoundingBoxV2Seed returns an option that sets the seed attribute.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// When this option is not supplied, seed defaults to 0.
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the seed key.
+	set := func(attrs optionalAttr) { attrs["seed"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2Seed2 returns an option that sets the seed2 attribute.
+//
+// value: A second seed to avoid seed collision.
+// When this option is not supplied, seed2 defaults to 0.
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the seed2 key.
+	set := func(attrs optionalAttr) { attrs["seed2"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2AspectRatioRange returns an option that sets the aspect_ratio_range attribute.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// When this option is not supplied, aspect_ratio_range defaults to <f:0.75 f:1.33 >.
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the aspect_ratio_range key.
+	set := func(attrs optionalAttr) { attrs["aspect_ratio_range"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2AreaRange returns an option that sets the area_range attribute.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// When this option is not supplied, area_range defaults to <f:0.05 f:1 >.
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the area_range key.
+	set := func(attrs optionalAttr) { attrs["area_range"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2MaxAttempts returns an option that sets the max_attempts attribute.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// When this option is not supplied, max_attempts defaults to 100.
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the max_attempts key.
+	set := func(attrs optionalAttr) { attrs["max_attempts"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes returns an option that sets the use_image_if_no_bounding_boxes attribute.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// When this option is not supplied, use_image_if_no_bounding_boxes defaults to false.
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	// The returned closure stores value under the use_image_if_no_bounding_boxes key.
+	set := func(attrs optionalAttr) { attrs["use_image_if_no_bounding_boxes"] = value }
+	return set
+}
+
+// SampleDistortedBoundingBoxV2 generates a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+//
+// Returns:
+//	begin: 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to `tf.slice`.
+//	size: 1-D, containing `[target_height, target_width, -1]`. Provide as input to `tf.slice`.
+//	bboxes: 3-D with shape `[1, 1, 4]` containing the distorted bounding box. Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	// A scope that has already failed yields zero-value outputs.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "SampleDistortedBoundingBoxV2",
+		Input: []tf.Input{image_size, bounding_boxes, min_object_covered},
+		Attrs: opts,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0), node.Output(1), node.Output(2)
+}
+
+// Conv3DBackpropFilterV2Attr is the option type accepted by Conv3DBackpropFilterV2's variadic parameter.
+type Conv3DBackpropFilterV2Attr func(attrs optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat returns an option that sets the data_format attribute.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", where the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// When this option is not supplied, data_format defaults to "NDHWC".
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	// The returned closure stores value under the data_format key.
+	set := func(attrs optionalAttr) { attrs["data_format"] = value }
+	return set
+}
+
+// Conv3DBackpropFilterV2Dilations returns an option that sets the dilations attribute.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// When this option is not supplied, dilations defaults to <i:1 i:1 i:1 i:1 i:1 >.
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	// The returned closure stores value under the dilations key.
+	set := func(attrs optionalAttr) { attrs["dilations"] = value }
+	return set
+}
+
+// Conv3DBackpropFilterV2 computes the gradients of 3-D convolution with respect to the filter.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	// Required attributes are seeded first; options are applied on top.
+	opts := map[string]interface{}{"strides": strides, "padding": padding}
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "Conv3DBackpropFilterV2",
+		Input: []tf.Input{input, filter_sizes, out_backprop},
+		Attrs: opts,
+	}
+	output = scope.AddOperation(spec).Output(0)
+	return output
+}
+
+// Igamma computes the lower regularized incomplete Gamma function `P(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized incomplete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return z
+	}
+	spec := tf.OpSpec{
+		Type:  "Igamma",
+		Input: []tf.Input{a, x},
+	}
+	// Register the node and hand back its first output.
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// Concat concatenates tensors along one dimension.
+//
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//
+// Returns a `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	spec := tf.OpSpec{
+		Type:  "Concat",
+		Input: []tf.Input{concat_dim, tf.OutputList(values)},
+	}
+	// Register the node and hand back its first output.
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is the option type accepted by RetrieveTPUEmbeddingADAMParametersGradAccumDebug.
+type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(attrs optionalAttr)
+
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId returns an option that sets the table_id attribute.
+// When this option is not supplied, table_id defaults to -1.
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	// The returned closure stores value under the table_id key.
+	set := func(attrs optionalAttr) { attrs["table_id"] = value }
+	return set
+}
+
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName returns an option that sets the table_name attribute.
+// When this option is not supplied, table_name defaults to "".
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	// The returned closure stores value under the table_name key.
+	set := func(attrs optionalAttr) { attrs["table_name"] = value }
+	return set
+}
+
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebug retrieves ADAM embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns, in order: parameter parameters updated by the ADAM optimization algorithm; parameter momenta updated by the ADAM optimization algorithm; parameter velocities updated by the ADAM optimization algorithm; parameter gradient_accumulators updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) {
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Required attributes are seeded first; options are applied on top.
+	opts := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for i := range optional {
+		optional[i](opts)
+	}
+	// The op spec carries no tensor inputs, only attributes.
+	spec := tf.OpSpec{Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug", Attrs: opts}
+	node := scope.AddOperation(spec)
+	parameters, momenta = node.Output(0), node.Output(1)
+	velocities, gradient_accumulators = node.Output(2), node.Output(3)
+	return
+}
+
+// BoostedTreesQuantileStreamResourceFlushAttr is the option type accepted by BoostedTreesQuantileStreamResourceFlush.
+type BoostedTreesQuantileStreamResourceFlushAttr func(attrs optionalAttr)
+
+// BoostedTreesQuantileStreamResourceFlushGenerateQuantiles returns an option that sets the generate_quantiles attribute.
+//
+// value: bool; If True, the output will be the num_quantiles for each stream where the ith
+// entry is the ith quantile of the input with an approximation error of epsilon.
+// Duplicate values may be present.
+// If False, the output will be the points in the histogram that we got which roughly
+// translates to 1/epsilon boundaries and without any duplicates.
+// Defaults to False.
+// When this option is not supplied, generate_quantiles defaults to false.
+func BoostedTreesQuantileStreamResourceFlushGenerateQuantiles(value bool) BoostedTreesQuantileStreamResourceFlushAttr {
+	// The returned closure stores value under the generate_quantiles key.
+	set := func(attrs optionalAttr) { attrs["generate_quantiles"] = value }
+	return set
+}
+
+// BoostedTreesQuantileStreamResourceFlush flushes the summaries for a quantile stream resource.
+//
+// An op that flushes the summaries for a quantile stream resource.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	num_buckets: int; approximate number of buckets unless using generate_quantiles.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resource_handle tf.Output, num_buckets tf.Output, optional ...BoostedTreesQuantileStreamResourceFlushAttr) (o *tf.Operation) {
+	// A scope that has already failed yields a nil operation.
+	if err := scope.Err(); err != nil {
+		return o
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "BoostedTreesQuantileStreamResourceFlush",
+		Input: []tf.Input{quantile_stream_resource_handle, num_buckets},
+		Attrs: opts,
+	}
+	return scope.AddOperation(spec)
+}
+
+// MaxPoolGradGradAttr is the option type accepted by MaxPoolGradGrad's variadic parameter.
+type MaxPoolGradGradAttr func(attrs optionalAttr)
+
+// MaxPoolGradGradDataFormat returns an option that sets the data_format attribute.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", where the data storage order is:
+//     [batch, in_channels, in_height, in_width].
+// When this option is not supplied, data_format defaults to "NHWC".
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+	// The returned closure stores value under the data_format key.
+	set := func(attrs optionalAttr) { attrs["data_format"] = value }
+	return set
+}
+
+// MaxPoolGradGrad computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	// Required attributes are seeded first; options are applied on top.
+	opts := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "MaxPoolGradGrad",
+		Input: []tf.Input{orig_input, orig_output, grad},
+		Attrs: opts,
+	}
+	output = scope.AddOperation(spec).Output(0)
+	return output
+}
+
+// MatchingFiles returns the set of files matching one or more glob patterns.
+//
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns a vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return filenames
+	}
+	spec := tf.OpSpec{
+		Type:  "MatchingFiles",
+		Input: []tf.Input{pattern},
+	}
+	// Register the node and hand back its first output.
+	node := scope.AddOperation(spec)
+	return node.Output(0)
+}
+
+// StatelessRandomUniformAttr is the option type accepted by StatelessRandomUniform's variadic parameter.
+type StatelessRandomUniformAttr func(attrs optionalAttr)
+
+// StatelessRandomUniformDtype returns an option that sets the dtype attribute.
+//
+// value: The type of the output.
+// When this option is not supplied, dtype defaults to DT_FLOAT.
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	// The returned closure stores value under the dtype key.
+	set := func(attrs optionalAttr) { attrs["dtype"] = value }
+	return set
+}
+
+// StatelessRandomUniform outputs deterministic pseudorandom values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+	// A scope that has already failed yields the zero-value output.
+	if err := scope.Err(); err != nil {
+		return output
+	}
+	// Let each supplied option record its attribute.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "StatelessRandomUniform",
+		Input: []tf.Input{shape, seed},
+		Attrs: opts,
+	}
+	output = scope.AddOperation(spec).Output(0)
+	return output
+}
+
+// QuantizedMatMulAttr is the option type accepted by QuantizedMatMul's variadic parameter.
+type QuantizedMatMulAttr func(attrs optionalAttr)
+
+// QuantizedMatMulToutput returns an option that sets the Toutput attribute.
+// When this option is not supplied, Toutput defaults to DT_QINT32.
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	// The returned closure stores value under the Toutput key.
+	set := func(attrs optionalAttr) { attrs["Toutput"] = value }
+	return set
+}
+
+// QuantizedMatMulTransposeA returns an option that sets the transpose_a attribute.
+//
+// value: If true, `a` is transposed before multiplication.
+// When this option is not supplied, transpose_a defaults to false.
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	// The returned closure stores value under the transpose_a key.
+	set := func(attrs optionalAttr) { attrs["transpose_a"] = value }
+	return set
+}
+
+// QuantizedMatMulTransposeB returns an option that sets the transpose_b attribute.
+//
+// value: If true, `b` is transposed before multiplication.
+// When this option is not supplied, transpose_b defaults to false.
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	// The returned closure stores value under the transpose_b key.
+	set := func(attrs optionalAttr) { attrs["transpose_b"] = value }
+	return set
+}
+
+// QuantizedMatMulTactivation returns an option that sets the Tactivation attribute.
+//
+// value: The type of output produced by activation function
+// following this operation.
+// When this option is not supplied, Tactivation defaults to DT_QUINT8.
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	// The returned closure stores value under the Tactivation key.
+	set := func(attrs optionalAttr) { attrs["Tactivation"] = value }
+	return set
+}
+
+// QuantizedMatMul performs a quantized matrix multiplication of `a` by the matrix `b`.
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transpose_b` is
+// non-zero).
+//
+// Arguments:
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
+//
+// Returns out (no description is provided for it); min_out, the float value that the lowest quantized output value represents; and max_out, the float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+	// A scope that has already failed yields zero-value outputs.
+	if err := scope.Err(); err != nil {
+		return
+	}
+	// Indexing the options keeps the loop from shadowing the input named a.
+	opts := make(map[string]interface{})
+	for i := range optional {
+		optional[i](opts)
+	}
+	spec := tf.OpSpec{
+		Type:  "QuantizedMatMul",
+		Input: []tf.Input{a, b, min_a, max_a, min_b, max_b},
+		Attrs: opts,
+	}
+	node := scope.AddOperation(spec)
+	return node.Output(0), node.Output(1), node.Output(2)
+}
+
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2: a function that writes an attribute into the attribute map it is given.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC".
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{"padding": padding} // padding is an attribute; ksize and strides are passed as input tensors below
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradV2",
+		Input: []tf.Input{
+			orig_input, orig_output, grad, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage: a function that writes an attribute into the attribute map it is given.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear".
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+//
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so that the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
+//	T: The data type passed as the op's required `T` attribute.
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{"T": T} // required attribute is seeded before the optional setters run
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResizeGradImage",
+		Input: []tf.Input{
+			grads, boxes, box_ind, image_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPoolAttr is an optional argument to AvgPool: a function that writes an attribute into the attribute map it is given.
+type AvgPoolAttr func(optionalAttr)
+
+// AvgPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC".
+func AvgPoolDataFormat(value string) AvgPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs average pooling on the input.
+//
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} // all three are attributes here; only value is an input tensor
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial: a function that writes an attribute into the attribute map it is given.
+type StatelessMultinomialAttr func(optionalAttr)
+
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64.
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{} // no required attributes; only the optional setters populate this map
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deletes the tensor specified by its handle in the session.
+//
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return // scope already carries an error; o stays nil
+	}
+	opspec := tf.OpSpec{
+		Type: "DeleteSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec) // hands back the operation itself rather than one of its outputs
+}
+
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return fewer than `num_records` even before the last batch.
+//
+// Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
+//
+// Returns keys: A 1-D tensor. values: A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued named results
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // output 0 is keys, output 1 is values
+}
+
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax: a function that writes an attribute into the attribute map it is given.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
+
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false.
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+	return func(m optionalAttr) { // here m is the attribute map, unrelated to the op's `m` input
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AdaMax algorithm.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return // scope already carries an error; o stays nil
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdaMax",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec) // hands back the operation itself rather than one of its outputs
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{"k": k} // k is passed as an attribute, not as an input tensor
+	opspec := tf.OpSpec{
+		Type: "InTopK",
+		Input: []tf.Input{
+			predictions, targets,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 2D real-valued fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
+//
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT2D",
+		Input: []tf.Input{
+			input, fft_length, // this wrapper always passes fft_length as an input tensor
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserializes a proto into the tree handle.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialized proto string of the boosted_trees.Tree proto.
+//
+// Returns the created operation.
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return // scope already carries an error; o stays nil
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeDeserialize",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec) // hands back the operation itself rather than one of its outputs
+}
+
+// Splits a tensor into a list.
+//
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+//
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list (returned as the op's output 0).
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSplit",
+		Input: []tf.Input{
+			tensor, element_shape, lengths,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
+//
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense, // the sparse operand is passed as its three component tensors
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation: a function that writes an attribute into the attribute map it is given.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true.
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set_operation: Passed as the op's required `set_operation` attribute; see SetOperationOp::SetOperationFromContext for its values.
+//
+// Returns result_indices: 2D indices of a `SparseTensor`. result_values: 1D values of a `SparseTensor`. result_shape: 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued named results
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation} // required attribute is seeded before the optional setters run
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToSparseSetOperation",
+		Input: []tf.Input{
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize: a function that writes an attribute into the attribute map it is given.
+type QuantizeAndDequantizeAttr func(optionalAttr)
+
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true.
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8.
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false.
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0.
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0.
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
+//
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2.
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	attrs := map[string]interface{}{} // no required attributes; only the optional setters populate this map
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TopKV2Attr is an optional argument to TopKV2: a function that writes an attribute into the attribute map it is given.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true.
+func TopKV2Sorted(value bool) TopKV2Attr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns values: The `k` largest elements along each last dimensional slice. indices: The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued named results
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TopKV2",
+		Input: []tf.Input{
+			input, k, // k is passed as an input tensor, not as an attribute
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // output 0 is values, output 1 is indices
+}
+
+// Computes the complementary error function of `x` element-wise; returns the op's output 0 as `y`.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return // scope already carries an error; return the zero-valued output
+	}
+	opspec := tf.OpSpec{
+		Type: "Erfc",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x != y) element-wise.
+//
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NotEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -13062,22 +22872,83 @@
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			x, y,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
 // TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata.
@@ -13179,389 +23050,33 @@
 	return scope.AddOperation(opspec)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// StringLengthUnit sets the optional unit attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["unit"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// String lengths of `input`.
+//
+// Computes the length of each string given in the input tensor.
 //
 // Arguments:
-//	bytes: All the elements must have the same length.
+//	input: The string for which to compute the length.
 //
-//
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks whether a resource handle-based variable has been initialized.
-//
-// Arguments:
-//	resource: the input resource handle.
-//
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
-		Input: []tf.Input{
-			resource,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
-//
-// Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
-//
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
-		Input: []tf.Input{
-			true_classes, sampled_candidates,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gather ragged slices from `params` axis `0` according to `indices`.
-//
-// Outputs a `RaggedTensor` output composed from `output_dense_values` and
-// `output_nested_splits`, such that:
-//
-// ```python
-// output.shape = indices.shape + params.shape[1:]
-// output.ragged_rank = indices.shape.ndims + params.ragged_rank
-// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-// ```
-//
-// where
-//
-// * `params =
-//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
-//    provides the values that should be gathered.
-// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
-//    values should be gathered.
-// * `output =
-//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
-//    is the output tensor.
-//
-// (Note: This c++ op is used to implement the higher-level python
-// `tf.ragged.gather` op, which also supports ragged indices.)
-//
-//
-// Arguments:
-//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
-// `params` RaggedTensor input.
-//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to flat_values, so dense_values is the
-// deprecated name.
-//	indices: Indices in the outermost dimension of `params` of the values that should be
-// gathered.
-//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
-// this number of `row_splits` tensors. This value should equal
-// `indices.shape.ndims + params.ragged_rank - 1`.
-//
-// Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
-func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
-	opspec := tf.OpSpec{
-		Type: "RaggedGather",
-		Input: []tf.Input{
-			tf.OutputList(params_nested_splits), params_dense_values, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
-		scope.UpdateErr("RaggedGather", err)
-		return
-	}
-	output_dense_values = op.Output(idx)
-	return output_nested_splits, output_dense_values
-}
-
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
-
-// RpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
-
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
-//
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
-//
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
-//
-// Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
-//
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13570,50 +23085,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Rpc",
-		Input: []tf.Input{
-			address, method, request,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
-
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
-//
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
-	return func(m optionalAttr) {
-		m["replace_global"] = value
-	}
-}
-
-// Replaces the match of pattern in input with rewrite.
-//
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expression.
-//
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
+		Type: "StringLength",
 		Input: []tf.Input{
 			input,
 		},
@@ -13623,27 +23095,61 @@
 	return op.Output(0)
 }
 
-// BoostedTreesQuantileStreamResourceHandleOpAttr is an optional argument to BoostedTreesQuantileStreamResourceHandleOp.
-type BoostedTreesQuantileStreamResourceHandleOpAttr func(optionalAttr)
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+//
+// The polygamma function is defined as:
+//
+//
+// \\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+// The polygamma function is defined only for non-negative integer orders \\(a\\).
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Polygamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// BoostedTreesQuantileStreamResourceHandleOpContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BoostedTreesQuantileStreamResourceHandleOpContainer(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// BoostedTreesQuantileStreamResourceHandleOpSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func BoostedTreesQuantileStreamResourceHandleOpSharedName(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a BoostedTreesQuantileStreamResource.
-func BoostedTreesQuantileStreamResourceHandleOp(scope *Scope, optional ...BoostedTreesQuantileStreamResourceHandleOpAttr) (resource tf.Output) {
+// Update '*var' according to the adadelta scheme.
+//
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13652,23 +23158,382 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceHandleOp",
+		Type: "ResourceApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
 
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
+
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// QuantizedResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func QuantizedResizeBilinearHalfPixelCenters(value bool) QuantizedResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
+//
+// Input images and output images must be quantized types.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedResizeBilinear",
+		Input: []tf.Input{
+			images, size, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
+//
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
+//
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug.
+type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Momentum embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, momenta, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adadelta embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Adadelta optimization algorithm. Parameter accumulators updated by the Adadelta optimization algorithm. Parameter updates updated by the Adadelta optimization algorithm. Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Subtracts sparse `updates` from an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by subtracting sparse `updates` from the
+// passed in `tensor`.
+// This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+// are subtracted from an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_sub is to subtract individual elements
+// from a tensor by index. For example, say we want to subtract 4 scattered elements
+// from a rank-1 tensor with 8 elements.
+//
+// In Python, this scatter subtract operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(updated))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, -10, 1, -9, -8, 1, 1, -11]
+//
+// We can also subtract entire slices of a higher rank tensor all at once. For
+// example, we can subtract two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter subtract operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(updated))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates subtracted according to the indices.
+func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "TensorScatterSub",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NoOp",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
+	}
+}
+
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+//
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
+//
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
+//
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Enter",
+		Input: []tf.Input{
+			data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise integer closest to x.
+//
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
+//
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rint",
 		Input: []tf.Input{
 			x,
 		},
@@ -13677,35 +23542,851 @@
 	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// ResizeBilinearGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBilinearGradHalfPixelCenters(value bool) ResizeBilinearGradAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Computes the gradient of bilinear interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBilinearGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// 3D real-valued fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
+
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Returns a one-hot tensor.
+//
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`.
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OneHot",
+		Input: []tf.Input{
+			indices, depth, on_value, off_value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produces the max pool of the input tensor for quantized types.
+//
+// Arguments:
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns `output`; `min_output`: The float value that the lowest quantized output value represents; `max_output`: The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "QuantizedMaxPool",
+		Input: []tf.Input{
+			input, min_input, max_input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
+//
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGamma",
+		Input: []tf.Input{
+			shape, alpha,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// PrelinearizeAttr is an optional argument to Prelinearize.
+type PrelinearizeAttr func(optionalAttr)
+
+// PrelinearizeShape sets the optional shape attribute to value.
+//
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func PrelinearizeShape(value tf.Shape) PrelinearizeAttr {
+	return func(m optionalAttr) {
+		m["shape"] = value
+	}
+}
+
+// PrelinearizeLayout sets the optional layout attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence. If a layout
+// attribute is passed but its values are all -1 the layout will be computed by
+// the infeed operation.
+// If not specified, defaults to <>
+func PrelinearizeLayout(value []int64) PrelinearizeAttr {
+	return func(m optionalAttr) {
+		m["layout"] = value
+	}
+}
+
+// An op which linearizes one Tensor value to an opaque variant tensor.
+//
+// Arguments:
+//	input: A tensor that will be linearized.
+func Prelinearize(scope *Scope, input tf.Output, optional ...PrelinearizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Prelinearize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
+
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of x element-wise.
+//
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the PowerSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyPowerSign",
+		Input: []tf.Input{
+			var_, m, lr, logbase, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters.
+type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve MDL Adagrad Light embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm. Parameter accumulators updated by the MDL Adagrad Light optimization algorithm. Parameter weights updated by the MDL Adagrad Light optimization algorithm. Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug.
+type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, accumulators, linears, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
+
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` and return its handle.
+//
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AddSparseToTensorsMap",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
+
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
 //	orig_input: The original input tensor.
 //	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13714,7 +24395,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
 			orig_input, orig_output, grad,
 		},
@@ -13724,755 +24405,50 @@
 	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
-//
-// Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
-//
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
-		Input: []tf.Input{
-			input, threshold,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, perform exclusive cumprod.
 // If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
-//
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, delta,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes gradients for the scaled exponential linear (Selu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
-//
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SeluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
-//
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
-//
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
-		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns a tensor of ones with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
-//
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OnesLike",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Broadcasts a tensor value to one or more other devices.
-func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "CollectiveBcastSend",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
-//
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
-	}
-}
-
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
-
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
-
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
-//
-// Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
-//
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SetSize",
-		Input: []tf.Input{
-			set_indices, set_values, set_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts.
-type NonDeterministicIntsAttr func(optionalAttr)
-
-// NonDeterministicIntsDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_INT64
-func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Non-deterministically generates some integers.
-//
-// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//
-// Returns Non-deterministic integer values with specified shape.
-func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NonDeterministicInts",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-//
-// Arguments:
-//
-//	num_shards: An integer representing the number of shards operating in parallel.
-//	index: An integer representing the current worker index.
-//
-//
-func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ShardDataset",
-		Input: []tf.Input{
-			input_dataset, num_shards, index,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Constructs an Optional variant from a tuple of tensors.
-func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalFromValue",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
-//
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
 		m["exclusive"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
 // value: A `bool` (default: False).
 // If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
 		m["reverse"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// By default, this op performs an inclusive cumsum, which means that the first
+// By default, this op performs an inclusive cumprod, which means that the first
 // element of the input is identical to the first element of the output:
 //
 // ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
 // ```
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
 // performed instead:
 //
 // ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
 // ```
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
 // opposite direction:
 //
 // ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
 // ```
 //
 // This is more efficient than using separate `tf.reverse` ops.
@@ -14480,7 +24456,7 @@
 // The `reverse` and `exclusive` kwargs can also be combined:
 //
 // ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
 // ```
 //
 // Arguments:
@@ -14489,7 +24465,7 @@
 // `complex128`, `qint8`, `quint8`, `qint32`, `half`.
 //	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
 // `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14498,7 +24474,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "Cumprod",
 		Input: []tf.Input{
 			x, axis,
 		},
@@ -14508,519 +24484,158 @@
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
-
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// Computes square of x element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Square",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Update '*var' according to the Adam algorithm.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := \max(vhat_{t-1}, v_t)$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.
-type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Adadelta parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Adadelta optimization algorithm.
-//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
-//	updates: Value of updates used in the Adadelta optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm.
-//
-//
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) {
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug",
+		Type: "ResourceApplyAdamWithAmsgrad",
 		Input: []tf.Input{
-			parameters, accumulators, updates, gradient_accumulators,
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+// Add all input tensors element-wise.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
+		Type: "AddN",
 		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// Returns x / y element-wise for integer types.
-//
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
-//
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
-		Input: []tf.Input{
-			x, y,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
+type CudnnRNNBackpropV2Attr func(optionalAttr)
 
-// StridedSliceAttr is an optional argument to StridedSlice.
-type StridedSliceAttr func(optionalAttr)
-
-// StridedSliceBeginMask sets the optional begin_mask attribute to value.
-//
-// value: a bitmask where a bit i being 1 means to ignore the begin
-// value and instead use the largest interval possible. At runtime
-// begin[i] will be replaced with `[0, n-1)` if `stride[i] > 0` or
-// `[-1, n-1]` if `stride[i] < 0`
-// If not specified, defaults to 0
-func StridedSliceBeginMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceEndMask sets the optional end_mask attribute to value.
-//
-// value: analogous to `begin_mask`
-// If not specified, defaults to 0
-func StridedSliceEndMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
-//
-// value: a bitmask where bit `i` being 1 means the `i`th
-// position is actually an ellipsis. One bit at most can be 1.
-// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-// implicitly creates as many range specifications as necessary to fully
-// specify the sliced range for every dimension. For example for a 4-dimensional
-// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-// If not specified, defaults to 0
-func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
-//
-// value: a bitmask where bit `i` being 1 means the `i`th
-// specification creates a new shape 1 dimension. For example
-// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-// If not specified, defaults to 0
-func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-//
-// value: a bitmask where bit `i` implies that the `i`th
-// specification should shrink the dimensionality. begin and end
-// must imply a slice of size 1 in the dimension. For example in
-// python one might do `foo[:, 3, :]` which would result in
-// `shrink_axis_mask` being 2.
-// If not specified, defaults to 0
-func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Return a strided slice from `input`.
-//
-// Note, most python users will want to use the Python `Tensor.__getitem__`
-// or `Variable.__getitem__` rather than this op directly.
-//
-// The goal of this op is to produce a new tensor with a subset of
-// the elements from the `n` dimensional `input` tensor. The subset is chosen using
-// a sequence of `m` sparse range specifications encoded into the arguments
-// of this function. Note, in some cases
-// `m` could be equal to `n`, but this need not be the case. Each
-// range specification entry can be one of the following:
-//
-// - An ellipsis (...). Ellipses are used to imply zero or more
-//   dimensions of full-dimension selection and are produced using
-//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
-//
-// - A new axis. This is used to insert a new shape=1 dimension and is
-//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
-//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-//
-//
-// - A range `begin:end:stride`. This is used to specify how much to choose from
-//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-//   which represents the index of the first value to select while `end` represents
-//   the index of the last value to select. The number of values selected in each
-//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
-//   the second to last. `begin_mask` controls whether to replace the explicitly
-//   given `begin` with an implicit effective value of `0` if `stride > 0` and
-//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-//   required to create the largest open interval. For example, given a shape
-//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-//   first dimension of a tensor while dropping the last two (in the original
-//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-//
-// - A single index. This is used to keep only elements that have a given
-//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
-//   `shrink_axis_mask`.
-//
-// Each conceptual range specification is encoded in the op's argument. This
-// encoding is best understand by considering a non-trivial example. In
-// particular,
-// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
-//
-// ```
-// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-// end = [2, 4, x, x, -3, x]
-// strides = [1, 1, x, x, -1, 1]
-// begin_mask = 1<<4 | 1 << 5 = 48
-// end_mask = 1<<5 = 32
-// ellipsis_mask = 1<<3 = 8
-// new_axis_mask = 1<<2 4
-// shrink_axis_mask = 1<<0
-// ```
-//
-// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-// the slice becomes (2, 1, 5, 5, 2, 5).
-// Let us walk step by step through each argument specification.
-//
-// 1.  The first argument in the example slice is turned into `begin = 1` and
-// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-// also set the appropriate bit in `shrink_axis_mask`.
-//
-// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-// zero bits contributed.
-//
-// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-// dimension in the final shape. Dummy values are contributed to begin,
-// end and stride, while the new_axis_mask bit is set.
-//
-// 4. `...` grab the full ranges from as many dimensions as needed to
-// fully specify a slice for every dimension of the input shape.
-//
-// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-// with a dimension that has shape `s` is converted to a positive index
-// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-// is done internally so begin, end and strides receive x, -3, and -1.
-// The appropriate begin_mask bit is set to indicate the start range is the
-// full range (ignoring the x).
-//
-// 6. `:` indicates that the entire contents of the corresponding dimension
-// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-// `end_mask` are also set.
-//
-// *Requirements*:
-//   `0 != strides[i] for i in [0, m)`
-//   `ellipsis_mask must be a power of two (only one ellipsis)`
-//
-// Arguments:
-//
-//	begin: `begin[k]` specifies the offset into the `k`th range specification.
-// The exact dimension this corresponds to will be determined by context.
-// Out-of-bounds values will be silently clamped. If the `k`th bit of
-// `begin_mask` then `begin[k]` is ignored and the full range of the
-// appropriate dimension is used instead. Negative values causes indexing
-// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-//	end: `end[i]` is like `begin` with the exception that `end_mask` is
-// used to determine full ranges.
-//	strides: `strides[i]` specifies the increment in the `i`th specification
-// after extracting a given element. Negative indices will reverse
-// the original order. Out or range values are
-// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StridedSlice",
-		Input: []tf.Input{
-			input, begin, end, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNV2Attr is an optional argument to CudnnRNNV2.
-type CudnnRNNV2Attr func(optionalAttr)
-
-// CudnnRNNV2RnnMode sets the optional rnn_mode attribute to value.
+// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value.
 // If not specified, defaults to "lstm"
-func CudnnRNNV2RnnMode(value string) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["rnn_mode"] = value
 	}
 }
 
-// CudnnRNNV2InputMode sets the optional input_mode attribute to value.
+// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value.
 // If not specified, defaults to "linear_input"
-func CudnnRNNV2InputMode(value string) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["input_mode"] = value
 	}
 }
 
-// CudnnRNNV2Direction sets the optional direction attribute to value.
+// CudnnRNNBackpropV2Direction sets the optional direction attribute to value.
 // If not specified, defaults to "unidirectional"
-func CudnnRNNV2Direction(value string) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["direction"] = value
 	}
 }
 
-// CudnnRNNV2Dropout sets the optional dropout attribute to value.
+// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV2Dropout(value float32) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["dropout"] = value
 	}
 }
 
-// CudnnRNNV2Seed sets the optional seed attribute to value.
+// CudnnRNNBackpropV2Seed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV2Seed(value int64) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// CudnnRNNV2Seed2 sets the optional seed2 attribute to value.
+// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNV2Seed2(value int64) CudnnRNNV2Attr {
+func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// CudnnRNNV2IsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNV2IsTraining(value bool) CudnnRNNV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// A RNN backed by cuDNN.
+// Backprop step of CudnnRNN.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer. Produces one extra output "host_reserved" than CudnnRNN.
+// Compute the backprop of both data and weights in a RNN. Takes an extra
+//     "host_reserved" input than CudnnRNNBackprop, which is used to determine RNN
+//     cudnnRNNAlgo_t and cudnnMathType_t.
 //
 // rnn_mode: Indicates the type of the RNN model.
 // input_mode: Indicates whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
 // direction: Indicates whether a bidirectional model will be used. Should be
 //   "unidirectional" or "bidirectional".
 // dropout: Dropout probability. When set to 0., dropout is disabled.
@@ -15039,14 +24654,22 @@
 //     dir * num_units].
 // output_h: The same shape has input_h.
 // output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is true.
-// host_reserved: An opaque tensor that can be used in backprop calculation. It is
-//   only produced if is_training is true. It is output on host memory rather than
-//   device memory.
-func CudnnRNNV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNV2Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// host_reserved: The same host_reserved produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15055,671 +24678,65 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNV2",
+		Type: "CudnnRNNBackpropV2",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+//	input: A complex tensor.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			gradients, outputs,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseSlice op.
-//
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
-//
-// Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
-//
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
-		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// Checks whether a tree has been initialized.
-//
-// Arguments:
-//	tree_handle: Handle to the tree.
-//
-// Returns Whether the tree is initialized.
-func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeIsInitializedOp",
-		Input: []tf.Input{
-			tree_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-// Graphically this is equivalent to doing
-//
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
-//
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
-	opspec := tf.OpSpec{
-		Type: "SparseConcat",
-		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RegexReplaceAttr is an optional argument to RegexReplace.
-type RegexReplaceAttr func(optionalAttr)
-
-// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
-//
-// value: If True, the replacement is global (that is, all matches of the `pattern` regular
-// expression in each input string are rewritten), otherwise the `rewrite`
-// substitution is only made for the first `pattern` match.
-// If not specified, defaults to true
-func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
-	return func(m optionalAttr) {
-		m["replace_global"] = value
-	}
-}
-
-// Replaces matches of the `pattern` regular expression in `input` with the
-// replacement string provided in `rewrite`.
-//
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to be matched in the `input` strings.
-//	rewrite: The rewrite string to be substituted for the `pattern` expression where it is
-// matched in the `input` strings.
-//
-// Returns The text after applying pattern match and rewrite substitution.
-func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RegexReplace",
-		Input: []tf.Input{
-			input, pattern, rewrite,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-//     Updates specified rows with values in `v`.
-//
-//     Computes `x[i, :] = v; return x`.
-//
-// Arguments:
-//	x: A tensor of type `T`.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceUpdate",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a TensorList by indexing into a Tensor.
-//
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
-//
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
-		Input: []tf.Input{
-			tensor, indices, element_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
-//
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
-//
-// Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
-//
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LRNGrad",
-		Input: []tf.Input{
-			input_grads, input_image, output_image,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
-
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
-	}
-}
-
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
-//
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
-	}
-}
-
-// ExtractGlimpseNoise sets the optional noise attribute to value.
-//
-// value: indicates if the noise should `uniform`, `gaussian`, or
-// `zero`. The default is `uniform` which means the the noise type
-// will be decided by `uniform_noise`.
-// If not specified, defaults to "uniform"
-func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["noise"] = value
-	}
-}
-
-// Extracts a glimpse from the input tensor.
-//
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
-//
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
-//
-// The argument `normalized` and `centered` controls how the windows are built:
-//
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
-//
-// Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
-//
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
-		Input: []tf.Input{
-			input, size, offsets,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
-//
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
-//
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features". The float value that the lowest quantized value represents. The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15728,13 +24745,800 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Constructs a tensor by tiling a given tensor.
+//
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
+//
+// Arguments:
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tile",
+		Input: []tf.Input{
+			input, multiples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Generate the bucket boundaries for each feature based on accumulated summaries.
+//
+// An op that returns a list of float tensors for a quantile stream resource. Each
+// tensor is Rank 1 containing bucket boundaries for a single feature.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	num_features: inferred int; number of features to get bucket boundaries for.
+//
+// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_features": num_features}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries",
+		Input: []tf.Input{
+			quantile_stream_resource_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil {
+		scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err)
+		return
+	}
+	return bucket_boundaries
+}
+
+// Encodes a `RaggedTensor` into a `variant` Tensor.
+//
+//
+// Encodes the given `RaggedTensor` and returns a `variant` Tensor. If
+// `batched_input` is True, then input `RaggedTensor` is unbatched along the
+// zero-th dimension, each component `RaggedTensor` is encoded into a scalar
+// `variant` Tensor, and these are stacked to return a 1-D `variant` Tensor.
+// If `batched_input` is False, then the input `RaggedTensor` is encoded as is and
+// a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first
+// creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the
+// splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor
+// is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the
+// corresponding decoding logic.
+//
+//
+// Arguments:
+//	rt_nested_splits: A list of one or more Tensors representing the splits of the input
+// `RaggedTensor`.
+//	rt_dense_values: A Tensor representing the values of the input `RaggedTensor`.
+//	batched_input: A `bool` denoting whether the input is a batched `RaggedTensor`.
+//
+// Returns A `variant` Tensor containing the encoded `RaggedTensor`.
+func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output, batched_input bool) (encoded_ragged tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"batched_input": batched_input}
+	opspec := tf.OpSpec{
+		Type: "RaggedTensorToVariant",
+		Input: []tf.Input{
+			tf.OutputList(rt_nested_splits), rt_dense_values,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Extract `patches` from `input` and put them in the "depth" output dimension. 3D extension of `extract_image_patches`.
+//
+// Arguments:
+//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `input`.
+//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
+// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
+//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
+// ```
+//
+// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
+// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
+// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
+// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
+// are the dimensions of the output patches.
+func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "ExtractVolumePatches",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce an InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character,
+// 0xFFFD (decimal 65533).
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "replace_control_characters" key
+		m["replace_control_characters"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsTsplits sets the optional Tsplits attribute to value (presumably the dtype of `row_splits` — TODO confirm).
+// If not specified, defaults to DT_INT64
+func UnicodeDecodeWithOffsetsTsplits(value tf.DataType) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "Tsplits" key
+		m["Tsplits"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_byte_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns row_splits: a 1D int32 tensor containing the row splits (NOTE(review): Tsplits defaults to DT_INT64 — confirm dtype); char_values: a 1D int32 Tensor containing the decoded codepoints;
+// char_to_byte_starts: a 1D int32 Tensor containing the byte index in the input string where each character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding} // required attr; optional ones are merged in below
+	for _, a := range optional { // each optional setter writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeDecodeWithOffsets",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2) // outputs 0, 1, 2 map to row_splits, char_values, char_to_byte_starts
+}
+
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer; it is a function that receives the op's attribute map.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+//
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "adaptative" key (attribute name as spelled by the op)
+		m["adaptative"] = value
+	}
+}
+
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+//
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contain the feature values
+// associated with each feature group.
+//	dense_features: a list of matrices which contain the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which have
+// corresponding weights in sparse_weights. This field may be omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns out_example_state_data: a list of vectors containing the updated example
+// state data; out_delta_sparse_weights: a list of vectors where each value is the delta
+// weights associated with a sparse feature group; out_delta_dense_weights: a list of
+// vectors where the values are the delta weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional { // each optional setter writes its attribute into attrs
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaOptimizer",
+		Input: []tf.Input{
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil { // re-check the scope, presumably because AddOperation can record an error on it
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx) // NOTE(review): idx is not advanced after this scalar output, so the list below is read starting at the same index (0) — confirm against makeOutputList
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+}
+
+// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU; it is a function that receives the op's attribute map.
+type ConfigureDistributedTPUAttr func(optionalAttr)
+
+// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value.
+//
+// value: Reserved. Do not use.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "embedding_config" key
+		m["embedding_config"] = value
+	}
+}
+
+// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value.
+//
+// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+// describes the embedding lookups of the program.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "tpu_embedding_config" key
+		m["tpu_embedding_config"] = value
+	}
+}
+
+// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value.
+//
+// value: Reserved. Do not use.
+// If not specified, defaults to false
+func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "is_global_init" key
+		m["is_global_init"] = value
+	}
+}
+
+// Sets up the centralized structures for a distributed TPU system.
+//
+// Returns topology: a serialized tensorflow.tpu.TopologyProto that describes the TPU
+// topology.
+func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return the zero-value output
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the optional setters populate this map
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ConfigureDistributedTPU",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic tangent of `x` element-wise; the result is returned as `y`.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return the zero-value output
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum; it is a function that receives the op's attribute map.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "use_locking" key
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "use_nesterov" key
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is, for the rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil { // the scope already holds an error: return a nil operation
+		return
+	}
+	attrs := map[string]interface{}{} // no required attrs; only the optional setters populate this map
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec) // the operation itself is the result; no tensor outputs are extracted
+}
+
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey; it is a function that receives the op's attribute map.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "capacity" key; the REQUIRES bound is not checked here
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "memory_limit" key; the REQUIRES bound is not checked here
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "container" key
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "shared_name" key
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
+//
+// from the underlying container. If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes} // required attr; optional ones are merged in below
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil { // re-check the scope, presumably because AddOperation can record an error on it
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx) // NOTE(review): idx is not advanced after this scalar output, so "values" is read starting at the same index (0) — confirm against makeOutputList
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN). In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, ..., DN), where M is the number of blocks of elements of length
+// D1 * ... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return nil output lists
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil { // re-check the scope, presumably because AddOperation can record an error on it
+		return
+	}
+	var idx int // running output index, threaded through the four makeOutputList calls below
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Concatenates all tensors in the list along the 0th dimension.
+//
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// element_shape: The shape of the uninitialized elements in the list. If the first
+//   dimension is not -1, it is assumed that all list elements have the same
+//   leading dim.
+// leading_dims: The list of leading dims of uninitialized list elements. Used if
+//   the leading dim of input_handle.element_shape or the element_shape input arg
+//   is not already set.
+// tensor: The concatenated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcatV2",
+		Input: []tf.Input{
+			input_handle, element_shape, leading_dims,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1) // outputs 0 and 1 map to tensor and lengths
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translations or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather` operation.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns selected_indices: a 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return the zero-value output
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionV3",
+		Input: []tf.Input{
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters; it is a function that receives the op's attribute map.
+type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "table_id" key; the REQUIRES bound is not checked here
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "table_name" key
+		m["table_name"] = value
+	}
+}
+
+// Retrieve FTRL embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns, in order: parameter `parameters`, parameter `accumulators`, and parameter `linears`, each updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} // required attrs; optional ones are merged in below
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingFTRLParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2) // outputs 0, 1, 2 map to parameters, accumulators, linears
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug; it is a function that receives the op's attribute map.
+type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "table_id" key; the REQUIRES bound is not checked here
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) { // the returned setter stores value under the "table_name" key
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns, in order: parameter `parameters`, parameter `ms`, parameter `mom`, and parameter `gradient_accumulators`, each updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return zero-value outputs
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} // required attrs; optional ones are merged in below
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3) // outputs 0..3 map to parameters, ms, mom, gradient_accumulators
+}
+
+// Computes hyperbolic sine of `x` element-wise; the result is returned as `y`.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil { // the scope already holds an error: return the zero-value output
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sinh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
@@ -15762,27 +25566,338 @@
 	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// An op that receives embedding activations on the TPU.
+//
+// The TPU system performs the embedding lookups and aggregations specified by
+// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+// results of these aggregations are visible to the Tensorflow Graph as the
+// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+// one Tensor of activations per table specified in the model. There can be at
+// most one RecvTPUEmbeddingActivations op in the TPU graph.
+//
+// Arguments:
+//	num_outputs: The number of output activation tensors, equal to the number of
+// embedding tables in the model.
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//
+// Returns A TensorList of embedding activations containing one Tensor per
+// embedding table in the model.
+func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config}
+	opspec := tf.OpSpec{
+		Type: "RecvTPUEmbeddingActivations",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RecvTPUEmbeddingActivations", err)
+		return
+	}
+	return outputs
+}
+
+// Returns a constant tensor on the host. Only for writing C++ tests.
+//
+// Arguments:
+//	value: Attr `value` is the tensor to return.
+//	dtype: Data type stored in the op's `dtype` attr alongside `value`.
+func HostConst(scope *Scope, value tf.Tensor, dtype tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"value": value, "dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "HostConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// An Op to permute tensors across replicated TPU instances.
+//
+// Each instance supplies its own input.
+//
+// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+// `[D, A, B, C]`.
+//
+// Arguments:
+//	input: The local input to be permuted. Currently only supports float and
+// bfloat16.
+//	source_target_pairs: A tensor with shape [num_pairs, 2].
+//
+// Returns The permuted input.
+func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CollectivePermute",
+		Input: []tf.Input{
+			input, source_target_pairs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Debugging/model interpretability outputs for each example.
+//
+// It traverses all the trees and computes debug metrics for individual examples,
+// such as getting split feature ids and logits after each split along the decision
+// path used to compute directional feature contributions.
+//
+// Arguments:
+//	tree_ensemble_handle: Passed as the op's first input (no upstream description; presumably the tree ensemble resource handle).
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for constructing the protos in
+// examples_debug_outputs_serialized.
+//
+// Returns Output rank 1 Tensor containing a proto serialized as a string for each example.
+func BoostedTreesExampleDebugOutputs(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (examples_debug_outputs_serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesExampleDebugOutputs",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode web-safe base64-encoded strings.
+//
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
+//
+// Arguments:
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutableHashTableV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
+
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+//
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
+//
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15791,7 +25906,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
 			sparse_indices, sparse_values, sparse_shape,
 		},
@@ -15801,91 +25916,226 @@
 	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
+// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters.
+type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// For example:
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Momentum embedding parameters.
 //
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// This is typically used by gradient computations for a concat operation.
-//
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm. Parameter momenta updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
+		Type: "RetrieveTPUEmbeddingMomentumParameters",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
+type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Momentum embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Momentum optimization algorithm. Parameter momenta updated by the Momentum optimization algorithm. Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	return offset
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// MaxPoolGradWithArgmaxAttr is an optional argument to MaxPoolGradWithArgmax.
+type MaxPoolGradWithArgmaxAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// MaxPoolGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
+// value: Whether to include batch dimension in flattened index of `argmax`.
 // If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+func MaxPoolGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["include_batch_in_index"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradWithArgmaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
+
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data. A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean. A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance. A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation. A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15894,65 +26144,25 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
+// Deprecated, use Python implementation tf.linalg.matrix_exponential.
 //
-// Arguments:
-//
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//
-//
-func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalSlidingWindowDataset",
-		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
-//
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
 			input,
 		},
@@ -15961,257 +26171,262 @@
 	return op.Output(0)
 }
 
-// Serializes the tree ensemble to a proto.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//
-// Returns Stamp token of the tree ensemble resource.Serialized proto of the ensemble.
-func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesSerializeEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["capacity"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
-//
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
-//
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "ParseTensor",
-		Input: []tf.Input{
-			serialized,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters.
-type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr {
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["table_id"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// MapPeekContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Adadelta embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Adadelta optimization algorithm.
-//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
-//	updates: Value of updates used in the Adadelta optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingAdadeltaParameters",
-		Input: []tf.Input{
-			parameters, accumulators, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// UnbatchGradAttr is an optional argument to UnbatchGrad.
-type UnbatchGradAttr func(optionalAttr)
-
-// UnbatchGradContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradContainer(value string) UnbatchGradAttr {
+func MapPeekContainer(value string) MapPeekAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// MapPeekSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func UnbatchGradSharedName(value string) UnbatchGradAttr {
+func MapPeekSharedName(value string) MapPeekAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Gradient of Unbatch.
+// Op peeks at the values at the specified key.
 //
-// Acts like Batch but using the given batch_index index of batching things as they
-// become available. This ensures that the gradients are propagated back in the
-// same session which did the forward pass.
+// If the underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapPeek",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
+}
+
+// Makes the summary of quantiles for the batch.
 //
-// original_input: The input to the Unbatch operation this is the gradient of.
-// batch_index: The batch_index given to the Unbatch operation this is the gradient
-// of.
-// grad: The downstream gradient.
-// id: The id scalar emitted by Batch.
-// batched_grad: The return value, either an empty tensor or the batched gradient.
-// container: Container to control resource sharing.
-// shared_name: Instances of UnbatchGrad with the same container and shared_name
-//  are assumed to possibly belong to the same batch. If left empty, the op name
-//  will be used as the shared name.
-func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+// An op that takes a list of tensors (one tensor per feature) and outputs the
+// quantile summaries for each tensor.
+//
+// Arguments:
+//	float_values: float; List of Rank 1 Tensors each containing values for a single feature.
+//	example_weights: float; Rank 1 Tensor with weights per instance.
+//	epsilon: float; The required maximum approximation error.
+//
+// Returns float; List of Rank 2 Tensors each containing the quantile summary
+// (value, weight, min_rank, max_rank) of a single feature.
+func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesMakeQuantileSummaries",
+		Input: []tf.Input{
+			tf.OutputList(float_values), example_weights, epsilon,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if summaries, idx, err = makeOutputList(op, idx, "summaries"); err != nil {
+		scope.UpdateErr("BoostedTreesMakeQuantileSummaries", err)
+		return
+	}
+	return summaries
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters.
+type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the RMSProp optimization algorithm. Parameter ms updated by the RMSProp optimization algorithm. Parameter mom updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingRMSPropParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Saves input tensors slices to disk.
+//
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveSlices",
+		Input: []tf.Input{
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
+
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// ReduceJoinSeparator sets the optional separator attribute to value.
+//
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins a string Tensor across the given dimensions.
+//
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
+//
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
+//
+// Arguments:
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16220,9 +26435,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnbatchGrad",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			original_input, batch_index, grad, id,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -16230,147 +26445,36 @@
 	return op.Output(0)
 }
 
-// Rolls the elements of a tensor along an axis.
-//
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
-//
-// For example:
-//
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
-//
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
-//
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
-//
-// Arguments:
-//
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
-//
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Roll",
-		Input: []tf.Input{
-			input, shift, axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters.
+type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
 
-// A placeholder op for a value that will be fed into the computation.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor.
-//
-// Returns A tensor that will be provided using the infeed mechanism.
-func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "InfeedDequeue",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters.
-type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
 //
 // REQUIRES: value >= -1
-func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
 		m["table_id"] = value
 	}
 }
 
-// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
 		m["table_name"] = value
 	}
 }
 
-// Load centered RMSProp embedding parameters.
+// Retrieve centered RMSProp embedding parameters.
 //
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Arguments:
-//	parameters: Value of parameters used in the centered RMSProp optimization algorithm.
-//	ms: Value of ms used in the centered RMSProp optimization algorithm.
-//	mom: Value of mom used in the centered RMSProp optimization algorithm.
-//	mg: Value of mg used in the centered RMSProp optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the centered RMSProp optimization algorithm. Parameter ms updated by the centered RMSProp optimization algorithm. Parameter mom updated by the centered RMSProp optimization algorithm. Parameter mg updated by the centered RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16379,86 +26483,213 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingCenteredRMSPropParameters",
+		Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
+//
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
+	return func(m optionalAttr) {
+		m["stack_name"] = value
+	}
+}
+
+// A stack that produces elements in first-in last-out order.
+//
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackV2",
 		Input: []tf.Input{
-			parameters, ms, mom, mg,
+			max_size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
+	opspec := tf.OpSpec{
+		Type: "ResourceCountUpTo",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Concat",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
+			a_indices, a_values, a_shape, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherBatchDims sets the optional batch_dims attribute to value.
-// If not specified, defaults to 0
-func ResourceGatherBatchDims(value int64) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["batch_dims"] = value
-	}
-}
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterion (e.g. intersection over union,
+// intersection over area, etc.).
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather` operation.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionWithOverlaps",
+		Input: []tf.Input{
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT3D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Extract `patches` from `images` and put them in the "depth" output dimension.
+//
+// Arguments:
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
+//
+// We specify the size-related attributes as:
 //
 // ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
 // ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+//
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "ExtractImagePatches",
 		Input: []tf.Input{
-			resource, indices,
+			images,
 		},
 		Attrs: attrs,
 	}
@@ -16466,106 +26697,383 @@
 	return op.Output(0)
 }
 
-// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters.
-type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load RMSProp embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the RMSProp optimization algorithm.
-//	ms: Value of ms used in the RMSProp optimization algorithm.
-//	mom: Value of mom used in the RMSProp optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingRMSPropParameters",
-		Input: []tf.Input{
-			parameters, ms, mom,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produce a string tensor that encodes the state of a Reader.
+//
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Pop the element at the top of the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	opspec := tf.OpSpec{
+		Type: "StackPopV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
+//
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
+//
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
+//
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
+//
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+//
+// Returns 2-D. The indices of the filled sparse tensor. 1-D. The values of the
+// filled sparse tensor. 1-D. Whether the dense row was missing in the input
+// sparse tensor. 1-D. A map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseFillEmptyRows",
+		Input: []tf.Input{
+			indices, values, dense_shape, default_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch.
+type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value.
+//
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// An op that enqueues TPUEmbedding input indices from a SparseTensor.
+//
+// This Op eases the porting of code that uses embedding_lookup_sparse(),
+// although some Python preprocessing of the SparseTensor arguments to
+// embedding_lookup_sparse() is required to produce the arguments to this Op,
+// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+// step.
+//
+// The tensors at corresponding positions in the three input lists
+// must have the same shape, i.e. rank 1 with dim_size() equal to the total
+// number of lookups into the table described by the corresponding table_id.
+//
+// Arguments:
+//	sample_indices: A list of rank 1 Tensors specifying the training example and
+// feature to which the corresponding embedding_indices and aggregation_weights
+// values belong. sample_indices[i] must equal b * nf + f, where nf is the
+// number of features from the corresponding table, f is in [0, nf), and
+// b is in [0, batch size).
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+//	aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
+// (training example, feature) -- aggregation weights.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EnqueueTPUEmbeddingSparseBatch",
+		Input: []tf.Input{
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :, :]`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Qr",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DrawBoundingBoxes",
+		Input: []tf.Input{
+			images, boxes,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
+
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
 //
 // value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
 //	strides: The stride of the sliding window for each dimension of the input
-// tensor.
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16574,48 +27082,96 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
-
-// IdentityReaderV2Container sets the optional container attribute to value.
+// Real-valued fast Fourier transform.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringFormatAttr is an optional argument to StringFormat.
+type StringFormatAttr func(optionalAttr)
+
+// StringFormatTemplate sets the optional template attribute to value.
+//
+// value: A string, the template to format tensor summaries into.
+// If not specified, defaults to "%s"
+func StringFormatTemplate(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["template"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// StringFormatPlaceholder sets the optional placeholder attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
+// If not specified, defaults to "%s"
+func StringFormatPlaceholder(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["placeholder"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// StringFormatSummarize sets the optional summarize attribute to value.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
+// If not specified, defaults to 3
+func StringFormatSummarize(value int64) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Formats a string template using a list of tensors.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+//
+// Arguments:
+//	inputs: The list of tensors to format into the placeholder string.
+//
+// Returns The resulting string scalar.
+func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16624,36 +27180,217 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
+		Type: "StringFormat",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters.
-type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+// Shuts down a running distributed TPU system.
+//
+// The op returns an error if no system is running.
+//
+// Returns the created operation.
+func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShutdownDistributedTPU",
+	}
+	return scope.AddOperation(opspec)
+}
 
-// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// Constructs an Optional variant from a tuple of tensors.
+func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalFromValue",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// Dequantize the 'input' tensor into a float Tensor.
+//
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-((1 << (num_bits - 1)) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Dequantize",
+		Input: []tf.Input{
+			input, min_range, max_range,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts each string in the input Tensor to its hash modulo a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
+//
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash can be used to make it difficult to find inputs with a skewed hash value
+// distribution over buckets. This requires that the hash function is
+// seeded by a high-entropy (random) "key" unknown to the adversary.
+//
+// The additional robustness comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key used to seed the hash function, passed as a list of two uint64
+// elements.
+//
+// Returns A Tensor of the same shape as `input`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketStrong",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
 //
 // REQUIRES: value >= -1
-func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr {
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
 		m["table_id"] = value
 	}
 }
 
-// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr {
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
 		m["table_name"] = value
 	}
 }
 
-// Load FTRL embedding parameters.
+// Load proximal Adagrad embedding parameters with debug support.
 //
 // An op that loads optimization parameters into HBM for embedding. Must be
 // preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
@@ -16662,14 +27399,14 @@
 // executed.
 //
 // Arguments:
-//	parameters: Value of parameters used in the FTRL optimization algorithm.
-//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
-//	linears: Value of linears used in the FTRL optimization algorithm.
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
 //
 //
 //
 // Returns the created operation.
-func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) {
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16678,15 +27415,75 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingFTRLParameters",
+		Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug",
 		Input: []tf.Input{
-			parameters, accumulators, linears,
+			parameters, accumulators, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
+// Check if the input matches the regex pattern.
+//
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
+//
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
+//
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pattern": pattern}
+	opspec := tf.OpSpec{
+		Type: "StaticRegexFullMatch",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketFast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AsStringAttr is an optional argument to AsString.
 type AsStringAttr func(optionalAttr)
 
@@ -16767,423 +27564,50 @@
 	return op.Output(0)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
-//
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
-//
-//
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
-	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug.
-type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load RMSProp embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the RMSProp optimization algorithm.
-//	ms: Value of ms used in the RMSProp optimization algorithm.
-//	mom: Value of mom used in the RMSProp optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, ms, mom, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Produces the average pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the Bessel i1e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI1e",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-//
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
+// value: Number of color channels for the decoded image.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["channels"] = value
 	}
 }
 
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["dtype"] = value
 	}
 }
 
-// OrderedMapStageContainer sets the optional container attribute to value.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
+// Accepted values are:
 //
-// associative container.   Elements are ordered by key.
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	key: int64
+//	contents: 0-D.  The PNG-encoded image.
 //
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters.
-type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Adagrad embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Adagrad optimization algorithm.
-//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingAdagradParameters",
-		Input: []tf.Input{
-			parameters, accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// A TPU core selector Op.
-//
-// This Op produces a set of TPU cores (for warm-up) or a single TPU core
-// (for regular inference) to execute the TPU program on. The output is
-// consumed by TPUPartitionedCall.
-//
-// Returns A vector 1 or more TPU cores.
-func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TPUOrdinalSelector",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-//
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17192,631 +27616,12 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			contents,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// InfeedEnqueueAttr is an optional argument to InfeedEnqueue.
-type InfeedEnqueueAttr func(optionalAttr)
-
-// InfeedEnqueueShape sets the optional shape attribute to value.
-//
-// value: The shape of the tensor.
-// If not specified, defaults to <>
-func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr {
-	return func(m optionalAttr) {
-		m["shape"] = value
-	}
-}
-
-// InfeedEnqueueLayout sets the optional layout attribute to value.
-//
-// value: A vector holding the requested layout in minor-to-major sequence.
-// If a layout attribute is passed, but its values are all -1, the layout will
-// be computed by the infeed operation.
-// If not specified, defaults to <>
-func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr {
-	return func(m optionalAttr) {
-		m["layout"] = value
-	}
-}
-
-// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. This should be -1 when the Op
-// is running on a TPU device, and >= 0 when the Op is running on the CPU
-// device.
-// If not specified, defaults to -1
-func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// An op which feeds a single Tensor value into the computation.
-//
-// Arguments:
-//	input: A tensor that will be provided using the infeed mechanism.
-//
-// Returns the created operation.
-func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InfeedEnqueue",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Fetches multiple values from infeed as an XLA tuple.
-//
-// Arguments:
-//	dtypes: The element types of each element in `outputs`.
-//	shapes: The shapes of each tensor in `outputs`.
-//
-// Returns A list of tensors that will be provided using the infeed mechanism.
-func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
-	opspec := tf.OpSpec{
-		Type: "InfeedDequeueTuple",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("InfeedDequeueTuple", err)
-		return
-	}
-	return outputs
-}
-
-// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug.
-type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load ADAM embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the ADAM optimization algorithm.
-//	momenta: Value of momenta used in the ADAM optimization algorithm.
-//	velocities: Value of velocities used in the ADAM optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, momenta, velocities, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
-
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
-//
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
-	return func(m optionalAttr) {
-		m["maxsplit"] = value
-	}
-}
-
-// Split elements of `source` based on `sep` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
-//
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
-//
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
-//
-// Note that the above mentioned behavior matches python's str.split.
-//
-// Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
-		Input: []tf.Input{
-			input, sep,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
-		Input: []tf.Input{
-			input, paddings, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Enqueue a Tensor on the computation outfeed.
-//
-// Arguments:
-//	input: A tensor that will be inserted into the outfeed queue.
-//
-// Returns the created operation.
-func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OutfeedEnqueue",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sign",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PrelinearizeTupleAttr is an optional argument to PrelinearizeTuple.
-type PrelinearizeTupleAttr func(optionalAttr)
-
-// PrelinearizeTupleLayouts sets the optional layouts attribute to value.
-//
-// value: A vector holding the requested layout in minor-to-major sequence for all the
-// tuple shapes in the order the shapes appear in the "shapes" input. The layout
-// elements for a sub-shape can be set to -1 in which case the corresponding layout
-// will be computed by the infeed operation.
-// If not specified, defaults to <>
-func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr {
-	return func(m optionalAttr) {
-		m["layouts"] = value
-	}
-}
-
-// An op which linearizes multiple Tensor values to an opaque variant tensor.
-//
-// Arguments:
-//	inputs: A list of tensors that will be provided using the infeed mechanism.
-//	shapes: The shapes of each tensor in `inputs`.
-func PrelinearizeTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...PrelinearizeTupleAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shapes": shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PrelinearizeTuple",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
-
-// VarHandleOpContainer sets the optional container attribute to value.
-//
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
-//
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
-//
-// Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CompilationResultProto indicating the status of the TPU compilation.
-func TPUCompilationResult(scope *Scope) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TPUCompilationResult",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softplus gradients for a softplus operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
-//
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
-
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
-//
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
-		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// Elementwise computes the bitwise XOR of `x` and `y`.
-//
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Connects N inputs to an N-way replicated TPU computation.
-func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TPUReplicatedInput",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
@@ -17980,110 +27785,52 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
 
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
 	return func(m optionalAttr) {
-		m["protocol"] = value
+		m["output_idx_type"] = value
 	}
 }
 
-// TryRpcFailFast sets the optional fail_fast attribute to value.
+// Computes the LU decomposition of one or more square matrices.
 //
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
-
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
+// The input has to be invertible.
 //
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
 //
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is an upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
 //
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
-//
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then L, U and P satisfy P_mat * input = L * U.
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`. Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18092,9 +27839,90 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "Lu",
 		Input: []tf.Input{
-			address, method, request,
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Advance the counter of a counter-based RNG.
+//
+// The state of the RNG after
+// `rng_skip(n)` will be the same as that after `stateful_uniform([n])`
+// (or any other distribution). The actual increment added to the
+// counter is an unspecified implementation detail.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	delta: The amount of advancement.
+//
+// Returns the created operation.
+func RngSkip(scope *Scope, resource tf.Output, algorithm tf.Output, delta tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RngSkip",
+		Input: []tf.Input{
+			resource, algorithm, delta,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// QuantizedDepthwiseConv2DAttr is an optional argument to QuantizedDepthwiseConv2D.
+type QuantizedDepthwiseConv2DAttr func(optionalAttr)
+
+// QuantizedDepthwiseConv2DOutType sets the optional out_type attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_QINT32
+func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value.
+//
+// value: List of dilation values.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes quantized depthwise Conv2D.
+//
+// Arguments:
+//	input: The original input tensor.
+//	filter: The original filter tensor.
+//	min_input: The float value that the minimum quantized input value represents.
+//	max_input: The float value that the maximum quantized input value represents.
+//	min_filter: The float value that the minimum quantized filter value represents.
+//	max_filter: The float value that the maximum quantized filter value represents.
+//	strides: List of stride values.
+//
+//
+// Returns The output tensor. The float value that the minimum quantized output value represents. The float value that the maximum quantized output value represents.
+func QuantizedDepthwiseConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedDepthwiseConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
@@ -18102,121 +27930,122 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
 
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["use_locking"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
-
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Equal",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// An Op to exchange data across TPU replicas.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// On each replica, the input is split into `split_count` blocks along
-// `split_dimension` and send to the other replicas given group_assignment. After
-// receiving `split_count` - 1 blocks from other replicas, we concatenate the
-// blocks along `concat_dimension` as the output.
-//
-// For example, suppose there are 2 TPU replicas:
-// replica 0 receives input: `[[A, B]]`
-// replica 1 receives input: `[[C, D]]`
-//
-// group_assignment=`[[0, 1]]`
-// concat_dimension=0
-// split_dimension=1
-// split_count=2
-//
-// replica 0's output: `[[A], [C]]`
-// replica 1's output: `[[B], [D]]`
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	input: The local input to the sum.
-//	group_assignment: An int32 tensor with shape
-// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-// replica ids in the ith subgroup.
-//	concat_dimension: The dimension number to concatenate.
-//	split_dimension: The dimension number to split.
-//	split_count: The number of splits, this number must equal to the sub-group
-// size(group_assignment.get_shape()[1])
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns The exchanged result.
-func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AllToAll",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			input, group_assignment,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
@@ -18224,46 +28053,106 @@
 	return op.Output(0)
 }
 
-// Pads a tensor.
+// Adjust the hue of one or more images.
 //
-// This operation pads `input` according to the `paddings` and `constant_values`
-// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many padding values to add before the contents of `input` in that dimension,
-// and `paddings[D, 1]` indicates how many padding values to add after the contents
-// of `input` in that dimension. `constant_values` is a scalar tensor of the same
-// type as `input` that indicates the value to use for padding `input`.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpreted as channels, and must be three.
 //
-// The padded size of each dimension D of the output is:
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied to all the hue values,
+// and then remapped back to RGB colorspace.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
 //
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # 'constant_values' is 0
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "PadV2",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			input, paddings, constant_values,
+			images, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
+
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate. A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability. A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AllCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters.
 type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
 
@@ -18320,9296 +28209,6 @@
 	return scope.AddOperation(opspec)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
-//
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-//
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"k": k}
-	opspec := tf.OpSpec{
-		Type: "InTopK",
-		Input: []tf.Input{
-			predictions, targets,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
-		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
-//
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
-//
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
-//
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Assert",
-		Input: []tf.Input{
-			condition, tf.OutputList(data),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Writes the given dataset to the given file using the TFRecord format.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//
-// Returns the created operation.
-func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetToTFRecord",
-		Input: []tf.Input{
-			input_dataset, filename, compression_type,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters.
-type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve RMSProp embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.
-func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingRMSPropParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QuantizedDepthwiseConv2DWithBiasAttr is an optional argument to QuantizedDepthwiseConv2DWithBias.
-type QuantizedDepthwiseConv2DWithBiasAttr func(optionalAttr)
-
-// QuantizedDepthwiseConv2DWithBiasOutType sets the optional out_type attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_QINT32
-func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value.
-//
-// value: List of dilation values.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes quantized depthwise Conv2D with Bias.
-//
-// Arguments:
-//	input: The original input tensor.
-//	filter: The original filter tensor.
-//	bias: The original bias tensor.
-//	min_input: The float value that the minimum quantized input value represents.
-//	max_input: The float value that the maximum quantized input value represents.
-//	min_filter: The float value that the minimum quantized filter value represents.
-//	max_filter: The float value that the maximum quantized filter value represents.
-//	strides: List of stride values.
-//
-//
-// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedDepthwiseConv2DWithBias(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedDepthwiseConv2DWithBias",
-		Input: []tf.Input{
-			input, filter, bias, min_input, max_input, min_filter, max_filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the largest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Usage:
-//   ```python
-//   import tensorflow as tf
-//   a = [1, 10, 26.9, 2.8, 166.32, 62.3]
-//   b = tf.math.argmax(input = a)
-//   c = tf.keras.backend.eval(b)
-//   # c = 4
-//   # here a[4] = 166.32 which is the largest element of a across axis 0
-//   ```
-//
-// Arguments:
-//
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ArgMax",
-		Input: []tf.Input{
-			input, dimension,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
-		Input: []tf.Input{
-			t, m, v, beta, gamma,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2.
-type SdcaOptimizerV2Attr func(optionalAttr)
-
-// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value.
-//
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr {
-	return func(m optionalAttr) {
-		m["adaptive"] = value
-	}
-}
-
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
-//
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
-//
-// Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
-//
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SdcaOptimizerV2",
-		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizerV2", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizerV2", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
-}
-
-// Computes fingerprints of the input strings.
-//
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
-//
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
-
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
-//
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
-	return func(m optionalAttr) {
-		m["skip_empty"] = value
-	}
-}
-
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
-//
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
-//
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
-//
-// Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
-//
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringSplit",
-		Input: []tf.Input{
-			input, delimiter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Provides the time since epoch in seconds.
-//
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
-//
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Timestamp",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
-
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Enqueue multiple Tensor values on the computation outfeed.
-//
-// Arguments:
-//	inputs: A list of tensors that will be inserted into the outfeed queue as an
-// XLA tuple.
-//
-// Returns the created operation.
-func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OutfeedEnqueueTuple",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple.
-type InfeedEnqueueTupleAttr func(optionalAttr)
-
-// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value.
-//
-// value: A vector holding the requested layout in minor-to-major sequence for
-// all the tuple shapes, in the order the shapes appear in the "shapes" input.
-// The layout elements for a sub-shape can be set to -1, in which case the
-// corresponding layout will be computed by the infeed operation.
-// If not specified, defaults to <>
-func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr {
-	return func(m optionalAttr) {
-		m["layouts"] = value
-	}
-}
-
-// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. This should be -1 when the Op
-// is running on a TPU device, and >= 0 when the Op is running on the CPU
-// device.
-// If not specified, defaults to -1
-func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// Feeds multiple Tensor values into the computation as an XLA tuple.
-//
-// Arguments:
-//	inputs: A list of tensors that will be provided using the infeed mechanism.
-//	shapes: The shapes of each tensor in `inputs`.
-//
-// Returns the created operation.
-func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shapes": shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InfeedEnqueueTuple",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
-
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
-}
-
-// Performs beam search decoding on the logits given in input.
-//
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
-//
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
-//
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
-		Input: []tf.Input{
-			inputs, sequence_length,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
-}
-
-// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
-type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
-
-// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
-//
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$vhat_t := max{vhat_{t-1}, v_t}$$
-// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	vhat: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdamWithAmsgrad",
-		Input: []tf.Input{
-			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
-//
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
-//
-// Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LRN",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT2D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
-type ResourceApplyKerasMomentumAttr func(optionalAttr)
-
-// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
-//
-// accum = accum * momentum - lr * grad
-// var += accum
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyKerasMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Transforms a Tensor into a serialized TensorProto proto.
-//
-// Arguments:
-//	tensor: A Tensor of type `T`.
-//
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
-		Input: []tf.Input{
-			tensor,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
-
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["update_slots"] = value
-	}
-}
-
-// Update '*var' according to the adagrad scheme.
-//
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Outputs a `Summary` protocol buffer with scalar values.
-//
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
-//
-// Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
-//
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
-		Input: []tf.Input{
-			tags, values,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generate a sharded filename. The filename is printf formatted as
-//
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters.
-type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve MDL Adagrad Light embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
-func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
-// if < 0, `scale * features` otherwise.
-//
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
-//
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Selu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
-
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
-//
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
-//
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
-//
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
-//
-// For example:
-//
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
-//
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
-		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Save",
-		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
-
-// StringLengthUnit sets the optional unit attribute to value.
-//
-// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
-// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
-// encoded Unicode code points in each string).  Results are undefined
-// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
-// valid UTF-8.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
-	}
-}
-
-// String lengths of `input`.
-//
-// Computes the length of each string given in the input tensor.
-//
-// Arguments:
-//	input: The string for which to compute the length.
-//
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringLength",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
-//
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniform",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Max",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
-//	colors: 2-D. A list of RGBA colors to cycle through for the boxes.
-//
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxesV2(scope *Scope, images tf.Output, boxes tf.Output, colors tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxesV2",
-		Input: []tf.Input{
-			images, boxes, colors,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
-//
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Digamma",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-//
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NthElement",
-		Input: []tf.Input{
-			input, n,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Prod",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
-		Input: []tf.Input{
-			shape, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
-//
-// Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
-//
-//
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
-		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatefulUniformFullIntAttr is an optional argument to StatefulUniformFullInt.
-type StatefulUniformFullIntAttr func(optionalAttr)
-
-// StatefulUniformFullIntDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_UINT64
-func StatefulUniformFullIntDtype(value tf.DataType) StatefulUniformFullIntAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers covering the whole range of `dtype`.
-//
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	algorithm: The RNG algorithm.
-//	shape: The shape of the output tensor.
-//
-// Returns Random values with specified shape.
-func StatefulUniformFullInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformFullIntAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatefulUniformFullInt",
-		Input: []tf.Input{
-			resource, algorithm, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
-//
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
-//
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
-//
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2D",
-		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-//
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReshape",
-		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns the number of tensors in the input tensor list.
-//
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListLength",
-		Input: []tf.Input{
-			input_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
-
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
-//
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse addition to individual values or slices in a Variable.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
-// ```
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that addition would look like this:
-//
-// ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// add = tf.scatter_nd_add(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(add)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, 13, 3, 14, 14, 6, 7, 20]
-//
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
-//
-// Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
-//
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
-//
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
-//
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Bincount",
-		Input: []tf.Input{
-			arr, size, weights,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes natural logarithm of x element-wise.
-//
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
-func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
-type ResourceScatterNdSubAttr func(optionalAttr)
-
-// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
-//
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse subtraction to individual values or slices in a Variable.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
-// ```
-//
-// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-// with 8 elements. In Python, that subtraction would look like this:
-//
-// ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// sub = tf.scatter_nd_sub(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(sub)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, -9, 3, -6, -4, 6, 7, -4]
-//
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
-//
-// Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
-//
-// Returns the created operation.
-func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdSub",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.
-type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve RMSProp embedding parameters with debug support.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
-func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Inverse 2D fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT2D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
-type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
-//
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
-//
-//
-//
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
-
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
-
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
-		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdagradParametersGradAccumDebug.
-type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Adagrad embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Adagrad optimization algorithm.
-//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the Adagrad optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingAdagradParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, accumulators, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters.
-type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load ADAM embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the ADAM optimization algorithm.
-//	momenta: Value of momenta used in the ADAM optimization algorithm.
-//	velocities: Value of velocities used in the ADAM optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingADAMParameters",
-		Input: []tf.Input{
-			parameters, momenta, velocities,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Selects num_to_sample rows of input using the KMeans++ criterion.
-//
-// Rows of points are assumed to be input points. One row is selected at random.
-// Subsequent rows are sampled with probability proportional to the squared L2
-// distance from the nearest row selected thus far till num_to_sample rows have
-// been sampled.
-//
-// Arguments:
-//	points: Matrix of shape (n, d). Rows are assumed to be input points.
-//	num_to_sample: Scalar. The number of rows to sample. This value must not be larger than n.
-//	seed: Scalar. Seed for initializing the random number generator.
-//	num_retries_per_sample: Scalar. For each row that is sampled, this parameter
-// specifies the number of additional points to draw from the current
-// distribution before selecting the best. If a negative value is specified, a
-// heuristic is used to sample O(log(num_to_sample)) additional points.
-//
-// Returns Matrix of shape (num_to_sample, d). The sampled rows.
-func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample tf.Output, seed tf.Output, num_retries_per_sample tf.Output) (samples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "KmeansPlusPlusInitialization",
-		Input: []tf.Input{
-			points, num_to_sample, seed, num_retries_per_sample,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
-type ExperimentalThreadPoolHandleAttr func(optionalAttr)
-
-// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
-//
-// value: The maximum degree of parallelism to use within operations that execute on this
-// threadpool.
-// If not specified, defaults to 1
-func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["max_intra_op_parallelism"] = value
-	}
-}
-
-// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
-//
-// Arguments:
-//	num_threads: The number of threads in the thread pool.
-//	display_name: A human-readable name for the threads that may be visible in some
-// visualizations.
-// threadpool.
-//
-// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
-// ops.
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolHandle",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.
-type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve Adagrad embedding parameters with debug support.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
-func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
-		Input: []tf.Input{
-			basename, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Connects outputs of an N-way replicated computation to N outputs.
-func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_replicas": num_replicas}
-	opspec := tf.OpSpec{
-		Type: "TPUReplicatedOutput",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("TPUReplicatedOutput", err)
-		return
-	}
-	return outputs
-}
-
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnbatchAttr is an optional argument to Unbatch.
-type UnbatchAttr func(optionalAttr)
-
-// UnbatchContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnbatchContainer(value string) UnbatchAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnbatchSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnbatchSharedName(value string) UnbatchAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Reverses the operation of Batch for a single output Tensor.
-//
-// An instance of Unbatch either receives an empty batched_tensor, in which case it
-// asynchronously waits until the values become available from a concurrently
-// running instance of Unbatch with the same container and shared_name, or receives
-// a non-empty batched_tensor in which case it finalizes all other concurrently
-// running instances and outputs its own element from the batch.
-//
-// batched_tensor: The possibly transformed output of Batch. The size of the first
-//  dimension should remain unchanged by the transformations for the operation to
-//  work.
-// batch_index: The matching batch_index obtained from Batch.
-// id: The id scalar emitted by Batch.
-// unbatched_tensor: The Tensor corresponding to this execution.
-// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
-//  batched input tensor associated with a given invocation of the op.
-// container: Container to control resource sharing.
-// shared_name: Instances of Unbatch with the same container and shared_name are
-//  assumed to possibly belong to the same batch. If left empty, the op name will
-//  be used as the shared name.
-func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"timeout_micros": timeout_micros}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unbatch",
-		Input: []tf.Input{
-			batched_tensor, batch_index, id,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
-
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
-		Input: []tf.Input{
-			shape, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that changes the batch size.
-//
-// Creates a dataset that changes the batch size of the dataset to current batch
-// size // num_workers.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	num_workers: A scalar representing the number of workers to distribute this batch across. As
-// a result of this transformation the current batch size would end up being
-// divided  by this parameter.
-//
-//
-func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalRebatchDataset",
-		Input: []tf.Input{
-			input_dataset, num_workers,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
-//
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-//
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InTopKV2",
-		Input: []tf.Input{
-			predictions, targets, k,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes rectified linear gradients for a Relu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
-//
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReluGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
-
-// EncodeBase64Pad sets the optional pad attribute to value.
-//
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
-	return func(m optionalAttr) {
-		m["pad"] = value
-	}
-}
-
-// Encode strings into web-safe base64 format.
-//
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
-//
-// Web-safe means that the encoder uses - and _ instead of + and /.
-//
-// Arguments:
-//	input: Strings to be encoded.
-//
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
-
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
-//
-// The op uses LU decomposition with partial pivoting to compute the inverses.
-//
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
-
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AdaMax algorithm.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds sparse `updates` to an existing tensor according to `indices`.
-//
-// This operation creates a new tensor by adding sparse `updates` to the passed
-// in `tensor`.
-// This operation is very similar to `tf.scatter_nd_add`, except that the updates
-// are added onto an existing tensor (as opposed to a variable). If the memory
-// for the existing tensor cannot be re-used, a copy is made and updated.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of tensor_scatter_add is to add individual elements to a
-// tensor by index. For example, say we want to add 4 elements in a rank-1
-// tensor with 8 elements.
-//
-// In Python, this scatter add operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     tensor = tf.ones([8], dtype=tf.int32)
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [1, 12, 1, 11, 10, 1, 1, 13]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// In Python, this scatter add operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     tensor = tf.ones([4, 4, 4])
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
-//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, the index is ignored.
-//
-// Arguments:
-//	tensor: Tensor to copy/update.
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//
-// Returns A new tensor copied from tensor and updates added according to the indices.
-func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorScatterAdd",
-		Input: []tf.Input{
-			tensor, indices, updates,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Divides sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] /= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] /= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterDiv",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// InfeedEnqueuePrelinearizedBufferAttr is an optional argument to InfeedEnqueuePrelinearizedBuffer.
-type InfeedEnqueuePrelinearizedBufferAttr func(optionalAttr)
-
-// InfeedEnqueuePrelinearizedBufferDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. This should be -1 when the Op is running on a TPU device
-// and = 0 when the Op is running on the CPU device.
-// If not specified, defaults to -1
-func InfeedEnqueuePrelinearizedBufferDeviceOrdinal(value int64) InfeedEnqueuePrelinearizedBufferAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// An op which enqueues prelinearized buffer into TPU infeed.
-//
-// Arguments:
-//	input: A variant tensor representing linearized output.
-//
-// Returns the created operation.
-func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional ...InfeedEnqueuePrelinearizedBufferAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InfeedEnqueuePrelinearizedBuffer",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT3D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
-
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
-
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
-//
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummary",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Worker heartbeat op.
-//
-// Heartbeats may be sent periodically to indicate the coordinator is still active,
-// to retrieve the current worker status and to expedite shutdown when necessary.
-//
-// Arguments:
-//	request: A string tensor containing a serialized WorkerHeartbeatRequest
-//
-// Returns A string tensor containing a serialized WorkerHeartbeatResponse
-func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WorkerHeartbeat",
-		Input: []tf.Input{
-			request,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters.
-type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve SGD embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm.
-func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
-//
-// For example:
-//
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
-// ```
-//
-// Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
-//
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
-		Input: []tf.Input{
-			inputs, reduction_indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-//
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
-//
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "PopulationCount",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates an Optional variant with no value.
-func OptionalNone(scope *Scope) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalNone",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
-
-// EncodePngCompression sets the optional compression attribute to value.
-//
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
-	return func(m optionalAttr) {
-		m["compression"] = value
-	}
-}
-
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
-//
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
-//
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
-//
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
-//
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodePng",
-		Input: []tf.Input{
-			image,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
-//
-// Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSlice",
-		Input: []tf.Input{
-			indices, values, shape, start, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters.
-type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve FTRL embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.
-func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingFTRLParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// A container for an iterator resource.
-//
-// Arguments:
-//	handle: A handle to the iterator to delete.
-//	deleter: A variant deleter.
-//
-// Returns the created operation.
-func DeleteIterator(scope *Scope, handle tf.Output, deleter tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeleteIterator",
-		Input: []tf.Input{
-			handle, deleter,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Quantized Batch normalization.
-//
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
-//
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
-		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations.
-//
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
-//
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
-//
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
-		Input: []tf.Input{
-			matrix, rhs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters.
-type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve ADAM embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.
-func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingADAMParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
-
-// WholeFileReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the entire contents of a file as a value.
-//
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
-//
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
-//
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug.
-type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve ADAM embedding parameters with debug support.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm.
-func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
-		Input: []tf.Input{
-			logits, num_samples, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters.
-type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve Momentum embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.
-func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingMomentumParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Deprecated. Use TensorArrayCloseV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
-//
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.
-type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load proximal Adagrad embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
-//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, accumulators, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters.
-type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve centered RMSProp embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm.
-func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
-	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
-	}
-}
-
-// Multiply matrix "a" by matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
-//
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
-		Input: []tf.Input{
-			a, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// JPEG encode input image with provided compression quality.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-// `quality` is an int32 jpeg compression quality value between 0 and 100.
-//
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	quality: An int quality to encode to.
-//
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpegVariableQuality(scope *Scope, images tf.Output, quality tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeJpegVariableQuality",
-		Input: []tf.Input{
-			images, quality,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
-type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve Momentum embedding parameters with debug support.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm.
-func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
-//
-// Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SparseSplit",
-		Input: []tf.Input{
-			split_dim, indices, values, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
-}
-
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
-
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the (key, value) element with the smallest
-//
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
-
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
-//
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
-}
-
-// Creates a dataset that batches input elements into a SparseTensor.
-//
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
-//
-func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalDenseToSparseBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
-//
-// ```
-//
-// Arguments:
-//
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Select",
-		Input: []tf.Input{
-			condition, x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters.
-type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr)
-
-// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load Momentum embedding parameters.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the Momentum optimization algorithm.
-//	momenta: Value of momenta used in the Momentum optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingMomentumParameters",
-		Input: []tf.Input{
-			parameters, momenta,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-//
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
-//
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
-//
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
-//
-// # -1 can also be used to infer the shape
-//
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
-//
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reshape",
-		Input: []tf.Input{
-			tensor, shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-//
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
-		Input: []tf.Input{
-			shape, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a tensor of zeros with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
-//
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ZerosLike",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Adds `bias` to `value`.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
-//
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BiasAdd",
-		Input: []tf.Input{
-			value, bias,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates and returns an empty tensor list.
-//
-// All list elements must be tensors of dtype element_dtype and shape compatible
-// with element_shape.
-//
-// handle: an empty tensor list.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "EmptyTensorList",
-		Input: []tf.Input{
-			element_shape, max_num_elements,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters.
-type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve proximal Adagrad embedding parameters.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.
-func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingProximalAdagradParameters",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the permuted vector/tensor in the destination data format given the
-//
-// one in the source data format.
-//
-// Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
-//
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// An op that receives embedding activations on the TPU.
-//
-// The TPU system performs the embedding lookups and aggregations specified by
-// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
-// results of these aggregations are visible to the Tensorflow Graph as the
-// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
-// one Tensor of activations per table specified in the model. There can be at
-// most one RecvTPUEmbeddingActivations op in the TPU graph.
-//
-// Arguments:
-//	num_outputs: The number of output activation tensors, equal to the number of
-// embedding tables in the model.
-//	config: Serialized TPUEmbeddingConfiguration proto.
-//
-// Returns A TensorList of embedding activations containing one Tensor per
-// embedding table in the model.
-func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config}
-	opspec := tf.OpSpec{
-		Type: "RecvTPUEmbeddingActivations",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RecvTPUEmbeddingActivations", err)
-		return
-	}
-	return outputs
-}
-
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Ceil",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// For example:
-//
-// ```
-// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
-// tf.segment_max(c, tf.constant([0, 0, 1]))
-// # ==> [[4, 3, 3, 4],
-// #      [5, 6, 7, 8]]
-// ```
-//
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-//
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Svd",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Conj",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
-//
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
-		Input: []tf.Input{
-			images, scale,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// An op enabling differentiation of TPU Embeddings.
-//
-// This op simply returns its first input, which is assumed to have been sliced
-// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
-// this op, and its first argument being a trainable Variable, enables automatic
-// differentiation of graphs containing embeddings via the TPU Embedding Python
-// libraries.
-//
-// Arguments:
-//	embedding_variable: A trainable variable, enabling optimizers to find this op.
-//	sliced_activations: The embedding activations Tensor to return.
-//	table_id: The id of the table in the embedding layer configuration from which
-// these activations were computed.
-//	lookup_id: Identifier of the set of embedding indices which produced these
-// activations.
-func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id}
-	opspec := tf.OpSpec{
-		Type: "TPUEmbeddingActivations",
-		Input: []tf.Input{
-			embedding_variable, sliced_activations,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Performs gradient updates of embedding tables.
-//
-// Arguments:
-//	inputs: A TensorList of gradients with which to update embedding tables.
-// This argument has the same length and shapes as the return value of
-// RecvTPUEmbeddingActivations, but contains gradients of the model's loss
-// with respect to the embedding activations. The embedding tables are updated
-// from these gradients via the optimizer specified in the TPU embedding
-// configuration given to tpu.initialize_system.
-//	learning_rates: A TensorList of float32 scalars, one for each dynamic learning
-// rate tag: see the comments in
-// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
-// Multiple tables can share the same dynamic learning rate tag as specified
-// in the configuration. If the learning rates for all tables are constant,
-// this list should be empty.
-//	config: Serialized TPUEmbeddingConfiguration proto.
-//
-// Returns the created operation.
-func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"config": config}
-	opspec := tf.OpSpec{
-		Type: "SendTPUEmbeddingGradients",
-		Input: []tf.Input{
-			tf.OutputList(inputs), tf.OutputList(learning_rates),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LeakyReluAttr is an optional argument to LeakyRelu.
-type LeakyReluAttr func(optionalAttr)
-
-// LeakyReluAlpha sets the optional alpha attribute to value.
-// If not specified, defaults to 0.2
-func LeakyReluAlpha(value float32) LeakyReluAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// Computes rectified linear: `max(features, features * alpha)`.
-func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LeakyRelu",
-		Input: []tf.Input{
-			features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
-//
-// if hashed_output=true then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
-//
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
-//
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
-	opspec := tf.OpSpec{
-		Type: "SparseCross",
-		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
-//
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
-//
-// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// output=SparseTensor(indices=sparse_indices, values=sparse_values,
-//                     dense_shape=sparse_dense_shape)
-//
-// Arguments:
-//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
-//
-// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
-func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RaggedTensorToSparse",
-		Input: []tf.Input{
-			tf.OutputList(rt_nested_splits), rt_dense_values,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch.
-type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr)
-
-// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. Should be >= 0 and less than the number
-// of TPU cores in the task on which the node is placed.
-// If not specified, defaults to -1
-func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// An op that enqueues a list of input batch tensors to TPUEmbedding.
-//
-// Arguments:
-//	batch: A list of 1D tensors, one for each embedding table, containing the
-// indices into the tables.
-//	mode_override: A string input that overrides the mode specified in the
-// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-//
-// Returns the created operation.
-func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EnqueueTPUEmbeddingIntegerBatch",
-		Input: []tf.Input{
-			tf.OutputList(batch), mode_override,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch.
-type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr)
-
-// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. Should be >= 0 and less than the number
-// of TPU cores in the task on which the node is placed.
-// If not specified, defaults to -1
-func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value.
-//
-// value: A list of string scalars, one for each embedding table that specify
-// how to normalize the embedding activations after weighted summation.
-// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-// all tables.
-// If not specified, defaults to <>
-func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr {
-	return func(m optionalAttr) {
-		m["combiners"] = value
-	}
-}
-
-// An op that enqueues TPUEmbedding input indices from a SparseTensor.
-//
-// This Op eases the porting of code that uses embedding_lookup_sparse(),
-// although some Python preprocessing of the SparseTensor arguments to
-// embedding_lookup_sparse() is required to produce the arguments to this Op,
-// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
-// step.
-//
-// The tensors at corresponding positions in the three input lists
-// must have the same shape, i.e. rank 1 with dim_size() equal to the total
-// number of lookups into the table described by the corresponding table_id.
-//
-// Arguments:
-//	sample_indices: A list of rank 1 Tensors specifying the training example and
-// feature to which the corresponding embedding_indices and aggregation_weights
-// values belong. sample_indices[i] must equal b * nf + f, where nf is the
-// number of features from the corresponding table, f is in [0, nf), and
-// b is in [0, batch size).
-//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-//	aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
-// (training example, feature) -- aggregation weights.
-//	mode_override: A string input that overrides the mode specified in the
-// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-//
-// Returns the created operation.
-func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EnqueueTPUEmbeddingSparseBatch",
-		Input: []tf.Input{
-			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
-//
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["bad_color"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-//
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ImageSummary",
-		Input: []tf.Input{
-			tag, tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
-
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
-//
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
-//
-// and
-//
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
-//
-// then the final `SparseTensor` will be:
-//
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
-//
-// Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
-//
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
-		Input: []tf.Input{
-			sparse_handles,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
-//
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
-//
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
-	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
-}
-
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
-	}
-}
-
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
-//
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
-//
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
-		Input: []tf.Input{
-			contents, crop_window,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution. The parameters may each be a
-//
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
-//
-// Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
-//
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
-		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
-//
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
-//
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
-//
-// Arguments:
-//	record_bytes: Number of bytes in the record.
-//
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeArea",
-		Input: []tf.Input{
-			images, size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
-//
-//
-//
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
-	}
-	return components
-}
-
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReorder",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
-//
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
-//
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-//
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-//
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
-//
-// Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
-//
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
-		Input: []tf.Input{
-			values, value_range, nbins,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that overrides the maximum intra-op parallelism.
-//
-// Arguments:
-//
-//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
-//
-//
-func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalMaxIntraOpParallelismDataset",
-		Input: []tf.Input{
-			input_dataset, max_intra_op_parallelism,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// L2 Loss.
-//
-// Computes half the L2 norm of a tensor without the `sqrt`:
-//
-//     output = sum(t ** 2) / 2
-//
-// Arguments:
-//	t: Typically 2-D, but may have any dimensions.
-//
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "L2Loss",
-		Input: []tf.Input{
-			t,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// An Op to permute tensors across replicated TPU instances.
-//
-// Each instance supplies its own input.
-//
-// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
-// `[D, A, B, C]`.
-//
-// Arguments:
-//	input: The local input to be permuted. Currently only supports float and
-// bfloat16.
-//	source_target_pairs: A tensor with shape [num_pairs, 2].
-//
-// Returns The permuted input.
-func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CollectivePermute",
-		Input: []tf.Input{
-			input, source_target_pairs,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
-//
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple.
-type OutfeedDequeueTupleAttr func(optionalAttr)
-
-// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
-//
-// value: The TPU device to use. This should be -1 when the Op
-// is running on a TPU device, and >= 0 when the Op is running on the CPU
-// device.
-// If not specified, defaults to -1
-func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr {
-	return func(m optionalAttr) {
-		m["device_ordinal"] = value
-	}
-}
-
-// Retrieve multiple values from the computation outfeed.
-//
-// This operation will block indefinitely until data is available. Output `i`
-// corresponds to XLA tuple element `i`.
-//
-// Arguments:
-//	dtypes: The element types of each element in `outputs`.
-//	shapes: The shapes of each tensor in `outputs`.
-//
-// Returns A list of tensors that will be read from the outfeed.
-func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OutfeedDequeueTuple",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("OutfeedDequeueTuple", err)
-		return
-	}
-	return outputs
-}
-
-// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU.
-type ConfigureDistributedTPUAttr func(optionalAttr)
-
-// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value.
-//
-// value: Reserved. Do not use.
-// If not specified, defaults to ""
-func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
-	return func(m optionalAttr) {
-		m["embedding_config"] = value
-	}
-}
-
-// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value.
-//
-// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
-// describes the embedding lookups of the program.
-// If not specified, defaults to ""
-func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
-	return func(m optionalAttr) {
-		m["tpu_embedding_config"] = value
-	}
-}
-
-// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value.
-//
-// value: Reserved. Do not use.
-// If not specified, defaults to false
-func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr {
-	return func(m optionalAttr) {
-		m["is_global_init"] = value
-	}
-}
-
-// Sets up the centralized structures for a distributed TPU system.
-//
-// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU
-// topology.
-func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ConfigureDistributedTPU",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
-//
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
-//
-// Arguments:
-//
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
-//
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
-		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
-//
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
-	opspec := tf.OpSpec{
-		Type: "Bucketize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
-
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from the Gamma distribution(s) described by alpha.
-//
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
-//
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
-//
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomGamma",
-		Input: []tf.Input{
-			shape, alpha,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
-//
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
-//
-// Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
-		Input: []tf.Input{
-			shape, seed, minval, maxval,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gives a guarantee to the TF runtime that the input tensor is a constant.
-//
-// The runtime is then free to make optimizations based on this.
-//
-// Only accepts value typed tensors as inputs and rejects resource variable handles
-// as input.
-//
-// Returns the input tensor without modification.
-func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GuaranteeConst",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Resizes the list.
-//
-//
-// input_handle: the input list
-// size: size of the output list
-//
-func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListResize",
-		Input: []tf.Input{
-			input_handle, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BatchMatMulV2Attr is an optional argument to BatchMatMulV2.
-type BatchMatMulV2Attr func(optionalAttr)
-
-// BatchMatMulV2AdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulV2AdjX(value bool) BatchMatMulV2Attr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
-
-// BatchMatMulV2AdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulV2AdjY(value bool) BatchMatMulV2Attr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
-	}
-}
-
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// *NOTE*: `BatchMatMulV2` supports broadcasting in the batch dimensions. More
-// about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
-//
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
-//
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMulV2(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BatchMatMulV2",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
-//
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//
-//
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
-		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug.
-type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
-
-// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Load FTRL embedding parameters with debug support.
-//
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
-//
-// Arguments:
-//	parameters: Value of parameters used in the FTRL optimization algorithm.
-//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
-//	linears: Value of linears used in the FTRL optimization algorithm.
-//	gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm.
-//
-//
-//
-// Returns the created operation.
-func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug",
-		Input: []tf.Input{
-			parameters, accumulators, linears, gradient_accumulators,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Delete the stack from its resource container.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2.
-type StatefulStandardNormalV2Attr func(optionalAttr)
-
-// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	algorithm: The RNG algorithm.
-//	shape: The shape of the output tensor.
-//
-// Returns A tensor of the specified shape filled with random normal values.
-func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatefulStandardNormalV2",
-		Input: []tf.Input{
-			resource, algorithm, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
-//
-// Arguments:
-//
-//	thread_pool: A resource produced by the ThreadPoolHandle op.
-//
-//
-func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolDataset",
-		Input: []tf.Input{
-			input_dataset, thread_pool,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
-//
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
-		Input: []tf.Input{
-			input, pattern,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
-//
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pattern": pattern}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
-//
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringFormatAttr is an optional argument to StringFormat.
-type StringFormatAttr func(optionalAttr)
-
-// StringFormatTemplate sets the optional template attribute to value.
-//
-// value: A string, the template to format tensor summaries into.
-// If not specified, defaults to "%s"
-func StringFormatTemplate(value string) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["template"] = value
-	}
-}
-
-// StringFormatPlaceholder sets the optional placeholder attribute to value.
-//
-// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
-// If not specified, defaults to "%s"
-func StringFormatPlaceholder(value string) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["placeholder"] = value
-	}
-}
-
-// StringFormatSummarize sets the optional summarize attribute to value.
-//
-// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
-// If not specified, defaults to 3
-func StringFormatSummarize(value int64) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Formats a string template using a list of tensors.
-//
-// Formats a string template using a list of tensors, pretty-printing tensor summaries.
-//
-// Arguments:
-//	inputs: The list of tensors to format into the placeholder string.
-//
-// Returns = The resulting string scalar.
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringFormat",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a copy of the input tensor.
-func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Snapshot",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
-//
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
-		Input: []tf.Input{
-			shape, minval, maxval,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
-//
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorListConcatAttr is an optional argument to TensorListConcat.
-type TensorListConcatAttr func(optionalAttr)
-
-// TensorListConcatElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// Concats all tensors in the list along the 0th dimension.
-//
-// Requires that all tensors have the same shape except the first dimension.
-//
-// input_handle: The input list.
-// tensor: The concated result.
-// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
-//
-func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListConcat",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
-
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// ResizeBilinearGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
-// If not specified, defaults to false
-func ResizeBilinearGradHalfPixelCenters(value bool) ResizeBilinearGradAttr {
-	return func(m optionalAttr) {
-		m["half_pixel_centers"] = value
-	}
-}
-
-// Computes the gradient of bilinear interpolation.
-//
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
-//
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
-		Input: []tf.Input{
-			grads, original_image,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
-}
-
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
-//
-// Arguments:
-//	handle: The handle to a queue.
-//
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a TensorList which, when stacked, has the value of `tensor`.
-//
-// Each tensor in the result list corresponds to one row of the input tensor.
-//
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
-		Input: []tf.Input{
-			tensor, element_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Determine the script codes of a given tensor of Unicode integer code points.
-//
-// This operation converts Unicode code points to script codes corresponding to
-// each code point. Script codes correspond to International Components for
-// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
-// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
-// match input shape.
-//
-// Arguments:
-//	input: A Tensor of int32 Unicode code points.
-//
-// Returns A Tensor of int32 script codes corresponding to each input code point.
-func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeScript",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
-type UnicodeTranscodeAttr func(optionalAttr)
-
-// UnicodeTranscodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-//
-// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
-// as ' ', will preserve string alignment to the source since invalid bytes will be
-// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
-// replacement character will preserve byte alignment to the source.
-// If not specified, defaults to 65533
-func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
-
-// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
-	}
-}
-
-// Transcode the input text from a source encoding to a destination encoding.
-//
-// The input is a string tensor of any shape. The output is a string tensor of
-// the same shape containing the transcoded strings. Output strings are always
-// valid unicode. If the input contains invalid encoding positions, the
-// `errors` attribute sets the policy for how to deal with them. If the default
-// error-handling policy is used, invalid formatting will be substituted in the
-// output by the `replacement_char`. If the errors policy is to `ignore`, any
-// invalid encoding positions in the input are skipped and not included in the
-// output. If it set to `strict` then any invalid formatting will result in an
-// InvalidArgument error.
-//
-// This operation can be used with `output_encoding = input_encoding` to enforce
-// correct formatting for inputs even if they are already in the desired encoding.
-//
-// If the input is prefixed by a Byte Order Mark needed to determine encoding
-// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
-// BOM will be consumed and not emitted into the output. If the input encoding
-// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
-// interpreted as a non-breaking-space and is preserved in the output (including
-// always for UTF-8).
-//
-// The end result is that if the input is marked as an explicit endianness the
-// transcoding is faithful to all codepoints in the source. If it is not marked
-// with an explicit endianness, the BOM is not considered part of the string itself
-// but as metadata, and so is not preserved in the output.
-//
-// Arguments:
-//	input: The text to be processed. Can have any shape.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//	output_encoding: The unicode encoding to use in the output. Must be one of
-// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
-//
-// Returns A string tensor containing unicode text encoded using `output_encoding`.
-func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeTranscode",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
-//
-// Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
-//
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "FakeParam",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
-
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-//
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
-		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Inv",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
-
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
-//
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
-//
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
-	}
-}
-
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["max_load_factor"] = value
-	}
-}
-
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//
-//	value_dtype: Type of the table values.
-//
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
-		Input: []tf.Input{
-			empty_key, deleted_key,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
-
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TopKV2",
-		Input: []tf.Input{
-			input, k,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
-type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
-
-// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
-
-// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
-	}
-}
-
-// Decodes each string in `input` into a sequence of Unicode code points.
-//
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
-// Similarly, the character start byte offsets are returned using a single vector
-// `char_to_byte_starts`, with strings expanded in row-major order.
-//
-// The `row_splits` tensor indicates where the codepoints and start offsets for
-// each input string begin and end within the `char_values` and
-// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
-//
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
-//
-// Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
-// character in `char_values` starts.
-func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeDecodeWithOffsets",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
-
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
-	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
-	}
-}
-
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
-		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
-//
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
-//
-// Our Conv3D implements a form of cross-correlation.
-//
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3D",
-		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
-//
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
-		Input: []tf.Input{
-			handle, tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // SubstrAttr is an optional argument to Substr.
 type SubstrAttr func(optionalAttr)
 
@@ -27730,374 +28329,289 @@
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Determine the script codes of a given tensor of Unicode integer code points.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NotEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// This operation converts Unicode code points to script codes corresponding to
+// each code point. Script codes correspond to International Components for
+// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+// match input shape.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	input: A Tensor of int32 Unicode code points.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
-		Input: []tf.Input{
-			t, m, v, gamma, backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// Converts the given `resource_handle` representing an iterator to a string.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Returns A Tensor of int32 script codes corresponding to each input code point.
+func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "UnicodeScript",
 		Input: []tf.Input{
-			resource_handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
-//
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	algorithm: The RNG algorithm.
-//	shape: The shape of the output tensor.
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
-//
-// Returns Random values with specified shape.
-func StatefulUniformInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatefulUniformInt",
-		Input: []tf.Input{
-			resource, algorithm, shape, minval, maxval,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
 
-// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.
-type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
+// UnicodeDecodeErrors sets the optional errors attribute to value.
 //
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce an InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
 	return func(m optionalAttr) {
-		m["table_id"] = value
+		m["errors"] = value
 	}
 }
 
-// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default Unicode replacement character,
+// 0xFFFD (U+FFFD, decimal 65533).
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
 	return func(m optionalAttr) {
-		m["table_name"] = value
+		m["replacement_char"] = value
 	}
 }
 
-// Retrieve Adadelta embedding parameters with debug support.
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
 //
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// UnicodeDecodeTsplits sets the optional Tsplits attribute to value.
+// If not specified, defaults to DT_INT64
+func UnicodeDecodeTsplits(value tf.DataType) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["Tsplits"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
 //
-// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
-func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output) {
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns A 1D int32 tensor containing the row splits. A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug",
-
+		Type: "UnicodeDecode",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
-//
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
-//
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
-//
-// Arguments:
-//	mutex: The mutex resource to lock.
-//
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MutexLock",
-		Input: []tf.Input{
-			mutex,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserializes a proto into the tree handle
-//
-// Arguments:
-//	tree_handle: Handle to the tree resource to be restored.
-//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
-//
-// Returns the created operation.
-func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeDeserialize",
-		Input: []tf.Input{
-			tree_handle, tree_config,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
-//
-// Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
-//
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
-		Input: []tf.Input{
-			reverse_index_map, grad_values,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize.
-type QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr func(optionalAttr)
-
-// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType sets the optional out_type attribute to value.
+// Subtracts a value from the current value of a variable.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_QUINT8
-func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr {
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be decremented.
+//
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignSubVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// value: List of dilation values.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the determinant of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order is:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr {
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes quantized depthwise Conv2D with Bias, Relu and Requantize.
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	input: The original input tensor.
-//	filter: The original filter tensor.
-//	bias: The original bias tensor.
-//	min_input: The float value that the minimum quantized input value represents.
-//	max_input: The float value that the maximum quantized input value represents.
-//	min_filter: The float value that the minimum quantized filter value represents.
-//	max_filter: The float value that the maximum quantized filter value represents.
-//	min_freezed_output: The minimum float value of the output tensor.
-//	max_freezed_output: The maximum float value of the output tensor.
-//	strides: List of stride values.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-//
-// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, min_freezed_output tf.Output, max_freezed_output tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28106,34 +28620,173 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			input, filter, bias, min_input, max_input, min_filter, max_filter, min_freezed_output, max_freezed_output,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters.
+type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve ADAM embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the ADAM optimization algorithm. Parameter momenta updated by the ADAM optimization algorithm. Parameter velocities updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingADAMParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
+// Computes the sum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// # ==> [[5, 5, 5, 5],
+// #      [5, 6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentSum",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// Gets the next output from the given iterator.
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
+	}
+	return components
+}
+
+// Computes rectified linear gradients for a Relu operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
+//
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReluGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
+
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
 //
 // value: An optional bool. Defaults to True. If True, the assignment will
 // be protected by a lock; otherwise the behavior is undefined,
 // but may exhibit less contention.
 // If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
+// Applies sparse subtraction to individual values or slices in a Variable.
 //
 // `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
@@ -28147,24 +28800,24 @@
 // `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 // ```
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
 //
 // ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
 // ```
 //
 // The resulting update to ref would look like this:
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+//     [1, -9, 3, -6, -4, 6, 7, -4]
 //
 // See `tf.scatter_nd` for more details about how to make updates to
 // slices.
@@ -28173,11 +28826,11 @@
 //	ref: A resource handle. Must be from a VarHandleOp.
 //	indices: A Tensor. Must be one of the following types: int32, int64.
 // A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+//	updates: A Tensor. Must have the same type as ref. A tensor of
 // values to add to ref.
 //
 // Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28186,7 +28839,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "ResourceScatterNdSub",
 		Input: []tf.Input{
 			ref, indices, updates,
 		},
@@ -28195,6 +28848,436 @@
 	return scope.AddOperation(opspec)
 }
 
+// Computes square root of x element-wise.
+//
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that batches input elements into a SparseTensor.
+//
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
+//
+//
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDenseToSparseBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, row_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters.
+type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Adagrad optimization algorithm. Parameter accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
+//
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softmax",
+		Input: []tf.Input{
+			logits,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 3D real-valued fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+//
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translations or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather` operation.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppression",
+		Input: []tf.Input{
+			boxes, scores, max_output_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for the scaled exponential linear (Selu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
+//
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SeluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a TensorArray for storing multiple gradients of values in the given handle.
+//
+// Similar to TensorArrayGradV3. However it creates an accumulator with an
+// expanded shape compared to the input TensorArray whose gradient is being
+// computed. This enables multiple gradients for the same TensorArray to be
+// calculated using the same accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
+// have shape which is this shape_to_prepend value concatenated with shape of the
+// elements in the TensorArray corresponding to the input handle.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradWithShape",
+		Input: []tf.Input{
+			handle, flow_in, shape_to_prepend,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes softplus gradients for a softplus operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
+//
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftplusGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // 2D real-valued fast Fourier transform.
 //
 // Computes the 2-dimensional discrete Fourier transform of a real-valued signal
@@ -28235,848 +29318,64 @@
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
+// Check if the input matches the regex pattern.
 //
-// to zero.
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "RegexFullMatch",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			input, pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters.
+type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
-//
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
-
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
-
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Usage:
-//   ```python
-//   import tensorflow as tf
-//   a = [1, 10, 26.9, 2.8, 166.32, 62.3]
-//   b = tf.math.argmin(input = a)
-//   c = tf.keras.backend.eval(b)
-//   # c = 0
-//   # here a[0] = 1 which is the smallest element of a across axis 0
-//   ```
-//
-// Arguments:
-//
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ArgMin",
-		Input: []tf.Input{
-			input, dimension,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
-// for an explanation of segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT3D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
-		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
-
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
-//
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
-// If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
-	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
-	}
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
-//
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Produces the max pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-//
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
-//
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
-//
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
-		Input: []tf.Input{
-			shape, rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the trignometric inverse sine of x element-wise.
-//
-// The `tf.math.asin` operation returns the inverse of `tf.math.sin`, such that
-// if `y = tf.math.sin(x)` then, `x = tf.math.asin(y)`.
-//
-// **Note**: The output of `tf.math.asin` will lie within the invertible range
-// of sine, i.e [-pi/2, pi/2].
-//
-// For example:
-//
-// ```python
-// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)]
-// x = tf.constant([1.047, 0.785])
-// y = tf.math.sin(x) # [0.8659266, 0.7068252]
-//
-// tf.math.asin(y) # [1.047, 0.785] = x
-// ```
-//
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x / y element-wise.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Div",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
-//
-// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
-//
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceDeserialize",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the determinant of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.
-type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
 //
 // REQUIRES: value >= -1
-func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
 	return func(m optionalAttr) {
 		m["table_id"] = value
 	}
 }
 
-// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
 	return func(m optionalAttr) {
 		m["table_name"] = value
 	}
 }
 
-// Retrieve proximal Adagrad embedding parameters with debug support.
+// Retrieve SGD embedding parameters.
 //
 // An op that retrieves optimization parameters from embedding to host
 // memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
 // the correct embedding table configuration. For example, this op is
 // used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
-func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29085,151 +29384,629 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+		Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters",
 
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
+
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global (that is, all matches of the `pattern` regular
+// expression in each input string are rewritten), otherwise the `rewrite`
+// substitution is only made for the first `pattern` match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces matches of the `pattern` regular expression in `input` with the
+// replacement string provided in `rewrite`.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to be matched in the `input` strings.
+//	rewrite: The rewrite string to be substituted for the `pattern` expression where it is
+// matched in the `input` strings.
+//
+// Returns The text after applying pattern match and rewrite substitution.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RegexReplace",
+		Input: []tf.Input{
+			input, pattern, rewrite,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in the dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
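+//	momentum: Multiplier applied to the previous mom value in the update rule above. Presumably a scalar like lr and rho (TODO confirm).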
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
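+//	set_operation: Name of the set operation to apply; see SetOperationOp::SetOperationFromContext for the accepted values.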
+//
+// Returns 2D indices of a `SparseTensor`. 1D values of a `SparseTensor`. 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DenseToSparseSetOperation",
+		Input: []tf.Input{
+			set1, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Solves tridiagonal systems of equations.
+// Converts one or more images from RGB to HSV.
 //
-// `diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
-// represent matrices with three rows being the superdiagonal, diagonals, and
-// subdiagonals, in order. The last element of the superdiagonal and the first
-// element of the subdiagonal is ignored.
-// `rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
-// each left-hand side.
-// The output is a tensor of shape `[..., M, K]` containing the solutions.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the values in `images`
+// are in `[0,1]`.
+//
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	diagonals: Shape is `[..., 3, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns Shape is `[..., M, K]`.
-func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TridiagonalSolve",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			diagonals, rhs,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedConv2DPerChannelAttr is an optional argument to QuantizedConv2DPerChannel.
-type QuantizedConv2DPerChannelAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// QuantizedConv2DPerChannelOutType sets the optional out_type attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: The quantized type of output tensor that needs to be converted.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChannelAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value.
-//
-// value: list of dilation values.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr {
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["update_slots"] = value
 	}
 }
 
-// Computes QuantizedConv2D per channel.
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+//
+// That is, for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	input: The original input tensor.
-//	filter: The original filter tensor.
-//	min_input: The minimum value of the input tensor
-//	max_input: The maximum value of the input tensor.
-//	min_filter: The minimum value of the filter tensor.
-//	max_filter: The maximum value of the filter tensor.
-//	strides: list of stride values.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-//
-// Returns The output tensor.The minimum value of the final output tensor.The maximum value of the final output tensor.
-func QuantizedConv2DPerChannel(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DPerChannelAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2DPerChannel",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Eagerly executes a Python function to compute func(input)->output.
+//
+// The semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) * sign(m)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAddSign",
+		Input: []tf.Input{
+			var_, m, lr, alpha, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a quantized tensor as per the Reshape op.
+//
+//
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns the reshaped `output` tensor; `output_min`, which is copied from input_min; and `output_max`, which is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedReshape",
+		Input: []tf.Input{
+			tensor, shape, input_min, input_max,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QuantizedDepthwiseConv2DAttr is an optional argument to QuantizedDepthwiseConv2D.
-type QuantizedDepthwiseConv2DAttr func(optionalAttr)
-
-// QuantizedDepthwiseConv2DOutType sets the optional out_type attribute to value.
+// Returns x * y element-wise. Returns zero if y is zero, even if x is infinite or NaN.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_QINT32
-func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2DAttr {
+// *NOTE*: `MulNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MulNoNan",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes a range that covers the actual values present in a quantized tensor.
+//
+// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+// range that covers the actual values present in that tensor. This op is typically
+// used to produce the `requested_output_min` and `requested_output_max` for
+// `Requantize`.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns the computed min output (`output_min`) and the computed max output (`output_max`).
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RequantizationRange",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// value: List of dilation values.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes quantized depthwise Conv2D.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	input: The original input tensor.
-//	filter: The original filter tensor.
-//	min_input: The float value that the minimum quantized input value represents.
-//	max_input: The float value that the maximum quantized input value represents.
-//	min_filter: The float value that the minimum quantized filter value represents.
-//	max_filter: The float value that the maximum quantized filter value represents.
-//	strides: List of stride values.
+// For example:
 //
-//
-// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedDepthwiseConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedDepthwiseConv2D",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
+}
+
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Resize `images` to `size` using nearest neighbor interpolation.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeNearestNeighbor",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+//
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
+}
+
+// Loads a 2-D (matrix) `Tensor` named `old_tensor_name` from a checkpoint.
+//
+// The tensor is read from the checkpoint at `ckpt_path`, and its rows and columns
+// are potentially reordered using the specified remappings.
+//
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
+//
+// Arguments:
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadAndRemapMatrix",
+		Input: []tf.Input{
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // QuantizedDepthwiseConv2DWithBiasAndReluAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndRelu.
@@ -29288,23 +30065,6 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
-//
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Outputs a tensor containing the reduction across all input tensors.
 //
 // Outputs a tensor containing the reduction across all input tensors passed to ops
@@ -29335,62 +30095,53 @@
 	return op.Output(0)
 }
 
-// Reduces `input` from `num_devices` using `reduction` to a single device.
-//
-// Reduces `input` from `num_devices` using `reduction` to a single device.
-//
-// The graph should be constructed so that all inputs have a valid device
-// assignment, and the op itself is assigned one of these devices.
-//
-// input: The input to the reduction.
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"reduction": reduction}
-	opspec := tf.OpSpec{
-		Type: "NcclReduce",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// PrelinearizeAttr is an optional argument to Prelinearize.
-type PrelinearizeAttr func(optionalAttr)
-
-// PrelinearizeShape sets the optional shape attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: The shape of the tensor.
-// If not specified, defaults to <>
-func PrelinearizeShape(value tf.Shape) PrelinearizeAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["shape"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// PrelinearizeLayout sets the optional layout attribute to value.
+// Converts a sparse representation into a dense tensor.
 //
-// value: A vector holding the requested layout in minor-to-major sequence. If a layout
-// attribute is passed but its values are all -1 the layout will be computed by
-// the infeed operation.
-// If not specified, defaults to <>
-func PrelinearizeLayout(value []int64) PrelinearizeAttr {
-	return func(m optionalAttr) {
-		m["layout"] = value
-	}
-}
-
-// An op which linearizes one Tensor value to an opaque variant tensor.
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input: A tensor that will be linearized.
-func Prelinearize(scope *Scope, input tf.Output, optional ...PrelinearizeAttr) (output tf.Output) {
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29399,9 +30150,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prelinearize",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
@@ -29409,54 +30160,68 @@
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// N is the size of the segment being reduced.
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// See `tf.sparse.segment_sum` for usage examples.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
 // Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// List of the given size with empty elements.
-//
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
-		Input: []tf.Input{
-			element_shape, num_elements,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Sends `input` to all devices that are connected to the output.
@@ -29486,901 +30251,6 @@
 	return op.Output(0)
 }
 
-// PlaceholderAttr is an optional argument to Placeholder.
-type PlaceholderAttr func(optionalAttr)
-
-// PlaceholderShape sets the optional shape attribute to value.
-//
-// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
-// shape is unconstrained.
-// If not specified, defaults to <unknown_rank:true >
-func PlaceholderShape(value tf.Shape) PlaceholderAttr {
-	return func(m optionalAttr) {
-		m["shape"] = value
-	}
-}
-
-// A placeholder op for a value that will be fed into the computation.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Placeholder",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Scatters tensor at indices in an input list.
-//
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
-//
-// input_handle: The list to scatter into.
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// output_handle: The TensorList.
-func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListScatterIntoExistingList",
-		Input: []tf.Input{
-			input_handle, tensor, indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Add all input tensors element wise.
-//
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddN",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-//
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
-//
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
-
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RequantizePerChannelAttr is an optional argument to RequantizePerChannel.
-type RequantizePerChannelAttr func(optionalAttr)
-
-// RequantizePerChannelOutType sets the optional out_type attribute to value.
-//
-// value: The quantized type of output tensor that needs to be converted.
-// If not specified, defaults to DT_QUINT8
-func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Requantizes input with min and max values known per channel.
-//
-// Arguments:
-//	input: The original input tensor.
-//	input_min: The minimum value of the input tensor
-//	input_max: The maximum value of the input tensor.
-//	requested_output_min: The minimum value of the output tensor requested.
-//	requested_output_max: The maximum value of the output tensor requested.
-//
-// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor.
-func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RequantizePerChannel",
-		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
-
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
-	}
-}
-
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
-//
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Produces a summary of any statistics recorded by the given statistics manager.
-func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorSummary",
-		Input: []tf.Input{
-			iterator,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
-
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
-//
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// Computes the absolute value of a tensor.
-//
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns true if and only if the given Optional variant has a value.
-func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalHasValue",
-		Input: []tf.Input{
-			optional,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes square of x element-wise.
-//
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Square",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes numerical negative value element-wise.
-//
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
-}
-
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
-
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8: out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-//
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-//
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-//
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-//
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-//
-// ```c++
-// result = round(input * s)
-// ```
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
-// Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
-		Input: []tf.Input{
-			input, min_range, max_range,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ZipDataset",
-		Input: []tf.Input{
-			tf.OutputList(input_datasets),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reciprocal",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the sqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets the next output from the given iterator.
-//
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
-		Input: []tf.Input{
-			iterator,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
-	}
-	return components
-}
-
-// Computes reciprocal of square root of x element-wise.
-//
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rsqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the number of work units this Reader has finished processing.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// A Reader that outputs the records from a TensorFlow Records file.
-//
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the rsqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch.
 type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr)
 
@@ -30467,155 +30337,217 @@
 	return scope.AddOperation(opspec)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// Computes exponential of x - 1 element-wise.
-//
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
-
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["Tout"] = value
 	}
 }
 
-// MaxPoolWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
+// Returns the imaginary part of a complex number.
 //
-// value: Whether to include batch dimension in flattened index of `argmax`.
-// If not specified, defaults to false
-func MaxPoolWithArgmaxIncludeBatchInIndex(value bool) MaxPoolWithArgmaxAttr {
-	return func(m optionalAttr) {
-		m["include_batch_in_index"] = value
-	}
-}
-
-// Performs max pooling on the input and outputs both max values and indices.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index:
-// `(y * width + x) * channels + c` if `include_batch_in_index` is False;
-// `((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True.
+// For example:
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "Imag",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchMatMul",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+//
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
+//
+// Arguments:
+//	bytes: All the elements must have the same length.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeRaw",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters.
+type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve proximal Adagrad embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingProximalAdagradParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Strip leading and trailing whitespaces from the Tensor.
+// Computes the absolute value of a tensor.
 //
-// Arguments:
-//	input: A string `Tensor` of any shape.
-//
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Abs",
 		Input: []tf.Input{
 			x,
 		},
@@ -30624,205 +30556,35 @@
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of the sigmoid of `x` wrt its input.
-//
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Tout"] = value
 	}
 }
 
-// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
+// Computes the complex absolute value of a tensor.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-// ```
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			input, filter,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -30830,30 +30592,32 @@
 	return op.Output(0)
 }
 
-// Computes the trignometric inverse tangent of x element-wise.
+// Computes numerical negative value element-wise.
 //
-// The `tf.math.atan` operation returns the inverse of `tf.math.tan`, such that
-// if `y = tf.math.tan(x)` then, `x = tf.math.atan(y)`.
-//
-// **Note**: The output of `tf.math.atan` will lie within the invertible range
-// of tan, i.e (-pi/2, pi/2).
-//
-// For example:
-//
-// ```python
-// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)]
-// x = tf.constant([1.047, 0.785])
-// y = tf.math.tan(x) # [1.731261, 0.99920404]
-//
-// tf.math.atan(y) # [1.047, 0.785] = x
-// ```
-//
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "Neg",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the reciprocal of x element-wise.
+//
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Inv",
 		Input: []tf.Input{
 			x,
 		},
@@ -30868,1642 +30632,22 @@
 // [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
-//
-// For example:
-//
-// ``` python
-// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
-// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
-// # ==> [[ 4,  3, 3, 4],
-// #       [5,  6, 7, 8]]
-// ```
-//
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
-
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the Bessel i0e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i0(x)`.
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI0e",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Makes a copy of `x`.
-//
-// Arguments:
-//	x: The source tensor of type `T`.
-//
-// Returns     y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
-//       is not an alias of `x`.
-func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeepCopy",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns which elements of x are NaN.
-//
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsNan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns which elements of x are finite.
-//
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsFinite",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
-//
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
-//
-// For example:
-//
-// ``` python
-// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
-// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
-// # ==> [[ 4,  6, 6, 4],
-// #       [5,  6, 7, 8]]
-// ```
-//
-// If there is no entry for a given segment ID `i`, it outputs 1.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise integer closest to x.
-//
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
-//
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rint",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a tree resource and returns a handle to it.
-//
-// Arguments:
-//	tree_handle: Handle to the tree resource to be created.
-//	tree_config: Serialized proto string of the boosted_trees.Tree.
-//
-// Returns the created operation.
-func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestCreateTreeVariable",
-		Input: []tf.Input{
-			tree_handle, tree_config,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
-//
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
-	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
-	}
-}
-
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-//
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
-//
-// Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
-//
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
-		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
-//
-// Arguments:
-//	input: Base64 strings to decode.
-//
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x - y element-wise.
-//
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x * y element-wise.
-//
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN.
-//
-// *NOTE*: `Mul` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MulNoNan",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns 0 if the denominator is zero.
-//
-//
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DivNoNan",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LuAttr is an optional argument to Lu.
-type LuAttr func(optionalAttr)
-
-// LuOutputIdxType sets the optional output_idx_type attribute to value.
-// If not specified, defaults to DT_INT32
-func LuOutputIdxType(value tf.DataType) LuAttr {
-	return func(m optionalAttr) {
-		m["output_idx_type"] = value
-	}
-}
-
-// Computes the LU decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be invertible.
-//
-// The output consists of two tensors LU and P containing the LU decomposition
-// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
-// upper triangular factors.
-//
-// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
-// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
-// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
-// entries correspond to the upper triangular part, including the diagonal, of LU.
-//
-// P represents a permutation matrix encoded as a list of indices each between `0`
-// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
-// P, then the L, U and P satisfies P_mat * input = L * U.
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
-// size `[M, M]`.
-//
-// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
-// lower triangular factor `L` with unit diagonal, and whose upper triangular part
-// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
-// `[..., M]`.
-// @compatibility(scipy)
-// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
-// packed into a single tensor, the permutation is applied to `input` instead of
-// the right hand side and the permutation `P` is returned as a list of indices
-// instead of a permutation matrix.
-// @end_compatibility
-func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Lu",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Shuts down a running distributed TPU system.
-//
-// The op returns an error if no system is running.
-//
-// Returns the created operation.
-func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShutdownDistributedTPU",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns x // y element-wise.
-//
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorDiv",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns 0 if x == 0, and x / y otherwise, elementwise.
-func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Xdivy",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-//
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-//
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorMod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-//
-//
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
-	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
-		Input: []tf.Input{
-			table_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
-
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op is similar to a lightweight Dequeue.
-//
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unstage",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
-	}
-	return values
-}
-
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-//
-// where
-//
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-//
-// is the upper incomplete Gama function.
-//
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igammac",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes requantization range per channel.
-//
-// Arguments:
-//	input: The original input tensor.
-//	input_min: The minimum value of the input tensor
-//	input_max: The maximum value of the input tensor.
-//	clip_value_max: The maximum value of the output that needs to be clipped.
-// Example: set this to 6 for Relu6.
-//
-// Returns The minimum value of the final output tensorThe maximum value of the final output tensor.
-func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"clip_value_max": clip_value_max}
-	opspec := tf.OpSpec{
-		Type: "RequantizationRangePerChannel",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
-//
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Zeta",
-		Input: []tf.Input{
-			x, q,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	value: The tensor to split.
-//	size_splits: list containing the sizes of each output tensor along the split
-// dimension. Must sum to the dimension of value along split_dim.
-// Can contain one -1 indicating that dimension is to be inferred.
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//
-//
-// Returns Tensors whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `size_splits[i]`.
-func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SplitV",
-		Input: []tf.Input{
-			value, size_splits, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("SplitV", err)
-		return
-	}
-	return output
-}
-
-// Multiplies sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Mean",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatefulTruncatedNormalAttr is an optional argument to StatefulTruncatedNormal.
-type StatefulTruncatedNormalAttr func(optionalAttr)
-
-// StatefulTruncatedNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulTruncatedNormalDtype(value tf.DataType) StatefulTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	algorithm: The RNG algorithm.
-//	shape: The shape of the output tensor.
-//
-// Returns Random values with specified shape.
-func StatefulTruncatedNormal(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulTruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatefulTruncatedNormal",
-		Input: []tf.Input{
-			resource, algorithm, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorStridedSliceUpdateAttr is an optional argument to TensorStridedSliceUpdate.
-type TensorStridedSliceUpdateAttr func(optionalAttr)
-
-// TensorStridedSliceUpdateBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func TensorStridedSliceUpdateBeginMask(value int64) TensorStridedSliceUpdateAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// TensorStridedSliceUpdateEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func TensorStridedSliceUpdateEndMask(value int64) TensorStridedSliceUpdateAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// TensorStridedSliceUpdateEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func TensorStridedSliceUpdateEllipsisMask(value int64) TensorStridedSliceUpdateAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// TensorStridedSliceUpdateNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func TensorStridedSliceUpdateNewAxisMask(value int64) TensorStridedSliceUpdateAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// TensorStridedSliceUpdateShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func TensorStridedSliceUpdateShrinkAxisMask(value int64) TensorStridedSliceUpdateAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Assign `value` to the sliced l-value reference of `input`.
-//
-// The values of `value` are assigned to the positions in the tensor `input` that
-// are selected by the slice parameters. The slice parameters `begin` `end`
-// `strides` etc. work exactly as in `StridedSlice`.
-//
-// NOTE this op currently does not support broadcasting and so `value`'s shape
-// must be exactly the shape produced by the slice of `input`.
-func TensorStridedSliceUpdate(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...TensorStridedSliceUpdateAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorStridedSliceUpdate",
-		Input: []tf.Input{
-			input, begin, end, strides, value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
-//
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Betainc",
-		Input: []tf.Input{
-			a, b, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x >= y) element-wise.
-//
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of x AND y element-wise.
-//
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of x OR y element-wise.
-//
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
-//
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EditDistance",
-		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
-
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// MatMulTransposeB sets the optional transpose_b attribute to value.
-//
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
-//
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatMul",
-		Input: []tf.Input{
-			a, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Sum",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
-//
-// Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Diag",
-		Input: []tf.Input{
-			diagonal,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EuclideanNormAttr is an optional argument to EuclideanNorm.
-type EuclideanNormAttr func(optionalAttr)
-
-// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the euclidean norm of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EuclideanNorm",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Min",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalLatencyStatsDataset",
-		Input: []tf.Input{
-			input_dataset, tag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
 // Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
 // that `segment_ids[j] == i`.
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
 // </div>
 //
 // For example:
 //
 // ```
 // c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
-// tf.segment_prod(c, tf.constant([0, 0, 1]))
-// # ==> [[4, 6, 6, 4],
+// tf.segment_max(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 3, 3, 4],
 // #      [5, 6, 7, 8]]
 // ```
 //
@@ -32515,12 +30659,12 @@
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "SegmentMax",
 		Input: []tf.Input{
 			data, segment_ids,
 		},
@@ -32529,241 +30673,54 @@
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// ``` python
-// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
-// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
-// # ==> [[ 5,  5, 5, 5],
-// #       [5,  6, 7, 8]]
-// ```
-//
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// Computes the minimum along segments of a tensor.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the minimum such that:
-//
-// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the minimum is empty for a given segment ID `i`, it outputs the largest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::max()`.
-//
-// For example:
-//
-// ``` python
-// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
-// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
-// # ==> [[ 1,  2, 2, 1],
-// #       [5,  6, 7, 8]]
-// ```
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMin",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
-type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// Restores a tensor from checkpoint files.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// This is like `Restore` except that the restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-//
-// accum = accum * momentum - lr * grad
-// var += accum
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensor.
+//	dt: The type of the tensor to be restored.
 //
-// Returns the created operation.
-func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyKerasMomentum",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the mean along sparse segments of a tensor.
-//
-// See `tf.sparse.segment_sum` for usage examples.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MutexV2Attr is an optional argument to MutexV2.
-type MutexV2Attr func(optionalAttr)
-
-// MutexV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this variable is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutexV2Container(value string) MutexV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutexV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this variable is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func MutexV2SharedName(value string) MutexV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a Mutex resource that can be locked by `MutexLock`.
-//
-// Returns The mutex resource.
-func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutexV2",
-
-		Attrs: attrs,
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
@@ -32772,12 +30729,12 @@
 //
 // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
 // is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvGrad",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
 			y, dy,
 		},
@@ -32786,115 +30743,41 @@
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
-//
-// Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MergeSummary",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// StatefulUniformFullIntAttr is an optional argument to StatefulUniformFullInt.
+type StatefulUniformFullIntAttr func(optionalAttr)
 
-// Creates a dataset that shards the input dataset.
+// StatefulUniformFullIntDtype sets the optional dtype attribute to value.
 //
-// Creates a dataset that shards the input dataset by num_workers, returning a
-// sharded dataset for the index-th worker. This attempts to automatically shard
-// a dataset by examining the Dataset graph and inserting a shard op before the
-// inputs to a reader Dataset (e.g. CSVDataset, TFRecordDataset).
-//
-// This dataset will throw a NotFound error if we cannot shard the dataset
-// automatically.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	num_workers: A scalar representing the number of workers to distribute this dataset across.
-//	index: A scalar representing the index of the current worker out of num_workers.
-//
-//
-func ExperimentalAutoShardDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalAutoShardDataset",
-		Input: []tf.Input{
-			input_dataset, num_workers, index,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
-
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_UINT64
+func StatefulUniformFullIntDtype(value tf.DataType) StatefulUniformFullIntAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["dtype"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers covering the whole range of `dtype`.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
 //
-//
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatefulUniformFullInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformFullIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "StatefulUniformFullInt",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			resource, algorithm, shape,
 		},
 		Attrs: attrs,
 	}
@@ -32902,158 +30785,15 @@
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "Sin",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gather slices from `params` into a Tensor with shape specified by `indices`.
-//
-// `indices` is an K-dimensional integer tensor, best thought of as a
-// (K-1)-dimensional tensor of indices into `params`, where each element defines a
-// slice of `params`:
-//
-//     output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]]
-//
-// Whereas in `tf.gather` `indices` defines slices into the first
-// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-//
-// The last dimension of `indices` can be at most the rank of
-// `params`:
-//
-//     indices.shape[-1] <= params.rank
-//
-// The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] == params.rank`) or slices
-// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-// of `params`.  The output tensor has shape
-//
-//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
-//
-// Some examples below.
-//
-// Simple indexing into a matrix:
-//
-// ```python
-//     indices = [[0, 0], [1, 1]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = ['a', 'd']
-// ```
-//
-// Slice indexing into a matrix:
-//
-// ```python
-//     indices = [[1], [0]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['c', 'd'], ['a', 'b']]
-// ```
-//
-// Indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['a1', 'b1'], ['c1', 'd1']]]
-//
-//
-//     indices = [[0, 1], [1, 0]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['c0', 'd0'], ['a1', 'b1']]
-//
-//
-//     indices = [[0, 0, 1], [1, 0, 1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = ['b0', 'b1']
-// ```
-//
-// Batched indexing into a matrix:
-//
-// ```python
-//     indices = [[[0, 0]], [[0, 1]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['a'], ['b']]
-// ```
-//
-// Batched slice indexing into a matrix:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [[['c', 'd']], [['a', 'b']]]
-// ```
-//
-// Batched indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
-//               [[['a0', 'b0'], ['c0', 'd0']]]]
-//
-//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['c0', 'd0'], ['a1', 'b1']],
-//               [['a0', 'b0'], ['c1', 'd1']]]
-//
-//
-//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['b0', 'b1'], ['d0', 'c1']]
-// ```
-//
-// See also `tf.gather` and `tf.batch_gather`.
-//
-// Arguments:
-//	params: The tensor from which to gather values.
-//	indices: Index tensor.
-//
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GatherNd",
-		Input: []tf.Input{
-			params, indices,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -33118,108 +30858,20 @@
 	return scope.AddOperation(opspec)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["vocab_size"] = value
-	}
-}
-
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
-//
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["delimiter"] = value
-	}
-}
-
-// Initializes a table from a text file.
-//
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
-//
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
-//
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
-		Input: []tf.Input{
-			table_handle, filename,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
-
-// AllKeepDims sets the optional keep_dims attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Computes the maximum of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -33232,7 +30884,7 @@
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -33241,7 +30893,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "Max",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -33251,41 +30903,879 @@
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x == y) element-wise.
+//
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Equal",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// The op serializes protobuf messages provided in the input tensors.
+//
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of each tensor in `values` must be greater
+// than or equal to the corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to encode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeProto",
+		Input: []tf.Input{
+			sizes, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs the single element from the given dataset.
+//
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
+//
+//
+//
+// Returns The components of the single element of `dataset`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DatasetToSingleElement",
+		Input: []tf.Input{
+			dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
+	}
+	return components
+}
+
+// Computes reciprocal of square root of x element-wise.
+//
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exp",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LowerBoundAttr is an optional argument to LowerBound.
+type LowerBoundAttr func(optionalAttr)
+
+// LowerBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LowerBoundOutType(value tf.DataType) LowerBoundAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Applies lower_bound(sorted_inputs, values) along each row.
+//
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='left')`.
+//
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
+//
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
+//
+//   result = LowerBound(sorted_sequence, values)
+//
+//   result == [[1, 2, 2],
+//              [0, 1, 5]]
+//
+// Arguments:
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same number of rows as `sorted_inputs`. Contains
+// the values that will be searched for in `sorted_inputs`.
+//
+// Returns A `Tensor` with the same shape as `values`.  It contains the first scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func LowerBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...LowerBoundAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LowerBound",
+		Input: []tf.Input{
+			sorted_inputs, values,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3.
+type CudnnRNNBackpropV3Attr func(optionalAttr)
+
+// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNBackpropV3TimeMajor sets the optional time_major attribute to value.
+// If not specified, defaults to true
+func CudnnRNNBackpropV3TimeMajor(value bool) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["time_major"] = value
+	}
+}
+
+// Backprop step of CudnnRNNV3.
+//
+// Compute the backprop of both data and weights in an RNN. Takes an additional
+//     "sequence_lengths" input compared to CudnnRNNBackprop.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
+//     [batch_size, seq_length, input_size].
+// input_h: If time_major is true, this is a 3-D tensor with the shape of
+//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+//     is [batch_size, num_layer * dir, num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
+//     shape is [batch_size, seq_length, dir * num_units].
+// output_h: The same shape as input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// time_major: Indicates whether the input/output format is time major or batch
+//     major.
+// reserve_space: The same reserve_space produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackpropV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Decode the frame(s) of a GIF-encoded image to a uint8 tensor.
+//
+// GIF images with frame or transparency compression are not supported.
+// On Linux and MacOS systems, convert animated GIFs from compressed to
+// uncompressed by running:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB channel order.
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeGif",
+		Input: []tf.Input{
+			contents,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts a flat index or array of flat indices into a tuple of
+//
+// coordinate arrays.
+//
+// @compatibility(numpy)
+// Equivalent to np.unravel_index
+// @end_compatibility
+//
+// Arguments:
+//	indices: A 0-D or 1-D `int` Tensor whose elements are indices into the
+// flattened version of an array of dimensions dims.
+//	dims: A 1-D `int` Tensor. The shape of the array to use for unraveling
+// indices.
+//
+// Returns A 2-D (or 1-D if indices is 0-D) tensor where each row has the
+// same shape as the indices array.
+func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnravelIndex",
+		Input: []tf.Input{
+			indices, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asinh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cholesky",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxArgsGradient",
+		Input: []tf.Input{
+			gradients, inputs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Erf",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Enqueue a Tensor on the computation outfeed.
+//
+// Arguments:
+//	input: A tensor that will be inserted into the outfeed queue.
+//
+// Returns the created operation.
+func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OutfeedEnqueue",
+		Input: []tf.Input{
+			input,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumsum",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of the sigmoid of `x` wrt its input.
+//
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the minimum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
+//
+// \\(output_i = \min_{j...} data[j...]\\) where min is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 1,  2, 2, 1],
+// #       [5,  6, 7, 8]]
+// ```
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMin",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cos",
 		Input: []tf.Input{
 			x,
 		},
@@ -33294,6 +31784,1747 @@
 	return op.Output(0)
 }
 
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialized proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gets the next output from the given iterator as an Optional variant.
+func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNextAsOptional",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the trigonometric inverse sine of x element-wise.
+//
+// The `tf.math.asin` operation returns the inverse of `tf.math.sin`, such that
+// if `y = tf.math.sin(x)` then, `x = tf.math.asin(y)`.
+//
+// **Note**: The output of `tf.math.asin` will lie within the invertible range
+// of sine, i.e. [-pi/2, pi/2].
+//
+// For example:
+//
+// ```python
+// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)]
+// x = tf.constant([1.047, 0.785])
+// y = tf.math.sin(x) # [0.8659266, 0.7068252]
+//
+// tf.math.asin(y) # [1.047, 0.785] = x
+// ```
+//
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the next record (key, value pair) produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
+//
+// Returns the key (a scalar) and the value (a scalar) of the record.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+//
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+//
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+//
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
+//
+// Returns A vector (batch) containing log-probabilities. The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCLoss",
+		Input: []tf.Input{
+			inputs, labels_indices, labels_values, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the trigonometric inverse tangent of x element-wise.
+//
+// The `tf.math.atan` operation returns the inverse of `tf.math.tan`, such that
+// if `y = tf.math.tan(x)` then, `x = tf.math.atan(y)`.
+//
+// **Note**: The output of `tf.math.atan` will lie within the invertible range
+// of tan, i.e. (-pi/2, pi/2).
+//
+// For example:
+//
+// ```python
+// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)]
+// x = tf.constant([1.047, 0.785])
+// y = tf.math.tan(x) # [1.731261, 0.99920404]
+//
+// tf.math.atan(y) # [1.047, 0.785] = x
+// ```
+//
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns which elements of x are NaN.
+//
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsNan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x + y element-wise.
+//
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Add",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x - y element-wise.
+//
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x * y element-wise.
+//
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `RealDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// An Op to exchange data across TPU replicas.
+//
+// On each replica, the input is split into `split_count` blocks along
+// `split_dimension` and sent to the other replicas given group_assignment. After
+// receiving `split_count` - 1 blocks from other replicas, we concatenate the
+// blocks along `concat_dimension` as the output.
+//
+// For example, suppose there are 2 TPU replicas:
+// replica 0 receives input: `[[A, B]]`
+// replica 1 receives input: `[[C, D]]`
+//
+// group_assignment=`[[0, 1]]`
+// concat_dimension=0
+// split_dimension=1
+// split_count=2
+//
+// replica 0's output: `[[A], [C]]`
+// replica 1's output: `[[B], [D]]`
+//
+// Arguments:
+//	input: The local input to the exchange.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//	concat_dimension: The dimension number to concatenate.
+//	split_dimension: The dimension number to split.
+//	split_count: The number of splits; this number must be equal to the sub-group
+// size (group_assignment.get_shape()[1]).
+//
+// Returns The exchanged result.
+func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count}
+	opspec := tf.OpSpec{
+		Type: "AllToAll",
+		Input: []tf.Input{
+			input, group_assignment,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xlogy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+//
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates and returns an empty tensor list.
+//
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
+//
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "EmptyTensorList",
+		Input: []tf.Input{
+			element_shape, max_num_elements,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+//
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
+//
+// Our Conv3D implements a form of cross-correlation.
+//
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
+
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise remainder of division.
+//
+// When `x < 0` xor `y < 0` is true, this follows Python semantics in that the
+// result here is consistent with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns element-wise remainder of division.
+//
+// This emulates C semantics in that the result here is consistent with a
+// truncating divide. E.g. `truncate(x / y) * y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SpaceToBatch for 4-D tensors of type T.
+//
+// This is a legacy version of the more general SpaceToBatchND.
+//
+// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
+// More specifically, this op outputs a copy of the input tensor where values from
+// the `height` and `width` dimensions are moved to the `batch` dimension. After
+// the zero-padding, both `height` and `width` of the input must be divisible by the
+// block size.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, depth]`.
+//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+//   the padding of the input with zeros across the spatial dimensions as follows:
+//
+//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
+//
+//   The effective spatial dimensions of the zero-padded input tensor will be:
+//
+//       height_pad = pad_top + height + pad_bottom
+//       width_pad = pad_left + width + pad_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+//   * Non-overlapping blocks of size `block_size x block_size` in the height and
+//     width dimensions are rearranged into the batch dimension at each location.
+//   * The batch of the output tensor is `batch * block_size * block_size`.
+//   * Both height_pad and width_pad must be divisible by block_size.
+//
+// The shape of the output will be:
+//
+//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//      depth]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 1]` and value:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+//
+func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"block_size": block_size}
+	opspec := tf.OpSpec{
+		Type: "SpaceToBatch",
+		Input: []tf.Input{
+			input, paddings,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The shape of the elements of the given list, as a tensor.
+//
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
+	opspec := tf.OpSpec{
+		Type: "TensorListElementShape",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+//
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = \int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gamma function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized incomplete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igammac",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// missing, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+//
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt{x^2 + y^2} \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan2",
+		Input: []tf.Input{
+			y, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+//
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Betainc",
+		Input: []tf.Input{
+			a, b, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the reciprocal of x element-wise.
+//
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reciprocal",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x < y) element-wise.
+//
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Less",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x <= y) element-wise.
+//
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LessEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalNot",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
+
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `Tensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//
+//
+// Returns 2D indices of a `SparseTensor`. 1D values of a `SparseTensor`. 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DenseToDenseSetOperation",
+		Input: []tf.Input{
+			set1, set2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of x AND y element-wise.
+//
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x // y element-wise.
+//
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Selects elements from `x` or `y`, depending on `condition`.
+//
+// The `x` and `y` tensors must have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
+//
+// Arguments:
+//
+//	x: A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: A `Tensor` with the same type and shape as `x`.
+//
+// Returns A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Select",
+		Input: []tf.Input{
+			condition, x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transpose_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Return a slice from 'input'.
+//
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
+//
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+//
+// Arguments:
+//
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Slice",
+		Input: []tf.Input{
+			input, begin, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
+
+// SumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Sum",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EuclideanNormAttr is an optional argument to EuclideanNorm.
+type EuclideanNormAttr func(optionalAttr)
+
+// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the euclidean norm of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EuclideanNorm",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A placeholder op for a value that will be fed into the computation.
+//
+// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
+//
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor. The shape can be any partially-specified
+// shape.  To be unconstrained, pass in a shape with unknown rank.
+//
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "PlaceholderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+//
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+//
+// Arguments:
+//
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
+//
+//
+// Returns the quantized result `output`, followed by `min_out` and `max_out`: the float values that the lowest and highest quantized output values represent.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "QuantizedBiasAdd",
+		Input: []tf.Input{
+			input, bias, min_input, max_input, min_bias, max_bias,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the minimum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Min",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Usage:
+//   ```python
+//   import tensorflow as tf
+//   a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+//   b = tf.math.argmax(input = a)
+//   c = tf.keras.backend.eval(b)
+//   # c = 4
+//   # here a[4] = 166.32 which is the largest element of a across axis 0
+//   ```
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMax",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
+
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the smallest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Usage:
+//   ```python
+//   import tensorflow as tf
+//   a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+//   b = tf.math.argmin(input = a)
+//   c = tf.keras.backend.eval(b)
+//   # c = 0
+//   # here a[0] = 1 which is the smallest element of a across axis 0
+//   ```
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMin",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_mean(c, tf.constant([0, 0, 1]))
+// # ==> [[2.5, 2.5, 2.5, 2.5],
+// #      [5, 6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMean",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
+//
+// This operation returns a 1-D integer tensor representing the shape of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AnyAttr is an optional argument to Any.
 type AnyAttr func(optionalAttr)
 
@@ -33339,526 +33570,242 @@
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+// Elementwise computes the bitwise XOR of `x` and `y`.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// QuantizedDepthwiseConv2DWithBiasAttr is an optional argument to QuantizedDepthwiseConv2DWithBias.
+type QuantizedDepthwiseConv2DWithBiasAttr func(optionalAttr)
+
+// QuantizedDepthwiseConv2DWithBiasOutType sets the optional out_type attribute to value.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// value: The type of the output.
+// If not specified, defaults to DT_QINT32
+func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value.
+//
+// value: List of dilation values.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes quantized depthwise Conv2D with Bias.
+//
+// Arguments:
+//	input: The original input tensor.
+//	filter: The original filter tensor.
+//	bias: The original bias tensor.
+//	min_input: The float value that the minimum quantized input value represents.
+//	max_input: The float value that the maximum quantized input value represents.
+//	min_filter: The float value that the minimum quantized filter value represents.
+//	max_filter: The float value that the maximum quantized filter value represents.
+//	strides: List of stride values.
+//
+//
+// Returns The output tensor. The float value that the minimum quantized output value represents. The float value that the maximum quantized output value represents.
+func QuantizedDepthwiseConv2DWithBias(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedDepthwiseConv2DWithBias",
+		Input: []tf.Input{
+			input, filter, bias, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes the sum along sparse segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Like `SegmentSum`, but `segment_ids` can have fewer entries than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // For example:
 //
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
 // ```
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			start, limit, delta,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
+// Computes the sum along sparse segments of a tensor.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// missing, the `output` tensor at that position will be zeroed.
 //
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalAvgPool function.
-//
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
-//
-// Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
-		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.
-type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve FTRL embedding parameters with debug support.
-//
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
-//
-// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm.
-func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
-//
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
-//
-// Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
-//
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	opspec := tf.OpSpec{
-		Type: "RestoreV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
-	}
-	return tensors
-}
-
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
-//
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
-
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
-//
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
-	return func(m optionalAttr) {
-		m["adaptative"] = value
-	}
-}
-
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
-//
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
-//
-// Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
-//
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
-		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
-}
-
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
+// for an explanation of segments.
 //
 // For example:
 //
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2]),
+//                                         num_segments=4)
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
 // ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			real, imag,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
-
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// Merges summaries.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value.
-// If not specified, defaults to false
-func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr {
-	return func(m optionalAttr) {
-		m["half_pixel_centers"] = value
-	}
-}
-
-// Resize `images` to `size` using nearest neighbor interpolation.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			images, size,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", with the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
-		Input: []tf.Input{
-			grads, image, boxes, box_ind,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolGradWithArgmaxAttr is an optional argument to MaxPoolGradWithArgmax.
-type MaxPoolGradWithArgmaxAttr func(optionalAttr)
-
-// MaxPoolGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
-//
-// value: Whether to include batch dimension in flattened index of `argmax`.
-// If not specified, defaults to false
-func MaxPoolGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradWithArgmaxAttr {
-	return func(m optionalAttr) {
-		m["include_batch_in_index"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	input: 4-D input to pool over.
 //	ksize: The size of the window for each dimension of the input tensor.
 //	strides: The stride of the sliding window for each dimension of the
 // input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradWithArgmaxAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -33867,50 +33814,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
-
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the real part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "MaxPool",
 		Input: []tf.Input{
 			input,
 		},
@@ -33920,92 +33824,174 @@
 	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
-
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// Computes the mean along sparse segments of a tensor.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
-	return func(m optionalAttr) {
-		m["iou_threshold"] = value
-	}
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
+// See `tf.sparse.segment_sum` for usage examples.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+//
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
+//
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//
+// Returns Symmetrized version of df/dA. Shape is `[..., M, M]`.
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "CholeskyGrad",
+		Input: []tf.Input{
+			l, grad,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the imaginary part of a complex number.
+// Computes the mean along sparse segments of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// missing, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
 //
 // For example:
 //
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  6, 6, 4],
+// #       [5,  6, 7, 8]]
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+//
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+	return func(m optionalAttr) {
+		m["delete_old_dirs"] = value
+	}
+}
+
+// V2 format specific: merges the metadata files of sharded checkpoints.
+//
+// The result is one logical checkpoint, with one physical metadata file and
+// renamed data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -34014,9 +34000,77 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			input,
+			checkpoint_prefixes, destination_prefix,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes gradients for SparseSegmentMean.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
+
+// MeanKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the mean of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Mean",
+		Input: []tf.Input{
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -34024,164 +34078,143 @@
 	return op.Output(0)
 }
 
-// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters.
-type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
-
-// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// REQUIRES: value >= -1
-func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr {
-	return func(m optionalAttr) {
-		m["table_id"] = value
-	}
-}
-
-// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
-// If not specified, defaults to ""
-func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr {
-	return func(m optionalAttr) {
-		m["table_name"] = value
-	}
-}
-
-// Retrieve Adadelta embedding parameters.
+// N is the size of the segment being reduced.
 //
-// An op that retrieves optimization parameters from embedding to host
-// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-// the correct embedding table configuration. For example, this op is
-// used to retrieve updated parameters before saving a checkpoint.
+// See `tf.sparse.segment_sum` for usage examples.
 //
-// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.
-func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) {
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtN",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RetrieveTPUEmbeddingAdadeltaParameters",
-
+		Type: "ResourceApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// SpaceToDepthAttr is an optional argument to SpaceToDepth.
-type SpaceToDepthAttr func(optionalAttr)
+// Computes gradients for SparseSegmentSqrtN.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// SpaceToDepthDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// SpaceToDepth for tensors of type T.
-//
-// Rearranges blocks of spatial data, into depth. More specifically,
-// this op outputs a copy of the input tensor where values from the `height`
-// and `width` dimensions are moved to the `depth` dimension.
-// The attr `block_size` indicates the input block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` are rearranged
-//     into depth at each location.
-//   * The depth of the output tensor is `block_size * block_size * input_depth`.
-//   * The Y, X coordinates within each block of the input become the high order
-//     component of the output channel index.
-//   * The input tensor's height and width must be divisible by block_size.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-//                         within the output image, bX, bY means coordinates
-//                         within the input block, iC means input channels).
-//      The output would be a transpose to the following layout:
-//      n,oY,oX,bY,bX,iC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1], [2]],
-//       [[3], [4]]]]
-// ```
-//
-// This operation will output a tensor of shape `[1, 1, 1, 4]`:
-//
-// ```
-// [[[[1, 2, 3, 4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-// the corresponding output will have a single element (i.e. width and height are
-// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-// The output element shape is `[1, 1, 4]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// This operation, for block_size of 2, will return the following tensor of shape
-// `[1, 1, 1, 12]`
-//
-// ```
-// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [5],  [6]],
-//       [[3],   [4],  [7],  [8]],
-//       [[9],  [10], [13],  [14]],
-//       [[11], [12], [15],  [16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 2 2 4]`:
-//
-// ```
-// x = [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//
-//	block_size: The size of the spatial block.
-func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SpaceToDepth",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			input,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -34189,41 +34222,33 @@
 	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Computes the "logical and" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -34232,9 +34257,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "All",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -34242,6 +34267,127 @@
 	return op.Output(0)
 }
 
+// Records the size in bytes of each element of `input_dataset` in a StatsAggregator.
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalBytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
+
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,size(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D tensor `y` of the unique elements of `x`, and 1-D tensor `idx` of indices into `y`.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unique",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes the sum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 5,  5, 5, 5],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentSum",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AngleAttr is an optional argument to Angle.
 type AngleAttr func(optionalAttr)
 
@@ -34291,211 +34437,102 @@
 	return op.Output(0)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
+// Compute the pairwise cross product.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Cumprod",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
-//
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
-		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the number of elements in the given table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "Cross",
 		Input: []tf.Input{
-			table_handle,
+			a, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// Delete the TensorArray from its resource container.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
-//
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
-//
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
-//
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV3",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Counts the number of occurrences of each value in an integer array.
+//
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "Bincount",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			arr, size, weights,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
+}
+
+// Returns x + y element-wise.
+//
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddV2",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // QuantizedMulAttr is an optional argument to QuantizedMul.
@@ -34542,73 +34579,6 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x + y element-wise, working on quantized buffers.
-//
-// Arguments:
-//
-//
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-//
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
-		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Assigns a new value to a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
-//
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
 // actual distribution of the values to maximize the usage of the lower bit depth
@@ -34692,33 +34662,152 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes a range that covers the actual values present in a quantized tensor.
+// PlaceholderAttr is an optional argument to Placeholder.
+type PlaceholderAttr func(optionalAttr)
+
+// PlaceholderShape sets the optional shape attribute to value.
 //
-// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
-// range that covers the actual values present in that tensor. This op is typically
-// used to produce the `requested_output_min` and `requested_output_max` for
-// `Requantize`.
+// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
+// shape is unconstrained.
+// If not specified, defaults to <unknown_rank:true >
+func PlaceholderShape(value tf.Shape) PlaceholderAttr {
+	return func(m optionalAttr) {
+		m["shape"] = value
+	}
+}
+
+// A placeholder op for a value that will be fed into the computation.
+//
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
+//	dtype: The type of elements in the tensor.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Placeholder",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArraySizeV3
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: A Tensor of any shape containing int or float values.
+//	boundaries: A sorted list of floats giving the boundaries of the buckets.
+//
+// Returns Same shape as 'input', with each value of input replaced by its bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
+	opspec := tf.OpSpec{
+		Type: "Bucketize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes requantization range per channel.
+//
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor.
+//	input_max: The maximum value of the input tensor.
+//	clip_value_max: The maximum value of the output that needs to be clipped.
+// Example: set this to 6 for Relu6.
+//
+// Returns The minimum value of the final output tensor. The maximum value of the final output tensor.
+func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"clip_value_max": clip_value_max}
+	opspec := tf.OpSpec{
+		Type: "RequantizationRangePerChannel",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
+// Replaces the contents of the table with the specified keys and values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns the next representable value of `x1` in the direction of `x2`, element-wise.
 //
 // This operation returns the same result as the C++ std::nextafter function.
@@ -34742,70 +34831,35 @@
 	return op.Output(0)
 }
 
-// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
-type StatefulStandardNormalAttr func(optionalAttr)
-
-// StatefulStandardNormalDtype sets the optional dtype attribute to value.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random values from a normal distribution. This op is deprecated in favor of op 'StatefulStandardNormalV2'
+// The tensor `keys` must be of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// DEPRECATED at GraphDef version 29: Use StatefulStandardNormalV2 instead
-//
-// The generated values will have mean 0 and standard deviation 1.
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	shape: The shape of the output tensor.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_value`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatefulStandardNormal",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			resource, shape,
+			table_handle, keys, default_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
-//
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Adjust the contrast of one or more images.
 //
 // `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
@@ -34837,131 +34891,7 @@
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
-//
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
-//
-// Arguments:
-//	contents: 0-D.  The PNG-encoded image.
-//
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodePng",
-		Input: []tf.Input{
-			contents,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Get the current size of the TensorArray.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
-//
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the number of elements in the given queue.
-//
-// Arguments:
-//	handle: The handle to a queue.
-//
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Replaces the contents of the table with the specified keys and values.
+// Updates the table to associate keys with values.
 //
 // The tensor `keys` must be of the same type as the keys of the table.
 // The tensor `values` must be of the type of the table values.
@@ -34972,12 +34902,12 @@
 //	values: Values to associate with keys.
 //
 // Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
 			table_handle, keys, values,
 		},
@@ -34985,93 +34915,166 @@
 	return scope.AddOperation(opspec)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
 
-// HashTableV2Container sets the optional container attribute to value.
+// MapStageCapacity sets the optional capacity attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
 // If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
+func MapStageContainer(value string) MapStageAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// MapStageSharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
+// value: It is necessary to match this name to the matching Unstage Op.
 // If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
+func MapStageSharedName(value string) MapStageAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
-//
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// Stage (key, values) in the underlying container which behaves like a hashtable.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	key: int64
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+//	values: a list of tensors
+//	dtypes: A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
+		Type: "MapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
 
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"reduction": reduction}
+	opspec := tf.OpSpec{
+		Type: "NcclReduce",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["fast"] = value
 	}
 }
 
-// Returns the size of a tensor.
+// Solves one or more linear least-squares problems.
 //
-// This operation returns an integer representing the number of elements in
-// `input`.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor of shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// For example:
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
 //
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -35080,9 +35083,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			input,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -35090,93 +35093,340 @@
 	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
+// Outputs all keys and values in the table.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//
+//
+//
+// Returns Vector of all keys present in the table. Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	opspec := tf.OpSpec{
+		Type: "LookupTableExportV2",
+		Input: []tf.Input{
+			table_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
 
-// MutableHashTableV2Container sets the optional container attribute to value.
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Scatter `updates` into a new tensor according to `indices`.
+//
+// Creates a new tensor by applying sparse `updates` to individual values or
+// slices within a tensor (initially zero for numeric, empty for string) of
+// the given `shape` according to indices.  This operator is the inverse of the
+// `tf.gather_nd` operator which extracts values or slices from a given tensor.
+//
+// This operation is similar to tensor_scatter_add, except that the tensor is
+// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
+//
+// If `indices` contains duplicates, then their updates are accumulated (summed).
+//
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates -- because
+// of some numerical approximation issues, numbers summed in different order
+// may yield different results.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     shape = tf.constant([8])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [0, 11, 0, 10, 9, 0, 0, 12]
+//
+// We can also insert entire slices of a higher rank tensor all at once. For
+// example, we can insert two slices in the first dimension of a rank-3 tensor
+// with two matrices of new values.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     shape = tf.constant([4, 4, 4])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
+//
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNd",
+		Input: []tf.Input{
+			indices, updates, shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns (x - y)(x - y) element-wise.
+//
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this table is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this table is shared under the given name across
 // multiple sessions.
 // If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 // If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["use_node_name_sharing"] = value
 	}
 }
 
-// Creates an empty hash table.
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// of 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
+	}
+}
+
+// Creates an empty hash table that uses tensors as the backing store.
+//
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
 //
 // This op creates a mutable hash table, specifying the type of its keys and
 // values. Each value must be a scalar. Data can be inserted into the table using
 // the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//
 //	value_dtype: Type of the table values.
 //
 // Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "MutableDenseHashTableV2",
+		Input: []tf.Input{
+			empty_key, deleted_key,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concats all tensors in the list along the 0th dimension.
+// Table initializer that takes two tensors for keys and values respectively.
 //
-// Requires that all tensors have the same shape except the first dimension.
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
-// input_handle: The input list.
-// element_shape: The shape of the uninitialized elements in the list. If the first
-//   dimension is not -1, it is assumed that all list elements have the same
-//   leading dim.
-// leading_dims: The list of leading dims of uninitialized list elements. Used if
-//   the leading dim of input_handle.element_shape or the element_shape input arg
-//   is not already set.
-// tensor: The concated result.
-// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
-//
-func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListConcatV2",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			input_handle, element_shape, leading_dims,
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
+
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+//
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+	return func(m optionalAttr) {
+		m["compute_v"] = value
+	}
+}
+
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
+//
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
+//
+// Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
+//
+// Returns Eigenvalues. Shape is `[N]`. Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEigV2",
+		Input: []tf.Input{
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -35184,6 +35434,187 @@
 	return op.Output(0), op.Output(1)
 }
 
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to combine into `ref` using `min`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMin",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUniqueDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise OR of `x` and `y`.
+//
+// The result will have those bits set that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
+//
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
+	}
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
+	}
+}
+
+// Initializes a table from a text file.
+//
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value are extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number (starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
+//
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableFromTextFileV2",
+		Input: []tf.Input{
+			table_handle, filename,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Asserts that the given condition is true.
+//
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
+//
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
+//
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Assert",
+		Input: []tf.Input{
+			condition, tf.OutputList(data),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // PrintAttr is an optional argument to Print.
 type PrintAttr func(optionalAttr)
 
@@ -35245,6 +35676,103 @@
 	return op.Output(0)
 }
 
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
+//
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
+	}
+}
+
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded string has a length that is a multiple of 4. See the
+// Padding section of the link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // PrintV2Attr is an optional argument to PrintV2.
 type PrintV2Attr func(optionalAttr)
 
@@ -35258,6 +35786,14 @@
 	}
 }
 
+// PrintV2End sets the optional end attribute to value.
+// If not specified, defaults to "\n"
+func PrintV2End(value string) PrintV2Attr {
+	return func(m optionalAttr) {
+		m["end"] = value
+	}
+}
+
 // Prints a string scalar.
 //
 // Prints a string scalar to the desired output_stream.
@@ -35284,63 +35820,48 @@
 	return scope.AddOperation(opspec)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["description"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["labels"] = value
 	}
 }
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["display_name"] = value
 	}
 }
 
-// Gradient for batch normalization.
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
-//
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -35349,26 +35870,417 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
+// Outputs a `Summary` protocol buffer with scalar values.
+//
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
+//
+// Arguments:
+//	tags: Tags for the summary.
+//	values: Same shape as `tags`.  Values for the summary.
+//
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScalarSummary",
+		Input: []tf.Input{
+			tags, values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with a histogram.
+//
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramSummary",
+		Input: []tf.Input{
+			tag, values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a reader to a previously saved state.
+//
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
+//
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderRestoreStateV2",
+		Input: []tf.Input{
+			reader_handle, state,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
+
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape as input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inference or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNN",
+		Input: []tf.Input{
+			input, input_h, input_c, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
+
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
+	}
+}
+
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
+//
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with images.
+//
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+//
+// and `max` to 'outputs' tensor of same shape as `inputs`.
+//
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+//
+// Before quantization, `min` and `max` values are adjusted with the following
+// logic.
+// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values,
+// the behavior can be unexpected:
+// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`.
+// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`.
+// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `,
+// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`.
+//
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVars",
+		Input: []tf.Input{
+			inputs, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Updates the tree ensemble by either adding a layer to the last tree being grown
+//
+// or by starting a new tree.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thresholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesUpdateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
 // value: Max number of batch elements to generate audio for.
 // If not specified, defaults to 3
 //
 // REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
 		m["max_outputs"] = value
 	}
@@ -35376,8 +36288,6 @@
 
 // Outputs a `Summary` protocol buffer with audio.
 //
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
 // The summary has up to `max_outputs` summary values containing audio. The
 // audio is built from `tensor` which must be 3-D with shape `[batch_size,
 // frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
@@ -35396,18 +36306,18 @@
 //	sample_rate: The sample rate of the signal in hertz.
 //
 // Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			tag, tensor,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -35415,6 +36325,44 @@
 	return op.Output(0)
 }
 
+// Makes a copy of `x`.
+//
+// Arguments:
+//	x: The source tensor of type `T`.
+//
+// Returns y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+// is not an alias of `x`.
+func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeepCopy",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Provides the time since epoch in seconds.
+//
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
 //
 // tensor: The tensor to put on the list.
@@ -35436,6 +36384,141 @@
 	return op.Output(0)
 }
 
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+//
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
+//
+// Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
+//
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`. 1-D.  The `values` of the minibatch `SparseTensor`. 1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TakeManySparseFromTensorsMap",
+		Input: []tf.Input{
+			sparse_handles,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the number of tensors in the input tensor list.
+//
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListLength",
+		Input: []tf.Input{
+			input_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Lgamma",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the last element of the input list as well as a list with all but that element.
 //
 // Fails if the list is empty.
@@ -35460,26 +36543,6 @@
 	return op.Output(0), op.Output(1)
 }
 
-// The shape of the elements of the given list, as a tensor.
-//
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
-	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // TensorListStackAttr is an optional argument to TensorListStack.
 type TensorListStackAttr func(optionalAttr)
 
@@ -35518,45 +36581,125 @@
 	return op.Output(0)
 }
 
-// Splits a tensor into a list.
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
+
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Concatenates all tensors in the list along the 0th dimension.
 //
-// list[i] corresponds to lengths[i] tensors from the input tensor.
-// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// tensor: The concatenated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcat",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Creates a TensorList which, when stacked, has the value of `tensor`.
+//
+// Each tensor in the result list corresponds to one row of the input tensor.
 //
 // tensor: The input tensor.
-// element_shape: A shape compatible with that of elements in the tensor.
-// lengths: Vector of sizes of the 0th dimension of tensors in the list.
 // output_handle: The list.
-func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListSplit",
+		Type: "TensorListFromTensor",
 		Input: []tf.Input{
-			tensor, element_shape, lengths,
+			tensor, element_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the item in the list with the given index.
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
 //
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
+// The input tensors `real` and `imag` must have the same shape.
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// List of the given size with empty elements.
+//
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			input_handle, index, element_shape,
+			element_shape, num_elements,
 		},
 		Attrs: attrs,
 	}
@@ -35564,108 +36707,187 @@
 	return op.Output(0)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
 //
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
 //
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
+//
+// In detail, with the default NHWC format,
+//
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
+
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Resizes the list.
+//
+//
+// input_handle: the input list
+// size: size of the output list
+//
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
+		Type: "TensorListResize",
 		Input: []tf.Input{
-			input_handle, index, item,
+			input_handle, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a TensorList by indexing into a Tensor.
+// Creates a Tensor by indexing into the TensorList.
 //
-// Each member of the TensorList corresponds to one row of the input tensor,
+// Each row in the produced Tensor corresponds to the element in the TensorList
 // specified by the given index (see `tf.gather`).
 //
-// tensor: The input tensor.
+// input_handle: The input tensor list.
 // indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// num_elements: The size of the output list. Must be large enough to accommodate
-//   the largest index in indices. If -1, the list is just large enough to include
-//   the largest index in indices.
-// output_handle: The TensorList.
-func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListScatterV2",
+		Type: "TensorListGather",
 		Input: []tf.Input{
-			tensor, indices, element_shape, num_elements,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
-
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
-//
-// Arguments:
-//	value: The tensor to be shuffled.
-//
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
-		Input: []tf.Input{
-			value,
+			input_handle, indices, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -35673,299 +36895,35 @@
 	return op.Output(0)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
-//
-// Arguments:
-//	input: Shape is `[N, M, M]`.
-//
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
-//
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
-//
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cholesky",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Pads a tensor with mirrored values.
-//
-// This operation pads a `input` with mirrored values according to the `paddings`
-// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many values to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many values to add after the contents of `input`
-// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-// (if false, respectively).
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6]].
-// # 'paddings' is [[1, 1]], [2, 2]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-//                       [2, 1, 1, 2, 3, 3, 2]
-//                       [5, 4, 4, 5, 6, 6, 5]
-//                       [5, 4, 4, 5, 6, 6, 5]]
-// ```
-//
-// Arguments:
-//	input: The input tensor to be padded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
-// do not include the borders, while in symmetric mode the padded regions
-// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
-// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
-// it is `[1, 2, 3, 3, 2]` in symmetric mode.
-//
-// Returns The padded tensor.
-func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode}
-	opspec := tf.OpSpec{
-		Type: "MirrorPad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
-//
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-//
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
-//
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
-
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
-	return func(m optionalAttr) {
-		m["compute_v"] = value
-	}
-}
-
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
-//
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
-//
-// Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
-//
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalBytesProducedStatsDataset",
-		Input: []tf.Input{
-			input_dataset, tag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
-	}
-}
-
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
 // value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
-//
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
+// adjoint.
 // If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
 		m["adjoint"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
+// Solves systems of linear equations.
 //
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
-//
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor of shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
 //	matrix: Shape is `[..., M, M]`.
 //	rhs: Shape is `[..., M, K]`.
 //
 // Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -35974,7 +36932,7 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
 			matrix, rhs,
 		},
@@ -35984,24 +36942,32 @@
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
+
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayGatherV3
 //
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			input_dataset, count,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -36049,44 +37015,28 @@
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts.
+type NonDeterministicIntsAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// NonDeterministicIntsDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_INT64
+func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["update_slots"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Non-deterministically generates some integers.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	shape: The shape of the output tensor.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns Non-deterministic integer values with specified shape.
+func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -36095,70 +37045,87 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "NonDeterministicInts",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// Calculate product with tridiagonal matrix.
+//
+// Calculates product of two matrices, where left matrix is a tridiagonal matrix.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	superdiag: Tensor of shape `[..., 1, M]`, representing superdiagonals of
+// tri-diagonal matrices to the left of multiplication. Last element is ignored.
+//	maindiag: Tensor of shape `[..., 1, M]`, representing main diagonals of tri-diagonal
+// matrices to the left of multiplication.
+//	subdiag: Tensor of shape `[..., 1, M]`, representing subdiagonals of tri-diagonal
+// matrices to the left of multiplication. First element is ignored.
+//	rhs: Tensor of shape `[..., M, N]`, representing MxN matrices to the right of
+// multiplication.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns Tensor of shape `[..., M, N]` containing the product.
+func TridiagonalMatMul(scope *Scope, superdiag tf.Output, maindiag tf.Output, subdiag tf.Output, rhs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "TridiagonalMatMul",
 		Input: []tf.Input{
-			reader_handle,
+			superdiag, maindiag, subdiag, rhs,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSetItem",
+		Input: []tf.Input{
+			input_handle, index, item,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// InfeedEnqueuePrelinearizedBufferAttr is an optional argument to InfeedEnqueuePrelinearizedBuffer.
+type InfeedEnqueuePrelinearizedBufferAttr func(optionalAttr)
+
+// InfeedEnqueuePrelinearizedBufferDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. This should be -1 when the Op is running on a TPU device
+// and = 0 when the Op is running on the CPU device.
+// If not specified, defaults to -1
+func InfeedEnqueuePrelinearizedBufferDeviceOrdinal(value int64) InfeedEnqueuePrelinearizedBufferAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-//
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// An op which enqueues prelinearized buffer into TPU infeed.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	input: A variant tensor representing linearized output.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns the created operation.
+func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional ...InfeedEnqueuePrelinearizedBufferAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -36167,416 +37134,103 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "InfeedEnqueuePrelinearizedBuffer",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// For example:
-//
-// ```
-// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
-// tf.segment_min(c, tf.constant([0, 0, 1]))
-// # ==> [[1, 2, 2, 1],
-// #      [5, 6, 7, 8]]
-// ```
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMin",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reshapes a quantized tensor as per the Reshape op.
-//
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
-		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
-
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unique",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Removes keys and its associated values from a table.
-//
-// The tensor `keys` must of the same type as the keys of the table. Keys not
-// already in the table are silently ignored.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys of the elements to remove.
-//
-// Returns the created operation.
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableRemoveV2",
-		Input: []tf.Input{
-			table_handle, keys,
-		},
-	}
 	return scope.AddOperation(opspec)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
-
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
-}
-
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
-//
-// See also `RestoreSlice`.
-//
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
-//
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilename",
+		Input: []tf.Input{
+			basename, shard, num_shards,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
+
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
-
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translations or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
-}
-
-// Restores a tensor from checkpoint files.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather` operation.  For example:
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
-//
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
-		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
-	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
-		Input: []tf.Input{
-			data, partitions,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
-}
-
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-//
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			input, paddings,
+			boxes, scores, max_output_size, iou_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -36638,112 +37292,219 @@
 	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
-//
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
-//
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
+// A placeholder op that passes through `input` when its output is not fed.
 //
 // Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
+		Type: "PlaceholderWithDefault",
 		Input: []tf.Input{
-			mutex_lock,
+			input,
 		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
-//
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//
-//
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
-		Input: []tf.Input{
-			table_handle, keys, default_value,
-		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+//
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	record_bytes: Number of bytes in the record.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
+}
+
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
+//
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
+	}
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+//
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
+	}
+}
+
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+//
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
+	}
+}
+
+// ExtractGlimpseNoise sets the optional noise attribute to value.
+//
+// value: indicates if the noise should be `uniform`, `gaussian`, or
+// `zero`. The default is `uniform` which means the noise type
+// will be decided by `uniform_noise`.
+// If not specified, defaults to "uniform"
+func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["noise"] = value
+	}
+}
+
+// Extracts a glimpse from the input tensor.
+//
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlap the inputs, the non-overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The arguments `normalized` and `centered` control how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
+//
+// Arguments:
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, followed
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
+//
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExtractGlimpse",
+		Input: []tf.Input{
+			input, size, offsets,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Returns the number of records this Reader has produced.
@@ -36767,65 +37528,91 @@
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// ResizeNearestNeighborGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradHalfPixelCenters(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeNearestNeighborGrad",
+		Input: []tf.Input{
+			grads, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
-//
-//
-// \\(log(exp(A)) = A\\)
-//
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
-//
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	reader_handle: Handle to a Reader.
 //
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			input,
+			reader_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Writes contents to the file at input filename. Creates file and recursively
@@ -36850,24 +37637,71 @@
 	return scope.AddOperation(opspec)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using area interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeArea",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+//
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "Digamma",
 		Input: []tf.Input{
-			pattern,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -36977,6 +37811,98 @@
 	return op.Output(0)
 }
 
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a sequence of numbers.
+//
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
+//
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
+//
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Range",
+		Input: []tf.Input{
+			start, limit, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResizeBilinearAttr is an optional argument to ResizeBilinear.
 type ResizeBilinearAttr func(optionalAttr)
 
@@ -37029,166 +37955,23 @@
 	return op.Output(0)
 }
 
-// OptimizeDatasetAttr is an optional argument to OptimizeDataset.
-type OptimizeDatasetAttr func(optionalAttr)
-
-// OptimizeDatasetOptimizationConfigs sets the optional optimization_configs attribute to value.
-// If not specified, defaults to <>
-func OptimizeDatasetOptimizationConfigs(value []string) OptimizeDatasetAttr {
-	return func(m optionalAttr) {
-		m["optimization_configs"] = value
-	}
-}
-
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Creates a dataset by applying optimizations to `input_dataset`.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
+//
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
 //
 //
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...OptimizeDatasetAttr) (handle tf.Output) {
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			input_dataset, optimizations,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a Tensor by indexing into the TensorList.
-//
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
-//
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListGather",
-		Input: []tf.Input{
-			input_handle, indices, element_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// QuantizedResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value.
-// If not specified, defaults to false
-func QuantizedResizeBilinearHalfPixelCenters(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["half_pixel_centers"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
-//
-// Input images and output images must be quantized types.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-//
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
-		Input: []tf.Input{
-			images, size, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// ResizeNearestNeighborGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradHalfPixelCenters(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["half_pixel_centers"] = value
-	}
-}
-
-// Computes the gradient of nearest neighbor interpolation.
-//
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
-//
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
-		Input: []tf.Input{
-			grads, size,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -37256,6 +38039,51 @@
 	return op.Output(0)
 }
 
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// var: Should be from a Variable().
+//
+// Arguments:
+//
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // DecodeJpegAttr is an optional argument to DecodeJpeg.
 type DecodeJpegAttr func(optionalAttr)
 
@@ -37371,6 +38199,267 @@
 	return op.Output(0)
 }
 
+// Computes the matrix logarithm of one or more square matrices:
+//
+//
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the logarithm for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixLogarithm",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+//
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a custom thread pool resource that datasets can use for their computation.
+//
+// Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
+//
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalThreadPoolHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
+
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce an InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character,
+// 0xFFFD (decimal 65533).
+//
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Transcode the input text from a source encoding to a destination encoding.
+//
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it is set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
+//
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
+//
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
+//
+// Arguments:
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeTranscode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
+
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+//
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
+	return func(m optionalAttr) {
+		m["skip_empty"] = value
+	}
+}
+
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
+//
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor. A vector of strings corresponding to the split values. A length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringSplit",
+		Input: []tf.Input{
+			input, delimiter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // EncodeJpegAttr is an optional argument to EncodeJpeg.
 type EncodeJpegAttr func(optionalAttr)
 
@@ -37507,208 +38596,31 @@
 	return op.Output(0)
 }
 
-// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
-type QuantizedInstanceNormAttr func(optionalAttr)
-
-// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+// JPEG encode input image with provided compression quality.
 //
-// value: If True, `given_y_min` and `given_y_min`
-// and `given_y_max` are used as the output range. Otherwise,
-// the implementation computes the output range.
-// If not specified, defaults to false
-func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["output_range_given"] = value
-	}
-}
-
-// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+// `quality` is an int32 jpeg compression quality value between 0 and 100.
 //
-// value: Output in `y_min` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_min"] = value
-	}
-}
-
-// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
-//
-// value: Output in `y_max` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_max"] = value
-	}
-}
-
-// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
-//
-// value: A small float number to avoid dividing by 0.
-// If not specified, defaults to 1e-05
-func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["variance_epsilon"] = value
-	}
-}
-
-// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
-//
-// value: Minimum value of `y_max - y_min`
-// If not specified, defaults to 0.001
-func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["min_separation"] = value
-	}
-}
-
-// Quantized Instance normalization.
 //
 // Arguments:
-//	x: A 4D input Tensor.
-//	x_min: The value represented by the lowest quantized input.
-//	x_max: The value represented by the highest quantized input.
+//	images: Images to adjust.  At least 3-D.
+//	quality: An int quality to encode to.
 //
-// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
-func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedInstanceNorm",
-		Input: []tf.Input{
-			x, x_min, x_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
-//
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpegVariableQuality(scope *Scope, images tf.Output, quality tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "EncodeJpegVariableQuality",
 		Input: []tf.Input{
-			gradients, features,
+			images, quality,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
-//
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Extract the shape information of a JPEG-encoded image.
-//
-// This op only parses the image header, so it is much faster than DecodeJpeg.
-//
-// Arguments:
-//	contents: 0-D. The JPEG-encoded image.
-//
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
-		Input: []tf.Input{
-			contents,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deprecated. Disallowed in GraphDef version >= 2.
 //
 // DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
@@ -37726,154 +38638,66 @@
 	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
+// Adjust the saturation of one or more images.
 //
 // `images` is a tensor of at least 3 dimensions.  The last dimension is
 // interpretted as channels, and must be three.
 //
 // The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// colors are first mapped into HSV. A scale is then applied to all the saturation
+// values, and then remapped back to RGB colorspace.
 //
 // Arguments:
 //	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	scale: A float scale factor to multiply the saturation by.
 //
 // Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			images, delta,
+			images, scale,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
 
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["compression"] = value
 	}
 }
 
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// PNG-encode an image.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
-	}
-}
-
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
-		Input: []tf.Input{
-			size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
-
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
-//
-// Arguments:
-//	contents: 0-D.  The BMP-encoded image.
-//
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -37882,9 +38706,9 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			contents,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -37892,29 +38716,42 @@
 	return op.Output(0)
 }
 
-// Debugging/model interpretability outputs for each example.
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// It traverses all the trees and computes debug metrics for individual examples,
-// such as getting split feature ids and logits after each split along the decision
-// path used to compute directional feature contributions.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for constructing the protos in
-// examples_debug_outputs_serialized.
-//
-// Returns Output rank 1 Tensor containing a proto serialized as a string for each example.
-func BoostedTreesExampleDebugOutputs(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (examples_debug_outputs_serialized tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesExampleDebugOutputs",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -37922,57 +38759,15 @@
 	return op.Output(0)
 }
 
-// Decode the frame(s) of a GIF-encoded image to a uint8 tensor.
-//
-// GIF images with frame or transparency compression are not supported.
-// On Linux and MacOS systems, convert animated GIFs from compressed to
-// uncompressed by running:
-//
-//     convert $src.gif -coalesce $dst.gif
-//
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
-//
-// Arguments:
-//	contents: 0-D.  The GIF-encoded image.
-//
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB channel order.
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "ExperimentalIteratorGetDevice",
 		Input: []tf.Input{
-			contents,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
-//
-// Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
-//
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
-		Input: []tf.Input{
-			images,
+			resource,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -37989,7 +38784,7 @@
 //
 // For example, if an image is 100 x 200 pixels (height x width) and the bounding
 // box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
 //
 // Parts of the bounding box may fall outside the image.
 //
@@ -37997,149 +38792,107 @@
 //	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
 //	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
 // boxes.
+//	colors: 2-D. A list of RGBA colors to cycle through for the boxes.
 //
 // Returns 4-D with the same shape as `images`. The batch of input images with
 // bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+func DrawBoundingBoxesV2(scope *Scope, images tf.Output, boxes tf.Output, colors tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "DrawBoundingBoxesV2",
 		Input: []tf.Input{
-			images, boxes,
+			images, boxes, colors,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
-//
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -38148,14 +38901,65 @@
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
+}
+
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+//
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
+//
+// the source data format.
+//
+// Arguments:
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
+//
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DataFormatDimMap",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // CropAndResizeAttr is an optional argument to CropAndResize.
@@ -38241,51 +39045,249 @@
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
 // Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	resource_handle: A handle to an iterator resource.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "SerializeIterator",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
+
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+//
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResizeGradBoxes",
+		Input: []tf.Input{
+			grads, image, boxes, box_ind,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softsign gradients for a softsign operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
+//
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftsignGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
+
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+//
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
+// If not specified, defaults to false
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+	return func(m optionalAttr) {
+		m["pad_to_max_output_size"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translations or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`. A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionV4",
+		Input: []tf.Input{
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
+
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the size of a tensor.
+//
+// This operation returns an integer representing the number of elements in
+// `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Size",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
+//
+//
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TakeDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression.
 type CombinedNonMaxSuppressionAttr func(optionalAttr)
 
@@ -38371,138 +39373,124 @@
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adadelta scheme.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			predictions, targets, k,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
-type UnicodeDecodeAttr func(optionalAttr)
-
-// UnicodeDecodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
-
-// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
-	}
-}
-
-// Decodes each string in `input` into a sequence of Unicode code points.
-//
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
-//
-// The `row_splits` tensor indicates where the codepoints for
-// each input string begin and end within the `char_values` tensor.
-// In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
-//
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
-//
-// Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
-func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "UnicodeDecode",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
+}
+
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `(stop - start) / (num - 1)`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LinSpace",
+		Input: []tf.Input{
+			start, stop, num,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Bucketize each feature based on bucket boundaries.
+//
+// An op that returns a list of float tensors, where each tensor represents the
+// bucketized values for a single feature.
+//
+// Arguments:
+//	float_values: float; List of Rank 1 Tensors each containing float values for a single feature.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single
+// feature.
+//
+// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
+func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesBucketize",
+		Input: []tf.Input{
+			tf.OutputList(float_values), tf.OutputList(bucket_boundaries),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if buckets, idx, err = makeOutputList(op, idx, "buckets"); err != nil {
+		scope.UpdateErr("BoostedTreesBucketize", err)
+		return
+	}
+	return buckets
 }
 
 // Set a summary_writer_interface to record statistics using given stats_aggregator.
@@ -38521,66 +39509,139 @@
 	return scope.AddOperation(opspec)
 }
 
-// Returns the cardinality of `input_dataset`.
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// Returns the cardinality of `input_dataset`.
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
+//
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to return cardinality for.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns The cardinality of `input_dataset`. Named constants are used to represent
-// infinite and unknown cardinality.
-func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Restore",
+		Input: []tf.Input{
+			file_pattern, tensor_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that shards the input dataset.
+//
+// Creates a dataset that shards the input dataset by num_workers, returning a
+// sharded dataset for the index-th worker. This attempts to automatically shard
+// a dataset by examining the Dataset graph and inserting a shard op before the
+// inputs to a reader Dataset (e.g. CSVDataset, TFRecordDataset).
+//
+// This dataset will throw a NotFound error if we cannot shard the dataset
+// automatically.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this dataset across.
+//	index: A scalar representing the index of the current worker out of num_workers.
+//
+//
+func ExperimentalAutoShardDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalAutoShardDataset",
+		Input: []tf.Input{
+			input_dataset, num_workers, index,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise for integer types.
+//
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
+//
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetCardinality",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			input_dataset,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
-//
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Serializes the tree ensemble to a proto.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	tree_ensemble_handle: Handle to the tree ensemble.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns Stamp token of the tree ensemble resource. Serialized proto of the ensemble.
+func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "BoostedTreesSerializeEnsemble",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			tree_ensemble_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // Creates a dataset that contains the elements of `input_dataset` ignoring errors.
@@ -38600,69 +39661,148 @@
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["out_type"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// Returns shape of tensors.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// This operation returns N 1-D integer tensors representing the shape of each `input[i]`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "ShapeN",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
+}
+
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
+
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+//
+// value: An `int`. If `maxsplit > 0`, limits the number of splits performed.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+	return func(m optionalAttr) {
+		m["maxsplit"] = value
+	}
+}
+
+// Split elements of `source` based on `sep` into a `SparseTensor`.
+//
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
+//
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+//
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the start or end if the string has
+// leading or trailing whitespace.
+//
+// Note that the above mentioned behavior matches python's str.split.
+//
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringSplitV2",
+		Input: []tf.Input{
+			input, sep,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Creates a dataset that changes the batch size.
+//
+// Creates a dataset that changes the batch size of the dataset to current batch
+// size // num_workers.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this batch across. As
+// a result of this transformation the current batch size would end up being
+// divided  by this parameter.
+//
+//
+func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalRebatchDataset",
+		Input: []tf.Input{
+			input_dataset, num_workers,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -38725,77 +39865,6 @@
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Assign `value` to the sliced l-value reference of `ref`.
-//
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-//
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
-//
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
-		Input: []tf.Input{
-			ref, begin, end, strides, value,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Creates a Dataset that returns pseudorandom numbers.
 //
 // Arguments:
@@ -38821,16 +39890,27 @@
 	return op.Output(0)
 }
 
-// Mutually accumulates multiple tensors of identical type and shape.
-func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Creates a dataset that passes a sliding window over `input_dataset`.
+//
+// Arguments:
+//
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//
+//
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "CollectiveGather",
+		Type: "ExperimentalSlidingWindowDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, window_size, window_shift, window_stride,
 		},
 		Attrs: attrs,
 	}
@@ -38838,395 +39918,177 @@
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Returns the truth value of (x >= y) element-wise.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that will write to / read from a snapshot.
+//
+// This dataset attempts to determine whether a valid snapshot exists at the
+// `snapshot_path`, and reads from the snapshot in lieu of using `input_dataset`.
+// If not, it will run the preprocessing pipeline as usual, and write out a
+// snapshot of the data processed for future use.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	path: The path we should write snapshots to / read snapshots from.
+//
+//
+func SnapshotDataset(scope *Scope, input_dataset tf.Output, path tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SnapshotDataset",
+		Input: []tf.Input{
+			input_dataset, path,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that overrides the maximum intra-op parallelism.
+//
+// Arguments:
+//
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
+		Input: []tf.Input{
+			input_dataset, max_intra_op_parallelism,
+		},
+		Attrs: attrs,
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// CastTruncate sets the optional Truncate attribute to value.
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
 // If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["Truncate"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
+//
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional avg pooling; row pooling sequence, needed to calculate gradient; column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			x,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
-//
-//
-func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalSqlDataset",
-		Input: []tf.Input{
-			driver_name, data_source_name, query,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
-type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
-
-// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a statistics manager resource.
-func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorHandle",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SpaceToBatch for N-D tensors of type T.
-//
-// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-// grid of blocks of shape `block_shape`, and interleaves these blocks with the
-// "batch" dimension (0) such that in the output, the spatial dimensions
-// `[1, ..., M]` correspond to the position within the grid, and the batch
-// dimension combines both the position within a spatial block and the original
-// batch position.  Prior to division into blocks, the spatial dimensions of the
-// input are optionally zero padded according to `paddings`.  See below for a
-// precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has `M` dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
-//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
-//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
-//    input according to `paddings` to produce `padded` of shape `padded_shape`.
-//
-// 2. Reshape `padded` to `reshaped_padded` of shape:
-//
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//        block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1],
-//       block_shape[M-1]] +
-//      remaining_shape
-//
-// 3. Permute dimensions of `reshaped_padded` to produce
-//    `permuted_reshaped_padded` of shape:
-//
-//      block_shape +
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
-//    dimension, producing an output tensor of shape:
-//
-//      [batch * prod(block_shape)] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
-//     paddings = `[[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[8, 1, 3, 1]` and value:
-//
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SpaceToBatchND",
-		Input: []tf.Input{
-			input, block_shape, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Push an element onto the tensor_array.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the name of the device on which `resource` has been placed.
-func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalIteratorGetDevice",
-		Input: []tf.Input{
-			resource,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A dataset that splits the elements of its input into multiple elements.
-func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalUnbatchDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SaveSlices",
-		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalUniqueDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Creates a dataset that uses a custom thread pool to compute `input_dataset`.
@@ -39252,76 +40114,22 @@
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
-
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// There are a few special cases in the value mapping:
-//
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
-//
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
-//
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
 // Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
+//
+//
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
+		Type: "ExperimentalThreadPoolDataset",
 		Input: []tf.Input{
-			sizes, tf.OutputList(values),
+			input_dataset, thread_pool,
 		},
 		Attrs: attrs,
 	}
@@ -39329,116 +40137,132 @@
 	return op.Output(0)
 }
 
-// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
-type DecodeProtoV2Attr func(optionalAttr)
-
-// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// value: Either the special value `local://` or a path to a file containing
-// a serialized `FileDescriptorSet`.
-// If not specified, defaults to "local://"
-func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// value: Either `binary` or `text`.
-// If not specified, defaults to "binary"
-func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["message_format"] = value
-	}
-}
-
-// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+// The output is computed as follows:
 //
-// value: Whether to sanitize the result or not.
-// If not specified, defaults to false
-func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["sanitize"] = value
-	}
-}
-
-// The op extracts fields from a serialized protocol buffers message into tensors.
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// The `decode_proto` op extracts fields from a serialized protocol buffers
-// message into tensors.  The fields in `field_names` are decoded and converted
-// to the corresponding `output_types` if possible.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// Each output tensor is a dense tensor. This means that it is padded to
-// hold the largest number of repeated elements seen in the input
-// minibatch. (The shape is also padded by one to prevent zero-sized
-// dimensions). The actual repeat counts for each example in the
-// minibatch can be found in the `sizes` output. In many cases the output
-// of `decode_proto` is fed immediately into tf.squeeze if missing values
-// are not a concern. When using tf.squeeze, always pass the squeeze
-// dimension explicitly to avoid surprises.
-//
-// For the most part, the mapping between Proto field types and
-// TensorFlow dtypes is straightforward. However, there are a few
-// special cases:
-//
-// - A proto field that contains a submessage or group can only be converted
-// to `DT_STRING` (the serialized submessage). This is to reduce the
-// complexity of the API. The resulting string can be used as input
-// to another instance of the decode_proto op.
-//
-// - TensorFlow lacks support for unsigned integers. The ops represent uint64
-// types as a `DT_INT64` with the same twos-complement bit pattern
-// (the obvious way). Unsigned int32 values can be represented exactly by
-// specifying type `DT_INT64`, or using twos-complement if the caller
-// specifies `DT_INT32` in the `output_types` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// Both binary and text proto serializations are supported, and can be
-// chosen using the `format` attribute.
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
-//	bytes: Tensor of serialized protos with shape `batch_shape`.
-//	message_type: Name of the proto message type to decode.
-//	field_names: List of strings containing proto field names. An extension field can be decoded
-// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
-//	output_types: List of TF types to use for the respective field in field_names.
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-// Each entry is the number of values found for the corresponding field.
-// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
-// `values[i]` has datatype `output_types[i]`
-// and shape `[batch_shape, max(sizes[...,i])]`.
-func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeProtoV2",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			bytes,
+			input, diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains `rate` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	rate: A scalar representing the sample rate of elements from the `input_dataset`
+// that should be taken.
+//	seed: A scalar representing seed of random number generator.
+//	seed2: A scalar representing seed2 of random number generator.
+//
+//
+func SamplingDataset(scope *Scope, input_dataset tf.Output, rate tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SamplingDataset",
+		Input: []tf.Input{
+			input_dataset, rate, seed, seed2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorSliceDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gather ragged slices from `params` axis `0` according to `indices`.
+//
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
+//
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
+//
+// where
+//
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` is a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
+//
+//
+// Arguments:
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
+//
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor. The `flat_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
+	opspec := tf.OpSpec{
+		Type: "RaggedGather",
+		Input: []tf.Input{
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
 		},
 		Attrs: attrs,
 	}
@@ -39448,89 +40272,29 @@
 	}
 	var idx int
 	var err error
-	sizes = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("DecodeProtoV2", err)
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
 		return
 	}
-	return sizes, values
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
-//
-// Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
-//
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ClipByValue",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
-//
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
-		Input: []tf.Input{
-			input, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that concatenates `input_dataset` with `another_dataset`.
 func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
@@ -39548,25 +40312,20 @@
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// Returns 0 if the denominator is zero.
 //
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "DivNoNan",
 		Input: []tf.Input{
-			input_dataset, count,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -39596,60 +40355,95 @@
 	return op.Output(0)
 }
 
-// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters.
-type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
 
-// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
-// If not specified, defaults to -1
+// StageCapacity sets the optional capacity attribute to value.
 //
-// REQUIRES: value >= -1
-func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["table_id"] = value
+		m["capacity"] = value
 	}
 }
 
-// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// StageMemoryLimit sets the optional memory_limit attribute to value.
+//
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
 // If not specified, defaults to ""
-func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr {
+func StageContainer(value string) StageAttr {
 	return func(m optionalAttr) {
-		m["table_name"] = value
+		m["container"] = value
 	}
 }
 
-// Load proximal Adagrad embedding parameters.
+// StageSharedName sets the optional shared_name attribute to value.
 //
-// An op that loads optimization parameters into HBM for embedding. Must be
-// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-// embedding table configuration. For example, this op is used to install
-// parameters that are loaded from a checkpoint before a training loop is
-// executed.
+// value: This name must match the name used by the corresponding Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage values similar to a lightweight Enqueue.
+//
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
-//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
-//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
-//
-//
+//	values: a list of tensors
+// `dtypes`: A list of data types that inserted values should adhere to.
 //
 // Returns the created operation.
-func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) {
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadTPUEmbeddingProximalAdagradParameters",
+		Type: "Stage",
 		Input: []tf.Input{
-			parameters, accumulators,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
+// PrefetchDatasetAttr is an optional argument to PrefetchDataset.
+type PrefetchDatasetAttr func(optionalAttr)
+
+// PrefetchDatasetSlackPeriod sets the optional slack_period attribute to value.
+// If not specified, defaults to 0
+func PrefetchDatasetSlackPeriod(value int64) PrefetchDatasetAttr {
+	return func(m optionalAttr) {
+		m["slack_period"] = value
+	}
+}
+
 // Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
 // Arguments:
@@ -39658,11 +40452,14 @@
 // this dataset.
 //
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...PrefetchDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "PrefetchDataset",
 		Input: []tf.Input{
@@ -39674,410 +40471,37 @@
 	return op.Output(0)
 }
 
-// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
-type UnicodeEncodeAttr func(optionalAttr)
-
-// UnicodeEncodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD (U+65533).
-// If not specified, defaults to 65533
-func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
-
-// Encode a tensor of ints into unicode strings.
-//
-// Returns a vector of strings, where `output[i]` is constructed by encoding the
-// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
-// using `output_encoding`.
-//
-// ---
-//
-// Example:
-//
-// ```
-// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
-// input_splits = [0, 5, 10]
-// output_encoding = 'UTF-8'
-//
-// output = ['Hello', 'World']
-// ```
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
-//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
-//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
-// In particular, `output[i]` is constructed by encoding the codepoints in the
-// slice `input_values[input_splits[i]:input_splits[i+1]]`.
-//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
-// "UTF-16-BE", and "UTF-32-BE"`.
-//
-// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
-func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_encoding": output_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeEncode",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			input_values, input_splits,
+			tag, tensor, serialized_summary_metadata,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A dataset that creates window datasets from the input dataset.
-//
-// Arguments:
-//
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
-//
-//
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset containing the elements of the first component of `input_dataset` that have true in the last component.
+func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "WindowDataset",
+		Type: "FilterByLastComponentDataset",
 		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that batches and pads `batch_size` elements from the input.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//
-//
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
-//
-//
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
-		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset with a range of values. Corresponds to python's xrange.
-//
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
-//
-//
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "RangeDataset",
-		Input: []tf.Input{
-			start, stop, step,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
-
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
-
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
-//
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
-	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
-	}
-}
-
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
-//
-// Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
-//
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
-		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index a68e248..6a71cd1 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -134,6 +134,45 @@
 )
 
 tf_java_test(
+    name = "EagerSessionTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/EagerSessionTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.EagerSessionTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
+tf_java_test(
+    name = "EagerOperationBuilderTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/EagerOperationBuilderTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.EagerOperationBuilderTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
+tf_java_test(
+    name = "EagerOperationTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/EagerOperationTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.EagerOperationTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
+tf_java_test(
     name = "GraphTest",
     size = "small",
     srcs = ["src/test/java/org/tensorflow/GraphTest.java"],
diff --git a/tensorflow/java/src/main/java/org/tensorflow/AbstractOperation.java b/tensorflow/java/src/main/java/org/tensorflow/AbstractOperation.java
index a1d95f2..0d4745f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/AbstractOperation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/AbstractOperation.java
@@ -24,6 +24,21 @@
 abstract class AbstractOperation implements Operation {
 
   @Override
+  public Output<?>[] outputList(int idx, int length) {
+    Output<?>[] outputs = new Output<?>[length];
+    for (int i = 0; i < length; ++i) {
+      outputs[i] = output(idx + i);
+    }
+    return outputs;
+  }
+
+  @Override
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  public <T> Output<T> output(int idx) {
+    return new Output(this, idx);
+  }
+
+  @Override
   public String toString() {
     return String.format("<%s '%s'>", type(), name());
   }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java b/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java
new file mode 100644
index 0000000..e989c00
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java
@@ -0,0 +1,134 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import java.util.Arrays;
+
+/**
+ * Implementation of an {@link Operation} executed eagerly.
+ *
+ * <p>EagerOperation instances are valid only as long as the {@link EagerSession} they are a part of
+ * is valid. Thus, if {@link EagerSession#close()} has been invoked, then methods on the
+ * EagerOperation instance may fail with an {@code IllegalStateException}.
+ *
+ * <p>EagerOperation instances are thread-safe.
+ */
+class EagerOperation extends AbstractOperation {
+
+  EagerOperation(
+      EagerSession session,
+      long opNativeHandle,
+      long[] outputNativeHandles,
+      String type,
+      String name) {
+    this.session = session;
+    this.type = type;
+    this.name = name;
+    this.nativeRef = new NativeReference(session, this, opNativeHandle, outputNativeHandles);
+  }
+
+  @Override
+  public String name() {
+    return name;
+  }
+
+  @Override
+  public String type() {
+    return type;
+  }
+
+  @Override
+  public int numOutputs() {
+    return nativeRef.outputHandles.length;
+  }
+
+  @Override
+  public int outputListLength(final String name) {
+    return outputListLength(nativeRef.opHandle, name);
+  }
+
+  @Override
+  public int inputListLength(final String name) {
+    return inputListLength(nativeRef.opHandle, name);
+  }
+
+  @Override
+  public long getUnsafeNativeHandle(int outputIndex) {
+    return nativeRef.outputHandles[outputIndex];
+  }
+
+  @Override
+  public long[] shape(int outputIndex) {
+    long outputNativeHandle = getUnsafeNativeHandle(outputIndex);
+    long[] shape = new long[numDims(outputNativeHandle)];
+    for (int i = 0; i < shape.length; ++i) {
+      shape[i] = dim(outputNativeHandle, i);
+    }
+    return shape;
+  }
+
+  @Override
+  public DataType dtype(int outputIndex) {
+    long outputNativeHandle = getUnsafeNativeHandle(outputIndex);
+    return DataType.fromC(dataType(outputNativeHandle));
+  }
+
+  private static class NativeReference extends EagerSession.NativeReference {
+
+    NativeReference(
+        EagerSession session, EagerOperation operation, long opHandle, long[] outputHandles) {
+      super(session, operation);
+      this.opHandle = opHandle;
+      this.outputHandles = outputHandles;
+    }
+
+    @Override
+    void delete() {
+      if (opHandle != 0L) {
+        for (long tensorHandle : outputHandles) {
+          if (tensorHandle != 0L) {
+            EagerOperation.deleteTensorHandle(tensorHandle);
+          }
+        }
+        EagerOperation.delete(opHandle);
+        opHandle = 0L;
+        Arrays.fill(outputHandles, 0L);
+      }
+    }
+
+    private long opHandle;
+    private final long[] outputHandles;
+  }
+
+  private final EagerSession session;
+  private final NativeReference nativeRef;
+  private final String type;
+  private final String name;
+
+  private static native void delete(long handle);
+
+  private static native void deleteTensorHandle(long handle);
+
+  private static native int outputListLength(long handle, String name);
+
+  private static native int inputListLength(long handle, String name);
+
+  private static native int dataType(long handle);
+
+  private static native int numDims(long handle);
+
+  private static native long dim(long handle, int index);
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java
new file mode 100644
index 0000000..7e5a9a7
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java
@@ -0,0 +1,258 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * An {@link OperationBuilder} for building {@link Operation Operations} that are executed eagerly.
+ */
+final class EagerOperationBuilder implements OperationBuilder {
+
+  EagerOperationBuilder(EagerSession session, String type, String name) {
+    this.session = session;
+    this.type = type;
+    this.name = name;
+    this.nativeRef = new NativeReference(session, this, allocate(session.nativeHandle(), type));
+  }
+
+  @Override
+  public EagerOperation build() {
+    long[] tensorHandles = execute(nativeRef.opHandle);
+    EagerOperation operation =
+        new EagerOperation(session, nativeRef.opHandle, tensorHandles, type, name);
+    // Release our reference to the native op handle now that we transferred its
+    // ownership to the EagerOperation
+    nativeRef.clear();
+    return operation;
+  }
+
+  @Override
+  public EagerOperationBuilder addInput(Output<?> input) {
+    addInput(nativeRef.opHandle, input.getUnsafeNativeHandle());
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder addInputList(Output<?>[] inputs) {
+    long[] inputHandles = new long[inputs.length];
+    for (int i = 0; i < inputs.length; ++i) {
+      inputHandles[i] = inputs[i].getUnsafeNativeHandle();
+    }
+    addInputList(nativeRef.opHandle, inputHandles);
+    return this;
+  }
+
+  @Override
+  public OperationBuilder addControlInput(Operation control) {
+    throw new UnsupportedOperationException(
+        "Control inputs are not supported in an eager execution environment");
+  }
+
+  @Override
+  public EagerOperationBuilder setDevice(String device) {
+    setDevice(nativeRef.opHandle, device);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, String value) {
+    return setAttr(name, value.getBytes(StandardCharsets.UTF_8));
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, String[] values) {
+    Charset utf8 = StandardCharsets.UTF_8;
+    Object[] objects = new Object[values.length];
+    for (int i = 0; i < values.length; ++i) {
+      objects[i] = values[i].getBytes(utf8);
+    }
+    setAttrStringList(nativeRef.opHandle, name, objects); // pass the encoded byte[] elements
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, byte[] values) {
+    setAttrString(nativeRef.opHandle, name, values);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, long value) {
+    setAttrInt(nativeRef.opHandle, name, value);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, long[] values) {
+    setAttrIntList(nativeRef.opHandle, name, values);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, float value) {
+    setAttrFloat(nativeRef.opHandle, name, value);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, float[] values) {
+    setAttrFloatList(nativeRef.opHandle, name, values);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, boolean value) {
+    setAttrBool(nativeRef.opHandle, name, value);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, boolean[] values) {
+    setAttrBoolList(nativeRef.opHandle, name, values);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, DataType value) {
+    setAttrType(nativeRef.opHandle, name, value.c());
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, DataType[] values) {
+    int[] c = new int[values.length];
+    for (int i = 0; i < values.length; ++i) {
+      c[i] = values[i].c();
+    }
+    setAttrTypeList(nativeRef.opHandle, name, c);
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, Tensor<?> value) {
+    setAttrTensor(nativeRef.opHandle, name, value.getNativeHandle());
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, Tensor<?>[] values) {
+    // TODO (karllessard) could be supported by adding this attribute type in the eager C API
+    throw new UnsupportedOperationException(
+        "Tensor list attributes are not supported in eager mode");
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, Shape value) {
+    setAttrShape(nativeRef.opHandle, name, value.asArray(), value.numDimensions());
+    return this;
+  }
+
+  @Override
+  public EagerOperationBuilder setAttr(String name, Shape[] values) {
+    // First pass: record the dimension count of every shape and work out how many
+    // entries the flattened dimension array needs
+    int[] ranks = new int[values.length];
+    int flattenedLength = 0;
+    for (int i = 0; i < values.length; ++i) {
+      ranks[i] = values[i].numDimensions();
+      // Zero or negative counts contribute no entries
+      flattenedLength += Math.max(ranks[i], 0);
+    }
+    // Second pass: lay the dimensions of all shapes out back to back in a single array, so
+    // that the native method receives one buffer rather than one array per shape
+    long[] flattened = new long[flattenedLength];
+    int offset = 0;
+    for (Shape shape : values) {
+      if (shape.numDimensions() > 0) {
+        long[] dims = shape.asArray();
+        System.arraycopy(dims, 0, flattened, offset, dims.length);
+        offset += dims.length;
+      }
+    }
+    setAttrShapeList(nativeRef.opHandle, name, flattened, ranks);
+    return this;
+  }
+
+  private static class NativeReference extends EagerSession.NativeReference {
+
+    NativeReference(EagerSession session, EagerOperationBuilder operation, long opHandle) {
+      super(session, operation);
+      this.opHandle = opHandle;
+    }
+
+    @Override
+    public void clear() {
+      super.clear();
+      opHandle = 0L;
+    }
+
+    @Override
+    synchronized void delete() {
+      if (opHandle != 0L) {
+        EagerOperationBuilder.delete(opHandle);
+        opHandle = 0L;
+      }
+    }
+
+    private long opHandle;
+  }
+
+  private final EagerSession session;
+  private final String type;
+  private final String name;
+  private final NativeReference nativeRef;
+
+  private static native long allocate(long ctxHandle, String type);
+
+  private static native void delete(long opHandle);
+
+  private static native long[] execute(long opHandle);
+
+  private static native void addInput(long opHandle, long tensorHandle);
+
+  private static native void addInputList(long opHandle, long[] tensorHandles);
+
+  private static native void setDevice(long opHandle, String device);
+
+  private static native void setAttrString(long opHandle, String name, byte[] value);
+
+  private static native void setAttrStringList(long opHandle, String name, Object[] value);
+
+  private static native void setAttrInt(long opHandle, String name, long value);
+
+  private static native void setAttrIntList(long opHandle, String name, long[] values);
+
+  private static native void setAttrFloat(long opHandle, String name, float value);
+
+  private static native void setAttrFloatList(long opHandle, String name, float[] values);
+
+  private static native void setAttrBool(long opHandle, String name, boolean value);
+
+  private static native void setAttrBoolList(long opHandle, String name, boolean[] values);
+
+  private static native void setAttrType(long opHandle, String name, int type);
+
+  private static native void setAttrTypeList(long opHandle, String name, int[] types);
+
+  private static native void setAttrTensor(long opHandle, String name, long tensorHandle);
+
+  private static native void setAttrShape(long opHandle, String name, long[] shape, int numDims);
+
+  private static native void setAttrShapeList(
+      long opHandle, String name, long[] shapes, int[] numDims);
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/EagerSession.java b/tensorflow/java/src/main/java/org/tensorflow/EagerSession.java
new file mode 100644
index 0000000..7f36da1
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/EagerSession.java
@@ -0,0 +1,417 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import java.lang.ref.PhantomReference;
+import java.lang.ref.Reference;
+import java.lang.ref.ReferenceQueue;
+import java.util.IdentityHashMap;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+/**
+ * An environment for executing TensorFlow operations eagerly.
+ *
+ * <p>Eager execution is an imperative programming environment that evaluates operations
+ * immediately, without building graphs. Operations return concrete values instead of constructing a
+ * computational graph to run later, as with {@link Graph}s and {@link Session}s.
+ *
+ * <p>This makes it easy to develop with TensorFlow and debug models, as it behaves more like a
+ * standard programming library.
+ *
+ * <p>Instances of {@code EagerSession} are thread-safe.
+ *
+ * <p><b>WARNING:</b> Resources consumed by an {@code EagerSession} object must be explicitly freed
+ * by invoking the {@link #close()} method when it is no longer needed. This can be achieved using
+ * the `try-with-resources` technique, as in the example below:
+ *
+ * <pre>{@code
+ * try (EagerSession s = EagerSession.create()) {
+ *    // execute operations eagerly
+ * }
+ * }</pre>
+ *
+ * In addition, {@code EagerSession} objects clean up unused resources during the session, working
+ * in tandem with the JVM garbage collector. See {@link ResourceCleanupStrategy} for more details.
+ */
+public final class EagerSession implements ExecutionEnvironment, AutoCloseable {
+
+  /**
+   * Controls how to act when we try to run an operation on a given device but some input tensors
+   * are not on that device.
+   */
+  public static enum DevicePlacementPolicy {
+
+    /** Running operations with input tensors on the wrong device will fail. */
+    EXPLICIT(0),
+
+    /** Copy the tensor to the right device but log a warning. */
+    WARN(1),
+
+    /**
+     * Silently copy the tensor, which has a performance cost since the operation will be blocked
+     * till the copy completes. This is the default placement policy.
+     */
+    SILENT(2),
+
+    /** Placement policy which silently copies int32 tensors but not other dtypes. */
+    SILENT_FOR_INT32(3);
+
+    private DevicePlacementPolicy(int code) {
+      this.code = code;
+    }
+
+    private final int code;
+  }
+
+  /**
+   * Controls how TensorFlow resources are cleaned up when they are no longer needed.
+   *
+   * <p>All resources allocated during an {@code EagerSession} are deleted when the session is
+   * closed. To prevent out-of-memory errors, it is also strongly suggested to clean up those
+   * resources during the session. For example, executing n operations in a loop of m iterations
+   * will allocate a minimum of n*m resources while, in most cases, only resources of the last
+   * iteration are still being used.
+   *
+   * <p>{@code EagerSession} instances can be notified in different ways when TensorFlow objects are
+   * no longer referenced, so that the resources those objects owned can be cleaned up.
+   */
+  public static enum ResourceCleanupStrategy {
+
+    /**
+     * Monitor and delete unused resources from a new thread running in background.
+     *
+     * <p>This is the most reliable approach to cleanup TensorFlow resources, at the cost of
+     * starting and running an additional thread dedicated to this task. Each {@code EagerSession}
+     * instance has its own thread, which is stopped only when the session is closed.
+     *
+     * <p>This strategy is used by default.
+     */
+    IN_BACKGROUND,
+
+    /**
+     * Monitor and delete unused resources from existing threads, before or after they complete
+     * another task.
+     *
+     * <p>Unused resources are released when a call to the TensorFlow library reaches a safe point
+     * for cleanup. This is done synchronously and might block, for a short period of time, the
+     * thread that triggered that call.
+     *
+     * <p>This strategy should be used only if, for some reason, no additional thread should be
+     * allocated for cleanup. Otherwise, {@link #IN_BACKGROUND} should be preferred.
+     */
+    ON_SAFE_POINTS,
+
+    /**
+     * Only delete resources when the session is closed.
+     *
+     * <p>All resources allocated during the session will remain in memory until the session is
+     * explicitly closed (or via the traditional `try-with-resources` technique). No extra task for
+     * resource cleanup will be attempted.
+     *
+     * <p>This strategy can lead to out-of-memory errors and its usage is not recommended, unless
+     * the scope of the session is limited to executing only a small number of operations.
+     */
+    ON_SESSION_CLOSE,
+  }
+
+  public static class Options {
+
+    /**
+     * Controls how operations dispatched are actually executed.
+     *
+     * <p>When set to true, each operation is executed asynchronously (in which case some
+     * operations might return "non-ready" outputs). When set to false, all operations are executed
+     * synchronously.
+     *
+     * <p>Synchronous execution is used by default.
+     *
+     * @param value true for asynchronous execution, false for synchronous.
+     */
+    public Options async(boolean value) {
+      async = value;
+      return this;
+    }
+
+    /**
+     * Controls how to act when we try to run an operation on a given device but some input tensors
+     * are not on that device.
+     *
+     * <p>{@link DevicePlacementPolicy#SILENT} is used by default.
+     *
+     * @param value policy to apply
+     * @see DevicePlacementPolicy
+     */
+    public Options devicePlacementPolicy(DevicePlacementPolicy value) {
+      devicePlacementPolicy = value;
+      return this;
+    }
+
+    /**
+     * Controls how TensorFlow resources are cleaned up when no longer needed.
+     *
+     * <p>{@link ResourceCleanupStrategy#IN_BACKGROUND} is used by default.
+     *
+     * @param value strategy to use
+     * @see ResourceCleanupStrategy
+     */
+    public Options resourceCleanupStrategy(ResourceCleanupStrategy value) {
+      resourceCleanupStrategy = value;
+      return this;
+    }
+
+    /**
+     * Configures the session based on the data found in the provided buffer, which is a serialized
+     * TensorFlow config proto.
+     *
+     * <p>Warning: the support of this feature is subject to changes since TensorFlow protos might
+     * not be supported on public endpoints in the future.
+     *
+     * @param value a serialized config proto
+     * @see <a
+     *     href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto">config.proto</a>
+     */
+    public Options config(byte[] value) {
+      config = value;
+      return this;
+    }
+
+    /** Builds an eager session with the selected options. */
+    public EagerSession build() {
+      return new EagerSession(this);
+    }
+
+    private boolean async;
+    private DevicePlacementPolicy devicePlacementPolicy;
+    private ResourceCleanupStrategy resourceCleanupStrategy;
+    private byte[] config;
+
+    private Options() {
+      async = false;
+      devicePlacementPolicy = DevicePlacementPolicy.SILENT;
+      resourceCleanupStrategy = ResourceCleanupStrategy.IN_BACKGROUND;
+      config = null;
+    }
+  }
+
+  /** Returns an object that configures and builds an {@code EagerSession} with custom options. */
+  public static EagerSession.Options options() {
+    return new Options();
+  }
+
+  /** Returns an {@code EagerSession} configured with default options. */
+  public static EagerSession create() {
+    return options().build();
+  }
+
+  private EagerSession(Options options) {
+    this.nativeHandle = allocate(options.async, options.devicePlacementPolicy.code, options.config);
+    this.resourceCleanupStrategy = options.resourceCleanupStrategy;
+
+    if (resourceCleanupStrategy == ResourceCleanupStrategy.IN_BACKGROUND) {
+      nativeResources.startCleanupThread();
+    }
+  }
+
+  @Override
+  public synchronized void close() {
+    if (nativeHandle != 0L) {
+      if (resourceCleanupStrategy == ResourceCleanupStrategy.IN_BACKGROUND) {
+        nativeResources.stopCleanupThread();
+      }
+      nativeResources.deleteAll();
+      delete(nativeHandle);
+      nativeHandle = 0L;
+    }
+  }
+
+  @Override
+  public OperationBuilder opBuilder(String type, String name) {
+    if (resourceCleanupStrategy == ResourceCleanupStrategy.ON_SAFE_POINTS) {
+      nativeResources.tryCleanup();
+    }
+    checkSession();
+    return new EagerOperationBuilder(this, type, name);
+  }
+
+  long nativeHandle() {
+    checkSession();
+    return nativeHandle;
+  }
+
+  /**
+   * A reference to one or more allocated native resources.
+   *
+   * <p>Any Java objects owning native resources must declare a reference to those resources in a
+   * subclass that extends from {@code NativeReference}. When {@link NativeReference#delete()} is
+   * invoked, the resources must be freed. For example:
+   *
+   * <pre>{@code
+   * private static class NativeReference extends EagerSession.NativeReference {
+   *
+   *    NativeReference(EagerSession session, MyClass referent, long handle) {
+   *        super(session, referent);
+   *        this.handle = handle;
+   *    }
+   *
+   *    @Override
+   *    void delete() {
+   *        MyClass.nativeDelete(handle);
+   *    }
+   *
+   *    private final long handle;
+   * }
+   * }</pre>
+   *
+   * A Java object "owns" a native resource if this resource should not survive beyond the lifetime
+   * of this object.
+   *
+   * <p><b>IMPORTANT</b>: All nested subclasses of {@code NativeReference} must be declared as
+   * static, otherwise their instances will hold an implicit reference to their enclosing object,
+   * preventing the garbage collector from releasing them when they are no longer needed.
+   */
+  abstract static class NativeReference extends PhantomReference<Object> {
+
+    /** Attach a new phantom reference of {@code referent} to {@code session}. */
+    public NativeReference(EagerSession session, Object referent) {
+      super(referent, session.nativeResources.garbageQueue);
+      session.checkSession();
+      nativeResources = session.nativeResources;
+      nativeResources.attach(this);
+    }
+
+    /**
+     * Detach this reference from its current session.
+     *
+     * <p>Clearing a NativeReference does not invoke {@link #delete()}, thus won't release the
+     * native resources it refers to. It can be used when passing the ownership of those resources
+     * to another object.
+     *
+     * <p>If native resources need to be deleted as well, call {@link #delete()} explicitly.
+     */
+    @Override
+    public void clear() {
+      nativeResources.detach(this);
+      super.clear();
+    }
+
+    /** Releases all native resources owned by the referred object, now deleted. */
+    abstract void delete();
+
+    private final NativeResourceCollector nativeResources;
+  }
+
+  /**
+   * Collects native references attached to this session and releases their resources if they are no
+   * longer needed.
+   */
+  private static class NativeResourceCollector {
+
+    void attach(NativeReference nativeRef) {
+      synchronized (nativeRefs) {
+        nativeRefs.put(nativeRef, null);
+      }
+    }
+
+    void detach(NativeReference nativeRef) {
+      synchronized (nativeRefs) {
+        nativeRefs.remove(nativeRef);
+      }
+    }
+
+    void delete(NativeReference nativeRef) {
+      synchronized (nativeRefs) {
+        if (!nativeRefs.keySet().remove(nativeRef)) {
+          return; // safety check
+        }
+      }
+      nativeRef.delete();
+    }
+
+    void deleteAll() {
+      synchronized (nativeRefs) {
+        for (NativeReference nativeRef : nativeRefs.keySet()) {
+          nativeRef.delete();
+        }
+        nativeRefs.clear();
+      }
+    }
+
+    void tryCleanup() {
+      synchronized (nativeRefs) {
+        // Drain the references enqueued so far; the loop ends once poll() hands back null
+        for (Reference<?> ref = garbageQueue.poll(); ref != null; ref = garbageQueue.poll()) {
+          delete((NativeReference) ref);
+        }
+      }
+    }
+
+    synchronized void startCleanupThread() {
+      if (cleanupInBackground) {
+        return; // ignore if cleanup thread is already running
+      }
+      try {
+        cleanupInBackground = true;
+        cleanupService.execute(
+            new Runnable() {
+              @Override
+              public void run() {
+                try {
+                  while (cleanupInBackground) {
+                    NativeReference nativeRef = (NativeReference) garbageQueue.remove();
+                    delete(nativeRef);
+                  }
+                } catch (InterruptedException e) {
+                  // exit
+                }
+              }
+            });
+      } catch (Exception e) {
+        cleanupInBackground = false;
+        throw e;
+      }
+    }
+
+    void stopCleanupThread() {
+      cleanupInBackground = false;
+      cleanupService.shutdownNow(); // returns without waiting for the thread to stop
+    }
+
+    private final ExecutorService cleanupService = Executors.newSingleThreadExecutor();
+    private final Map<NativeReference, Void> nativeRefs = new IdentityHashMap<>();
+    private final ReferenceQueue<Object> garbageQueue = new ReferenceQueue<>();
+    private volatile boolean cleanupInBackground = false;
+  }
+
+  private final NativeResourceCollector nativeResources = new NativeResourceCollector();
+  private final ResourceCleanupStrategy resourceCleanupStrategy;
+  private long nativeHandle;
+
+  private void checkSession() {
+    if (nativeHandle == 0L) {
+      throw new IllegalStateException("Eager session has been closed");
+    }
+  }
+
+  private static native long allocate(boolean async, int devicePlacementPolicy, byte[] config);
+
+  private static native void delete(long handle);
+
+  static {
+    TensorFlow.init();
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/GraphOperation.java b/tensorflow/java/src/main/java/org/tensorflow/GraphOperation.java
index 31b80d3..0e43bc3 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/GraphOperation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/GraphOperation.java
@@ -76,21 +76,6 @@
   }
 
   @Override
-  public Output<?>[] outputList(int idx, int length) {
-    Output<?>[] outputs = new Output<?>[length];
-    for (int i = 0; i < length; ++i) {
-      outputs[i] = output(idx + i);
-    }
-    return outputs;
-  }
-
-  @Override
-  @SuppressWarnings({"rawtypes", "unchecked"})
-  public <T> Output<T> output(int idx) {
-    return new Output(this, idx);
-  }
-
-  @Override
   public int hashCode() {
     return Long.valueOf(getUnsafeNativeHandle()).hashCode();
   }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java
index a712204..ee4301f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java
@@ -523,7 +523,7 @@
    */
   public static Constant<String> create(Scope scope, String data, Charset charset) {
     try (Tensor<String> value = Tensor.create(data.getBytes(charset), String.class)) {
-      return createWithTensor(scope, Tensor.create(data.getBytes(charset), String.class));
+      return createWithTensor(scope, value);
     }
   }
 
diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD
index ff3b410..9707100 100644
--- a/tensorflow/java/src/main/native/BUILD
+++ b/tensorflow/java/src/main/native/BUILD
@@ -39,6 +39,7 @@
         ],
         "//conditions:default": [
             "//tensorflow/c:c_api",
+            "//tensorflow/c/eager:c_api",
             "//tensorflow/core:all_kernels",
             "//tensorflow/core:direct_session",
             "//tensorflow/core:ops",
diff --git a/tensorflow/java/src/main/native/eager_operation_builder_jni.cc b/tensorflow/java/src/main/native/eager_operation_builder_jni.cc
new file mode 100644
index 0000000..f8ed207
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_operation_builder_jni.cc
@@ -0,0 +1,335 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/main/native/eager_operation_builder_jni.h"
+
+#include <cstring>
+#include <memory>
+#include <set>
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+
+// This value must be >= the maximum number of outputs produced by any op
+#define MAX_OUTPUTS_PER_OP 8
+
+namespace {
+
+TFE_Op* requireOp(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "Operation has already been built");
+    return nullptr;
+  }
+  return reinterpret_cast<TFE_Op*>(handle);
+}
+
+TFE_Context* requireContext(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalStateException, "Context has been deleted");
+    return nullptr;
+  }
+  return reinterpret_cast<TFE_Context*>(handle);
+}
+
+TF_Tensor* requireTensor(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "close() has been called on the Tensor");
+    return nullptr;
+  }
+  return reinterpret_cast<TF_Tensor*>(handle);
+}
+
+TFE_TensorHandle* requireTensorHandle(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "Tensor handle has been deleted");
+    return nullptr;
+  }
+  return reinterpret_cast<TFE_TensorHandle*>(handle);
+}
+
+}  // namespace
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperationBuilder_allocate(
+    JNIEnv* env, jclass clazz, jlong context_handle, jstring name) {
+  static_assert(sizeof(jlong) >= sizeof(TFE_Op*),
+                "Cannot represent a C TFE_Op as a Java long");
+  TFE_Context* context = requireContext(env, context_handle);
+  if (context == nullptr) return 0;
+  const char* op_or_function_name = env->GetStringUTFChars(name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TFE_Op* op = TFE_NewOp(context, op_or_function_name, status);
+  env->ReleaseStringUTFChars(name, op_or_function_name);
+  // throwExceptionIfNotOK() returns false when the status is not OK; keep the
+  // outcome so that the status object is freed on a single path.
+  const bool created = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  // Return 0 instead of the op pointer when the status was not OK.
+  return created ? reinterpret_cast<jlong>(op) : 0;
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_delete(
+    JNIEnv* env, jclass clazz, jlong op_handle) {
+  if (op_handle == 0) return;
+  TFE_DeleteOp(reinterpret_cast<TFE_Op*>(op_handle));
+}
+
+JNIEXPORT jlongArray JNICALL Java_org_tensorflow_EagerOperationBuilder_execute(
+    JNIEnv* env, jclass clazz, jlong op_handle) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return nullptr;
+  int num_retvals = MAX_OUTPUTS_PER_OP;
+  std::unique_ptr<TFE_TensorHandle*[]> retvals(
+      new TFE_TensorHandle*[num_retvals]);
+  TF_Status* status = TF_NewStatus();
+  TFE_Execute(op, retvals.get(), &num_retvals, status);
+  const bool executed = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  if (!executed) return nullptr;
+  jlong handles[MAX_OUTPUTS_PER_OP];
+  for (int i = 0; i < num_retvals; ++i) {
+    handles[i] = reinterpret_cast<jlong>(retvals[i]);
+  }
+  // NewLongArray() yields NULL, with a Java exception pending, when the array
+  // cannot be allocated; copying into it would crash, so skip the copy then.
+  jlongArray rethandles = env->NewLongArray(num_retvals);
+  if (rethandles != nullptr) {
+    env->SetLongArrayRegion(rethandles, 0, num_retvals, handles);
+  }
+  return rethandles;
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setDevice(
+    JNIEnv* env, jclass clazz, jlong op_handle, jstring device_name) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  const char* cname = env->GetStringUTFChars(device_name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TFE_OpSetDevice(op, cname, status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  env->ReleaseStringUTFChars(device_name, cname);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_addInput(
+    JNIEnv* env, jclass clazz, jlong op_handle, jlong input_handle) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  TFE_TensorHandle* tensor_handle = requireTensorHandle(env, input_handle);
+  if (tensor_handle == nullptr) return;
+  TF_Status* status = TF_NewStatus();
+  TFE_OpAddInput(op, tensor_handle, status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_addInputList(
+    JNIEnv* env, jclass clazz, jlong op_handle, jlongArray input_handles) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  jlong* cinput_handles = env->GetLongArrayElements(input_handles, nullptr);
+  if (cinput_handles == nullptr) return;  // JVM could not provide the elements
+  size_t num_inputs = static_cast<size_t>(env->GetArrayLength(input_handles));
+  std::unique_ptr<TFE_TensorHandle*[]> tensor_handles(
+      new TFE_TensorHandle*[num_inputs]);
+  bool valid = true;
+  for (size_t i = 0; valid && i < num_inputs; ++i) {
+    tensor_handles[i] = requireTensorHandle(env, cinput_handles[i]);
+    valid = tensor_handles[i] != nullptr;
+  }
+  env->ReleaseLongArrayElements(input_handles, cinput_handles, JNI_ABORT);
+  if (!valid) return;  // requireTensorHandle() already threw for a 0 handle
+  TF_Status* status = TF_NewStatus();
+  TFE_OpAddInputList(op, tensor_handles.get(), num_inputs, status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrString(
+    JNIEnv* env, jclass clazz, jlong op_handle, jstring attr_name,
+    jbyteArray value) {
+  static_assert(sizeof(jbyte) == 1,
+                "Require Java byte to be represented as a single byte");
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  const char* cname = env->GetStringUTFChars(attr_name, nullptr);
+  jbyte* cvalue = env->GetByteArrayElements(value, nullptr);
+  TFE_OpSetAttrString(op, cname, cvalue, env->GetArrayLength(value));
+  env->ReleaseByteArrayElements(value, cvalue, JNI_ABORT);
+  env->ReleaseStringUTFChars(attr_name, cname);
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrStringList(
+    JNIEnv* env, jclass object, jlong op_handle, jstring attr_name,
+    jobjectArray values) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  const char* cname = env->GetStringUTFChars(attr_name, nullptr);
+  int num_values = env->GetArrayLength(values);
+  static_assert(sizeof(jbyte) == 1,
+                "Require Java byte to be represented as a single byte");
+  std::unique_ptr<jbyteArray[]> jarrays(new jbyteArray[num_values]);
+  std::unique_ptr<jbyte*[]> jvalues(new jbyte*[num_values]);
+  std::unique_ptr<void*[]> cvalues(new void*[num_values]);
+  std::unique_ptr<size_t[]> lengths(new size_t[num_values]);
+
+  for (int i = 0; i < num_values; ++i) {
+    jbyteArray v =
+        static_cast<jbyteArray>(env->GetObjectArrayElement(values, i));
+    jarrays[i] = v;
+    jvalues[i] = env->GetByteArrayElements(v, nullptr);
+    cvalues[i] = jvalues[i];
+    lengths[i] = static_cast<size_t>(env->GetArrayLength(v));
+  }
+  TFE_OpSetAttrStringList(op, cname, cvalues.get(), lengths.get(), num_values);
+  for (int i = 0; i < num_values; ++i) {
+    env->ReleaseByteArrayElements(jarrays[i], jvalues[i], JNI_ABORT);
+  }
+  env->ReleaseStringUTFChars(attr_name, cname);
+}
+
+#define DEFINE_SET_ATTR_SCALAR(name, jtype, ctype)                       \
+  JNIEXPORT void JNICALL                                                 \
+      Java_org_tensorflow_EagerOperationBuilder_setAttr##name(           \
+          JNIEnv* env, jclass clazz, jlong op_handle, jstring attr_name, \
+          jtype value) {                                                 \
+    static_assert(                                                       \
+        sizeof(ctype) >= sizeof(jtype),                                  \
+        "Information loss when converting between Java and C types");    \
+    TFE_Op* op = requireOp(env, op_handle);                              \
+    if (op == nullptr) return;                                           \
+    const char* cname = env->GetStringUTFChars(attr_name, nullptr);      \
+    TFE_OpSetAttr##name(op, cname, static_cast<ctype>(value));           \
+    env->ReleaseStringUTFChars(attr_name, cname);                        \
+  }
+
+#define DEFINE_SET_ATTR_LIST(name, jname, jtype, ctype)                  \
+  JNIEXPORT void JNICALL                                                 \
+      Java_org_tensorflow_EagerOperationBuilder_setAttr##name##List(     \
+          JNIEnv* env, jclass clazz, jlong op_handle, jstring attr_name, \
+          jtype##Array value) {                                          \
+    TFE_Op* op = requireOp(env, op_handle);                              \
+    if (op == nullptr) return;                                           \
+    const char* cname = env->GetStringUTFChars(attr_name, nullptr);      \
+    /* Make a copy of the array to paper over any differences */         \
+    /* in byte representations of the jtype and ctype */                 \
+    /* For example, jint vs TF_DataType. */                              \
+    /* If this copy turns out to be a problem in practice */             \
+    /* can avoid it for many types. */                                   \
+    const int n = env->GetArrayLength(value);                            \
+    std::unique_ptr<ctype[]> cvalue(new ctype[n]);                       \
+    jtype* elems = env->Get##jname##ArrayElements(value, nullptr);       \
+    for (int i = 0; i < n; ++i) {                                        \
+      cvalue[i] = static_cast<ctype>(elems[i]);                          \
+    }                                                                    \
+    TFE_OpSetAttr##name##List(op, cname, cvalue.get(), n);               \
+    env->Release##jname##ArrayElements(value, elems, JNI_ABORT);         \
+    env->ReleaseStringUTFChars(attr_name, cname);                        \
+  }
+
+#define DEFINE_SET_ATTR(name, jname, jtype, ctype) \
+  DEFINE_SET_ATTR_SCALAR(name, jtype, ctype)       \
+  DEFINE_SET_ATTR_LIST(name, jname, jtype, ctype)
+
+DEFINE_SET_ATTR(Int, Long, jlong, int64_t);
+DEFINE_SET_ATTR(Float, Float, jfloat, float);
+DEFINE_SET_ATTR(Bool, Boolean, jboolean, unsigned char);
+DEFINE_SET_ATTR(Type, Int, jint, TF_DataType);
+#undef DEFINE_SET_ATTR
+#undef DEFINE_SET_ATTR_LIST
+#undef DEFINE_SET_ATTR_SCALAR
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrTensor(
+    JNIEnv* env, jclass clazz, jlong handle, jstring attr_name,
+    jlong tensor_handle) {
+  TFE_Op* op = requireOp(env, handle);
+  if (op == nullptr) return;
+  TF_Tensor* t = requireTensor(env, tensor_handle);
+  if (t == nullptr) return;
+  const char* cname = env->GetStringUTFChars(attr_name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TFE_OpSetAttrTensor(op, cname, t, status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  env->ReleaseStringUTFChars(attr_name, cname);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrShape(
+    JNIEnv* env, jclass clazz, jlong op_handle, jstring attr_name,
+    jlongArray shape, jint num_dims) {
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  std::unique_ptr<int64_t[]> cvalue;
+  // num_dims and env->GetArrayLength(shape) are assumed to be consistent.
+  // i.e., either num_dims < 0 or num_dims == env->GetArrayLength(shape).
+  if (num_dims > 0) {
+    cvalue.reset(new int64_t[num_dims]);
+    jlong* elems = env->GetLongArrayElements(shape, nullptr);
+    for (int i = 0; i < num_dims; ++i) {
+      cvalue[i] = static_cast<int64_t>(elems[i]);
+    }
+    env->ReleaseLongArrayElements(shape, elems, JNI_ABORT);
+  }
+  const char* cname = env->GetStringUTFChars(attr_name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TFE_OpSetAttrShape(op, cname, cvalue.get(), static_cast<int>(num_dims),
+                     status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  env->ReleaseStringUTFChars(attr_name, cname);
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrShapeList(
+    JNIEnv* env, jclass clazz, jlong op_handle, jstring attr_name,
+    jlongArray shapes, jintArray num_dims) {
+  static_assert(sizeof(jlong) == sizeof(int64_t),
+                "Shape dimensions are copied bytewise from jlong to int64_t");
+  TFE_Op* op = requireOp(env, op_handle);
+  if (op == nullptr) return;
+  std::unique_ptr<int64_t[]> cshapes;
+  std::unique_ptr<const int64_t*[]> cdims;
+  std::unique_ptr<int[]> cnum_dims;
+  const int num_dims_length = env->GetArrayLength(num_dims);
+  if (num_dims_length > 0) {
+    const int shapes_length = env->GetArrayLength(shapes);
+    cshapes.reset(new int64_t[shapes_length]);
+    cdims.reset(new const int64_t*[num_dims_length]);
+    cnum_dims.reset(new int[num_dims_length]);
+    jlong* shapes_elems =
+        static_cast<jlong*>(env->GetPrimitiveArrayCritical(shapes, nullptr));
+    std::memcpy(cshapes.get(), shapes_elems, shapes_length * sizeof(jlong));
+    env->ReleasePrimitiveArrayCritical(shapes, shapes_elems, JNI_ABORT);
+    int64_t* cshapes_ptr = cshapes.get();
+    jint* num_dims_elems =
+        static_cast<jint*>(env->GetPrimitiveArrayCritical(num_dims, nullptr));
+    for (int i = 0; i < num_dims_length; ++i) {
+      cnum_dims[i] = static_cast<int>(num_dims_elems[i]);
+      cdims[i] = cshapes_ptr;  // Shape i starts where shape i-1 ended.
+      if (cnum_dims[i] > 0) cshapes_ptr += cnum_dims[i];
+    }
+    env->ReleasePrimitiveArrayCritical(num_dims, num_dims_elems, JNI_ABORT);
+  }
+  const char* cname = env->GetStringUTFChars(attr_name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TFE_OpSetAttrShapeList(op, cname, cdims.get(), cnum_dims.get(),
+                         num_dims_length, status);
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  env->ReleaseStringUTFChars(attr_name, cname);
+}
diff --git a/tensorflow/java/src/main/native/eager_operation_builder_jni.h b/tensorflow/java/src/main/native/eager_operation_builder_jni.h
new file mode 100644
index 0000000..6da891d
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_operation_builder_jni.h
@@ -0,0 +1,191 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_BUILDER_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_BUILDER_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    allocate
+ * Signature: (JLjava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperationBuilder_allocate(
+    JNIEnv *, jclass, jlong, jstring);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    delete
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_delete(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    execute
+ * Signature: (J)[J
+ */
+JNIEXPORT jlongArray JNICALL
+Java_org_tensorflow_EagerOperationBuilder_execute(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    addInput
+ * Signature: (JJ)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_addInput(
+    JNIEnv *, jclass, jlong, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    addInputList
+ * Signature: (J[J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_addInputList(
+    JNIEnv *, jclass, jlong, jlongArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setDevice
+ * Signature: (JLjava/lang/String;)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setDevice(
+    JNIEnv *, jclass, jlong, jstring);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrString
+ * Signature: (JLjava/lang/String;[B)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrString(
+    JNIEnv *, jclass, jlong, jstring, jbyteArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrStringList
+ * Signature: (JLjava/lang/String;[Ljava/lang/Object;)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrStringList(JNIEnv *, jclass,
+                                                            jlong, jstring,
+                                                            jobjectArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrInt
+ * Signature: (JLjava/lang/String;J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrInt(
+    JNIEnv *, jclass, jlong, jstring, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrIntList
+ * Signature: (JLjava/lang/String;[J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrIntList(
+    JNIEnv *, jclass, jlong, jstring, jlongArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrFloat
+ * Signature: (JLjava/lang/String;F)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrFloat(
+    JNIEnv *, jclass, jlong, jstring, jfloat);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrFloatList
+ * Signature: (JLjava/lang/String;[F)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrFloatList(JNIEnv *, jclass,
+                                                           jlong, jstring,
+                                                           jfloatArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrBool
+ * Signature: (JLjava/lang/String;Z)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrBool(
+    JNIEnv *, jclass, jlong, jstring, jboolean);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrBoolList
+ * Signature: (JLjava/lang/String;[Z)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrBoolList(JNIEnv *, jclass,
+                                                          jlong, jstring,
+                                                          jbooleanArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrType
+ * Signature: (JLjava/lang/String;I)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrType(
+    JNIEnv *, jclass, jlong, jstring, jint);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrTypeList
+ * Signature: (JLjava/lang/String;[I)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrTypeList(JNIEnv *, jclass,
+                                                          jlong, jstring,
+                                                          jintArray);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrTensor
+ * Signature: (JLjava/lang/String;J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrTensor(
+    JNIEnv *, jclass, jlong, jstring, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrShape
+ * Signature: (JLjava/lang/String;[JI)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperationBuilder_setAttrShape(
+    JNIEnv *, jclass, jlong, jstring, jlongArray, jint);
+
+/*
+ * Class:     org_tensorflow_EagerOperationBuilder
+ * Method:    setAttrShapeList
+ * Signature: (JLjava/lang/String;[J[I)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperationBuilder_setAttrShapeList(JNIEnv *, jclass,
+                                                           jlong, jstring,
+                                                           jlongArray,
+                                                           jintArray);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_BUILDER_JNI_H_
diff --git a/tensorflow/java/src/main/native/eager_operation_jni.cc b/tensorflow/java/src/main/native/eager_operation_jni.cc
new file mode 100644
index 0000000..d5545e2
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_operation_jni.cc
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/main/native/eager_operation_jni.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+
+namespace {
+
+TFE_Op* requireOp(JNIEnv* env, jlong handle) {
+  // A non-zero handle is reinterpreted as the TFE_Op pointer it encodes.
+  if (handle != 0) return reinterpret_cast<TFE_Op*>(handle);
+  // Zero handle: throw IllegalStateException and report failure with null.
+  throwException(env, kIllegalStateException,
+                 "Eager session has been closed");
+  return nullptr;
+}
+
+// Reinterprets a Java-side handle as a TFE_TensorHandle pointer.
+TFE_TensorHandle* requireTensorHandle(JNIEnv* env, jlong handle) {
+  if (handle != 0) return reinterpret_cast<TFE_TensorHandle*>(handle);
+  // Zero handle: throw IllegalStateException and report failure with null.
+  throwException(env, kIllegalStateException, "EagerSession has been closed");
+  return nullptr;
+}
+
+}  // namespace
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete(JNIEnv* env,
+                                                                 jclass clazz,
+                                                                 jlong handle) {
+  if (handle == 0) return;  // Zero handle: there is no TFE_Op to delete.
+  TFE_DeleteOp(reinterpret_cast<TFE_Op*>(handle));
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_deleteTensorHandle(
+    JNIEnv* env, jclass clazz, jlong handle) {
+  if (handle == 0) return;  // Zero handle: there is no tensor handle to delete.
+  TFE_DeleteTensorHandle(reinterpret_cast<TFE_TensorHandle*>(handle));
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_outputListLength(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name) {
+  // Returns what TFE_OpGetOutputLength reports for the output called `name`,
+  // or 0 once requireOp or throwExceptionIfNotOK has signalled a failure
+  // (zero handle, or a status that is not OK).
+  TFE_Op* eager_op = requireOp(env, handle);
+  if (eager_op == nullptr) return 0;
+  TF_Status* status = TF_NewStatus();
+  const char* output_name = env->GetStringUTFChars(name, nullptr);
+  const int length = TFE_OpGetOutputLength(eager_op, output_name, status);
+  env->ReleaseStringUTFChars(name, output_name);
+  const bool ok = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return ok ? static_cast<jint>(length) : 0;
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_inputListLength(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name) {
+  // Returns what TFE_OpGetInputLength reports for the input called `name`,
+  // or 0 once requireOp or throwExceptionIfNotOK has signalled a failure
+  // (zero handle, or a status that is not OK).
+  TFE_Op* eager_op = requireOp(env, handle);
+  if (eager_op == nullptr) return 0;
+  TF_Status* status = TF_NewStatus();
+  const char* input_name = env->GetStringUTFChars(name, nullptr);
+  const int length = TFE_OpGetInputLength(eager_op, input_name, status);
+  env->ReleaseStringUTFChars(name, input_name);
+  const bool ok = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return ok ? static_cast<jint>(length) : 0;
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_dataType(
+    JNIEnv* env, jclass clazz, jlong handle) {
+  // Yields the handle's TF_DataType value as a jint; 0 for a zero handle.
+  TFE_TensorHandle* th = requireTensorHandle(env, handle);
+  if (th == nullptr) return 0;
+  return static_cast<jint>(TFE_TensorHandleDataType(th));
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_numDims(
+    JNIEnv* env, jclass clazz, jlong handle) {
+  // Returns what TFE_TensorHandleNumDims reports for the handle, or 0 once
+  // a failure has been signalled for a zero handle or a non-OK status.
+  TFE_TensorHandle* th = requireTensorHandle(env, handle);
+  if (th == nullptr) return 0;
+  TF_Status* status = TF_NewStatus();
+  const int rank = TFE_TensorHandleNumDims(th, status);
+  // Capture the outcome first so the status is freed on a single path.
+  const bool ok = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return ok ? static_cast<jint>(rank) : 0;
+}
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim(JNIEnv* env,
+                                                               jclass clazz,
+                                                               jlong handle,
+                                                               jint dim_index) {
+  // Returns what TFE_TensorHandleDim reports for `dim_index`, or 0 once a
+  // failure has been signalled for a zero handle or a non-OK status.
+  TFE_TensorHandle* th = requireTensorHandle(env, handle);
+  if (th == nullptr) return 0;
+  TF_Status* status = TF_NewStatus();
+  const int64_t extent = TFE_TensorHandleDim(th, dim_index, status);
+  // Capture the outcome first so the status is freed on a single path.
+  const bool ok = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return ok ? static_cast<jlong>(extent) : 0;
+}
diff --git a/tensorflow/java/src/main/native/eager_operation_jni.h b/tensorflow/java/src/main/native/eager_operation_jni.h
new file mode 100644
index 0000000..732883a
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_operation_jni.h
@@ -0,0 +1,86 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    delete
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete(JNIEnv *,
+                                                                 jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    deleteTensorHandle
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_EagerOperation_deleteTensorHandle(JNIEnv *, jclass, jlong);
+
+/**
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    outputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_outputListLength(
+    JNIEnv *, jclass, jlong, jstring);
+
+/**
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    inputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_inputListLength(
+    JNIEnv *, jclass, jlong, jstring);
+
+/**
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    dataType
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_dataType(JNIEnv *,
+                                                                   jclass,
+                                                                   jlong);
+
+/**
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    numDims
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_numDims(JNIEnv *,
+                                                                  jclass,
+                                                                  jlong);
+
+/**
+ * Class:     org_tensorflow_EagerOperation
+ * Method:    dim
+ * Signature: (JI)J
+ */
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim(JNIEnv *, jclass,
+                                                               jlong, jint);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_OPERATION_JNI_H_
diff --git a/tensorflow/java/src/main/native/eager_session_jni.cc b/tensorflow/java/src/main/native/eager_session_jni.cc
new file mode 100644
index 0000000..5890520
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_session_jni.cc
@@ -0,0 +1,64 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/main/native/eager_session_jni.h"
+
+#include <cstring>
+#include <memory>
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerSession_allocate(
+    JNIEnv* env, jclass clazz, jboolean async, jint dpp, jbyteArray config) {
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  jbyte* cconfig = nullptr;
+  TF_Status* status = TF_NewStatus();
+  if (config != nullptr) {  // A null config array skips SetConfig entirely.
+    cconfig = env->GetByteArrayElements(config, nullptr);
+    TFE_ContextOptionsSetConfig(
+        opts, cconfig, static_cast<size_t>(env->GetArrayLength(config)),
+        status);
+    if (!throwExceptionIfNotOK(env, status)) {
+      env->ReleaseByteArrayElements(config, cconfig, JNI_ABORT);
+      TFE_DeleteContextOptions(opts);
+      TF_DeleteStatus(status);
+      return 0;  // Status was not OK; no context is created.
+    }
+  }
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_ContextOptionsSetDevicePlacementPolicy(
+      opts, static_cast<TFE_ContextDevicePlacementPolicy>(dpp));
+  TFE_Context* context = TFE_NewContext(opts, status);
+  TFE_DeleteContextOptions(opts);
+  if (config != nullptr) {  // Config bytes stay pinned until after NewContext.
+    env->ReleaseByteArrayElements(config, cconfig, JNI_ABORT);
+  }
+  if (!throwExceptionIfNotOK(env, status)) {
+    TF_DeleteStatus(status);
+    return 0;
+  }
+  TF_DeleteStatus(status);
+  static_assert(sizeof(jlong) >= sizeof(TFE_Context*),
+                "Cannot represent a C TFE_Context as a Java long");
+  return reinterpret_cast<jlong>(context);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerSession_delete(JNIEnv* env,
+                                                               jclass clazz,
+                                                               jlong handle) {
+  if (handle == 0) return;
+  TFE_DeleteContext(reinterpret_cast<TFE_Context*>(handle));
+}
diff --git a/tensorflow/java/src/main/native/eager_session_jni.h b/tensorflow/java/src/main/native/eager_session_jni.h
new file mode 100644
index 0000000..9f7bdac
--- /dev/null
+++ b/tensorflow/java/src/main/native/eager_session_jni.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_SESSION_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_SESSION_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     org_tensorflow_EagerSession
+ * Method:    allocate
+ * Signature: (ZI[B)J
+ */
+JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerSession_allocate(
+    JNIEnv *env, jclass clazz, jboolean async, jint dpp, jbyteArray config);
+
+/*
+ * Class:     org_tensorflow_EagerSession
+ * Method:    delete
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_EagerSession_delete(JNIEnv *, jclass,
+                                                               jlong);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EAGER_SESSION_JNI_H_
diff --git a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java
new file mode 100644
index 0000000..0f00a26
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link EagerOperationBuilder} class. */
+@RunWith(JUnit4.class)
+public class EagerOperationBuilderTest {
+
+  @Test
+  public void failToCreateIfSessionIsClosed() {
+    EagerSession session = EagerSession.create();
+    session.close();
+    try {
+      new EagerOperationBuilder(session, "Add", "add");
+      fail();
+    } catch (IllegalStateException e) {
+      // expected
+    }
+  }
+
+  @Test
+  public void failToBuildOpIfSessionIsClosed() {
+    EagerOperationBuilder opBuilder;
+    try (EagerSession session = EagerSession.create()) {
+      opBuilder = new EagerOperationBuilder(session, "Empty", "empty");
+    }
+    try {
+      opBuilder.setAttr("dtype", DataType.FLOAT);
+      fail();
+    } catch (IllegalStateException e) {
+      // expected
+    }
+  }
+
+  @Test
+  public void addInputs() {
+    try (EagerSession session = EagerSession.create()) {
+      Operation asrt =
+          opBuilder(session, "Assert", "assert")
+              .addInput(TestUtil.constant(session, "Cond", true))
+              .addInputList(new Output<?>[] {TestUtil.constant(session, "Error", -1)})
+              .build();
+      try {
+        opBuilder(session, "Const", "var").addControlInput(asrt);
+        fail();
+      } catch (UnsupportedOperationException e) {
+        // expected
+      }
+    }
+  }
+
+  @Test
+  public void setDevice() {
+    try (EagerSession session = EagerSession.create()) {
+      opBuilder(session, "Add", "SetDevice")
+          .setDevice("/job:localhost/replica:0/task:0/device:CPU:0")
+          .addInput(TestUtil.constant(session, "Const1", 2))
+          .addInput(TestUtil.constant(session, "Const2", 4))
+          .build();
+    }
+  }
+
+  @Test
+  public void setAttrs() {
+    // The effect of setting an attribute may not easily be visible from the other parts of this
+    // package's API. Thus, for now, the test simply executes the various setAttr variants to see
+    // that there are no exceptions.
+    //
+    // This is a bit of an awkward test since it has to find operations with attributes of specific
+    // types that aren't inferred from the input arguments.
+    try (EagerSession session = EagerSession.create()) {
+      // dtype, tensor attributes.
+      try (Tensor<Integer> t = Tensors.create(1)) {
+        opBuilder(session, "Const", "DataTypeAndTensor")
+            .setAttr("dtype", DataType.INT32)
+            .setAttr("value", t)
+            .build();
+      }
+      // type, int (TF "int" attributes are 64-bit signed, so a Java long).
+      opBuilder(session, "RandomUniform", "DataTypeAndInt")
+          .addInput(TestUtil.constant(session, "RandomUniformShape", new int[] {1}))
+          .setAttr("seed", 10)
+          .setAttr("dtype", DataType.FLOAT)
+          .build();
+      // list(int), string
+      opBuilder(session, "MaxPool", "IntListAndString")
+          .addInput(TestUtil.constant(session, "MaxPoolInput", new float[2][2][2][2]))
+          .setAttr("ksize", new long[] {1, 1, 1, 1})
+          .setAttr("strides", new long[] {1, 1, 1, 1})
+          .setAttr("padding", "SAME")
+          .build();
+      // list(float); no device is set here, see the setDevice() test for that.
+      opBuilder(session, "FractionalMaxPool", "FloatList")
+          .addInput(TestUtil.constant(session, "FractionalMaxPoolInput", new float[2][2][2][2]))
+          .setAttr("pooling_ratio", new float[] {1.0f, 1.44f, 1.73f, 1.0f})
+          .build();
+      // shape
+      opBuilder(session, "EnsureShape", "ShapeAttr")
+          .addInput(TestUtil.constant(session, "Const", new int[2][2]))
+          .setAttr("shape", Shape.make(2, 2))
+          .build();
+      // list(type) via the DataType[] attribute, and list(shape)
+      opBuilder(session, "FIFOQueue", "queue")
+          .setAttr("component_types", new DataType[] {DataType.INT32, DataType.INT32})
+          .setAttr("shapes", new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2)})
+          .build();
+      // bool
+      opBuilder(session, "All", "Bool")
+          .addInput(TestUtil.constant(session, "Const", new boolean[] {true, true, false}))
+          .addInput(TestUtil.constant(session, "Axis", 0))
+          .setAttr("keep_dims", false)
+          .build();
+      // float
+      opBuilder(session, "ApproximateEqual", "Float")
+          .addInput(TestUtil.constant(session, "Const1", 10.00001f))
+          .addInput(TestUtil.constant(session, "Const2", 10.00000f))
+          .setAttr("tolerance", 0.1f)
+          .build();
+      // Missing tests: list(string), list(byte), list(bool)
+    }
+  }
+
+  private static EagerOperationBuilder opBuilder(EagerSession session, String type, String name) {
+    return new EagerOperationBuilder(session, type, name);
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java
new file mode 100644
index 0000000..1dabbb7
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java
@@ -0,0 +1,128 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link EagerOperation} class. */
+@RunWith(JUnit4.class)
+public class EagerOperationTest {
+
+  @Test
+  public void failToCreateIfSessionIsClosed() {
+    EagerSession session = EagerSession.create();
+    session.close();
+    try {
+      new EagerOperation(session, 1L, new long[] {1L}, "Add", "add");
+      fail();
+    } catch (IllegalStateException e) {
+      // expected: an operation cannot be created once its session is closed
+    }
+  }
+
+  @Test
+  public void outputDataTypeAndShape() {
+    try (EagerSession session = EagerSession.create();
+        Tensor<Integer> t = Tensors.create(new int[2][3])) {
+      EagerOperation op =
+          opBuilder(session, "Const", "OutputAttrs")
+              .setAttr("dtype", DataType.INT32)
+              .setAttr("value", t)
+              .build();
+      assertEquals(DataType.INT32, op.dtype(0));
+      assertEquals(2, op.shape(0)[0]);
+      assertEquals(3, op.shape(0)[1]);
+    }
+  }
+
+  @Test
+  public void inputAndOutputListLengths() {
+    try (EagerSession session = EagerSession.create()) {
+      Output<Float> c1 = TestUtil.constant(session, "Const1", new float[] {1f, 2f});
+      Output<Float> c2 = TestUtil.constant(session, "Const2", new float[] {3f, 4f});
+
+      EagerOperation acc =
+          opBuilder(session, "AddN", "InputListLength")
+              .addInputList(new Output<?>[] {c1, c2})
+              .build();
+      assertEquals(2, acc.inputListLength("inputs"));
+      assertEquals(1, acc.outputListLength("sum"));
+
+      EagerOperation split =
+          opBuilder(session, "Split", "OutputListLength")
+              .addInput(TestUtil.constant(session, "Axis", 0))
+              .addInput(c1)
+              .setAttr("num_split", 2)
+              .build();
+      assertEquals(1, split.inputListLength("split_dim"));
+      assertEquals(2, split.outputListLength("output"));
+
+      try {
+        split.inputListLength("no_such_input");
+        fail();
+      } catch (IllegalArgumentException e) {
+        // expected
+      }
+
+      try {
+        split.outputListLength("no_such_output");
+        fail();
+      } catch (IllegalArgumentException e) {
+        // expected
+      }
+    }
+  }
+
+  @Test
+  public void numOutputs() {
+    try (EagerSession session = EagerSession.create()) {
+      EagerOperation op =
+          opBuilder(session, "UniqueWithCountsV2", "unq")
+              .addInput(TestUtil.constant(session, "Const1", new int[] {1, 2, 1}))
+              .addInput(TestUtil.constant(session, "Axis", new int[] {0}))
+              .setAttr("out_idx", DataType.INT32)
+              .build();
+      assertEquals(3, op.numOutputs());
+    }
+  }
+
+  @Test
+  public void opNotAccessibleIfSessionIsClosed() {
+    EagerSession session = EagerSession.create();
+    EagerOperation add =
+        opBuilder(session, "Add", "SetDevice")
+            .addInput(TestUtil.constant(session, "Const1", 2))
+            .addInput(TestUtil.constant(session, "Const2", 4))
+            .build();
+    assertEquals(1, add.outputListLength("z"));
+    session.close();
+    try {
+      add.outputListLength("z");
+      fail();
+    } catch (IllegalStateException e) {
+      // expected: the same query that succeeded above is rejected after the session is closed
+    }
+  }
+
+  private static EagerOperationBuilder opBuilder(EagerSession session, String type, String name) {
+    return new EagerOperationBuilder(session, type, name);
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/EagerSessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/EagerSessionTest.java
new file mode 100644
index 0000000..77f38bb
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/EagerSessionTest.java
@@ -0,0 +1,173 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.EagerSession.ResourceCleanupStrategy;
+
+@RunWith(JUnit4.class)
+public class EagerSessionTest {
+
+  @Test
+  public void closeSessionTwiceDoesNotFail() {
+    try (EagerSession s = EagerSession.create()) {
+      s.close();
+    }
+  }
+
+  @Test
+  public void cleanupResourceOnSessionClose() {
+    AtomicBoolean deleted = new AtomicBoolean();
+
+    try (EagerSession s =
+        EagerSession.options()
+            .resourceCleanupStrategy(ResourceCleanupStrategy.ON_SESSION_CLOSE)
+            .build()) {
+
+      new TestReference(s, new Object(), deleted);
+
+      assertFalse(deleted.get());
+      runGC();
+      assertFalse(deleted.get());
+
+      buildOp(s);
+      assertFalse(deleted.get()); // reaching safe point did not release resources
+    }
+    assertTrue(deleted.get());
+  }
+
+  @Test
+  public void cleanupResourceOnSafePoints() {
+    AtomicBoolean deleted = new AtomicBoolean();
+
+    try (EagerSession s =
+        EagerSession.options()
+            .resourceCleanupStrategy(ResourceCleanupStrategy.ON_SAFE_POINTS)
+            .build()) {
+
+      new TestReference(s, new Object(), deleted);
+
+      assertFalse(deleted.get());
+      runGC();
+      assertFalse(deleted.get());
+
+      buildOp(s);
+      assertTrue(deleted.get()); // reaching safe point released resources
+    }
+  }
+
+  @Test
+  public void cleanupResourceInBackground() {
+    AtomicBoolean deleted = new AtomicBoolean();
+
+    try (EagerSession s =
+        EagerSession.options()
+            .resourceCleanupStrategy(ResourceCleanupStrategy.IN_BACKGROUND)
+            .build()) {
+
+      new TestReference(s, new Object(), deleted);
+
+      assertFalse(deleted.get());
+      runGC();
+      sleep(50); // allow some time to the background thread for cleaning up resources
+      assertTrue(deleted.get());
+    }
+  }
+
+  @Test
+  public void clearedResourcesAreNotCleanedUp() {
+    AtomicBoolean deleted = new AtomicBoolean();
+
+    try (EagerSession s = EagerSession.create()) {
+      TestReference ref = new TestReference(s, new Object(), deleted);
+      ref.clear();
+    }
+    assertFalse(deleted.get());
+  }
+
+  @Test
+  public void buildingOpWithClosedSessionFails() {
+    EagerSession s = EagerSession.create();
+    s.close();
+    try {
+      buildOp(s);
+      fail();
+    } catch (IllegalStateException e) {
+      // ok: requesting an op builder from a closed session is rejected
+    }
+  }
+
+  @Test
+  public void addingReferenceToClosedSessionFails() {
+    EagerSession s = EagerSession.create();
+    s.close();
+    try {
+      new TestReference(s, new Object(), new AtomicBoolean());
+      fail();
+    } catch (IllegalStateException e) {
+      // ok: a closed session no longer accepts new native references
+    }
+  }
+
+  private static class TestReference extends EagerSession.NativeReference {
+
+    TestReference(EagerSession session, Object referent, AtomicBoolean deleted) {
+      super(session, referent);
+      this.deleted = deleted;
+    }
+
+    @Override
+    void delete() {
+      if (!deleted.compareAndSet(false, true)) {
+        fail("Reference was deleted more than once");
+      }
+    }
+
+    private final AtomicBoolean deleted;
+  }
+
+  private static void buildOp(EagerSession s) {
+    // Creating an operation is a safe point for resource cleanup
+    try {
+      s.opBuilder("Const", "Const");
+    } catch (UnsupportedOperationException e) {
+      // TODO (karllessard) remove this exception catch when EagerOperationBuilder is implemented
+    }
+  }
+
+  private static void runGC() {
+    // Warning: the garbage collector cannot be forced to run, so this is only a best-effort
+    // trigger that might be insufficient on some platforms. Adjust if cleanup tests start to fail.
+    System.gc();
+    System.runFinalization();
+  }
+
+  private static void sleep(int millis) {
+    try {
+      Thread.sleep(millis);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt(); // restore the flag rather than swallowing the interrupt
+    }
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
index 9b48f6a..c97bcaa 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
@@ -50,8 +50,14 @@
     }
   }
 
-  public static <T> Output<T> constant(Graph g, String name, Object value) {
-    return constantOp(g, name, value).<T>output(0);
+  public static <T> Output<T> constant(ExecutionEnvironment env, String name, Object value) {
+    try (Tensor<?> t = Tensor.create(value)) {
+      return env.opBuilder("Const", name)
+          .setAttr("dtype", t.dataType())
+          .setAttr("value", t)
+          .build()
+          .<T>output(0);
+    }
   }
 
   public static <T> Output<T> placeholder(Graph g, String name, Class<T> type) {
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index a0b1fff..f43b8fd 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -159,15 +159,12 @@
         "stderr_reporter.cc",
     ] + select({
         "//tensorflow:android": [
-            "nnapi_delegate.cc",
             "mmap_allocation.cc",
         ],
         "//tensorflow:windows": [
-            "nnapi_delegate_disabled.cc",
             "mmap_allocation_disabled.cc",
         ],
         "//conditions:default": [
-            "nnapi_delegate_disabled.cc",
             "mmap_allocation.cc",
         ],
     }),
@@ -181,7 +178,6 @@
         "interpreter.h",
         "model.h",
         "mutable_op_resolver.h",
-        "nnapi_delegate.h",
         "op_resolver.h",
         "optional_debug_tools.h",
         "stderr_reporter.h",
@@ -198,8 +194,8 @@
         ":version",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
+        "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
         "//tensorflow/lite/nnapi:nnapi_implementation",
-        "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
     ] + select({
         ":with_select_tf_ops": [
@@ -255,6 +251,7 @@
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
+        "//third_party/eigen3",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index babddcd3..582ec71 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -307,6 +307,7 @@
         "resolve_constant_strided_slice",
         "reverse_sequence",
         "reverse_v2",
+        "round",
         "rsqrt",
         "shape",
         "sigmoid",
@@ -346,7 +347,6 @@
     if conversion_mode == "toco-flex":
         return [
             "lstm",  # TODO(b/117510976): Restore when lstm flex conversion works.
-            "unroll_batch_matmul",  # TODO(b/123030774): Fails in 1.13 tests.
             "unidirectional_sequence_lstm",
             "unidirectional_sequence_rnn",
         ]
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index 914fd7f..4e86e4b 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -141,6 +141,7 @@
   kTfLiteBuiltinMatrixDiag = 113,
   kTfLiteBuiltinQuantize = 114,
   kTfLiteBuiltinMatrixSetDiag = 115,
+  kTfLiteBuiltinRound = 116,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index f20ee23..926d992 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -172,6 +172,8 @@
       return "COMPLEX64";
     case kTfLiteString:
       return "STRING";
+    case kTfLiteFloat16:
+      return "FLOAT16";
   }
   return "Unknown type";
 }
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 1560a6f..1948e1b 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -44,10 +44,11 @@
 // need. Access to the external contexts is controled by one of the
 // corresponding support files.
 typedef enum {
-  kTfLiteEigenContext = 0,     // include eigen_support.h to use.
-  kTfLiteGemmLowpContext = 1,  // include gemm_support.h to use.
-  kTfLiteEdgeTpuContext = 2,   // Placeholder for Edge TPU support.
-  kTfLiteMaxExternalContexts = 3
+  kTfLiteEigenContext = 0,       // include eigen_support.h to use.
+  kTfLiteGemmLowpContext = 1,    // include gemm_support.h to use.
+  kTfLiteEdgeTpuContext = 2,     // Placeholder for Edge TPU support.
+  kTfLiteCpuBackendContext = 3,  // include cpu_backend_support.h to use.
+  kTfLiteMaxExternalContexts = 4
 } TfLiteExternalContextType;
 
 struct TfLiteContext;
@@ -194,6 +195,11 @@
   float re, im;  // real and imaginary parts, respectively.
 } TfLiteComplex64;
 
+// Half precision data type compatible with the C99 definition.
+typedef struct {
+  uint16_t data;
+} TfLiteFloat16;
+
 // Types supported by tensor
 typedef enum {
   kTfLiteNoType = 0,
@@ -206,6 +212,7 @@
   kTfLiteInt16 = 7,
   kTfLiteComplex64 = 8,
   kTfLiteInt8 = 9,
+  kTfLiteFloat16 = 10,
 } TfLiteType;
 
 // Return the name of a given type, for error reporting purposes.
@@ -258,6 +265,8 @@
   int32_t* i32;
   int64_t* i64;
   float* f;
+  // Placeholder for 16b float type: TfLiteFloat16 wraps a raw uint16_t for now.
+  TfLiteFloat16* f16;
   char* raw;
   const char* raw_const;
   uint8_t* uint8;
diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
index d01cf63..9a37cd9 100644
--- a/tensorflow/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -78,6 +78,7 @@
   };
   EXPECT_EQ(type_name(kTfLiteNoType), "NOTYPE");
   EXPECT_EQ(type_name(kTfLiteFloat32), "FLOAT32");
+  EXPECT_EQ(type_name(kTfLiteFloat16), "FLOAT16");
   EXPECT_EQ(type_name(kTfLiteInt16), "INT16");
   EXPECT_EQ(type_name(kTfLiteInt32), "INT32");
   EXPECT_EQ(type_name(kTfLiteUInt8), "UINT8");
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
index db6b4a2..17eeed6 100644
--- a/tensorflow/lite/core/api/BUILD
+++ b/tensorflow/lite/core/api/BUILD
@@ -17,6 +17,7 @@
         "error_reporter.h",
         "flatbuffer_conversions.h",
         "op_resolver.h",
+        "profiler.h",
     ],
     copts = tflite_copts(),
     deps = [
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 0ff207b..9d496f6 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -61,9 +61,8 @@
       *type = kTfLiteFloat32;
       break;
     case TensorType_FLOAT16:
-      error_reporter->Report("Unimplemented data type float16 in tensor\n",
-                             tensor_type);
-      return kTfLiteError;
+      *type = kTfLiteFloat16;
+      break;
     case TensorType_INT16:
       *type = kTfLiteInt16;
       break;
@@ -728,6 +727,7 @@
     case BuiltinOperator_RELU:
     case BuiltinOperator_RELU6:
     case BuiltinOperator_RELU_N1_TO_1:
+    case BuiltinOperator_ROUND:
     case BuiltinOperator_RSQRT:
     case BuiltinOperator_SELECT:
     case BuiltinOperator_SIN:
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 4a5de48..c7f8c1a 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -141,6 +141,13 @@
   EXPECT_EQ(kTfLiteFloat32, type);
 }
 
+TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeFloat16) {
+  TfLiteType type;
+  EXPECT_EQ(kTfLiteOk,
+            ConvertTensorType(TensorType_FLOAT16, &type, &mock_reporter_));
+  EXPECT_EQ(kTfLiteFloat16, type);
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/core/api/profiler.h b/tensorflow/lite/core/api/profiler.h
new file mode 100644
index 0000000..f36f8e1
--- /dev/null
+++ b/tensorflow/lite/core/api/profiler.h
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_API_PROFILER_H_
+#define TENSORFLOW_LITE_CORE_API_PROFILER_H_
+
+#include <cstdint>
+
+namespace tflite {
+
+// A simple utility for enabling profiled event tracing in TensorFlow Lite.
+class Profiler {
+ public:
+  enum class EventType {
+    // Default event type, the metadata field has no special significance.
+    DEFAULT = 0,
+    // The event is an operator invocation and the event_metadata field is the
+    // index of operator node.
+    OPERATOR_INVOKE_EVENT = 1
+  };
+
+  virtual ~Profiler() {}
+
+  // Signals the beginning of an event, returning a handle to the profile event.
+  virtual uint32_t BeginEvent(const char* tag, EventType event_type,
+                              uint32_t event_metadata) = 0;
+
+  // Signals an end to the specified profile event.
+  virtual void EndEvent(uint32_t event_handle) = 0;
+};
+
+// Adds a profile event to `profiler` that begins with the construction
+// of the object and ends when the object goes out of scope.
+// The lifetime of tag should be at least the lifetime of `profiler`.
+// `profiler` may be null, in which case nothing is profiled.
+class ScopedProfile {
+ public:
+  ScopedProfile(Profiler* profiler, const char* tag,
+                Profiler::EventType event_type = Profiler::EventType::DEFAULT,
+                uint32_t event_metadata = 0)
+      : profiler_(profiler), event_handle_(0) {
+    if (profiler) {
+      event_handle_ = profiler_->BeginEvent(tag, event_type, event_metadata);
+    }
+  }
+
+  ~ScopedProfile() {
+    if (profiler_) {
+      profiler_->EndEvent(event_handle_);
+    }
+  }
+
+ private:
+  Profiler* const profiler_;
+  uint32_t event_handle_;
+};
+
+class ScopedOperatorProfile : public ScopedProfile {
+ public:
+  ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index)
+      : ScopedProfile(profiler, tag, Profiler::EventType::OPERATOR_INVOKE_EVENT,
+                      static_cast<uint32_t>(node_index)) {}
+};
+
+}  // namespace tflite
+
+#define TFLITE_VARNAME_UNIQ_IMPL(name, ctr) name##ctr  // ctr arrives expanded
+#define TFLITE_VARNAME_UNIQ(name, ctr) TFLITE_VARNAME_UNIQ_IMPL(name, ctr)
+#define TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index)     \
+  tflite::ScopedOperatorProfile TFLITE_VARNAME_UNIQ(_profile_, __COUNTER__)( \
+      (profiler), (tag), (node_index))
+#define TFLITE_SCOPED_OPERATOR_PROFILE(profiler, node_index) \
+  TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE((profiler), "OpInvoke", (node_index))
+#endif  // TENSORFLOW_LITE_CORE_API_PROFILER_H_
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 59898b2..afa2d63 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -14,11 +14,14 @@
 ==============================================================================*/
 
 #include "tensorflow/lite/core/subgraph.h"
+
+#include <complex>
+
 #include "tensorflow/lite/arena_planner.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/graph_info.h"
-#include "tensorflow/lite/nnapi_delegate.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
@@ -466,6 +469,9 @@
     case kTfLiteInt8:
       *bytes = sizeof(int8_t) * count;
       break;
+    case kTfLiteFloat16:
+      *bytes = sizeof(TfLiteFloat16) * count;
+      break;
     default:
       ReportError(
           "Only float32, int8, int16, int32, int64, uint8, bool, complex64 "
@@ -675,18 +681,11 @@
     return kTfLiteError;
   }
 
-  if (nnapi_delegate_) {
-    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
-      TF_LITE_ENSURE_OK(context_, nnapi_delegate_->Invoke(this));
-      return kTfLiteOk;
-    } else {
-      // TODO(aselle): In the future, we would like this to be an
-      // automatic tflite CPU fallback.
-      ReportError(
-          "NNAPI was requested, but dependent sized tensors "
-          "being used.\n");
-      return kTfLiteError;
-    }
+  // This is only needed for UseNNAPI(true);
+  if (should_apply_nnapi_delegate_ && !applied_nnapi_delegate_) {
+    TF_LITE_ENSURE_OK(context_, ModifyGraphWithDelegate(NnApiDelegate()));
+    // only need to modify the graph once upon the first invocation.
+    applied_nnapi_delegate_ = true;
   }
 
   // Invocations are always done in node order.
@@ -704,7 +703,7 @@
     TfLiteNode& node = nodes_and_registration_[node_index].first;
     const TfLiteRegistration& registration =
         nodes_and_registration_[node_index].second;
-    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
+    TFLITE_SCOPED_OPERATOR_PROFILE(profiler_, node_index);
 
     // TODO(ycling): This is an extra loop through inputs to check if the data
     // need to be copied from Delegate buffer to raw memory, which is often not
@@ -970,14 +969,12 @@
 }
 
 void Subgraph::UseNNAPI(bool enable) {
-  // TODO(aselle): This is a workaround for finding if NNAPI exists.
-  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
-  // prefixed.
-  if (!NNAPIDelegate::IsSupported()) enable = false;
-  if (!enable) {
-    nnapi_delegate_.reset();
-  } else if (!nnapi_delegate_) {
-    nnapi_delegate_.reset(new NNAPIDelegate);
+  // Note that there is no way to disable the delegate once it modified the
+  // graph.
+  if (applied_nnapi_delegate_ && !enable) {
+    ReportError("Attempting to disable NNAPI delegate after it's applied.");
+  } else {
+    should_apply_nnapi_delegate_ = enable;
   }
 }
 
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index cf1e834..b20cd06 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -20,8 +20,9 @@
 
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/profiler.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/memory_planner.h"
-#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
@@ -275,12 +276,12 @@
   // WARNING: This is an experimental API and subject to change.
   TfLiteStatus ResetVariableTensors();
 
-  void SetProfiler(profiling::Profiler* profiler) {
+  void SetProfiler(Profiler* profiler) {
     profiler_ = profiler;
     context_->profiler = profiler;
   }
 
-  profiling::Profiler* GetProfiler() { return profiler_; }
+  Profiler* GetProfiler() { return profiler_; }
 
   // Returns a pointer to vector of subgraphs.
   // WARNING: This is an experimental API and subject to change.
@@ -511,8 +512,9 @@
   // TODO(aselle): replace execution_plan_ with this.
   std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
 
-  // Whether to delegate to NN API
-  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+  // Whether to use delegate to modify the graph.
+  bool should_apply_nnapi_delegate_ = false;
+  bool applied_nnapi_delegate_ = false;
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
 
@@ -525,7 +527,7 @@
   TfLiteExternalContext** external_contexts_;
 
   // Profiler for this interpreter instance.
-  profiling::Profiler* profiler_ = nullptr;
+  Profiler* profiler_ = nullptr;
 
   // A pointer to vector of subgraphs. The vector is owned by the interpreter.
   std::vector<std::unique_ptr<Subgraph>>* subgraphs_ = nullptr;
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 185a59e..43c3d5f 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -15,7 +15,6 @@
     hdrs = ["buffer_map.h"],
     deps = [
         ":util",
-        "//tensorflow/c:c_api_internal",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite:string",
         "//tensorflow/lite:string_util",
@@ -24,6 +23,7 @@
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "//tensorflow/c:c_api_internal",
             "//tensorflow/core:framework",
             "//tensorflow/core:protos_all_cc",
         ],
@@ -55,13 +55,13 @@
     deps = [
         ":delegate_data",
         ":delegate_only_runtime",
-        "//tensorflow/lite/c:c_api_internal",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib",
         ],
         "//conditions:default": [
             "//tensorflow/core:tensorflow",
+            "//tensorflow/lite/c:c_api_internal",
         ],
     }),
     alwayslink = 1,
@@ -120,12 +120,12 @@
     deps = [
         ":buffer_map",
         "@com_google_absl//absl/memory",
-        "//tensorflow/core/common_runtime/eager:context",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:lib",
         ],
@@ -154,14 +154,11 @@
         ":delegate_data",
         ":util",
         "@flatbuffers",
+        "//tensorflow/lite/core/api",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:string",
         "//tensorflow/lite/kernels:kernel_util",
-        "//tensorflow/lite/profiling:profiler",
-        "//tensorflow/core/common_runtime/eager:context",
-        "//tensorflow/core/common_runtime/eager:execute",
-        "//tensorflow/core/common_runtime/eager:tensor_handle",
     ] + select({
         # TODO(b/111881878): The android_tensorflow_lib target pulls in the full
         # set of core TensorFlow kernels. We may want to revisit this dependency
@@ -170,6 +167,9 @@
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "//tensorflow/core/common_runtime/eager:context",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:tensor_handle",
             "//tensorflow/core:lib",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:framework",
@@ -216,7 +216,6 @@
     srcs = ["util.cc"],
     hdrs = ["util.h"],
     deps = [
-        "//tensorflow/c:c_api_internal",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite:kernel_api",
     ] + select({
@@ -224,6 +223,7 @@
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "//tensorflow/c:c_api_internal",
             "//tensorflow/core:lib",
             "//tensorflow/core:framework",
         ],
diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc
index 0d0c953..1f6df9a 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map.cc
@@ -15,11 +15,12 @@
 #include "tensorflow/lite/delegates/flex/buffer_map.h"
 
 #include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/log_memory.h"
+#include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/string.h"
 #include "tensorflow/lite/string_util.h"
-#include "tensorflow/core/framework/allocation_description.pb.h"
-#include "tensorflow/core/framework/log_memory.h"
 
 namespace tflite {
 namespace flex {
@@ -99,8 +100,9 @@
 
   ~StringTfLiteTensorBuffer() override {
     LogDeallocation();
-    tensorflow::cpu_allocator()->Deallocate<string>(
-        static_cast<string*>(data()), num_strings_);
+    tensorflow::TypedAllocator::Deallocate<tensorflow::string>(
+        tensorflow::cpu_allocator(), static_cast<tensorflow::string*>(data()),
+        num_strings_);
   }
 
   size_t size() const override { return num_strings_ * sizeof(string); }
@@ -109,7 +111,9 @@
   StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings)
       : BaseTfLiteTensorBuffer(
             num_strings != 0
-                ? tensorflow::cpu_allocator()->Allocate<string>(num_strings)
+                ? tensorflow::TypedAllocator::Allocate<tensorflow::string>(
+                      tensorflow::cpu_allocator(), num_strings,
+                      tensorflow::AllocationAttributes())
                 : nullptr),
         num_strings_(num_strings) {
     LogAllocation();
diff --git a/tensorflow/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc
index 87f3769..1c036c2 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data.cc
@@ -22,7 +22,9 @@
 namespace flex {
 DelegateData::DelegateData() {}
 
-DelegateData::~DelegateData() {}
+DelegateData::~DelegateData() {
+  if (eager_context_) eager_context_->Unref();
+}
 
 tensorflow::Status DelegateData::Prepare(
     const tensorflow::SessionOptions& session_options) {
@@ -40,10 +42,10 @@
   // Note that Rendezvous is ref-counted so it will be automatically deleted.
   tensorflow::Rendezvous* rendezvous =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
-  eager_context_.reset(new tensorflow::EagerContext(
+  eager_context_ = new tensorflow::EagerContext(
       session_options,
       tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT,
-      /*async=*/false, std::move(device_mgr), rendezvous));
+      /*async=*/false, std::move(device_mgr), rendezvous);
   return tensorflow::Status();
 }
 
diff --git a/tensorflow/lite/delegates/flex/delegate_data.h b/tensorflow/lite/delegates/flex/delegate_data.h
index 20d6b40..5f88cfb 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.h
+++ b/tensorflow/lite/delegates/flex/delegate_data.h
@@ -39,7 +39,7 @@
 
   // The EagerContext that is required for execution of Flex Ops.
   // Note: The context is lazily created after the first call to |Prepare()|.
-  tensorflow::EagerContext* GetEagerContext() { return eager_context_.get(); }
+  tensorflow::EagerContext* GetEagerContext() { return eager_context_; }
 
   // Map from TF Lite tensor index to TensorFlow tensor for a given context.
   BufferMap* GetBufferMap(const TfLiteContext* context) {
@@ -48,7 +48,7 @@
 
  private:
   // Will be null until Prepare() is called and completes successfully.
-  std::unique_ptr<tensorflow::EagerContext> eager_context_;
+  tensorflow::EagerContext* eager_context_ = nullptr;
   // TODO(b/112439500): Clean up stale BufferMap instances after adding the
   // necessary cleanup hook from a TfLiteContext to a TfLiteDelegate.
   std::unordered_map<const TfLiteContext*, BufferMap> buffer_map_;
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 6443e9f..4f3d0f1 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -24,10 +24,10 @@
 #include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/core/api/profiler.h"
 #include "tensorflow/lite/delegates/flex/delegate_data.h"
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/string.h"
 
 // Note: this is part of TF Lite's Flex delegation code which is to be
@@ -529,8 +529,8 @@
 
   // Execute the TensorFlow Ops sequentially.
   for (auto& node_data : op_data->nodes) {
-    SCOPED_TAGGED_OPERATOR_PROFILE(
-        reinterpret_cast<profiling::Profiler*>(context->profiler),
+    TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE(
+        reinterpret_cast<Profiler*>(context->profiler),
         node_data->name().c_str(), node_data->index());
 
     auto status = ExecuteFlexOp(context, buffer_map, node_data.get());
diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc
index c995b36..4279f4a 100644
--- a/tensorflow/lite/delegates/flex/util.cc
+++ b/tensorflow/lite/delegates/flex/util.cc
@@ -60,6 +60,8 @@
       return TF_FLOAT;
     case kTfLiteFloat32:
       return TF_FLOAT;
+    case kTfLiteFloat16:
+      return TF_HALF;
     case kTfLiteInt16:
       return TF_INT16;
     case kTfLiteInt32:
@@ -83,6 +85,8 @@
   switch (type) {
     case TF_FLOAT:
       return kTfLiteFloat32;
+    case TF_HALF:
+      return kTfLiteFloat16;
     case TF_INT16:
       return kTfLiteInt16;
     case TF_INT32:
diff --git a/tensorflow/lite/delegates/flex/util_test.cc b/tensorflow/lite/delegates/flex/util_test.cc
index 8710475..69bba40 100644
--- a/tensorflow/lite/delegates/flex/util_test.cc
+++ b/tensorflow/lite/delegates/flex/util_test.cc
@@ -101,9 +101,9 @@
 
   EXPECT_EQ(
       CopyShapeAndType(&context, Tensor(tensorflow::DT_HALF, {1, 2}), &dst),
-      kTfLiteError);
-  EXPECT_EQ(context.error,
-            "TF Lite does not support TensorFlow data type: half");
+      kTfLiteOk);
+  EXPECT_THAT(context.new_size, ElementsAre(1, 2));
+  EXPECT_EQ(dst.type, kTfLiteFloat16);
 }
 
 TEST(UtilTest, TypeConversionsFromTFLite) {
diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD
index e43e938..33f5a86 100644
--- a/tensorflow/lite/delegates/gpu/BUILD
+++ b/tensorflow/lite/delegates/gpu/BUILD
@@ -85,9 +85,9 @@
     ],
 )
 
-# build -c opt --config android_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtflite_gpu_gl.so
+# build -c opt --config android_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtensorflowlite_gpu_gl.so
 cc_binary(
-    name = "libtflite_gpu_gl.so",
+    name = "libtensorflowlite_gpu_gl.so",
     linkopts = select({
         "//tensorflow:android": [
             "-lEGL",
@@ -104,9 +104,9 @@
     deps = [":gl_delegate"],
 )
 
-# build -c opt --config ios_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtflite_gpu_metal.so
+# build -c opt --config ios_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtensorflowlite_gpu_metal.so
 cc_binary(
-    name = "libtflite_gpu_metal.so",
+    name = "libtensorflowlite_gpu_metal.so",
     linkshared = 1,
     linkstatic = 1,
     tags = [
diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.cc b/tensorflow/lite/delegates/gpu/common/memory_management.cc
index 09b40aa..73a27a3 100644
--- a/tensorflow/lite/delegates/gpu/common/memory_management.cc
+++ b/tensorflow/lite/delegates/gpu/common/memory_management.cc
@@ -32,7 +32,7 @@
   PoolRecord(uint32_t size, size_t obj_id)
       : object_size(size), object_id(obj_id) {}
 
-  // objects in pool are ordered by size
+  // Objects in pool are ordered by size.
   bool operator<(const PoolRecord& other) const {
     return (object_size < other.object_size) ||
            (object_size == other.object_size && object_id < other.object_id);
diff --git a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
index 93897ea..6fd8b21 100644
--- a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
+++ b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
@@ -22,8 +22,6 @@
 namespace gpu {
 namespace {
 
-// using ::testing::Eq;  // Optional ::testing aliases. Remove if unused.
-// using ::testing::Test;
 using ::testing::ElementsAre;
 
 TEST(Model, EmptyRecords) {
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index 9c1d5d1..32b7506 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -118,7 +118,7 @@
 template <>
 Status SetAllDimensions<HWC>(const TfLiteIntArray* dimensions, HWC* shape) {
   if (dimensions->size != 4) {
-    return InvalidArgumentError("Dimensions are not BHWC");
+    return InvalidArgumentError("Dimensions are not HWC");
   }
   if (dimensions->data[0] != 1) {
     return UnimplementedError("Batch size is not equal to 1.");
@@ -1746,6 +1746,90 @@
   }
 };
 
+// Parses SPACE_TO_BATCH; block and paddings come from reader tensors 1 and 2.
+class SpaceToBatchOperationParser : public TFLiteOperationParser {
+ public:
+  Status IsSupported(const TfLiteContext* context,
+                     const TfLiteNode* tflite_node,
+                     const TfLiteRegistration* registration) final {
+    return OkStatus();
+  }
+  Status Parse(const TfLiteNode* tflite_node,
+               const TfLiteRegistration* registration, GraphFloat32* graph,
+               ObjectReader* reader) final {
+    auto* node = graph->NewNode();
+    node->operation.type = ToString(OperationType::SPACE_TO_BATCH);
+    RETURN_IF_ERROR(reader->AddInput(node, 0));
+    RETURN_IF_ERROR(reader->AddOutputs(node));
+    SpaceToBatchAttributes sb_attr;
+    Tensor<Linear, DataType::INT32> block;
+    RETURN_IF_ERROR(reader->ReadTensor(1, &block));
+    if (block.shape.v != 2) {
+      return InternalError("Space has to be HxW.");
+    }
+    sb_attr.block.h = block.data[0];
+    sb_attr.block.w = block.data[1];
+
+    Tensor<HW, DataType::INT32> padding;
+    RETURN_IF_ERROR(reader->ReadTensor(2, &padding));
+    // All four entries are read below, so both dimensions must equal 2.
+    if (padding.shape.h != 2 || padding.shape.w != 2) {
+      return InternalError("Space has to be HxW.");
+    }
+
+    // Flattened 2x2 layout: {h before, h after, w before, w after}.
+    sb_attr.padding.prepended.h = padding.data[0];
+    sb_attr.padding.appended.h = padding.data[1];
+    sb_attr.padding.prepended.w = padding.data[2];
+    sb_attr.padding.appended.w = padding.data[3];
+
+    node->operation.attributes = std::move(sb_attr);
+    return OkStatus();
+  }
+};
+
+// Parses BATCH_TO_SPACE; block and crops come from reader tensors 1 and 2.
+class BatchToSpaceOperationParser : public TFLiteOperationParser {
+ public:
+  Status IsSupported(const TfLiteContext* context,
+                     const TfLiteNode* tflite_node,
+                     const TfLiteRegistration* registration) final {
+    return OkStatus();
+  }
+  Status Parse(const TfLiteNode* tflite_node,
+               const TfLiteRegistration* registration, GraphFloat32* graph,
+               ObjectReader* reader) final {
+    auto* node = graph->NewNode();
+    node->operation.type = ToString(OperationType::BATCH_TO_SPACE);
+    RETURN_IF_ERROR(reader->AddInput(node, 0));
+    RETURN_IF_ERROR(reader->AddOutputs(node));
+    BatchToSpaceAttributes bs_attr;
+    Tensor<Linear, DataType::INT32> block;
+    RETURN_IF_ERROR(reader->ReadTensor(1, &block));
+    if (block.shape.v != 2) {
+      return InternalError("Space has to be HxW.");
+    }
+    bs_attr.block.h = block.data[0];
+    bs_attr.block.w = block.data[1];
+
+    Tensor<HW, DataType::INT32> crop;
+    RETURN_IF_ERROR(reader->ReadTensor(2, &crop));
+    // All four entries are read below, so both dimensions must equal 2.
+    if (crop.shape.h != 2 || crop.shape.w != 2) {
+      return InternalError("Space has to be HxW.");
+    }
+
+    // Flattened 2x2 layout: {h before, h after, w before, w after}.
+    bs_attr.crop.prepended.h = crop.data[0];
+    bs_attr.crop.appended.h = crop.data[1];
+    bs_attr.crop.prepended.w = crop.data[2];
+    bs_attr.crop.appended.w = crop.data[3];
+
+    node->operation.attributes = std::move(bs_attr);
+    return OkStatus();
+  }
+};
+
 class UnsupportedOperationParser : public TFLiteOperationParser {
   Status IsSupported(const TfLiteContext* context,
                      const TfLiteNode* tflite_node,
diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc
index bb03283..f7f9d1b 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.cc
+++ b/tensorflow/lite/delegates/gpu/common/operations.cc
@@ -36,6 +36,14 @@
 
 bool Padding2D::operator!=(const Padding2D& value) { return !(*this == value); }
 
+Padding2D& Padding2D::operator-(const Padding2D& value) {
+  appended.h -= value.appended.h;
+  appended.w -= value.appended.w;
+  prepended.h -= value.prepended.h;
+  prepended.w -= value.prepended.w;
+  return *this;  // Updated in place: the left operand itself is modified.
+}
+
 std::string ToString(enum OperationType op) {
   switch (op) {
     case OperationType::UNKNOWN:
@@ -46,8 +54,8 @@
       return "add";
     case OperationType::APPLY_MASK:
       return "apply_mask";
-    case OperationType::SUB:
-      return "subtract";
+    case OperationType::BATCH_TO_SPACE:
+      return "batch_to_space";
     case OperationType::POOLING_2D:
       return "pooling_2d";
     case OperationType::MAX_UNPOOLING_2D:
@@ -92,12 +100,16 @@
       return "slice";
     case OperationType::SOFT_MAX:
       return "soft_max";
+    case OperationType::SPACE_TO_BATCH:
+      return "space_to_batch";
     case OperationType::SQRT:
       return "sqrt";
     case OperationType::SQUARE:
       return "square";
     case OperationType::SQUARED_DIFF:
       return "squared_diff";
+    case OperationType::SUB:
+      return "subtract";
     case OperationType::UPSAMPLE_2D:
       return "upsample_2d";
     case OperationType::CONVOLUTION_TRANSPOSED:
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index eebe1ee..ef82537 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -36,6 +36,7 @@
   ADD,
   // TODO(eignasheva): remove APPLY_MASK operation, is should be just MUL
   APPLY_MASK,
+  BATCH_TO_SPACE,
   BATCH_NORMALIZATION,
   CONCAT,
   CONST,
@@ -62,6 +63,7 @@
   SIN,
   SLICE,
   SOFT_MAX,
+  SPACE_TO_BATCH,
   SQRT,
   SQUARE,
   SQUARED_DIFF,
@@ -79,6 +81,7 @@
   Padding2D& operator=(const Padding2D& value);
   bool operator==(const Padding2D& value);
   bool operator!=(const Padding2D& value);
+  Padding2D& operator-(const Padding2D& value);
 
   // Padding values for every axis (if needed), where 'prepended' defines
   // padding for the beginning of each axis and 'appended' represents end part
@@ -87,6 +90,18 @@
   HW appended = HW(-1, -1);
 };
 
+struct Crop2D : public Padding2D {};  // Crop amounts; same layout as padding.
+
+struct SpaceToBatchAttributes {
+  HW block;           // Block size along height and width.
+  Padding2D padding;  // Padding at the start/end of each spatial axis.
+};
+
+struct BatchToSpaceAttributes {
+  HW block;     // Block size along height and width.
+  Crop2D crop;  // Crop at the start/end of each spatial axis.
+};
+
 enum class PoolingType {
   UNDEFINED = 0,
 
diff --git a/tensorflow/lite/delegates/gpu/common/shape.cc b/tensorflow/lite/delegates/gpu/common/shape.cc
index 3ffc651..df34076 100644
--- a/tensorflow/lite/delegates/gpu/common/shape.cc
+++ b/tensorflow/lite/delegates/gpu/common/shape.cc
@@ -111,5 +111,15 @@
                       absl::StrJoin(s.dimensions, ", "), "}}");
 }
 
+template <>
+int64_t StrongShape<Layout::OHWI>::LinearIndex(
+    const std::array<int32_t, 4>& coordinates) const {
+  int64_t flat = coordinates[0];  // Accumulate axis by axis, last one fastest.
+  for (int axis = 1; axis < 4; ++axis) {
+    flat = flat * StrongShape::get(axis) + coordinates[axis];
+  }
+  return flat;
+}
+
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/transformations/BUILD b/tensorflow/lite/delegates/gpu/common/transformations/BUILD
index bea9e20..8fa0368 100644
--- a/tensorflow/lite/delegates/gpu/common/transformations/BUILD
+++ b/tensorflow/lite/delegates/gpu/common/transformations/BUILD
@@ -77,6 +77,7 @@
         ":fuse_mul_to_conv",
         ":make_fully_connected",
         ":make_padding",
+        ":match_dilated_convolution",
         ":merge_padding_with",
         ":remove_noop",
         "//tensorflow/lite/delegates/gpu/common:model_transformer",
@@ -145,6 +146,34 @@
 )
 
 cc_library(
+    name = "match_dilated_convolution",
+    srcs = ["match_dilated_convolution.cc"],
+    hdrs = ["match_dilated_convolution.h"],
+    deps = [
+        "//tensorflow/lite/delegates/gpu/common:model",
+        "//tensorflow/lite/delegates/gpu/common:model_transformer",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:any",
+    ],
+)
+
+cc_test(
+    name = "match_dilated_convolution_test",
+    srcs = ["match_dilated_convolution_test.cc"],
+    deps = [
+        ":match_dilated_convolution",
+        "//tensorflow/lite/delegates/gpu/common:model",
+        "//tensorflow/lite/delegates/gpu/common:model_transformer",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:shape",
+        "@com_google_absl//absl/types:any",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
     name = "merge_padding_with",
     srcs = ["merge_padding_with.cc"],
     hdrs = ["merge_padding_with.h"],
diff --git a/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.cc b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.cc
new file mode 100644
index 0000000..5257ba4
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.cc
@@ -0,0 +1,97 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.h"
+
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/types/any.h"
+#include "tensorflow/lite/delegates/gpu/common/model.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+
+namespace tflite {
+namespace gpu {
+namespace {
+
+// Folds SPACE_TO_BATCH -> convolution -> BATCH_TO_SPACE into a dilated conv.
+class MatchDilatedConvolution : public SequenceTransformation {
+ public:
+  int ExpectedSequenceLength() const final { return 3; }
+
+  // TODO(eignasheva): use span instead of const reference b/131628066.
+  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
+                                       GraphFloat32* graph) final {
+    auto& sb_node = *sequence[0];
+    auto& conv_node = *sequence[1];
+    auto& bs_node = *sequence[2];
+    // Skip unless both ends match; the any_casts below depend on it.
+    if (sb_node.operation.type != ToString(OperationType::SPACE_TO_BATCH) ||
+        bs_node.operation.type != ToString(OperationType::BATCH_TO_SPACE)) {
+      return {TransformStatus::SKIPPED, ""};
+    }
+    if (conv_node.operation.type !=
+            ToString(OperationType::DEPTHWISE_CONVOLUTION) &&
+        conv_node.operation.type != ToString(OperationType::CONVOLUTION_2D)) {
+      return {TransformStatus::SKIPPED, ""};
+    }
+
+    auto sb_attr =
+        absl::any_cast<SpaceToBatchAttributes>(sb_node.operation.attributes);
+    auto bs_attr =
+        absl::any_cast<BatchToSpaceAttributes>(bs_node.operation.attributes);
+
+    if (sb_attr.block != bs_attr.block) {
+      return {TransformStatus::INVALID, "Invalid block size"};
+    }
+
+    if (conv_node.operation.type ==
+        ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
+      auto dw_attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
+          conv_node.operation.attributes);
+      dw_attr.padding = sb_attr.padding - bs_attr.crop;
+      dw_attr.dilations = sb_attr.block;
+      conv_node.operation.attributes = std::move(dw_attr);
+    } else {
+      auto conv2d_attr = absl::any_cast<Convolution2DAttributes>(
+          conv_node.operation.attributes);
+      conv2d_attr.padding = sb_attr.padding - bs_attr.crop;
+      conv2d_attr.dilations = sb_attr.block;
+      conv_node.operation.attributes = std::move(conv2d_attr);
+    }
+
+    Status status = RemoveFollowingNode(graph, &bs_node, &conv_node);
+    if (!status.ok()) {
+      return {TransformStatus::INVALID,
+              "Unable to remove batch_to_space node after convolution."};
+    }
+    status = RemovePrecedingNode(graph, &sb_node, &conv_node);
+    if (!status.ok()) {
+      return {TransformStatus::INVALID,
+              "Unable to remove space_to_batch node before convolution."};
+    }
+    return {TransformStatus::APPLIED, ""};
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<SequenceTransformation> NewMatchDilatedConvolution() {
+  return absl::make_unique<MatchDilatedConvolution>();  // Caller owns it.
+}
+
+}  // namespace gpu
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.h b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.h
new file mode 100644
index 0000000..38b87d8
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.h
@@ -0,0 +1,35 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCH_DILATED_CONVOLUTION_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCH_DILATED_CONVOLUTION_H_
+
+#include <memory>
+
+#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
+
+namespace tflite {
+namespace gpu {
+
+// The TF->TFLite converter lowers a dilated convolution into the chain
+// SpaceToBatch->Convolution->BatchToSpace. Our GPU backend natively supports
+// dilation in convolutions, so this transformation folds the chain back into a
+// single dilated convolution. For more information see b/131436214.
+std::unique_ptr<SequenceTransformation> NewMatchDilatedConvolution();
+
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCH_DILATED_CONVOLUTION_H_
diff --git a/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution_test.cc
new file mode 100644
index 0000000..74c385b
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/common/transformations/match_dilated_convolution.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/types/any.h"
+#include "tensorflow/lite/delegates/gpu/common/model.h"
+#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+
+namespace tflite {
+namespace gpu {
+namespace {
+
+TEST(MatchDilatedConvolutionTest, MakesDilatedConvolution) {
+  GraphFloat32 graph;
+  auto input = graph.NewValue();
+  input->tensor.shape = BHWC(1, 95, 1, 17);
+
+  SpaceToBatchAttributes sb_attr;
+  sb_attr.block = HW(128, 1);
+  sb_attr.padding.prepended = HW(128, 0);
+  sb_attr.padding.appended = HW(161, 0);
+
+  DepthwiseConvolution2DAttributes dw_attr;
+  dw_attr.padding.prepended = HW(0, 0);
+  dw_attr.padding.appended = HW(0, 0);
+  dw_attr.strides = HW(1, 1);
+  dw_attr.dilations = HW(1, 1);
+  dw_attr.weights.shape = OHWI(1, 3, 1, 17);
+  dw_attr.bias.shape = Linear(96);
+
+  BatchToSpaceAttributes bs_attr;
+  bs_attr.block = HW(128, 1);
+  bs_attr.crop.prepended = HW(0, 0);
+  bs_attr.crop.appended = HW(33, 0);
+
+  auto sb_node = graph.NewNode();
+  sb_node->operation.type = ToString(OperationType::SPACE_TO_BATCH);
+  sb_node->operation.attributes = sb_attr;
+  auto dw_node = graph.NewNode();
+  dw_node->operation.type = ToString(OperationType::DEPTHWISE_CONVOLUTION);
+  dw_node->operation.attributes = dw_attr;
+  auto bs_node = graph.NewNode();
+  bs_node->operation.type = ToString(OperationType::BATCH_TO_SPACE);
+  bs_node->operation.attributes = bs_attr;
+
+  ASSERT_TRUE(graph.AddConsumer(sb_node->id, input->id).ok());
+
+  Value<TensorRefFloat32>* output;
+  ASSERT_TRUE(AddOutput(&graph, bs_node, &output).ok());
+  output->tensor.shape = BHWC(1, 95, 1, 17);
+
+  Value<TensorRefFloat32>* sb_link;
+  ASSERT_TRUE(ConnectTwoNodes(&graph, sb_node, dw_node, &sb_link).ok());
+  sb_link->tensor.shape = BHWC(21, 128, 1, 17);
+
+  Value<TensorRefFloat32>* bs_link;
+  ASSERT_TRUE(ConnectTwoNodes(&graph, dw_node, bs_node, &bs_link).ok());
+  bs_link->tensor.shape = BHWC(1, 95, 1, 17);
+
+  ASSERT_EQ(graph.nodes().size(), 3);  // sb -> dw -> bs
+  ASSERT_EQ(graph.values().size(), 4);  // input, sb_link, bs_link, output
+
+  auto transformation = NewMatchDilatedConvolution();
+  ModelTransformer transformer(&graph, nullptr);
+  transformer.Apply("match_dilated_convolution", transformation.get());
+
+  ASSERT_EQ(graph.nodes().size(), 1);  // Only the convolution is left.
+  ASSERT_EQ(graph.values().size(), 2);
+  ASSERT_EQ(graph.nodes()[0]->operation.type,
+            ToString(OperationType::DEPTHWISE_CONVOLUTION));
+
+  auto updated_dw_attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
+      graph.nodes()[0]->operation.attributes);
+  EXPECT_EQ(updated_dw_attr.padding.prepended, HW(128, 0));  // 128 - 0
+  EXPECT_EQ(updated_dw_attr.padding.appended, HW(128, 0));  // 161 - 33
+  EXPECT_EQ(updated_dw_attr.dilations, HW(128, 1));  // == block
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc
index 2f95947..696d525 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc
@@ -45,9 +45,6 @@
  public:
   Status GenerateCode(const GenerationContext& ctx,
                       GeneratedCode* generated_code) const final {
-    auto inputs = ctx.graph->FindInputs(ctx.node->id);
-    auto outputs = ctx.graph->FindOutputs(ctx.node->id);
-
     std::string code = R"(
       vec4 prev_state  = $input_data_1[gid.x, gid.y, gid.z]$;
 
@@ -78,7 +75,7 @@
         /*objects=*/{},
         /*workload=*/uint3(),
         /*workgroup=*/uint3(),
-        /*source_code=*/code,
+        /*source_code=*/std::move(code),
         /*input=*/IOStructure::ONLY_DEFINITIONS,
         /*output=*/IOStructure::AUTO,
     };
diff --git a/tensorflow/lite/delegates/gpu/gl_delegate.cc b/tensorflow/lite/delegates/gpu/gl_delegate.cc
index d2accc9..5538831 100644
--- a/tensorflow/lite/delegates/gpu/gl_delegate.cc
+++ b/tensorflow/lite/delegates/gpu/gl_delegate.cc
@@ -145,6 +145,12 @@
 
     // TODO(impjdi): Remove code duplication.
     auto values = graph.values();
+    auto find_value = [&](int tensor_index) -> Value<TensorRefFloat32>* {
+      for (auto value : values) {
+        if (value->tensor.ref == tensor_index) return value;
+      }
+      return nullptr;
+    };
     tensors_.reserve(values.back()->id + 1);
     for (auto value : values) {
       if (tensors_.size() <= value->id) {
@@ -154,15 +160,25 @@
     }
 
     // Prepare graph inputs.
+    //
+    // Note that graph.inputs() cannot be used directly, as the notion of
+    // graph input has a different meaning in public API and GPU-internal API.
     {
       inputs_.reserve(delegate_params->input_tensors->size);
-      for (auto input : graph.inputs()) {
-        auto tensor_index = input->tensor.ref;
-        auto& tensor = context->tensors[tensor_index];
+      for (int i = 0; i < delegate_params->input_tensors->size; ++i) {
+        const int tensor_index = delegate_params->input_tensors->data[i];
+        auto* tensor = context->tensors + tensor_index;
+        if (tensor->allocation_type == TfLiteAllocationType::kTfLiteMmapRo) {
+          continue;
+        }
+        const auto* input = find_value(tensor_index);
+        if (!input || tensor->type != TfLiteType::kTfLiteFloat32) {
+          return NotFoundError("Input tensor is not found in the graph.");
+        }
 
         inputs_.push_back(input->id);
-        tensor.buffer_handle = input->id;
-        tensor.delegate = &delegate_;
+        tensor->buffer_handle = input->id;
+        tensor->delegate = &delegate_;
         tensors_[input->id].tensor_index = tensor_index;
 
         // Create phwc4 input buffer.
@@ -184,15 +200,22 @@
     }
 
     // Prepare graph outputs.
+    //
+    // Note that graph.outputs() cannot be used directly, as the notion of
+    // graph output has a different meaning in public API and GPU-internal API.
     {
       outputs_.reserve(delegate_params->output_tensors->size);
-      for (auto output : graph.outputs()) {
-        auto tensor_index = output->tensor.ref;
-        auto& tensor = context->tensors[tensor_index];
+      for (int i = 0; i < delegate_params->output_tensors->size; ++i) {
+        const int tensor_index = delegate_params->output_tensors->data[i];
+        auto* tensor = context->tensors + tensor_index;
+        const auto* output = find_value(tensor_index);
+        if (!output || tensor->type != TfLiteType::kTfLiteFloat32) {
+          return NotFoundError("Output tensor is not found in the graph.");
+        }
 
         outputs_.push_back(output->id);
-        tensor.buffer_handle = output->id;
-        tensor.delegate = &delegate_;
+        tensor->buffer_handle = output->id;
+        tensor->delegate = &delegate_;
         tensors_[output->id].tensor_index = tensor_index;
 
         // Create phwc4 output buffer.
diff --git a/tensorflow/lite/delegates/gpu/gl_delegate.h b/tensorflow/lite/delegates/gpu/gl_delegate.h
index 66fd0c1..aa78e1b 100644
--- a/tensorflow/lite/delegates/gpu/gl_delegate.h
+++ b/tensorflow/lite/delegates/gpu/gl_delegate.h
@@ -39,6 +39,7 @@
 extern "C" {
 #endif  // __cplusplus
 
+// LINT.IfChange
 enum TfLiteGlObjectType {
   TFLITE_GL_OBJECT_TYPE_FASTEST = 0,
   TFLITE_GL_OBJECT_TYPE_TEXTURE = 1,
@@ -74,6 +75,7 @@
   const uint8_t* metadata;  // Internal.
   TfLiteGlCompileOptions compile_options;
 };
+// LINT.ThenChange(//tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java)
 
 // Creates a new delegate instance that need to be destroyed with
 // TfLiteGpuDelegateDelete when delegate is no longer used by TFLite.
@@ -88,7 +90,7 @@
     const TfLiteGpuDelegateOptions* options);
 
 // Destroys a delegate created with `TfLiteGpuDelegateCreate` call.
-void TfLiteGpuDelegateDelete(TfLiteDelegate* delegate);
+TFL_CAPI_EXPORT void TfLiteGpuDelegateDelete(TfLiteDelegate* delegate);
 
 // Binds GL shader storage object to an input or an output tensor in the
 // initialized delegate.  Bound buffer should have sufficient storage to
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental/GpuDelegate.java
deleted file mode 100644
index 8fafb9e..0000000
--- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental/GpuDelegate.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.lite.experimental;
-
-import java.io.Closeable;
-import org.tensorflow.lite.Delegate;
-import org.tensorflow.lite.Tensor;
-
-/** {@link Delegate} for GPU inference. */
-public class GpuDelegate implements Delegate, Closeable {
-
-  private static final long INVALID_DELEGATE_HANDLE = 0;
-  private static final String TFLITE_GPU_LIB = "tensorflowlite_gpu_jni";
-
-  private long delegateHandle;
-
-  public GpuDelegate() {
-    delegateHandle = createDelegate();
-  }
-
-  /**
-   * Advanced: Binds a GL SSBO to an input or an output tensor in the initialized delegate.
-   *
-   * <p>The bound buffer should have sufficient storage to accommodate all elements of the tensor.
-   *
-   * <p><b>Note:</b> This method must be called *before* calling the delegate instance is installed
-   * in the {@link Interpreter}.
-   *
-   * <p>WARNING: This is an experimental API and subject to change.
-   *
-   * @param tensor The input or output {@link Tensor} to bind to the buffer object.
-   * @param ssbo The GL buffer object to bind to the tensor. See also {@link
-   *     Interpreter.Options#setAllowBufferHandleOutput()} for details on allowing zero-copy output
-   *     when GL textures are bound to output tensors.
-   * @return Whether the operation succeeded.
-   */
-  public boolean bindGlBufferToTensor(Tensor tensor, int ssbo) {
-    return bindGlBufferToTensor(delegateHandle, tensor.index(), ssbo);
-  }
-
-  @Override
-  public long getNativeHandle() {
-    return delegateHandle;
-  }
-
-  /**
-   * Frees TFLite resources in C runtime.
-   *
-   * <p>User is expected to call this method explicitly.
-   */
-  @Override
-  public void close() {
-    if (delegateHandle != INVALID_DELEGATE_HANDLE) {
-      deleteDelegate(delegateHandle);
-      delegateHandle = INVALID_DELEGATE_HANDLE;
-    }
-  }
-
-  static {
-    System.loadLibrary(TFLITE_GPU_LIB);
-  }
-
-  private static native long createDelegate();
-
-  private static native void deleteDelegate(long delegateHandle);
-
-  private static native boolean bindGlBufferToTensor(
-      long delegateHandle, int tensorIndex, int ssbo);
-}
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental/BUILD b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/BUILD
similarity index 100%
rename from tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental/BUILD
rename to tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/BUILD
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
new file mode 100644
index 0000000..b19dc34
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.gpu;
+
+import java.io.Closeable;
+import org.tensorflow.lite.Delegate;
+import org.tensorflow.lite.Tensor;
+
+/** {@link Delegate} for GPU inference. */
+public class GpuDelegate implements Delegate, Closeable {
+
+  private static final long INVALID_DELEGATE_HANDLE = 0;
+  private static final String TFLITE_GPU_LIB = "tensorflowlite_gpu_jni";
+
+  private long delegateHandle;
+
+  /** Shader compilation options. */
+  public static final class CompileOptions {
+    public CompileOptions() {}
+
+    /** Delegate chooses fastest GL object type to represent tensors (default). */
+    public static final int GL_OBJECT_TYPE_FASTEST = 0;
+    /**
+     * Delegate uses GL textures to represent tensors, which works faster on Adreno-based devices,
+     * but may use more memory.
+     */
+    public static final int GL_OBJECT_TYPE_TEXTURE = 1;
+    /** Delegate uses GL shader storage buffer objects to represent tensors. */
+    public static final int GL_OBJECT_TYPE_BUFFER = 2;
+
+    /**
+     * Sets whether precision loss is allowed.
+     *
+     * @param precisionLossAllowed When `true` (default), the GPU may quantize tensors, downcast
+     *     values, process in FP16. When `false`, computations are carried out in 32-bit floating
+     *     point.
+     */
+    public CompileOptions setPrecisionLossAllowed(boolean precisionLossAllowed) {
+      this.precisionLossAllowed = precisionLossAllowed;
+      return this;
+    }
+
+    /**
+     * Sets whether dynamic batch is enabled.
+     *
+     * @param dynamicBatchEnabled When `false` (default), dynamic batching is disabled and
+     *     input/output tensors must have a batch size of 1 (probably what you want, unless you use
+     *     LSTMs). When `true`, enables dynamic batching and input/output tensor can have a batch
+     *     size greater than 1.
+     */
+    public CompileOptions setDynamicBatchEnabled(boolean dynamicBatchEnabled) {
+      this.dynamicBatchEnabled = dynamicBatchEnabled;
+      return this;
+    }
+
+    /**
+     * Sets the preferred GL object type for tensor representation.
+     *
+     * @param preferredGlObjectType One of `GL_OBJECT_TYPE_FASTEST` (default),
+     *     `GL_OBJECT_TYPE_TEXTURE`, `GL_OBJECT_TYPE_BUFFER`.
+     */
+    public CompileOptions setPreferredGlObjectType(int preferredGlObjectType) {
+      this.preferredGlObjectType = preferredGlObjectType;
+      return this;
+    }
+
+    boolean precisionLossAllowed = true;
+    boolean dynamicBatchEnabled = false;
+    int preferredGlObjectType = GL_OBJECT_TYPE_FASTEST;
+  }
+
+  /** Delegate options. */
+  public static final class Options {
+    public Options() {}
+
+    private static final CompileOptions DEFAULT_COMPILE_OPTIONS = new CompileOptions();
+
+    /**
+     * Sets the shader compilation options to be used by the delegate.
+     *
+     * @param compileOptions the {@link CompileOptions} to use.
+     */
+    public Options setCompileOptions(CompileOptions compileOptions) {
+      this.compileOptions = compileOptions != null ? compileOptions : DEFAULT_COMPILE_OPTIONS;
+      return this;
+    }
+
+    CompileOptions compileOptions = DEFAULT_COMPILE_OPTIONS;
+  }
+
+  public GpuDelegate(Options options) {
+    delegateHandle =
+        createDelegate(
+            options.compileOptions.precisionLossAllowed,
+            options.compileOptions.dynamicBatchEnabled,
+            options.compileOptions.preferredGlObjectType);
+  }
+
+  public GpuDelegate() {
+    this(new Options());
+  }
+
+  /**
+   * Advanced: Binds a GL SSBO to an input or an output tensor in the initialized delegate.
+   *
+   * <p>The bound buffer should have sufficient storage to accommodate all elements of the tensor.
+   *
+   * <p><b>Note:</b> This method must be called *before* the delegate instance is installed
+   * in the {@link Interpreter}.
+   *
+   * <p>WARNING: This is an experimental API and subject to change.
+   *
+   * @param tensor The input or output {@link Tensor} to bind to the buffer object.
+   * @param ssbo The GL buffer object to bind to the tensor. See also {@link
+   *     Interpreter.Options#setAllowBufferHandleOutput()} for details on allowing zero-copy output
+   *     when GL textures are bound to output tensors.
+   * @return Whether the operation succeeded.
+   */
+  public boolean bindGlBufferToTensor(Tensor tensor, int ssbo) {
+    return bindGlBufferToTensor(delegateHandle, tensor.index(), ssbo);
+  }
+
+  @Override
+  public long getNativeHandle() {
+    return delegateHandle;
+  }
+
+  /**
+   * Frees TFLite resources in C runtime.
+   *
+   * <p>User is expected to call this method explicitly.
+   */
+  @Override
+  public void close() {
+    if (delegateHandle != INVALID_DELEGATE_HANDLE) {
+      deleteDelegate(delegateHandle);
+      delegateHandle = INVALID_DELEGATE_HANDLE;
+    }
+  }
+
+  static {
+    System.loadLibrary(TFLITE_GPU_LIB);
+  }
+
+  private static native long createDelegate(
+      boolean precisionLossAllowed, boolean dynamicBatchEnabled, int preferredGlObjectType);
+
+  private static native void deleteDelegate(long delegateHandle);
+
+  private static native boolean bindGlBufferToTensor(
+      long delegateHandle, int tensorIndex, int ssbo);
+}
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc
index 9fde661..e0308c2 100644
--- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc
+++ b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc
@@ -17,27 +17,27 @@
 
 #include "tensorflow/lite/delegates/gpu/gl_delegate.h"
 
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_createDelegate(JNIEnv* env,
-                                                                 jclass clazz) {
-  // Auto-choosing the best performing config for closed release.
+JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate(
+    JNIEnv* env, jclass clazz, jboolean precision_loss_allowed,
+    jboolean dynamic_batch_enabled, jint preferred_gl_object_type) {
   TfLiteGpuDelegateOptions options;
   options.metadata = nullptr;
-  options.compile_options.precision_loss_allowed = 1;
+  options.compile_options.precision_loss_allowed =
+      precision_loss_allowed == JNI_TRUE ? 1 : 0;
   options.compile_options.preferred_gl_object_type =
-      TFLITE_GL_OBJECT_TYPE_FASTEST;
-  options.compile_options.dynamic_batch_enabled = 0;
+      static_cast<int32_t>(preferred_gl_object_type);
+  options.compile_options.dynamic_batch_enabled =
+      dynamic_batch_enabled == JNI_TRUE ? 1 : 0;
   return reinterpret_cast<jlong>(TfLiteGpuDelegateCreate(&options));
 }
 
-JNIEXPORT void JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_deleteDelegate(
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_deleteDelegate(
     JNIEnv* env, jclass clazz, jlong delegate) {
   TfLiteGpuDelegateDelete(reinterpret_cast<TfLiteDelegate*>(delegate));
 }
 
 JNIEXPORT jboolean JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_bindGlBufferToTensor(
+Java_org_tensorflow_lite_gpu_GpuDelegate_bindGlBufferToTensor(
     JNIEnv* env, jclass clazz, jlong delegate, jint tensor_index, jint ssbo) {
   return TfLiteGpuDelegateBindBufferToTensor(
              reinterpret_cast<TfLiteDelegate*>(delegate),
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h
index 28a9cdf..b36fd91 100644
--- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h
+++ b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h
@@ -23,30 +23,29 @@
 #endif  // __cplusplus
 
 /*
- * Class:     org_tensorflow_lite_experimental_GpuDelegate
+ * Class:     org_tensorflow_lite_gpu_GpuDelegate
  * Method:    createDelegate
- * Signature: ()J
+ * Signature: (ZZI)J
  */
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_createDelegate(JNIEnv* env,
-                                                                 jclass clazz);
+JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate(
+    JNIEnv* env, jclass clazz, jboolean precision_loss_allowed,
+    jboolean dynamic_batch_enabled, jint preferred_gl_object_type);
 
 /*
- * Class:     org_tensorflow_lite_experimental_GpuDelegate
+ * Class:     org_tensorflow_lite_gpu_GpuDelegate
  * Method:    deleteDelegate
  * Signature: (J)
  */
-JNIEXPORT void JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_deleteDelegate(
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_deleteDelegate(
     JNIEnv* env, jclass clazz, jlong delegate);
 
 /*
- * Class:     org_tensorflow_lite_experimental_GpuDelegate
+ * Class:     org_tensorflow_lite_gpu_GpuDelegate
  * Method:    bindGlBufferToTensor
  * Signature: (JII)Z
  */
 JNIEXPORT jboolean JNICALL
-Java_org_tensorflow_lite_experimental_GpuDelegate_bindGlBufferToTensor(
+Java_org_tensorflow_lite_gpu_GpuDelegate_bindGlBufferToTensor(
     JNIEnv* env, jclass clazz, jlong delegate, jint tensor_index, jint ssbo);
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/delegates/gpu/metal/api.cc b/tensorflow/lite/delegates/gpu/metal/api.cc
index 7bd7d40..3588cd9 100644
--- a/tensorflow/lite/delegates/gpu/metal/api.cc
+++ b/tensorflow/lite/delegates/gpu/metal/api.cc
@@ -251,10 +251,12 @@
 
       case OperationType::APPLY_MASK:
       case OperationType::BATCH_NORMALIZATION:
+      case OperationType::BATCH_TO_SPACE:
       case OperationType::CONST:
       case OperationType::LSTM:
       case OperationType::MUL:
       case OperationType::RESIZE:
+      case OperationType::SPACE_TO_BATCH:
       case OperationType::UNKNOWN:
         return UnimplementedError("Unsupported op: " + node->operation.type);
     }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
index f4a67dc..19be9d4 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
@@ -701,7 +701,7 @@
 )";
   for (int i = 0; i < z_out; ++i) {
     const std::string s_i = std::to_string(i);
-    code += "  ACCUM_FLT4 r" + s_i + " = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);\n";
+    code += "  float4 r" + s_i + " = float4(0.0f, 0.0f, 0.0f, 0.0f);\n";
   }
   code += R"(
   device FLT4* tmp = filters + gid_z * 4 * params.src_size.w;
@@ -728,7 +728,7 @@
   )";
   for (int i = 0; i < z_out; ++i) {
     const std::string s_i = std::to_string(i);
-    code += "  r" + s_i + " += TO_ACCUM4_TYPE(bias_loc[" + s_i + "]);\n";
+    code += "  r" + s_i + " += float4(bias_loc[" + s_i + "]);\n";
   }
   for (int i = 0; i < z_out; ++i) {
     const std::string s_i = std::to_string(i);
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
index cacbb3c..15b4654 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
@@ -468,6 +468,7 @@
     int id, ValueId input_id, ValueId output_id,
     const DepthwiseConvolution2DAttributes& attr,
     const RuntimeOptions& options) {
+  int channels_multiplier = attr.weights.shape.o;
   auto desc = std::make_shared<ComputeTaskDescriptor>();
   desc->id = id;
   desc->is_linkable = false;
@@ -503,10 +504,44 @@
           const bool outside = coords.x < 0 || coords.y < 0 ||
             coords.x >= params.size.x || coords.y >= params.size.y;
           if (outside) continue;
-
-          const int src_layer = gid.z;
-          const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x;
-          sum0 += float4(src_buffer[src_index]) * float4(temp[ky * kernel_x + kx]);
+)";
+  if (channels_multiplier == 1) {
+    shader_source += R"(
+        const int src_layer = gid.z;
+        const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x;
+        const FLT4 src_modified = src_buffer[src_index];
+)";
+  } else if (channels_multiplier == 2) {
+    shader_source += R"(
+        const int src_layer = gid.z / 2;
+        const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x;
+        const FLT4 src = src_buffer[src_index];
+        const FLT2 t0 = gid.z % 2 == 0 ? src.xy : src.zw;
+        const FLT4 src_modified = FLT4(t0.x, t0.x, t0.y, t0.y);
+)";
+  } else if (channels_multiplier == 4) {
+    shader_source += R"(
+        const int src_layer = gid.z / 4;
+        const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x;
+        const FLT4 src = src_buffer[src_index];
+        const FLT t0 = src[gid.z % 4];
+        const FLT4 src_modified = FLT4(t0, t0, t0, t0);
+)";
+  } else {
+    shader_source += R"(
+        const int src_layer = gid.z / params.channel_multiplier.x;
+        const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x;
+        const FLT4 src = src_buffer[src_index];
+        FLT4 src_modified;
+        const int src_layer_offset = (gid.z % params.channel_multiplier.x) * 4;
+        src_modified.x = src[(src_layer_offset + 0) / params.channel_multiplier.x];
+        src_modified.y = src[(src_layer_offset + 1) / params.channel_multiplier.x];
+        src_modified.z = src[(src_layer_offset + 2) / params.channel_multiplier.x];
+        src_modified.w = src[(src_layer_offset + 3) / params.channel_multiplier.x];
+)";
+  }
+  shader_source += R"(
+          sum0 += float4(src_modified * temp[ky * kernel_x + kx]);
         }
       }
       FLT4 res = FLT4(sum0 + float4(biases[gid.z]));
@@ -531,19 +566,7 @@
         return out_shape;
       }};
 
-  const int num_output_channels = attr.weights.shape.i * attr.weights.shape.o;
-  BHWC reordered_dims{1, attr.weights.shape.h, attr.weights.shape.w,
-                      num_output_channels};
-  std::vector<float> filters_reordered(GetElementsSizeForPHWC4(reordered_dims),
-                                       0.0f);
-  if (!ConvertToPHWC4(
-           absl::MakeConstSpan(attr.weights.data.data(),
-                               attr.weights.data.size()),
-           reordered_dims,
-           absl::MakeSpan(filters_reordered.data(), filters_reordered.size()))
-           .ok()) {
-    return {};
-  }
+  std::vector<float> filters_reordered = ConvertToPIOHW4(attr.weights);
   auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
                      ? VectorToUint8Vector(filters_reordered)
                      : VectorFloatToHalf(filters_reordered);
diff --git a/tensorflow/lite/delegates/gpu/metal_delegate.h b/tensorflow/lite/delegates/gpu/metal_delegate.h
index 0fd59d5..d38e73a 100644
--- a/tensorflow/lite/delegates/gpu/metal_delegate.h
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.h
@@ -18,7 +18,7 @@
 
 #import <Metal/Metal.h>
 
-#include <stdint.h>
+#include <functional>
 
 #include "tensorflow/lite/c/c_api_internal.h"
 
@@ -35,8 +35,10 @@
     // additional CPU resources.
     kActive,
     // Useful when the output is used with GPU pipeline then or if external
-    // command encoder is set
+    // command encoder is set.
     kDoNotWait,
+    // Tries to avoid GPU sleep mode.
+    kAggressive,
   };
   WaitType wait_type;
 };
@@ -61,7 +63,10 @@
 
 // Binds user-defined MTLComputeCommandEncoder. The delegate puts all GPU tasks
 // into this encoder instead of the internal encoder.
-bool SetCommandEncoder(TfLiteDelegate* delegate,
-                       id<MTLComputeCommandEncoder> encoder);
+// The callback is a user-defined function to take control over encoder and
+// command buffer. Can be nullptr.
+bool TFLSetCommandEncoder(
+    TfLiteDelegate* delegate, id<MTLComputeCommandEncoder> encoder,
+    std::function<id<MTLComputeCommandEncoder>(bool is_last)> control_encoder);
 
 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_DELEGATE_H_
diff --git a/tensorflow/lite/delegates/gpu/metal_delegate.mm b/tensorflow/lite/delegates/gpu/metal_delegate.mm
index b71881b..d62d10a 100644
--- a/tensorflow/lite/delegates/gpu/metal_delegate.mm
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.mm
@@ -20,7 +20,10 @@
 #include <algorithm>
 #include <cstring>
 #include <map>
+#include <memory>
+#include <mutex>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "absl/types/span.h"
@@ -36,6 +39,7 @@
 #include "tensorflow/lite/delegates/gpu/common/types.h"
 #include "tensorflow/lite/delegates/gpu/metal/api.h"
 #include "tensorflow/lite/delegates/gpu/metal/buffer_convert.h"
+#include "tensorflow/lite/delegates/gpu/metal/common.h"
 #include "tensorflow/lite/delegates/gpu/metal/compiled_model.h"
 #include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
 #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
@@ -46,6 +50,111 @@
 namespace metal {
 namespace {
 
+// Multi-thread safe alarm clock for preventing GPU sleeping. It spawns lightweight compute tasks
+// until no inference is being performed on a device. It reduces the CPU-to-CPU inference latency.
+// The class is used only for kAggressive wait type.
+class GpuAlarmClock {
+ public:
+  explicit GpuAlarmClock(id<MTLCommandQueue> command_queue) {
+    auto device = [command_queue device];
+    std::lock_guard<std::mutex> lock(alarms_mutex_);
+    if (!alarms_) alarms_ = new std::map<id<MTLDevice>, GpuAlarmClockInternal*>();
+    auto it = alarms_->find(device);
+    if (it == alarms_->end()) {
+      internal_ = new GpuAlarmClockInternal(command_queue);
+      (*alarms_)[device] = internal_;
+    } else {
+      internal_ = it->second;
+      internal_->total_alarms_++;
+    }
+  }
+  ~GpuAlarmClock() {
+    std::lock_guard<std::mutex> lock(alarms_mutex_);
+    if (--internal_->total_alarms_ > 0) return;
+    Stop();
+    delete internal_;
+    // Remove the alarm from the container to free-up device handle.
+    for (auto it = alarms_->begin(); it != alarms_->end(); ++it) {
+      if (it->second == internal_) {
+        alarms_->erase(it);
+        break;
+      }
+    }
+    if (alarms_->empty()) {
+      delete alarms_;
+      alarms_ = nullptr;
+    }
+  }
+  void Start() {
+    if (started_) return;
+    started_ = true;
+    internal_->active_alarms_++;
+  }
+  void Stop() {
+    if (!started_) return;
+    started_ = false;
+    internal_->active_alarms_--;
+  }
+
+ private:
+  class GpuAlarmClockInternal {
+   public:
+    id<MTLComputePipelineState> stub_program_;
+    id<MTLBuffer> stub_buffer_;
+    explicit GpuAlarmClockInternal(id<MTLCommandQueue> command_queue) {
+      command_queue_ = command_queue;
+      device_ = [command_queue_ device];
+      total_alarms_ = 1;
+      NSString* error;
+      id<MTLComputePipelineState> program;
+      CreateComputeProgram(device_,
+                           @"kernel void ComputeFunction(device int* output_buffer [[buffer(0)]]) "
+                           @"{ output_buffer[0] = 0; }",
+                           @"ComputeFunction", nullptr, &program);
+      stub_program_ = program;
+      stub_buffer_ = [device_ newBufferWithLength:sizeof(int) * 4
+                                          options:MTLResourceHazardTrackingModeUntracked];
+      alarm_thread_ = std::thread([this]() {
+        id<MTLCommandBuffer> prev_command_buffer;
+        while (!release_thread_) {
+          if (active_alarms_ == total_alarms_) {
+            id<MTLCommandBuffer> command_buffer = [command_queue_ commandBuffer];
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+            [encoder setComputePipelineState:stub_program_];
+            [encoder setBuffer:stub_buffer_ offset:0 atIndex:0];
+            [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1)
+                    threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            [encoder endEncoding];
+            [command_buffer commit];
+            if (prev_command_buffer != nil) [prev_command_buffer waitUntilScheduled];
+            prev_command_buffer = command_buffer;
+          } else {
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+          }
+        }
+      });
+    }
+    ~GpuAlarmClockInternal() {
+      release_thread_ = true;
+      alarm_thread_.join();
+    }
+
+   private:
+    friend class GpuAlarmClock;
+    std::atomic<int> active_alarms_;
+    std::thread alarm_thread_;
+    id<MTLCommandQueue> command_queue_;
+    id<MTLDevice> device_;
+    volatile bool release_thread_ = false;
+    int total_alarms_ = 0;
+  };
+  static std::map<id<MTLDevice>, GpuAlarmClockInternal*>* alarms_;
+  std::mutex alarms_mutex_;
+  GpuAlarmClockInternal* internal_;
+  bool started_ = false;
+};
+std::map<id<MTLDevice>, GpuAlarmClock::GpuAlarmClockInternal*>* GpuAlarmClock::alarms_ = nullptr;
+
 // Forward declaration.
 TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
 
@@ -65,6 +174,23 @@
       options_.wait_type = GpuDelegateOptions::WaitType::kPassive;
     }
     metal_device_ = MTLCreateSystemDefaultDevice();
+    command_queue_ = [metal_device_ newCommandQueue];
+    if (options_.wait_type == GpuDelegateOptions::WaitType::kAggressive) {
+      gpu_alarm_clock_ = std::unique_ptr<GpuAlarmClock>(new GpuAlarmClock(command_queue_));
+      NSString* code = @R"(
+          kernel void ComputeFunction(device int* output_buffer [[buffer(0)]],
+                                      constant int& value [[buffer(1)]]) {
+            output_buffer[0] = value;
+          }
+        )";
+      NSString* error;
+      id<MTLComputePipelineState> signal_program;
+      CreateComputeProgram(metal_device_, code, @"ComputeFunction", nullptr, &signal_program);
+      signal_program_ = signal_program;
+      signal_buffer_ = [metal_device_ newBufferWithLength:sizeof(int) * 4
+                                                  options:MTLResourceStorageModeShared |
+                                                          MTLResourceHazardTrackingModeUntracked];
+    }
   }
 
   Status BindBufferToTensor(id<MTLBuffer> buffer, int tensor_index) {
@@ -87,7 +213,10 @@
     return NotFoundError("Couldn't find tensor: " + std::to_string(tensor_index));
   }
 
-  void SetCommandEncoder(id<MTLComputeCommandEncoder> encoder) {
+  void SetCommandEncoder(
+      id<MTLComputeCommandEncoder> encoder,
+      std::function<id<MTLComputeCommandEncoder>(bool is_last)> control_encoder) {
+    control_encoder_ = control_encoder;
     external_command_encoder_ = encoder;
   }
 
@@ -105,6 +234,12 @@
 
     // TODO(impjdi): Remove code duplication.
     auto values = graph.values();
+    auto find_value = [&](int tensor_index) -> Value<TensorRefFloat32>* {
+      for (auto value : values) {
+        if (value->tensor.ref == tensor_index) return value;
+      }
+      return nullptr;
+    };
     tensors_.reserve(values.back()->id + 1);
     for (const auto* value : values) {
       if (tensors_.size() <= value->id) tensors_.resize(value->id + 1);
@@ -115,21 +250,38 @@
     }
 
     // Prepare graph inputs.
+    //
+    // Note that graph.inputs() cannot be used directly, as the notion of graph input has a
+    // different meaning in public API and GPU-internal API.
     inputs_.reserve(delegate_params->input_tensors->size);
-    for (auto input : graph.inputs()) {
-      inputs_.push_back(input->id);
+    for (int i = 0; i < delegate_params->input_tensors->size; ++i) {
+      const int tensor_index = delegate_params->input_tensors->data[i];
+      auto* tensor = context->tensors + tensor_index;
+      if (tensor->allocation_type == TfLiteAllocationType::kTfLiteMmapRo) continue;
+      const auto* input = find_value(tensor_index);
+      if (!input || tensor->type != TfLiteType::kTfLiteFloat32) {
+        return NotFoundError("Input tensor is not found in the graph.");
+      }
 
-      auto tensor = &context->tensors[input->tensor.ref];
+      inputs_.push_back(input->id);
       tensor->buffer_handle = input->id;
       tensor->delegate = &delegate_;
     }
 
     // Prepare graph outputs.
+    //
+    // Note that graph.outputs() cannot be used directly, as the notion of graph output has a
+    // different meaning in public API and GPU-internal API.
     outputs_.reserve(delegate_params->output_tensors->size);
-    for (auto output : graph.outputs()) {
-      outputs_.push_back(output->id);
+    for (int i = 0; i < delegate_params->output_tensors->size; ++i) {
+      const int tensor_index = delegate_params->output_tensors->data[i];
+      auto* tensor = context->tensors + tensor_index;
+      const auto* output = find_value(tensor_index);
+      if (!output || tensor->type != TfLiteType::kTfLiteFloat32) {
+        return NotFoundError("Output tensor is not found in the graph.");
+      }
 
-      auto tensor = &context->tensors[output->tensor.ref];
+      outputs_.push_back(output->id);
       tensor->buffer_handle = output->id;
       tensor->delegate = &delegate_;
     }
@@ -145,7 +297,6 @@
       runtime_options.storage_precision = RuntimeOptions::Precision::FP32;
       runtime_options.accumulator_precision = RuntimeOptions::Precision::FP32;
     }
-    command_queue_ = [metal_device_ newCommandQueue];
 
     // TODO(impjdi): Merge logic with above.
     // Pre-allocate input and output metal buffers
@@ -244,12 +395,13 @@
   }
 
   Status Invoke(TfLiteContext* context) {
+    if (options_.wait_type == GpuDelegateOptions::WaitType::kAggressive) gpu_alarm_clock_->Stop();
     // We need only synchronization so volatile works better than atomic which reads from global
     // memory each time.
     __block volatile bool buffer_completed = false;
     __block id<MTLCommandBuffer> command_buffer;
     __block id<MTLComputeCommandEncoder> encoder = external_command_encoder_;
-    if (encoder == nil) {
+    if (external_command_encoder_ == nil) {
       command_buffer = [command_queue_ commandBuffer];
       encoder = [command_buffer computeCommandEncoder];
     }
@@ -267,11 +419,20 @@
                                          shape:input.shape
                                   sourceBuffer:input_output_buffers_[input.id]
                                convertedBuffer:bphwc4_buffers_[input.id]];
+      if (external_command_encoder_ == nil) {
+        [encoder endEncoding];
+        [command_buffer commit];
+        command_buffer = [command_queue_ commandBuffer];
+        encoder = [command_buffer computeCommandEncoder];
+      }
     }
 
     [inference_context_ encodeWithEncoder:encoder
                        inputOutputBuffers:bphwc4_buffers_
                              encoderBlock:^(bool isLast) {
+                               if (control_encoder_ != nullptr) {
+                                 return control_encoder_(isLast);
+                               }
                                if (external_command_encoder_ != nil ||
                                    options_.wait_type == GpuDelegateOptions::WaitType::kPassive) {
                                  return encoder;
@@ -304,11 +465,31 @@
       [command_buffer commit];
       if (options_.wait_type == GpuDelegateOptions::WaitType::kActive) {
         while (!buffer_completed) {
-          // Busy wait.
+          // Busy wait. Spin on a local counter so the shared volatile flag is polled less often.
+          for (volatile int i = 0; i < 100; i++) {
+          }
         }
       } else if (options_.wait_type == GpuDelegateOptions::WaitType::kPassive) {
         // passive wait: this thread sleeps until GPU finishes.
         [command_buffer waitUntilCompleted];
+      } else if (options_.wait_type == GpuDelegateOptions::WaitType::kAggressive) {
+        command_buffer = [command_queue_ commandBuffer];
+        encoder = [command_buffer computeCommandEncoder];
+        [encoder setComputePipelineState:signal_program_];
+        [encoder setBuffer:signal_buffer_ offset:0 atIndex:0];
+        signal_value_++;
+        [encoder setBytes:&signal_value_ length:sizeof(int) atIndex:1];
+        [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1)
+                threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+        [encoder endEncoding];
+        [command_buffer commit];
+        gpu_alarm_clock_->Start();
+        auto* signal_ptr = reinterpret_cast<const volatile int*>([signal_buffer_ contents]);
+        while (signal_ptr[0] != signal_value_) {
+          // Busy wait. The buffer is updated outside this thread, hence the volatile read.
+          for (volatile int i = 0; i < 100; i++) {
+          }
+        }
       }
     } else {
       // External command encoder must be set before every invoke call.
@@ -371,14 +552,19 @@
   std::vector<BufferDescriptor> graph_outputs_;
 
   id<MTLComputeCommandEncoder> external_command_encoder_;
+  std::function<id<MTLComputeCommandEncoder>(bool is_last)> control_encoder_;
   id<MTLCommandQueue> command_queue_;
+  std::unique_ptr<GpuAlarmClock> gpu_alarm_clock_;
+  id<MTLComputePipelineState> signal_program_;
+  id<MTLBuffer> signal_buffer_;
+  int signal_value_ = 0;
 };
 
-inline Delegate* GetMetalDelegate(TfLiteNode* node) {
+Delegate* GetMetalDelegate(TfLiteNode* node) {
   return reinterpret_cast<Delegate*>(node->user_data);
 }
 
-inline Delegate* GetMetalDelegate(TfLiteDelegate* delegate) {
+Delegate* GetMetalDelegate(TfLiteDelegate* delegate) {
   return reinterpret_cast<Delegate*>(delegate->data_);
 }
 
@@ -392,8 +578,7 @@
         // forbids that.
         const auto status = metal_delegate->Prepare(context, params);
         if (status.ok()) return metal_delegate;
-        context->ReportError(context, "TfLiteGpuDelegate Prepare: %s",
-                             status.error_message().c_str());
+        context->ReportError(context, "TfLiteGpuDelegate Prepare: %s", status.message().data());
         return nullptr;
       },
       // .free
@@ -406,8 +591,7 @@
       [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
         const auto status = GetMetalDelegate(node)->Invoke(context);
         if (status.ok()) return kTfLiteOk;
-        context->ReportError(context, "TfLiteMetalDelegate Invoke: %s",
-                             status.error_message().c_str());
+        context->ReportError(context, "TfLiteMetalDelegate Invoke: %s", status.message().data());
         return kTfLiteError;
       },
       nullptr,                // .profiling_string
@@ -428,8 +612,7 @@
 }  // namespace tflite
 
 TfLiteDelegate* NewGpuDelegate(const GpuDelegateOptions* options) {
-  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
-                       "Created TensorFlow Lite delegate for Metal.");
+  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for Metal.");
   auto* metal_delegate = new ::tflite::gpu::metal::Delegate(options);
   return metal_delegate ? metal_delegate->tflite_delegate() : nullptr;
 }
@@ -443,9 +626,11 @@
   return metal_delegate && metal_delegate->BindBufferToTensor(buffer, tensor_index).ok();
 }
 
-bool SetCommandEncoder(TfLiteDelegate* delegate, id<MTLComputeCommandEncoder> encoder) {
+bool TFLSetCommandEncoder(
+    TfLiteDelegate* delegate, id<MTLComputeCommandEncoder> encoder,
+    std::function<id<MTLComputeCommandEncoder>(bool is_last)> control_encoder) {
   auto* metal_delegate = ::tflite::gpu::metal::GetMetalDelegate(delegate);
   if (!metal_delegate) return false;
-  metal_delegate->SetCommandEncoder(encoder);
+  metal_delegate->SetCommandEncoder(encoder, control_encoder);
   return true;
 }
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index ec0d78e..6cb3587 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -8,10 +8,19 @@
 
 cc_library(
     name = "nnapi_delegate",
-    srcs = ["nnapi_delegate.cc"],
+    srcs = select({
+        "//tensorflow:ios": [
+            "nnapi_delegate_disabled.cc",
+        ],
+        "//tensorflow:windows": [
+            "nnapi_delegate_disabled.cc",
+        ],
+        "//conditions:default": [
+            "nnapi_delegate.cc",
+        ],
+    }),
     hdrs = ["nnapi_delegate.h"],
     deps = [
-        "//tensorflow/lite:framework",
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:kernel_util",
@@ -24,6 +33,7 @@
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
     tags = [
+        "no_windows",
         # TODO(b/122987564): Enable on Android after resolving API 27 failures.
         "tflite_not_portable_android",
         "tflite_not_portable_ios",
diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/BUILD
new file mode 100644
index 0000000..17a2389
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/BUILD
@@ -0,0 +1,7 @@
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "nnapi_delegate_src",
+    srcs = ["NnApiDelegate.java"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java
new file mode 100644
index 0000000..78bf59d
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.nnapi;
+
+import org.tensorflow.lite.Delegate;
+import org.tensorflow.lite.Tensor;
+
+/** {@link Delegate} for NNAPI inference. */
+public class NnApiDelegate implements Delegate, AutoCloseable {
+
+  private static final long INVALID_DELEGATE_HANDLE = 0;
+
+  private long delegateHandle;
+
+  public NnApiDelegate() {
+    delegateHandle = createDelegate();
+  }
+
+  @Override
+  public long getNativeHandle() {
+    return delegateHandle;
+  }
+
+  /**
+   * The NNAPI delegate is a singleton. Nothing to delete for now, so only mark the handle invalid.
+   */
+  @Override
+  public void close() {
+    if (delegateHandle != INVALID_DELEGATE_HANDLE) {
+      delegateHandle = INVALID_DELEGATE_HANDLE;
+    }
+  }
+
+  private static native long createDelegate();
+}
diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD b/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD
new file mode 100644
index 0000000..405fc2e
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD
@@ -0,0 +1,25 @@
+# Description:
+# Java Native Interface (JNI) library intended for implementing the
+# TensorFlow Lite NNAPI delegate Java API using the TensorFlow Lite CC library.
+
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "native",
+    srcs = ["nnapi_delegate_jni.cc"],
+    hdrs = ["nnapi_delegate_jni.h"],
+    copts = tflite_copts(),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
+        "//tensorflow/lite/java/jni",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc
new file mode 100644
index 0000000..a4ff12b
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc
@@ -0,0 +1,24 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h"
+
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate(JNIEnv* env,
+                                                            jclass clazz) {
+  return reinterpret_cast<jlong>(tflite::NnApiDelegate());
+}
diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h
new file mode 100644
index 0000000..12cf56c
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_
+#define TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/*
+ * Class:     org_tensorflow_lite_nnapi_NnApiDelegate
+ * Method:    createDelegate
+ * Signature: ()J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate(JNIEnv* env,
+                                                            jclass clazz);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 4331f52..5cb02e2 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -12,18 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+
 #include <cstdarg>
 #include <cstring>
 #include <iostream>
 #include <memory>
 #include <vector>
 
-#include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/builtin_op_data.h"
 #include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
-#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
@@ -72,6 +72,18 @@
   }
 }
 
+bool IsScalarInputSupported(int builtin_code) {
+  switch (builtin_code) {
+    case kTfLiteBuiltinAdd:
+    case kTfLiteBuiltinMul:
+    case kTfLiteBuiltinSub:
+    case kTfLiteBuiltinDiv:
+      return true;
+    default:
+      return false;
+  }
+}
+
 bool IsHybridOperator(const TfLiteContext* context, int builtin_code,
                       const TfLiteNode* node) {
   switch (builtin_code) {
@@ -116,7 +128,16 @@
 constexpr int32_t kMinSdkVersionForNNAPI = 27;
 constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
 constexpr int32_t kMinSdkVersionForNNAPI12 = 29;
+constexpr size_t kDefaultByteAlignmentForNNAPI = 16;
 
+static size_t getNumPaddingBytes(size_t byte_size) {
+  size_t num_padding_bytes = 0;
+  if (byte_size % kDefaultByteAlignmentForNNAPI) {
+    num_padding_bytes = kDefaultByteAlignmentForNNAPI -
+                        (byte_size % kDefaultByteAlignmentForNNAPI);
+  }
+  return num_padding_bytes;
+}
 }  // namespace
 
 // RAII NN API Model Destructor for use with std::unique_ptr
@@ -287,8 +308,10 @@
     return kTfLiteOk;
   }
 
-  TfLiteStatus AddTensorInput(int tensor_index, bool hybrid_op) {
-    return AddTensor(tensor_index, hybrid_op, &augmented_inputs_);
+  TfLiteStatus AddTensorInput(int tensor_index, bool hybrid_op,
+                              bool scalar_as_tensor = false) {
+    return AddTensor(tensor_index, hybrid_op, &augmented_inputs_,
+                     scalar_as_tensor);
   }
 
   TfLiteStatus AddTensorOutput(int tensor_index) {
@@ -418,7 +441,8 @@
   // If another caller previously created a NN API tensor for `tensor_index`
   // then the existing one is returned.
   TfLiteStatus AddTensor(int tensor_index, bool hybrid_op,
-                         std::vector<uint32_t>* indices) {
+                         std::vector<uint32_t>* indices,
+                         bool scalar_as_tensor = false) {
     int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
     if (ann_tensor_index != -1) {
       indices->push_back(ann_tensor_index);
@@ -469,10 +493,16 @@
         context_->ReportError(context_, "Logic error in NN API Delegate.\n");
         return kTfLiteError;
     }
+    uint32_t tensor_rank = static_cast<uint32_t>(tensor->dims->size);
+    uint32_t* tensor_dims = reinterpret_cast<uint32_t*>(tensor->dims->data);
+    if (scalar_as_tensor && tensor_rank == 0) {
+      // Use rank 1, shape {1} operand for TFLite scalar tensors.
+      tensor_rank = 1;
+      tensor_dims = &tensor_rank;
+    }
 
-    ANeuralNetworksOperandType operand_type{
-        nn_type, static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    ANeuralNetworksOperandType operand_type{nn_type, tensor_rank, tensor_dims,
+                                            scale, zeroPoint};
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
@@ -703,16 +733,16 @@
         break;
       case kTfLiteBuiltinResizeBilinear:
         if (version == 1) {
-          if (android_sdk_version < kMinSdkVersionForNNAPI12) {
-            // Some NNAPI 1.1 drivers don't support this operator properly.
-            return nullptr;
-          }
           const auto& input = context->tensors[node->inputs->data[0]];
           if (input.dims->size != 4) return nullptr;
           if (input.type != kTfLiteFloat32 && input.type != kTfLiteUInt8) {
             return nullptr;
           }
-
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              input.type != kTfLiteFloat32) {
+            // NNAPI 1.0 & 1.1 only support float input.
+            return nullptr;
+          }
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             const int output_id = mapping_args.node->outputs->data[0];
@@ -720,8 +750,8 @@
             const int output_height = output.dims->data[1];
             const int output_width = output.dims->data[2];
             // TfLiteResizeBilinearParams's |align_corners| is ignored.
-            mapping_args.builder->AddScalarInt32Operand(output_height);
             mapping_args.builder->AddScalarInt32Operand(output_width);
+            mapping_args.builder->AddScalarInt32Operand(output_height);
             return ANEURALNETWORKS_RESIZE_BILINEAR;
           };
         }
@@ -882,11 +912,14 @@
         if (version == 1 && node->inputs->size == 2 &&
             (android_sdk_version >= kMinSdkVersionForNNAPI11) &&
             (context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 ||
+             (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8 &&
+              context->tensors[node->inputs->data[0]].params.zero_point == 0) ||
              android_sdk_version >= kMinSdkVersionForNNAPI12)) {
           // NNAPI does not support specifying the padding value.
           // Before 1.2, NNAPI pads physical zero for quantized tensors, so only
-          // delegate float pad to NNAPI. NNAPI 1.2 onwards pads with
-          // zero-point, so delegate quantized pad as well.
+          // delegate pad with float input or quantized input with zero_point ==
+          // 0 to NNAPI. NNAPI 1.2 onwards pads with zero-point, so delegate
+          // other quantized pad as well.
           return BasicMappingFn<ANEURALNETWORKS_PAD>;
         }
         break;
@@ -921,6 +954,36 @@
           return BasicMappingFn<ANEURALNETWORKS_TRANSPOSE>;
         }
         break;
+      case kTfLiteBuiltinAbs:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_ABS>;
+        }
+        break;
+      case kTfLiteBuiltinExp:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_EXP>;
+        }
+        break;
+      case kTfLiteBuiltinLog:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_LOG>;
+        }
+        break;
+      case kTfLiteBuiltinRsqrt:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_RSQRT>;
+        }
+        break;
+      case kTfLiteBuiltinSin:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_SIN>;
+        }
+        break;
+      case kTfLiteBuiltinSqrt:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_SQRT>;
+        }
+        break;
       case kTfLiteBuiltinRnn:
         // NNAPI only support float32 weights.
         if (version == 1 && node->inputs->size == 5 &&
@@ -1079,6 +1142,11 @@
           return BasicMappingFn<ANEURALNETWORKS_HASHTABLE_LOOKUP>;
         }
         break;
+      case kTfLiteBuiltinPrelu:
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          return BasicMappingFn<ANEURALNETWORKS_PRELU>;
+        }
+        break;
       default:
         // All other operators are not mapped.
         return nullptr;
@@ -1108,6 +1176,21 @@
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
           context, nnapi_->ANeuralNetworksCompilation_create(nn_model_.get(),
                                                              &compilation));
+
+      auto preference = StatefulNnApiDelegate::GetOptions(params->delegate)
+                            .execution_preference;
+      if (preference !=
+          StatefulNnApiDelegate::Options::ExecutionPreference::kUndefined) {
+        const int preference_result =
+            nnapi_->ANeuralNetworksCompilation_setPreference(compilation,
+                                                             preference);
+        if (preference_result != ANEURALNETWORKS_NO_ERROR) {
+          nnapi_->ANeuralNetworksCompilation_free(compilation);
+          compilation = nullptr;
+        }
+        RETURN_TFLITE_ERROR_IF_NN_ERROR(context, preference_result);
+      }
+
       const int finish_result =
           nnapi_->ANeuralNetworksCompilation_finish(compilation);
       if (finish_result != ANEURALNETWORKS_NO_ERROR) {
@@ -1150,6 +1233,7 @@
                 execution, relative_input_index, nullptr,
                 nn_input_memory_->get_handle(), input_offset, tensor->bytes));
         input_offset += tensor->bytes;
+        input_offset += getNumPaddingBytes(tensor->bytes);
         relative_input_index++;
       }
     }
@@ -1165,6 +1249,7 @@
               execution, relative_output_index, nullptr,
               nn_output_memory_->get_handle(), output_offset, tensor->bytes));
       output_offset += tensor->bytes;
+      output_offset += getNumPaddingBytes(tensor->bytes);
       relative_output_index++;
     }
 
@@ -1204,6 +1289,7 @@
       memcpy(tensor->data.raw,
              nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes);
       output_offset += tensor->bytes;
+      output_offset += getNumPaddingBytes(tensor->bytes);
     }
 
     return kTfLiteOk;
@@ -1298,6 +1384,7 @@
           context->GetNodeAndRegistration(context, node_index, &node, &reg));
 
       const bool hybrid_op = IsHybridOperator(context, reg->builtin_code, node);
+      const bool scalar_as_tensor = IsScalarInputSupported(reg->builtin_code);
 
       // Map inputs to NN API tensor indices.
       int num_added_inputs = 0;
@@ -1327,7 +1414,8 @@
                 builder.AddTensorInput(input_index, hybrid_op));
           }
         } else {
-          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op));
+          TF_LITE_ENSURE_STATUS(
+              builder.AddTensorInput(input_index, hybrid_op, scalar_as_tensor));
         }
         ++num_added_inputs;
       }
@@ -1370,6 +1458,7 @@
           context->tensors[i].allocation_type != kTfLiteMmapRo) {
         inputs.push_back(operand_mapping_.lite_index_to_ann(i));
         total_input_byte_size += context->tensors[i].bytes;
+        total_input_byte_size += getNumPaddingBytes(context->tensors[i].bytes);
       }
     }
 
@@ -1377,6 +1466,7 @@
     for (int i : TfLiteIntArrayView(output_tensors)) {
       outputs.push_back(operand_mapping_.lite_index_to_ann(i));
       total_output_byte_size += context->tensors[i].bytes;
+      total_output_byte_size += getNumPaddingBytes(context->tensors[i].bytes);
     }
 
     // Add state output tensors as model outputs.
@@ -1414,107 +1504,113 @@
 
 }  // namespace
 
-// Return a NN API Delegate struct that can check for support of ops.
-TfLiteDelegate* NnApiDelegate() {
-  static TfLiteDelegate delegate = {
-      .data_ = nullptr,
-      .Prepare = [](TfLiteContext* context,
-                    TfLiteDelegate* delegate) -> TfLiteStatus {
-        // Do not check nodes_ if NN API is unavailable.
-        const NnApi* nnapi = NnApiImplementation();
-        if (nnapi->android_sdk_version < kMinSdkVersionForNNAPI ||
-            !nnapi->nnapi_exists) {
-          return kTfLiteOk;
-        }
-        // For NNAPI 1.2+, check if there is any accelerator available.
-        // If not, don't delegate to NNAPI's CPU reference implementation.
-        if (nnapi->android_sdk_version >= kMinSdkVersionForNNAPI12) {
-          uint32_t device_count = 0;
-          RETURN_TFLITE_ERROR_IF_NN_ERROR(
-              context, nnapi->ANeuralNetworks_getDeviceCount(&device_count));
-          // Any available accelerator will make the device_count larger than 1.
-          // More sophisticated check and whitelisting can be added later.
-          if (device_count <= 1) {
-            return kTfLiteOk;
-          }
-        }
-        // Allocate one element in vector already since TensorFlow Lite uses
-        // the first value as the number of nodes. The actual value will be set
-        // later, after the vector has been filled.
-        std::vector<int> supported_nodes(1);
-        // We don't care about all nodes_, we only care about ones in the
-        // current plan.
-        TfLiteIntArray* plan;
-        TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
+StatefulNnApiDelegate::StatefulNnApiDelegate(Options options)
+    : TfLiteDelegate(TfLiteDelegateCreate()),
+      delegate_data_(Data{.options = options}) {
+  Prepare = DoPrepare;
+  data_ = &delegate_data_;
+}
 
-        int android_sdk_version = NnApiImplementation()->android_sdk_version;
-        // Check for every node if it is supported
-        // TODO(b/80625235): Fix this to do more careful checking of versioning.
-        for (int node_index : TfLiteIntArrayView(plan)) {
-          TfLiteNode* node;
-          TfLiteRegistration* registration;
-          TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
-              context, node_index, &node, &registration));
-          if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
-                                       registration->version,
-                                       android_sdk_version, node)) {
-            supported_nodes.push_back(node_index);
-          }
-        }
-        // First element in vector must be the number of actual nodes.
-        supported_nodes[0] = supported_nodes.size() - 1;
+StatefulNnApiDelegate::StatefulNnApiDelegate()
+    : StatefulNnApiDelegate(Options()) {}
 
-        // NN API Delegate Registration (the pseudo kernel that will invoke NN
-        // API node sub sets)
-        static const TfLiteRegistration nnapi_delegate_kernel = {
-            .init = [](TfLiteContext* context, const char* buffer,
-                       size_t length) -> void* {
-              const TfLiteDelegateParams* params =
-                  reinterpret_cast<const TfLiteDelegateParams*>(buffer);
-              NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
-              kernel_state->Init(context, params);
-              return kernel_state;
-            },
+const StatefulNnApiDelegate::Options& StatefulNnApiDelegate::GetOptions(
+    TfLiteDelegate* delegate) {
+  auto delegate_data = reinterpret_cast<Data*>(delegate->data_);
+  return delegate_data->options;
+}
 
-            .free = [](TfLiteContext* context, void* buffer) -> void {
-              delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
-            },
+TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context,
+                                              TfLiteDelegate* delegate) {
+  // Do not check nodes_ if NN API is unavailable.
+  const NnApi* nnapi = NnApiImplementation();
+  if (nnapi->android_sdk_version < kMinSdkVersionForNNAPI ||
+      !nnapi->nnapi_exists) {
+    return kTfLiteOk;
+  }
+  // For NNAPI 1.2+, check if there is any accelerator available.
+  // If not, don't delegate to NNAPI's CPU reference implementation.
+  if (nnapi->android_sdk_version >= kMinSdkVersionForNNAPI12) {
+    uint32_t device_count = 0;
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi->ANeuralNetworks_getDeviceCount(&device_count));
+    // Any available accelerator will make the device_count larger than 1.
+    // More sophisticated check and whitelisting can be added later.
+    if (device_count <= 1) {
+      return kTfLiteOk;
+    }
+  }
+  // Allocate one element in vector already since TensorFlow Lite uses
+  // the first value as the number of nodes. The actual value will be set
+  // later, after the vector has been filled.
+  std::vector<int> supported_nodes(1);
+  // We don't care about all nodes_, we only care about ones in the
+  // current plan.
+  TfLiteIntArray* plan;
+  TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
 
-            .prepare = [](TfLiteContext* context,
-                          TfLiteNode* node) -> TfLiteStatus {
-              // Since the underlying resize happened ahead of delegation
-              // worked. This does nothing.
-              return kTfLiteOk;
-            },
+  int android_sdk_version = NnApiImplementation()->android_sdk_version;
+  // Check for every node if it is supported
+  // TODO(b/80625235): Fix this to do more careful checking of versioning.
+  for (int node_index : TfLiteIntArrayView(plan)) {
+    TfLiteNode* node;
+    TfLiteRegistration* registration;
+    TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
+        context, node_index, &node, &registration));
+    if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
+                                 registration->version, android_sdk_version,
+                                 node)) {
+      supported_nodes.push_back(node_index);
+    }
+  }
+  // First element in vector must be the number of actual nodes.
+  supported_nodes[0] = supported_nodes.size() - 1;
 
-            .invoke = [](TfLiteContext* context,
-                         TfLiteNode* node) -> TfLiteStatus {
-              NNAPIDelegateKernel* state =
-                  reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
-              return state->Invoke(context, node);
-            },
-
-            .profiling_string = nullptr,
-            .builtin_code = kTfLiteBuiltinDelegate,
-            .custom_name = "TfLiteNnapiDelegate",
-            .version = 1,
-        };
-
-        // Request TFLite to partition the graph and make kernels
-        // for each independent node sub set a new nnapi_delegate_kernel.
-        return context->ReplaceNodeSubsetsWithDelegateKernels(
-            context, nnapi_delegate_kernel,
-            reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
-            delegate);
+  // NN API Delegate Registration (the pseudo kernel that will invoke NN
+  // API node sub sets)
+  static const TfLiteRegistration nnapi_delegate_kernel = {
+      .init = [](TfLiteContext* context, const char* buffer,
+                 size_t length) -> void* {
+        const TfLiteDelegateParams* params =
+            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+        NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
+        kernel_state->Init(context, params);
+        return kernel_state;
       },
 
-      .CopyFromBufferHandle = nullptr,
-      .CopyToBufferHandle = nullptr,
-      .FreeBufferHandle = nullptr,
-      .flags = kTfLiteDelegateFlagsNone,
+      .free = [](TfLiteContext* context, void* buffer) -> void {
+        delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
+      },
+
+      .prepare = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
+        // The underlying resize already happened ahead of delegation, so
+        // there is nothing to do here.
+        return kTfLiteOk;
+      },
+
+      .invoke = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
+        NNAPIDelegateKernel* state =
+            reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
+        return state->Invoke(context, node);
+      },
+
+      .profiling_string = nullptr,
+      .builtin_code = kTfLiteBuiltinDelegate,
+      .custom_name = "TfLiteNnapiDelegate",
+      .version = 1,
   };
 
-  return &delegate;
+  // Request TFLite to partition the graph and make kernels
+  // for each independent node sub set a new nnapi_delegate_kernel.
+  return context->ReplaceNodeSubsetsWithDelegateKernels(
+      context, nnapi_delegate_kernel,
+      reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()), delegate);
+}
+
+// Returns a singleton NNAPI Delegate that can check for support of ops.
+TfLiteDelegate* NnApiDelegate() {
+  static StatefulNnApiDelegate* delegate = new StatefulNnApiDelegate();
+  return delegate;
 }
 
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
index 099fb72..9981e38 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
@@ -15,17 +15,69 @@
 #ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
 #define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
 
+#include <memory>
+#include <string>
+
 #include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
-// Return a delegate that can be used to use the NN API.
+// TfLiteDelegate to interface with NNAPI.
+class StatefulNnApiDelegate : public TfLiteDelegate {
+ public:
+  // Encapsulates all options that are specific to NNAPI delegate.
+  struct Options {
+    // Preferred Power/perf trade-off. For more details please see
+    // ANeuralNetworksCompilation_setPreference documentation at:
+    // https://developer.android.com/ndk/reference/group/neural-networks.html
+    enum ExecutionPreference {
+      kUndefined = -1,
+      kLowPower = 0,
+      kFastSingleAnswer = 1,
+      kSustainedSpeed = 2,
+    };
+
+    // Preferred Power/perf trade-off.
+    ExecutionPreference execution_preference = kUndefined;
+  };
+
+  // Uses default options.
+  StatefulNnApiDelegate();
+
+  // The constructor that accepts options from user.
+  explicit StatefulNnApiDelegate(Options options);
+
+  ~StatefulNnApiDelegate() = default;
+
+  // Returns the delegate options.
+  static const Options& GetOptions(TfLiteDelegate* delegate);
+
+ private:
+  // Encapsulates all delegate data.
+  struct Data {
+    // Delegate options to use.
+    Options options;
+  };
+
+  // Implements TfLiteDelegate::Prepare. Please refer to TfLiteDelegate
+  // documentation for more info.
+  static TfLiteStatus DoPrepare(TfLiteContext* context,
+                                TfLiteDelegate* delegate);
+
+  // Delegate data presented through TfLiteDelegate::data_.
+  Data delegate_data_;
+};
+
+// DEPRECATED: Please use StatefulNnApiDelegate class instead.
+//
+// Returns a singleton delegate that can be used to use the NN API.
 // e.g.
 //   NnApiDelegate* delegate = NnApiDelegate();
 //   interpreter->ModifyGraphWithDelegate(&delegate);
 // NnApiDelegate() returns a singleton, so you should not free this
 // pointer or worry about its lifetime.
 TfLiteDelegate* NnApiDelegate();
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc
new file mode 100644
index 0000000..1eb783a
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+
+namespace tflite {
+
+// Return a non-functional NN API Delegate struct.
+TfLiteDelegate* NnApiDelegate() {
+  static TfLiteDelegate delegate = [] {
+    TfLiteDelegate delegate = TfLiteDelegateCreate();
+    delegate.Prepare = [](TfLiteContext* context,
+                          TfLiteDelegate* delegate) -> TfLiteStatus {
+      // Silently succeed without modifying the graph.
+      return kTfLiteOk;
+    };
+    return delegate;
+  }();
+
+  return &delegate;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 17d3776..e8c71e0 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/test_util.h"
@@ -45,6 +46,15 @@
     });
   }
 
+  explicit SingleOpModelWithNNAPI(
+      const StatefulNnApiDelegate::Options& options) {
+    stateful_delegate_.reset(new StatefulNnApiDelegate(options));
+    auto* delegate = stateful_delegate_.get();
+    this->SetApplyDelegate([delegate](Interpreter* interpreter) {
+      interpreter->ModifyGraphWithDelegate(delegate);
+    });
+  }
+
   TfLiteStatus ResizeInputTensor(int tensor_index,
                                  const std::vector<int>& dims) {
     return interpreter_->ResizeInputTensor(tensor_index, dims);
@@ -70,6 +80,26 @@
         break;
     }
   }
+
+  void GetData(int index, TensorType type, std::vector<float>* output) {
+    switch (type) {
+      case TensorType_FLOAT32:
+        *output = ExtractVector<float>(index);
+        break;
+      case TensorType_UINT8:
+        *output = Dequantize<uint8_t>(ExtractVector<uint8_t>(index),
+                                      GetScale(index), GetZeroPoint(index));
+        break;
+      default:
+        FAIL() << "Type not supported: " << type;
+        break;
+    }
+  }
+
+ private:
+  // Stateful NNAPI delegate. This is valid only if the stateful constructor is
+  // used.
+  std::unique_ptr<StatefulNnApiDelegate> stateful_delegate_;
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -78,13 +108,16 @@
                   const TensorData& output,
                   ActivationFunctionType activation_type,
                   bool allow_fp32_relax_to_fp16 = false) {
-    input1_ = AddInput(input1);
-    input2_ = AddInput(input2);
-    output_ = AddOutput(output);
-    SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
-                 CreateAddOptions(builder_, activation_type).Union());
-    BuildInterpreter({GetShape(input1_), GetShape(input2_)},
-                     allow_fp32_relax_to_fp16);
+    Init(input1, input2, output, activation_type, allow_fp32_relax_to_fp16);
+  }
+
+  FloatAddOpModel(const StatefulNnApiDelegate::Options& options,
+                  const TensorData& input1, const TensorData& input2,
+                  const TensorData& output,
+                  ActivationFunctionType activation_type,
+                  bool allow_fp32_relax_to_fp16 = false)
+      : SingleOpModelWithNNAPI(options) {
+    Init(input1, input2, output, activation_type, allow_fp32_relax_to_fp16);
   }
 
   int input1() { return input1_; }
@@ -96,6 +129,20 @@
   int input1_;
   int input2_;
   int output_;
+
+ private:
+  // Performs initialization logic shared across all constructors.
+  void Init(const TensorData& input1, const TensorData& input2,
+            const TensorData& output, ActivationFunctionType activation_type,
+            bool allow_fp32_relax_to_fp16 = false) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+                 CreateAddOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)},
+                     allow_fp32_relax_to_fp16);
+  }
 };
 
 // Do a test with the NN API using no activation.
@@ -109,6 +156,17 @@
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
 }
 
+// Do a test with scalar input using no activation.
+TEST(NNAPIDelegate, AddScalarWithNoActivation) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+                    ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.7});
+  m.PopulateTensor<float>(m.input2(), {0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.3, 0.8, 0.8}));
+}
+
 // Do a test with the NN API using no activation.
 // The test allows computing FP32 with FP16 precision. In this particular case,
 // calculating in FP32 or FP16 should produce the same results.
@@ -144,6 +202,21 @@
   EXPECT_EQ(m.ResizeInputTensor(m.input1(), {1, 3, 3, 1}), kTfLiteError);
 }
 
+// Sanity check for the stateful NNAPI delegate.
+TEST(NNAPIDelegate, StatefulDelegate) {
+  StatefulNnApiDelegate::Options options;
+  options.execution_preference =
+      StatefulNnApiDelegate::Options::ExecutionPreference::kLowPower;
+
+  FloatAddOpModel m(options, {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
+}
+
 class FloatMulOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatMulOpModel(const TensorData& input1, const TensorData& input2,
@@ -987,6 +1060,87 @@
                                 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
 }
 
+class ElementwiseOpBaseModel : public SingleOpModelWithNNAPI {
+ public:
+  int input() const { return input_; }
+  int output() const { return output_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class ElementwiseOpFloatModel : public ElementwiseOpBaseModel {
+ public:
+  ElementwiseOpFloatModel(BuiltinOperator op,
+                          std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(op, BuiltinOptions_NONE, 0);
+    BuildInterpreter({input_shape});
+  }
+};
+
+TEST(Elementwise, Abs) {
+  ElementwiseOpFloatModel m(BuiltinOperator_ABS, {1, 2, 4, 1});
+  m.PopulateTensor<float>(m.input(), {
+                                         0.f, -6.2f, 2.f, 4.f,  //
+                                         3.f, -2.f, 10.f, 1.f,  //
+                                     });
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({
+                                                      0.f, 6.2f, 2.f, 4.f,  //
+                                                      3.f, 2.f, 10.f, 1.f,  //
+                                                  }));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 2, 4, 1}));
+}
+
+TEST(Elementwise, Exp) {
+  ElementwiseOpFloatModel m(BuiltinOperator_EXP, {3, 1, 2});
+  m.PopulateTensor<float>(m.input(), {1.0, 0.0, -1.0, 1.0, 1.0, -1.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear(
+                  {2.71828, 1, 0.367879, 2.71828, 2.71828, 0.367879})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({3, 1, 2}));
+}
+
+TEST(Elementwise, Log) {
+  ElementwiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({0, 1.14473, 0, 0})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
+TEST(Elementwise, Rsqrt) {
+  ElementwiseOpFloatModel m(BuiltinOperator_RSQRT, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {1, 2, 4, 9});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({1, 0.7071, 0.5, 0.33333})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
+TEST(Elementwise, Sin) {
+  ElementwiseOpFloatModel m(BuiltinOperator_SIN, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({0, 0, 0, 0.84147})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
+TEST(Elementwise, Sqrt) {
+  ElementwiseOpFloatModel m(BuiltinOperator_SQRT, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 1, 2, 4});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({0, 1, 1.41421, 2})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
 class FloatSubOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatSubOpModel(const TensorData& input1, const TensorData& input2,
@@ -4217,6 +4371,85 @@
                               1,
                           }));
 }
+
+// A base class of PRelu op model. It provides the constructor for
+// FloatPReluOpModel and QuantizedPReluOpModel.
+class PReluOpModel : public SingleOpModelWithNNAPI {
+ public:
+  PReluOpModel(const TensorData& input, const TensorData& alpha)
+      : input_type_(input.type) {
+    input_ = AddInput(input);
+    alpha_ = AddInput(alpha);
+    output_ = AddOutput({input.type, input.shape, input.min, input.max});
+    SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_), GetShape(alpha_)});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    SetData(input_, input_type_, data);
+  }
+
+  void SetAlpha(std::initializer_list<float> data) {
+    SetData(alpha_, input_type_, data);
+  }
+
+  std::vector<float> GetOutput() {
+    std::vector<float> output;
+    GetData(output_, input_type_, &output);
+    return output;
+  }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+
+  const TensorType input_type_;
+};
+
+TEST(NNAPIDelegate, PReluFloat) {
+  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                 {TensorType_FLOAT32, {1, 1, 3}});
+
+  m.SetInput({
+      0.0f, 0.0f, 0.0f,     // Row 1, Column 1
+      1.0f, 1.0f, 1.0f,     // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,  // Row 2, Column 1
+      -2.0f, -2.0f, -2.0f,  // Row 2, Column 2
+  });
+  m.SetAlpha({0.0f, 1.0f, 2.0f});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 0.0f, 0.0f,    // Row 1, Column 1
+                                 1.0f, 1.0f, 1.0f,    // Row 1, Column 2
+                                 0.0f, -1.0f, -2.0f,  // Row 2, Column 1
+                                 0.0f, -2.0f, -4.0f,  // Row 2, Column 2
+                             }));
+}
+
+TEST(NNAPIDelegate, PReluQuantized) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  PReluOpModel m({TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax},
+                 {TensorType_UINT8, {1, 1, 3}, kMin, kMax});
+  m.SetInput({
+      0.0f, 0.0f, 0.0f,        // Row 1, Column 1
+      0.5f, 0.5f, 0.5f,        // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,     // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f,  // Row 2, Column 2
+  });
+  m.SetAlpha({0.0f, 0.5f, -0.5f});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     0.0f, 0.0f, 0.0f,       // Row 1, Column 1
+                                     0.5f, 0.5f, 0.5f,       // Row 1, Column 2
+                                     0.0f, -0.5f, 0.5f,      // Row 2, Column 1
+                                     0.0f, -0.125f, 0.125f,  // Row 2, Column 2
+                                 },
+                                 kQuantizedTolerance)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/examples/android/BUILD b/tensorflow/lite/examples/android/BUILD
deleted file mode 100644
index f7fedb3..0000000
--- a/tensorflow/lite/examples/android/BUILD
+++ /dev/null
@@ -1,61 +0,0 @@
-# Description:
-#   TensorFlow camera demo app for Android.
-
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-# Build the demo native demo lib from the original directory to reduce code
-# reuse. Note that the Java counterparts (ObjectTracker.java and
-# ImageUtils.java) are still duplicated.
-cc_library(
-    name = "tensorflow_native_libs",
-    srcs = [
-        "//tensorflow/examples/android:libtensorflow_demo.so",
-    ],
-    tags = [
-        "manual",
-        "notap",
-    ],
-)
-
-android_binary(
-    name = "tflite_demo",
-    srcs = glob([
-        "app/src/main/java/**/*.java",
-    ]),
-    aapt_version = "aapt",
-    # Package assets from assets dir as well as all model targets.
-    # Remove undesired models (and corresponding Activities in source)
-    # to reduce APK size.
-    assets = [
-        "//tensorflow/lite/examples/android/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
-        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
-        "//tensorflow/lite/examples/android/app/src/main/assets:conv_actions_labels.txt",
-        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
-        "@tflite_mobilenet_ssd_quant//:detect.tflite",
-        "//tensorflow/lite/examples/android/app/src/main/assets:box_priors.txt",
-        "//tensorflow/lite/examples/android/app/src/main/assets:labelmap.txt",
-    ],
-    assets_dir = "",
-    custom_package = "org.tensorflow.lite.demo",
-    inline_constants = 1,
-    manifest = "app/src/main/AndroidManifest.xml",
-    nocompress_extensions = [
-        ".tflite",
-    ],
-    resource_files = glob(["app/src/main/res/**"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
-    deps = [
-        ":tensorflow_native_libs",
-        "//tensorflow/lite/java:tensorflowlite",
-    ],
-)
diff --git a/tensorflow/lite/examples/android/android.iml b/tensorflow/lite/examples/android/android.iml
deleted file mode 100644
index f0a5ac2..0000000
--- a/tensorflow/lite/examples/android/android.iml
+++ /dev/null
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module external.linked.project.id="android" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" type="JAVA_MODULE" version="4">
-  <component name="FacetManager">
-    <facet type="java-gradle" name="Java-Gradle">
-      <configuration>
-        <option name="BUILD_FOLDER_PATH" value="$MODULE_DIR$/build" />
-        <option name="BUILDABLE" value="false" />
-      </configuration>
-    </facet>
-  </component>
-  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="true">
-    <exclude-output />
-    <content url="file://$MODULE_DIR$">
-      <excludeFolder url="file://$MODULE_DIR$/.gradle" />
-    </content>
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
\ No newline at end of file
diff --git a/tensorflow/lite/examples/android/app/README.md b/tensorflow/lite/examples/android/app/README.md
index e2b1b26..0da8d13 100644
--- a/tensorflow/lite/examples/android/app/README.md
+++ b/tensorflow/lite/examples/android/app/README.md
@@ -1,54 +1,9 @@
-# TF Lite Android App Example
+# TF Lite Android Example (Deprecated)
 
-A simple Android example that demonstrates image classification and object
-detection using the camera, as well as speech recognition using the microphone.
+This example has been moved to the new
+[TensorFlow examples repo](https://github.com/tensorflow/examples), and split
+into several distinct examples:
 
-## Building in Android Studio with TensorFlow Lite AAR from JCenter.
-The build.gradle is configured to use TensorFlow Lite's nightly build.
-
-If you see a build error related to compatibility with Tensorflow Lite's Java
-API (example: method X is undefined for type Interpreter), there has likely been
-a backwards compatible change to the API. You will need to pull new app code
-that's compatible with the nightly build and may need to first wait a few days
-for our external and internal code to merge.
-
-## Building from Source with Bazel
-
-1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
-
-  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
-     It's easiest with Android Studio.
-
-      - You'll need at least SDK version 23.
-      - Make sure to install the latest version of Bazel. Some distributions
-        ship with Bazel 0.5.4, which is too old.
-      - Bazel requires Android Build Tools `26.0.1` or higher.
-      - You also need to install the Android Support Repository, available
-        through Android Studio under `Android SDK Manager -> SDK Tools ->
-        Android Support Repository`.
-
-  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
-     to add SDK and NDK targets.
-
-     NOTE: As long as you have the SDK and NDK installed, the `./configure`
-     script will create these rules for you. Answer "Yes" when the script asks
-     to automatically configure the `./WORKSPACE`.
-
-      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
-        you have installed.
-      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
-        the NDK to `~/Android/Sdk/ndk-bundle`.
-
-2. Build this demo app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
-
-  ```shell
-  bazel build -c opt --cxxopt='--std=c++11' --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
-    //tensorflow/lite/examples/android:tflite_demo
-  ```
-
-3. Install the demo on a
-   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
-
-  ```shell
-  adb install bazel-bin/tensorflow/lite/examples/android/tflite_demo.apk
-  ```
+*   [Image Classification](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android)
+*   [Object Detection](https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android)
+*   [Speech Commands](https://github.com/tensorflow/examples/tree/master/lite/examples/speech_commands/android)
diff --git a/tensorflow/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
deleted file mode 100644
index d2bc984..0000000
--- a/tensorflow/lite/examples/android/app/build.gradle
+++ /dev/null
@@ -1,50 +0,0 @@
-apply plugin: 'com.android.application'
-
-// import DownloadModels task
-project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
-project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
-
-// Download default models; if you wish to use your own models then
-// place them in the "assets" directory and comment out this line.
-apply from: "download-models.gradle"
-
-android {
-    compileSdkVersion 26
-    buildToolsVersion '28.0.3'
-    defaultConfig {
-        applicationId "org.tensorflow.lite.demo"
-        minSdkVersion 15
-        targetSdkVersion 26
-        versionCode 1
-        versionName "1.0"
-
-    }
-    lintOptions {
-        abortOnError false
-    }
-    buildTypes {
-        release {
-            minifyEnabled false
-            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
-        }
-    }
-    aaptOptions {
-        noCompress "tflite"
-    }
-
-    compileOptions {
-        sourceCompatibility JavaVersion.VERSION_1_8
-        targetCompatibility JavaVersion.VERSION_1_8
-    }
-}
-
-repositories {
-    maven {
-        url 'https://google.bintray.com/tensorflow'
-    }
-}
-
-dependencies {
-    implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
-}
diff --git a/tensorflow/lite/examples/android/app/download-models.gradle b/tensorflow/lite/examples/android/app/download-models.gradle
deleted file mode 100644
index 514eeb0..0000000
--- a/tensorflow/lite/examples/android/app/download-models.gradle
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * download-models.gradle
- *     Downloads model files from ${MODEL_URL} into application's asset folder
- * Input:
- *     project.ext.TMP_DIR: absolute path to hold downloaded zip files
- *     project.ext.ASSET_DIR: absolute path to save unzipped model files
- * Output:
- *     3 model files will be downloaded into given folder of ext.ASSET_DIR
- */
-// hard coded model files
-
-def models = ['https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip',
-              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip',
-              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip',
-              'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz',
-              'http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz']
-
-// Root URL for model archives
-def MODEL_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/tflite'
-
-buildscript {
-    repositories {
-        jcenter()
-    }
-    dependencies {
-        classpath 'de.undercouch:gradle-download-task:3.2.0'
-    }
-}
-
-import de.undercouch.gradle.tasks.download.Download
-task downloadFile(type: Download){
-    for (modelUrl in models) {
-        def localFile = modelUrl.split("/")[-1]
-        println "Downloading ${localFile} from ${modelUrl}"
-        src modelUrl
-    }
-
-    dest new File(project.ext.TMP_DIR)
-    overwrite true
-}
-
-task extractModels(type: Copy) {
-    for (f in models) {
-        def localFile = f.split("/")[-1]
-        def localExt = localFile.split("[.]")[-1]
-        if (localExt == "tgz") {
-            from tarTree(project.ext.TMP_DIR + '/' + localFile)
-        } else {
-            from zipTree(project.ext.TMP_DIR + '/' + localFile)
-        }
-    }
-
-    into file(project.ext.ASSET_DIR)
-    fileMode  0644
-    exclude '**/LICENSE'
-
-    def needDownload = false
-    for (f in models) {
-        def localFile = f.split("/")[-1]
-        if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) {
-            needDownload = true
-        }
-    }
-
-    if (needDownload) {
-        dependsOn downloadFile
-    }
-}
-
-tasks.whenTaskAdded { task ->
-    if (task.name == 'assembleDebug') {
-        task.dependsOn 'extractModels'
-    }
-    if (task.name == 'assembleRelease') {
-        task.dependsOn 'extractModels'
-    }
-}
-
diff --git a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
deleted file mode 100644
index d4c98c6..0000000
--- a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
+++ /dev/null
@@ -1,60 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="org.tensorflow.lite.demo">
-
-    <uses-permission android:name="android.permission.CAMERA" />
-    <uses-feature android:name="android.hardware.camera" />
-    <uses-feature android:name="android.hardware.camera.autofocus" />
-    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
-    <uses-permission android:name="android.permission.RECORD_AUDIO" />
-
-    <application android:allowBackup="true"
-        android:label="@string/app_name"
-        android:icon="@drawable/ic_launcher"
-        android:theme="@style/MaterialTheme">
-
-        <activity android:name="org.tensorflow.demo.ClassifierActivity"
-                  android:screenOrientation="portrait"
-                  android:label="@string/activity_name_classification">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-
-        <activity android:name="org.tensorflow.demo.DetectorActivity"
-                  android:screenOrientation="portrait"
-                  android:label="@string/activity_name_detection">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-
-        <activity android:name="org.tensorflow.demo.SpeechActivity"
-            android:screenOrientation="portrait"
-            android:label="@string/activity_name_speech">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-
-</manifest>
diff --git a/tensorflow/lite/examples/android/app/src/main/assets/conv_actions_labels.txt b/tensorflow/lite/examples/android/app/src/main/assets/conv_actions_labels.txt
deleted file mode 100644
index ba41645..0000000
--- a/tensorflow/lite/examples/android/app/src/main/assets/conv_actions_labels.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-_silence_
-_unknown_
-yes
-no
-up
-down
-left
-right
-on
-off
-stop
-go
\ No newline at end of file
diff --git a/tensorflow/lite/examples/android/app/src/main/assets/labelmap.txt b/tensorflow/lite/examples/android/app/src/main/assets/labelmap.txt
deleted file mode 100644
index 5a70ff8..0000000
--- a/tensorflow/lite/examples/android/app/src/main/assets/labelmap.txt
+++ /dev/null
@@ -1,91 +0,0 @@
-???
-person
-bicycle
-car
-motorcycle
-airplane
-bus
-train
-truck
-boat
-traffic light
-fire hydrant
-???
-stop sign
-parking meter
-bench
-bird
-cat
-dog
-horse
-sheep
-cow
-elephant
-bear
-zebra
-giraffe
-???
-backpack
-umbrella
-???
-???
-handbag
-tie
-suitcase
-frisbee
-skis
-snowboard
-sports ball
-kite
-baseball bat
-baseball glove
-skateboard
-surfboard
-tennis racket
-bottle
-???
-wine glass
-cup
-fork
-knife
-spoon
-bowl
-banana
-apple
-sandwich
-orange
-broccoli
-carrot
-hot dog
-pizza
-donut
-cake
-chair
-couch
-potted plant
-bed
-???
-dining table
-???
-???
-toilet
-???
-tv
-laptop
-mouse
-remote
-keyboard
-cell phone
-microwave
-oven
-toaster
-sink
-refrigerator
-???
-book
-clock
-vase
-scissors
-teddy bear
-hair drier
-toothbrush
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java
deleted file mode 100644
index eff24af..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.content.Context;
-import android.util.AttributeSet;
-import android.view.TextureView;
-
-/**
- * A {@link TextureView} that can be adjusted to a specified aspect ratio.
- */
-public class AutoFitTextureView extends TextureView {
-  private int ratioWidth = 0;
-  private int ratioHeight = 0;
-
-  public AutoFitTextureView(final Context context) {
-    this(context, null);
-  }
-
-  public AutoFitTextureView(final Context context, final AttributeSet attrs) {
-    this(context, attrs, 0);
-  }
-
-  public AutoFitTextureView(final Context context, final AttributeSet attrs, final int defStyle) {
-    super(context, attrs, defStyle);
-  }
-
-  /**
-   * Sets the aspect ratio for this view. The size of the view will be measured based on the ratio
-   * calculated from the parameters. Note that the actual sizes of parameters don't matter, that
-   * is, calling setAspectRatio(2, 3) and setAspectRatio(4, 6) make the same result.
-   *
-   * @param width  Relative horizontal size
-   * @param height Relative vertical size
-   */
-  public void setAspectRatio(final int width, final int height) {
-    if (width < 0 || height < 0) {
-      throw new IllegalArgumentException("Size cannot be negative.");
-    }
-    ratioWidth = width;
-    ratioHeight = height;
-    requestLayout();
-  }
-
-  @Override
-  protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) {
-    super.onMeasure(widthMeasureSpec, heightMeasureSpec);
-    final int width = MeasureSpec.getSize(widthMeasureSpec);
-    final int height = MeasureSpec.getSize(heightMeasureSpec);
-    if (0 == ratioWidth || 0 == ratioHeight) {
-      setMeasuredDimension(width, height);
-    } else {
-      if (width < height * ratioWidth / ratioHeight) {
-        setMeasuredDimension(width, width * ratioHeight / ratioWidth);
-      } else {
-        setMeasuredDimension(height * ratioWidth / ratioHeight, height);
-      }
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java
deleted file mode 100644
index 15d5456..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.Manifest;
-import android.app.Activity;
-import android.app.Fragment;
-import android.content.Context;
-import android.content.pm.PackageManager;
-import android.hardware.Camera;
-import android.hardware.camera2.CameraAccessException;
-import android.hardware.camera2.CameraCharacteristics;
-import android.hardware.camera2.CameraManager;
-import android.hardware.camera2.params.StreamConfigurationMap;
-import android.media.Image;
-import android.media.Image.Plane;
-import android.media.ImageReader;
-import android.media.ImageReader.OnImageAvailableListener;
-import android.os.Build;
-import android.os.Bundle;
-import android.os.Handler;
-import android.os.HandlerThread;
-import android.os.Trace;
-import android.util.Size;
-import android.view.KeyEvent;
-import android.view.Surface;
-import android.view.WindowManager;
-import android.widget.Toast;
-import java.nio.ByteBuffer;
-import org.tensorflow.demo.env.ImageUtils;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-public abstract class CameraActivity extends Activity
-    implements OnImageAvailableListener, Camera.PreviewCallback {
-  private static final Logger LOGGER = new Logger();
-
-  private static final int PERMISSIONS_REQUEST = 1;
-
-  private static final String PERMISSION_CAMERA = Manifest.permission.CAMERA;
-  private static final String PERMISSION_STORAGE = Manifest.permission.WRITE_EXTERNAL_STORAGE;
-
-  private boolean debug = false;
-
-  private Handler handler;
-  private HandlerThread handlerThread;
-  private boolean useCamera2API;
-  private boolean isProcessingFrame = false;
-  private byte[][] yuvBytes = new byte[3][];
-  private int[] rgbBytes = null;
-  private int yRowStride;
-
-  protected int previewWidth = 0;
-  protected int previewHeight = 0;
-
-  private Runnable postInferenceCallback;
-  private Runnable imageConverter;
-
-  @Override
-  protected void onCreate(final Bundle savedInstanceState) {
-    LOGGER.d("onCreate " + this);
-    super.onCreate(null);
-    getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
-
-    setContentView(R.layout.activity_camera);
-
-    if (hasPermission()) {
-      setFragment();
-    } else {
-      requestPermission();
-    }
-  }
-
-
-  protected int[] getRgbBytes() {
-    imageConverter.run();
-    return rgbBytes;
-  }
-
-  protected int getLuminanceStride() {
-    return yRowStride;
-  }
-
-  protected byte[] getLuminance() {
-    return yuvBytes[0];
-  }
-
-  /**
-   * Callback for android.hardware.Camera API
-   */
-  @Override
-  public void onPreviewFrame(final byte[] bytes, final Camera camera) {
-    if (isProcessingFrame) {
-      LOGGER.w("Dropping frame!");
-      return;
-    }
-
-    try {
-      // Initialize the storage bitmaps once when the resolution is known.
-      if (rgbBytes == null) {
-        Camera.Size previewSize = camera.getParameters().getPreviewSize();
-        previewHeight = previewSize.height;
-        previewWidth = previewSize.width;
-        rgbBytes = new int[previewWidth * previewHeight];
-        onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
-      }
-    } catch (final Exception e) {
-      LOGGER.e(e, "Exception!");
-      return;
-    }
-
-    isProcessingFrame = true;
-    yuvBytes[0] = bytes;
-    yRowStride = previewWidth;
-
-    imageConverter =
-        new Runnable() {
-          @Override
-          public void run() {
-            ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes);
-          }
-        };
-
-    postInferenceCallback =
-        new Runnable() {
-          @Override
-          public void run() {
-            camera.addCallbackBuffer(bytes);
-            isProcessingFrame = false;
-          }
-        };
-    processImage();
-  }
-
-  /**
-   * Callback for Camera2 API
-   */
-  @Override
-  public void onImageAvailable(final ImageReader reader) {
-    //We need wait until we have some size from onPreviewSizeChosen
-    if (previewWidth == 0 || previewHeight == 0) {
-      return;
-    }
-    if (rgbBytes == null) {
-      rgbBytes = new int[previewWidth * previewHeight];
-    }
-    try {
-      final Image image = reader.acquireLatestImage();
-
-      if (image == null) {
-        return;
-      }
-
-      if (isProcessingFrame) {
-        image.close();
-        return;
-      }
-      isProcessingFrame = true;
-      Trace.beginSection("imageAvailable");
-      final Plane[] planes = image.getPlanes();
-      fillBytes(planes, yuvBytes);
-      yRowStride = planes[0].getRowStride();
-      final int uvRowStride = planes[1].getRowStride();
-      final int uvPixelStride = planes[1].getPixelStride();
-
-      imageConverter =
-          new Runnable() {
-            @Override
-            public void run() {
-              ImageUtils.convertYUV420ToARGB8888(
-                  yuvBytes[0],
-                  yuvBytes[1],
-                  yuvBytes[2],
-                  previewWidth,
-                  previewHeight,
-                  yRowStride,
-                  uvRowStride,
-                  uvPixelStride,
-                  rgbBytes);
-            }
-          };
-
-      postInferenceCallback =
-          new Runnable() {
-            @Override
-            public void run() {
-              image.close();
-              isProcessingFrame = false;
-            }
-          };
-
-      processImage();
-    } catch (final Exception e) {
-      LOGGER.e(e, "Exception!");
-      Trace.endSection();
-      return;
-    }
-    Trace.endSection();
-  }
-
-  @Override
-  public synchronized void onStart() {
-    LOGGER.d("onStart " + this);
-    super.onStart();
-  }
-
-  @Override
-  public synchronized void onResume() {
-    LOGGER.d("onResume " + this);
-    super.onResume();
-
-    handlerThread = new HandlerThread("inference");
-    handlerThread.start();
-    handler = new Handler(handlerThread.getLooper());
-  }
-
-  @Override
-  public synchronized void onPause() {
-    LOGGER.d("onPause " + this);
-
-    if (!isFinishing()) {
-      LOGGER.d("Requesting finish");
-      finish();
-    }
-
-    handlerThread.quitSafely();
-    try {
-      handlerThread.join();
-      handlerThread = null;
-      handler = null;
-    } catch (final InterruptedException e) {
-      LOGGER.e(e, "Exception!");
-    }
-
-    super.onPause();
-  }
-
-  @Override
-  public synchronized void onStop() {
-    LOGGER.d("onStop " + this);
-    super.onStop();
-  }
-
-  @Override
-  public synchronized void onDestroy() {
-    LOGGER.d("onDestroy " + this);
-    super.onDestroy();
-  }
-
-  protected synchronized void runInBackground(final Runnable r) {
-    if (handler != null) {
-      handler.post(r);
-    }
-  }
-
-  @Override
-  public void onRequestPermissionsResult(
-      final int requestCode, final String[] permissions, final int[] grantResults) {
-    if (requestCode == PERMISSIONS_REQUEST) {
-      if (grantResults.length > 0
-          && grantResults[0] == PackageManager.PERMISSION_GRANTED
-          && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
-        setFragment();
-      } else {
-        requestPermission();
-      }
-    }
-  }
-
-  private boolean hasPermission() {
-    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
-      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED &&
-          checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
-    } else {
-      return true;
-    }
-  }
-
-  private void requestPermission() {
-    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
-      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) ||
-          shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
-        Toast.makeText(CameraActivity.this,
-            "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
-      }
-      requestPermissions(new String[] {PERMISSION_CAMERA, PERMISSION_STORAGE}, PERMISSIONS_REQUEST);
-    }
-  }
-
-  // Returns true if the device supports the required hardware level, or better.
-  private boolean isHardwareLevelSupported(
-      CameraCharacteristics characteristics, int requiredLevel) {
-    int deviceLevel = characteristics.get(CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL);
-    if (deviceLevel == CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_LEGACY) {
-      return requiredLevel == deviceLevel;
-    }
-    // deviceLevel is not LEGACY, can use numerical sort
-    return requiredLevel <= deviceLevel;
-  }
-
-  private String chooseCamera() {
-    final CameraManager manager = (CameraManager) getSystemService(Context.CAMERA_SERVICE);
-    try {
-      for (final String cameraId : manager.getCameraIdList()) {
-        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
-
-        // We don't use a front facing camera in this sample.
-        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
-        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
-          continue;
-        }
-
-        final StreamConfigurationMap map =
-            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
-
-        if (map == null) {
-          continue;
-        }
-
-        // Fallback to camera1 API for internal cameras that don't have full support.
-        // This should help with legacy situations where using the camera2 API causes
-        // distorted or otherwise broken previews.
-        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
-            || isHardwareLevelSupported(characteristics, 
-                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
-        LOGGER.i("Camera API lv2?: %s", useCamera2API);
-        return cameraId;
-      }
-    } catch (CameraAccessException e) {
-      LOGGER.e(e, "Not allowed to access camera");
-    }
-
-    return null;
-  }
-
-  protected void setFragment() {
-    String cameraId = chooseCamera();
-
-    Fragment fragment;
-    if (useCamera2API) {
-      CameraConnectionFragment camera2Fragment =
-          CameraConnectionFragment.newInstance(
-              new CameraConnectionFragment.ConnectionCallback() {
-                @Override
-                public void onPreviewSizeChosen(final Size size, final int rotation) {
-                  previewHeight = size.getHeight();
-                  previewWidth = size.getWidth();
-                  CameraActivity.this.onPreviewSizeChosen(size, rotation);
-                }
-              },
-              this,
-              getLayoutId(),
-              getDesiredPreviewFrameSize());
-
-      camera2Fragment.setCamera(cameraId);
-      fragment = camera2Fragment;
-    } else {
-      fragment =
-          new LegacyCameraConnectionFragment(this, getLayoutId(), getDesiredPreviewFrameSize());
-    }
-
-    getFragmentManager()
-        .beginTransaction()
-        .replace(R.id.container, fragment)
-        .commit();
-  }
-
-  protected void fillBytes(final Plane[] planes, final byte[][] yuvBytes) {
-    // Because of the variable row stride it's not possible to know in
-    // advance the actual necessary dimensions of the yuv planes.
-    for (int i = 0; i < planes.length; ++i) {
-      final ByteBuffer buffer = planes[i].getBuffer();
-      if (yuvBytes[i] == null) {
-        LOGGER.d("Initializing buffer %d at size %d", i, buffer.capacity());
-        yuvBytes[i] = new byte[buffer.capacity()];
-      }
-      buffer.get(yuvBytes[i]);
-    }
-  }
-
-  public boolean isDebug() {
-    return debug;
-  }
-
-  public void requestRender() {
-    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
-    if (overlay != null) {
-      overlay.postInvalidate();
-    }
-  }
-
-  public void addCallback(final OverlayView.DrawCallback callback) {
-    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
-    if (overlay != null) {
-      overlay.addCallback(callback);
-    }
-  }
-
-  public void onSetDebug(final boolean debug) {}
-
-  @Override
-  public boolean onKeyDown(final int keyCode, final KeyEvent event) {
-    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP) {
-      debug = !debug;
-      requestRender();
-      onSetDebug(debug);
-      return true;
-    }
-    return super.onKeyDown(keyCode, event);
-  }
-
-  protected void readyForNextImage() {
-    if (postInferenceCallback != null) {
-      postInferenceCallback.run();
-    }
-  }
-
-  protected int getScreenOrientation() {
-    switch (getWindowManager().getDefaultDisplay().getRotation()) {
-      case Surface.ROTATION_270:
-        return 270;
-      case Surface.ROTATION_180:
-        return 180;
-      case Surface.ROTATION_90:
-        return 90;
-      default:
-        return 0;
-    }
-  }
-
-  protected abstract void processImage();
-
-  protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
-  protected abstract int getLayoutId();
-  protected abstract Size getDesiredPreviewFrameSize();
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java
deleted file mode 100644
index 51a1adb..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java
+++ /dev/null
@@ -1,634 +0,0 @@
-/*
- * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.app.Activity;
-import android.app.AlertDialog;
-import android.app.Dialog;
-import android.app.DialogFragment;
-import android.app.Fragment;
-import android.content.Context;
-import android.content.DialogInterface;
-import android.content.res.Configuration;
-import android.graphics.ImageFormat;
-import android.graphics.Matrix;
-import android.graphics.RectF;
-import android.graphics.SurfaceTexture;
-import android.hardware.camera2.CameraAccessException;
-import android.hardware.camera2.CameraCaptureSession;
-import android.hardware.camera2.CameraCharacteristics;
-import android.hardware.camera2.CameraDevice;
-import android.hardware.camera2.CameraManager;
-import android.hardware.camera2.CaptureRequest;
-import android.hardware.camera2.CaptureResult;
-import android.hardware.camera2.TotalCaptureResult;
-import android.hardware.camera2.params.StreamConfigurationMap;
-import android.media.ImageReader;
-import android.media.ImageReader.OnImageAvailableListener;
-import android.os.Bundle;
-import android.os.Handler;
-import android.os.HandlerThread;
-import android.text.TextUtils;
-import android.util.Size;
-import android.util.SparseIntArray;
-import android.view.LayoutInflater;
-import android.view.Surface;
-import android.view.TextureView;
-import android.view.View;
-import android.view.ViewGroup;
-import android.widget.Toast;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.TimeUnit;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-public class CameraConnectionFragment extends Fragment {
-  private static final Logger LOGGER = new Logger();
-
-  /**
-   * The camera preview size will be chosen to be the smallest frame by pixel size capable of
-   * containing a DESIRED_SIZE x DESIRED_SIZE square.
-   */
-  private static final int MINIMUM_PREVIEW_SIZE = 320;
-
-  /**
-   * Conversion from screen rotation to JPEG orientation.
-   */
-  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
-  private static final String FRAGMENT_DIALOG = "dialog";
-
-  static {
-    ORIENTATIONS.append(Surface.ROTATION_0, 90);
-    ORIENTATIONS.append(Surface.ROTATION_90, 0);
-    ORIENTATIONS.append(Surface.ROTATION_180, 270);
-    ORIENTATIONS.append(Surface.ROTATION_270, 180);
-  }
-
-  /**
-   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
-   * {@link TextureView}.
-   */
-  private final TextureView.SurfaceTextureListener surfaceTextureListener =
-      new TextureView.SurfaceTextureListener() {
-        @Override
-        public void onSurfaceTextureAvailable(
-            final SurfaceTexture texture, final int width, final int height) {
-          openCamera(width, height);
-        }
-
-        @Override
-        public void onSurfaceTextureSizeChanged(
-            final SurfaceTexture texture, final int width, final int height) {
-          configureTransform(width, height);
-        }
-
-        @Override
-        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
-          return true;
-        }
-
-        @Override
-        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
-      };
-
-  /**
-   * Callback for Activities to use to initialize their data once the
-   * selected preview size is known.
-   */
-  public interface ConnectionCallback {
-    void onPreviewSizeChosen(Size size, int cameraRotation);
-  }
-
-  /**
-   * ID of the current {@link CameraDevice}.
-   */
-  private String cameraId;
-
-  /**
-   * An {@link AutoFitTextureView} for camera preview.
-   */
-  private AutoFitTextureView textureView;
-
-  /**
-   * A {@link CameraCaptureSession } for camera preview.
-   */
-  private CameraCaptureSession captureSession;
-
-  /**
-   * A reference to the opened {@link CameraDevice}.
-   */
-  private CameraDevice cameraDevice;
-
-  /**
-   * The rotation in degrees of the camera sensor from the display.
-   */
-  private Integer sensorOrientation;
-
-  /**
-   * The {@link android.util.Size} of camera preview.
-   */
-  private Size previewSize;
-
-  /**
-   * {@link android.hardware.camera2.CameraDevice.StateCallback}
-   * is called when {@link CameraDevice} changes its state.
-   */
-  private final CameraDevice.StateCallback stateCallback =
-      new CameraDevice.StateCallback() {
-        @Override
-        public void onOpened(final CameraDevice cd) {
-          // This method is called when the camera is opened.  We start camera preview here.
-          cameraOpenCloseLock.release();
-          cameraDevice = cd;
-          createCameraPreviewSession();
-        }
-
-        @Override
-        public void onDisconnected(final CameraDevice cd) {
-          cameraOpenCloseLock.release();
-          cd.close();
-          cameraDevice = null;
-        }
-
-        @Override
-        public void onError(final CameraDevice cd, final int error) {
-          cameraOpenCloseLock.release();
-          cd.close();
-          cameraDevice = null;
-          final Activity activity = getActivity();
-          if (null != activity) {
-            activity.finish();
-          }
-        }
-      };
-
-  /**
-   * An additional thread for running tasks that shouldn't block the UI.
-   */
-  private HandlerThread backgroundThread;
-
-  /**
-   * A {@link Handler} for running tasks in the background.
-   */
-  private Handler backgroundHandler;
-
-  /**
-   * An {@link ImageReader} that handles preview frame capture.
-   */
-  private ImageReader previewReader;
-
-  /**
-   * {@link android.hardware.camera2.CaptureRequest.Builder} for the camera preview
-   */
-  private CaptureRequest.Builder previewRequestBuilder;
-
-  /**
-   * {@link CaptureRequest} generated by {@link #previewRequestBuilder}
-   */
-  private CaptureRequest previewRequest;
-
-  /**
-   * A {@link Semaphore} to prevent the app from exiting before closing the camera.
-   */
-  private final Semaphore cameraOpenCloseLock = new Semaphore(1);
-
-  /**
-   * A {@link OnImageAvailableListener} to receive frames as they are available.
-   */
-  private final OnImageAvailableListener imageListener;
-
-  /** The input size in pixels desired by TensorFlow (width and height of a square bitmap). */
-  private final Size inputSize;
-
-  /**
-   * The layout identifier to inflate for this Fragment.
-   */
-  private final int layout;
-
-
-  private final ConnectionCallback cameraConnectionCallback;
-
-  private CameraConnectionFragment(
-      final ConnectionCallback connectionCallback,
-      final OnImageAvailableListener imageListener,
-      final int layout,
-      final Size inputSize) {
-    this.cameraConnectionCallback = connectionCallback;
-    this.imageListener = imageListener;
-    this.layout = layout;
-    this.inputSize = inputSize;
-  }
-
-  /**
-   * Shows a {@link Toast} on the UI thread.
-   *
-   * @param text The message to show
-   */
-  private void showToast(final String text) {
-    final Activity activity = getActivity();
-    if (activity != null) {
-      activity.runOnUiThread(
-          new Runnable() {
-            @Override
-            public void run() {
-              Toast.makeText(activity, text, Toast.LENGTH_SHORT).show();
-            }
-          });
-    }
-  }
-
-  /**
-   * Given {@code choices} of {@code Size}s supported by a camera, chooses the smallest one whose
-   * width and height are at least as large as the minimum of both, or an exact match if possible.
-   *
-   * @param choices The list of sizes that the camera supports for the intended output class
-   * @param width The minimum desired width
-   * @param height The minimum desired height
-   * @return The optimal {@code Size}, or an arbitrary one if none were big enough
-   */
-  protected static Size chooseOptimalSize(final Size[] choices, final int width, final int height) {
-    final int minSize = Math.max(Math.min(width, height), MINIMUM_PREVIEW_SIZE);
-    final Size desiredSize = new Size(width, height);
-
-    // Collect the supported resolutions that are at least as big as the preview Surface
-    boolean exactSizeFound = false;
-    final List<Size> bigEnough = new ArrayList<Size>();
-    final List<Size> tooSmall = new ArrayList<Size>();
-    for (final Size option : choices) {
-      if (option.equals(desiredSize)) {
-        // Set the size but don't return yet so that remaining sizes will still be logged.
-        exactSizeFound = true;
-      }
-
-      if (option.getHeight() >= minSize && option.getWidth() >= minSize) {
-        bigEnough.add(option);
-      } else {
-        tooSmall.add(option);
-      }
-    }
-
-    LOGGER.i("Desired size: " + desiredSize + ", min size: " + minSize + "x" + minSize);
-    LOGGER.i("Valid preview sizes: [" + TextUtils.join(", ", bigEnough) + "]");
-    LOGGER.i("Rejected preview sizes: [" + TextUtils.join(", ", tooSmall) + "]");
-
-    if (exactSizeFound) {
-      LOGGER.i("Exact size match found.");
-      return desiredSize;
-    }
-
-    // Pick the smallest of those, assuming we found any
-    if (bigEnough.size() > 0) {
-      final Size chosenSize = Collections.min(bigEnough, new CompareSizesByArea());
-      LOGGER.i("Chosen size: " + chosenSize.getWidth() + "x" + chosenSize.getHeight());
-      return chosenSize;
-    } else {
-      LOGGER.e("Couldn't find any suitable preview size");
-      return choices[0];
-    }
-  }
-
-  public static CameraConnectionFragment newInstance(
-      final ConnectionCallback callback,
-      final OnImageAvailableListener imageListener,
-      final int layout,
-      final Size inputSize) {
-    return new CameraConnectionFragment(callback, imageListener, layout, inputSize);
-  }
-
-  @Override
-  public View onCreateView(
-      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
-    return inflater.inflate(layout, container, false);
-  }
-
-  @Override
-  public void onViewCreated(final View view, final Bundle savedInstanceState) {
-    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
-  }
-
-  @Override
-  public void onActivityCreated(final Bundle savedInstanceState) {
-    super.onActivityCreated(savedInstanceState);
-  }
-
-  @Override
-  public void onResume() {
-    super.onResume();
-    startBackgroundThread();
-
-    // When the screen is turned off and turned back on, the SurfaceTexture is already
-    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
-    // a camera and start preview from here (otherwise, we wait until the surface is ready in
-    // the SurfaceTextureListener).
-    if (textureView.isAvailable()) {
-      openCamera(textureView.getWidth(), textureView.getHeight());
-    } else {
-      textureView.setSurfaceTextureListener(surfaceTextureListener);
-    }
-  }
-
-  @Override
-  public void onPause() {
-    closeCamera();
-    stopBackgroundThread();
-    super.onPause();
-  }
-
-  public void setCamera(String cameraId) {
-    this.cameraId = cameraId;
-  }
-
-  /**
-   * Sets up member variables related to camera.
-   */
-  private void setUpCameraOutputs() {
-    final Activity activity = getActivity();
-    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
-    try {
-      final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
-
-      final StreamConfigurationMap map =
-          characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
-
-      sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
-
-      // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
-      // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
-      // garbage capture data.
-      previewSize =
-          chooseOptimalSize(map.getOutputSizes(SurfaceTexture.class),
-              inputSize.getWidth(),
-              inputSize.getHeight());
-
-      // We fit the aspect ratio of TextureView to the size of preview we picked.
-      final int orientation = getResources().getConfiguration().orientation;
-      if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
-        textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
-      } else {
-        textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
-      }
-    } catch (final CameraAccessException e) {
-      LOGGER.e(e, "Exception!");
-    } catch (final NullPointerException e) {
-      // Currently an NPE is thrown when the Camera2API is used but not supported on the
-      // device this code runs.
-      // TODO(andrewharp): abstract ErrorDialog/RuntimeException handling out into new method and
-      // reuse throughout app.
-      ErrorDialog.newInstance(getString(R.string.camera_error))
-          .show(getChildFragmentManager(), FRAGMENT_DIALOG);
-      throw new RuntimeException(getString(R.string.camera_error));
-    }
-
-    cameraConnectionCallback.onPreviewSizeChosen(previewSize, sensorOrientation);
-  }
-
-  /**
-   * Opens the camera specified by {@link CameraConnectionFragment#cameraId}.
-   */
-  private void openCamera(final int width, final int height) {
-    setUpCameraOutputs();
-    configureTransform(width, height);
-    final Activity activity = getActivity();
-    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
-    try {
-      if (!cameraOpenCloseLock.tryAcquire(2500, TimeUnit.MILLISECONDS)) {
-        throw new RuntimeException("Time out waiting to lock camera opening.");
-      }
-      manager.openCamera(cameraId, stateCallback, backgroundHandler);
-    } catch (final CameraAccessException e) {
-      LOGGER.e(e, "Exception!");
-    } catch (final InterruptedException e) {
-      throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
-    }
-  }
-
-  /**
-   * Closes the current {@link CameraDevice}.
-   */
-  private void closeCamera() {
-    try {
-      cameraOpenCloseLock.acquire();
-      if (null != captureSession) {
-        captureSession.close();
-        captureSession = null;
-      }
-      if (null != cameraDevice) {
-        cameraDevice.close();
-        cameraDevice = null;
-      }
-      if (null != previewReader) {
-        previewReader.close();
-        previewReader = null;
-      }
-    } catch (final InterruptedException e) {
-      throw new RuntimeException("Interrupted while trying to lock camera closing.", e);
-    } finally {
-      cameraOpenCloseLock.release();
-    }
-  }
-
-  /**
-   * Starts a background thread and its {@link Handler}.
-   */
-  private void startBackgroundThread() {
-    backgroundThread = new HandlerThread("ImageListener");
-    backgroundThread.start();
-    backgroundHandler = new Handler(backgroundThread.getLooper());
-  }
-
-  /**
-   * Stops the background thread and its {@link Handler}.
-   */
-  private void stopBackgroundThread() {
-    backgroundThread.quitSafely();
-    try {
-      backgroundThread.join();
-      backgroundThread = null;
-      backgroundHandler = null;
-    } catch (final InterruptedException e) {
-      LOGGER.e(e, "Exception!");
-    }
-  }
-
-  private final CameraCaptureSession.CaptureCallback captureCallback =
-      new CameraCaptureSession.CaptureCallback() {
-        @Override
-        public void onCaptureProgressed(
-            final CameraCaptureSession session,
-            final CaptureRequest request,
-            final CaptureResult partialResult) {}
-
-        @Override
-        public void onCaptureCompleted(
-            final CameraCaptureSession session,
-            final CaptureRequest request,
-            final TotalCaptureResult result) {}
-      };
-
-  /**
-   * Creates a new {@link CameraCaptureSession} for camera preview.
-   */
-  private void createCameraPreviewSession() {
-    try {
-      final SurfaceTexture texture = textureView.getSurfaceTexture();
-      assert texture != null;
-
-      // We configure the size of default buffer to be the size of camera preview we want.
-      texture.setDefaultBufferSize(previewSize.getWidth(), previewSize.getHeight());
-
-      // This is the output Surface we need to start preview.
-      final Surface surface = new Surface(texture);
-
-      // We set up a CaptureRequest.Builder with the output Surface.
-      previewRequestBuilder = cameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
-      previewRequestBuilder.addTarget(surface);
-
-      LOGGER.i("Opening camera preview: " + previewSize.getWidth() + "x" + previewSize.getHeight());
-
-      // Create the reader for the preview frames.
-      previewReader =
-          ImageReader.newInstance(
-              previewSize.getWidth(), previewSize.getHeight(), ImageFormat.YUV_420_888, 2);
-
-      previewReader.setOnImageAvailableListener(imageListener, backgroundHandler);
-      previewRequestBuilder.addTarget(previewReader.getSurface());
-
-      // Here, we create a CameraCaptureSession for camera preview.
-      cameraDevice.createCaptureSession(
-          Arrays.asList(surface, previewReader.getSurface()),
-          new CameraCaptureSession.StateCallback() {
-
-            @Override
-            public void onConfigured(final CameraCaptureSession cameraCaptureSession) {
-              // The camera is already closed
-              if (null == cameraDevice) {
-                return;
-              }
-
-              // When the session is ready, we start displaying the preview.
-              captureSession = cameraCaptureSession;
-              try {
-                // Auto focus should be continuous for camera preview.
-                previewRequestBuilder.set(
-                    CaptureRequest.CONTROL_AF_MODE,
-                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
-                // Flash is automatically enabled when necessary.
-                previewRequestBuilder.set(
-                    CaptureRequest.CONTROL_AE_MODE, CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
-
-                // Finally, we start displaying the camera preview.
-                previewRequest = previewRequestBuilder.build();
-                captureSession.setRepeatingRequest(
-                    previewRequest, captureCallback, backgroundHandler);
-              } catch (final CameraAccessException e) {
-                LOGGER.e(e, "Exception!");
-              }
-            }
-
-            @Override
-            public void onConfigureFailed(final CameraCaptureSession cameraCaptureSession) {
-              showToast("Failed");
-            }
-          },
-          null);
-    } catch (final CameraAccessException e) {
-      LOGGER.e(e, "Exception!");
-    }
-  }
-
-  /**
-   * Configures the necessary {@link android.graphics.Matrix} transformation to `mTextureView`.
-   * This method should be called after the camera preview size is determined in
-   * setUpCameraOutputs and also the size of `mTextureView` is fixed.
-   *
-   * @param viewWidth  The width of `mTextureView`
-   * @param viewHeight The height of `mTextureView`
-   */
-  private void configureTransform(final int viewWidth, final int viewHeight) {
-    final Activity activity = getActivity();
-    if (null == textureView || null == previewSize || null == activity) {
-      return;
-    }
-    final int rotation = activity.getWindowManager().getDefaultDisplay().getRotation();
-    final Matrix matrix = new Matrix();
-    final RectF viewRect = new RectF(0, 0, viewWidth, viewHeight);
-    final RectF bufferRect = new RectF(0, 0, previewSize.getHeight(), previewSize.getWidth());
-    final float centerX = viewRect.centerX();
-    final float centerY = viewRect.centerY();
-    if (Surface.ROTATION_90 == rotation || Surface.ROTATION_270 == rotation) {
-      bufferRect.offset(centerX - bufferRect.centerX(), centerY - bufferRect.centerY());
-      matrix.setRectToRect(viewRect, bufferRect, Matrix.ScaleToFit.FILL);
-      final float scale =
-          Math.max(
-              (float) viewHeight / previewSize.getHeight(),
-              (float) viewWidth / previewSize.getWidth());
-      matrix.postScale(scale, scale, centerX, centerY);
-      matrix.postRotate(90 * (rotation - 2), centerX, centerY);
-    } else if (Surface.ROTATION_180 == rotation) {
-      matrix.postRotate(180, centerX, centerY);
-    }
-    textureView.setTransform(matrix);
-  }
-
-  /**
-   * Compares two {@code Size}s based on their areas.
-   */
-  static class CompareSizesByArea implements Comparator<Size> {
-    @Override
-    public int compare(final Size lhs, final Size rhs) {
-      // We cast here to ensure the multiplications won't overflow
-      return Long.signum(
-          (long) lhs.getWidth() * lhs.getHeight() - (long) rhs.getWidth() * rhs.getHeight());
-    }
-  }
-
-  /**
-   * Shows an error message dialog.
-   */
-  public static class ErrorDialog extends DialogFragment {
-    private static final String ARG_MESSAGE = "message";
-
-    public static ErrorDialog newInstance(final String message) {
-      final ErrorDialog dialog = new ErrorDialog();
-      final Bundle args = new Bundle();
-      args.putString(ARG_MESSAGE, message);
-      dialog.setArguments(args);
-      return dialog;
-    }
-
-    @Override
-    public Dialog onCreateDialog(final Bundle savedInstanceState) {
-      final Activity activity = getActivity();
-      return new AlertDialog.Builder(activity)
-          .setMessage(getArguments().getString(ARG_MESSAGE))
-          .setPositiveButton(
-              android.R.string.ok,
-              new DialogInterface.OnClickListener() {
-                @Override
-                public void onClick(final DialogInterface dialogInterface, final int i) {
-                  activity.finish();
-                }
-              })
-          .create();
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java
deleted file mode 100644
index 07995fe..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo;
-
-import android.graphics.Bitmap;
-import android.graphics.RectF;
-import java.util.List;
-
-/**
- * Generic interface for interacting with different recognition engines.
- */
-public interface Classifier {
-  /**
-   * An immutable result returned by a Classifier describing what was recognized.
-   */
-  public class Recognition {
-    /**
-     * A unique identifier for what has been recognized. Specific to the class, not the instance of
-     * the object.
-     */
-    private final String id;
-
-    /**
-     * Display name for the recognition.
-     */
-    private final String title;
-
-    /**
-     * A sortable score for how good the recognition is relative to others. Higher should be better.
-     */
-    private final Float confidence;
-
-    /** Optional location within the source image for the location of the recognized object. */
-    private RectF location;
-
-    public Recognition(
-        final String id, final String title, final Float confidence, final RectF location) {
-      this.id = id;
-      this.title = title;
-      this.confidence = confidence;
-      this.location = location;
-    }
-
-    public String getId() {
-      return id;
-    }
-
-    public String getTitle() {
-      return title;
-    }
-
-    public Float getConfidence() {
-      return confidence;
-    }
-
-    public RectF getLocation() {
-      return new RectF(location);
-    }
-
-    public void setLocation(RectF location) {
-      this.location = location;
-    }
-
-    @Override
-    public String toString() {
-      String resultString = "";
-      if (id != null) {
-        resultString += "[" + id + "] ";
-      }
-
-      if (title != null) {
-        resultString += title + " ";
-      }
-
-      if (confidence != null) {
-        resultString += String.format("(%.1f%%) ", confidence * 100.0f);
-      }
-
-      if (location != null) {
-        resultString += location + " ";
-      }
-
-      return resultString.trim();
-    }
-  }
-
-  List<Recognition> recognizeImage(Bitmap bitmap);
-
-  void enableStatLogging(final boolean debug);
-
-  String getStatString();
-
-  void close();
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
deleted file mode 100644
index 698251d..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.graphics.Bitmap;
-import android.graphics.Bitmap.Config;
-import android.graphics.Canvas;
-import android.graphics.Matrix;
-import android.graphics.Paint;
-import android.graphics.Typeface;
-import android.media.ImageReader.OnImageAvailableListener;
-import android.os.SystemClock;
-import android.util.Size;
-import android.util.TypedValue;
-import java.util.List;
-import java.util.Vector;
-import org.tensorflow.demo.OverlayView.DrawCallback;
-import org.tensorflow.demo.env.BorderedText;
-import org.tensorflow.demo.env.ImageUtils;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener {
-  private static final Logger LOGGER = new Logger();
-
-  protected static final boolean SAVE_PREVIEW_BITMAP = false;
-
-  private ResultsView resultsView;
-
-  private Bitmap rgbFrameBitmap = null;
-  private Bitmap croppedBitmap = null;
-  private Bitmap cropCopyBitmap = null;
-
-  private long lastProcessingTimeMs;
-
-  // These are the settings for the original v1 Inception model. If you want to
-  // use a model that's been produced from the TensorFlow for Poets codelab,
-  // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128,
-  // INPUT_NAME = "Mul", and OUTPUT_NAME = "final_result".
-  // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to
-  // the ones you produced.
-  //
-  // To use v3 Inception model, strip the DecodeJpeg Op from your retrained
-  // model first:
-  //
-  // python strip_unused.py \
-  // --input_graph=<retrained-pb-file> \
-  // --output_graph=<your-stripped-pb-file> \
-  // --input_node_names="Mul" \
-  // --output_node_names="final_result" \
-  // --input_binary=true
-  private static final int INPUT_SIZE = 224;
-
-  private static final String MODEL_FILE = "mobilenet_v1_1.0_224_quant.tflite";
-  private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
-
-  private static final boolean MAINTAIN_ASPECT = true;
-
-  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
-
-
-  private Integer sensorOrientation;
-  private Classifier classifier;
-  private Matrix frameToCropTransform;
-  private Matrix cropToFrameTransform;
-
-  private BorderedText borderedText;
-
-  @Override
-  protected int getLayoutId() {
-    return R.layout.camera_connection_fragment;
-  }
-
-  @Override
-  protected Size getDesiredPreviewFrameSize() {
-    return DESIRED_PREVIEW_SIZE;
-  }
-
-  private static final float TEXT_SIZE_DIP = 10;
-
-  @Override
-  public void onPreviewSizeChosen(final Size size, final int rotation) {
-    final float textSizePx = TypedValue.applyDimension(
-        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
-    borderedText = new BorderedText(textSizePx);
-    borderedText.setTypeface(Typeface.MONOSPACE);
-
-    classifier = TFLiteImageClassifier.create(getAssets(), MODEL_FILE, LABEL_FILE, INPUT_SIZE);
-
-    previewWidth = size.getWidth();
-    previewHeight = size.getHeight();
-
-    sensorOrientation = rotation - getScreenOrientation();
-    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
-
-    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
-    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
-    croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888);
-
-    frameToCropTransform = ImageUtils.getTransformationMatrix(
-        previewWidth, previewHeight,
-        INPUT_SIZE, INPUT_SIZE,
-        sensorOrientation, MAINTAIN_ASPECT);
-
-    cropToFrameTransform = new Matrix();
-    frameToCropTransform.invert(cropToFrameTransform);
-
-    addCallback(
-        new DrawCallback() {
-          @Override
-          public void drawCallback(final Canvas canvas) {
-            renderDebug(canvas);
-          }
-        });
-  }
-
-  @Override
-  protected void processImage() {
-    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
-    final Canvas canvas = new Canvas(croppedBitmap);
-    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
-
-    // For examining the actual TF input.
-    if (SAVE_PREVIEW_BITMAP) {
-      ImageUtils.saveBitmap(croppedBitmap);
-    }
-    runInBackground(
-        new Runnable() {
-          @Override
-          public void run() {
-            final long startTime = SystemClock.uptimeMillis();
-            final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
-            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
-            LOGGER.i("Detect: %s", results);
-            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
-            if (resultsView == null) {
-              resultsView = (ResultsView) findViewById(R.id.results);
-            }
-            resultsView.setResults(results);
-            requestRender();
-            readyForNextImage();
-          }
-        });
-  }
-
-  @Override
-  public void onSetDebug(boolean debug) {
-    classifier.enableStatLogging(debug);
-  }
-
-  private void renderDebug(final Canvas canvas) {
-    if (!isDebug()) {
-      return;
-    }
-    final Bitmap copy = cropCopyBitmap;
-    if (copy != null) {
-      final Matrix matrix = new Matrix();
-      final float scaleFactor = 2;
-      matrix.postScale(scaleFactor, scaleFactor);
-      matrix.postTranslate(
-          canvas.getWidth() - copy.getWidth() * scaleFactor,
-          canvas.getHeight() - copy.getHeight() * scaleFactor);
-      canvas.drawBitmap(copy, matrix, new Paint());
-
-      final Vector<String> lines = new Vector<String>();
-      if (classifier != null) {
-        String statString = classifier.getStatString();
-        String[] statLines = statString.split("\n");
-        for (String line : statLines) {
-          lines.add(line);
-        }
-      }
-
-      lines.add("Frame: " + previewWidth + "x" + previewHeight);
-      lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
-      lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
-      lines.add("Rotation: " + sensorOrientation);
-      lines.add("Inference time: " + lastProcessingTimeMs + "ms");
-
-      borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
deleted file mode 100644
index ca0c8ca..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.graphics.Bitmap;
-import android.graphics.Bitmap.Config;
-import android.graphics.Canvas;
-import android.graphics.Color;
-import android.graphics.Matrix;
-import android.graphics.Paint;
-import android.graphics.Paint.Style;
-import android.graphics.RectF;
-import android.graphics.Typeface;
-import android.media.ImageReader.OnImageAvailableListener;
-import android.os.SystemClock;
-import android.util.Size;
-import android.util.TypedValue;
-import android.widget.Toast;
-import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Vector;
-import org.tensorflow.demo.OverlayView.DrawCallback;
-import org.tensorflow.demo.env.BorderedText;
-import org.tensorflow.demo.env.ImageUtils;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.demo.tracking.MultiBoxTracker;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-/**
- * An activity that uses a TensorFlowMultiBoxDetector and ObjectTracker to detect and then track
- * objects.
- */
-public class DetectorActivity extends CameraActivity implements OnImageAvailableListener {
-  private static final Logger LOGGER = new Logger();
-
-  // Configuration values for the prepackaged SSD model.
-  private static final int TF_OD_API_INPUT_SIZE = 300;
-  private static final boolean TF_OD_API_IS_QUANTIZED = true;
-  private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
-  private static final String TF_OD_API_LABELS_FILE = "labelmap.txt";
-
-  // Which detection model to use: by default uses Tensorflow Object Detection API frozen
-  // checkpoints.
-  private enum DetectorMode {
-    TF_OD_API;
-  }
-
-  private static final DetectorMode MODE = DetectorMode.TF_OD_API;
-
-  // Minimum detection confidence to track a detection.
-  private static final float MINIMUM_CONFIDENCE_TF_OD_API = 0.6f;
-
-  private static final boolean MAINTAIN_ASPECT = false;
-
-  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
-
-  private static final boolean SAVE_PREVIEW_BITMAP = false;
-  private static final float TEXT_SIZE_DIP = 10;
-
-  private Integer sensorOrientation;
-
-  private Classifier detector;
-
-  private long lastProcessingTimeMs;
-  private Bitmap rgbFrameBitmap = null;
-  private Bitmap croppedBitmap = null;
-  private Bitmap cropCopyBitmap = null;
-
-  private boolean computingDetection = false;
-
-  private long timestamp = 0;
-
-  private Matrix frameToCropTransform;
-  private Matrix cropToFrameTransform;
-
-  private MultiBoxTracker tracker;
-
-  private byte[] luminanceCopy;
-
-  private BorderedText borderedText;
-  @Override
-  public void onPreviewSizeChosen(final Size size, final int rotation) {
-    final float textSizePx =
-        TypedValue.applyDimension(
-            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
-    borderedText = new BorderedText(textSizePx);
-    borderedText.setTypeface(Typeface.MONOSPACE);
-
-    tracker = new MultiBoxTracker(this);
-
-    int cropSize = TF_OD_API_INPUT_SIZE;
-
-    try {
-      detector =
-          TFLiteObjectDetectionAPIModel.create(
-              getAssets(),
-              TF_OD_API_MODEL_FILE,
-              TF_OD_API_LABELS_FILE,
-              TF_OD_API_INPUT_SIZE,
-              TF_OD_API_IS_QUANTIZED);
-      cropSize = TF_OD_API_INPUT_SIZE;
-    } catch (final IOException e) {
-      LOGGER.e("Exception initializing classifier!", e);
-      Toast toast =
-          Toast.makeText(
-              getApplicationContext(), "Classifier could not be initialized", Toast.LENGTH_SHORT);
-      toast.show();
-      finish();
-    }
-
-
-    previewWidth = size.getWidth();
-    previewHeight = size.getHeight();
-
-    sensorOrientation = rotation - getScreenOrientation();
-    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
-
-    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
-    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
-    croppedBitmap = Bitmap.createBitmap(cropSize, cropSize, Config.ARGB_8888);
-
-    frameToCropTransform =
-        ImageUtils.getTransformationMatrix(
-            previewWidth, previewHeight,
-            cropSize, cropSize,
-            sensorOrientation, MAINTAIN_ASPECT);
-
-    cropToFrameTransform = new Matrix();
-    frameToCropTransform.invert(cropToFrameTransform);
-
-    trackingOverlay = (OverlayView) findViewById(R.id.tracking_overlay);
-    trackingOverlay.addCallback(
-        new DrawCallback() {
-          @Override
-          public void drawCallback(final Canvas canvas) {
-            tracker.draw(canvas);
-            if (isDebug()) {
-              tracker.drawDebug(canvas);
-            }
-          }
-        });
-
-    addCallback(
-        new DrawCallback() {
-          @Override
-          public void drawCallback(final Canvas canvas) {
-            if (!isDebug()) {
-              return;
-            }
-            final Bitmap copy = cropCopyBitmap;
-            if (copy == null) {
-              return;
-            }
-
-            final int backgroundColor = Color.argb(100, 0, 0, 0);
-            canvas.drawColor(backgroundColor);
-
-            final Matrix matrix = new Matrix();
-            final float scaleFactor = 2;
-            matrix.postScale(scaleFactor, scaleFactor);
-            matrix.postTranslate(
-                canvas.getWidth() - copy.getWidth() * scaleFactor,
-                canvas.getHeight() - copy.getHeight() * scaleFactor);
-            canvas.drawBitmap(copy, matrix, new Paint());
-
-            final Vector<String> lines = new Vector<String>();
-            if (detector != null) {
-              final String statString = detector.getStatString();
-              final String[] statLines = statString.split("\n");
-              for (final String line : statLines) {
-                lines.add(line);
-              }
-            }
-            lines.add("");
-
-            lines.add("Frame: " + previewWidth + "x" + previewHeight);
-            lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
-            lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
-            lines.add("Rotation: " + sensorOrientation);
-            lines.add("Inference time: " + lastProcessingTimeMs + "ms");
-
-            borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
-          }
-        });
-  }
-
-  OverlayView trackingOverlay;
-
-  @Override
-  protected void processImage() {
-    ++timestamp;
-    final long currTimestamp = timestamp;
-    byte[] originalLuminance = getLuminance();
-    tracker.onFrame(
-        previewWidth,
-        previewHeight,
-        getLuminanceStride(),
-        sensorOrientation,
-        originalLuminance,
-        timestamp);
-    trackingOverlay.postInvalidate();
-
-    // No mutex needed as this method is not reentrant.
-    if (computingDetection) {
-      readyForNextImage();
-      return;
-    }
-    computingDetection = true;
-    LOGGER.i("Preparing image " + currTimestamp + " for detection in bg thread.");
-
-    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
-
-    if (luminanceCopy == null) {
-      luminanceCopy = new byte[originalLuminance.length];
-    }
-    System.arraycopy(originalLuminance, 0, luminanceCopy, 0, originalLuminance.length);
-    readyForNextImage();
-
-    final Canvas canvas = new Canvas(croppedBitmap);
-    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
-    // For examining the actual TF input.
-    if (SAVE_PREVIEW_BITMAP) {
-      ImageUtils.saveBitmap(croppedBitmap);
-    }
-
-    runInBackground(
-        new Runnable() {
-          @Override
-          public void run() {
-            LOGGER.i("Running detection on image " + currTimestamp);
-            final long startTime = SystemClock.uptimeMillis();
-            final List<Classifier.Recognition> results = detector.recognizeImage(croppedBitmap);
-            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
-
-            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
-            final Canvas canvas = new Canvas(cropCopyBitmap);
-            final Paint paint = new Paint();
-            paint.setColor(Color.RED);
-            paint.setStyle(Style.STROKE);
-            paint.setStrokeWidth(2.0f);
-
-            float minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
-            switch (MODE) {
-              case TF_OD_API:
-                minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
-                break;
-            }
-
-            final List<Classifier.Recognition> mappedRecognitions =
-                new LinkedList<Classifier.Recognition>();
-
-            for (final Classifier.Recognition result : results) {
-              final RectF location = result.getLocation();
-              if (location != null && result.getConfidence() >= minimumConfidence) {
-                canvas.drawRect(location, paint);
-
-                cropToFrameTransform.mapRect(location);
-                result.setLocation(location);
-                mappedRecognitions.add(result);
-              }
-            }
-
-            tracker.trackResults(mappedRecognitions, luminanceCopy, currTimestamp);
-            trackingOverlay.postInvalidate();
-
-            requestRender();
-            computingDetection = false;
-          }
-        });
-  }
-
-  @Override
-  protected int getLayoutId() {
-    return R.layout.camera_connection_fragment_tracking;
-  }
-
-  @Override
-  protected Size getDesiredPreviewFrameSize() {
-    return DESIRED_PREVIEW_SIZE;
-  }
-
-  @Override
-  public void onSetDebug(final boolean debug) {
-    detector.enableStatLogging(debug);
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java
deleted file mode 100644
index fd83029..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java
+++ /dev/null
@@ -1,216 +0,0 @@
-package org.tensorflow.demo;
-
-/*
- * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import android.app.Fragment;
-import android.graphics.SurfaceTexture;
-import android.hardware.Camera;
-import android.hardware.Camera.CameraInfo;
-import android.os.Bundle;
-import android.os.Handler;
-import android.os.HandlerThread;
-import android.util.Size;
-import android.util.SparseIntArray;
-import android.view.LayoutInflater;
-import android.view.Surface;
-import android.view.TextureView;
-import android.view.View;
-import android.view.ViewGroup;
-import java.io.IOException;
-import java.util.List;
-import org.tensorflow.demo.env.ImageUtils;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-public class LegacyCameraConnectionFragment extends Fragment {
-  private Camera camera;
-  private static final Logger LOGGER = new Logger();
-  private Camera.PreviewCallback imageListener;
-  private Size desiredSize;
-
-  /**
-   * The layout identifier to inflate for this Fragment.
-   */
-  private int layout;
-
-  public LegacyCameraConnectionFragment(
-      final Camera.PreviewCallback imageListener, final int layout, final Size desiredSize) {
-    this.imageListener = imageListener;
-    this.layout = layout;
-    this.desiredSize = desiredSize;
-  }
-
-  /**
-   * Conversion from screen rotation to JPEG orientation.
-   */
-  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
-
-  static {
-    ORIENTATIONS.append(Surface.ROTATION_0, 90);
-    ORIENTATIONS.append(Surface.ROTATION_90, 0);
-    ORIENTATIONS.append(Surface.ROTATION_180, 270);
-    ORIENTATIONS.append(Surface.ROTATION_270, 180);
-  }
-
-  /**
-   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
-   * {@link TextureView}.
-   */
-  private final TextureView.SurfaceTextureListener surfaceTextureListener =
-      new TextureView.SurfaceTextureListener() {
-        @Override
-        public void onSurfaceTextureAvailable(
-            final SurfaceTexture texture, final int width, final int height) {
-
-          int index = getCameraId();
-          camera = Camera.open(index);
-
-          try {
-            Camera.Parameters parameters = camera.getParameters();
-            List<String> focusModes = parameters.getSupportedFocusModes();
-            if (focusModes != null
-                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
-              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-            }
-            List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
-            Size[] sizes = new Size[cameraSizes.size()];
-            int i = 0;
-            for (Camera.Size size : cameraSizes) {
-              sizes[i++] = new Size(size.width, size.height);
-            }
-            Size previewSize =
-                CameraConnectionFragment.chooseOptimalSize(
-                    sizes, desiredSize.getWidth(), desiredSize.getHeight());
-            parameters.setPreviewSize(previewSize.getWidth(), previewSize.getHeight());
-            camera.setDisplayOrientation(90);
-            camera.setParameters(parameters);
-            camera.setPreviewTexture(texture);
-          } catch (IOException exception) {
-            camera.release();
-          }
-
-          camera.setPreviewCallbackWithBuffer(imageListener);
-          Camera.Size s = camera.getParameters().getPreviewSize();
-          camera.addCallbackBuffer(new byte[ImageUtils.getYUVByteSize(s.height, s.width)]);
-
-          textureView.setAspectRatio(s.height, s.width);
-
-          camera.startPreview();
-        }
-
-        @Override
-        public void onSurfaceTextureSizeChanged(
-            final SurfaceTexture texture, final int width, final int height) {}
-
-        @Override
-        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
-          return true;
-        }
-
-        @Override
-        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
-      };
-
-  /**
-   * An {@link AutoFitTextureView} for camera preview.
-   */
-  private AutoFitTextureView textureView;
-
-  /**
-   * An additional thread for running tasks that shouldn't block the UI.
-   */
-  private HandlerThread backgroundThread;
-
-  @Override
-  public View onCreateView(
-      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
-    return inflater.inflate(layout, container, false);
-  }
-
-  @Override
-  public void onViewCreated(final View view, final Bundle savedInstanceState) {
-    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
-  }
-
-  @Override
-  public void onActivityCreated(final Bundle savedInstanceState) {
-    super.onActivityCreated(savedInstanceState);
-  }
-
-  @Override
-  public void onResume() {
-    super.onResume();
-    startBackgroundThread();
-    // When the screen is turned off and turned back on, the SurfaceTexture is already
-    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
-    // a camera and start preview from here (otherwise, we wait until the surface is ready in
-    // the SurfaceTextureListener).
-
-    if (textureView.isAvailable()) {
-      camera.startPreview();
-    } else {
-      textureView.setSurfaceTextureListener(surfaceTextureListener);
-    }
-  }
-
-  @Override
-  public void onPause() {
-    stopCamera();
-    stopBackgroundThread();
-    super.onPause();
-  }
-
-  /**
-   * Starts a background thread and its {@link Handler}.
-   */
-  private void startBackgroundThread() {
-    backgroundThread = new HandlerThread("CameraBackground");
-    backgroundThread.start();
-  }
-
-  /**
-   * Stops the background thread and its {@link Handler}.
-   */
-  private void stopBackgroundThread() {
-    backgroundThread.quitSafely();
-    try {
-      backgroundThread.join();
-      backgroundThread = null;
-    } catch (final InterruptedException e) {
-      LOGGER.e(e, "Exception!");
-    }
-  }
-
-  protected void stopCamera() {
-    if (camera != null) {
-      camera.stopPreview();
-      camera.setPreviewCallback(null);
-      camera.release();
-      camera = null;
-    }
-  }
-
-  private int getCameraId() {
-    CameraInfo ci = new CameraInfo();
-    for (int i = 0; i < Camera.getNumberOfCameras(); i++) {
-      Camera.getCameraInfo(i, ci);
-      if (ci.facing == CameraInfo.CAMERA_FACING_BACK)
-        return i;
-    }
-    return -1; // No camera found
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java
deleted file mode 100644
index 0f8d109..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo;
-
-import android.content.Context;
-import android.graphics.Canvas;
-import android.util.AttributeSet;
-import android.view.View;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * A simple View providing a render callback to other classes.
- */
-public class OverlayView extends View {
-  private final List<DrawCallback> callbacks = new LinkedList<DrawCallback>();
-
-  public OverlayView(final Context context, final AttributeSet attrs) {
-    super(context, attrs);
-  }
-
-  /**
-   * Interface defining the callback for client classes.
-   */
-  public interface DrawCallback {
-    public void drawCallback(final Canvas canvas);
-  }
-
-  public void addCallback(final DrawCallback callback) {
-    callbacks.add(callback);
-  }
-
-  @Override
-  public synchronized void draw(final Canvas canvas) {
-    for (final DrawCallback callback : callbacks) {
-      callback.drawCallback(canvas);
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java
deleted file mode 100644
index 31a4b07..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo;
-
-import android.content.Context;
-import android.graphics.Canvas;
-import android.graphics.Paint;
-import android.util.AttributeSet;
-import android.util.TypedValue;
-import android.view.View;
-import java.util.List;
-import org.tensorflow.demo.Classifier.Recognition;
-
-public class RecognitionScoreView extends View implements ResultsView {
-  private static final float TEXT_SIZE_DIP = 24;
-  private List<Recognition> results;
-  private final float textSizePx;
-  private final Paint fgPaint;
-  private final Paint bgPaint;
-
-  public RecognitionScoreView(final Context context, final AttributeSet set) {
-    super(context, set);
-
-    textSizePx =
-        TypedValue.applyDimension(
-            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
-    fgPaint = new Paint();
-    fgPaint.setTextSize(textSizePx);
-
-    bgPaint = new Paint();
-    bgPaint.setColor(0xcc4285f4);
-  }
-
-  @Override
-  public void setResults(final List<Recognition> results) {
-    this.results = results;
-    postInvalidate();
-  }
-
-  @Override
-  public void onDraw(final Canvas canvas) {
-    final int x = 10;
-    int y = (int) (fgPaint.getTextSize() * 1.5f);
-
-    canvas.drawPaint(bgPaint);
-
-    if (results != null) {
-      for (final Recognition recog : results) {
-        canvas.drawText(recog.getTitle() + ": " + recog.getConfidence(), x, y, fgPaint);
-        y += (int) (fgPaint.getTextSize() * 1.5f);
-      }
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java
deleted file mode 100644
index 9e91aea..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.tensorflow.demo;
-
-import android.util.Log;
-import android.util.Pair;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Deque;
-import java.util.List;
-
-/** Reads in results from an instantaneous audio recognition model and smoothes them over time. */
-public class RecognizeCommands {
-  // Configuration settings.
-  private List<String> labels = new ArrayList<String>();
-  private long averageWindowDurationMs;
-  private float detectionThreshold;
-  private int suppressionMs;
-  private int minimumCount;
-  private long minimumTimeBetweenSamplesMs;
-
-  // Working variables.
-  private Deque<Pair<Long, float[]>> previousResults = new ArrayDeque<Pair<Long, float[]>>();
-  private String previousTopLabel;
-  private int labelsCount;
-  private long previousTopLabelTime;
-  private float previousTopLabelScore;
-
-  private static final String SILENCE_LABEL = "_silence_";
-  private static final long MINIMUM_TIME_FRACTION = 4;
-
-  public RecognizeCommands(
-      List<String> inLabels,
-      long inAverageWindowDurationMs,
-      float inDetectionThreshold,
-      int inSuppressionMS,
-      int inMinimumCount,
-      long inMinimumTimeBetweenSamplesMS) {
-    labels = inLabels;
-    averageWindowDurationMs = inAverageWindowDurationMs;
-    detectionThreshold = inDetectionThreshold;
-    suppressionMs = inSuppressionMS;
-    minimumCount = inMinimumCount;
-    labelsCount = inLabels.size();
-    previousTopLabel = SILENCE_LABEL;
-    previousTopLabelTime = Long.MIN_VALUE;
-    previousTopLabelScore = 0.0f;
-    minimumTimeBetweenSamplesMs = inMinimumTimeBetweenSamplesMS;
-  }
-
-  /** Holds information about what's been recognized. */
-  public static class RecognitionResult {
-    public final String foundCommand;
-    public final float score;
-    public final boolean isNewCommand;
-
-    public RecognitionResult(String inFoundCommand, float inScore, boolean inIsNewCommand) {
-      foundCommand = inFoundCommand;
-      score = inScore;
-      isNewCommand = inIsNewCommand;
-    }
-  }
-
-  private static class ScoreForSorting implements Comparable<ScoreForSorting> {
-    public final float score;
-    public final int index;
-
-    public ScoreForSorting(float inScore, int inIndex) {
-      score = inScore;
-      index = inIndex;
-    }
-
-    @Override
-    public int compareTo(ScoreForSorting other) {
-      if (this.score > other.score) {
-        return -1;
-      } else if (this.score < other.score) {
-        return 1;
-      } else {
-        return 0;
-      }
-    }
-  }
-
-  public RecognitionResult processLatestResults(float[] currentResults, long currentTimeMS) {
-    if (currentResults.length != labelsCount) {
-      throw new RuntimeException(
-          "The results for recognition should contain "
-              + labelsCount
-              + " elements, but there are "
-              + currentResults.length);
-    }
-
-    if ((!previousResults.isEmpty()) && (currentTimeMS < previousResults.getFirst().first)) {
-      throw new RuntimeException(
-          "You must feed results in increasing time order, but received a timestamp of "
-              + currentTimeMS
-              + " that was earlier than the previous one of "
-              + previousResults.getFirst().first);
-    }
-
-    final int howManyResults = previousResults.size();
-    // Ignore any results that are coming in too frequently.
-    if (howManyResults > 1) {
-      final long timeSinceMostRecent = currentTimeMS - previousResults.getLast().first;
-      if (timeSinceMostRecent < minimumTimeBetweenSamplesMs) {
-        return new RecognitionResult(previousTopLabel, previousTopLabelScore, false);
-      }
-    }
-
-    // Add the latest results to the head of the queue.
-    previousResults.addLast(new Pair<Long, float[]>(currentTimeMS, currentResults));
-
-    // Prune any earlier results that are too old for the averaging window.
-    final long timeLimit = currentTimeMS - averageWindowDurationMs;
-    while (previousResults.getFirst().first < timeLimit) {
-      previousResults.removeFirst();
-    }
-
-    // If there are too few results, assume the result will be unreliable and
-    // bail.
-    final long earliestTime = previousResults.getFirst().first;
-    final long samplesDuration = currentTimeMS - earliestTime;
-    if ((howManyResults < minimumCount)
-        || (samplesDuration < (averageWindowDurationMs / MINIMUM_TIME_FRACTION))) {
-      Log.v("RecognizeResult", "Too few results");
-      return new RecognitionResult(previousTopLabel, 0.0f, false);
-    }
-
-    // Calculate the average score across all the results in the window.
-    float[] averageScores = new float[labelsCount];
-    for (Pair<Long, float[]> previousResult : previousResults) {
-      final float[] scoresTensor = previousResult.second;
-      int i = 0;
-      while (i < scoresTensor.length) {
-        averageScores[i] += scoresTensor[i] / howManyResults;
-        ++i;
-      }
-    }
-
-    // Sort the averaged results in descending score order.
-    ScoreForSorting[] sortedAverageScores = new ScoreForSorting[labelsCount];
-    for (int i = 0; i < labelsCount; ++i) {
-      sortedAverageScores[i] = new ScoreForSorting(averageScores[i], i);
-    }
-    Arrays.sort(sortedAverageScores);
-
-    // See if the latest top score is enough to trigger a detection.
-    final int currentTopIndex = sortedAverageScores[0].index;
-    final String currentTopLabel = labels.get(currentTopIndex);
-    final float currentTopScore = sortedAverageScores[0].score;
-    // If we've recently had another label trigger, assume one that occurs too
-    // soon afterwards is a bad result.
-    long timeSinceLastTop;
-    if (previousTopLabel.equals(SILENCE_LABEL) || (previousTopLabelTime == Long.MIN_VALUE)) {
-      timeSinceLastTop = Long.MAX_VALUE;
-    } else {
-      timeSinceLastTop = currentTimeMS - previousTopLabelTime;
-    }
-    boolean isNewCommand;
-    if ((currentTopScore > detectionThreshold) && (timeSinceLastTop > suppressionMs)) {
-      previousTopLabel = currentTopLabel;
-      previousTopLabelTime = currentTimeMS;
-      previousTopLabelScore = currentTopScore;
-      isNewCommand = true;
-    } else {
-      isNewCommand = false;
-    }
-    return new RecognitionResult(currentTopLabel, currentTopScore, isNewCommand);
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java
deleted file mode 100644
index 9c9c30b..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Demonstrates how to run an audio recognition model in Android.
-
-This example loads a simple speech recognition model trained by the tutorial at
-https://www.tensorflow.org/tutorials/audio_training
-
-The model files should be downloaded automatically from the TensorFlow website,
-but if you have a custom model you can update the LABEL_FILENAME and
-MODEL_FILENAME constants to point to your own files.
-
-The example application displays a list view with all of the known audio labels,
-and highlights each one when it thinks it has detected one through the
-microphone. The averaging of results to give a more reliable signal happens in
-the RecognizeCommands helper class.
-*/
-
-package org.tensorflow.demo;
-
-import android.animation.ValueAnimator;
-import android.app.Activity;
-import android.content.pm.PackageManager;
-import android.content.res.AssetFileDescriptor;
-import android.content.res.AssetManager;
-import android.media.AudioFormat;
-import android.media.AudioRecord;
-import android.media.MediaRecorder;
-import android.os.Build;
-import android.os.Bundle;
-import android.util.Log;
-import android.view.View;
-import android.widget.ArrayAdapter;
-import android.widget.Button;
-import android.widget.ListView;
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.locks.ReentrantLock;
-import org.tensorflow.lite.Interpreter;
-import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
-
-/**
- * An activity that listens for audio and then uses a TensorFlow model to detect particular classes,
- * by default a small set of action words.
- */
-public class SpeechActivity extends Activity {
-
-  // Constants that control the behavior of the recognition code and model
-  // settings. See the audio recognition tutorial for a detailed explanation of
-  // all these, but you should customize them to match your training settings if
-  // you are running your own model.
-  private static final int SAMPLE_RATE = 16000;
-  private static final int SAMPLE_DURATION_MS = 1000;
-  private static final int RECORDING_LENGTH = (int) (SAMPLE_RATE * SAMPLE_DURATION_MS / 1000);
-  private static final long AVERAGE_WINDOW_DURATION_MS = 500;
-  private static final float DETECTION_THRESHOLD = 0.70f;
-  private static final int SUPPRESSION_MS = 1500;
-  private static final int MINIMUM_COUNT = 3;
-  private static final long MINIMUM_TIME_BETWEEN_SAMPLES_MS = 30;
-  private static final String LABEL_FILENAME = "file:///android_asset/conv_actions_labels.txt";
-  private static final String MODEL_FILENAME = "file:///android_asset/conv_actions_frozen.tflite";
-
-  // UI elements.
-  private static final int REQUEST_RECORD_AUDIO = 13;
-  private Button quitButton;
-  private ListView labelsListView;
-  private static final String LOG_TAG = SpeechActivity.class.getSimpleName();
-
-  // Working variables.
-  short[] recordingBuffer = new short[RECORDING_LENGTH];
-  int recordingOffset = 0;
-  boolean shouldContinue = true;
-  private Thread recordingThread;
-  boolean shouldContinueRecognition = true;
-  private Thread recognitionThread;
-  private final ReentrantLock recordingBufferLock = new ReentrantLock();
-
-  private List<String> labels = new ArrayList<String>();
-  private List<String> displayedLabels = new ArrayList<>();
-  private RecognizeCommands recognizeCommands = null;
-
-  private Interpreter tfLite;
-
-  /** Memory-map the model file in Assets. */
-  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
-      throws IOException {
-    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
-    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
-    FileChannel fileChannel = inputStream.getChannel();
-    long startOffset = fileDescriptor.getStartOffset();
-    long declaredLength = fileDescriptor.getDeclaredLength();
-    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-  }
-
-  @Override
-  protected void onCreate(Bundle savedInstanceState) {
-    // Set up the UI.
-    super.onCreate(savedInstanceState);
-    setContentView(R.layout.activity_speech);
-    quitButton = (Button) findViewById(R.id.quit);
-    quitButton.setOnClickListener(
-        new View.OnClickListener() {
-          @Override
-          public void onClick(View view) {
-            moveTaskToBack(true);
-            android.os.Process.killProcess(android.os.Process.myPid());
-            System.exit(1);
-          }
-        });
-    labelsListView = (ListView) findViewById(R.id.list_view);
-
-    // Load the labels for the model, but only display those that don't start
-    // with an underscore.
-    String actualLabelFilename = LABEL_FILENAME.split("file:///android_asset/", -1)[1];
-    Log.i(LOG_TAG, "Reading labels from: " + actualLabelFilename);
-    BufferedReader br = null;
-    try {
-      br = new BufferedReader(new InputStreamReader(getAssets().open(actualLabelFilename)));
-      String line;
-      while ((line = br.readLine()) != null) {
-        labels.add(line);
-        if (line.charAt(0) != '_') {
-          displayedLabels.add(line.substring(0, 1).toUpperCase() + line.substring(1));
-        }
-      }
-      br.close();
-    } catch (IOException e) {
-      throw new RuntimeException("Problem reading label file!", e);
-    }
-
-    // Build a list view based on these labels.
-    ArrayAdapter<String> arrayAdapter =
-        new ArrayAdapter<String>(this, R.layout.list_text_item, displayedLabels);
-    labelsListView.setAdapter(arrayAdapter);
-
-    // Set up an object to smooth recognition results to increase accuracy.
-    recognizeCommands =
-        new RecognizeCommands(
-            labels,
-            AVERAGE_WINDOW_DURATION_MS,
-            DETECTION_THRESHOLD,
-            SUPPRESSION_MS,
-            MINIMUM_COUNT,
-            MINIMUM_TIME_BETWEEN_SAMPLES_MS);
-
-    String actualModelFilename = MODEL_FILENAME.split("file:///android_asset/", -1)[1];
-    try {
-      tfLite = new Interpreter(loadModelFile(getAssets(), actualModelFilename));
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-
-    tfLite.resizeInput(0, new int[] {RECORDING_LENGTH, 1});
-    tfLite.resizeInput(1, new int[] {1});
-
-    // Start the recording and recognition threads.
-    requestMicrophonePermission();
-    startRecording();
-    startRecognition();
-  }
-
-  private void requestMicrophonePermission() {
-    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
-      requestPermissions(
-          new String[]{android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO);
-    }
-  }
-
-  @Override
-  public void onRequestPermissionsResult(
-      int requestCode, String[] permissions, int[] grantResults) {
-    if (requestCode == REQUEST_RECORD_AUDIO
-        && grantResults.length > 0
-        && grantResults[0] == PackageManager.PERMISSION_GRANTED) {
-      startRecording();
-      startRecognition();
-    }
-  }
-
-  public synchronized void startRecording() {
-    if (recordingThread != null) {
-      return;
-    }
-    shouldContinue = true;
-    recordingThread =
-        new Thread(
-            new Runnable() {
-              @Override
-              public void run() {
-                record();
-              }
-            });
-    recordingThread.start();
-  }
-
-  public synchronized void stopRecording() {
-    if (recordingThread == null) {
-      return;
-    }
-    shouldContinue = false;
-    recordingThread = null;
-  }
-
-  private void record() {
-    android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_AUDIO);
-
-    // Estimate the buffer size we'll need for this device.
-    int bufferSize =
-        AudioRecord.getMinBufferSize(
-            SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
-    if (bufferSize == AudioRecord.ERROR || bufferSize == AudioRecord.ERROR_BAD_VALUE) {
-      bufferSize = SAMPLE_RATE * 2;
-    }
-    short[] audioBuffer = new short[bufferSize / 2];
-
-    AudioRecord record =
-        new AudioRecord(
-            MediaRecorder.AudioSource.DEFAULT,
-            SAMPLE_RATE,
-            AudioFormat.CHANNEL_IN_MONO,
-            AudioFormat.ENCODING_PCM_16BIT,
-            bufferSize);
-
-    if (record.getState() != AudioRecord.STATE_INITIALIZED) {
-      Log.e(LOG_TAG, "Audio Record can't initialize!");
-      return;
-    }
-
-    record.startRecording();
-
-    Log.v(LOG_TAG, "Start recording");
-
-    // Loop, gathering audio data and copying it to a round-robin buffer.
-    while (shouldContinue) {
-      int numberRead = record.read(audioBuffer, 0, audioBuffer.length);
-      int maxLength = recordingBuffer.length;
-      int newRecordingOffset = recordingOffset + numberRead;
-      int secondCopyLength = Math.max(0, newRecordingOffset - maxLength);
-      int firstCopyLength = numberRead - secondCopyLength;
-      // We store off all the data for the recognition thread to access. The ML
-      // thread will copy out of this buffer into its own, while holding the
-      // lock, so this should be thread safe.
-      recordingBufferLock.lock();
-      try {
-        System.arraycopy(audioBuffer, 0, recordingBuffer, recordingOffset, firstCopyLength);
-        System.arraycopy(audioBuffer, firstCopyLength, recordingBuffer, 0, secondCopyLength);
-        recordingOffset = newRecordingOffset % maxLength;
-      } finally {
-        recordingBufferLock.unlock();
-      }
-    }
-
-    record.stop();
-    record.release();
-  }
-
-  public synchronized void startRecognition() {
-    if (recognitionThread != null) {
-      return;
-    }
-    shouldContinueRecognition = true;
-    recognitionThread =
-        new Thread(
-            new Runnable() {
-              @Override
-              public void run() {
-                recognize();
-              }
-            });
-    recognitionThread.start();
-  }
-
-  public synchronized void stopRecognition() {
-    if (recognitionThread == null) {
-      return;
-    }
-    shouldContinueRecognition = false;
-    recognitionThread = null;
-  }
-
-  private void recognize() {
-    Log.v(LOG_TAG, "Start recognition");
-
-    short[] inputBuffer = new short[RECORDING_LENGTH];
-    float[][] floatInputBuffer = new float[RECORDING_LENGTH][1];
-    float[][] outputScores = new float[1][labels.size()];
-    int[] sampleRateList = new int[] {SAMPLE_RATE};
-
-    // Loop, grabbing recorded data and running the recognition model on it.
-    while (shouldContinueRecognition) {
-      // The recording thread places data in this round-robin buffer, so lock to
-      // make sure there's no writing happening and then copy it to our own
-      // local version.
-      recordingBufferLock.lock();
-      try {
-        int maxLength = recordingBuffer.length;
-        int firstCopyLength = maxLength - recordingOffset;
-        int secondCopyLength = recordingOffset;
-        System.arraycopy(recordingBuffer, recordingOffset, inputBuffer, 0, firstCopyLength);
-        System.arraycopy(recordingBuffer, 0, inputBuffer, firstCopyLength, secondCopyLength);
-      } finally {
-        recordingBufferLock.unlock();
-      }
-
-      // We need to feed in float values between -1.0f and 1.0f, so divide the
-      // signed 16-bit inputs.
-      for (int i = 0; i < RECORDING_LENGTH; ++i) {
-        floatInputBuffer[i][0] = inputBuffer[i] / 32767.0f;
-      }
-
-      Object[] inputArray = {floatInputBuffer, sampleRateList};
-      Map<Integer, Object> outputMap = new HashMap<>();
-      outputMap.put(0, outputScores);
-
-      // Run the model.
-      tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
-
-      // Use the smoother to figure out if we've had a real recognition event.
-      long currentTime = System.currentTimeMillis();
-      final RecognizeCommands.RecognitionResult result =
-          recognizeCommands.processLatestResults(outputScores[0], currentTime);
-
-      runOnUiThread(
-          new Runnable() {
-            @Override
-            public void run() {
-              // If we do have a new command, highlight the right list entry.
-              if (!result.foundCommand.startsWith("_") && result.isNewCommand) {
-                int labelIndex = -1;
-                for (int i = 0; i < labels.size(); ++i) {
-                  if (labels.get(i).equals(result.foundCommand)) {
-                    labelIndex = i;
-                  }
-                }
-                final View labelView = (View) labelsListView.getChildAt(labelIndex - 2);
-                ValueAnimator colorAnimation =
-                    ValueAnimator.ofArgb(0x00b3ccff, 0xffb3ccff, 0x00b3ccff);
-                colorAnimation.setDuration(750);
-                colorAnimation.addUpdateListener(
-                    new ValueAnimator.AnimatorUpdateListener() {
-                      @Override
-                      public void onAnimationUpdate(ValueAnimator animator) {
-                        labelView.setBackgroundColor((int) animator.getAnimatedValue());
-                      }
-                    });
-                colorAnimation.start();
-              }
-            }
-          });
-      try {
-        // We don't need to run too frequently, so snooze for a bit.
-        Thread.sleep(MINIMUM_TIME_BETWEEN_SAMPLES_MS);
-      } catch (InterruptedException e) {
-        // Ignore
-      }
-    }
-
-    Log.v(LOG_TAG, "End recognition");
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java
deleted file mode 100644
index d75c3ce..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo;
-
-import android.content.res.AssetFileDescriptor;
-import android.content.res.AssetManager;
-import android.graphics.Bitmap;
-import android.os.SystemClock;
-import android.os.Trace;
-import android.util.Log;
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import java.util.PriorityQueue;
-import java.util.Vector;
-import org.tensorflow.lite.Interpreter;
-
-/** A classifier specialized to label images using TensorFlow. */
-public class TFLiteImageClassifier implements Classifier {
-  private static final String TAG = "TFLiteImageClassifier";
-
-  // Only return this many results with at least this confidence.
-  private static final int MAX_RESULTS = 3;
-
-  private Interpreter tfLite;
-
-  /** Dimensions of inputs. */
-  private static final int DIM_BATCH_SIZE = 1;
-
-  private static final int DIM_PIXEL_SIZE = 3;
-
-  private static final int DIM_IMG_SIZE_X = 224;
-  private static final int DIM_IMG_SIZE_Y = 224;
-
-  byte[][] labelProb;
-
-  // Pre-allocated buffers.
-  private Vector<String> labels = new Vector<String>();
-  private int[] intValues;
-  private ByteBuffer imgData = null;
-
-  private TFLiteImageClassifier() {}
-
-  /** Memory-map the model file in Assets. */
-  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
-      throws IOException {
-    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
-    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
-    FileChannel fileChannel = inputStream.getChannel();
-    long startOffset = fileDescriptor.getStartOffset();
-    long declaredLength = fileDescriptor.getDeclaredLength();
-    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-  }
-
-  /**
-   * Initializes a native TensorFlow session for classifying images.
-   *
-   * @param assetManager The asset manager to be used to load assets.
-   * @param modelFilename The filepath of the model GraphDef protocol buffer.
-   * @param labelFilename The filepath of label file for classes.
-   * @param inputSize The input size. A square image of inputSize x inputSize is assumed.
-   * @throws IOException
-   */
-  public static Classifier create(
-      AssetManager assetManager, String modelFilename, String labelFilename, int inputSize) {
-    TFLiteImageClassifier c = new TFLiteImageClassifier();
-
-    // Read the label names into memory.
-    // TODO(andrewharp): make this handle non-assets.
-    Log.i(TAG, "Reading labels from: " + labelFilename);
-    BufferedReader br = null;
-    try {
-      br = new BufferedReader(new InputStreamReader(assetManager.open(labelFilename)));
-      String line;
-      while ((line = br.readLine()) != null) {
-        c.labels.add(line);
-      }
-      br.close();
-    } catch (IOException e) {
-      throw new RuntimeException("Problem reading label file!" , e);
-    }
-
-    c.imgData =
-        ByteBuffer.allocateDirect(
-            DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
-
-    c.imgData.order(ByteOrder.nativeOrder());
-    try {
-      c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-
-    // The shape of the output is [N, NUM_CLASSES], where N is the batch size.
-    Log.i(TAG, "Read " + c.labels.size() + " labels");
-
-    // Pre-allocate buffers.
-    c.intValues = new int[inputSize * inputSize];
-
-    c.labelProb = new byte[1][c.labels.size()];
-
-    return c;
-  }
-
-  /** Writes Image data into a {@code ByteBuffer}. */
-  private void convertBitmapToByteBuffer(Bitmap bitmap) {
-    if (imgData == null) {
-      return;
-    }
-    imgData.rewind();
-    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
-    // Convert the image to floating point.
-    int pixel = 0;
-    long startTime = SystemClock.uptimeMillis();
-    for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
-      for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
-        final int val = intValues[pixel++];
-        imgData.put((byte) ((val >> 16) & 0xFF));
-        imgData.put((byte) ((val >> 8) & 0xFF));
-        imgData.put((byte) (val & 0xFF));
-      }
-    }
-    long endTime = SystemClock.uptimeMillis();
-    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
-  }
-
-  @Override
-  public List<Recognition> recognizeImage(final Bitmap bitmap) {
-    // Log this method so that it can be analyzed with systrace.
-    Trace.beginSection("recognizeImage");
-
-    Trace.beginSection("preprocessBitmap");
-
-    long startTime;
-    long endTime;
-    startTime = SystemClock.uptimeMillis();
-
-    convertBitmapToByteBuffer(bitmap);
-
-    // Run the inference call.
-    Trace.beginSection("run");
-    startTime = SystemClock.uptimeMillis();
-    tfLite.run(imgData, labelProb);
-    endTime = SystemClock.uptimeMillis();
-    Log.i(TAG, "Inf time: " + (endTime - startTime));
-    Trace.endSection();
-
-    // Find the best classifications.
-    PriorityQueue<Recognition> pq =
-        new PriorityQueue<Recognition>(
-            3,
-            new Comparator<Recognition>() {
-              @Override
-              public int compare(Recognition lhs, Recognition rhs) {
-                // Intentionally reversed to put high confidence at the head of the queue.
-                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
-              }
-            });
-    for (int i = 0; i < labels.size(); ++i) {
-      pq.add(
-          new Recognition(
-              "" + i,
-              labels.size() > i ? labels.get(i) : "unknown",
-              (float) labelProb[0][i],
-              null));
-    }
-    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
-    int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
-    for (int i = 0; i < recognitionsSize; ++i) {
-      recognitions.add(pq.poll());
-    }
-    Trace.endSection(); // "recognizeImage"
-    return recognitions;
-  }
-
-  @Override
-  public void enableStatLogging(boolean logStats) {
-  }
-
-  @Override
-  public String getStatString() {
-    return "";
-  }
-
-  @Override
-  public void close() {
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
deleted file mode 100644
index afbf317..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo;
-
-import android.content.res.AssetFileDescriptor;
-import android.content.res.AssetManager;
-import android.graphics.Bitmap;
-import android.graphics.RectF;
-import android.os.Trace;
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Vector;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.lite.Interpreter;
-
-/**
- * Wrapper for frozen detection models trained using the Tensorflow Object Detection API:
- * github.com/tensorflow/models/tree/master/research/object_detection
- */
-public class TFLiteObjectDetectionAPIModel implements Classifier {
-  private static final Logger LOGGER = new Logger();
-
-  // Only return this many results.
-  private static final int NUM_DETECTIONS = 10;
-  private boolean isModelQuantized;
-  // Float model
-  private static final float IMAGE_MEAN = 128.0f;
-  private static final float IMAGE_STD = 128.0f;
-  // Number of threads in the java app
-  private static final int NUM_THREADS = 4;
-  // Config values.
-  private int inputSize;
-  // Pre-allocated buffers.
-  private Vector<String> labels = new Vector<String>();
-  private int[] intValues;
-  // outputLocations: array of shape [Batchsize, NUM_DETECTIONS,4]
-  // contains the location of detected boxes
-  private float[][][] outputLocations;
-  // outputClasses: array of shape [Batchsize, NUM_DETECTIONS]
-  // contains the classes of detected boxes
-  private float[][] outputClasses;
-  // outputScores: array of shape [Batchsize, NUM_DETECTIONS]
-  // contains the scores of detected boxes
-  private float[][] outputScores;
-  // numDetections: array of shape [Batchsize]
-  // contains the number of detected boxes
-  private float[] numDetections;
-
-  private ByteBuffer imgData;
-
-  private Interpreter tfLite;
-
-
-  /** Memory-map the model file in Assets. */
-  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
-      throws IOException {
-    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
-    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
-    FileChannel fileChannel = inputStream.getChannel();
-    long startOffset = fileDescriptor.getStartOffset();
-    long declaredLength = fileDescriptor.getDeclaredLength();
-    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-  }
-
-  /**
-   * Initializes a native TensorFlow session for classifying images.
-   *
-   * @param assetManager The asset manager to be used to load assets.
-   * @param modelFilename The filepath of the model GraphDef protocol buffer.
-   * @param labelFilename The filepath of label file for classes.
-   * @param inputSize The size of image input
-   * @param isQuantized Boolean representing model is quantized or not
-   */
-  public static Classifier create(
-      final AssetManager assetManager,
-      final String modelFilename,
-      final String labelFilename,
-      final int inputSize,
-      final boolean isQuantized)
-      throws IOException {
-    final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
-
-    InputStream labelsInput = null;
-    labelsInput = assetManager.open(labelFilename);
-    BufferedReader br = null;
-    br = new BufferedReader(new InputStreamReader(labelsInput));
-    String line;
-    while ((line = br.readLine()) != null) {
-      LOGGER.w(line);
-      d.labels.add(line);
-    }
-    br.close();
-
-    d.inputSize = inputSize;
-
-    try {
-      d.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-
-    d.isModelQuantized = isQuantized;
-    // Pre-allocate buffers.
-    int numBytesPerChannel;
-    if (isQuantized) {
-      numBytesPerChannel = 1; // Quantized
-    } else {
-      numBytesPerChannel = 4; // Floating point
-    }
-    d.imgData = ByteBuffer.allocateDirect(1 * d.inputSize * d.inputSize * 3 * numBytesPerChannel);
-    d.imgData.order(ByteOrder.nativeOrder());
-    d.intValues = new int[d.inputSize * d.inputSize];
-
-    d.tfLite.setNumThreads(NUM_THREADS);
-    d.outputLocations = new float[1][NUM_DETECTIONS][4];
-    d.outputClasses = new float[1][NUM_DETECTIONS];
-    d.outputScores = new float[1][NUM_DETECTIONS];
-    d.numDetections = new float[1];
-    return d;
-  }
-
-  private TFLiteObjectDetectionAPIModel() {}
-
-  @Override
-  public List<Recognition> recognizeImage(final Bitmap bitmap) {
-    // Log this method so that it can be analyzed with systrace.
-    Trace.beginSection("recognizeImage");
-
-    Trace.beginSection("preprocessBitmap");
-    // Preprocess the image data from 0-255 int to normalized float based
-    // on the provided parameters.
-    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
-
-    imgData.rewind();
-    for (int i = 0; i < inputSize; ++i) {
-      for (int j = 0; j < inputSize; ++j) {
-        int pixelValue = intValues[i * inputSize + j];
-        if (isModelQuantized) {
-          // Quantized model
-          imgData.put((byte) ((pixelValue >> 16) & 0xFF));
-          imgData.put((byte) ((pixelValue >> 8) & 0xFF));
-          imgData.put((byte) (pixelValue & 0xFF));
-        } else { // Float model
-          imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-        }
-      }
-    }
-    Trace.endSection(); // preprocessBitmap
-
-    // Copy the input data into TensorFlow.
-    Trace.beginSection("feed");
-    outputLocations = new float[1][NUM_DETECTIONS][4];
-    outputClasses = new float[1][NUM_DETECTIONS];
-    outputScores = new float[1][NUM_DETECTIONS];
-    numDetections = new float[1];
-
-    Object[] inputArray = {imgData};
-    Map<Integer, Object> outputMap = new HashMap<>();
-    outputMap.put(0, outputLocations);
-    outputMap.put(1, outputClasses);
-    outputMap.put(2, outputScores);
-    outputMap.put(3, numDetections);
-    Trace.endSection();
-
-    // Run the inference call.
-    Trace.beginSection("run");
-    tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
-    Trace.endSection();
-
-    // Show the best detections.
-    // after scaling them back to the input size.
-    final ArrayList<Recognition> recognitions = new ArrayList<>(NUM_DETECTIONS);
-    for (int i = 0; i < NUM_DETECTIONS; ++i) {
-      final RectF detection =
-          new RectF(
-              outputLocations[0][i][1] * inputSize,
-              outputLocations[0][i][0] * inputSize,
-              outputLocations[0][i][3] * inputSize,
-              outputLocations[0][i][2] * inputSize);
-      // SSD Mobilenet V1 Model assumes class 0 is background class
-      // in label file and class labels start from 1 to number_of_classes+1,
-      // while outputClasses correspond to class index from 0 to number_of_classes
-      int labelOffset = 1;
-      recognitions.add(
-          new Recognition(
-              "" + i,
-              labels.get((int) outputClasses[0][i] + labelOffset),
-              outputScores[0][i],
-              detection));
-    }
-    Trace.endSection(); // "recognizeImage"
-    return recognitions;
-  }
-
-  @Override
-  public void enableStatLogging(final boolean logStats) {
-  }
-
-  @Override
-  public String getStatString() {
-    return "";
-  }
-
-  @Override
-  public void close() {
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java
deleted file mode 100644
index c50efdf..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.content.Context;
-import android.content.res.AssetManager;
-import android.util.Log;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/** Utilities for dealing with assets. */
-public class AssetUtils {
-
-  private static final String TAG = AssetUtils.class.getSimpleName();
-
-  private static final int BYTE_BUF_SIZE = 2048;
-
-  /**
-   * Copies a file from assets.
-   *
-   * @param context application context used to discover assets.
-   * @param assetName the relative file name within assets.
-   * @param targetName the target file name, always over write the existing file.
-   * @throws IOException if operation fails.
-   */
-  public static void copy(Context context, String assetName, String targetName) throws IOException {
-
-    Log.d(TAG, "creating file " + targetName + " from " + assetName);
-
-    File targetFile = null;
-    InputStream inputStream = null;
-    FileOutputStream outputStream = null;
-
-    try {
-      AssetManager assets = context.getAssets();
-      targetFile = new File(targetName);
-      inputStream = assets.open(assetName);
-      // TODO(kanlig): refactor log messages to make them more useful.
-      Log.d(TAG, "Creating outputstream");
-      outputStream = new FileOutputStream(targetFile, false /* append */);
-      copy(inputStream, outputStream);
-    } finally {
-      if (outputStream != null) {
-        outputStream.close();
-      }
-      if (inputStream != null) {
-        inputStream.close();
-      }
-    }
-  }
-
-  private static void copy(InputStream from, OutputStream to) throws IOException {
-    byte[] buf = new byte[BYTE_BUF_SIZE];
-    while (true) {
-      int r = from.read(buf);
-      if (r == -1) {
-        break;
-      }
-      to.write(buf, 0, r);
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java
deleted file mode 100644
index decfc3d..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.graphics.Canvas;
-import android.graphics.Color;
-import android.graphics.Paint;
-import android.graphics.Paint.Align;
-import android.graphics.Paint.Style;
-import android.graphics.Rect;
-import android.graphics.Typeface;
-import java.util.Vector;
-
-/**
- * A class that encapsulates the tedious bits of rendering legible, bordered text onto a canvas.
- */
-public class BorderedText {
-  private final Paint interiorPaint;
-  private final Paint exteriorPaint;
-
-  private final float textSize;
-
-  /**
-   * Creates a left-aligned bordered text object with a white interior, and a black exterior with
-   * the specified text size.
-   *
-   * @param textSize text size in pixels
-   */
-  public BorderedText(final float textSize) {
-    this(Color.WHITE, Color.BLACK, textSize);
-  }
-
-  /**
-   * Create a bordered text object with the specified interior and exterior colors, text size and
-   * alignment.
-   *
-   * @param interiorColor the interior text color
-   * @param exteriorColor the exterior text color
-   * @param textSize text size in pixels
-   */
-  public BorderedText(final int interiorColor, final int exteriorColor, final float textSize) {
-    interiorPaint = new Paint();
-    interiorPaint.setTextSize(textSize);
-    interiorPaint.setColor(interiorColor);
-    interiorPaint.setStyle(Style.FILL);
-    interiorPaint.setAntiAlias(false);
-    interiorPaint.setAlpha(255);
-
-    exteriorPaint = new Paint();
-    exteriorPaint.setTextSize(textSize);
-    exteriorPaint.setColor(exteriorColor);
-    exteriorPaint.setStyle(Style.FILL_AND_STROKE);
-    exteriorPaint.setStrokeWidth(textSize / 8);
-    exteriorPaint.setAntiAlias(false);
-    exteriorPaint.setAlpha(255);
-
-    this.textSize = textSize;
-  }
-
-  public void setTypeface(Typeface typeface) {
-    interiorPaint.setTypeface(typeface);
-    exteriorPaint.setTypeface(typeface);
-  }
-
-  public void drawText(final Canvas canvas, final float posX, final float posY, final String text) {
-    canvas.drawText(text, posX, posY, exteriorPaint);
-    canvas.drawText(text, posX, posY, interiorPaint);
-  }
-
-  public void drawLines(Canvas canvas, final float posX, final float posY, Vector<String> lines) {
-    int lineNum = 0;
-    for (final String line : lines) {
-      drawText(canvas, posX, posY - getTextSize() * (lines.size() - lineNum - 1), line);
-      ++lineNum;
-    }
-  }
-
-  public void setInteriorColor(final int color) {
-    interiorPaint.setColor(color);
-  }
-
-  public void setExteriorColor(final int color) {
-    exteriorPaint.setColor(color);
-  }
-
-  public float getTextSize() {
-    return textSize;
-  }
-
-  public void setAlpha(final int alpha) {
-    interiorPaint.setAlpha(alpha);
-    exteriorPaint.setAlpha(alpha);
-  }
-
-  public void getTextBounds(
-      final String line, final int index, final int count, final Rect lineBounds) {
-    interiorPaint.getTextBounds(line, index, count, lineBounds);
-  }
-
-  public void setTextAlign(final Align align) {
-    interiorPaint.setTextAlign(align);
-    exteriorPaint.setTextAlign(align);
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java
deleted file mode 100644
index e02c655..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java
+++ /dev/null
@@ -1,344 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.graphics.Bitmap;
-import android.graphics.Matrix;
-import android.os.Environment;
-import java.io.File;
-import java.io.FileOutputStream;
-
-/**
- * Utility class for manipulating images.
- **/
-public class ImageUtils {
-  @SuppressWarnings("unused")
-  private static final Logger LOGGER = new Logger();
-
-  static {
-    try {
-      System.loadLibrary("tensorflow_demo");
-    } catch (UnsatisfiedLinkError e) {
-      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
-    }
-  }
-
-  /**
-   * Utility method to compute the allocated size in bytes of a YUV420SP image
-   * of the given dimensions.
-   */
-  public static int getYUVByteSize(final int width, final int height) {
-    // The luminance plane requires 1 byte per pixel.
-    final int ySize = width * height;
-
-    // The UV plane works on 2x2 blocks, so dimensions with odd size must be rounded up.
-    // Each 2x2 block takes 2 bytes to encode, one each for U and V.
-    final int uvSize = ((width + 1) / 2) * ((height + 1) / 2) * 2;
-
-    return ySize + uvSize;
-  }
-
-  /**
-   * Saves a Bitmap object to disk for analysis.
-   *
-   * @param bitmap The bitmap to save.
-   */
-  public static void saveBitmap(final Bitmap bitmap) {
-    saveBitmap(bitmap, "preview.png");
-  }
-
-  /**
-   * Saves a Bitmap object to disk for analysis.
-   *
-   * @param bitmap The bitmap to save.
-   * @param filename The location to save the bitmap to.
-   */
-  public static void saveBitmap(final Bitmap bitmap, final String filename) {
-    final String root =
-        Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "tensorflow";
-    LOGGER.i("Saving %dx%d bitmap to %s.", bitmap.getWidth(), bitmap.getHeight(), root);
-    final File myDir = new File(root);
-
-    if (!myDir.mkdirs()) {
-      LOGGER.i("Make dir failed");
-    }
-
-    final String fname = filename;
-    final File file = new File(myDir, fname);
-    if (file.exists()) {
-      file.delete();
-    }
-    try {
-      final FileOutputStream out = new FileOutputStream(file);
-      bitmap.compress(Bitmap.CompressFormat.PNG, 99, out);
-      out.flush();
-      out.close();
-    } catch (final Exception e) {
-      LOGGER.e(e, "Exception!");
-    }
-  }
-
-  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
-  // are normalized to eight bits.
-  static final int kMaxChannelValue = 262143;
-
-  // Always prefer the native implementation if available.
-  private static boolean useNativeConversion = false;
-
-  public static void convertYUV420SPToARGB8888(
-      byte[] input,
-      int width,
-      int height,
-      int[] output) {
-    if (useNativeConversion) {
-      try {
-        ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false);
-        return;
-      } catch (UnsatisfiedLinkError e) {
-        LOGGER.w(
-            "Native YUV420SP -> RGB implementation not found, falling back to Java implementation");
-        useNativeConversion = false;
-      }
-    }
-
-    // Java implementation of YUV420SP to ARGB8888 converting
-    final int frameSize = width * height;
-    for (int j = 0, yp = 0; j < height; j++) {
-      int uvp = frameSize + (j >> 1) * width;
-      int u = 0;
-      int v = 0;
-
-      for (int i = 0; i < width; i++, yp++) {
-        int y = 0xff & input[yp];
-        if ((i & 1) == 0) {
-          v = 0xff & input[uvp++];
-          u = 0xff & input[uvp++];
-        }
-
-        output[yp] = YUV2RGB(y, u, v);
-      }
-    }
-  }
-
-  private static int YUV2RGB(int y, int u, int v) {
-    // Adjust and check YUV values
-    y = (y - 16) < 0 ? 0 : (y - 16);
-    u -= 128;
-    v -= 128;
-
-    // This is the floating point equivalent. We do the conversion in integer
-    // because some Android devices do not have floating point in hardware.
-    // nR = (int)(1.164 * nY + 2.018 * nU);
-    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
-    // nB = (int)(1.164 * nY + 1.596 * nV);
-    int y1192 = 1192 * y;
-    int r = (y1192 + 1634 * v);
-    int g = (y1192 - 833 * v - 400 * u);
-    int b = (y1192 + 2066 * u);
-
-    // Clipping RGB values to be inside boundaries [ 0 , kMaxChannelValue ]
-    r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r);
-    g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g);
-    b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b);
-
-    return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff);
-  }
-
-
-  public static void convertYUV420ToARGB8888(
-      byte[] yData,
-      byte[] uData,
-      byte[] vData,
-      int width,
-      int height,
-      int yRowStride,
-      int uvRowStride,
-      int uvPixelStride,
-      int[] out) {
-    if (useNativeConversion) {
-      try {
-        convertYUV420ToARGB8888(
-            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
-        return;
-      } catch (UnsatisfiedLinkError e) {
-        LOGGER.w(
-            "Native YUV420 -> RGB implementation not found, falling back to Java implementation");
-        useNativeConversion = false;
-      }
-    }
-
-    int yp = 0;
-    for (int j = 0; j < height; j++) {
-      int pY = yRowStride * j;
-      int pUV = uvRowStride * (j >> 1);
-
-      for (int i = 0; i < width; i++) {
-        int uv_offset = pUV + (i >> 1) * uvPixelStride;
-
-        out[yp++] = YUV2RGB(
-            0xff & yData[pY + i],
-            0xff & uData[uv_offset],
-            0xff & vData[uv_offset]);
-      }
-    }
-  }
-
-
-  /**
-   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
-   * input and output must already be allocated and non-null. For efficiency, no error checking is
-   * performed.
-   *
-   * @param input The array of YUV 4:2:0 input data.
-   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
-   * @param width The width of the input image.
-   * @param height The height of the input image.
-   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
-   */
-  private static native void convertYUV420SPToARGB8888(
-      byte[] input, int[] output, int width, int height, boolean halfSize);
-
-  /**
-   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
-   * and height. The input and output must already be allocated and non-null.
-   * For efficiency, no error checking is performed.
-   *
-   * @param y
-   * @param u
-   * @param v
-   * @param uvPixelStride
-   * @param width The width of the input image.
-   * @param height The height of the input image.
-   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
-   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
-   */
-  private static native void convertYUV420ToARGB8888(
-      byte[] y,
-      byte[] u,
-      byte[] v,
-      int[] output,
-      int width,
-      int height,
-      int yRowStride,
-      int uvRowStride,
-      int uvPixelStride,
-      boolean halfSize);
-
-  /**
-   * Converts YUV420 semi-planar data to RGB 565 data using the supplied width
-   * and height. The input and output must already be allocated and non-null.
-   * For efficiency, no error checking is performed.
-   *
-   * @param input The array of YUV 4:2:0 input data.
-   * @param output A pre-allocated array for the RGB 5:6:5 output data.
-   * @param width The width of the input image.
-   * @param height The height of the input image.
-   */
-  private static native void convertYUV420SPToRGB565(
-      byte[] input, byte[] output, int width, int height);
-
-  /**
-   * Converts 32-bit ARGB8888 image data to YUV420SP data.  This is useful, for
-   * instance, in creating data to feed the classes that rely on raw camera
-   * preview frames.
-   *
-   * @param input An array of input pixels in ARGB8888 format.
-   * @param output A pre-allocated array for the YUV420SP output data.
-   * @param width The width of the input image.
-   * @param height The height of the input image.
-   */
-  private static native void convertARGB8888ToYUV420SP(
-      int[] input, byte[] output, int width, int height);
-
-  /**
-   * Converts 16-bit RGB565 image data to YUV420SP data.  This is useful, for
-   * instance, in creating data to feed the classes that rely on raw camera
-   * preview frames.
-   *
-   * @param input An array of input pixels in RGB565 format.
-   * @param output A pre-allocated array for the YUV420SP output data.
-   * @param width The width of the input image.
-   * @param height The height of the input image.
-   */
-  private static native void convertRGB565ToYUV420SP(
-      byte[] input, byte[] output, int width, int height);
-
-  /**
-   * Returns a transformation matrix from one reference frame into another.
-   * Handles cropping (if maintaining aspect ratio is desired) and rotation.
-   *
-   * @param srcWidth Width of source frame.
-   * @param srcHeight Height of source frame.
-   * @param dstWidth Width of destination frame.
-   * @param dstHeight Height of destination frame.
-   * @param applyRotation Amount of rotation to apply from one frame to another.
-   *  Must be a multiple of 90.
-   * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
-   * cropping the image if necessary.
-   * @return The transformation fulfilling the desired requirements.
-   */
-  public static Matrix getTransformationMatrix(
-      final int srcWidth,
-      final int srcHeight,
-      final int dstWidth,
-      final int dstHeight,
-      final int applyRotation,
-      final boolean maintainAspectRatio) {
-    final Matrix matrix = new Matrix();
-
-    if (applyRotation != 0) {
-      if (applyRotation % 90 != 0) {
-        LOGGER.w("Rotation of %d % 90 != 0", applyRotation);
-      }
-
-      // Translate so center of image is at origin.
-      matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
-
-      // Rotate around origin.
-      matrix.postRotate(applyRotation);
-    }
-
-    // Account for the already applied rotation, if any, and then determine how
-    // much scaling is needed for each axis.
-    final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
-
-    final int inWidth = transpose ? srcHeight : srcWidth;
-    final int inHeight = transpose ? srcWidth : srcHeight;
-
-    // Apply scaling if necessary.
-    if (inWidth != dstWidth || inHeight != dstHeight) {
-      final float scaleFactorX = dstWidth / (float) inWidth;
-      final float scaleFactorY = dstHeight / (float) inHeight;
-
-      if (maintainAspectRatio) {
-        // Scale by minimum factor so that dst is filled completely while
-        // maintaining the aspect ratio. Some image may fall off the edge.
-        final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
-        matrix.postScale(scaleFactor, scaleFactor);
-      } else {
-        // Scale exactly to fill dst from src.
-        matrix.postScale(scaleFactorX, scaleFactorY);
-      }
-    }
-
-    if (applyRotation != 0) {
-      // Translate back from origin centered reference to destination frame.
-      matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
-    }
-
-    return matrix;
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java
deleted file mode 100644
index 0d98409..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.util.Log;
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * Wrapper for the platform log function, allows convenient message prefixing and log disabling.
- */
-public final class Logger {
-  private static final String DEFAULT_TAG = "tensorflow";
-  private static final int DEFAULT_MIN_LOG_LEVEL = Log.DEBUG;
-
-  // Classes to be ignored when examining the stack trace
-  private static final Set<String> IGNORED_CLASS_NAMES;
-
-  static {
-    IGNORED_CLASS_NAMES = new HashSet<String>(3);
-    IGNORED_CLASS_NAMES.add("dalvik.system.VMStack");
-    IGNORED_CLASS_NAMES.add("java.lang.Thread");
-    IGNORED_CLASS_NAMES.add(Logger.class.getCanonicalName());
-  }
-
-  private final String tag;
-  private final String messagePrefix;
-  private int minLogLevel = DEFAULT_MIN_LOG_LEVEL;
-
-  /**
-   * Creates a Logger using the class name as the message prefix.
-   *
-   * @param clazz the simple name of this class is used as the message prefix.
-   */
-  public Logger(final Class<?> clazz) {
-    this(clazz.getSimpleName());
-  }
-
-  /**
-   * Creates a Logger using the specified message prefix.
-   *
-   * @param messagePrefix is prepended to the text of every message.
-   */
-  public Logger(final String messagePrefix) {
-    this(DEFAULT_TAG, messagePrefix);
-  }
-
-  /**
-   * Creates a Logger with a custom tag and a custom message prefix. If the message prefix
-   * is set to <pre>null</pre>, the caller's class name is used as the prefix.
-   *
-   * @param tag identifies the source of a log message.
-   * @param messagePrefix prepended to every message if non-null. If null, the name of the caller is
-   *                      being used
-   */
-  public Logger(final String tag, final String messagePrefix) {
-    this.tag = tag;
-    final String prefix = messagePrefix == null ? getCallerSimpleName() : messagePrefix;
-    this.messagePrefix = (prefix.length() > 0) ? prefix + ": " : prefix;
-  }
-
-  /**
-   * Creates a Logger using the caller's class name as the message prefix.
-   */
-  public Logger() {
-    this(DEFAULT_TAG, null);
-  }
-
-  /**
-   * Creates a Logger using the caller's class name as the message prefix.
-   */
-  public Logger(final int minLogLevel) {
-    this(DEFAULT_TAG, null);
-    this.minLogLevel = minLogLevel;
-  }
-
-  public void setMinLogLevel(final int minLogLevel) {
-    this.minLogLevel = minLogLevel;
-  }
-
-  public boolean isLoggable(final int logLevel) {
-    return logLevel >= minLogLevel || Log.isLoggable(tag, logLevel);
-  }
-
-  /**
-   * Return caller's simple name.
-   *
-   * Android getStackTrace() returns an array that looks like this:
-   *     stackTrace[0]: dalvik.system.VMStack
-   *     stackTrace[1]: java.lang.Thread
-   *     stackTrace[2]: com.google.android.apps.unveil.env.UnveilLogger
-   *     stackTrace[3]: com.google.android.apps.unveil.BaseApplication
-   *
-   * This function returns the simple version of the first non-filtered name.
-   *
-   * @return caller's simple name
-   */
-  private static String getCallerSimpleName() {
-    // Get the current callstack so we can pull the class of the caller off of it.
-    final StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
-
-    for (final StackTraceElement elem : stackTrace) {
-      final String className = elem.getClassName();
-      if (!IGNORED_CLASS_NAMES.contains(className)) {
-        // We're only interested in the simple name of the class, not the complete package.
-        final String[] classParts = className.split("\\.");
-        return classParts[classParts.length - 1];
-      }
-    }
-
-    return Logger.class.getSimpleName();
-  }
-
-  private String toMessage(final String format, final Object... args) {
-    return messagePrefix + (args.length > 0 ? String.format(format, args) : format);
-  }
-
-  public void v(final String format, final Object... args) {
-    if (isLoggable(Log.VERBOSE)) {
-      Log.v(tag, toMessage(format, args));
-    }
-  }
-
-  public void v(final Throwable t, final String format, final Object... args) {
-    if (isLoggable(Log.VERBOSE)) {
-      Log.v(tag, toMessage(format, args), t);
-    }
-  }
-
-  public void d(final String format, final Object... args) {
-    if (isLoggable(Log.DEBUG)) {
-      Log.d(tag, toMessage(format, args));
-    }
-  }
-
-  public void d(final Throwable t, final String format, final Object... args) {
-    if (isLoggable(Log.DEBUG)) {
-      Log.d(tag, toMessage(format, args), t);
-    }
-  }
-
-  public void i(final String format, final Object... args) {
-    if (isLoggable(Log.INFO)) {
-      Log.i(tag, toMessage(format, args));
-    }
-  }
-
-  public void i(final Throwable t, final String format, final Object... args) {
-    if (isLoggable(Log.INFO)) {
-      Log.i(tag, toMessage(format, args), t);
-    }
-  }
-
-  public void w(final String format, final Object... args) {
-    if (isLoggable(Log.WARN)) {
-      Log.w(tag, toMessage(format, args));
-    }
-  }
-
-  public void w(final Throwable t, final String format, final Object... args) {
-    if (isLoggable(Log.WARN)) {
-      Log.w(tag, toMessage(format, args), t);
-    }
-  }
-
-  public void e(final String format, final Object... args) {
-    if (isLoggable(Log.ERROR)) {
-      Log.e(tag, toMessage(format, args));
-    }
-  }
-
-  public void e(final Throwable t, final String format, final Object... args) {
-    if (isLoggable(Log.ERROR)) {
-      Log.e(tag, toMessage(format, args), t);
-    }
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java
deleted file mode 100644
index ef15d14..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.graphics.Bitmap;
-import android.text.TextUtils;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Size class independent of a Camera object.
- */
-public class Size implements Comparable<Size>, Serializable {
-
-  // 1.4 went out with this UID so we'll need to maintain it to preserve pending queries when
-  // upgrading.
-  public static final long serialVersionUID = 7689808733290872361L;
-
-  public final int width;
-  public final int height;
-
-  public Size(final int width, final int height) {
-    this.width = width;
-    this.height = height;
-  }
-
-  public Size(final Bitmap bmp) {
-    this.width = bmp.getWidth();
-    this.height = bmp.getHeight();
-  }
-
-  /**
-   * Rotate a size by the given number of degrees.
-   * @param size Size to rotate.
-   * @param rotation Degrees {0, 90, 180, 270} to rotate the size.
-   * @return Rotated size.
-   */
-  public static Size getRotatedSize(final Size size, final int rotation) {
-    if (rotation % 180 != 0) {
-      // The phone is portrait, therefore the camera is sideways and frame should be rotated.
-      return new Size(size.height, size.width);
-    }
-    return size;
-  }
-
-  public static Size parseFromString(String sizeString) {
-    if (TextUtils.isEmpty(sizeString)) {
-      return null;
-    }
-
-    sizeString = sizeString.trim();
-
-    // The expected format is "<width>x<height>".
-    final String[] components = sizeString.split("x");
-    if (components.length == 2) {
-      try {
-        final int width = Integer.parseInt(components[0]);
-        final int height = Integer.parseInt(components[1]);
-        return new Size(width, height);
-      } catch (final NumberFormatException e) {
-        return null;
-      }
-    } else {
-      return null;
-    }
-  }
-
-  public static List<Size> sizeStringToList(final String sizes) {
-    final List<Size> sizeList = new ArrayList<Size>();
-    if (sizes != null) {
-      final String[] pairs = sizes.split(",");
-      for (final String pair : pairs) {
-        final Size size = Size.parseFromString(pair);
-        if (size != null) {
-          sizeList.add(size);
-        }
-      }
-    }
-    return sizeList;
-  }
-
-  public static String sizeListToString(final List<Size> sizes) {
-    String sizesString = "";
-    if (sizes != null && sizes.size() > 0) {
-      sizesString = sizes.get(0).toString();
-      for (int i = 1; i < sizes.size(); i++) {
-        sizesString += "," + sizes.get(i).toString();
-      }
-    }
-    return sizesString;
-  }
-
-  public final float aspectRatio() {
-    return (float) width / (float) height;
-  }
-
-  @Override
-  public int compareTo(final Size other) {
-    return width * height - other.width * other.height;
-  }
-
-  @Override
-  public boolean equals(final Object other) {
-    if (other == null) {
-      return false;
-    }
-
-    if (!(other instanceof Size)) {
-      return false;
-    }
-
-    final Size otherSize = (Size) other;
-    return (width == otherSize.width && height == otherSize.height);
-  }
-
-  @Override
-  public int hashCode() {
-    return width * 32713 + height;
-  }
-
-  @Override
-  public String toString() {
-    return dimensionsAsString(width, height);
-  }
-
-  public static final String dimensionsAsString(final int width, final int height) {
-    return width + "x" + height;
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java
deleted file mode 100644
index 459b0a0..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.env;
-
-import android.os.SystemClock;
-
-/**
- * A simple utility timer for measuring CPU time and wall-clock splits.
- */
-public class SplitTimer {
-  private final Logger logger;
-
-  private long lastWallTime;
-  private long lastCpuTime;
-
-  public SplitTimer(final String name) {
-    logger = new Logger(name);
-    newSplit();
-  }
-
-  public void newSplit() {
-    lastWallTime = SystemClock.uptimeMillis();
-    lastCpuTime = SystemClock.currentThreadTimeMillis();
-  }
-
-  public void endSplit(final String splitName) {
-    final long currWallTime = SystemClock.uptimeMillis();
-    final long currCpuTime = SystemClock.currentThreadTimeMillis();
-
-    logger.i(
-        "%s: cpu=%dms wall=%dms",
-        splitName, currCpuTime - lastCpuTime, currWallTime - lastWallTime);
-
-    lastWallTime = currWallTime;
-    lastCpuTime = currCpuTime;
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java
deleted file mode 100644
index af6af2b..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ /dev/null
@@ -1,421 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.tracking;
-
-import android.content.Context;
-import android.graphics.Canvas;
-import android.graphics.Color;
-import android.graphics.Matrix;
-import android.graphics.Paint;
-import android.graphics.Paint.Cap;
-import android.graphics.Paint.Join;
-import android.graphics.Paint.Style;
-import android.graphics.RectF;
-import android.text.TextUtils;
-import android.util.Pair;
-import android.util.TypedValue;
-import android.widget.Toast;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Queue;
-import org.tensorflow.demo.Classifier.Recognition;
-import org.tensorflow.demo.env.BorderedText;
-import org.tensorflow.demo.env.ImageUtils;
-import org.tensorflow.demo.env.Logger;
-
-/**
- * A tracker wrapping ObjectTracker that also handles non-max suppression and matching existing
- * objects to new detections.
- */
-public class MultiBoxTracker {
-  private final Logger logger = new Logger();
-
-  private static final float TEXT_SIZE_DIP = 18;
-
-  // Maximum percentage of a box that can be overlapped by another box at detection time. Otherwise
-  // the lower scored box (new or old) will be removed.
-  private static final float MAX_OVERLAP = 0.2f;
-
-  private static final float MIN_SIZE = 16.0f;
-
-  // Allow replacement of the tracked box with new results if
-  // correlation has dropped below this level.
-  private static final float MARGINAL_CORRELATION = 0.75f;
-
-  // Consider object to be lost if correlation falls below this threshold.
-  private static final float MIN_CORRELATION = 0.3f;
-
-  private static final int[] COLORS = {
-    Color.BLUE, Color.RED, Color.GREEN, Color.YELLOW, Color.CYAN, Color.MAGENTA, Color.WHITE,
-    Color.parseColor("#55FF55"), Color.parseColor("#FFA500"), Color.parseColor("#FF8888"),
-    Color.parseColor("#AAAAFF"), Color.parseColor("#FFFFAA"), Color.parseColor("#55AAAA"),
-    Color.parseColor("#AA33AA"), Color.parseColor("#0D0068")
-  };
-
-  private final Queue<Integer> availableColors = new LinkedList<Integer>();
-
-  public ObjectTracker objectTracker;
-
-  final List<Pair<Float, RectF>> screenRects = new LinkedList<Pair<Float, RectF>>();
-
-  private static class TrackedRecognition {
-    ObjectTracker.TrackedObject trackedObject;
-    RectF location;
-    float detectionConfidence;
-    int color;
-    String title;
-  }
-
-  private final List<TrackedRecognition> trackedObjects = new LinkedList<TrackedRecognition>();
-
-  private final Paint boxPaint = new Paint();
-
-  private final float textSizePx;
-  private final BorderedText borderedText;
-
-  private Matrix frameToCanvasMatrix;
-
-  private int frameWidth;
-  private int frameHeight;
-
-  private int sensorOrientation;
-  private Context context;
-
-  public MultiBoxTracker(final Context context) {
-    this.context = context;
-    for (final int color : COLORS) {
-      availableColors.add(color);
-    }
-
-    boxPaint.setColor(Color.RED);
-    boxPaint.setStyle(Style.STROKE);
-    boxPaint.setStrokeWidth(12.0f);
-    boxPaint.setStrokeCap(Cap.ROUND);
-    boxPaint.setStrokeJoin(Join.ROUND);
-    boxPaint.setStrokeMiter(100);
-
-    textSizePx =
-        TypedValue.applyDimension(
-            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
-    borderedText = new BorderedText(textSizePx);
-  }
-
-  private Matrix getFrameToCanvasMatrix() {
-    return frameToCanvasMatrix;
-  }
-
-  public synchronized void drawDebug(final Canvas canvas) {
-    final Paint textPaint = new Paint();
-    textPaint.setColor(Color.WHITE);
-    textPaint.setTextSize(60.0f);
-
-    final Paint boxPaint = new Paint();
-    boxPaint.setColor(Color.RED);
-    boxPaint.setAlpha(200);
-    boxPaint.setStyle(Style.STROKE);
-
-    for (final Pair<Float, RectF> detection : screenRects) {
-      final RectF rect = detection.second;
-      canvas.drawRect(rect, boxPaint);
-      canvas.drawText("" + detection.first, rect.left, rect.top, textPaint);
-      borderedText.drawText(canvas, rect.centerX(), rect.centerY(), "" + detection.first);
-    }
-
-    if (objectTracker == null) {
-      return;
-    }
-
-    // Draw correlations.
-    for (final TrackedRecognition recognition : trackedObjects) {
-      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
-
-      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
-
-      if (getFrameToCanvasMatrix().mapRect(trackedPos)) {
-        final String labelString = String.format("%.2f", trackedObject.getCurrentCorrelation());
-        borderedText.drawText(canvas, trackedPos.right, trackedPos.bottom, labelString);
-      }
-    }
-
-    final Matrix matrix = getFrameToCanvasMatrix();
-    objectTracker.drawDebug(canvas, matrix);
-  }
-
-  public synchronized void trackResults(
-      final List<Recognition> results, final byte[] frame, final long timestamp) {
-    logger.i("Processing %d results from %d", results.size(), timestamp);
-    processResults(timestamp, results, frame);
-  }
-
-  public synchronized void draw(final Canvas canvas) {
-    final boolean rotated = sensorOrientation % 180 == 90;
-    final float multiplier =
-        Math.min(canvas.getHeight() / (float) (rotated ? frameWidth : frameHeight),
-                 canvas.getWidth() / (float) (rotated ? frameHeight : frameWidth));
-    frameToCanvasMatrix =
-        ImageUtils.getTransformationMatrix(
-            frameWidth,
-            frameHeight,
-            (int) (multiplier * (rotated ? frameHeight : frameWidth)),
-            (int) (multiplier * (rotated ? frameWidth : frameHeight)),
-            sensorOrientation,
-            false);
-    for (final TrackedRecognition recognition : trackedObjects) {
-      final RectF trackedPos =
-          (objectTracker != null)
-              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
-              : new RectF(recognition.location);
-
-      getFrameToCanvasMatrix().mapRect(trackedPos);
-      boxPaint.setColor(recognition.color);
-
-      final float cornerSize = Math.min(trackedPos.width(), trackedPos.height()) / 8.0f;
-      canvas.drawRoundRect(trackedPos, cornerSize, cornerSize, boxPaint);
-
-      final String labelString =
-          !TextUtils.isEmpty(recognition.title)
-              ? String.format("%s %.2f", recognition.title, recognition.detectionConfidence)
-              : String.format("%.2f", recognition.detectionConfidence);
-      borderedText.drawText(canvas, trackedPos.left + cornerSize, trackedPos.bottom, labelString);
-    }
-  }
-
-  private boolean initialized = false;
-
-  public synchronized void onFrame(
-      final int w,
-      final int h,
-      final int rowStride,
-      final int sensorOrientation,
-      final byte[] frame,
-      final long timestamp) {
-    if (objectTracker == null && !initialized) {
-      ObjectTracker.clearInstance();
-
-      logger.i("Initializing ObjectTracker: %dx%d", w, h);
-      objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
-      frameWidth = w;
-      frameHeight = h;
-      this.sensorOrientation = sensorOrientation;
-      initialized = true;
-
-      if (objectTracker == null) {
-        String message =
-            "Object tracking support not found. "
-                + "See tensorflow/examples/android/README.md for details.";
-        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
-        logger.e(message);
-      }
-    }
-
-    if (objectTracker == null) {
-      return;
-    }
-
-    objectTracker.nextFrame(frame, null, timestamp, null, true);
-
-    // Clean up any objects not worth tracking any more.
-    final LinkedList<TrackedRecognition> copyList =
-        new LinkedList<TrackedRecognition>(trackedObjects);
-    for (final TrackedRecognition recognition : copyList) {
-      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
-      final float correlation = trackedObject.getCurrentCorrelation();
-      if (correlation < MIN_CORRELATION) {
-        logger.v("Removing tracked object %s because NCC is %.2f", trackedObject, correlation);
-        trackedObject.stopTracking();
-        trackedObjects.remove(recognition);
-
-        availableColors.add(recognition.color);
-      }
-    }
-  }
-
-  private void processResults(
-      final long timestamp, final List<Recognition> results, final byte[] originalFrame) {
-    final List<Pair<Float, Recognition>> rectsToTrack = new LinkedList<Pair<Float, Recognition>>();
-
-    screenRects.clear();
-    final Matrix rgbFrameToScreen = new Matrix(getFrameToCanvasMatrix());
-
-    for (final Recognition result : results) {
-      if (result.getLocation() == null) {
-        continue;
-      }
-      final RectF detectionFrameRect = new RectF(result.getLocation());
-
-      final RectF detectionScreenRect = new RectF();
-      rgbFrameToScreen.mapRect(detectionScreenRect, detectionFrameRect);
-
-      logger.v(
-          "Result! Frame: " + result.getLocation() + " mapped to screen:" + detectionScreenRect);
-
-      screenRects.add(new Pair<Float, RectF>(result.getConfidence(), detectionScreenRect));
-
-      if (detectionFrameRect.width() < MIN_SIZE || detectionFrameRect.height() < MIN_SIZE) {
-        logger.w("Degenerate rectangle! " + detectionFrameRect);
-        continue;
-      }
-
-      rectsToTrack.add(new Pair<Float, Recognition>(result.getConfidence(), result));
-    }
-
-    if (rectsToTrack.isEmpty()) {
-      logger.v("Nothing to track, aborting.");
-      return;
-    }
-
-    if (objectTracker == null) {
-      trackedObjects.clear();
-      for (final Pair<Float, Recognition> potential : rectsToTrack) {
-        final TrackedRecognition trackedRecognition = new TrackedRecognition();
-        trackedRecognition.detectionConfidence = potential.first;
-        trackedRecognition.location = new RectF(potential.second.getLocation());
-        trackedRecognition.trackedObject = null;
-        trackedRecognition.title = potential.second.getTitle();
-        trackedRecognition.color = COLORS[trackedObjects.size()];
-        trackedObjects.add(trackedRecognition);
-
-        if (trackedObjects.size() >= COLORS.length) {
-          break;
-        }
-      }
-      return;
-    }
-
-    logger.i("%d rects to track", rectsToTrack.size());
-    for (final Pair<Float, Recognition> potential : rectsToTrack) {
-      handleDetection(originalFrame, timestamp, potential);
-    }
-  }
-
-  private void handleDetection(
-      final byte[] frameCopy, final long timestamp, final Pair<Float, Recognition> potential) {
-    final ObjectTracker.TrackedObject potentialObject =
-        objectTracker.trackObject(potential.second.getLocation(), timestamp, frameCopy);
-
-    final float potentialCorrelation = potentialObject.getCurrentCorrelation();
-    logger.v(
-        "Tracked object went from %s to %s with correlation %.2f",
-        potential.second, potentialObject.getTrackedPositionInPreviewFrame(), potentialCorrelation);
-
-    if (potentialCorrelation < MARGINAL_CORRELATION) {
-      logger.v("Correlation too low to begin tracking %s.", potentialObject);
-      potentialObject.stopTracking();
-      return;
-    }
-
-    final List<TrackedRecognition> removeList = new LinkedList<TrackedRecognition>();
-
-    float maxIntersect = 0.0f;
-
-    // This is the current tracked object whose color we will take. If left null we'll take the
-    // first one from the color queue.
-    TrackedRecognition recogToReplace = null;
-
-    // Look for intersections that will be overridden by this object or an intersection that would
-    // prevent this one from being placed.
-    for (final TrackedRecognition trackedRecognition : trackedObjects) {
-      final RectF a = trackedRecognition.trackedObject.getTrackedPositionInPreviewFrame();
-      final RectF b = potentialObject.getTrackedPositionInPreviewFrame();
-      final RectF intersection = new RectF();
-      final boolean intersects = intersection.setIntersect(a, b);
-
-      final float intersectArea = intersection.width() * intersection.height();
-      final float totalArea = a.width() * a.height() + b.width() * b.height() - intersectArea;
-      final float intersectOverUnion = intersectArea / totalArea;
-
-      // If there is an intersection with this currently tracked box above the maximum overlap
-      // percentage allowed, either the new recognition needs to be dismissed or the old
-      // recognition needs to be removed and possibly replaced with the new one.
-      if (intersects && intersectOverUnion > MAX_OVERLAP) {
-        if (potential.first < trackedRecognition.detectionConfidence
-            && trackedRecognition.trackedObject.getCurrentCorrelation() > MARGINAL_CORRELATION) {
-          // If track for the existing object is still going strong and the detection score was
-          // good, reject this new object.
-          potentialObject.stopTracking();
-          return;
-        } else {
-          removeList.add(trackedRecognition);
-
-          // Let the previously tracked object with max intersection amount donate its color to
-          // the new object.
-          if (intersectOverUnion > maxIntersect) {
-            maxIntersect = intersectOverUnion;
-            recogToReplace = trackedRecognition;
-          }
-        }
-      }
-    }
-
-    // If we're already tracking the max object and no intersections were found to bump off,
-    // pick the worst current tracked object to remove, if it's also worse than this candidate
-    // object.
-    if (availableColors.isEmpty() && removeList.isEmpty()) {
-      for (final TrackedRecognition candidate : trackedObjects) {
-        if (candidate.detectionConfidence < potential.first) {
-          if (recogToReplace == null
-              || candidate.detectionConfidence < recogToReplace.detectionConfidence) {
-            // Save it so that we use this color for the new object.
-            recogToReplace = candidate;
-          }
-        }
-      }
-      if (recogToReplace != null) {
-        logger.v("Found non-intersecting object to remove.");
-        removeList.add(recogToReplace);
-      } else {
-        logger.v("No non-intersecting object found to remove");
-      }
-    }
-
-    // Remove everything that got intersected.
-    for (final TrackedRecognition trackedRecognition : removeList) {
-      logger.v(
-          "Removing tracked object %s with detection confidence %.2f, correlation %.2f",
-          trackedRecognition.trackedObject,
-          trackedRecognition.detectionConfidence,
-          trackedRecognition.trackedObject.getCurrentCorrelation());
-      trackedRecognition.trackedObject.stopTracking();
-      trackedObjects.remove(trackedRecognition);
-      if (trackedRecognition != recogToReplace) {
-        availableColors.add(trackedRecognition.color);
-      }
-    }
-
-    if (recogToReplace == null && availableColors.isEmpty()) {
-      logger.e("No room to track this object, aborting.");
-      potentialObject.stopTracking();
-      return;
-    }
-
-    // Finally safe to say we can track this object.
-    logger.v(
-        "Tracking object %s (%s) with detection confidence %.2f at position %s",
-        potentialObject,
-        potential.second.getTitle(),
-        potential.first,
-        potential.second.getLocation());
-    final TrackedRecognition trackedRecognition = new TrackedRecognition();
-    trackedRecognition.detectionConfidence = potential.first;
-    trackedRecognition.trackedObject = potentialObject;
-    trackedRecognition.title = potential.second.getTitle();
-
-    // Use the color from a replaced object before taking one from the color queue.
-    trackedRecognition.color =
-        recogToReplace != null ? recogToReplace.color : availableColors.poll();
-    trackedObjects.add(trackedRecognition);
-  }
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java
deleted file mode 100644
index 8b4248d..0000000
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java
+++ /dev/null
@@ -1,661 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.demo.tracking;
-
-import android.graphics.Canvas;
-import android.graphics.Color;
-import android.graphics.Matrix;
-import android.graphics.Paint;
-import android.graphics.PointF;
-import android.graphics.RectF;
-import android.graphics.Typeface;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Vector;
-import javax.microedition.khronos.opengles.GL10;
-import org.tensorflow.demo.env.Logger;
-import org.tensorflow.demo.env.Size;
-
-/**
- * True object detector/tracker class that tracks objects across consecutive preview frames.
- * It provides a simplified Java interface to the analogous native object defined by
- * jni/client_vision/tracking/object_tracker.*.
- *
- * Currently, the ObjectTracker is a singleton due to native code restrictions, and so must
- * be allocated by ObjectTracker.getInstance(). In addition, release() should be called
- * as soon as the ObjectTracker is no longer needed, and before a new one is created.
- *
- * nextFrame() should be called as new frames become available, preferably as often as possible.
- *
- * After allocation, new TrackedObjects may be instantiated via trackObject(). TrackedObjects
- * are associated with the ObjectTracker that created them, and are only valid while that
- * ObjectTracker still exists.
- */
-public class ObjectTracker {
-  private static final Logger LOGGER = new Logger();
-
-  private static boolean libraryFound = false;
-
-  static {
-    try {
-      System.loadLibrary("tensorflow_demo");
-      libraryFound = true;
-    } catch (UnsatisfiedLinkError e) {
-      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
-    }
-  }
-
-  private static final boolean DRAW_TEXT = false;
-
-  /**
-   * How many history points to keep track of and draw in the red history line.
-   */
-  private static final int MAX_DEBUG_HISTORY_SIZE = 30;
-
-  /**
-   * How many frames of optical flow deltas to record.
-   * TODO(andrewharp): Push this down to the native level so it can be polled
-   * efficiently into a an array for upload, instead of keeping a duplicate
-   * copy in Java.
-   */
-  private static final int MAX_FRAME_HISTORY_SIZE = 200;
-
-  private static final int DOWNSAMPLE_FACTOR = 2;
-
-  private final byte[] downsampledFrame;
-
-  protected static ObjectTracker instance;
-
-  private final Map<String, TrackedObject> trackedObjects;
-
-  private long lastTimestamp;
-
-  private FrameChange lastKeypoints;
-
-  private final Vector<PointF> debugHistory;
-
-  private final LinkedList<TimestampedDeltas> timestampedDeltas;
-
-  protected final int frameWidth;
-  protected final int frameHeight;
-  private final int rowStride;
-  protected final boolean alwaysTrack;
-
-  private static class TimestampedDeltas {
-    final long timestamp;
-    final byte[] deltas;
-
-    public TimestampedDeltas(final long timestamp, final byte[] deltas) {
-      this.timestamp = timestamp;
-      this.deltas = deltas;
-    }
-  }
-
-  /**
-   * A simple class that records keypoint information, which includes
-   * local location, score and type. This will be used in calculating
-   * FrameChange.
-   */
-  public static class Keypoint {
-    public final float x;
-    public final float y;
-    public final float score;
-    public final int type;
-
-    public Keypoint(final float x, final float y) {
-      this.x = x;
-      this.y = y;
-      this.score = 0;
-      this.type = -1;
-    }
-
-    public Keypoint(final float x, final float y, final float score, final int type) {
-      this.x = x;
-      this.y = y;
-      this.score = score;
-      this.type = type;
-    }
-
-    Keypoint delta(final Keypoint other) {
-      return new Keypoint(this.x - other.x, this.y - other.y);
-    }
-  }
-
-  /**
-   * A simple class that could calculate Keypoint delta.
-   * This class will be used in calculating frame translation delta
-   * for optical flow.
-   */
-  public static class PointChange {
-    public final Keypoint keypointA;
-    public final Keypoint keypointB;
-    Keypoint pointDelta;
-    private final boolean wasFound;
-
-    public PointChange(final float x1, final float y1,
-                       final float x2, final float y2,
-                       final float score, final int type,
-                       final boolean wasFound) {
-      this.wasFound = wasFound;
-
-      keypointA = new Keypoint(x1, y1, score, type);
-      keypointB = new Keypoint(x2, y2);
-    }
-
-    public Keypoint getDelta() {
-      if (pointDelta == null) {
-        pointDelta = keypointB.delta(keypointA);
-      }
-      return pointDelta;
-    }
-  }
-
-  /** A class that records a timestamped frame translation delta for optical flow. */
-  public static class FrameChange {
-    public static final int KEYPOINT_STEP = 7;
-
-    public final Vector<PointChange> pointDeltas;
-
-    private final float minScore;
-    private final float maxScore;
-
-    public FrameChange(final float[] framePoints) {
-      float minScore = 100.0f;
-      float maxScore = -100.0f;
-
-      pointDeltas = new Vector<PointChange>(framePoints.length / KEYPOINT_STEP);
-
-      for (int i = 0; i < framePoints.length; i += KEYPOINT_STEP) {
-        final float x1 = framePoints[i + 0] * DOWNSAMPLE_FACTOR;
-        final float y1 = framePoints[i + 1] * DOWNSAMPLE_FACTOR;
-
-        final boolean wasFound = framePoints[i + 2] > 0.0f;
-
-        final float x2 = framePoints[i + 3] * DOWNSAMPLE_FACTOR;
-        final float y2 = framePoints[i + 4] * DOWNSAMPLE_FACTOR;
-        final float score = framePoints[i + 5];
-        final int type = (int) framePoints[i + 6];
-
-        minScore = Math.min(minScore, score);
-        maxScore = Math.max(maxScore, score);
-
-        pointDeltas.add(new PointChange(x1, y1, x2, y2, score, type, wasFound));
-      }
-
-      this.minScore = minScore;
-      this.maxScore = maxScore;
-    }
-  }
-
-  public static synchronized ObjectTracker getInstance(
-      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
-    if (!libraryFound) {
-      LOGGER.e(
-          "Native object tracking support not found. "
-              + "See tensorflow/examples/android/README.md for details.");
-      return null;
-    }
-
-    if (instance == null) {
-      instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
-      instance.init();
-    } else {
-      throw new RuntimeException(
-          "Tried to create a new objectracker before releasing the old one!");
-    }
-    return instance;
-  }
-
-  public static synchronized void clearInstance() {
-    if (instance != null) {
-      instance.release();
-    }
-  }
-
-  protected ObjectTracker(
-      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
-    this.frameWidth = frameWidth;
-    this.frameHeight = frameHeight;
-    this.rowStride = rowStride;
-    this.alwaysTrack = alwaysTrack;
-    this.timestampedDeltas = new LinkedList<TimestampedDeltas>();
-
-    trackedObjects = new HashMap<String, TrackedObject>();
-
-    debugHistory = new Vector<PointF>(MAX_DEBUG_HISTORY_SIZE);
-
-    downsampledFrame =
-        new byte
-            [(frameWidth + DOWNSAMPLE_FACTOR - 1)
-                / DOWNSAMPLE_FACTOR
-                * (frameWidth + DOWNSAMPLE_FACTOR - 1)
-                / DOWNSAMPLE_FACTOR];
-  }
-
-  protected void init() {
-    // The native tracker never sees the full frame, so pre-scale dimensions
-    // by the downsample factor.
-    initNative(frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, alwaysTrack);
-  }
-
-  private final float[] matrixValues = new float[9];
-
-  private long downsampledTimestamp;
-
-  @SuppressWarnings("unused")
-  public synchronized void drawOverlay(final GL10 gl,
-      final Size cameraViewSize, final Matrix matrix) {
-    final Matrix tempMatrix = new Matrix(matrix);
-    tempMatrix.preScale(DOWNSAMPLE_FACTOR, DOWNSAMPLE_FACTOR);
-    tempMatrix.getValues(matrixValues);
-    drawNative(cameraViewSize.width, cameraViewSize.height, matrixValues);
-  }
-
-  public synchronized void nextFrame(
-      final byte[] frameData, final byte[] uvData,
-      final long timestamp, final float[] transformationMatrix,
-      final boolean updateDebugInfo) {
-    if (downsampledTimestamp != timestamp) {
-      ObjectTracker.downsampleImageNative(
-          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
-      downsampledTimestamp = timestamp;
-    }
-
-    // Do Lucas Kanade using the fullframe initializer.
-    nextFrameNative(downsampledFrame, uvData, timestamp, transformationMatrix);
-
-    timestampedDeltas.add(new TimestampedDeltas(timestamp, getKeypointsPacked(DOWNSAMPLE_FACTOR)));
-    while (timestampedDeltas.size() > MAX_FRAME_HISTORY_SIZE) {
-      timestampedDeltas.removeFirst();
-    }
-
-    for (final TrackedObject trackedObject : trackedObjects.values()) {
-      trackedObject.updateTrackedPosition();
-    }
-
-    if (updateDebugInfo) {
-      updateDebugHistory();
-    }
-
-    lastTimestamp = timestamp;
-  }
-
-  public synchronized void release() {
-    releaseMemoryNative();
-    synchronized (ObjectTracker.class) {
-      instance = null;
-    }
-  }
-
-  private void drawHistoryDebug(final Canvas canvas) {
-    drawHistoryPoint(
-        canvas, frameWidth * DOWNSAMPLE_FACTOR / 2, frameHeight * DOWNSAMPLE_FACTOR / 2);
-  }
-
-  private void drawHistoryPoint(final Canvas canvas, final float startX, final float startY) {
-    final Paint p = new Paint();
-    p.setAntiAlias(false);
-    p.setTypeface(Typeface.SERIF);
-
-    p.setColor(Color.RED);
-    p.setStrokeWidth(2.0f);
-
-    // Draw the center circle.
-    p.setColor(Color.GREEN);
-    canvas.drawCircle(startX, startY, 3.0f, p);
-
-    p.setColor(Color.RED);
-
-    // Iterate through in backwards order.
-    synchronized (debugHistory) {
-      final int numPoints = debugHistory.size();
-      float lastX = startX;
-      float lastY = startY;
-      for (int keypointNum = 0; keypointNum < numPoints; ++keypointNum) {
-        final PointF delta = debugHistory.get(numPoints - keypointNum - 1);
-        final float newX = lastX + delta.x;
-        final float newY = lastY + delta.y;
-        canvas.drawLine(lastX, lastY, newX, newY, p);
-        lastX = newX;
-        lastY = newY;
-      }
-    }
-  }
-
-  private static int floatToChar(final float value) {
-    return Math.max(0, Math.min((int) (value * 255.999f), 255));
-  }
-
-  private void drawKeypointsDebug(final Canvas canvas) {
-    final Paint p = new Paint();
-    if (lastKeypoints == null) {
-      return;
-    }
-    final int keypointSize = 3;
-
-    final float minScore = lastKeypoints.minScore;
-    final float maxScore = lastKeypoints.maxScore;
-
-    for (final PointChange keypoint : lastKeypoints.pointDeltas) {
-      if (keypoint.wasFound) {
-        final int r =
-            floatToChar((keypoint.keypointA.score - minScore) / (maxScore - minScore));
-        final int b =
-            floatToChar(1.0f - (keypoint.keypointA.score - minScore) / (maxScore - minScore));
-
-        final int color = 0xFF000000 | (r << 16) | b;
-        p.setColor(color);
-
-        final float[] screenPoints = {keypoint.keypointA.x, keypoint.keypointA.y,
-                                      keypoint.keypointB.x, keypoint.keypointB.y};
-        canvas.drawRect(screenPoints[2] - keypointSize,
-                        screenPoints[3] - keypointSize,
-                        screenPoints[2] + keypointSize,
-                        screenPoints[3] + keypointSize, p);
-        p.setColor(Color.CYAN);
-        canvas.drawLine(screenPoints[2], screenPoints[3],
-                        screenPoints[0], screenPoints[1], p);
-
-        if (DRAW_TEXT) {
-          p.setColor(Color.WHITE);
-          canvas.drawText(keypoint.keypointA.type + ": " + keypoint.keypointA.score,
-              keypoint.keypointA.x, keypoint.keypointA.y, p);
-        }
-      } else {
-        p.setColor(Color.YELLOW);
-        final float[] screenPoint = {keypoint.keypointA.x, keypoint.keypointA.y};
-        canvas.drawCircle(screenPoint[0], screenPoint[1], 5.0f, p);
-      }
-    }
-  }
-
-  private synchronized PointF getAccumulatedDelta(final long timestamp, final float positionX,
-      final float positionY, final float radius) {
-    final RectF currPosition = getCurrentPosition(timestamp,
-        new RectF(positionX - radius, positionY - radius, positionX + radius, positionY + radius));
-    return new PointF(currPosition.centerX() - positionX, currPosition.centerY() - positionY);
-  }
-
-  private synchronized RectF getCurrentPosition(final long timestamp, final RectF
-      oldPosition) {
-    final RectF downscaledFrameRect = downscaleRect(oldPosition);
-
-    final float[] delta = new float[4];
-    getCurrentPositionNative(timestamp, downscaledFrameRect.left, downscaledFrameRect.top,
-        downscaledFrameRect.right, downscaledFrameRect.bottom, delta);
-
-    final RectF newPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
-
-    return upscaleRect(newPosition);
-  }
-
-  private void updateDebugHistory() {
-    lastKeypoints = new FrameChange(getKeypointsNative(false));
-
-    if (lastTimestamp == 0) {
-      return;
-    }
-
-    final PointF delta =
-        getAccumulatedDelta(
-            lastTimestamp, frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, 100);
-
-    synchronized (debugHistory) {
-      debugHistory.add(delta);
-
-      while (debugHistory.size() > MAX_DEBUG_HISTORY_SIZE) {
-        debugHistory.remove(0);
-      }
-    }
-  }
-
-  public synchronized void drawDebug(final Canvas canvas, final Matrix frameToCanvas) {
-    canvas.save();
-    canvas.setMatrix(frameToCanvas);
-
-    drawHistoryDebug(canvas);
-    drawKeypointsDebug(canvas);
-
-    canvas.restore();
-  }
-
-  public Vector<String> getDebugText() {
-    final Vector<String> lines = new Vector<String>();
-
-    if (lastKeypoints != null) {
-      lines.add("Num keypoints " + lastKeypoints.pointDeltas.size());
-      lines.add("Min score: " + lastKeypoints.minScore);
-      lines.add("Max score: " + lastKeypoints.maxScore);
-    }
-
-    return lines;
-  }
-
-  public synchronized List<byte[]> pollAccumulatedFlowData(final long endFrameTime) {
-    final List<byte[]> frameDeltas = new ArrayList<byte[]>();
-    while (timestampedDeltas.size() > 0) {
-      final TimestampedDeltas currentDeltas = timestampedDeltas.peek();
-      if (currentDeltas.timestamp <= endFrameTime) {
-        frameDeltas.add(currentDeltas.deltas);
-        timestampedDeltas.removeFirst();
-      } else {
-        break;
-      }
-    }
-
-    return frameDeltas;
-  }
-
-  private RectF downscaleRect(final RectF fullFrameRect) {
-    return new RectF(
-        fullFrameRect.left / DOWNSAMPLE_FACTOR,
-        fullFrameRect.top / DOWNSAMPLE_FACTOR,
-        fullFrameRect.right / DOWNSAMPLE_FACTOR,
-        fullFrameRect.bottom / DOWNSAMPLE_FACTOR);
-  }
-
-  private RectF upscaleRect(final RectF downsampledFrameRect) {
-    return new RectF(
-        downsampledFrameRect.left * DOWNSAMPLE_FACTOR,
-        downsampledFrameRect.top * DOWNSAMPLE_FACTOR,
-        downsampledFrameRect.right * DOWNSAMPLE_FACTOR,
-        downsampledFrameRect.bottom * DOWNSAMPLE_FACTOR);
-  }
-
-  /**
-   * A TrackedObject represents a native TrackedObject, and provides access to the
-   * relevant native tracking information available after every frame update. They may
-   * be safely passed around and accessed externally, but will become invalid after
-   * stopTracking() is called or the related creating ObjectTracker is deactivated.
-   *
-   * @author andrewharp@google.com (Andrew Harp)
-   */
-  public class TrackedObject {
-    private final String id;
-
-    private long lastExternalPositionTime;
-
-    private RectF lastTrackedPosition;
-    private boolean visibleInLastFrame;
-
-    private boolean isDead;
-
-    TrackedObject(final RectF position, final long timestamp, final byte[] data) {
-      isDead = false;
-
-      id = Integer.toString(this.hashCode());
-
-      lastExternalPositionTime = timestamp;
-
-      synchronized (ObjectTracker.this) {
-        registerInitialAppearance(position, data);
-        setPreviousPosition(position, timestamp);
-        trackedObjects.put(id, this);
-      }
-    }
-
-    public void stopTracking() {
-      checkValidObject();
-
-      synchronized (ObjectTracker.this) {
-        isDead = true;
-        forgetNative(id);
-        trackedObjects.remove(id);
-      }
-    }
-
-    public float getCurrentCorrelation() {
-      checkValidObject();
-      return ObjectTracker.this.getCurrentCorrelation(id);
-    }
-
-    void registerInitialAppearance(final RectF position, final byte[] data) {
-      final RectF externalPosition = downscaleRect(position);
-      registerNewObjectWithAppearanceNative(id,
-            externalPosition.left, externalPosition.top,
-            externalPosition.right, externalPosition.bottom,
-            data);
-    }
-
-    synchronized void setPreviousPosition(final RectF position, final long timestamp) {
-      checkValidObject();
-      synchronized (ObjectTracker.this) {
-        if (lastExternalPositionTime > timestamp) {
-          LOGGER.w("Tried to use older position time!");
-          return;
-        }
-        final RectF externalPosition = downscaleRect(position);
-        lastExternalPositionTime = timestamp;
-
-        setPreviousPositionNative(id,
-            externalPosition.left, externalPosition.top,
-            externalPosition.right, externalPosition.bottom,
-            lastExternalPositionTime);
-
-        updateTrackedPosition();
-      }
-    }
-
-    void setCurrentPosition(final RectF position) {
-      checkValidObject();
-      final RectF downsampledPosition = downscaleRect(position);
-      synchronized (ObjectTracker.this) {
-        setCurrentPositionNative(id,
-            downsampledPosition.left, downsampledPosition.top,
-            downsampledPosition.right, downsampledPosition.bottom);
-      }
-    }
-
-    private synchronized void updateTrackedPosition() {
-      checkValidObject();
-
-      final float[] delta = new float[4];
-      getTrackedPositionNative(id, delta);
-      lastTrackedPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
-
-      visibleInLastFrame = isObjectVisible(id);
-    }
-
-    public synchronized RectF getTrackedPositionInPreviewFrame() {
-      checkValidObject();
-
-      if (lastTrackedPosition == null) {
-        return null;
-      }
-      return upscaleRect(lastTrackedPosition);
-    }
-
-    synchronized long getLastExternalPositionTime() {
-      return lastExternalPositionTime;
-    }
-
-    public synchronized boolean visibleInLastPreviewFrame() {
-      return visibleInLastFrame;
-    }
-
-    private void checkValidObject() {
-      if (isDead) {
-        throw new RuntimeException("TrackedObject already removed from tracking!");
-      } else if (ObjectTracker.this != instance) {
-        throw new RuntimeException("TrackedObject created with another ObjectTracker!");
-      }
-    }
-  }
-
-  public synchronized TrackedObject trackObject(
-      final RectF position, final long timestamp, final byte[] frameData) {
-    if (downsampledTimestamp != timestamp) {
-      ObjectTracker.downsampleImageNative(
-          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
-      downsampledTimestamp = timestamp;
-    }
-    return new TrackedObject(position, timestamp, downsampledFrame);
-  }
-
-  public synchronized TrackedObject trackObject(final RectF position, final byte[] frameData) {
-    return new TrackedObject(position, lastTimestamp, frameData);
-  }
-
-  /** ********************* NATIVE CODE ************************************ */
-
-  /** This will contain an opaque pointer to the native ObjectTracker */
-  private long nativeObjectTracker;
-
-  private native void initNative(int imageWidth, int imageHeight, boolean alwaysTrack);
-
-  protected native void registerNewObjectWithAppearanceNative(
-      String objectId, float x1, float y1, float x2, float y2, byte[] data);
-
-  protected native void setPreviousPositionNative(
-      String objectId, float x1, float y1, float x2, float y2, long timestamp);
-
-  protected native void setCurrentPositionNative(
-      String objectId, float x1, float y1, float x2, float y2);
-
-  protected native void forgetNative(String key);
-
-  protected native String getModelIdNative(String key);
-
-  protected native boolean haveObject(String key);
-  protected native boolean isObjectVisible(String key);
-  protected native float getCurrentCorrelation(String key);
-
-  protected native float getMatchScore(String key);
-
-  protected native void getTrackedPositionNative(String key, float[] points);
-
-  protected native void nextFrameNative(
-      byte[] frameData, byte[] uvData, long timestamp, float[] frameAlignMatrix);
-
-  protected native void releaseMemoryNative();
-
-  protected native void getCurrentPositionNative(long timestamp,
-      final float positionX1, final float positionY1,
-      final float positionX2, final float positionY2,
-      final float[] delta);
-
-  protected native byte[] getKeypointsPacked(float scaleFactor);
-
-  protected native float[] getKeypointsNative(boolean onlyReturnCorrespondingKeypoints);
-
-  protected native void drawNative(int viewWidth, int viewHeight, float[] frameToCanvas);
-
-  protected static native void downsampleImageNative(
-      int width, int height, int rowStride, byte[] input, int factor, byte[] output);
-}
diff --git a/tensorflow/lite/examples/android/app/src/main/res/animator/color_animation.xml b/tensorflow/lite/examples/android/app/src/main/res/animator/color_animation.xml
deleted file mode 100644
index 891d8cc..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/animator/color_animation.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<set xmlns:android="http://schemas.android.com/apk/res/android"
-  android:ordering="sequentially">
-  <objectAnimator
-    android:propertyName="backgroundColor"
-    android:duration="375"
-    android:valueFrom="0x00b3ccff"
-    android:valueTo="0xffb3ccff"
-    android:valueType="colorType"/>
-  <objectAnimator
-    android:propertyName="backgroundColor"
-    android:duration="375"
-    android:valueFrom="0xffb3ccff"
-    android:valueTo="0x00b3ccff"
-    android:valueType="colorType"/>
-</set>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png
deleted file mode 100644
index 32bd1aa..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png
deleted file mode 100644
index b3113cd..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png
deleted file mode 100644
index 1358628..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png
deleted file mode 100644
index 8efbbf8..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png
deleted file mode 100644
index 51f87ee..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png
deleted file mode 100644
index ba143ea..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png
deleted file mode 100644
index 6361d79..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png
deleted file mode 100644
index 394eb7e..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png
deleted file mode 100644
index 2e27bec..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/examples/android/app/src/main/res/drawable/border.xml b/tensorflow/lite/examples/android/app/src/main/res/drawable/border.xml
deleted file mode 100644
index dd1d64d..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/drawable/border.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<shape xmlns:android="http://schemas.android.com/apk/res/android" android:shape="rectangle" >
-  <solid android:color="#00000000" />
-  <stroke android:width="1dip" android:color="#cccccc" />
-</shape>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/activity_camera.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/activity_camera.xml
deleted file mode 100644
index 1a22d4b..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/activity_camera.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:id="@+id/container"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#000"
-    tools:context="org.tensorflow.demo.CameraActivity" />
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/activity_speech.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/activity_speech.xml
deleted file mode 100644
index 2fe1338..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/activity_speech.xml
+++ /dev/null
@@ -1,55 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<FrameLayout
-    xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:app="http://schemas.android.com/apk/res-auto"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    tools:context="org.tensorflow.demo.SpeechActivity">
-
-    <TextView
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:text="Say one of the words below!"
-        android:id="@+id/textView"
-        android:textAlignment="center"
-        android:layout_gravity="top"
-        android:textSize="24dp"
-        android:layout_marginTop="10dp"
-        android:layout_marginLeft="10dp"
-        />
-
-    <ListView
-        android:id="@+id/list_view"
-        android:layout_width="240dp"
-        android:layout_height="wrap_content"
-        android:background="@drawable/border"
-        android:layout_gravity="top|center_horizontal"
-        android:textAlignment="center"
-        android:layout_marginTop="100dp"
-        />
-
-    <Button
-        android:id="@+id/quit"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:text="Quit"
-        android:layout_gravity="bottom|center_horizontal"
-        android:layout_marginBottom="10dp"
-        />
-
-</FrameLayout>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml
deleted file mode 100644
index a1bbdf1..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent">
-
-    <org.tensorflow.demo.AutoFitTextureView
-        android:id="@+id/texture"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true" />
-
-    <org.tensorflow.demo.RecognitionScoreView
-        android:id="@+id/results"
-        android:layout_width="match_parent"
-        android:layout_height="112dp"
-        android:layout_alignParentTop="true" />
-
-    <org.tensorflow.demo.OverlayView
-        android:id="@+id/debug_overlay"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:layout_alignParentBottom="true" />
-
-</RelativeLayout>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml
deleted file mode 100644
index 1cdb24c..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
-      android:orientation="vertical"
-      android:layout_width="match_parent"
-      android:layout_height="match_parent">
-  <org.tensorflow.demo.AutoFitTextureView
-    android:id="@+id/texture"
-    android:layout_width="wrap_content"
-    android:layout_height="wrap_content"
-    android:layout_alignParentTop="true" />
-
-  <RelativeLayout
-    android:id="@+id/black"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#FF000000" />
-
-  <GridView
-    android:id="@+id/grid_layout"
-    android:numColumns="7"
-    android:stretchMode="columnWidth"
-    android:layout_alignParentBottom="true"
-    android:layout_width="match_parent"
-    android:layout_height="wrap_content" />
-
-  <org.tensorflow.demo.OverlayView
-      android:id="@+id/overlay"
-      android:layout_width="match_parent"
-      android:layout_height="match_parent"
-      android:layout_alignParentTop="true" />
-
-  <org.tensorflow.demo.OverlayView
-    android:id="@+id/debug_overlay"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:layout_alignParentTop="true" />
-</RelativeLayout>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml
deleted file mode 100644
index ca18ea0..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent">
-
-      <org.tensorflow.demo.AutoFitTextureView
-          android:id="@+id/texture"
-          android:layout_width="wrap_content"
-          android:layout_height="wrap_content"/>
-
-      <org.tensorflow.demo.OverlayView
-          android:id="@+id/tracking_overlay"
-          android:layout_width="match_parent"
-          android:layout_height="match_parent"/>
-
-      <org.tensorflow.demo.OverlayView
-          android:id="@+id/debug_overlay"
-          android:layout_width="match_parent"
-          android:layout_height="match_parent"/>
-</FrameLayout>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/layout/list_text_item.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/list_text_item.xml
deleted file mode 100644
index 526017f..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/layout/list_text_item.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<TextView
-    xmlns:android="http://schemas.android.com/apk/res/android"
-    android:id="@+id/list_text_item"
-    android:layout_width="match_parent"
-    android:layout_height="wrap_content"
-    android:text="TextView"
-    android:textSize="24dp"
-    android:textAlignment="center"
-    android:gravity="center_horizontal"
-    />
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml b/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml
deleted file mode 100644
index 820eda0..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<!--
-  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<resources>
-
-    <!-- Semantic definitions -->
-
-    <dimen name="horizontal_page_margin">@dimen/margin_huge</dimen>
-    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml
deleted file mode 100644
index 0930331..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<!--
-  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<resources>
-
-    <style name="Widget.SampleMessage">
-        <item name="android:textAppearance">?android:textAppearanceLarge</item>
-        <item name="android:lineSpacingMultiplier">1.2</item>
-        <item name="android:shadowDy">-6.5</item>
-    </style>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-v11/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v11/styles.xml
deleted file mode 100644
index c2d1bab..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-v11/styles.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<resources>
-
-  <!--
-        Base application theme for API 11+. This theme completely replaces
-        AppBaseTheme from res/values/styles.xml on API 11+ devices.
-  -->
-  <style name="AppBaseTheme" parent="android:Theme.Holo.Light">
-    <!-- API 11 theme customizations can go here. -->
-  </style>
-
-  <style name="FullscreenTheme" parent="android:Theme.Holo">
-    <item name="android:actionBarStyle">@style/FullscreenActionBarStyle</item>
-    <item name="android:windowActionBarOverlay">true</item>
-    <item name="android:windowBackground">@null</item>
-    <item name="metaButtonBarStyle">?android:attr/buttonBarStyle</item>
-    <item name="metaButtonBarButtonStyle">?android:attr/buttonBarButtonStyle</item>
-  </style>
-
-  <style name="FullscreenActionBarStyle" parent="android:Widget.Holo.ActionBar">
-    <!--  <item name="android:background">@color/black_overlay</item>  -->
-  </style>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-v11/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v11/template-styles.xml
deleted file mode 100644
index 1ad0484..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-v11/template-styles.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<!--
-  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<resources>
-
-    <!-- Activity themes -->
-    <style name="Theme.Base" parent="android:Theme.Holo.Light" />
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-v14/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v14/styles.xml
deleted file mode 100644
index cc37084..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-v14/styles.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-<resources>
-
-  <!--
-        Base application theme for API 14+. This theme completely replaces
-        AppBaseTheme from BOTH res/values/styles.xml and
-        res/values-v11/styles.xml on API 14+ devices.
-  -->
-  <style name="AppBaseTheme" parent="android:Theme.Holo.Light.DarkActionBar">
-    <!-- API 14 theme customizations can go here. -->
-  </style>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-colors.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-colors.xml
deleted file mode 100644
index c16da7c..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-colors.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<resources>
-
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml
deleted file mode 100644
index 8890d2f..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<resources>
-
-    <!-- Activity themes -->
-    <style name="Theme.Base" parent="android:Theme.Material.Light">
-    </style>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/attrs.xml b/tensorflow/lite/examples/android/app/src/main/res/values/attrs.xml
deleted file mode 100644
index 56e5bea..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/attrs.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<resources>
-
-  <!--
-         Declare custom theme attributes that allow changing which styles are
-         used for button bars depending on the API level.
-         ?android:attr/buttonBarStyle is new as of API 11 so this is
-         necessary to support previous API levels.
-  -->
-  <declare-styleable name="ButtonBarContainerTheme">
-    <attr name="metaButtonBarStyle" format="reference" />
-    <attr name="metaButtonBarButtonStyle" format="reference" />
-  </declare-styleable>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/base-strings.xml b/tensorflow/lite/examples/android/app/src/main/res/values/base-strings.xml
deleted file mode 100644
index ebc5dc8..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/base-strings.xml
+++ /dev/null
@@ -1,23 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<resources>
-    <string name="app_name">TFLite Demo</string>
-    <string name="activity_name_classification">TFL Classify</string>
-    <string name="activity_name_detection">TFL Detect</string>
-    <string name="activity_name_speech">TFL Speech</string>
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/colors.xml b/tensorflow/lite/examples/android/app/src/main/res/values/colors.xml
deleted file mode 100644
index 584ed60..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/colors.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!--
- Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<resources>
-    <color name="control_background">#cc4285f4</color>
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/strings.xml b/tensorflow/lite/examples/android/app/src/main/res/values/strings.xml
deleted file mode 100644
index ea20ee7..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/strings.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<resources>
-    <string name="description_info">Info</string>
-    <string name="request_permission">This sample needs camera permission.</string>
-    <string name="camera_error">This device doesn\'t support Camera2 API.</string>
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values/styles.xml
deleted file mode 100644
index dd1d973..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/styles.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<resources>
-    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/template-dimens.xml b/tensorflow/lite/examples/android/app/src/main/res/values/template-dimens.xml
deleted file mode 100644
index 069977b..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/template-dimens.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<resources>
-
-    <!-- Define standard dimensions to comply with Holo-style grids and rhythm. -->
-
-    <dimen name="margin_tiny">4dp</dimen>
-    <dimen name="margin_small">8dp</dimen>
-    <dimen name="margin_medium">16dp</dimen>
-    <dimen name="margin_large">32dp</dimen>
-    <dimen name="margin_huge">64dp</dimen>
-
-    <!-- Semantic definitions -->
-
-    <dimen name="horizontal_page_margin">@dimen/margin_medium</dimen>
-    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/app/src/main/res/values/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values/template-styles.xml
deleted file mode 100644
index 1b87714..0000000
--- a/tensorflow/lite/examples/android/app/src/main/res/values/template-styles.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<!--
-  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<resources>
-
-    <!-- Activity themes -->
-
-    <style name="Theme.Base" parent="android:Theme.Light" />
-
-    <style name="Theme.Sample" parent="Theme.Base" />
-
-    <style name="AppTheme" parent="Theme.Sample" />
-    <!-- Widget styling -->
-
-    <style name="Widget" />
-
-    <style name="Widget.SampleMessage">
-        <item name="android:textAppearance">?android:textAppearanceMedium</item>
-        <item name="android:lineSpacingMultiplier">1.1</item>
-    </style>
-
-    <style name="Widget.SampleMessageTile">
-        <item name="android:background">@drawable/tile</item>
-        <item name="android:shadowColor">#7F000000</item>
-        <item name="android:shadowDy">-3.5</item>
-        <item name="android:shadowRadius">2</item>
-    </style>
-
-</resources>
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
deleted file mode 100644
index 90b8f0f..0000000
--- a/tensorflow/lite/examples/android/build.gradle
+++ /dev/null
@@ -1,26 +0,0 @@
-// Top-level build file where you can add configuration options common to all sub-projects/modules.
-
-buildscript {
-    repositories {
-        google()
-        jcenter()
-    }
-    dependencies {
-        classpath 'com.android.tools.build:gradle:3.2.1'
-    }
-}
-
-allprojects {
-    repositories {
-        google()
-        jcenter()
-    }
-}
-
-task clean(type: Delete) {
-    delete rootProject.buildDir
-}
-
-// Changed since default name 'build' conflicts with
-// bazel BUILD file name.
-buildDir = "gradle-build"
diff --git a/tensorflow/lite/examples/android/settings.gradle b/tensorflow/lite/examples/android/settings.gradle
deleted file mode 100644
index e7b4def..0000000
--- a/tensorflow/lite/examples/android/settings.gradle
+++ /dev/null
@@ -1 +0,0 @@
-include ':app'
diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD
index 088cd2f..88e5fd2 100644
--- a/tensorflow/lite/examples/label_image/BUILD
+++ b/tensorflow/lite/examples/label_image/BUILD
@@ -30,6 +30,7 @@
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/profiling:profiler",
     ],
 )
 
diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc
index 340fbab..ac84e27 100644
--- a/tensorflow/lite/examples/label_image/label_image.cc
+++ b/tensorflow/lite/examples/label_image/label_image.cc
@@ -13,6 +13,13 @@
 limitations under the License.
 ==============================================================================*/
 
+#include <fcntl.h>      // NOLINT(build/include_order)
+#include <getopt.h>     // NOLINT(build/include_order)
+#include <sys/time.h>   // NOLINT(build/include_order)
+#include <sys/types.h>  // NOLINT(build/include_order)
+#include <sys/uio.h>    // NOLINT(build/include_order)
+#include <unistd.h>     // NOLINT(build/include_order)
+
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>
@@ -25,21 +32,14 @@
 #include <unordered_set>
 #include <vector>
 
-#include <fcntl.h>      // NOLINT(build/include_order)
-#include <getopt.h>     // NOLINT(build/include_order)
-#include <sys/time.h>   // NOLINT(build/include_order)
-#include <sys/types.h>  // NOLINT(build/include_order)
-#include <sys/uio.h>    // NOLINT(build/include_order)
-#include <unistd.h>     // NOLINT(build/include_order)
-
+#include "tensorflow/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/lite/examples/label_image/get_top_n.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/optional_debug_tools.h"
+#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/string_util.h"
 
-#include "tensorflow/lite/examples/label_image/bitmap_helpers.h"
-#include "tensorflow/lite/examples/label_image/get_top_n.h"
-
 #define LOG(x) std::cerr
 
 namespace tflite {
diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple
index 521da33..41d308b 100644
--- a/tensorflow/lite/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/experimental/ios/BUILD.apple
@@ -13,10 +13,6 @@
     build_version = TFL_IOS_BUILD_VERSION,
 )
 
-#   Build the framework:
-#     bazel build tensorflow/lite/experimental/ios:TensorFlowLiteC_framework -c opt --ios_multi_cpus=x86_64,armv7,arm64 --apple_bitcode=embedded --copt=-fembed-bitcode
-#   Unzip the generated framework:
-#     unzip bazel-bin/tensorflow/lite/experimental/ios/TensorFlowLiteC_framework.zip -d /Users/path/to/tensorflow/Frameworks
 ios_static_framework(
     name = "TensorFlowLiteC_framework",
     hdrs = [
diff --git a/tensorflow/lite/experimental/ios/TensorFlowLiteC.md b/tensorflow/lite/experimental/ios/TensorFlowLiteC.md
index edf23dd..fe697dc 100644
--- a/tensorflow/lite/experimental/ios/TensorFlowLiteC.md
+++ b/tensorflow/lite/experimental/ios/TensorFlowLiteC.md
@@ -1,7 +1,6 @@
-# 2019-04-08 -- v0.1.0
-- Experimental release of the `TensorFlowLiteC` CocoaPod. For Swift developers,
-  add the `TensorFlowLiteSwift` pod to your Podfile. For Objective-C developers,
-  add `TensorFlowLiteObjC`. See the TensorFlow Lite
+# TensorFlow Lite for iOS
+- For Swift developers, add the `TensorFlowLiteSwift` pod to your Podfile. For
+  Objective-C developers, add `TensorFlowLiteObjC`. See the TensorFlow Lite
   [Swift](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/swift)
   and
   [ObjC](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/objc)
diff --git a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec
index 81fa3df..3bbe68c 100644
--- a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec
+++ b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec
@@ -2,11 +2,11 @@
 
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteC'
-  s.version          = '0.1.0'
+  s.version          = '0.2.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :http => "https://dl.google.com/dl/cpdc/e3b0c44298fc1c14/TensorFlowLiteC-#{s.version}.tar.gz" }
+  s.source           = { :http => "https://dl.google.com/dl/cpdc/9d0ec5e53f4ff34a/TensorFlowLiteC-#{s.version}.tar.gz" }
   s.summary          = 'TensorFlow Lite'
   s.description      = <<-DESC
 
diff --git a/tensorflow/lite/experimental/ios/ios.bzl b/tensorflow/lite/experimental/ios/ios.bzl
index a8862a8..1698134 100644
--- a/tensorflow/lite/experimental/ios/ios.bzl
+++ b/tensorflow/lite/experimental/ios/ios.bzl
@@ -1,10 +1,7 @@
 """TensorFlow Lite Build Configurations for iOS"""
 
 # Current version of the TensorFlow Lite iOS libraries.
-TFL_IOS_BUILD_VERSION = "0.1.0"
-
-# Git commit that was used to build the TensorFlow Lite iOS libraries. See Swift and ObjC podspecs.
-TFL_IOS_GIT_COMMIT = "2b96dde"
+TFL_IOS_BUILD_VERSION = "0.2.0"
 
 TFL_MINIMUM_OS_VERSION = "9.0"
 
diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD
index 78af889..bf4a007 100644
--- a/tensorflow/lite/experimental/kernels/BUILD
+++ b/tensorflow/lite/experimental/kernels/BUILD
@@ -35,7 +35,7 @@
 )
 
 cc_library(
-    name = "experimental_ops",
+    name = "ctc_beam_search_decoder_op",
     srcs = [
         "ctc_beam_search_decoder.cc",
     ],
@@ -66,7 +66,7 @@
     srcs = ["ctc_beam_search_decoder_test.cc"],
     tags = ["tflite_not_portable_ios"],
     deps = [
-        ":experimental_ops",
+        ":ctc_beam_search_decoder_op",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:test_util",
@@ -74,3 +74,54 @@
         "@flatbuffers",
     ],
 )
+
+cc_library(
+    name = "gru_cell",
+    srcs = ["gru_cell.cc"],
+    hdrs = ["gru_cell.h"],
+    deps = [
+        "//tensorflow/lite/kernels:cpu_backend_context",
+        "//tensorflow/lite/kernels/internal:optimized_base",
+        "//tensorflow/lite/kernels/internal:tensor",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "unidirectional_sequence_gru_op",
+    srcs = [
+        "unidirectional_sequence_gru.cc",
+    ],
+    # Suppress warnings that are introduced by Eigen Tensor.
+    copts = tflite_copts() + [
+        "-Wno-error=reorder",
+    ] + select({
+        "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"],
+        "//conditions:default": [
+        ],
+    }),
+    deps = [
+        ":gru_cell",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:cpu_backend_context",
+        "//tensorflow/lite/kernels:cpu_backend_support",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels/internal:tensor",
+        "@flatbuffers",
+    ],
+)
+
+cc_test(
+    name = "unidirectional_sequence_gru_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_gru_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":unidirectional_sequence_gru_op",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/experimental/kernels/gru_cell.cc b/tensorflow/lite/experimental/kernels/gru_cell.cc
new file mode 100644
index 0000000..c21896a
--- /dev/null
+++ b/tensorflow/lite/experimental/kernels/gru_cell.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/kernels/gru_cell.h"
+
+#include <vector>
+
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace ops {
+namespace experimental {
+namespace gru_cell {
+
+using optimized_ops::ArrayMap;
+using optimized_ops::FullyConnected;
+using optimized_ops::MapAsArrayWithLastDimAsRows;
+using reference_ops::Concatenation;
+
+void GruCell(const RuntimeShape& input_shape, const float* input,
+             const RuntimeShape& state_shape, const float* input_state,
+             const RuntimeShape& gate_weight_shape, const float* gate_weight,
+             const RuntimeShape& gate_bias_shape, const float* gate_bias,
+             const RuntimeShape& candidate_weight_shape,
+             const float* candidate_weight,
+             const RuntimeShape& candidate_bias_shape,
+             const float* candidate_bias, const RuntimeShape& output_shape,
+             float* output, float* output_state,
+             const RuntimeShape& activation_shape, float* activation,
+             const RuntimeShape& concat_shape, float* concat,
+             const tflite::FullyConnectedParams& fc_params,
+             tflite::CpuBackendContext* cpu_backend_context) {
+  const int n_batch = input_shape.Dims(0);
+  const int n_input = input_shape.Dims(1);
+  const int n_output = state_shape.Dims(1);
+
+  // [x h] = concat(input, state) along axis 1, written into `concat`.
+  std::vector<float const*> concat_arrays_data;
+  std::vector<RuntimeShape const*> concat_arrays_shapes;
+  concat_arrays_data.push_back(input);
+  concat_arrays_data.push_back(input_state);
+  concat_arrays_shapes.push_back(&input_shape);
+  concat_arrays_shapes.push_back(&state_shape);
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 1;
+  concat_params.inputs_count = concat_arrays_data.size();
+  Concatenation(concat_params, &(concat_arrays_shapes[0]),
+                &(concat_arrays_data[0]), concat_shape, concat);
+
+  // [r u] = [x h] * gate_weight + gate_bias, stored in `activation`.
+  FullyConnected(fc_params, concat_shape, concat, gate_weight_shape,
+                 gate_weight, gate_bias_shape, gate_bias, activation_shape,
+                 activation, cpu_backend_context);
+
+  // [r u] = sigmoid([r u]) in place; r is the first n_output rows, u the next.
+  auto ru = MapAsArrayWithLastDimAsRows(activation, activation_shape);
+  ru = ru.unaryExpr(Eigen::internal::scalar_logistic_op<float>());
+  auto r = ru.block(0 * n_output, 0, n_output, n_batch);
+  auto u = ru.block(1 * n_output, 0, n_output, n_batch);
+
+  // hr = h .* r, stored over the state rows of concat so it becomes [x hr].
+  auto h = MapAsArrayWithLastDimAsRows(input_state, state_shape);
+  auto xh = MapAsArrayWithLastDimAsRows(concat, concat_shape);
+  auto hr = xh.block(n_input, 0, n_output, n_batch);
+  hr = h * r;
+
+  // c = [x hr] * candidate_weight + candidate_bias
+  FullyConnected(fc_params, concat_shape, concat, candidate_weight_shape,
+                 candidate_weight, candidate_bias_shape, candidate_bias,
+                 output_shape, output, cpu_backend_context);
+
+  auto c = MapAsArrayWithLastDimAsRows(output, output_shape);
+  // output = (1 - u) .* tanh(c) + u .* h, computed in place over `output`.
+  c = (1.0 - u) * c.tanh() + u * h;
+
+  memcpy(output_state, output, n_batch * n_output * sizeof(float));
+}
+
+}  // namespace gru_cell
+}  // namespace experimental
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/kernels/gru_cell.h b/tensorflow/lite/experimental/kernels/gru_cell.h
new file mode 100644
index 0000000..cd7b02e
--- /dev/null
+++ b/tensorflow/lite/experimental/kernels/gru_cell.h
@@ -0,0 +1,46 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+
+namespace tflite {
+namespace ops {
+namespace experimental {
+namespace gru_cell {
+
+void GruCell(const RuntimeShape& input_shape, const float* input,
+             const RuntimeShape& state_shape, const float* input_state,
+             const RuntimeShape& gate_weight_shape, const float* gate_weight,
+             const RuntimeShape& gate_bias_shape, const float* gate_bias,
+             const RuntimeShape& candidate_weight_shape,
+             const float* candidate_weight,
+             const RuntimeShape& candidate_bias_shape,
+             const float* candidate_bias, const RuntimeShape& output_shape,
+             float* output, float* output_state,
+             const RuntimeShape& activation_shape, float* activation,
+             const RuntimeShape& concat_shape, float* concat,
+             const tflite::FullyConnectedParams& fc_params,
+             tflite::CpuBackendContext* cpu_backend_context);
+
+}  // namespace gru_cell
+}  // namespace experimental
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_
diff --git a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc
new file mode 100644
index 0000000..fc0d681
--- /dev/null
+++ b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc
@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/kernels/gru_cell.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace experimental {
+namespace unidirectional_sequence_gru {
+namespace {
+
+void GruImpl(const TfLiteTensor* input, const TfLiteTensor* input_state,
+             const TfLiteTensor* gate_weight, const TfLiteTensor* gate_bias,
+             const TfLiteTensor* candidate_weight,
+             const TfLiteTensor* candidate_bias, TfLiteTensor* output,
+             TfLiteTensor* output_state, TfLiteTensor* activation,
+             TfLiteTensor* concat,
+             tflite::CpuBackendContext* cpu_backend_context) {
+  const int n_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+  const int n_output = output->dims->data[2];
+  const int n_batch_input = n_batch * n_input;
+  const int n_batch_output = n_batch * n_output;
+  const RuntimeShape input_shape({n_batch, n_input});
+  const float* input_data = GetTensorData<float>(input);
+  const RuntimeShape state_shape = GetTensorShape(input_state);
+  const float* input_state_data = GetTensorData<float>(input_state);
+  const RuntimeShape gate_weight_shape = GetTensorShape(gate_weight);
+  const float* gate_weight_data = GetTensorData<float>(gate_weight);
+  const RuntimeShape gate_bias_shape = GetTensorShape(gate_bias);
+  const float* gate_bias_data = GetTensorData<float>(gate_bias);
+  const RuntimeShape candidate_weight_shape = GetTensorShape(candidate_weight);
+  const float* candidate_weight_data = GetTensorData<float>(candidate_weight);
+  const RuntimeShape candidate_bias_shape = GetTensorShape(candidate_bias);
+  const float* candidate_bias_data = GetTensorData<float>(candidate_bias);
+  const RuntimeShape activation_shape = GetTensorShape(activation);
+  const RuntimeShape output_shape = RuntimeShape({n_batch, n_output});
+  float* output_data = GetTensorData<float>(output);
+  float* output_state_data = GetTensorData<float>(output_state);
+  float* activation_data = GetTensorData<float>(activation);
+  const RuntimeShape concat_shape = GetTensorShape(concat);
+  float* concat_data = GetTensorData<float>(concat);
+  tflite::FullyConnectedParams fc_params;
+  fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+  fc_params.float_activation_max = std::numeric_limits<float>::max();
+  for (int i = 0; i < n_time; ++i) {
+    gru_cell::GruCell(
+        input_shape, input_data, state_shape, input_state_data,
+        gate_weight_shape, gate_weight_data, gate_bias_shape, gate_bias_data,
+        candidate_weight_shape, candidate_weight_data, candidate_bias_shape,
+        candidate_bias_data, output_shape, output_data, output_state_data,
+        activation_shape, activation_data, concat_shape, concat_data, fc_params,
+        cpu_backend_context);
+    input_data += n_batch_input;
+    output_data += n_batch_output;
+    input_state_data = output_state_data;
+  }
+}
+
+}  // namespace
+
+enum InputTensor {
+  // Input tensor of size [n_time, n_batch, n_input]
+  kInput = 0,
+  // Input state tensor of size [n_batch, n_output]
+  kInputState = 1,
+  // Gate weight tensor of size [2*n_output, n_input+n_output]
+  kGateWeight = 2,
+  // Gate bias tensor of size [2*n_output]
+  kGateBias = 3,
+  // Candidate weight tensor of size [n_output, n_input+n_output]
+  kCandidateWeight = 4,
+  // Candidate bias tensor of size [n_output]
+  kCandidateBias = 5,
+  kInputNum = 6
+};
+
+enum OutputTensor {
+  // Output tensor of size [n_time, n_batch, n_output]
+  kOutput = 0,
+  // Output state tensor of size [n_batch, n_output]
+  kOutputState = 1,
+  kOutputNum = 2
+};
+
+enum TemporaryTensor {
+  // Scratch buffer for activation of size [n_batch, 2*n_output]
+  kActivation = 0,
+  // Scratch buffer for concatenation of size [n_batch, n_input+n_output]
+  kConcat = 1,
+  kTemporaryNum = 2
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  cpu_backend_support::IncrementUsageCounter(context);
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, kTemporaryNum, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  cpu_backend_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<int*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, kInputNum);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, kOutputNum);
+
+  // input's dim = [n_time, n_batch, n_input]
+  const TfLiteTensor* input = GetInput(context, node, kInput);
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
+  const int n_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  // input_state's dim = [n_batch, n_output]
+  const TfLiteTensor* input_state = GetInput(context, node, kInputState);
+  TF_LITE_ENSURE_EQ(context, input_state->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_state->dims->data[0], n_batch);
+  const int n_output = input_state->dims->data[1];
+
+  // gate_weight's dim = [2 * n_output, n_input + n_output]
+  const TfLiteTensor* gate_weight = GetInput(context, node, kGateWeight);
+  TF_LITE_ENSURE_EQ(context, gate_weight->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, gate_weight->dims->data[0], 2 * n_output);
+  TF_LITE_ENSURE_EQ(context, gate_weight->dims->data[1], n_input + n_output);
+
+  // gate_bias's dim = [2 * n_output]
+  const TfLiteTensor* gate_bias = GetInput(context, node, kGateBias);
+  TF_LITE_ENSURE_EQ(context, gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, gate_bias->dims->data[0], 2 * n_output);
+
+  // candidate_weight's dim = [n_output, n_input + n_output]
+  const TfLiteTensor* candidate_weight =
+      GetInput(context, node, kCandidateWeight);
+  TF_LITE_ENSURE_EQ(context, candidate_weight->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, candidate_weight->dims->data[0], n_output);
+  TF_LITE_ENSURE_EQ(context, candidate_weight->dims->data[1],
+                    n_input + n_output);
+
+  // candidate_bias's dim = [n_output]
+  const TfLiteTensor* candidate_bias = GetInput(context, node, kCandidateBias);
+  TF_LITE_ENSURE_EQ(context, candidate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, candidate_bias->dims->data[0], n_output);
+
+  // output's dim = [n_time, n_batch, n_output]
+  TfLiteTensor* output = GetOutput(context, node, kOutput);
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = n_time;
+  output_size->data[1] = n_batch;
+  output_size->data[2] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  // output_state's dim = [n_batch, n_output]
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputState);
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, output_state,
+                                     TfLiteIntArrayCopy(input_state->dims)));
+
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(kTemporaryNum);
+
+  // activation's dim = [n_batch, 2 * n_output]
+  node->temporaries->data[kActivation] = *scratch_tensor_index;
+  TfLiteTensor* activation = GetTemporary(context, node, kActivation);
+  activation->type = input->type;
+  activation->allocation_type = kTfLiteArenaRw;
+  TfLiteIntArray* activation_size = TfLiteIntArrayCreate(2);
+  activation_size->data[0] = n_batch;
+  activation_size->data[1] = 2 * n_output;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, activation, activation_size));
+
+  // concat's dim = [n_batch, n_input + n_output]
+  node->temporaries->data[kConcat] = (*scratch_tensor_index) + kConcat;
+  TfLiteTensor* concat = GetTemporary(context, node, kConcat);
+  concat->type = input->type;
+  concat->allocation_type = kTfLiteArenaRw;
+  TfLiteIntArray* concat_size = TfLiteIntArrayCreate(2);
+  concat_size->data[0] = n_batch;
+  concat_size->data[1] = n_input + n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, concat, concat_size));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInput);
+  const TfLiteTensor* input_state = GetInput(context, node, kInputState);
+  const TfLiteTensor* gate_weight = GetInput(context, node, kGateWeight);
+  const TfLiteTensor* gate_bias = GetInput(context, node, kGateBias);
+  const TfLiteTensor* candidate_weight =
+      GetInput(context, node, kCandidateWeight);
+  const TfLiteTensor* candidate_bias = GetInput(context, node, kCandidateBias);
+  TfLiteTensor* output = GetOutput(context, node, kOutput);
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputState);
+  TfLiteTensor* activation = GetTemporary(context, node, kActivation);
+  TfLiteTensor* concat = GetTemporary(context, node, kConcat);
+  auto cpu_backend_context = cpu_backend_support::GetFromContext(context);
+
+  if (gate_weight->type == kTfLiteFloat32) {
+    GruImpl(input, input_state, gate_weight, gate_bias, candidate_weight,
+            candidate_bias, output, output_state, activation, concat,
+            cpu_backend_context);
+  } else {
+    context->ReportError(context,
+                         "Unsupported combination of data types for GruCell");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace unidirectional_sequence_gru
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_GRU() {
+  static TfLiteRegistration r = {
+      unidirectional_sequence_gru::Init, unidirectional_sequence_gru::Free,
+      unidirectional_sequence_gru::Prepare, unidirectional_sequence_gru::Eval};
+  return &r;
+}
+
+}  // namespace experimental
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc
new file mode 100644
index 0000000..593d714
--- /dev/null
+++ b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc
@@ -0,0 +1,150 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/test_util.h"
+
+namespace tflite {
+namespace ops {
+namespace experimental {
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_GRU();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class GRUOpModel : public SingleOpModel {
+ public:
+  explicit GRUOpModel(int n_batch, int n_input, int n_output,
+                      const std::vector<std::vector<int>>& input_shapes,
+                      const TensorType& weight_type = TensorType_FLOAT32)
+      : n_batch_(n_batch), n_input_(n_input), n_output_(n_output) {
+    input_ = AddInput(TensorType_FLOAT32);
+    input_state_ =
+        AddInput(TensorData{TensorType_FLOAT32, {n_batch, n_output}}, true);
+    gate_weight_ = AddInput(TensorType_FLOAT32);
+    gate_bias_ = AddInput(TensorType_FLOAT32);
+    candidate_weight_ = AddInput(TensorType_FLOAT32);
+    candidate_bias_ = AddInput(TensorType_FLOAT32);
+
+    output_ = AddOutput(TensorType_FLOAT32);
+    output_state_ = AddOutput(TensorType_FLOAT32);
+
+    SetCustomOp("UNIDIRECTIONAL_SEQUENCE_GRU", {},
+                Register_UNIDIRECTIONAL_SEQUENCE_GRU);
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); }
+
+  void SetInputState(const std::vector<float>& f) {
+    PopulateTensor(input_state_, f);
+  }
+
+  void SetGateWeight(const std::vector<float>& f) {
+    PopulateTensor(gate_weight_, f);
+  }
+
+  void SetGateBias(const std::vector<float>& f) {
+    PopulateTensor(gate_bias_, f);
+  }
+
+  void SetCandidateWeight(const std::vector<float>& f) {
+    PopulateTensor(candidate_weight_, f);
+  }
+
+  void SetCandidateBias(const std::vector<float>& f) {
+    PopulateTensor(candidate_bias_, f);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int num_batches() { return n_batch_; }
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+
+ private:
+  int input_;
+  int input_state_;
+  int gate_weight_;
+  int gate_bias_;
+  int candidate_weight_;
+  int candidate_bias_;
+
+  int output_;
+  int output_state_;
+  int n_batch_;
+  int n_input_;
+  int n_output_;
+};
+
+TEST(GRUTest, SimpleTest) {
+  const int n_time = 2;
+  const int n_batch = 2;
+  const int n_input = 2;
+  const int n_output = 3;
+
+  GRUOpModel m(n_batch, n_input, n_output,
+               {{n_time, n_batch, n_input},
+                {n_batch, n_output},
+                {2 * n_output, n_input + n_output},
+                {2 * n_output},
+                {n_output, n_input + n_output},
+                {n_output}});
+  // All data is randomly generated.
+  m.SetInput({0.89495724, 0.34482682, 0.68505806, 0.7135783, 0.3167085,
+              0.93647677, 0.47361764, 0.39643127});
+  m.SetInputState(
+      {0.09992421, 0.3028481, 0.78305984, 0.50438094, 0.11269058, 0.10244724});
+  m.SetGateWeight({0.7256918,  0.8945897,  0.03285786, 0.42637166, 0.119376324,
+                   0.83035135, 0.16997327, 0.42302176, 0.77598256, 0.2660894,
+                   0.9587266,  0.6218451,  0.88164485, 0.12272458, 0.2699055,
+                   0.18399088, 0.21930052, 0.3374841,  0.70866305, 0.9523419,
+                   0.25170696, 0.60988617, 0.79823977, 0.64477515, 0.2602957,
+                   0.5053131,  0.93722224, 0.8451359,  0.97905475, 0.38669217});
+  m.SetGateBias(
+      {0.032708533, 0.018445263, 0.15320699, 0.8163046, 0.26683575, 0.1412022});
+  m.SetCandidateWeight({0.96165305, 0.95572084, 0.11534478, 0.96965164,
+                        0.33562955, 0.8680755, 0.003066936, 0.057793964,
+                        0.8671354, 0.33354893, 0.7313398, 0.78492093,
+                        0.19530584, 0.116550304, 0.13599132});
+  m.SetCandidateBias({0.89837056, 0.54769796, 0.63364106});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(n_time, n_batch, n_output));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.20112592, 0.45286041, 0.80842507, 0.59567153, 0.2619998,
+                   0.22922856, 0.27715868, 0.5247152, 0.82300174, 0.65812796,
+                   0.38217607, 0.3401444})));
+}
+
+}  // namespace
+}  // namespace experimental
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index 37e3f23..f6fd908 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -21,7 +21,9 @@
 
 -   [Goals](#goals)
 
--   [Generating Project Files](#generating-project-#files)
+-   [Generating Project Files](#generating-project-files)
+
+-   [Generating Arduino Libraries](#generating-arduino-libraries)
 
 -   [How to Port TensorFlow Lite Micro to a New Platform](#how-to-port-tensorflow-lite-micro-to-a-new-platform)
 
@@ -307,13 +309,14 @@
 
 ### Additional Apollo3 Instructions
 
-To flash a part with JFlash Lite, do the following: 
+To flash a part with JFlash Lite, do the following:
 
-1. At the command line: JFlashLiteExe 
-2. Device = AMA3B1KK-KBR 
-3. Interface = SWD at 1000 kHz 
-4. Data file = `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
-5. Prog Addr = 0x0000C000
+1.  At the command line: JFlashLiteExe
+2.  Device = AMA3B1KK-KBR
+3.  Interface = SWD at 1000 kHz
+4.  Data file =
+    `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
+5.  Prog Addr = 0x0000C000
 
 ## Building for the Eta Compute ECM3531 EVB using Make
 
@@ -360,6 +363,31 @@
     tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
     &nbsp;&nbsp;&nbsp;&nbsp;./flash_program executable_name to load into flash.
 
+## Implement target optimized kernels
+
+The reference kernels in tensorflow/lite/experimental/micro/kernels are
+implemented in pure C/C++. They might not utilize all HW architecture specific
+optimizations, such as DSP instructions. The instructions below provide an
+example of how to compile an external lib with HW architecture specific
+optimizations and link it with the microlite lib.
+
+### CMSIS-NN optimized kernels (---under development---)
+
+To utilize the CMSIS-NN optimized kernels, choose your target, e.g. Bluepill,
+and build with:
+
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS=cmsis-nn
+TARGET=bluepill test
+
+That will build the microlite lib including CMSIS-NN optimized kernels based on
+the version downloaded by 'download_dependencies.sh', so make sure you have run
+this script. If you want to utilize another version of CMSIS, clone it to a
+custom location and run the following command:
+
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile
+CMSIS_PATH=<CUSTOM_LOCATION> TAGS=cmsis-nn TARGET=bluepill test (--- Under
+development, it will build, but test will fail ---)
+
 ## Goals
 
 The design goals are for the framework to be:
@@ -493,6 +521,30 @@
 use it just like any other Mbed project. There's more information about project
 files [below](#working-with-generated-projects).
 
+## Generating Arduino Libraries
+
+It's possible to use the Arduino Desktop IDE to build TFL Micro targets for
+Arduino devices. The source code is packaged as a .zip archive that you can add
+in the IDE by going to Sketch->Include Library->Add .ZIP Library... Once you've
+added the library, you can then go to File->Examples->TensorFlowLite to find a
+simple sketch that you can use to build the example.
+
+You can generate the zip file from the source code here in git by running the
+following command:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=arduino TAGS="" generate_micro_speech_mock_arduino_library_zip
+```
+
+The resulting library can be found in `tensorflow/lite/experimental/micro/tools/make/gen/arduino_x86_64/prj/micro_speech_mock/micro_speech_mock.zip`.
+This generates a library that builds the `micro_speech_mock` binary, but you can
+do the same for any other target by replacing the name in the make command line.
+If you want to build all the possible libraries, you can run this command:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=arduino TAGS="" generate_projects
+```
+
 ## How to Port TensorFlow Lite Micro to a New Platform
 
 Are you a hardware or operating system provider looking to run machine learning
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
index 8187962..8cc7e2e 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -124,8 +124,8 @@
     time_since_last_top = current_time_ms - previous_top_label_time_;
   }
   if ((current_top_score > detection_threshold_) &&
-      (current_top_label != previous_top_label_) &&
-      (time_since_last_top > suppression_ms_)) {
+      ((current_top_label != previous_top_label_) ||
+       (time_since_last_top > suppression_ms_))) {
     previous_top_label_ = current_top_label;
     previous_top_label_time_ = current_time_ms;
     *is_new_command = true;
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
index 451eed2..4daca26 100644
--- a/tensorflow/lite/experimental/micro/kernels/BUILD
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -15,6 +15,7 @@
     srcs = [
         "depthwise_conv.cc",
         "fully_connected.cc",
+        "pooling.cc",
         "softmax.cc",
     ],
     hdrs = [
@@ -50,6 +51,7 @@
     name = "portable_optimized_micro_ops",
     srcs = [
         "fully_connected.cc",
+        "pooling.cc",
         "portable_optimized/depthwise_conv.cc",
         "softmax.cc",
     ],
@@ -83,6 +85,19 @@
 )
 
 tflite_micro_cc_test(
+    name = "pooling_test",
+    srcs = [
+        "pooling_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
     name = "depthwise_conv_test",
     srcs = [
         "depthwise_conv_test.cc",
diff --git a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
index b733949..67efc2e 100644
--- a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
+++ b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
@@ -29,6 +29,11 @@
 TfLiteRegistration* Register_SOFTMAX();
 TfLiteRegistration* Micro_Register_SOFTMAX() { return Register_SOFTMAX(); }
 
+TfLiteRegistration* Register_AVERAGE_POOL_2D();
+TfLiteRegistration* Micro_Register_AVERAGE_POOL_2D() {
+  return Register_AVERAGE_POOL_2D();
+}
+
 AllOpsResolver::AllOpsResolver() {
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D,
              Micro_Register_DEPTHWISE_CONV_2D());
@@ -36,6 +41,7 @@
              /* min_version */ 1,
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SOFTMAX, Micro_Register_SOFTMAX());
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Micro_Register_AVERAGE_POOL_2D());
 }
 
 }  // namespace micro
diff --git a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc
new file mode 100644
index 0000000..8e3f4b9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc
@@ -0,0 +1,216 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace depthwise_conv {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+struct OpData {  // Values derived per invocation by CalculateOpData.
+  TfLitePaddingValues padding;  // Height/width padding for the convolution.
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;  // Stored as the negated QuantizeMultiplier exponent.
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
+  // Spatial padding for each axis, computed with a dilation rate of 1.
+  data->padding.width =
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
+  // Float inference needs nothing beyond the padding values.
+  if (data_type == kTfLiteFloat32) return kTfLiteOk;
+
+  // Quantized inference: every tensor must carry quantization parameters
+  // (usually produced by quantized training).
+  const TfLiteTensor* in_tensor = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias_tensor =
+      GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* out_tensor = GetOutput(context, node, kOutputTensor);
+  double multiplier = 0.0;
+  TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+      context, in_tensor, weights, bias_tensor, out_tensor, &multiplier));
+  int shift_exponent;
+  QuantizeMultiplier(multiplier, &data->output_multiplier, &shift_exponent);
+  data->output_shift = -shift_exponent;
+  CalculateActivationRangeUint8(params->activation, out_tensor,
+                                &data->output_activation_min,
+                                &data->output_activation_max);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;  // No per-node state is allocated.
+}
+
+void Free(TfLiteContext* context, void* buffer) {}  // Nothing to release.
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;  // No-op; Eval derives OpData on each call.
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
+  // Runs the float reference depthwise convolution on the given tensors.
+  tflite::DepthwiseParams conv_params;
+  conv_params.depth_multiplier = params->depth_multiplier;
+  conv_params.stride_height = params->stride_height;
+  conv_params.stride_width = params->stride_width;
+  conv_params.dilation_height_factor = 1;
+  conv_params.dilation_width_factor = 1;
+  conv_params.padding_values.height = data->padding.height;
+  conv_params.padding_values.width = data->padding.width;
+  // The padding type is not consulted, but is populated anyway.
+  conv_params.padding_type = PaddingType::kSame;
+  // Clamp bounds implied by the fused activation.
+  float act_min, act_max;
+  CalculateActivationRange(params->activation, &act_min, &act_max);
+  conv_params.float_activation_min = act_min;
+  conv_params.float_activation_max = act_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      conv_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
+  // Runs the uint8 reference depthwise convolution on the given tensors.
+  tflite::DepthwiseParams conv_params;
+  conv_params.depth_multiplier = params->depth_multiplier;
+  conv_params.stride_height = params->stride_height;
+  conv_params.stride_width = params->stride_width;
+  conv_params.dilation_height_factor = 1;
+  conv_params.dilation_width_factor = 1;
+  conv_params.padding_values.height = data->padding.height;
+  conv_params.padding_values.width = data->padding.width;
+  // The padding type is not consulted, but is populated anyway.
+  conv_params.padding_type = PaddingType::kSame;
+  // Input and filter zero points are negated; the output's is used as is.
+  conv_params.input_offset = -input->params.zero_point;
+  conv_params.weights_offset = -filter->params.zero_point;
+  conv_params.output_offset = output->params.zero_point;
+
+  // Requantization values precomputed in OpData. OpData holds the shift with
+  // the opposite sign to DepthwiseParams, where positive means a left shift.
+  conv_params.output_multiplier = data->output_multiplier;
+  conv_params.output_shift = -data->output_shift;
+  conv_params.quantized_activation_min = data->output_activation_min;
+  conv_params.quantized_activation_max = data->output_activation_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      conv_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+      GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<uint8_t>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  // Dispatches on the input type to the reference kernels. The Cortex-M
+  // CMSIS-NN branches are placeholders that always return kTfLiteError.
+#if defined(ARM_CMSIS_NN_M3)
+  return kTfLiteError;
+#elif defined(ARM_CMSIS_NN_M4)
+  // Todo: call cmsis ops
+  return kTfLiteError;
+#elif defined(ARM_CMSIS_NN_M7)
+  return kTfLiteError;
+#else
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+  const TfLiteType data_type = input->type;
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int out_width = ComputeOutSize(params->padding, width, filter_width,
+                                 params->stride_width);
+  int out_height = ComputeOutSize(params->padding, height, filter_height,
+                                  params->stride_height);
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
+                                        filter_width, filter_height, out_width,
+                                        out_height, data_type, data));
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat(context, node, params, data, input, filter, bias, output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized(context, node, params, data, input, filter, bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+#endif
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
+                                 depthwise_conv::Prepare, depthwise_conv::Eval};
+  return &r;  // r is a function-local static, so the pointer stays valid.
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc
index ce821a9..2fdd18f 100644
--- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc
@@ -50,12 +50,12 @@
 TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
                              TfLiteDepthwiseConvParams* params, int width,
                              int height, int filter_width, int filter_height,
-                             int out_width, int out_height,
                              const TfLiteType data_type, OpData* data) {
-  data->padding.height = ComputePadding(params->stride_height, 1, height,
-                                        filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+  int unused_output_height, unused_output_width;
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width, 1, 1, height, width,
+      filter_height, filter_width, params->padding, &unused_output_height,
+      &unused_output_width);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
@@ -168,15 +168,11 @@
   int height = SizeOfDimension(input, 1);
   int filter_width = SizeOfDimension(filter, 2);
   int filter_height = SizeOfDimension(filter, 1);
-  int out_width = ComputeOutSize(params->padding, width, filter_width,
-                                 params->stride_width);
-  int out_height = ComputeOutSize(params->padding, height, filter_height,
-                                  params->stride_height);
   OpData local_data_object;
   OpData* data = &local_data_object;
   TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
-                                        filter_width, filter_height, out_width,
-                                        out_height, data_type, data));
+                                        filter_width, filter_height, data_type,
+                                        data));
 
   // TODO(aselle): Consider whether float conv and quantized conv should be
   // separate ops to avoid dispatch overhead here.
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected.cc
index a344c4f..2cacee7 100644
--- a/tensorflow/lite/experimental/micro/kernels/fully_connected.cc
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected.cc
@@ -102,8 +102,7 @@
       op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
       GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
       GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-      GetTensorShape(output), GetTensorData<output_data_type>(output), \
-      nullptr)
+      GetTensorShape(output), GetTensorData<output_data_type>(output))
   switch (output->type) {
     case kTfLiteUInt8:
       TF_LITE_FULLY_CONNECTED(uint8_t);
diff --git a/tensorflow/lite/experimental/micro/kernels/pooling.cc b/tensorflow/lite/experimental/micro/kernels/pooling.cc
new file mode 100644
index 0000000..0b9901b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/pooling.cc
@@ -0,0 +1,147 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace pooling {
+
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+struct OpData {  // Values derived per invocation by CalculateOpData.
+  TfLitePaddingValues padding;  // Height/width padding for the pooling window.
+};
+
+TfLiteStatus CalculateOpData(const TfLiteContext* context,
+                             const TfLitePoolParams* params,
+                             const TfLiteTensor* input,
+                             const TfLiteTensor* output, OpData* data) {
+  // Fills data->padding from the pool params. Input is laid out as
+  // (batch, height, width, channel), so dims 1 and 2 are the spatial extents.
+  const int height = SizeOfDimension(input, 1);
+  const int width = SizeOfDimension(input, 2);
+  // Pooling has no dilation, so both dilation rates are 1. The output sizes
+  // are required by the helper but are not used here.
+  int out_height, out_width;
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width, 1, 1, height, width,
+      params->filter_height, params->filter_width, params->padding,
+      &out_height, &out_width);
+  return kTfLiteOk;
+}
+
+void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node,
+                      const TfLitePoolParams* params, const OpData* data,
+                      const TfLiteTensor* input, TfLiteTensor* output) {
+  // Float average pooling via the reference kernel.
+  PoolParams pool_params;
+  pool_params.filter_height = params->filter_height;
+  pool_params.filter_width = params->filter_width;
+  pool_params.stride_height = params->stride_height;
+  pool_params.stride_width = params->stride_width;
+  pool_params.padding_values.height = data->padding.height;
+  pool_params.padding_values.width = data->padding.width;
+  // Clamp bounds implied by the fused activation.
+  float act_min, act_max;
+  CalculateActivationRange(params->activation, &act_min, &act_max);
+  pool_params.float_activation_min = act_min;
+  pool_params.float_activation_max = act_max;
+  reference_ops::AveragePool(
+      pool_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(output), GetTensorData<float>(output));
+}
+
+void AverageEvalUint8(const TfLiteContext* context, const TfLiteNode* node,
+                      const TfLitePoolParams* params, const OpData* data,
+                      const TfLiteTensor* input, TfLiteTensor* output) {
+  // Quantized (uint8) average pooling via the reference kernel.
+  PoolParams pool_params;
+  pool_params.filter_height = params->filter_height;
+  pool_params.filter_width = params->filter_width;
+  pool_params.stride_height = params->stride_height;
+  pool_params.stride_width = params->stride_width;
+  pool_params.padding_values.height = data->padding.height;
+  pool_params.padding_values.width = data->padding.width;
+  // Clamp bounds implied by the fused activation and the output tensor.
+  int32_t q_min, q_max;
+  CalculateActivationRangeUint8(params->activation, output, &q_min, &q_max);
+  pool_params.quantized_activation_min = q_min;
+  pool_params.quantized_activation_max = q_max;
+  reference_ops::AveragePool(
+      pool_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+      GetTensorShape(output), GetTensorData<uint8_t>(output));
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;  // No per-node state is allocated.
+}
+
+void Free(TfLiteContext* context, void* buffer) {}  // Nothing to release.
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;  // No-op; AverageEval derives OpData on each call.
+}
+
+TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
+  // Evaluates AVERAGE_POOL_2D for float32 or uint8 inputs; any other input
+  // type is reported through the context and yields kTfLiteError.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+
+  OpData op_data;
+  TF_LITE_ENSURE_STATUS(
+      CalculateOpData(context, params, input, output, &op_data));
+
+  // The converter guarantees that input and output have the same type, so
+  // dispatching on the input type is sufficient.
+  if (input->type == kTfLiteFloat32) {
+    AverageEvalFloat(context, node, params, &op_data, input, output);
+  } else if (input->type == kTfLiteUInt8) {
+    AverageEvalUint8(context, node, params, &op_data, input, output);
+  } else {
+    context->ReportError(context, "Input type %s is not currently supported",
+                         TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace pooling
+
+TfLiteRegistration* Register_AVERAGE_POOL_2D() {
+  static TfLiteRegistration r = {
+      pooling::Init,
+      pooling::Free,
+      pooling::Prepare,
+      pooling::AverageEval,
+  };
+  return &r;  // r is a function-local static, so the pointer stays valid.
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/micro/kernels/pooling_test.cc b/tensorflow/lite/experimental/micro/kernels/pooling_test.cc
new file mode 100644
index 0000000..88d07af
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/pooling_test.cc
@@ -0,0 +1,227 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestAveragePoolingFloat(std::initializer_list<int> input_dims_data,
+                             std::initializer_list<float> input_data,
+                             const int filter_height, const int filter_width,
+                             const int stride_height, const int stride_width,
+                             std::initializer_list<float> expected_output_data,
+                             std::initializer_list<int> output_dims_data,
+                             TfLitePadding padding,
+                             TfLiteFusedActivation activation,
+                             float* output_data) {
+  // Runs AVERAGE_POOL_2D through the resolver and checks each output value.
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+  constexpr int inputs_size = 1;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+  // The kernel casts builtin_data to TfLitePoolParams, so build that type.
+  TfLitePoolParams builtin_data = {padding,      stride_width,  stride_height,
+                                   filter_width, filter_height, activation};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestAveragePoolingUint8(
+    std::initializer_list<int> input_dims_data,
+    std::initializer_list<uint8_t> input_data, const float input_min,
+    const float input_max, const int filter_height, const int filter_width,
+    const int stride_height, const int stride_width,
+    std::initializer_list<uint8_t> expected_output_data,
+    std::initializer_list<int> output_dims_data, float output_min,
+    float output_max, TfLitePadding padding, TfLiteFusedActivation activation,
+    uint8_t* output_data) {
+  // Runs AVERAGE_POOL_2D on quantized tensors and checks each output value.
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+  constexpr int inputs_size = 1;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+  // The kernel casts builtin_data to TfLitePoolParams, so build that type.
+  TfLitePoolParams builtin_data = {padding,      stride_width,  stride_height,
+                                   filter_width, filter_height, activation};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleAveragePoolTestFloat) {
+  float output_data[2];
+  tflite::testing::TestAveragePoolingFloat({4, 1, 2, 4, 1},  // Input shape
+                                           {                 // Input values
+                                            0., 6., 2., 4., 3., 2., 10., 7.},
+                                           2, 2,  // filter height, filter width
+                                           2, 2,  // stride height, stride width
+                                           {
+                                               // Output values
+                                               2.75,  // mean of {0, 6, 3, 2}
+                                               5.75,  // mean of {2, 4, 10, 7}
+                                           },
+                                           {4, 1, 1, 2, 1},  // Output shape
+                                           kTfLitePaddingValid, kTfLiteActNone,
+                                           output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleAveragePoolTestUint8) {
+  using tflite::testing::F2Q;
+
+  const float input_min = -15.9375;
+  const float input_max = 15.9375;
+  const float output_min = -15.9375;
+  const float output_max = 15.9375;
+  uint8_t output_data[2];
+  tflite::testing::TestAveragePoolingUint8(
+      {4, 1, 2, 4, 1},  // Input shape
+      {
+          // Input values
+          F2Q(0., input_min, input_max),
+          F2Q(-6., input_min, input_max),
+          F2Q(2., input_min, input_max),
+          F2Q(4., input_min, input_max),
+          F2Q(3., input_min, input_max),
+          F2Q(2., input_min, input_max),
+          F2Q(-10., input_min, input_max),
+          F2Q(7., input_min, input_max),
+      },
+      input_min, input_max,  // input quantization range
+      2, 2,                  // filter height, filter width
+      2, 2,                  // stride height, stride width
+      {
+          // Output values: the window means are -0.25 and 0.75; the first is
+          // expected as 0 because the fused activation is ReLU.
+          F2Q(0., output_min, output_max),
+          F2Q(0.75, output_min, output_max),
+      },
+      {4, 1, 1, 2, 1},         // Output shape
+      output_min, output_max,  // output quantization range
+      kTfLitePaddingValid, kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/tools/docker/run_jupyter.sh b/tensorflow/lite/experimental/micro/tools/ci_build/test_x86.sh
similarity index 66%
copy from tensorflow/tools/docker/run_jupyter.sh
copy to tensorflow/lite/experimental/micro/tools/ci_build/test_x86.sh
index 2771aea..c0de765 100755
--- a/tensorflow/tools/docker/run_jupyter.sh
+++ b/tensorflow/lite/experimental/micro/tools/ci_build/test_x86.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+#
+# Tests the microcontroller code using native x86 execution.
 
+set -e
 
-jupyter notebook "$@"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="${SCRIPT_DIR}/../../../../../.."
+# Quote the path so a checkout directory containing spaces still works.
+cd "${ROOT_DIR}"
+pwd
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile test
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 2e7e1eb4..ed8ee6e 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -106,9 +106,11 @@
 tensorflow/lite/kernels/padding.h \
 tensorflow/lite/kernels/internal/common.h \
 tensorflow/lite/kernels/internal/compatibility.h \
+tensorflow/lite/kernels/internal/reference/conv.h \
 tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h \
 tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h \
 tensorflow/lite/kernels/internal/reference/fully_connected.h \
+tensorflow/lite/kernels/internal/reference/pooling.h \
 tensorflow/lite/kernels/internal/reference/softmax.h \
 tensorflow/lite/kernels/internal/round.h \
 tensorflow/lite/kernels/internal/tensor_ctypes.h \
@@ -185,6 +187,9 @@
 CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
 AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 
+# Load optimized kernel implementations
+include $(wildcard $(MAKEFILE_DIR)/ext_libs/*.inc)
+
 # Load the examples.
 include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc
new file mode 100644
index 0000000..779b77f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc
@@ -0,0 +1,27 @@
+ifneq ($(filter cmsis-nn,$(ALL_TAGS)),)
+    # Active only with the cmsis-nn tag. Enable u-arch specific behaviours
+    ifneq (,$(filter $(TARGET_ARCH), cortex-m3))
+        CCFLAGS += -DARM_MATH_CM3
+        CXXFLAGS += -DARM_CMSIS_NN_M3
+    endif
+    ifneq (,$(filter $(TARGET_ARCH), cortex-m4))
+        CCFLAGS += -DARM_MATH_CM4
+        CXXFLAGS += -DARM_CMSIS_NN_M4
+    endif
+    ifneq (,$(filter $(TARGET_ARCH), cortex-m7))
+        CCFLAGS += -DARM_MATH_CM7
+        CXXFLAGS += -DARM_CMSIS_NN_M7
+    endif
+    ifneq (,$(filter $(TARGET_ARCH), x86_64))
+        # For development purposes
+        CCFLAGS += -DARM_MATH_CM4
+        CXXFLAGS += -DARM_CMSIS_NN_X86_64
+    endif
+    # Setup CMSIS-NN lib and add required header files to microlite lib INCLUDE
+    CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/
+    # The -name pattern is quoted so the shell passes it to find unexpanded.
+    MICROLITE_CC_SRCS += $(shell find $(CMSIS_PATH)/CMSIS/NN/Source/ -name "*.c")
+    INCLUDES += -I$(CMSIS_PATH)/CMSIS/Core/Include \
+                -I$(CMSIS_PATH)/CMSIS/NN/Include \
+                -I$(CMSIS_PATH)/CMSIS/DSP/Include
+endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
index 3be2418..cbeeb51 100644
--- a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -115,7 +115,7 @@
 	python tensorflow/lite/experimental/micro/tools/make/transform_arduino_source.py \
         --third_party_headers="$(4)" < $$< > $$@
 
-$(PRJDIR)$(2)/arduino/src/%: %
+$(PRJDIR)$(2)/arduino/src/%: % third_party_downloads
 	@mkdir -p $$(dir $$@)
 	python tensorflow/lite/experimental/micro/tools/make/transform_arduino_source.py \
         --third_party_headers="$(4)" < $$< > $$@
@@ -125,13 +125,13 @@
 	python tensorflow/lite/experimental/micro/tools/make/transform_arduino_source.py \
         --third_party_headers="$(4)" < $$< > $$@
 
-$(PRJDIR)$(2)/arduino/src/third_party/flatbuffers/include/flatbuffers/base.h: tensorflow/lite/experimental/micro/tools/make/downloads/flatbuffers/include/flatbuffers/base.h
+$(PRJDIR)$(2)/arduino/src/third_party/flatbuffers/include/flatbuffers/base.h: tensorflow/lite/experimental/micro/tools/make/downloads/flatbuffers/include/flatbuffers/base.h third_party_downloads
 	@mkdir -p $$(dir $$@)
 	python tensorflow/lite/experimental/micro/tools/make/transform_arduino_source.py \
         --third_party_headers="$(4)" < $$< | \
         sed -E 's/utility\.h/utility/g' > $$@
 
-$(PRJDIR)$(2)/arduino/src/third_party/kissfft/kiss_fft.h: tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+$(PRJDIR)$(2)/arduino/src/third_party/kissfft/kiss_fft.h: tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h third_party_downloads
 	@mkdir -p $$(dir $$@)
 	python tensorflow/lite/experimental/micro/tools/make/transform_arduino_source.py \
         --third_party_headers="$(4)" < $$< | \
@@ -155,7 +155,10 @@
 
 generate_$(2)_arduino_project: $(addprefix $(PRJDIR)$(2)/arduino/src/,$(ARDUINO_SRCS) $(ARDUINO_HDRS)) $(addprefix $(PRJDIR)$(2)/arduino/,$(1)) $(PRJDIR)$(2)/arduino/examples/$(2)/$(2).ino $(PRJDIR)$(2)/arduino/src/TensorFlowLite.h
 
-ALL_PROJECT_TARGETS += generate_$(2)_arduino_project
+generate_$(2)_arduino_library_zip: generate_$(2)_arduino_project
+	cd $(PRJDIR)$(2) && cp -r arduino $(2) && zip -r $(2).zip $(2)
+
+ALL_PROJECT_TARGETS += generate_$(2)_arduino_library_zip
 endef
 
 # Specialized version of generate_project for TF Lite Micro test targets that
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c
index 766b7f2..d62433a 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c
@@ -22,7 +22,8 @@
   if (argc != 3) {
     fprintf(stderr,
             "%s requires exactly two parameters - the names of the header and "
-            "source files to save\n");
+            "source files to save\n",
+            argv[0]);
     return 1;
   }
   struct FrontendConfig frontend_config;
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
index b118ae4..e5f1805 100644
--- a/tensorflow/lite/experimental/objc/BUILD.apple
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -68,7 +68,7 @@
 
 ios_unit_test(
     name = "TensorFlowLiteTests",
-    size = "small",
+    size = "medium",
     minimum_os_version = TFL_MINIMUM_OS_VERSION,
     tags = TFL_DEFAULT_TAGS + TFL_DISABLED_SANITIZER_TAGS,
     deps = [
diff --git a/tensorflow/lite/experimental/objc/README.md b/tensorflow/lite/experimental/objc/README.md
index f735d41..e0788e6 100644
--- a/tensorflow/lite/experimental/objc/README.md
+++ b/tensorflow/lite/experimental/objc/README.md
@@ -5,7 +5,7 @@
 on-device machine learning models with a small binary size and fast performance
 supporting hardware acceleration.
 
-## Getting Started
+## Build TensorFlow with iOS support
 
 To build the Objective-C TensorFlow Lite library on Apple platforms,
 [install from source](https://www.tensorflow.org/install/source#setup_for_linux_and_macos)
@@ -19,9 +19,34 @@
 
 Follow the prompts and when asked to build TensorFlow with iOS support, enter `y`.
 
-### Bazel
+### CocoaPods developers
 
-In your `BUILD` file, add the `TensorFlowLite` dependency:
+Add the TensorFlow Lite pod to your `Podfile`:
+
+```ruby
+pod 'TensorFlowLiteObjC'
+```
+
+Then, run `pod install`.
+
+In your Objective-C files, import the umbrella header:
+
+```objectivec
+#import "TFLTensorFlowLite.h"
+```
+
+Or, the module if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project:
+
+```objectivec
+@import TFLTensorFlowLite;
+```
+
+Note: To import the TensorFlow Lite module in your Objective-C files, you must
+also include `use_frameworks!` in your `Podfile`.
+
+### Bazel developers
+
+In your `BUILD` file, add the `TensorFlowLite` dependency to your target:
 
 ```python
 objc_library(
@@ -37,6 +62,12 @@
 #import "TFLTensorFlowLite.h"
 ```
 
+Or, the module if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project:
+
+```objectivec
+@import TFLTensorFlowLite;
+```
+
 Build the `TensorFlowLite` Objective-C library target:
 
 ```shell
@@ -49,36 +80,14 @@
 bazel test tensorflow/lite/experimental/objc:TensorFlowLiteTests
 ```
 
-### Tulsi
+#### Generate the Xcode project using Tulsi
 
-Open the `TensorFlowLite.tulsiproj` using the
-[TulsiApp](https://github.com/bazelbuild/tulsi) or by running the
+Open the `//tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj` using
+the [TulsiApp](https://github.com/bazelbuild/tulsi)
+or by running the
 [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
 script from the root `tensorflow` directory:
 
 ```shell
 generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
 ```
-
-### CocoaPods
-
-Add the following to your `Podfile`:
-
-```ruby
-pod 'TensorFlowLiteObjC'
-```
-
-Then, run `pod install`.
-
-In your Objective-C files, import the umbrella header:
-
-```objectivec
-#import "TFLTensorFlowLite.h"
-```
-
-Or, the module if `CLANG_ENABLE_MODULES = YES` and `use_frameworks!` is
-specified in your `Podfile`:
-
-```objectivec
-@import TFLTensorFlowLite;
-```
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
index b1dba62..8698df3 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
@@ -1,12 +1,10 @@
-# Run `pod lib lint TensorFlowLiteObjC.podspec` to ensure this is a valid spec.
-
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteObjC'
-  s.version          = '0.1.0'
+  s.version          = '0.2.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '2b96dde' }
+  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '37c101d' }
   s.summary          = 'TensorFlow Lite for Objective-C'
   s.description      = <<-DESC
 
diff --git a/tensorflow/lite/experimental/objc/apis/TFLTensor.h b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
index dc710ab..fd781bd 100644
--- a/tensorflow/lite/experimental/objc/apis/TFLTensor.h
+++ b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
@@ -29,6 +29,9 @@
   /** 32-bit single precision floating point. */
   TFLTensorDataTypeFloat32,
 
+  /** 16-bit half precision floating point. */
+  TFLTensorDataTypeFloat16,
+
   /** 32-bit signed integer. */
   TFLTensorDataTypeInt32,
 
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
index cf5a6b4..1c8b7f9 100644
--- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -366,6 +366,8 @@
   switch (cTensorType) {
     case kTfLiteFloat32:
       return TFLTensorDataTypeFloat32;
+    case kTfLiteFloat16:
+      return TFLTensorDataTypeFloat16;
     case kTfLiteInt32:
       return TFLTensorDataTypeInt32;
     case kTfLiteUInt8:
diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD
index 521b6d1..0b9443a 100644
--- a/tensorflow/lite/experimental/ruy/BUILD
+++ b/tensorflow/lite/experimental/ruy/BUILD
@@ -8,6 +8,8 @@
     default_visibility = ["//visibility:private"],
 )
 
+load(":ruy_visibility.bzl", "ruy_visibility")
+
 cc_library(
     name = "check_macros",
     hdrs = ["check_macros.h"],
@@ -127,6 +129,7 @@
     hdrs = [
         "thread_pool.h",
     ],
+    visibility = ruy_visibility(),
     deps = [
         ":blocking_counter",
         ":check_macros",
@@ -174,6 +177,7 @@
     hdrs = [
         "context.h",
     ],
+    visibility = ruy_visibility(),
     deps = [
         ":allocator",
         ":check_macros",
@@ -192,6 +196,17 @@
 )
 
 cc_library(
+    name = "internal_matrix",
+    hdrs = ["internal_matrix.h"],
+    deps = [
+        ":check_macros",
+        ":common",
+        ":matrix",
+        ":size_util",
+    ],
+)
+
+cc_library(
     name = "common",
     hdrs = [
         "common.h",
@@ -201,7 +216,6 @@
         ":matrix",
         ":opt_set",
         ":path",
-        ":size_util",
     ],
 )
 
@@ -215,7 +229,7 @@
     ],
     deps = [
         ":common",
-        ":matrix",
+        ":internal_matrix",
         ":opt_set",
         ":path",
         ":size_util",
@@ -236,7 +250,7 @@
     ],
     deps = [
         ":common",
-        ":matrix",
+        ":internal_matrix",
         ":opt_set",
         ":path",
         ":spec",
@@ -245,8 +259,6 @@
     ],
 )
 
-load(":ruy_visibility.bzl", "ruy_visibility")
-
 # The main library.
 cc_library(
     name = "ruy",
@@ -254,7 +266,11 @@
         "dispatch.h",
         "impl.h",
     ],
-    hdrs = ["ruy.h"],
+    hdrs = [
+        "matrix.h",
+        "path.h",
+        "ruy.h",
+    ],
     visibility = ruy_visibility(),
     deps = [
         ":allocator",
@@ -263,10 +279,8 @@
         ":common",
         ":context",
         ":kernel",
-        ":matrix",
         ":opt_set",
         ":pack",
-        ":path",
         ":size_util",
         ":spec",
         ":thread_pool",
@@ -291,6 +305,7 @@
     testonly = True,
     srcs = ["pmu.cc"],
     hdrs = ["pmu.h"],
+    deps = [":check_macros"],
 )
 
 load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
@@ -325,6 +340,7 @@
         ("i8", "i8", "i32", "u8"),
         ("i8", "i8", "i32", "i8"),
         ("u8", "u8", "i32", "i16"),
+        ("i8", "i8", "i32", "i32"),
     ],
 )
 
@@ -339,6 +355,7 @@
         ("i8", "i8", "i32", "i8"),
         ("i8", "u8", "i32", "i8"),
         ("u8", "u8", "i32", "i16"),
+        ("i8", "i8", "i32", "i32"),
     ],
 )
 
@@ -350,6 +367,7 @@
         ("u8", "u8", "i32", "u8"),
         ("i8", "i8", "i32", "i8"),
         ("u8", "u8", "i32", "i16"),
+        ("i8", "i8", "i32", "i32"),
     ],
 )
 
@@ -384,3 +402,7 @@
         "7ff",
     ],
 )
+
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/experimental/ruy/README.md b/tensorflow/lite/experimental/ruy/README.md
index 9954eac..87d0ab1 100644
--- a/tensorflow/lite/experimental/ruy/README.md
+++ b/tensorflow/lite/experimental/ruy/README.md
@@ -28,12 +28,3 @@
 ruy is currently optimized only for the following combination of storage orders:
 LHS = row-major, RHS = column-major, destination = column-major. All other
 combinations of storage orders fall back to slow reference code at the moment.
-
-With these caveats out of the way, here are benchmark results:
-
-*   [float GEMM benchmark](https://docs.google.com/spreadsheets/d/1-k5KYWutjE6Qr7RiRZxb6v8YvSqR1BNsUAfLZ31oWJ8/edit?usp=sharing)
-    against Eigen and OpenBLAS.
-*   [8bit GEMM benchmark](https://docs.google.com/spreadsheets/d/1_6-OlhpUJepwsMiVubeKTKBgcM6P9_ugTFwd5HANBZQ/edit?usp=sharing)
-    against gemmlowp.
-*   [TFLite integrated benchmark](https://docs.google.com/spreadsheets/d/1AjpjRnNViBoyFEwQdHSEhMpE_XiRTRkIFh-L_0vFRNI/edit?usp=sharing)
-    on vision CNNs.
diff --git a/tensorflow/lite/experimental/ruy/allocator.h b/tensorflow/lite/experimental/ruy/allocator.h
index 789731a..ef1db4d 100644
--- a/tensorflow/lite/experimental/ruy/allocator.h
+++ b/tensorflow/lite/experimental/ruy/allocator.h
@@ -64,8 +64,9 @@
   //    be queried cheaply, at runtime, from userspace, if needed.
   static constexpr std::size_t kAlignment = 64;
 
+  void operator=(const AlignedAllocator&) = delete;
   ~AlignedAllocator() {
-    RUY_DCHECK(fallback_blocks_.empty());
+    FreeAll();
     SystemAlignedFree(ptr_);
   }
 
@@ -146,12 +147,17 @@
 // typed buffer.
 class Allocator {
  public:
+  void* AllocateBytes(std::size_t num_bytes) {
+    if (num_bytes == 0) {
+      return nullptr;
+    }
+    return aligned.AllocateAlignedBytes(
+        round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
+  }
   template <typename Pointer>
   void Allocate(std::size_t count, Pointer* out) {
     using T = typename std::pointer_traits<Pointer>::element_type;
-    std::size_t num_bytes =
-        round_up_pot(count * sizeof(T), detail::AlignedAllocator::kAlignment);
-    *out = static_cast<T*>(aligned.AllocateAlignedBytes(num_bytes));
+    *out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
   }
 
   void FreeAll() { aligned.FreeAll(); }
diff --git a/tensorflow/lite/experimental/ruy/allocator_test.cc b/tensorflow/lite/experimental/ruy/allocator_test.cc
index 44848fa..7006b0d 100644
--- a/tensorflow/lite/experimental/ruy/allocator_test.cc
+++ b/tensorflow/lite/experimental/ruy/allocator_test.cc
@@ -72,6 +72,30 @@
   }
 }
 
+TEST(AllocatorTest, DestructorHandlesMainBumpPtr) {
+  // This is a white-box test.
+  Allocator allocator;
+  allocator.AllocateBytes(1);
+  allocator.FreeAll();
+  // After the call to FreeAll, the allocator will consolidate all of the memory
+  // into the main bump-ptr allocator's block, which we then expect to be freed
+  // in the destructor.
+  //
+  // We have no test assertions -- if the block is not freed, we expect a leak
+  // checker to detect that and cause the test to fail.
+}
+
+TEST(AllocatorTest, DestructorHandlesFallbackBlocks) {
+  // This is a white-box test.
+  Allocator allocator;
+  // Since we just created the allocator, this will allocate a fallback block,
+  // which we then expect to be freed in the destructor.
+  //
+  // We have no test assertions -- if that block is not freed, we expect a leak
+  // checker to detect that and cause the test to fail.
+  allocator.AllocateBytes(1);
+}
+
 }  // namespace
 }  // namespace ruy
 
diff --git a/tensorflow/lite/experimental/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/benchmark.cc
index ccf7f5d..55b02d2 100644
--- a/tensorflow/lite/experimental/ruy/benchmark.cc
+++ b/tensorflow/lite/experimental/ruy/benchmark.cc
@@ -36,8 +36,7 @@
 };
 
 template <typename TestSetType>
-std::vector<TestResult<DstScalar>> BenchmarkPackedLinearRCC(
-    const BenchmarkShape& shape) {
+std::vector<TestResult<DstScalar>> BenchmarkRCC(const BenchmarkShape& shape) {
   TestSetType test_set;
   test_set.rows = shape.rows;
   test_set.depth = shape.depth;
@@ -104,7 +103,7 @@
 
   for (int i = 0; i < shapes.size(); i++) {
     const auto& shape = shapes[i];
-    const auto& results = BenchmarkPackedLinearRCC<TestSetType>(shape);
+    const auto& results = BenchmarkRCC<TestSetType>(shape);
     if (i == 0) {
       if (benchmark_cubic) {
         printf("size");
diff --git a/tensorflow/lite/experimental/ruy/common.h b/tensorflow/lite/experimental/ruy/common.h
index 53ebbe9..3f6e8ac 100644
--- a/tensorflow/lite/experimental/ruy/common.h
+++ b/tensorflow/lite/experimental/ruy/common.h
@@ -21,13 +21,11 @@
 #include <atomic>
 #include <limits>
 #include <type_traits>
-#include <utility>
 
 #include "tensorflow/lite/experimental/ruy/check_macros.h"
 #include "tensorflow/lite/experimental/ruy/matrix.h"
 #include "tensorflow/lite/experimental/ruy/opt_set.h"
 #include "tensorflow/lite/experimental/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/size_util.h"
 
 #ifdef __aarch64__
 #include <arm_neon.h>
@@ -44,111 +42,17 @@
 
 namespace ruy {
 
-inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) {
-  layout->rows = rows;
-  layout->cols = cols;
-  layout->order = order;
-  layout->stride = order == Order::kColMajor ? rows : cols;
-  layout->kernel.order = order;
-  layout->kernel.rows = 1;
-  layout->kernel.cols = 1;
-}
-
-inline bool IsLinear(const Layout& layout) {
-  return layout.kernel.rows == 1 && layout.kernel.cols == 1;
-}
-
-inline bool IsPacked(const Layout& layout) {
-  if (layout.order == Order::kColMajor) {
-    return layout.stride == layout.rows;
-  } else {
-    return layout.stride == layout.cols;
-  }
-}
-
-inline bool IsPackedLinear(const Layout& layout) {
-  return IsPacked(layout) && IsLinear(layout);
-}
-
-inline bool IsRowMajor(const Layout& layout) {
-  return layout.order == Order::kRowMajor;
-}
-
-inline bool IsColMajor(const Layout& layout) {
-  return layout.order == Order::kColMajor;
-}
-
-inline bool IsLinearColMajor(const Layout& layout) {
-  return IsLinear(layout) && IsColMajor(layout);
-}
-
-inline bool IsPackedLinearColMajor(const Layout& layout) {
-  return IsLinearColMajor(layout) && IsPacked(layout);
-}
-
-inline bool IsLinearRowMajor(const Layout& layout) {
-  return IsLinear(layout) && IsRowMajor(layout);
-}
-
-inline bool IsPackedLinearRowMajor(const Layout& layout) {
-  return IsLinearRowMajor(layout) && IsPacked(layout);
-}
-
-inline int FlatSize(const Layout& layout) {
-  const int outerdim =
-      layout.order == Order::kColMajor ? layout.cols : layout.rows;
-  return layout.stride * outerdim;
-}
-
-// TODO(b/130417400) add a unit test
-inline int Offset(const Layout& layout, int row, int col) {
-  // TODO(benoitjacob)  - should check this but this make the _slow tests take
-  // 5x longer.  Find a mitigation like in Eigen with an 'internal' variant
-  // bypassing the check?
-  // RUY_DCHECK_GE(row, 0);
-  // RUY_DCHECK_GE(col, 0);
-  // RUY_DCHECK_LT(row, layout.rows);
-  // RUY_DCHECK_LT(col, layout.cols);
-  if (IsLinear(layout)) {
-    int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
-    int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
-    return row * row_stride + col * col_stride;
-  } else {
-    RUY_DCHECK(is_pot(layout.kernel.rows));
-    RUY_DCHECK(is_pot(layout.kernel.cols));
-    int row_outer = row & ~(layout.kernel.rows - 1);
-    int col_outer = col & ~(layout.kernel.cols - 1);
-    int row_stride_outer =
-        layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
-    int col_stride_outer =
-        layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
-    int offset_outer =
-        row_outer * row_stride_outer + col_outer * col_stride_outer;
-    int row_inner = row - row_outer;
-    int col_inner = col - col_outer;
-    int row_stride_inner =
-        layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
-    int col_stride_inner =
-        layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
-    int offset_inner =
-        row_inner * row_stride_inner + col_inner * col_stride_inner;
-    return offset_outer + offset_inner;
-  }
-}
-
-template <typename Scalar>
-const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) {
-  return mat.data.get() + Offset(mat.layout, row, col);
-}
-
-template <typename Scalar>
-Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) {
-  return mat->data.get() + Offset(mat->layout, row, col);
-}
-
-template <typename Scalar>
-Scalar Element(const Matrix<Scalar>& mat, int row, int col) {
-  return *ElementPtr(mat, row, col);
+// Helper for type-erasing a pointer.
+//
+// Often inside Ruy, a template parameter holds type information statically, but
+// we would like to have a function signature that doesn't depend on the
+// template parameters, so that we can dispatch indirectly across multiple
+// implementations. This helper is at the core of such type-erasure.
+//
+// The opposite of this operation is just `static_cast<T*>(void_ptr)`.
+template <typename T>
+void* ToVoidPtr(T* p) {
+  return const_cast<void*>(static_cast<const void*>(p));
 }
 
 // We need this where we have multiple threads potentially writing concurrently
@@ -176,33 +80,6 @@
   return std::numeric_limits<Scalar>::max() / 2 + 1;
 }
 
-template <Path ThePath, typename LhsScalar, typename RhsScalar,
-          typename DstScalar, typename Spec>
-struct TrMulImpl;
-
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
-inline void Transpose(Order* order) {
-  *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
-}
-
-inline void Transpose(Layout* layout) {
-  Transpose(&layout->order);
-  Transpose(&layout->kernel.order);
-  std::swap(layout->rows, layout->cols);
-  std::swap(layout->kernel.rows, layout->kernel.cols);
-}
-
-template <typename Scalar>
-inline void Transpose(Matrix<Scalar>* matrix) {
-  Transpose(&matrix->layout);
-}
-
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_COMMON_H_
diff --git a/tensorflow/lite/experimental/ruy/context.h b/tensorflow/lite/experimental/ruy/context.h
index 7f2c271..48b02f8 100644
--- a/tensorflow/lite/experimental/ruy/context.h
+++ b/tensorflow/lite/experimental/ruy/context.h
@@ -45,6 +45,7 @@
 struct Context final {
   Path last_taken_path = Path::kNone;
   Tuning explicit_tuning = Tuning::kAuto;
+  // TODO(benoitjacob) rename that thread_pool. Current name is gemmlowp legacy.
   ThreadPool workers_pool;
   int max_num_threads = 1;
   // State for each thread in the thread pool. Entry 0 is the main thread.
@@ -64,6 +65,13 @@
     }
   }
 
+  template <Path CompiledPaths>
+  Path GetPathToTake() {
+    last_taken_path =
+        GetMostSignificantPath(CompiledPaths & GetRuntimeEnabledPaths());
+    return last_taken_path;
+  }
+
   void SetRuntimeEnabledPaths(Path paths);
   Path GetRuntimeEnabledPaths();
 
diff --git a/tensorflow/lite/experimental/ruy/detect_dotprod.cc b/tensorflow/lite/experimental/ruy/detect_dotprod.cc
index d587d5b..fdafd19 100644
--- a/tensorflow/lite/experimental/ruy/detect_dotprod.cc
+++ b/tensorflow/lite/experimental/ruy/detect_dotprod.cc
@@ -19,7 +19,9 @@
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <unistd.h>
+
 #include <mutex>
 
 #endif
diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h
index 3386e14..4d27b2f 100644
--- a/tensorflow/lite/experimental/ruy/dispatch.h
+++ b/tensorflow/lite/experimental/ruy/dispatch.h
@@ -37,6 +37,8 @@
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_DISPATCH_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_DISPATCH_H_
 
+#include <limits>
+
 #include "profiling/instrumentation.h"
 #include "tensorflow/lite/experimental/ruy/common.h"
 #include "tensorflow/lite/experimental/ruy/context.h"
@@ -52,10 +54,10 @@
 template <typename Spec>
 void EnforceLayoutSupport(const Layout& lhs_layout, const Layout& rhs_layout,
                           const Layout& dst_layout) {
-  if (Spec::kLayoutSupport == LayoutSupport::kPackedLinearRCC) {
-    RUY_DCHECK(IsPackedLinearRowMajor(lhs_layout));
-    RUY_DCHECK(IsPackedLinearColMajor(rhs_layout));
-    RUY_DCHECK(IsPackedLinearColMajor(dst_layout));
+  if (Spec::kLayoutSupport == LayoutSupport::kRCC) {
+    RUY_DCHECK(IsRowMajor(lhs_layout));
+    RUY_DCHECK(IsColMajor(rhs_layout));
+    RUY_DCHECK(IsColMajor(dst_layout));
   }
 }
 
@@ -72,33 +74,161 @@
   }
 }
 
-// If the Spec's ZeroPointSupport covers only some special cases,
-// this function enforces that the matrix multiplication at hand falls into
-// that special case.
 template <typename Spec, typename LhsScalar, typename RhsScalar,
           typename DstScalar>
 void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point,
                              DstScalar dst_zero_point) {
+  // If the Spec's ZeroPointSupport covers only some special cases,
+  // this function enforces that the matrix multiplication at hand falls into
+  // that special case.
   CheckZeroPoint<Spec>(lhs_zero_point);
   CheckZeroPoint<Spec>(rhs_zero_point);
   CheckZeroPoint<Spec>(dst_zero_point);
+
+  // Guard against the case when both LHS and RHS zero_point's are equal to
+  // the minimum representable value. In that case, padding with zero_point
+  // values will generate the bad case for fast int8 kernels on NEON
+  // (pre-dotprod) which attempt to multiply-accumulate two pairs of int8
+  // into a int16:  this is safe except in the bad case -128*-128 + -128*-128.
+  // See b/131609283. This only affects the kNeon path but we ban this for all
+  // paths in order for ruy to have the same supported parameter space
+  // on all paths.
+  RUY_DCHECK(lhs_zero_point != std::numeric_limits<LhsScalar>::lowest() ||
+             rhs_zero_point != std::numeric_limits<RhsScalar>::lowest());
 }
 
-// GetTrMulImplRunFn is implemented with template metaprogramming by mutual
-// recursion between PathSearchCountdown and PathSearchCompiledPaths.
+template <typename Spec, typename DstScalar>
+void EnforceDstSpecSupport(const Spec& spec, DstScalar dst_zero_point) {
+  if (!std::is_same<typename Spec::DstScalar, std::int32_t>::value) return;
+
+  // If user is looking for the raw accumulator, zero_point and all the other
+  // dequantize fields don't make sense and should not be set.
+  RUY_DCHECK(dst_zero_point == 0);
+  RUY_DCHECK(spec.clamp_max == std::numeric_limits<std::int32_t>::max());
+  RUY_DCHECK(spec.clamp_min == std::numeric_limits<std::int32_t>::min());
+  RUY_DCHECK(spec.multiplier_fixedpoint == 0);
+  RUY_DCHECK(spec.multiplier_exponent == 0);
+  RUY_DCHECK(spec.multiplier_fixedpoint_perchannel == nullptr);
+  RUY_DCHECK(spec.multiplier_exponent_perchannel == nullptr);
+}
+
+inline bool IsColMajorTrMul(const DMatrix& lhs, const DMatrix& rhs,
+                            const DMatrix& dst) {
+  return IsColMajor(lhs.layout) && IsColMajor(rhs.layout) &&
+         IsColMajor(dst.layout);
+}
+
+inline void CreatePackedLayout(const Layout& src, const Type& scalar,
+                               const KernelLayout& kernel_layout,
+                               PackedLayout* packed) {
+  packed->order = Order::kColMajor;
+  packed->rows = round_up_pot(src.rows, kernel_layout.rows);
+  packed->cols = round_up_pot(src.cols, kernel_layout.cols);
+  packed->kernel = kernel_layout;
+  int inner_size = packed->rows;
+  if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) {
+    packed->stride =
+        (inner_size * scalar.size) % 1024 ? inner_size : inner_size + 64;
+  } else {
+    packed->stride = inner_size;
+  }
+}
+
+template <typename Scalar, typename PackedScalar>
+void CreatePackedMatrix(const DMatrix& src, const KernelLayout& kernel_layout,
+                        PMatrix* packed) {
+  // Ruy always uses 32-bit signed accumulators for quantized
+  // matrix multiplication, so we would like to always use std::int32_t
+  // unconditionally for SumsType.
+  // However, for floating point types, we still need a reasonable type here to
+  // avoid tripping assertions elsewhere in the code.
+  using SumsType =
+      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
+                                std::int32_t>::type;
+
+  packed->data_type = Type::Create<PackedScalar>();
+  packed->sums_type = Type::Create<SumsType>();
+  CreatePackedLayout(src.layout, packed->data_type, kernel_layout,
+                     &packed->layout);
+  packed->zero_point = Pack<PackedScalar, Scalar>(src.zero_point);
+}
+
+template <Path ThePath, typename LhsScalar, typename RhsScalar,
+          typename DstScalar, typename Spec>
+void PopulateTrMulParams(TrMulParams* params) {
+  static_assert((ThePath & Path::kReference) == Path::kNone,
+                "Path::kReference should not do TrMul");
+  // The optimized code paths don't handle the full generality of Ruy's API.
+  // Fall back to Path::kStandardCpp if necessary.
+  bool fallback_to_standard_cpp = false;
+  if (ThePath != Path::kStandardCpp) {
+    // The optimized code paths currently only handle the case of all matrices
+    // being column major.
+    if (!IsColMajorTrMul(params->lhs, params->rhs, params->dst)) {
+      fallback_to_standard_cpp = true;
+    }
+
+    // If DstScalar is std::int32_t, the user wants the raw accumulator
+    // results directly. In that case, any path other than Path::kNeon falls
+    // back to Path::kStandardCpp.
+    if (std::is_same<DstScalar, std::int32_t>::value &&
+        ThePath != Path::kNeon) {
+      fallback_to_standard_cpp = true;
+    }
+  }
+
+  if (fallback_to_standard_cpp) {
+    PopulateTrMulParams<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar,
+                        Spec>(params);
+    return;
+  }
+
+  using PackedLhsScalar = PackedType<ThePath, LhsScalar>;
+  using PackedRhsScalar = PackedType<ThePath, RhsScalar>;
+  using Kernel =
+      Kernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+  using LhsKernelLayout = typename Kernel::LhsLayout;
+  using RhsKernelLayout = typename Kernel::RhsLayout;
+
+  CreatePackedMatrix<LhsScalar, PackedLhsScalar>(
+      params->lhs, ToKernelLayout<LhsKernelLayout>(), &params->packed_lhs);
+  CreatePackedMatrix<RhsScalar, PackedRhsScalar>(
+      params->rhs, ToKernelLayout<RhsKernelLayout>(), &params->packed_rhs);
+
+  params->lhs_run_pack =
+      &RunPack<ThePath, LhsKernelLayout, LhsScalar, PackedLhsScalar>;
+  params->rhs_run_pack =
+      &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
+  params->run_kernel =
+      &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+  return;
+}
+
+// PopulateTrMulParamsAllCompiledPaths calls into one of multiple
+// instantiations of PopulateTrMulParams. For each bit that is set in
+// CompiledPaths, it statically instantiates PopulateTrMulParams with a Path
+// corresponding to that single bit. The call to PopulateTrMulParams is
+// guarded by a runtime check that it is in fact the dynamically selected path.
 //
-// GetTrMulImplRunFn is logically implementing the following computation:
+// PopulateTrMulParamsAllCompiledPaths is implemented with template
+// metaprogramming by mutual recursion between PathSearchCountdown and
+// PathSearchCompiledPaths.
 //
-// decltype(&TrMulImpl<...>::Run) GetTrMulImplRunFn(Path single_path) {
+// PopulateTrMulParamsAllCompiledPaths is logically implementing the following
+// computation:
+//
+// template <Path CompiledPaths>
+// void PopulateTrMulParamsAllCompiledPaths(Path the_path,
+//                                            TrMulParams* params) {
 //   for (int bit = 8 * sizeof(Path) - 1; bit != -1; bit--) { // [1]
 //     Path current_path = static_cast<Path>(1 << bit);
 //     if ((CompiledPaths & current_path) != Path::kNone) { // [2]
-//       if (current_path == single_path) { // [3]
-//         return &TrMulImpl<current_path, ...>::Run;
+//       if (current_path == the_path) { // [3]
+//         PopulateTrMulParams<current_path, ...>(params);
+//         return;
 //       }
 //     }
 //   }
-//   return nullptr; // [4]
 // }
 //
 //
@@ -110,15 +240,13 @@
 // doing the whole computation at C++ compile time.
 // [3] - Done by the `if` in the main definition of
 // PathSearchOnlyCompiledPaths.
-// [4] - Done by the partial specialization of PathSearchCountdown.
 //
 // The template metaprogramming is necessary because:
-// - In `TrMulImpl<current_path, ...>::Run`, current_path must be a C++
+// - In `PopulateTrMulParams<current_path, ...>`, current_path must be a C++
 // compile-time constant.
-// - GetTrMulImplRunFn must not instantiate
-// `TrMulImpl<curent_path, ...>::Run` for paths that are not in
-// CompiledPaths, since that can result in bogus instantiations which cause
-// a compile time failure.
+// - PopulateTrMulParamsAllCompiledPaths must not instantiate
+// inner loops for paths that are not in CompiledPaths, since that can result in
+// bogus instantiations which cause a compile time failure.
 template <Path CompiledPaths, int BitNumber, typename LhsScalar,
           typename RhsScalar, typename DstScalar, typename Spec>
 struct PathSearchCountdown;
@@ -128,29 +256,25 @@
           typename Spec>
 struct PathSearchOnlyCompiledPaths {
   static constexpr Path kCurrentPath = static_cast<Path>(1 << BitNumber);
-  static decltype(
-      &TrMulImpl<Path::kNone, LhsScalar, RhsScalar, DstScalar, Spec>::Run)
-  Search(Path single_path) {
-    if (kCurrentPath == single_path) {
-      return &TrMulImpl<kCurrentPath, LhsScalar, RhsScalar, DstScalar,
-                        Spec>::Run;
+  static void Search(Path the_path, TrMulParams* params) {
+    if (kCurrentPath == the_path) {
+      PopulateTrMulParams<kCurrentPath, LhsScalar, RhsScalar, DstScalar, Spec>(
+          params);
+      return;
     }
-    return PathSearchCountdown<CompiledPaths, BitNumber - 1, LhsScalar,
-                               RhsScalar, DstScalar, Spec>::Search(single_path);
+    PathSearchCountdown<CompiledPaths, BitNumber - 1, LhsScalar, RhsScalar,
+                        DstScalar, Spec>::Search(the_path, params);
   }
 };
 
-// Skip instantiating TrMulImpl if CompiledPaths doesn't contain the
-// specified path.
+// Skip this iteration if CompiledPaths doesn't contain the specified path.
 template <Path CompiledPaths, int BitNumber, typename LhsScalar,
           typename RhsScalar, typename DstScalar, typename Spec>
 struct PathSearchOnlyCompiledPaths<CompiledPaths, false, BitNumber, LhsScalar,
                                    RhsScalar, DstScalar, Spec> {
-  static decltype(
-      &TrMulImpl<Path::kNone, LhsScalar, RhsScalar, DstScalar, Spec>::Run)
-  Search(Path single_path) {
-    return PathSearchCountdown<CompiledPaths, BitNumber - 1, LhsScalar,
-                               RhsScalar, DstScalar, Spec>::Search(single_path);
+  static void Search(Path the_path, TrMulParams* params) {
+    PathSearchCountdown<CompiledPaths, BitNumber - 1, LhsScalar, RhsScalar,
+                        DstScalar, Spec>::Search(the_path, params);
   }
 };
 
@@ -158,12 +282,10 @@
           typename RhsScalar, typename DstScalar, typename Spec>
 struct PathSearchCountdown {
   static constexpr Path kCurrentPath = static_cast<Path>(1 << BitNumber);
-  static decltype(
-      &TrMulImpl<Path::kNone, LhsScalar, RhsScalar, DstScalar, Spec>::Run)
-  Search(Path single_path) {
-    return PathSearchOnlyCompiledPaths<
+  static void Search(Path the_path, TrMulParams* params) {
+    PathSearchOnlyCompiledPaths<
         CompiledPaths, (CompiledPaths & kCurrentPath) != Path::kNone, BitNumber,
-        LhsScalar, RhsScalar, DstScalar, Spec>::Search(single_path);
+        LhsScalar, RhsScalar, DstScalar, Spec>::Search(the_path, params);
   }
 };
 
@@ -173,48 +295,133 @@
           typename DstScalar, typename Spec>
 struct PathSearchCountdown<CompiledPaths, -1, LhsScalar, RhsScalar, DstScalar,
                            Spec> {
-  static decltype(
-      &TrMulImpl<Path::kNone, LhsScalar, RhsScalar, DstScalar, Spec>::Run)
-  Search(Path single_path) {
-    return nullptr;
-  }
+  static void Search(Path the_path, TrMulParams* params) { RUY_DCHECK(false); }
 };
 
 template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
           typename DstScalar, typename Spec>
-decltype(&TrMulImpl<Path::kNone, LhsScalar, RhsScalar, DstScalar, Spec>::Run)
-GetTrMulImplRunFn(Path single_path) {
+void PopulateTrMulParamsAllCompiledPaths(Path the_path, TrMulParams* params) {
   return PathSearchCountdown<CompiledPaths, 8 * sizeof(Path) - 1, LhsScalar,
-                             RhsScalar, DstScalar, Spec>::Search(single_path);
+                             RhsScalar, DstScalar, Spec>::Search(the_path,
+                                                                 params);
+}
+
+template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
+          typename DstScalar, typename Spec>
+void CreateTrMulParams(const Matrix<LhsScalar>& lhs,
+                       const Matrix<RhsScalar>& rhs, const Spec& spec,
+                       Context* context, Matrix<DstScalar>* dst, Path the_path,
+                       TrMulParams* params) {
+  // Fill in the fields we already know.
+  params->lhs = ToDMatrix(lhs);
+  params->rhs = ToDMatrix(rhs);
+  params->dst = ToDMatrix(*dst);
+  params->spec = ToVoidPtr(&spec);
+
+  // Create inner loops and packed matrices based on the Path.
+  PopulateTrMulParamsAllCompiledPaths<CompiledPaths, LhsScalar, RhsScalar,
+                                      DstScalar, Spec>(the_path, params);
+}
+
+template <typename LhsScalar, typename RhsScalar, typename DstScalar,
+          typename Spec>
+void ReferenceMul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
+                  const Spec& spec, Matrix<DstScalar>* dst) {
+  gemmlowp::ScopedProfilingLabel label("ReferenceMul");
+  for (int i = 0; i < lhs.layout.rows; i++) {
+    for (int j = 0; j < rhs.layout.cols; j++) {
+      using AccumScalar = typename Spec::AccumScalar;
+      AccumScalar accum = 0;
+      for (int k = 0; k < lhs.layout.cols; k++) {
+        AccumScalar lhs_val = Element(lhs, i, k);
+        AccumScalar rhs_val = Element(rhs, k, j);
+        accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point);
+      }
+      if (spec.bias) {
+        accum += spec.bias[i];
+      }
+      ApplyMultiplier(spec, i, &accum);
+      accum += dst->zero_point;
+      accum = std::min<AccumScalar>(accum, spec.clamp_max);
+      accum = std::max<AccumScalar>(accum, spec.clamp_min);
+      *ElementPtr(dst, i, j) = static_cast<DstScalar>(accum);
+    }
+  }
+}
+
+// Compile-time dispatch to ReferenceMul. This allows us to statically ensure
+// that there is no call to ReferenceMul in the user's binary.
+template <bool ReferenceMulIsEnabled>
+struct CompileTimeEnabledReferenceMul {
+  template <typename LhsScalar, typename RhsScalar, typename DstScalar,
+            typename Spec>
+  static void Run(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
+                  const Spec& spec, Matrix<DstScalar>* dst) {
+    ReferenceMul(lhs, rhs, spec, dst);
+  }
+};
+
+// When this explicit specialization is chosen, it ensures that ReferenceMul
+// is never compiled.
+template <>
+struct CompileTimeEnabledReferenceMul</*ReferenceMulIsEnabled=*/false> {
+  template <typename LhsScalar, typename RhsScalar, typename DstScalar,
+            typename Spec>
+  static void Run(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
+                  const Spec& spec, Matrix<DstScalar>* dst) {
+    RUY_DCHECK(false);
+  }
 };
 
 template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
           typename DstScalar, typename Spec>
-struct MulDispatch {
-  void Mul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
-           const Spec& spec, Context* context, Matrix<DstScalar>* dst) {
-    gemmlowp::ScopedProfilingLabel label("Mul");
+void DispatchMul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
+                 const Spec& spec, Context* context, Matrix<DstScalar>* dst) {
+  static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path");
+  static_assert((CompiledPaths & ~kAllPaths) == Path::kNone,
+                "CompiledPaths must be a subset of ruy::kAllPaths");
 
-    const Path runtime_enabled_paths = context->GetRuntimeEnabledPaths();
-    // The above query should resolve to specific paths, never return kNone.
-    RUY_DCHECK(runtime_enabled_paths != Path::kNone);
+  gemmlowp::ScopedProfilingLabel label("Mul");
 
-    Path single_path =
-        GetMostSignificantPath(CompiledPaths & runtime_enabled_paths);
-    auto tr_mul_impl_run_fn =
-        GetTrMulImplRunFn<CompiledPaths, LhsScalar, RhsScalar, DstScalar, Spec>(
-            single_path);
-    context->last_taken_path = single_path;
+  EnforceLayoutSupport<Spec>(lhs.layout, rhs.layout, dst->layout);
+  EnforceZeroPointSupport<Spec>(lhs.zero_point, rhs.zero_point,
+                                dst->zero_point);
+  EnforceDstSpecSupport<Spec>(spec, dst->zero_point);
 
-    EnforceLayoutSupport<Spec>(lhs.layout, rhs.layout, dst->layout);
-    EnforceZeroPointSupport<Spec>(lhs.zero_point, rhs.zero_point,
-                                  dst->zero_point);
+  // This should be a constant, for a given machine and CompiledPaths.
+  // There is a back door to override it for testing, but in production it will
+  // always be the "best" Path. I.e. the one with the newest SIMD instructions
+  // available on the present machine, and avoiding Path::kReference unless
+  // no other path is compiled.
+  //
+  // Unfortunately, it is not a *static* constant, since it depends on runtime
+  // detection of the available SIMD instructions.
+  Path the_path = context->GetPathToTake<CompiledPaths>();
 
-    Matrix<LhsScalar> lhs_copy(lhs);
-    Transpose(&lhs_copy);
-    tr_mul_impl_run_fn(lhs_copy, rhs, spec, context, dst);
+  // Production code should probably never execute Path::kReference.
+  // Path::kReference implements a Mul, not a TrMul like the rest of Ruy, so if
+  // that's what we need to do, then get it out of the way before going down the
+  // TrMul path.
+  if (the_path == Path::kReference) {
+    constexpr bool ReferenceMulIsEnabled =
+        (CompiledPaths & Path::kReference) != Path::kNone;
+    CompileTimeEnabledReferenceMul<ReferenceMulIsEnabled>::Run(lhs, rhs, spec,
+                                                               dst);
+    return;
   }
-};
+
+  // As described in the comment at the top of this file, Ruy internally
+  // converts Mul into TrMul. We handle that here.
+  //
+  // This is Ruy's main code path.
+  constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference;
+  Matrix<LhsScalar> transposed_lhs(lhs);
+  Transpose(&transposed_lhs);
+  TrMulParams params;
+  CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, context, dst,
+                                        the_path, &params);
+  TrMul(&params, context);
+}
 
 }  // namespace ruy
 
diff --git a/tensorflow/lite/experimental/ruy/impl.h b/tensorflow/lite/experimental/ruy/impl.h
index da311ba..deadf52 100644
--- a/tensorflow/lite/experimental/ruy/impl.h
+++ b/tensorflow/lite/experimental/ruy/impl.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_IMPL_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_IMPL_H_
 
-#include <vector>
+#include <cstring>
 
 #include "profiling/instrumentation.h"
 #include "tensorflow/lite/experimental/ruy/allocator.h"
@@ -24,6 +24,7 @@
 #include "tensorflow/lite/experimental/ruy/common.h"
 #include "tensorflow/lite/experimental/ruy/context.h"
 #include "tensorflow/lite/experimental/ruy/kernel.h"
+#include "tensorflow/lite/experimental/ruy/matrix.h"
 #include "tensorflow/lite/experimental/ruy/opt_set.h"
 #include "tensorflow/lite/experimental/ruy/pack.h"
 #include "tensorflow/lite/experimental/ruy/thread_pool.h"
@@ -32,31 +33,49 @@
 
 namespace ruy {
 
-template <Path ThePath, typename LhsScalar, typename RhsScalar,
-          typename PackedLhsScalar, typename PackedRhsScalar,
-          typename DstScalar, typename Spec>
-struct TrMulTask final : Task {
-  using AccumScalar = typename Spec::AccumScalar;
-  TrMulTask(const Matrix<LhsScalar>& lhs_, const Matrix<RhsScalar>& rhs_,
-            Matrix<PackedLhsScalar>* packed_lhs_,
-            Matrix<PackedRhsScalar>* packed_rhs_, Matrix<DstScalar>* result_,
-            const BlockMap& block_map_,
+// Type-erased data needed for implementing TrMul.
+struct TrMulParams {
+  // Helper functions for invoking the function pointers.
+  void LhsRunPack(Tuning tuning, int start_c, int end_c) {
+    lhs_run_pack(tuning, lhs, &packed_lhs, start_c, end_c);
+  }
+  void RhsRunPack(Tuning tuning, int start_c, int end_c) {
+    rhs_run_pack(tuning, rhs, &packed_rhs, start_c, end_c);
+  }
+  void RunKernel(Tuning tuning, int start_r, int start_c, int end_r,
+                 int end_c) {
+    run_kernel(tuning, packed_lhs, packed_rhs, spec, start_r, start_c, end_r,
+               end_c, &dst);
+  }
 
+  // Function pointers to type-erased entry points for kernels and packers.
+  RunPackFn* lhs_run_pack = nullptr;
+  RunPackFn* rhs_run_pack = nullptr;
+  RunKernelFn* run_kernel = nullptr;
+
+  // Matrices and packed matrices.
+  DMatrix lhs;
+  DMatrix rhs;
+  DMatrix dst;
+  PMatrix packed_lhs;
+  PMatrix packed_rhs;
+
+  // Type-erased Spec.
+  void* spec = nullptr;
+};
+
+struct TrMulTask final : Task {
+  TrMulTask(TrMulParams* params_, const BlockMap& block_map_,
             std::atomic<std::uint32_t>* atomic_n_, std::uint32_t thread_id_,
             std::atomic<bool>* lhs_packed_, std::atomic<bool>* rhs_packed_,
-            const Spec& spec_, TuningResolver* tuning_resolver_,
-            Allocator* local_allocator_, Trace* trace_)
-      : lhs(lhs_),
-        rhs(rhs_),
-        packed_lhs(packed_lhs_),
-        packed_rhs(packed_rhs_),
-        result(result_),
+            TuningResolver* tuning_resolver_, Allocator* local_allocator_,
+            Trace* trace_)
+      : params(params_),
         block_map(block_map_),
         atomic_n(atomic_n_),
         thread_id(thread_id_),
         lhs_packed(lhs_packed_),
         rhs_packed(rhs_packed_),
-        spec(spec_),
         tuning_resolver(tuning_resolver_),
         local_allocator(local_allocator_),
         trace(trace_) {}
@@ -80,13 +99,7 @@
       memset(local_rhs_packed, 0, num_blocks_of_cols * sizeof(bool));
     }
 
-    using Kernel =
-        Kernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
-    using LhsKernelLayout = typename Kernel::RhsLayout;
-    using RhsKernelLayout = typename Kernel::RhsLayout;
-
     const Tuning tuning = tuning_resolver->Resolve();
-    Kernel kernel(tuning);
 
     TraceRecordThreadLoopStart(thread_id, trace);
 
@@ -103,6 +116,7 @@
     GetBlockMatrixCoords(block_map, block_r, block_c, &start_r, &start_c,
                          &end_r, &end_c);
     TraceRecordBlockCoordsComputed(n, trace);
+
     while (n < num_blocks) {
       // Get index of next block to handle
       next_n = atomic_n->fetch_add(1, std::memory_order_relaxed);
@@ -133,8 +147,7 @@
       // different contention with other processes.
       if (local_lhs_packed && !local_lhs_packed[block_r]) {
         if (!lhs_packed[block_r].load(std::memory_order_acquire)) {
-          Pack<ThePath, LhsKernelLayout>(tuning, lhs, packed_lhs, start_r,
-                                         end_r);
+          params->LhsRunPack(tuning, start_r, end_r);
           TraceRecordBlockPackedLhs(n, trace);
           local_lhs_packed[block_r] = true;
           lhs_packed[block_r].store(true, std::memory_order_release);
@@ -143,16 +156,14 @@
       // Maybe pack the current RHS block. Same comments as above for LHS.
       if (local_rhs_packed && !local_rhs_packed[block_c]) {
         if (!rhs_packed[block_c].load(std::memory_order_acquire)) {
-          Pack<ThePath, RhsKernelLayout>(tuning, rhs, packed_rhs, start_c,
-                                         end_c);
+          params->RhsRunPack(tuning, start_c, end_c);
           TraceRecordBlockPackedRhs(n, trace);
           local_rhs_packed[block_c] = true;
           rhs_packed[block_c].store(true, std::memory_order_release);
         }
       }
       // Actually do matrix multiplication work
-      RunKernel(kernel, *packed_lhs, *packed_rhs, spec, start_r, start_c, end_r,
-                end_c, result);
+      params->RunKernel(tuning, start_r, start_c, end_r, end_c);
       TraceRecordBlockFinished(n, trace);
       n = next_n;
       block_r = next_block_r;
@@ -169,54 +180,20 @@
   }
 
  private:
-  const Matrix<LhsScalar>& lhs;
-  const Matrix<RhsScalar>& rhs;
-  Matrix<PackedLhsScalar>* packed_lhs;
-  Matrix<PackedRhsScalar>* packed_rhs;
-
-  Matrix<DstScalar>* result;
+  TrMulParams* params;
   const BlockMap& block_map;
   std::atomic<std::uint32_t>* atomic_n;
   std::uint32_t thread_id;
   std::atomic<bool>* lhs_packed;
   std::atomic<bool>* rhs_packed;
-  const Spec& spec;
   TuningResolver* tuning_resolver;
   Allocator* local_allocator;
   Trace* trace;
 };
 
-template <typename FixedKernelLayout, typename Scalar, typename PackedScalar>
-void CreatePackedMatrix(Tuning tuning, const Matrix<Scalar>& src,
-                        Allocator* allocator,
-                        Matrix<PackedScalar>* packed) {
-  packed->zero_point = src.zero_point - SymmetricZeroPoint<Scalar>() +
-                       SymmetricZeroPoint<PackedScalar>();
-  packed->layout = src.layout;
-  packed->layout.order = Order::kColMajor;
-  packed->layout.rows = round_up_pot(src.layout.rows, FixedKernelLayout::kRows);
-  packed->layout.cols = round_up_pot(src.layout.cols, FixedKernelLayout::kCols);
-  packed->layout.kernel.order = FixedKernelLayout::kOrder;
-  packed->layout.kernel.rows = FixedKernelLayout::kRows;
-  packed->layout.kernel.cols = FixedKernelLayout::kCols;
-  int innersize = (packed->layout.order == Order::kColMajor)
-                      ? packed->layout.rows
-                      : packed->layout.cols;
-  int outersize = (packed->layout.order == Order::kColMajor)
-                      ? packed->layout.cols
-                      : packed->layout.rows;
-  if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) {
-    if (tuning == Tuning::kInOrder) {
-      packed->layout.stride =
-          (innersize * sizeof(Scalar)) % 1024 ? innersize : innersize + 64;
-    } else {
-      packed->layout.stride =
-          (innersize * sizeof(Scalar)) % 4096 ? innersize : innersize + 64;
-    }
-  } else {
-    packed->layout.stride = innersize;
-  }
-  allocator->Allocate(outersize * packed->layout.stride, &packed->data);
+inline void AllocatePMatrix(Allocator* allocator, PMatrix* packed) {
+  packed->data = allocator->AllocateBytes(DataSize(*packed));
+  packed->sums = allocator->AllocateBytes(SumsSize(*packed));
 }
 
 inline int GetThreadCount(Context* context, int rows, int cols, int depth) {
@@ -227,12 +204,8 @@
   return clamp(guess, 1, context->max_num_threads);
 }
 
-template <typename Spec>
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
-  if (Spec::kLoopStructure != LoopStructure::kAuto) {
-    return Spec::kLoopStructure;
-  }
+inline LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
+                                      int depth) {
   if (thread_count == 1 &&
       (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
     return LoopStructure::kSimple;
@@ -248,182 +221,102 @@
   return tuning_resolver->Resolve();
 }
 
-// General TrMulImpl definition.  See the reference-code implementation given
-// in the partial specialization below for ThePath==kReference.
-template <Path ThePath, typename LhsScalar, typename RhsScalar,
-          typename DstScalar, typename Spec>
-struct TrMulImpl {
-  using AccumScalar = typename Spec::AccumScalar;
-  static void Run(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
-                  const Spec& spec, Context* context, Matrix<DstScalar>* dst) {
-    // Fall back, if needed, to Path::kStandardCpp.
-    if (ThePath != Path::kStandardCpp) {
-      if (!IsLinear(lhs.layout) || !IsLinear(rhs.layout) ||
-          !IsLinear(dst->layout) || lhs.layout.order != Order::kColMajor ||
-          rhs.layout.order != Order::kColMajor ||
-          dst->layout.order != Order::kColMajor) {
-        TrMulImpl<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar,
-                  Spec>::Run(lhs, rhs, spec, context, dst);
-        return;
-      }
-    }
+inline void TrMul(TrMulParams* params, Context* context) {
+  gemmlowp::ScopedProfilingLabel label("TrMul");
 
-    gemmlowp::ScopedProfilingLabel label("TrMulImpl");
-    using PackedLhsScalar = PackedType<ThePath, LhsScalar>;
-    using PackedRhsScalar = PackedType<ThePath, RhsScalar>;
-    using Kernel =
-        Kernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
-    using LhsKernelLayout = typename Kernel::LhsLayout;
-    using RhsKernelLayout = typename Kernel::RhsLayout;
+  PMatrix& packed_lhs = params->packed_lhs;
+  PMatrix& packed_rhs = params->packed_rhs;
+  DMatrix& lhs = params->lhs;
+  DMatrix& rhs = params->rhs;
 
-    const int rows = lhs.layout.cols;
-    const int cols = rhs.layout.cols;
-    const int depth = lhs.layout.rows;
-    const int rows_rounded_up = round_up_pot(rows, LhsKernelLayout::kCols);
-    const int cols_rounded_up = round_up_pot(cols, RhsKernelLayout::kCols);
+  const int rows = lhs.layout.cols;
+  const int cols = rhs.layout.cols;
+  const int depth = lhs.layout.rows;
+  const int rows_rounded_up = packed_lhs.layout.cols;
+  const int cols_rounded_up = packed_rhs.layout.cols;
 
-    int thread_count = GetThreadCount(context, rows, cols, depth);
-    const auto loop_structure =
-        GetLoopStructure<Spec>(thread_count, rows, cols, depth);
-    const Tuning tuning = GetTuning(context);
-    Allocator* allocator = context->GetMainAllocator();
+  int thread_count = GetThreadCount(context, rows, cols, depth);
+  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const Tuning tuning = GetTuning(context);
+  Allocator* allocator = context->GetMainAllocator();
+  AllocatePMatrix(allocator, &packed_lhs);
+  AllocatePMatrix(allocator, &packed_rhs);
 
-    // The packed matrices.
-    Matrix<PackedLhsScalar> packed_lhs;
-    Matrix<PackedRhsScalar> packed_rhs;
-    using LhsSumsType = typename Matrix<PackedLhsScalar>::SumsType;
-    using RhsSumsType = typename Matrix<PackedRhsScalar>::SumsType;
-    const bool lhs_use_packing_sums =
-        Pack<PackedRhsScalar>(rhs.zero_point) != 0;
-    const bool rhs_use_packing_sums =
-        Pack<PackedLhsScalar>(lhs.zero_point) != 0;
+  if (loop_structure == LoopStructure::kSimple) {
+    gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop");
 
-    // Allocate the packed matrices.
-    CreatePackedMatrix<LhsKernelLayout>(tuning, lhs, allocator, &packed_lhs);
-    CreatePackedMatrix<RhsKernelLayout>(tuning, rhs, allocator, &packed_rhs);
-    if (lhs_use_packing_sums) {
-      allocator->Allocate(rows_rounded_up, &packed_lhs.sums);
-    }
-    if (rhs_use_packing_sums) {
-      allocator->Allocate(cols_rounded_up, &packed_rhs.sums);
-    }
-
-    if (loop_structure == LoopStructure::kSimple) {
-      gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop");
-
-      Pack<ThePath, LhsKernelLayout>(tuning, lhs, &packed_lhs, 0,
-                                     rows_rounded_up);
-      Pack<ThePath, RhsKernelLayout>(tuning, rhs, &packed_rhs, 0,
-                                     cols_rounded_up);
-
-      Kernel kernel(tuning);
-      RunKernel(kernel, packed_lhs, packed_rhs, spec, 0, 0, rows_rounded_up,
-                cols_rounded_up, dst);
-
-      allocator->FreeAll();
-      return;
-    }
-
-    gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case");
-
-    auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols);
-    TraceRecordStart(trace);
-
-    // Initialize block map.
-    BlockMap block_map;
-    MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
-                 LhsKernelLayout::kCols, RhsKernelLayout::kCols,
-                 sizeof(LhsScalar), sizeof(RhsScalar), &block_map);
-    std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
-    std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
-    std::uint32_t num_blocks = NumBlocks(block_map);
-    RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols);
-
-    // Initialize per-thread state.
-    thread_count = clamp(thread_count, 1, num_blocks);
-    context->EnsureNPerThreadStates(thread_count);
-    for (auto& per_thread_state : context->per_thread_states) {
-      per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning);
-    }
-
-    // Allocate memory.
-    std::atomic<bool>* lhs_packed;
-    allocator->Allocate(num_blocks_of_rows, &lhs_packed);
-    std::atomic<bool>* rhs_packed;
-    allocator->Allocate(num_blocks_of_cols, &rhs_packed);
-    std::atomic<std::uint32_t>* atomic_n;
-    allocator->Allocate(1, &atomic_n);
-    using TaskType = TrMulTask<ThePath, LhsScalar, RhsScalar, PackedLhsScalar,
-                               PackedRhsScalar, DstScalar, Spec>;
-    TaskType* tasks;
-    allocator->Allocate(thread_count, &tasks);
-    Task** tasks_ptrs;
-    allocator->Allocate(thread_count, &tasks_ptrs);
-
-    // Initialize allocated data.
-    for (int i = 0; i < num_blocks_of_rows; i++) {
-      lhs_packed[i].store(false, std::memory_order_release);
-    }
-    for (int i = 0; i < num_blocks_of_cols; i++) {
-      rhs_packed[i].store(false, std::memory_order_release);
-    }
-    atomic_n->store(thread_count);
-
-    for (int i = 0; i < thread_count; i++) {
-      tasks_ptrs[i] = static_cast<Task*>(tasks + i);
-      new (tasks_ptrs[i])
-          TaskType(lhs, rhs, &packed_lhs, &packed_rhs, dst, block_map, atomic_n,
-                   i, lhs_packed, rhs_packed, spec,
-                   &context->per_thread_states[i]->tuning_resolver,
-                   &context->per_thread_states[i]->allocator, trace);
-    }
-
-    // Do the computation.
-    TraceRecordExecute(trace);
-    TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace);
-
-    context->workers_pool.Execute(thread_count, tasks_ptrs);
-
-    // Finish up.
-    for (int i = 0; i < thread_count; i++) {
-      tasks[i].~TaskType();
-    }
-
-    TraceRecordEnd(trace);
+    params->LhsRunPack(tuning, 0, rows_rounded_up);
+    params->RhsRunPack(tuning, 0, cols_rounded_up);
+    params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up);
 
     allocator->FreeAll();
+    return;
   }
-};
 
-// Reference code for TrMul, doing a transpose-multiply: compute
-//   Destination = Transpose(LHS) * RHS
-template <typename LhsScalar, typename RhsScalar, typename DstScalar,
-          typename Spec>
-struct TrMulImpl<Path::kReference, LhsScalar, RhsScalar, DstScalar, Spec> {
-  static void Run(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
-                  const Spec& spec, Context*, Matrix<DstScalar>* dst) {
-    gemmlowp::ScopedProfilingLabel label("TrMulImpl Reference");
-    for (int i = 0; i < lhs.layout.cols; i++) {
-      for (int j = 0; j < rhs.layout.cols; j++) {
-        using AccumScalar = typename Spec::AccumScalar;
-        AccumScalar accum = 0;
-        for (int k = 0; k < lhs.layout.rows; k++) {
-          AccumScalar lhs_val = Element(lhs, k, i);
-          AccumScalar rhs_val = Element(rhs, k, j);
-          accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point);
-        }
-        if (spec.bias) {
-          accum += spec.bias[i];
-        }
-        ApplyMultiplier(spec, i, &accum);
-        accum += dst->zero_point;
-        accum = std::min<AccumScalar>(accum, spec.clamp_max);
-        accum = std::max<AccumScalar>(accum, spec.clamp_min);
-        *ElementPtr(dst, i, j) = static_cast<DstScalar>(accum);
-      }
-    }
+  gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case");
+
+  auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols);
+  TraceRecordStart(trace);
+
+  // Initialize block map.
+  BlockMap block_map;
+  MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
+               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
+               packed_lhs.data_type.size, packed_rhs.data_type.size,
+               &block_map);
+  std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
+  std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
+  std::uint32_t num_blocks = NumBlocks(block_map);
+  RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols);
+
+  // Initialize per-thread state.
+  thread_count = clamp(thread_count, 1, num_blocks);
+  context->EnsureNPerThreadStates(thread_count);
+  for (auto& per_thread_state : context->per_thread_states) {
+    per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning);
   }
-};
+
+  // Allocate memory.
+  std::atomic<bool>* lhs_packed;
+  allocator->Allocate(num_blocks_of_rows, &lhs_packed);
+  std::atomic<bool>* rhs_packed;
+  allocator->Allocate(num_blocks_of_cols, &rhs_packed);
+  std::atomic<std::uint32_t>* atomic_n;
+  allocator->Allocate(1, &atomic_n);
+  TrMulTask* tasks;
+  allocator->Allocate(thread_count, &tasks);
+
+  // Initialize allocated data.
+  for (int i = 0; i < num_blocks_of_rows; i++) {
+    lhs_packed[i].store(false, std::memory_order_release);
+  }
+  for (int i = 0; i < num_blocks_of_cols; i++) {
+    rhs_packed[i].store(false, std::memory_order_release);
+  }
+  atomic_n->store(thread_count);
+
+  for (int i = 0; i < thread_count; i++) {
+    new (tasks + i)
+        TrMulTask(params, block_map, atomic_n, i, lhs_packed, rhs_packed,
+                  &context->per_thread_states[i]->tuning_resolver,
+                  &context->per_thread_states[i]->allocator, trace);
+  }
+
+  // Do the computation.
+  TraceRecordExecute(trace);
+  TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace);
+
+  context->workers_pool.Execute(thread_count, tasks);
+
+  // Finish up.
+  for (int i = 0; i < thread_count; i++) {
+    tasks[i].~TrMulTask();
+  }
+
+  TraceRecordEnd(trace);
+
+  allocator->FreeAll();
+}
 
 }  // namespace ruy
 
diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h
new file mode 100644
index 0000000..9a7d6ee
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/internal_matrix.h
@@ -0,0 +1,382 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Internal types and helpers for matrices.
+//
+// Ruy has a couple slightly different notions of matrices, besides the
+// Matrix<T> class that we expose to the user-facing API.
+//
+// TODO(silvasean): Put parts of this architecture description somewhere more
+// prominent.
+//
+// The 4 different matrix types are:
+// - Matrix<T>: This is a user-facing type on Ruy's external API boundary.
+// - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic".
+// - PMatrix: This represents a packed matrix, which requires tracking kernel
+// layout and row/column sums for quantization. It is type-erased.
+// - PackedMatrix<T>: This is a statically typed variant of PMatrix for
+// convenience inside typed routines.
+//
+// Note that Matrix<T> is *not* implemented in terms of the internal types. It
+// is an independent, simple, and user-facing type.
+//
+// The use of type-erasure might seem surprising for a library like Ruy with a
+// heavily-templated entry point, but it is motivated by the desire for most of
+// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
+// main parts:
+// - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry
+// point, along with routines that select RunKernel and RunPack implementations
+// statically based on those template parameters.
+// - "back-end" (kernel.h, pack.h) - this consists of the implementations of
+// RunKernel and RunPack, often in assembly code, which are the building blocks
+// that Ruy calls to perform matrix multiplication.  These are templated so that
+// only the requested types/Path's are actually emitted by the compiler.
+// - "middle-end" (impl.h) - this is the part of Ruy that orchestrates the
+// calls to the "back-end" optimized building blocks. This layer has to deal
+// with issues like cache locality and low-overhead multi-threading.
+//
+// There is a desire for the "middle-end" to be non-templated in order to
+// simplify the implementation and reduce code-size. We type-erase when going
+// from the "front-end" to the "middle-end", and un-type-erase going from the
+// "middle-end" to the "back-end". The un-type-erasure is possible because the
+// "front-end" is responsible for instantiating the needed "back-end" templates,
+// and thus the static type information is still present.
+//
+// Each layer of Ruy uses matrix types:
+// - "front-end": Matrix<T>
+// - "middle-end": DMatrix, PMatrix
+// - "back-end": Matrix<T>, PackedMatrix<T>
+//
+// The use of separate types for packed matrices is not essential, but makes it
+// obvious at a glance whether a matrix is a packed matrix or not. We would
+// reconsider this decision if there was significant duplication between packed
+// and unpacked matrices, but that doesn't seem to be the case at the moment.
+//
+// Another goal is to keep the user-facing Matrix<T> as simple and
+// understandable as possible. Ideally, a user should be able to read the struct
+// definition for Matrix<T> and see a very simple definition with no internal
+// details like sums and kernel block layout.
+//
+// To present another structured view of our various matrix types, here's a
+// table:
+//                User matrices    Packed matrices
+//             +----------------------------------
+// Templated   |  Matrix<T>        PackedMatrix<T>
+// Type-erased |  DMatrix          PMatrix
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
+
+#include <type_traits>
+#include <utility>
+
+#include "tensorflow/lite/experimental/ruy/common.h"
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+#include "tensorflow/lite/experimental/ruy/size_util.h"
+
+namespace ruy {
+
+// KernelLayout describes small-scale block structure in a packed matrix layout.
+//
+// This is sometimes known as "tiling" in other contexts.
+//
+// For example, consider a packed matrix in column-major format with a
+// column-major KernelLayout. The matrix logically has a shape of
+// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
+// of shape `[cols / kcols, rows / krows, kcols, krows]`.
+//
+// Note that in the case of kcols=1, krows=1, this degenerates to
+// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
+// structure.
+struct KernelLayout {
+  Order order{Order::kColMajor};  // Element order inside one kernel block.
+  std::uint8_t rows{1};           // Block height; 1 means no row blocking.
+  std::uint8_t cols{1};           // Block width; 1 means no column blocking.
+};
+
+// Compile time version of KernelLayout, suitable for template metaprogramming.
+// In particular, partial template specializations of Kernel use this type to
+// statically declare their kernel format.
+template <Order OrderValue, int RowCount, int ColCount>
+struct FixedKernelLayout {
+  static constexpr Order kOrder{OrderValue};  // Compile-time kernel order.
+  static constexpr int kRows{RowCount};       // Compile-time kernel rows.
+  static constexpr int kCols{ColCount};       // Compile-time kernel cols.
+};
+
+// A packed matrix has a small-scale block structure that is not present in
+// the input matrices. This block structure is necessary for the kernels to
+// process data efficiently.
+//
+// This struct is very similar to Layout, but has the extra KernelLayout field.
+struct PackedLayout {
+  std::int32_t rows{0};
+  std::int32_t cols{0};
+  // Offset between two adjacent matrix elements along the non-contiguous
+  // direction.
+  std::int32_t stride{0};
+  Order order{Order::kColMajor};
+  // Small-scale block ("kernel") layout, which may depart from plain linear
+  // row-major or column-major storage. See KernelLayout.
+  KernelLayout kernel;
+};
+
+// Dynamic representation for a type.
+//
+// The most important field in this struct is the size, which Ruy uses to know
+// how much memory to allocate without having to be templated on a type.
+// Signed-ness and floating-point-ness are mainly present as debugging checks.
+//
+// Note: Ruy does not use this struct to dynamically dispatch between
+// different typed implementations. As described in the comment at the top of
+// this file, Ruy's "front-end", which is templated, instantiates all the
+// necessary "back-end" routines with complete static knowledge of all the
+// types.
+struct Type {
+  template <typename T>
+  static Type Create() {  // Builds the descriptor for T.
+    Type described;
+    described.size = sizeof(T);
+    described.is_floating_point = std::is_floating_point<T>::value;
+    described.is_signed = std::is_signed<T>::value;
+    return described;
+  }
+
+  template <typename T>
+  void AssertIs() const {  // RUY_DCHECKs each field against Create<T>().
+    RUY_DCHECK(Create<T>().is_signed == is_signed);
+    RUY_DCHECK(Create<T>().is_floating_point == is_floating_point);
+    RUY_DCHECK(Create<T>().size == size);
+  }
+
+  bool is_signed{false};
+  bool is_floating_point{false};
+  std::uint8_t size{0};  // sizeof(T) of the described type.
+};
+
+// Type-erased matrix.
+struct DMatrix {
+  Type data_type;       // Describes the element type behind `data`.
+  void* data{nullptr};  // Type-erased element pointer.
+  Layout layout;
+  std::int32_t zero_point{0};
+};
+
+// Type-erased packed matrix.
+struct PMatrix {
+  Type data_type;       // Describes the element type behind `data`.
+  void* data{nullptr};  // Type-erased packed element pointer.
+  Type sums_type;       // Describes the element type behind `sums`.
+  void* sums{nullptr};  // Type-erased pointer to the sums.
+  PackedLayout layout;
+  std::int32_t zero_point{0};
+};
+
+// Convenient typed helper for packed matrices.
+template <typename Scalar>
+struct PackedMatrix {
+  // Element type of the row/column sums.
+  // The sums are needed for quantized matrix multiplication when the
+  // opposite operand of the multiplication uses a non-symmetric zero point,
+  // and they are only relevant for packed matrices.
+  // Ruy always uses 32-bit signed accumulators for quantized matrix
+  // multiplication, hence std::int32_t for non-floating-point Scalar.
+  // For floating-point Scalar there is no quantization, so `sums` is always
+  // null. Code referencing it must still compile, even though it is always
+  // branched around, so in that case SumsType is simply Scalar itself (and
+  // `sums` is a Scalar*).
+  using SumsType =
+      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
+                                std::int32_t>::type;
+
+  Scalar* data{nullptr};
+  SumsType* sums{nullptr};
+  PackedLayout layout;
+  std::int32_t zero_point{0};
+};
+
+template <typename T>
+DMatrix ToDMatrix(const Matrix<T>& matrix) {
+  DMatrix erased;  // Type-erased counterpart of `matrix`.
+  erased.layout = matrix.layout;
+  erased.zero_point = matrix.zero_point;
+  erased.data = ToVoidPtr(matrix.data.get());
+  erased.data_type = Type::Create<T>();
+  return erased;
+}
+
+template <typename T>
+Matrix<T> ToMatrix(const DMatrix& dmatrix) {
+  dmatrix.data_type.AssertIs<T>();  // T must match the recorded type.
+  Matrix<T> typed;
+  typed.zero_point = dmatrix.zero_point;
+  typed.layout = dmatrix.layout;
+  typed.data = static_cast<T*>(dmatrix.data);
+  return typed;
+}
+
+template <typename T>
+PackedMatrix<T> ToPackedMatrix(const PMatrix& pmatrix) {
+  using Sums = typename PackedMatrix<T>::SumsType;
+  pmatrix.data_type.AssertIs<T>();
+  pmatrix.sums_type.AssertIs<Sums>();
+  PackedMatrix<T> typed;  // Typed view over the same data/sums pointers.
+  typed.zero_point = pmatrix.zero_point;
+  typed.layout = pmatrix.layout;
+  typed.sums = static_cast<Sums*>(pmatrix.sums);
+  typed.data = static_cast<T*>(pmatrix.data);
+  return typed;
+}
+
+// Helpers for Layout / PackedLayout.
+
+inline bool IsPacked(const Layout& layout) {
+  // Packed means the stride equals the row count for column-major order, or
+  // the column count otherwise.
+  const bool col_major = layout.order == Order::kColMajor;
+  const auto packed_stride = col_major ? layout.rows : layout.cols;
+  return layout.stride == packed_stride;
+}
+
+inline bool IsRowMajor(const Layout& layout) {  // True iff order is kRowMajor.
+  return layout.order == Order::kRowMajor;
+}
+
+template <typename LayoutOrPackedLayout>  // Any type with an `order` member.
+inline bool IsColMajor(const LayoutOrPackedLayout& layout) {
+  return layout.order == Order::kColMajor;  // True iff column-major.
+}
+
+template <typename LayoutOrPackedLayout>
+inline int FlatSize(const LayoutOrPackedLayout& layout) {
+  // Flat size = stride * (column count if column-major, else row count).
+  const bool col_major = layout.order == Order::kColMajor;
+  return layout.stride * (col_major ? layout.cols : layout.rows);
+}
+
+// TODO(b/130417400) add a unit test
+inline int Offset(const Layout& layout, int row, int col) {
+  // TODO(benoitjacob): bounds-check row/col here. The checks below are
+  // disabled because they make the _slow tests take 5x longer; find a
+  // mitigation like Eigen's 'internal' variant that bypasses the check.
+  // RUY_DCHECK_GE(row, 0);
+  // RUY_DCHECK_GE(col, 0);
+  // RUY_DCHECK_LT(row, layout.rows);
+  // RUY_DCHECK_LT(col, layout.cols);
+  const int row_step = layout.order == Order::kColMajor ? 1 : layout.stride;
+  const int col_step = layout.order == Order::kRowMajor ? 1 : layout.stride;
+  return col * col_step + row * row_step;
+}
+
+// TODO(b/130417400) add a unit test
+inline int Offset(const PackedLayout& layout, int row, int col) {
+  RUY_DCHECK(is_pot(layout.kernel.rows));
+  RUY_DCHECK(is_pot(layout.kernel.cols));
+  // Position of (row, col) inside its kernel block, extracted by masking
+  // with (kernel dim - 1); the remainder locates the block itself.
+  const int row_in = row & (layout.kernel.rows - 1);
+  const int col_in = col & (layout.kernel.cols - 1);
+  // Offset of the kernel block that contains the element.
+  const int block_row_step =
+      layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
+  const int block_col_step =
+      layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
+  const int block_offset =
+      (row - row_in) * block_row_step + (col - col_in) * block_col_step;
+  // Offset of the element within that block.
+  const int in_row_step =
+      layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
+  const int in_col_step =
+      layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
+  return block_offset + row_in * in_row_step + col_in * in_col_step;
+}
+
+// Helpers for Matrix<T>.
+
+template <typename Scalar>  // Read-only pointer to element (row, col).
+const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) {
+  return mat.data.get() + Offset(mat.layout, row, col);  // No bounds check.
+}
+
+template <typename Scalar>  // Mutable pointer to element (row, col).
+Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) {
+  return mat->data.get() + Offset(mat->layout, row, col);  // No bounds check.
+}
+
+template <typename Scalar>  // Returns element (row, col) by value.
+Scalar Element(const Matrix<Scalar>& mat, int row, int col) {
+  return *ElementPtr(mat, row, col);
+}
+
+// Helpers for PackedMatrix<T>.
+// Duplicated from Matrix<T>, but the duplication seems acceptable.
+
+template <typename Scalar>  // Read-only pointer to packed element (row, col).
+const Scalar* ElementPtr(const PackedMatrix<Scalar>& mat, int row, int col) {
+  return mat.data + Offset(mat.layout, row, col);  // Uses the packed Offset.
+}
+
+template <typename Scalar>  // Mutable pointer to packed element (row, col).
+Scalar* ElementPtr(PackedMatrix<Scalar>* mat, int row, int col) {
+  return mat->data + Offset(mat->layout, row, col);  // Uses the packed Offset.
+}
+
+template <typename Scalar>  // Returns packed element (row, col) by value.
+Scalar Element(const PackedMatrix<Scalar>& mat, int row, int col) {
+  return *ElementPtr(mat, row, col);
+}
+
+// Helpers for PMatrix.
+
+inline std::size_t DataSize(const PMatrix& packed) {  // Size in bytes.
+  return FlatSize(packed.layout) * packed.data_type.size;  // Multiplied as int.
+}
+
+inline std::size_t SumsSize(const PMatrix& packed) {
+  // Byte size of the sums buffer. Packed matrices only matter for Ruy's TrMul
+  // implementations, where the number of sums always equals the column count.
+  return packed.layout.cols * packed.sums_type.size;
+}
+
+// Transpose helpers.
+
+inline void Transpose(Order* order) {  // Flips the order in place.
+  *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
+}
+
+inline void Transpose(Layout* layout) {
+  std::swap(layout->rows, layout->cols);  // Exchange the two dimensions...
+  Transpose(&layout->order);              // ...and flip the storage order.
+}
+
+template <typename Scalar>  // Transposes the layout only; `data` is untouched.
+inline void Transpose(Matrix<Scalar>* matrix) {
+  Transpose(&matrix->layout);
+}
+
+// Helpers for KernelLayout.
+
+template <typename FixedKernelLayout>
+KernelLayout ToKernelLayout() {
+  KernelLayout dynamic;  // Runtime copy of the compile-time constants.
+  dynamic.cols = FixedKernelLayout::kCols;
+  dynamic.rows = FixedKernelLayout::kRows;
+  dynamic.order = FixedKernelLayout::kOrder;
+  return dynamic;
+}
+
+}  // namespace ruy
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
diff --git a/tensorflow/lite/experimental/ruy/kernel.cc b/tensorflow/lite/experimental/ruy/kernel.cc
index 98fb737..5287332 100644
--- a/tensorflow/lite/experimental/ruy/kernel.cc
+++ b/tensorflow/lite/experimental/ruy/kernel.cc
@@ -24,6 +24,7 @@
 #define RUY_ASM_LABEL_STORE_UINT8 91
 #define RUY_ASM_LABEL_STORE_INT8 92
 #define RUY_ASM_LABEL_STORE_INT16 93
+#define RUY_ASM_LABEL_STORE_INT32 94
 #define RUY_ASM_LABEL_AFTER_STORE 99
 
 #define RUY_OFFSET_BIAS 0
@@ -49,8 +50,8 @@
 #define RUY_OFFSET_DST_STRIDE 112
 #define RUY_OFFSET_DEPTH 116
 #define RUY_OFFSET_CLAMP_MIN 120
-#define RUY_OFFSET_CLAMP_MAX 122
-#define RUY_OFFSET_FLAGS 124
+#define RUY_OFFSET_CLAMP_MAX 124
+#define RUY_OFFSET_FLAGS 128
 
 template <typename Params>
 void CheckOffsetsInKernelParams8bit(const Params&) {
@@ -476,6 +477,12 @@
         "sub v17.4s, v17.4s, v11.4s\n"
         "sub v18.4s, v18.4s, v11.4s\n"
         "sub v19.4s, v19.4s, v11.4s\n"
+
+        // If the destination is int32, it means the user asks for the raw
+        // accumulators, no need for us to downquantize the value.
+        "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
+        "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
+
         "402:\n"
 
         // At this point we have computed the final int32 values. Now we
@@ -508,18 +515,26 @@
         "sqrdmulh v18.4s, v18.4s, v15.4s\n"
         "sqrdmulh v19.4s, v19.4s, v15.4s\n"
 
-        // We have some rounding
-        // division-by-power-of-two to do. Normally, this should be just
-        // a rounding-right-shift, srshl. However, that does not quite
-        // implement the round-to-nearest semantics that we need. See
-        // Appendix B of https://arxiv.org/pdf/1712.05877.pdf
-
-        // Because we are going to get benchmarked against less-careful
-        // competition, let's give people the ability to get faster, less
-        // careful arithmetic if they want --- define RUY_SLOPPY. We don't
-        // recommend using that in production, we have observed measurable
-        // loss of accuracy from this on MobileNets (which is how we noticed
-        // this whole issue in the first place).
+        // We have some rounding division-by-power-of-two to do. This should
+        // always use "round to nearest". We allow for some
+        // freedom in how ties are broken, to strike a good compromise of
+        // performance on given hardware vs. perfect agreement of results
+        // across hardware.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation
+        // defined tie-breaks to help performance. On NEON, this means that we
+        // can just use the NEON rounding instructions, such as srshl. They
+        // happen to be breaking ties upward.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+        // break-ties-away-from zero, as described in Appendix B of
+        // https://arxiv.org/pdf/1712.05877.pdf
+        // When we wrote that, we thought that that would be better unbiased
+        // than the NEON upwards tie-breaks, and we had observed some
+        // improvement on some model. However, that is only more unbiased for
+        // data centered at zero, which was likely the case in that model,
+        // but is not always the case. If we wanted something more consistently
+        // unbiased then we should try breaking ties toward-nearest-even.
 #if !(RUY_OPT_SET & RUY_OPT_NATIVE_ROUNDING)
         // Fix up values to be right-shifted, so that the (round to nearest,
         // break ties upward) behavior of srshl applied to these fixed-up
@@ -916,6 +931,108 @@
         RUY_MAKE_ZERO(v16)
         RUY_MAKE_ZERO(v17)
 
+        "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+        RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
+
+        // Since the store type is the same as the accum type, no need for
+        // downcast. There's also no need for clamp by min/max.
+
+        // At this point, v20 -- v31 aren't used anymore for the current block,
+        // so we can start clearing these accumulators for the next block
+        // (next iteration of the main loop).
+        RUY_MAKE_ZERO(v20)
+        RUY_MAKE_ZERO(v21)
+        RUY_MAKE_ZERO(v22)
+        RUY_MAKE_ZERO(v23)
+        RUY_MAKE_ZERO(v24)
+        RUY_MAKE_ZERO(v25)
+        RUY_MAKE_ZERO(v26)
+        RUY_MAKE_ZERO(v27)
+        RUY_MAKE_ZERO(v28)
+        RUY_MAKE_ZERO(v29)
+        RUY_MAKE_ZERO(v30)
+        RUY_MAKE_ZERO(v31)
+
+        // Compute how much of the 4x4 block of destination int32 values that
+        // we have computed fits in the destination matrix. Typically, all of
+        // it fits, but when the destination matrix shape is not a multiple
+        // of 4x4, there are some 4x4 blocks along the boundaries that do
+        // not fit entirely.
+        "sub w1, %w[dst_rows], %w[row]\n"
+        "sub w2, %w[dst_cols], %w[col]\n"
+        "mov w3, #4\n"
+        "cmp w1, #4\n"
+        // Compute w1 = how many rows of the 4x4 block fit
+        "csel w1, w1, w3, le\n"
+        "cmp w2, #4\n"
+        // Compute w2 = how many cols of the 4x4 block fit
+        "csel w2, w2, w3, le\n"
+
+        // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
+        "cmp w1, w3\n"
+        "ccmp w2, w3, 0, eq\n"
+        "mov x4, %[dst_ptr]\n"
+        // Yes, all of the 4x4 block fits, go to fast path.
+        "beq 30f\n"
+        // Not all of the 4x4 block fits.
+        // Store to dst_tmp_buf
+        "str q16, [%[dst_tmp_buf], #0]\n"
+        "str q17, [%[dst_tmp_buf], #16]\n"
+        "str q18, [%[dst_tmp_buf], #32]\n"
+        "str q19, [%[dst_tmp_buf], #48]\n"
+        // Slow loop copying from dst_tmp_buf to dst.
+        "mov x3, %[dst_tmp_buf]\n"
+        "mov w6, #0\n"
+        "50:\n"
+        "mov w5, #0\n"
+        "51:\n"
+        "ldr w7, [x3, x5, lsl #2]\n"
+        "str w7, [x4, x5, lsl #2]\n"
+        "add w5, w5, #1\n"
+        "cmp w5, w1\n"
+        "blt 51b\n"
+        "add w6, w6, #1\n"
+        "add x3, x3, #16\n"
+        "add x4, x4, x11\n"
+        "cmp w6, w2\n"
+        "blt 50b\n"
+        "b 31f\n"
+        "30:\n"
+        // Yes, all of the 4x4 block fits.
+        "mov x3, x4\n"
+        "st1 {v16.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v16.s}[1], [x3], #4\n"
+        "st1 {v16.s}[2], [x3], #4\n"
+        "st1 {v16.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v17.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v17.s}[1], [x3], #4\n"
+        "st1 {v17.s}[2], [x3], #4\n"
+        "st1 {v17.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v18.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v18.s}[1], [x3], #4\n"
+        "st1 {v18.s}[2], [x3], #4\n"
+        "st1 {v18.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v19.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v19.s}[1], [x3], #4\n"
+        "st1 {v19.s}[2], [x3], #4\n"
+        "st1 {v19.s}[3], [x3], #4\n"
+        "31:\n"
+
+        "add %[dst_ptr], %[dst_ptr], #16\n"
+
+        RUY_MAKE_ZERO(v16)
+        RUY_MAKE_ZERO(v17)
+        RUY_MAKE_ZERO(v18)
+        RUY_MAKE_ZERO(v19)
+
         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
 
         // For the next block: perform the first few multiply-adds on the data
@@ -1390,6 +1507,12 @@
         "sub v17.4s, v17.4s, v11.4s\n"
         "sub v18.4s, v18.4s, v11.4s\n"
         "sub v19.4s, v19.4s, v11.4s\n"
+
+        // If the destination is int32, it means the user asks for the raw
+        // accumulators, no need for us to downquantize the value.
+        "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
+        "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
+
         "402:\n"
 
         // At this point we have computed the final int32 values. Now we
@@ -1434,17 +1557,26 @@
         "ldr x4, [%[rhs_ptr], #56]\n"
         "sqrdmulh v19.4s, v19.4s, v15.4s\n"
 
-        // We have some rounding
-        // division-by-power-of-two to do. Normally, this should be just
-        // a rounding-right-shift, srshl. However, that does not quite
-        // implement the round-to-nearest semantics that we need. See
-        // Appendix B of https://arxiv.org/pdf/1712.05877.pdf
-        // Because we are going to get benchmarked against less-careful
-        // competition, let's give people the ability to get faster, less
-        // careful arithmetic if they want --- define RUY_SLOPPY. We don't
-        // recommend using that in production, we have observed measurable
-        // loss of accuracy from this on MobileNets (which is how we noticed
-        // this whole issue in the first place).
+        // We have some rounding division-by-power-of-two to do. This should
+        // always use "round to nearest". We allow for some
+        // freedom in how ties are broken, to strike a good compromise of
+        // performance on given hardware vs. perfect agreement of results
+        // across hardware.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation
+        // defined tie-breaks to help performance. On NEON, this means that we
+        // can just use the NEON rounding instructions, such as srshl. They
+        // happen to be breaking ties upward.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+        // break-ties-away-from zero, as described in Appendix B of
+        // https://arxiv.org/pdf/1712.05877.pdf
+        // When we wrote that, we thought that that would be better unbiased
+        // than the NEON upwards tie-breaks, and we had observed some
+        // improvement on some model. However, that is only more unbiased for
+        // data centered at zero, which was likely the case in that model,
+        // but is not always the case. If we wanted something more consistently
+        // unbiased then we should try breaking ties toward-nearest-even.
 #if !(RUY_OPT_SET & RUY_OPT_NATIVE_ROUNDING)
         // Fix up values to be right-shifted, so that the (round to nearest,
         // break ties upward) behavior of srshl applied to these fixed-up
@@ -1859,6 +1991,130 @@
         RUY_MAKE_ZERO(v16)
         RUY_MAKE_ZERO(v17)
 
+        "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+        RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
+
+        "ldr x1, [%[lhs_ptr], #8]\n"
+        "ldr x2, [%[lhs_ptr], #24]\n"
+        "ldr x3, [%[lhs_ptr], #40]\n"
+        "ldr x4, [%[lhs_ptr], #56]\n"
+
+        "ins v0.d[1], x1\n"
+        "ldr x1, [%[rhs_ptr], #8]\n"
+        "ins v1.d[1], x2\n"
+        "ldr x2, [%[rhs_ptr], #24]\n"
+        "ins v2.d[1], x3\n"
+        "ldr x3, [%[rhs_ptr], #40]\n"
+        "ins v3.d[1], x4\n"
+        "ldr x4, [%[rhs_ptr], #56]\n"
+        "ins v4.d[1], x1\n"
+        "ins v5.d[1], x2\n"
+        "ins v6.d[1], x3\n"
+        "ins v7.d[1], x4\n"
+
+        // Since the store type is the same as the accum type, no need for
+        // downcast. There's also no need for clamp by min/max.
+
+        // At this point, v20 -- v31 aren't used anymore for the current block,
+        // so we can start clearing these accumulators for the next block
+        // (next iteration of the main loop).
+
+        RUY_MAKE_ZERO(v20)
+        "add %[lhs_ptr], %[lhs_ptr], #64\n"
+        RUY_MAKE_ZERO(v21)
+        "add %[rhs_ptr], %[rhs_ptr], #64\n"
+        RUY_MAKE_ZERO(v22)
+
+        RUY_MAKE_ZERO(v23)
+        RUY_MAKE_ZERO(v24)
+        RUY_MAKE_ZERO(v25)
+        RUY_MAKE_ZERO(v26)
+        RUY_MAKE_ZERO(v27)
+        RUY_MAKE_ZERO(v28)
+        RUY_MAKE_ZERO(v29)
+        RUY_MAKE_ZERO(v30)
+
+        // Compute how much of the 4x4 block of destination int32 values that
+        // we have computed fits in the destination matrix. Typically, all of
+        // it fits, but when the destination matrix shape is not a multiple
+        // of 4x4, there are some 4x4 blocks along the boundaries that do
+        // not fit entirely.
+        "sub w1, %w[dst_rows], %w[row]\n"
+        RUY_MAKE_ZERO(v31)
+        "sub w2, %w[dst_cols], %w[col]\n"
+        "mov w3, #4\n"
+        "cmp w1, #4\n"
+        // Compute w1 = how many rows of the 4x4 block fit
+        "csel w1, w1, w3, le\n"
+        "cmp w2, #4\n"
+        // Compute w2 = how many cols of the 4x4 block fit
+        "csel w2, w2, w3, le\n"
+
+        // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
+        "cmp w1, w3\n"
+        "ccmp w2, w3, 0, eq\n"
+        "mov x4, %[dst_ptr]\n"
+        // Yes, all of the 4x4 block fits, go to fast path.
+        "beq 30f\n"
+        // Not all of the 4x4 block fits.
+        // Store to dst_tmp_buf
+        "str q16, [%[dst_tmp_buf], #0]\n"
+        "str q17, [%[dst_tmp_buf], #16]\n"
+        "str q18, [%[dst_tmp_buf], #32]\n"
+        "str q19, [%[dst_tmp_buf], #48]\n"
+        // Slow loop copying from dst_tmp_buf to dst.
+        "mov x3, %[dst_tmp_buf]\n"
+        "mov w6, #0\n"
+        "50:\n"
+        "mov w5, #0\n"
+        "51:\n"
+        "ldr w7, [x3, x5, lsl #2]\n"
+        "str w7, [x4, x5, lsl #2]\n"
+        "add w5, w5, #1\n"
+        "cmp w5, w1\n"
+        "blt 51b\n"
+        "add w6, w6, #1\n"
+        "add x3, x3, #16\n"
+        "add x4, x4, x11\n"
+        "cmp w6, w2\n"
+        "blt 50b\n"
+        "b 31f\n"
+        "30:\n"
+        // Yes, all of the 4x4 block fits.
+        "mov x3, x4\n"
+        "st1 {v16.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v16.s}[1], [x3], #4\n"
+        "st1 {v16.s}[2], [x3], #4\n"
+        "st1 {v16.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v17.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v17.s}[1], [x3], #4\n"
+        "st1 {v17.s}[2], [x3], #4\n"
+        "st1 {v17.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v18.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v18.s}[1], [x3], #4\n"
+        "st1 {v18.s}[2], [x3], #4\n"
+        "st1 {v18.s}[3], [x3], #4\n"
+        "mov x3, x4\n"
+        "st1 {v19.s}[0], [x3], #4\n"
+        "add x4, x4, x11\n"
+        "st1 {v19.s}[1], [x3], #4\n"
+        "st1 {v19.s}[2], [x3], #4\n"
+        "st1 {v19.s}[3], [x3], #4\n"
+        "31:\n"
+
+        "add %[dst_ptr], %[dst_ptr], #16\n"
+
+        RUY_MAKE_ZERO(v16)
+        RUY_MAKE_ZERO(v17)
+        RUY_MAKE_ZERO(v18)
+        RUY_MAKE_ZERO(v19)
+
         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
 
         // For the next block: perform the first few multiply-adds on the data
@@ -2485,17 +2741,26 @@
         "sqrdmulh v30.4s, v30.4s, v14.4s\n"
         "sqrdmulh v31.4s, v31.4s, v15.4s\n"
 
-        // We have some rounding
-        // division-by-power-of-two to do. Normally, this should be just
-        // a rounding-right-shift, srshl. However, that does not quite
-        // implement the round-to-nearest semantics that we need. See
-        // Appendix B of https://arxiv.org/pdf/1712.05877.pdf
-        // Because we are going to get benchmarked against less-careful
-        // competition, let's give people the ability to get faster, less
-        // careful arithmetic if they want --- define RUY_SLOPPY. We don't
-        // recommend using that in production, we have observed measurable
-        // loss of accuracy from this on MobileNets (which is how we noticed
-        // this whole issue in the first place).
+        // We have some rounding division-by-power-of-two to do. This should
+        // always use "round to nearest". We allow for some
+        // freedom in how ties are broken, to strike a good compromise of
+        // performance on given hardware vs. perfect agreement of results
+        // across hardware.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation
+        // defined tie-breaks to help performance. On NEON, this means that we
+        // can just use the NEON rounding instructions, such as srshl. They
+        // happen to be breaking ties upward.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+        // break-ties-away-from zero, as described in Appendix B of
+        // https://arxiv.org/pdf/1712.05877.pdf
+        // When we wrote that, we thought that that would be better unbiased
+        // than the NEON upwards tie-breaks, and we had observed some
+        // improvement on some model. However, that is only more unbiased for
+        // data centered at zero, which was likely the case in that model,
+        // but is not always the case. If we wanted something more consistently
+        // unbiased then we should try breaking ties toward-nearest-even.
 #if !(RUY_OPT_SET & RUY_OPT_NATIVE_ROUNDING)
         // Fix up values to be right-shifted, so that the (round to nearest,
         // break ties upward) behavior of srshl applied to these fixed-up
@@ -3504,17 +3769,26 @@
         "sqrdmulh v30.4s, v30.4s, v14.4s\n"
         "sqrdmulh v31.4s, v31.4s, v15.4s\n"
 
-        // We have some rounding
-        // division-by-power-of-two to do. Normally, this should be just
-        // a rounding-right-shift, srshl. However, that does not quite
-        // implement the round-to-nearest semantics that we need. See
-        // Appendix B of https://arxiv.org/pdf/1712.05877.pdf
-        // Because we are going to get benchmarked against less-careful
-        // competition, let's give people the ability to get faster, less
-        // careful arithmetic if they want --- define RUY_SLOPPY. We don't
-        // recommend using that in production, we have observed measurable
-        // loss of accuracy from this on MobileNets (which is how we noticed
-        // this whole issue in the first place).
+        // We have some rounding division-by-power-of-two to do. This should
+        // always use "round to nearest". We allow for some
+        // freedom in how ties are broken, to strike a good compromise of
+        // performance on given hardware vs. perfect agreement of results
+        // across hardware.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation
+        // defined tie-breaks to help performance. On NEON, this means that we
+        // can just use the NEON rounding instructions, such as srshl. They
+        // happen to be breaking ties upward.
+        //
+        // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+        // break-ties-away-from-zero, as described in Appendix B of
+        // https://arxiv.org/pdf/1712.05877.pdf
+        // When we wrote that, we thought that it would be less biased
+        // than the NEON upwards tie-breaks, and we had observed some
+        // improvement on some model. However, that is only more unbiased for
+        // data centered at zero, which was likely the case in that model,
+        // but is not always the case. If we wanted something more consistently
+        // unbiased then we should try breaking ties toward-nearest-even.
 #if !(RUY_OPT_SET & RUY_OPT_NATIVE_ROUNDING)
         // Fix up values to be right-shifted, so that the (round to nearest,
         // break ties upward) behavior of srshl applied to these fixed-up
diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h
index 211c67c..fae8515 100644
--- a/tensorflow/lite/experimental/ruy/kernel.h
+++ b/tensorflow/lite/experimental/ruy/kernel.h
@@ -16,12 +16,13 @@
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_KERNEL_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_KERNEL_H_
 
+#include <cstddef>
 #include <cstdint>
 
 #include "fixedpoint/fixedpoint.h"
 #include "profiling/instrumentation.h"
 #include "tensorflow/lite/experimental/ruy/common.h"
-#include "tensorflow/lite/experimental/ruy/matrix.h"
+#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
 #include "tensorflow/lite/experimental/ruy/opt_set.h"
 #include "tensorflow/lite/experimental/ruy/path.h"
 #include "tensorflow/lite/experimental/ruy/size_util.h"
@@ -36,17 +37,17 @@
 
 template <Path ThePath, typename LhsScalar, typename RhsScalar,
           typename DstScalar, typename Spec>
-void RunKernel(
-    const Kernel<ThePath, LhsScalar, RhsScalar, DstScalar, Spec>& kernel,
-    const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
-    const Spec& spec, int start_row, int start_col, int end_row, int end_col,
-    Matrix<DstScalar>* dst) {
+void RunKernelTyped(Tuning tuning, const PackedMatrix<LhsScalar>& lhs,
+                    const PackedMatrix<RhsScalar>& rhs, const Spec& spec,
+                    int start_row, int start_col, int end_row, int end_col,
+                    Matrix<DstScalar>* dst) {
   using Kernel = Kernel<ThePath, LhsScalar, RhsScalar, DstScalar, Spec>;
-  using LhsLayout = typename Kernel::LhsLayout;
-  using RhsLayout = typename Kernel::RhsLayout;
+  Kernel kernel(tuning);
 #if RUY_OPT_SET & RUY_OPT_FAT_KERNEL
   kernel.Run(lhs, rhs, spec, start_row, start_col, end_row, end_col, dst);
 #else
+  using LhsLayout = typename Kernel::LhsLayout;
+  using RhsLayout = typename Kernel::RhsLayout;
   for (int col = start_col; col < end_col; col += RhsLayout::kCols) {
     int block_end_col = std::min(col + RhsLayout::kCols, end_col);
     for (int row = start_row; row < end_row; row += LhsLayout::kCols) {
@@ -57,6 +58,24 @@
 #endif
 }
 
+// Main entry point for kernels.
+template <Path ThePath, typename LhsScalar, typename RhsScalar,
+          typename DstScalar, typename Spec>
+void RunKernel(Tuning tuning, const PMatrix& lhs, const PMatrix& rhs,
+               void* spec, int start_row, int start_col, int end_row,
+               int end_col, DMatrix* dst) {
+  Matrix<DstScalar> mdst = ToMatrix<DstScalar>(*dst);
+  RunKernelTyped<ThePath, LhsScalar, RhsScalar, DstScalar, Spec>(
+      tuning, ToPackedMatrix<LhsScalar>(lhs), ToPackedMatrix<RhsScalar>(rhs),
+      *static_cast<const Spec*>(spec), start_row, start_col, end_row, end_col,
+      &mdst);
+}
+
+// The signature of RunKernel is the same, regardless of template parameters.
+using RunKernelFn =
+    decltype(RunKernel<Path::kStandardCpp, std::int8_t, std::int8_t,
+                       std::int8_t, BasicSpec<std::int32_t, std::int8_t>>);
+
 // Copied from TF Lite code.
 inline std::int32_t MultiplyByQuantizedMultiplier(
     std::int32_t x, std::int32_t quantized_multiplier, int shift) {
@@ -118,16 +137,17 @@
   using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
   using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
   explicit Kernel(Tuning) {}
-  void Run(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
-           const Spec& spec, int start_row, int start_col, int end_row,
-           int end_col, Matrix<DstScalar>* dst) const {
+  void Run(const PackedMatrix<LhsScalar>& lhs,
+           const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
+           int start_col, int end_row, int end_col,
+           Matrix<DstScalar>* dst) const {
     gemmlowp::ScopedProfilingLabel label("Kernel (Standard Cpp)");
     const int depth = lhs.layout.rows;
     for (int i = start_row; i < end_row; i++) {
       for (int j = start_col; j < end_col; j++) {
         using AccumScalar = typename Spec::AccumScalar;
         AccumScalar accum = 0;
-        for (int k = 0; k < lhs.layout.rows; k++) {
+        for (int k = 0; k < depth; k++) {
           AccumScalar lhs_val = Element(lhs, k, i);
           AccumScalar rhs_val = Element(rhs, k, j);
           accum += lhs_val * rhs_val;
@@ -136,10 +156,10 @@
           accum += spec.bias[i];
         }
         if (lhs.zero_point) {
-          accum -= lhs.zero_point * rhs.sums.get()[j];
+          accum -= lhs.zero_point * rhs.sums[j];
         }
         if (rhs.zero_point) {
-          accum -= rhs.zero_point * lhs.sums.get()[i];
+          accum -= rhs.zero_point * lhs.sums[i];
         }
         if (lhs.zero_point && rhs.zero_point) {
           accum += lhs.zero_point * rhs.zero_point * depth;
@@ -177,6 +197,7 @@
 #define RUY_ASM_TYPE_ID_UINT8 1
 #define RUY_ASM_TYPE_ID_INT8 2
 #define RUY_ASM_TYPE_ID_INT16 3
+#define RUY_ASM_TYPE_ID_INT32 4
 
 template <typename DstScalar>
 struct DstTypeId {};
@@ -196,9 +217,14 @@
   static constexpr int kValue = RUY_ASM_TYPE_ID_INT16;
 };
 
+template <>
+struct DstTypeId<std::int32_t> {
+  static constexpr int kValue = RUY_ASM_TYPE_ID_INT32;
+};
+
 template <int LhsCols, int RhsCols>
 struct KernelParams8bit {
-  static constexpr int kMaxDstTypeSize = 2;
+  static constexpr int kMaxDstTypeSize = 4;
 
   const std::int32_t* bias;
   const std::int32_t* lhs_sums;
@@ -222,8 +248,8 @@
   std::int32_t rhs_stride;
   std::int32_t dst_stride;
   std::int32_t depth;
-  std::int16_t clamp_min;
-  std::int16_t clamp_max;
+  std::int32_t clamp_min;
+  std::int32_t clamp_max;
   std::uint8_t flags;
   std::uint8_t dst_type_id;
   const std::int32_t zero_data[LhsCols] = {0};
@@ -233,8 +259,8 @@
 };
 
 template <typename DstScalar, int LhsCols, int RhsCols>
-void MakeKernelParams8bit(const Matrix<std::int8_t>& lhs,
-                          const Matrix<std::int8_t>& rhs,
+void MakeKernelParams8bit(const PackedMatrix<std::int8_t>& lhs,
+                          const PackedMatrix<std::int8_t>& rhs,
                           const BasicSpec<std::int32_t, DstScalar>& spec,
                           int start_row, int start_col, int end_row,
                           int end_col, Matrix<DstScalar>* dst,
@@ -249,20 +275,20 @@
   RUY_DCHECK_EQ(end_row % LhsCols, 0);
   RUY_DCHECK_EQ(end_col % RhsCols, 0);
 
-  params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride;
-  params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride;
+  params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride;
+  params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride;
   params->flags = 0;
   params->bias = params->zero_data;
   if (spec.bias) {
     params->bias = spec.bias;
     params->flags |= RUY_ASM_FLAG_HAS_BIAS;
   }
-  if (lhs.sums.get()) {
-    params->lhs_sums = lhs.sums.get();
+  if (lhs.sums) {
+    params->lhs_sums = lhs.sums;
     params->flags |= RUY_ASM_FLAG_HAS_LHS_SUMS;
   }
-  if (rhs.sums.get()) {
-    params->rhs_sums = rhs.sums.get();
+  if (rhs.sums) {
+    params->rhs_sums = rhs.sums;
     params->flags |= RUY_ASM_FLAG_HAS_RHS_SUMS;
   }
   params->start_row = start_row;
@@ -314,7 +340,8 @@
   using RhsLayout = FixedKernelLayout<Order::kColMajor, 16, 4>;
   Tuning tuning = Tuning::kAuto;
   explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const Matrix<std::int8_t>& lhs, const Matrix<std::int8_t>& rhs,
+  void Run(const PackedMatrix<std::int8_t>& lhs,
+           const PackedMatrix<std::int8_t>& rhs,
            const BasicSpec<std::int32_t, DstScalar>& spec, int start_row,
            int start_col, int end_row, int end_col,
            Matrix<DstScalar>* dst) const {
@@ -336,7 +363,8 @@
   using LhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
   using RhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
   explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const Matrix<std::int8_t>& lhs, const Matrix<std::int8_t>& rhs,
+  void Run(const PackedMatrix<std::int8_t>& lhs,
+           const PackedMatrix<std::int8_t>& rhs,
            const BasicSpec<std::int32_t, DstScalar>& spec, int start_row,
            int start_col, int end_row, int end_col,
            Matrix<DstScalar>* dst) const {
@@ -375,8 +403,8 @@
 };
 
 template <int LhsCols, int RhsCols>
-inline void MakeKernelParamsFloat(const Matrix<float>& lhs,
-                                  const Matrix<float>& rhs,
+inline void MakeKernelParamsFloat(const PackedMatrix<float>& lhs,
+                                  const PackedMatrix<float>& rhs,
                                   const BasicSpec<float, float>& spec,
                                   int start_row, int start_col, int end_row,
                                   int end_col, Matrix<float>* dst,
@@ -389,8 +417,8 @@
   RUY_DCHECK_EQ(end_row % LhsCols, 0);
   RUY_DCHECK_EQ(end_col % RhsCols, 0);
 
-  params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride;
-  params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride;
+  params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride;
+  params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride;
   params->dst_base_ptr =
       dst->data.get() + start_col * dst->layout.stride + start_row;
 
@@ -428,7 +456,7 @@
   using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
   using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
   explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const Matrix<float>& lhs, const Matrix<float>& rhs,
+  void Run(const PackedMatrix<float>& lhs, const PackedMatrix<float>& rhs,
            const BasicSpec<float, float>& spec, int start_row, int start_col,
            int end_row, int end_col, Matrix<float>* dst) const {
     KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
@@ -451,7 +479,7 @@
   using Base =
       Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>>;
   explicit Kernel(Tuning tuning_) : Base(tuning_) {}
-  void Run(const Matrix<float>& lhs, const Matrix<float>& rhs,
+  void Run(const PackedMatrix<float>& lhs, const PackedMatrix<float>& rhs,
            const BasicSpec<float, float>& spec, int start_row, int start_col,
            int end_row, int end_col, Matrix<float>* dst) const {
     KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h
index e7cf4a6..49b7c1d 100644
--- a/tensorflow/lite/experimental/ruy/matrix.h
+++ b/tensorflow/lite/experimental/ruy/matrix.h
@@ -27,17 +27,6 @@
 // 'column-major' means that each column is contiguous in memory.
 enum class Order : std::uint8_t { kColMajor, kRowMajor };
 
-// KernelLayout describes small-scale block structure in a matrix layout.
-// The default (rows = 1, cols = 1) means no such small-scale block structure,
-// since 1x1 blocks is the same as no blocks. In that case, the overall
-// matrix layout is just the usual linear row-major or column-major layout
-// described by the other members of struct Layout.
-struct KernelLayout final {
-  Order order = Order::kColMajor;
-  std::uint8_t rows = 1;
-  std::uint8_t cols = 1;
-};
-
 // Describes the shape and storage layout of a matrix.
 struct Layout final {
   std::int32_t rows = 0;
@@ -46,10 +35,6 @@
   // in the non-contiguous direction.
   std::int32_t stride = 0;
   Order order = Order::kColMajor;
-
-  // Small scale layout shuffling, potentially departing from
-  // linear row-major or column-major storage. See KernelLayout.
-  KernelLayout kernel;
 };
 
 namespace detail {
@@ -110,16 +95,12 @@
 // signed or unsigned.
 template <typename Scalar>
 struct Matrix final {
-
   void operator=(const Matrix& other) {
     data = other.data;
     layout = other.layout;
     zero_point = other.zero_point;
   }
 
- private:
-
- public:
   // The underlying buffer wrapped by this matrix.
   detail::ConstCheckingPtr<Scalar> data;
   // The shape and data layout of this matrix.
@@ -127,22 +108,15 @@
   // The zero_point, i.e. which Scalar value is to be interpreted as zero.
   // When Scalar is floating-point, this must be 0.
   Scalar zero_point = 0;
-  // The row/column sums needed for quantized matrix multiplication when
-  // the opposite operand of the multiplication uses a non-symmetric zero
-  // point.
-  // This member is only relevant for packed matrices.
-  // Additionally, Ruy always uses 32-bit signed accumulators for quantized
-  // matrix multiplication.
-  // For floating point types, there is no quantization, so this pointer
-  // will always be null. We still need code referencing it to compile
-  // though, even if it is always branched around. Hence we use Scalar*
-  // itself as the type in that case.
-  using SumsType =
-      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
-                                std::int32_t>::type;
-  detail::ConstCheckingPtr<SumsType> sums;
 };
 
+inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) {
+  layout->rows = rows;
+  layout->cols = cols;
+  layout->order = order;
+  layout->stride = order == Order::kColMajor ? rows : cols;
+}
+
 template <typename StreamType, typename Scalar>
 StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   for (int row = 0; row < mat.layout.rows; row++) {
diff --git a/tensorflow/lite/experimental/ruy/pack.cc b/tensorflow/lite/experimental/ruy/pack.cc
index 801210d..d21c984 100644
--- a/tensorflow/lite/experimental/ruy/pack.cc
+++ b/tensorflow/lite/experimental/ruy/pack.cc
@@ -1145,11 +1145,19 @@
           "trn1 v21.2d, v17.2d, v19.2d\n"
           "trn2 v23.2d, v17.2d, v19.2d\n"
 
-          "str q20, [%[packed_ptr], #0]\n"
-          "str q21, [%[packed_ptr], #32]\n"
-          "str q22, [%[packed_ptr], #64]\n"
-          "str q23, [%[packed_ptr], #96]\n"
-          "add %[packed_ptr], %[packed_ptr], #128\n"
+          "mov x1, #32\n"
+
+#define RUY_STORE_ONE_ROW(ROW, REGISTER)                  \
+          "cmp w2, #" #ROW "\n"                           \
+          "beq 4f\n"                                      \
+          "st1 {" #REGISTER ".4s}, [%[packed_ptr]], x1\n"
+
+          RUY_STORE_ONE_ROW(0, v20)
+          RUY_STORE_ONE_ROW(1, v21)
+          RUY_STORE_ONE_ROW(2, v22)
+          RUY_STORE_ONE_ROW(3, v23)
+
+#undef RUY_STORE_ONE_ROW
 
           "4:\n"
 
@@ -1295,11 +1303,19 @@
           "trn1 v21.2d, v17.2d, v19.2d\n"
           "trn2 v23.2d, v17.2d, v19.2d\n"
 
-          "str q20, [%[packed_ptr], #0]\n"
-          "str q21, [%[packed_ptr], #32]\n"
-          "str q22, [%[packed_ptr], #64]\n"
-          "str q23, [%[packed_ptr], #96]\n"
-          "add %[packed_ptr], %[packed_ptr], #128\n"
+          "mov x1, #32\n"
+
+#define RUY_STORE_ONE_ROW(ROW, REGISTER)                  \
+          "cmp w2, #" #ROW "\n"                           \
+          "beq 4f\n"                                      \
+          "st1 {" #REGISTER ".4s}, [%[packed_ptr]], x1\n"
+
+          RUY_STORE_ONE_ROW(0, v20)
+          RUY_STORE_ONE_ROW(1, v21)
+          RUY_STORE_ONE_ROW(2, v22)
+          RUY_STORE_ONE_ROW(3, v23)
+
+#undef RUY_STORE_ONE_ROW
 
           "4:\n"
 
diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h
index 65b1a1f..520099c 100644
--- a/tensorflow/lite/experimental/ruy/pack.h
+++ b/tensorflow/lite/experimental/ruy/pack.h
@@ -20,6 +20,7 @@
 
 #include "profiling/instrumentation.h"
 #include "tensorflow/lite/experimental/ruy/common.h"
+#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
 #include "tensorflow/lite/experimental/ruy/opt_set.h"
 #include "tensorflow/lite/experimental/ruy/tune.h"
 
@@ -63,11 +64,11 @@
 struct PackImpl<Path::kStandardCpp, FixedKernelLayout, Scalar, PackedScalar,
                 SumsType> {
   static void Run(Tuning, const Matrix<Scalar>& src_matrix,
-                  Matrix<PackedScalar>* packed_matrix, int start_col,
+                  PackedMatrix<PackedScalar>* packed_matrix, int start_col,
                   int end_col) {
     gemmlowp::ScopedProfilingLabel label("Pack (generic)");
     RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0);
-    SumsType* sums = packed_matrix->sums.get();
+    SumsType* sums = packed_matrix->sums;
     for (int col = start_col; col < end_col; col++) {
       SumsType accum = 0;
       for (int row = 0; row < packed_matrix->layout.rows; row++) {
@@ -129,12 +130,12 @@
       std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
 
   static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
-                  Matrix<std::int8_t>* packed_matrix, int start_col,
+                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                   int end_col) {
-    RUY_DCHECK(IsLinearColMajor(src_matrix.layout));
+    RUY_DCHECK(IsColMajor(src_matrix.layout));
     RUY_DCHECK(IsColMajor(packed_matrix->layout));
     RUY_DCHECK_EQ(start_col % 4, 0);
-    std::int32_t* sums = packed_matrix->sums.get();
+    std::int32_t* sums = packed_matrix->sums;
     Scalar zerobuf[16];
     memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf));
     for (int block_col = start_col; block_col < end_col; block_col += 4) {
@@ -166,7 +167,7 @@
         }
       }
       std::int8_t* packed_ptr =
-          packed_matrix->data.get() + packed_matrix->layout.stride * block_col;
+          packed_matrix->data + packed_matrix->layout.stride * block_col;
       std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
       if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
         Pack8bitNeonInOrder(
@@ -193,12 +194,12 @@
       std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
 
   static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
-                  Matrix<std::int8_t>* packed_matrix, int start_col,
+                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                   int end_col) {
-    RUY_DCHECK(IsLinearColMajor(src_matrix.layout));
+    RUY_DCHECK(IsColMajor(src_matrix.layout));
     RUY_DCHECK(IsColMajor(packed_matrix->layout));
     RUY_DCHECK_EQ(start_col % 8, 0);
-    std::int32_t* sums = packed_matrix->sums.get();
+    std::int32_t* sums = packed_matrix->sums;
     Scalar zerobuf[16];
     memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf));
     for (int block_col = start_col; block_col < end_col; block_col += 4) {
@@ -230,7 +231,7 @@
         }
       }
       std::int8_t* packed_ptr =
-          packed_matrix->data.get() +
+          packed_matrix->data +
           packed_matrix->layout.stride * (block_col & ~7) +
           ((block_col & 4) * 4);
       std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
@@ -261,11 +262,12 @@
                           float* packed_ptr, int start_col, int end_col);
 
 template <>
-struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 1, 8>, float,
+struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
                 float, float> {
   static void Run(Tuning tuning, const Matrix<float>& src_matrix,
-                  Matrix<float>* packed_matrix, int start_col, int end_col) {
-    RUY_DCHECK(IsLinearColMajor(src_matrix.layout));
+                  PackedMatrix<float>* packed_matrix, int start_col,
+                  int end_col) {
+    RUY_DCHECK(IsColMajor(src_matrix.layout));
     RUY_DCHECK(IsColMajor(packed_matrix->layout));
     RUY_DCHECK_EQ(start_col % 8, 0);
     const float zerobuf[4] = {0};
@@ -297,7 +299,7 @@
           src_inc3 = 0;
         }
       }
-      float* packed_ptr = packed_matrix->data.get() +
+      float* packed_ptr = packed_matrix->data +
                           packed_matrix->layout.stride * (block_col & ~7) +
                           ((block_col & 4));
       if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
@@ -317,15 +319,24 @@
 
 #endif  // (defined __aarch64__) && (RUY_OPT_SET & RUY_OPT_ASM)
 
+// Main entry point for packing.
 template <Path ThePath, typename FixedKernelLayout, typename Scalar,
           typename PackedScalar>
-void Pack(Tuning tuning, const Matrix<Scalar>& src_matrix,
-          Matrix<PackedScalar>* packed_matrix, int start_col, int end_col) {
-  using SumsType = typename Matrix<PackedScalar>::SumsType;
+void RunPack(Tuning tuning, const DMatrix& src_matrix, PMatrix* packed_matrix,
+             int start_col, int end_col) {
+  using SumsType = typename PackedMatrix<PackedScalar>::SumsType;
+  Matrix<Scalar> src = ToMatrix<Scalar>(src_matrix);
+  PackedMatrix<PackedScalar> packed =
+      ToPackedMatrix<PackedScalar>(*packed_matrix);
   PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType>::Run(
-      tuning, src_matrix, packed_matrix, start_col, end_col);
+      tuning, src, &packed, start_col, end_col);
 }
 
+// The signature of RunPack is the same, regardless of its template parameters.
+using RunPackFn = decltype(
+    RunPack<Path::kStandardCpp, FixedKernelLayout<Order::kColMajor, 1, 1>,
+            std::int8_t, std::int8_t>);
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_
diff --git a/tensorflow/lite/experimental/ruy/path.h b/tensorflow/lite/experimental/ruy/path.h
index 261e4a7..fd0c4a4 100644
--- a/tensorflow/lite/experimental/ruy/path.h
+++ b/tensorflow/lite/experimental/ruy/path.h
@@ -44,12 +44,31 @@
 // at runtime; then, typically in dispatch.h, we internally pick one
 // specific path and from there on, internal Ruy code deals with only one
 // path.
+//
+// When a user selects a set of compiled paths, Ruy internally dispatches to the
+// "best" one, which typically means the newest optimized instructions for a
+// given base architecture (such as ARM). Higher values of this enum correspond
+// to "better" code paths within a given base architecture for which Ruy has
+// optimized code paths.
 enum class Path : std::uint8_t {
-  // Higher values have higher precedence.
+  // This is a special null value, representing the absence of any path.
   kNone = 0,
-  kReference = 0x1,    // reference code.
-  kStandardCpp = 0x2,  // Standard C++ only. No SIMD or other arch features.
+  // Reference multiplication code.
+  // The main purpose of this path is to have a very simple standalone Mul
+  // implementation to check against.
+  // This path bypasses almost all of Ruy's internal implementation details.
+  //
+  // This is intended for testing/development.
+  kReference = 0x1,
+  // Standard C++ implementation of Ruy's architecture-specific parts.
+  // Unlike Path::kReference, this path exercises most of Ruy's internal logic.
+  //
+  // This is intended for testing/development.
+  kStandardCpp = 0x2,
+  // Optimized path using a widely available subset of ARM NEON instructions.
   kNeon = 0x4,
+  // Optimized path making use of ARM NEON dot product instructions that are
+  // available on newer ARM cores.
   kNeonDotprod = 0x8,
 };
 
@@ -68,10 +87,16 @@
                            static_cast<std::uint32_t>(q));
 }
 
+inline constexpr Path operator~(Path p) {
+  return static_cast<Path>(~static_cast<std::uint32_t>(p));
+}
+
 inline Path GetMostSignificantPath(Path path_mask) {
   return static_cast<Path>(round_down_pot(static_cast<int>(path_mask)));
 }
 
+// ruy::kAllPaths represents all Path's that make sense to use on a given
+// base architecture.
 #ifdef __aarch64__
 constexpr Path kAllPaths =
     Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod;
diff --git a/tensorflow/lite/experimental/ruy/pmu.cc b/tensorflow/lite/experimental/ruy/pmu.cc
index b69fd80..2fe79e7 100644
--- a/tensorflow/lite/experimental/ruy/pmu.cc
+++ b/tensorflow/lite/experimental/ruy/pmu.cc
@@ -15,6 +15,8 @@
 
 #include "tensorflow/lite/experimental/ruy/pmu.h"
 
+#include "tensorflow/lite/experimental/ruy/check_macros.h"
+
 #ifdef __linux__
 #include <asm/unistd.h>
 #include <linux/perf_event.h>
@@ -54,7 +56,7 @@
 
   void Stop() {
     ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
-    read(fd_, &count_, sizeof(count_));
+    RUY_CHECK_NE(read(fd_, &count_, sizeof(count_)), -1);
     close(fd_);
   }
 
diff --git a/tensorflow/lite/experimental/ruy/ruy.h b/tensorflow/lite/experimental/ruy/ruy.h
index 0e849ae..d9f88f6 100644
--- a/tensorflow/lite/experimental/ruy/ruy.h
+++ b/tensorflow/lite/experimental/ruy/ruy.h
@@ -34,8 +34,8 @@
           typename DstScalar, typename Spec>
 void Mul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
          const Spec& spec, Context* context, Matrix<DstScalar>* dst) {
-  MulDispatch<CompiledPaths, LhsScalar, RhsScalar, DstScalar, Spec> dispatch;
-  dispatch.Mul(lhs, rhs, spec, context, dst);
+  DispatchMul<CompiledPaths, LhsScalar, RhsScalar, DstScalar, Spec>(
+      lhs, rhs, spec, context, dst);
 }
 
 }  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl b/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
index a6d244d..bb111c2 100644
--- a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
+++ b/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
@@ -3,4 +3,6 @@
 """
 
 def ruy_visibility():
-    return []
+    return [
+        "//tensorflow/lite/kernels:__subpackages__",
+    ]
diff --git a/tensorflow/lite/experimental/ruy/spec.h b/tensorflow/lite/experimental/ruy/spec.h
index a8e192a..b4d5901 100644
--- a/tensorflow/lite/experimental/ruy/spec.h
+++ b/tensorflow/lite/experimental/ruy/spec.h
@@ -18,6 +18,7 @@
 
 #include <cstdint>
 #include <limits>
+#include <type_traits>
 
 namespace ruy {
 
@@ -37,16 +38,14 @@
 enum class ZeroPointSupport { kGeneral, kSymmetric };
 
 // In general we allow all Layout's, even if we may use slow paths for some
-// kinds of layouts. By choosing kPackedLinearRCC, one may opt out of this and
+// kinds of layouts. By choosing kRCC, one may opt out of this and
 // only keep support for the simplest and most efficient combination of
 // Layout's, in exchange for smaller code size. The case covered by
-// kPackedLinearRCC is that where all matrix layouts are linear (no sub-block
-// structure), packed (no striding), and where the storage orders are exactly
-// the following:
+// kRCC is where the storage orders are exactly the following:
 //    - LHS is RowMajor
 //    - RHS is ColMajor
 //    - Destination is ColMajor
-enum class LayoutSupport { kGeneral, kPackedLinearRCC };
+enum class LayoutSupport { kGeneral, kRCC };
 
 // A Spec describes all about a matrix multiplication operation that isn't
 // encoded in the LHS, RHS and destination matrices. Some of that information
@@ -83,9 +82,13 @@
   // multiplier_fixedpoint_perchannel must be nullptr.
   const int* multiplier_exponent_perchannel = nullptr;
   // min clamp bound of destination values.
-  DstScalar clamp_min = std::numeric_limits<DstScalar>::lowest();
+  DstScalar clamp_min = std::is_floating_point<DstScalar>::value
+                            ? -std::numeric_limits<DstScalar>::infinity()
+                            : std::numeric_limits<DstScalar>::lowest();
   // max clamp bound of destination values.
-  DstScalar clamp_max = std::numeric_limits<DstScalar>::max();
+  DstScalar clamp_max = std::is_floating_point<DstScalar>::value
+                            ? std::numeric_limits<DstScalar>::infinity()
+                            : std::numeric_limits<DstScalar>::max();
   // See above enum LoopStructure
   static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto;
   // See above enum LayoutSupport
diff --git a/tensorflow/lite/experimental/ruy/test.h b/tensorflow/lite/experimental/ruy/test.h
index e7eb1a9..61393e4 100644
--- a/tensorflow/lite/experimental/ruy/test.h
+++ b/tensorflow/lite/experimental/ruy/test.h
@@ -297,17 +297,13 @@
   }
 }
 
-enum class LayoutStyle { kPackedLinear, kLinear, kBlocked };
+enum class LayoutStyle { kPackedLinear, kLinear };
 
-void MakeLayout(int rows, int cols, int kernel_rows, int kernel_cols,
-                Order order, Order kernel_order, LayoutStyle layout_style,
+void MakeLayout(int rows, int cols, Order order, LayoutStyle layout_style,
                 Layout* layout) {
   layout->rows = rows;
   layout->cols = cols;
   layout->order = order;
-  layout->kernel.order = kernel_order;
-  layout->kernel.rows = kernel_rows;
-  layout->kernel.cols = kernel_cols;
 
   const int packed_stride = order == Order::kColMajor ? rows : cols;
 
@@ -340,12 +336,10 @@
 }
 
 template <typename Scalar>
-void MakeRandom(int rows, int cols, int kernel_rows, int kernel_cols,
-                Order order, Order kernel_order, Scalar zero_point,
+void MakeRandom(int rows, int cols, Order order, Scalar zero_point,
                 LayoutStyle layout_style, RandomRange range,
                 StorageMatrix<Scalar>* storage_matrix) {
-  MakeLayout(rows, cols, kernel_rows, kernel_cols, order, kernel_order,
-             layout_style, &storage_matrix->matrix.layout);
+  MakeLayout(rows, cols, order, layout_style, &storage_matrix->matrix.layout);
   storage_matrix->matrix.zero_point = zero_point;
   UniformRandomDistribution<Scalar> data_dist(range);
   MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout),
@@ -443,13 +437,8 @@
   int rows = 0;
   int cols = 0;
   int depth = 0;
-  int kernel_rows = 1;
-  int kernel_cols = 1;
-  int kernel_depth = 1;
   Order lhs_order = Order::kRowMajor;
   Order rhs_order = Order::kColMajor;
-  Order lhs_kernel_order = Order::kRowMajor;
-  Order rhs_kernel_order = Order::kColMajor;
   Order dst_order = Order::kColMajor;
   LayoutStyle layout_style = LayoutStyle::kPackedLinear;
   ExpectedOutcome expected_outcome = ExpectedOutcome::kSuccess;
@@ -526,7 +515,6 @@
 template <typename Scalar, gemmlowp::MapOrder tOrder>
 void WrapGemmlowp(const Matrix<Scalar>& src,
                   gemmlowp::MatrixMap<const Scalar, tOrder>* dst) {
-  RUY_CHECK(IsLinear(src.layout));
   RUY_CHECK(src.layout.order == (tOrder == gemmlowp::MapOrder::ColMajor
                                      ? Order::kColMajor
                                      : Order::kRowMajor));
@@ -537,7 +525,6 @@
 template <typename Scalar, gemmlowp::MapOrder tOrder>
 void WrapGemmlowpMutable(Matrix<Scalar>* src,
                          gemmlowp::MatrixMap<Scalar, tOrder>* dst) {
-  RUY_CHECK(IsLinear(src->layout));
   RUY_CHECK(src->layout.order == (tOrder == gemmlowp::MapOrder::ColMajor
                                       ? Order::kColMajor
                                       : Order::kRowMajor));
@@ -706,9 +693,6 @@
           typename RhsScalar, typename DstScalar, typename Spec>
 void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
                const Spec& spec, int max_num_threads, Matrix<DstScalar>* dst) {
-  RUY_CHECK(IsLinear(lhs.layout));
-  RUY_CHECK(IsLinear(rhs.layout));
-  RUY_CHECK(IsLinear(dst->layout));
   RUY_CHECK_EQ(lhs.zero_point, 0);
   RUY_CHECK_EQ(rhs.zero_point, 0);
   RUY_CHECK_EQ(dst->zero_point, 0);
@@ -802,9 +786,9 @@
   RUY_CHECK_EQ(spec.multiplier_exponent, 0);
 
   // Eigen::TensorMap only supports packed layouts
-  RUY_CHECK(IsPackedLinear(lhs.layout));
-  RUY_CHECK(IsPackedLinear(rhs.layout));
-  RUY_CHECK(IsPackedLinear(dst->layout));
+  RUY_CHECK(IsPacked(lhs.layout));
+  RUY_CHECK(IsPacked(rhs.layout));
+  RUY_CHECK(IsPacked(dst->layout));
 
   using TensorLhsType =
       Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>;
@@ -1412,11 +1396,14 @@
   spec_unclamped.multiplier_exponent_perchannel =
       spec->multiplier_exponent_perchannel;
   Mul<Path::kReference>(lhs, rhs, spec_unclamped, &context, &unclamped_dst);
-  std::sort(unclamped_dst_data.begin(), unclamped_dst_data.end());
-  const int clamp_count = static_cast<int>(std::floor(kClampRatio * size));
-  RUY_CHECK_LT(clamp_count, size);
-  spec->clamp_min = unclamped_dst_data[clamp_count];
-  spec->clamp_max = unclamped_dst_data[size - 1 - clamp_count];
+  // If dst is std::int32_t, no need to set the clamp min/max.
+  if (!std::is_same<typename Spec::DstScalar, std::int32_t>::value) {
+    std::sort(unclamped_dst_data.begin(), unclamped_dst_data.end());
+    const int clamp_count = static_cast<int>(std::floor(kClampRatio * size));
+    RUY_CHECK_LT(clamp_count, size);
+    spec->clamp_min = unclamped_dst_data[clamp_count];
+    spec->clamp_max = unclamped_dst_data[size - 1 - clamp_count];
+  }
 }
 
 template <typename LhsScalar, typename RhsScalar, typename SpecType>
@@ -1425,7 +1412,12 @@
   if (!use_specified_zero_points) {
     MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point);
     MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point);
-    MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point);
+    // If destination is std::int32_t, no dst_zero_point is necessary.
+    if (std::is_same<DstScalar, std::int32_t>::value) {
+      dst_zero_point = 0;
+    } else {
+      MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point);
+    }
   }
   life_stage = LifeStage::kHasZeroPoints;
 }
@@ -1433,11 +1425,9 @@
 template <typename LhsScalar, typename RhsScalar, typename SpecType>
 void TestSet<LhsScalar, RhsScalar, SpecType>::MakeLhsRhs() {
   RUY_CHECK(life_stage == LifeStage::kHasZeroPoints);
-  MakeRandom(rows, depth, kernel_rows, kernel_depth, lhs_order,
-             lhs_kernel_order, lhs_zero_point, layout_style,
+  MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style,
              RandomRange::kAvoidMinValue, &lhs);
-  MakeRandom(depth, cols, kernel_depth, kernel_cols, rhs_order,
-             rhs_kernel_order, rhs_zero_point, layout_style,
+  MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style,
              RandomRange::kGeneral, &rhs);
   life_stage = LifeStage::kHasLhsRhs;
 }
@@ -1527,12 +1517,11 @@
   paths_bitfield = paths_bitfield & kAllPaths;
   paths = PathsBitfieldAsVector(paths_bitfield);
 
-  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
-
 #ifdef RUY_TEST_EXTERNAL_PATHS
 
-  if (!getenv("NOEXT") && IsLinear(lhs.matrix.layout) &&
-      IsLinear(rhs.matrix.layout)) {
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+
+  if (!getenv("NOEXT")) {
     if (SupportsGemmlowp<TestSetType>::kValue) {
 #ifdef GEMMLOWP_SSE4
       const bool gemmlowp_supported = !spec.multiplier_fixedpoint_perchannel;
@@ -1569,8 +1558,8 @@
       TestResult<DstScalar>& result = results.back();
       result.path = path;
       result.tuning = tuning;
-      MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point,
-                 layout_style, RandomRange::kGeneral, &result.storage_matrix);
+      MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
+                 RandomRange::kGeneral, &result.storage_matrix);
     }
   }
 
@@ -1578,8 +1567,8 @@
     results.emplace_back();
     TestResult<DstScalar>& result = results.back();
     result.external_path = external_path;
-    MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point,
-               layout_style, RandomRange::kGeneral, &result.storage_matrix);
+    MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
+               RandomRange::kGeneral, &result.storage_matrix);
   }
 
   life_stage = LifeStage::kHasResultPaths;
@@ -1918,8 +1907,7 @@
 }
 
 template <typename TestSetType>
-void TestPackedLinearRCC(int rows, int depth, int cols,
-                         ExpectedOutcome expected_outcome) {
+void TestRCC(int rows, int depth, int cols, ExpectedOutcome expected_outcome) {
   TestSetType test_set;
   test_set.rows = rows;
   test_set.depth = depth;
@@ -1933,9 +1921,23 @@
 }
 
 template <typename TestSetType>
-void TestPackedLinearRCC(int rows, int depth, int cols) {
-  TestPackedLinearRCC<TestSetType>(rows, depth, cols,
-                                   ExpectedOutcome::kSuccess);
+void TestRCC(int rows, int depth, int cols) {
+  TestRCC<TestSetType>(rows, depth, cols, ExpectedOutcome::kSuccess);
+}
+
+template <typename TestSetType>
+void TestNonRCC(int rows, int depth, int cols,
+                ExpectedOutcome expected_outcome) {
+  TestSetType test_set;
+  test_set.rows = rows;
+  test_set.depth = depth;
+  test_set.cols = cols;
+  test_set.lhs_order = Order::kColMajor;
+  test_set.rhs_order = Order::kColMajor;
+  test_set.dst_order = Order::kColMajor;
+  test_set.layout_style = LayoutStyle::kPackedLinear;
+  test_set.expected_outcome = expected_outcome;
+  test_set.Run();
 }
 
 template <typename TestSetType>
@@ -1967,50 +1969,6 @@
                                    ExpectedOutcome::kSuccess);
 }
 
-template <typename TestSetType>
-void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows,
-                            int kernel_depth, int kernel_cols,
-                            ExpectedOutcome expected_outcome) {
-  const std::vector<Order> orders{Order::kColMajor, Order::kRowMajor};
-
-  for (Order lhs_order : orders) {
-    for (Order rhs_order : orders) {
-      for (Order dst_order : orders) {
-        for (Order lhs_kernel_order : orders) {
-          for (Order rhs_kernel_order : orders) {
-            TestSetType test_set;
-            test_set.rows = rows;
-            test_set.depth = depth;
-            test_set.cols = cols;
-            test_set.kernel_rows = kernel_rows;
-            test_set.kernel_depth = kernel_depth;
-            test_set.kernel_cols = kernel_cols;
-            test_set.lhs_order = lhs_order;
-            test_set.rhs_order = rhs_order;
-            test_set.lhs_kernel_order = lhs_kernel_order;
-            test_set.rhs_kernel_order = rhs_kernel_order;
-            test_set.dst_order = dst_order;
-            test_set.layout_style = LayoutStyle::kLinear;
-            test_set.expected_outcome = expected_outcome;
-            test_set.Run();
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename TestSetType>
-void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows,
-                            int kernel_depth, int kernel_cols) {
-  RUY_CHECK_EQ(rows % kernel_rows, 0);
-  RUY_CHECK_EQ(depth % kernel_depth, 0);
-  RUY_CHECK_EQ(cols % kernel_cols, 0);
-  TestNonLinearAllOrders<TestSetType>(rows, depth, cols, kernel_rows,
-                                      kernel_depth, kernel_cols,
-                                      ExpectedOutcome::kSuccess);
-}
-
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_
diff --git a/tensorflow/lite/experimental/ruy/test_fast.cc b/tensorflow/lite/experimental/ruy/test_fast.cc
index 06533a0..7026bca 100644
--- a/tensorflow/lite/experimental/ruy/test_fast.cc
+++ b/tensorflow/lite/experimental/ruy/test_fast.cc
@@ -56,7 +56,7 @@
   };
 
   for (int size : sizes) {
-    TestPackedLinearRCC<TestSetType>(size, size, size);
+    TestRCC<TestSetType>(size, size, size);
     TestLinearAllOrders<TestSetType>(size, size, size);
   }
 }
@@ -73,7 +73,7 @@
 }
 
 TEST(RuyTest, TestDeepMuls) {
-  TestPackedLinearRCC<TestSetType>(1, 50001, 1);
+  TestRCC<TestSetType>(1, 50001, 1);
   TestLinearAllOrders<TestSetType>(5, 5001, 4);
   TestLinearAllOrders<TestSetType>(9, 1025, 10);
 }
@@ -94,10 +94,4 @@
   }
 }
 
-TEST(RuyTest, TestNonLinear) {
-  TestNonLinearAllOrders<TestSetType>(10, 11, 12, 2, 1, 4);
-  TestNonLinearAllOrders<TestSetType>(10, 12, 11, 2, 4, 1);
-  TestNonLinearAllOrders<TestSetType>(8, 2, 4, 8, 2, 4);
-  TestNonLinearAllOrders<TestSetType>(24, 32, 16, 8, 16, 4);
-}
 }  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/test_slow.cc b/tensorflow/lite/experimental/ruy/test_slow.cc
index e19fb72..0a10a16 100644
--- a/tensorflow/lite/experimental/ruy/test_slow.cc
+++ b/tensorflow/lite/experimental/ruy/test_slow.cc
@@ -29,10 +29,10 @@
 
 TEST(RuyTest, TestBigNarrowMuls) {
   for (int width : {1, 2, 3, 4, 5, 8}) {
-    TestPackedLinearRCC<TestSetType>(width, 401, 601);
-    TestPackedLinearRCC<TestSetType>(587, 443, width);
+    TestRCC<TestSetType>(width, 401, 601);
+    TestRCC<TestSetType>(587, 443, width);
   }
-  TestPackedLinearRCC<TestSetType>(512, 256, 16);
+  TestRCC<TestSetType>(512, 256, 16);
 }
 
 TEST(RuyTest, TestBigShallowMuls) {
@@ -42,7 +42,7 @@
 }
 
 TEST(RuyTest, TestBigMuls) {
-  TestPackedLinearRCC<TestSetType>(225, 303, 199);
+  TestRCC<TestSetType>(225, 303, 199);
   TestLinearAllOrders<TestSetType>(256, 192, 128);
 }
 
diff --git a/tensorflow/lite/experimental/ruy/test_special_specs.cc b/tensorflow/lite/experimental/ruy/test_special_specs.cc
index 9498e2b..5e1d8d9 100644
--- a/tensorflow/lite/experimental/ruy/test_special_specs.cc
+++ b/tensorflow/lite/experimental/ruy/test_special_specs.cc
@@ -32,9 +32,8 @@
 };
 
 template <typename AccumScalar, typename DstScalar>
-struct PackedLinearRCCSpec : BasicSpec<AccumScalar, DstScalar> {
-  static constexpr LayoutSupport kLayoutSupport =
-      LayoutSupport::kPackedLinearRCC;
+struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
+  static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
 using LhsScalar = RUY_TEST_LHSSCALAR;
@@ -117,13 +116,11 @@
       SymmetricZeroPoint<DstScalar>() - 1, ExpectedOutcome::kDeath);
 }
 
-TEST(TestSpecialSpecs, PackedLinearRCC) {
-  using PackedLinearRCCSpec = PackedLinearRCCSpec<AccumScalar, DstScalar>;
-  using PackedLinearRCCTestSet =
-      TestSet<LhsScalar, RhsScalar, PackedLinearRCCSpec>;
-  TestPackedLinearRCC<PackedLinearRCCTestSet>(81, 93, 72);
-  TestLinearAllOrders<PackedLinearRCCTestSet>(81, 93, 72,
-                                              ExpectedOutcome::kDeath);
+TEST(TestSpecialSpecs, RCC) {
+  using RCCSpec = RCCSpec<AccumScalar, DstScalar>;
+  using RCCTestSet = TestSet<LhsScalar, RhsScalar, RCCSpec>;
+  TestRCC<RCCTestSet>(81, 93, 72);
+  TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
 }  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/thread_pool.cc b/tensorflow/lite/experimental/ruy/thread_pool.cc
index 10b2976..de65a62 100644
--- a/tensorflow/lite/experimental/ruy/thread_pool.cc
+++ b/tensorflow/lite/experimental/ruy/thread_pool.cc
@@ -225,17 +225,17 @@
   BlockingCounter* const counter_to_decrement_when_ready_;
 };
 
-void ThreadPool::Execute(int task_count, Task** tasks_ptrs) {
+void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
   RUY_DCHECK_GE(task_count, 1);
   // Task #0 will be run on the current thread.
   CreateThreads(task_count - 1);
   counter_to_decrement_when_ready_.Reset(task_count - 1);
   for (int i = 1; i < task_count; i++) {
-    threads_[i - 1]->StartWork(tasks_ptrs[i]);
+    auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride;
+    threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address));
   }
   // Execute task #0 workload immediately on the current thread.
-  Task* last_task = tasks_ptrs[0];
-  last_task->Run();
+  (tasks + 0)->Run();
   // Wait for the threads submitted above to finish.
   counter_to_decrement_when_ready_.Wait();
 }
diff --git a/tensorflow/lite/experimental/ruy/thread_pool.h b/tensorflow/lite/experimental/ruy/thread_pool.h
index 55eb875..179f5d4 100644
--- a/tensorflow/lite/experimental/ruy/thread_pool.h
+++ b/tensorflow/lite/experimental/ruy/thread_pool.h
@@ -68,7 +68,13 @@
   // want to run an unbounded number of tasks on a bounded number of threads,
   // then you need something higher-level than this ThreadPool, that can
   // be layered on top of it by appropriately subclassing Tasks.
-  void Execute(int task_count, Task** tasks_ptrs);
+  //
+  // TaskType must be a subclass of ruy::Task. That is implicitly guarded by
+  // the static_cast in this inline implementation.
+  template <typename TaskType>
+  void Execute(int task_count, TaskType* tasks) {
+    ExecuteImpl(task_count, sizeof(TaskType), static_cast<Task*>(tasks));
+  }
 
  private:
   // Ensures that the pool has at least the given count of threads.
@@ -76,6 +82,10 @@
   // be ready.
   void CreateThreads(int threads_count);
 
+  // Non-templatized implementation of the public Execute method.
+  // See the inline implementation of Execute for how this is used.
+  void ExecuteImpl(int task_count, int stride, Task* tasks);
+
   // copy construction disallowed
   ThreadPool(const ThreadPool&) = delete;
 
diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple
index 44e73bd..a5a0c94 100644
--- a/tensorflow/lite/experimental/swift/BUILD.apple
+++ b/tensorflow/lite/experimental/swift/BUILD.apple
@@ -42,7 +42,7 @@
     sdk_frameworks = [
         "CoreGraphics",
     ],
-    tags = TFL_DEFAULT_TAGS + ["manual"],
+    tags = TFL_DEFAULT_TAGS,
     deps = [
         ":AppLib",
     ],
diff --git a/tensorflow/lite/experimental/swift/README.md b/tensorflow/lite/experimental/swift/README.md
index 2ed07ba..3e5badf 100644
--- a/tensorflow/lite/experimental/swift/README.md
+++ b/tensorflow/lite/experimental/swift/README.md
@@ -5,7 +5,7 @@
 machine learning models with a small binary size and fast performance supporting
 hardware acceleration.
 
-## Getting Started
+## Build TensorFlow with iOS support
 
 To build the Swift TensorFlow Lite library on Apple platforms,
 [install from source](https://www.tensorflow.org/install/source#setup_for_linux_and_macos)
@@ -19,9 +19,25 @@
 
 Follow the prompts and when asked to build TensorFlow with iOS support, enter `y`.
 
-### Bazel
+### CocoaPods developers
 
-In your `BUILD` file, add the `TensorFlowLite` dependency:
+Add the TensorFlow Lite pod to your `Podfile`:
+
+```ruby
+pod 'TensorFlowLiteSwift'
+```
+
+Then, run `pod install`.
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
+
+### Bazel developers
+
+In your `BUILD` file, add the `TensorFlowLite` dependency to your target:
 
 ```python
 swift_library(
@@ -49,12 +65,12 @@
 bazel test tensorflow/lite/experimental/swift:TensorFlowLiteTests --swiftcopt=-enable-testing
 ```
 
-Note that `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`).
+Note: `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`).
 
-### Tulsi
+#### Generate the Xcode project using Tulsi
 
-Open the `TensorFlowLite.tulsiproj` using the
-[TulsiApp](https://github.com/bazelbuild/tulsi)
+Open the `//tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj` using
+the [TulsiApp](https://github.com/bazelbuild/tulsi)
 or by running the
 [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
 script from the root `tensorflow` directory:
@@ -62,19 +78,3 @@
 ```shell
 generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
 ```
-
-### CocoaPods
-
-Add the following to your `Podfile`:
-
-```ruby
-pod 'TensorFlowLiteSwift'
-```
-
-Then, run `pod install`.
-
-In your Swift files, import the module:
-
-```swift
-import TensorFlowLite
-```
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
index 47ea935..c1aea0a 100644
--- a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -87,7 +87,6 @@
   /// - Throws: An error if the model was not ready because tensors were not allocated.
   public func invoke() throws {
     guard TFL_InterpreterInvoke(cInterpreter) == kTfLiteOk else {
-      // TODO(b/117510052): Determine which error to throw.
       throw InterpreterError.allocateTensorsRequired
     }
   }
@@ -104,8 +103,8 @@
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
     guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)),
-          let bytes = TFL_TensorData(cTensor),
-          let nameCString = TFL_TensorName(cTensor)
+      let bytes = TFL_TensorData(cTensor),
+      let nameCString = TFL_TensorName(cTensor)
     else {
       throw InterpreterError.allocateTensorsRequired
     }
@@ -124,7 +123,6 @@
     let zeroPoint = Int(cQuantizationParams.zero_point)
     var quantizationParameters: QuantizationParameters? = nil
     if scale != 0.0 {
-      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
       quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
     }
     let tensor = Tensor(
@@ -151,10 +149,9 @@
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
     guard let cTensor = TFL_InterpreterGetOutputTensor(cInterpreter, Int32(index)),
-          let bytes = TFL_TensorData(cTensor),
-          let nameCString = TFL_TensorName(cTensor)
+      let bytes = TFL_TensorData(cTensor),
+      let nameCString = TFL_TensorName(cTensor)
     else {
-      // TODO(b/117510052): Determine which error to throw.
       throw InterpreterError.invokeInterpreterRequired
     }
     guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
@@ -172,7 +169,6 @@
     let zeroPoint = Int(cQuantizationParams.zero_point)
     var quantizationParameters: QuantizationParameters? = nil
     if scale != 0.0 {
-      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
       quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
     }
     let tensor = Tensor(
@@ -200,11 +196,11 @@
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
     guard TFL_InterpreterResizeInputTensor(
-            cInterpreter,
-            Int32(index),
-            shape.int32Dimensions,
-            Int32(shape.rank)
-          ) == kTfLiteOk
+      cInterpreter,
+      Int32(index),
+      shape.int32Dimensions,
+      Int32(shape.rank)
+    ) == kTfLiteOk
     else {
       throw InterpreterError.failedToResizeInputTensor(index: index)
     }
@@ -233,7 +229,13 @@
       throw InterpreterError.invalidTensorDataCount(provided: data.count, required: byteCount)
     }
 
+    #if swift(>=5.0)
+    let status = data.withUnsafeBytes {
+      TFL_TensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
+    }
+    #else
     let status = data.withUnsafeBytes { TFL_TensorCopyFromBuffer(cTensor, $0, data.count) }
+    #endif  // swift(>=5.0)
     guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
     return try input(at: index)
   }
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
index 5de58b9..b9dc01c 100644
--- a/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
@@ -68,32 +68,4 @@
   }
 }
 
-#if swift(>=4.2)
 extension InterpreterError: Equatable {}
-#else
-extension InterpreterError: Equatable {
-  public static func == (lhs: InterpreterError, rhs: InterpreterError) -> Bool {
-    switch (lhs, rhs) {
-    case (.invalidTensorDataType, .invalidTensorDataType),
-         (.failedToLoadModel, .failedToLoadModel),
-         (.failedToCreateInterpreter, .failedToCreateInterpreter),
-         (.failedToAllocateTensors, .failedToAllocateTensors),
-         (.allocateTensorsRequired, .allocateTensorsRequired),
-         (.invokeInterpreterRequired, .invokeInterpreterRequired):
-      return true
-    case (.invalidTensorIndex(let lhsIndex, let lhsMaxIndex),
-          .invalidTensorIndex(let rhsIndex, let rhsMaxIndex)):
-      return lhsIndex == rhsIndex && lhsMaxIndex == rhsMaxIndex
-    case (.invalidTensorDataCount(let lhsProvidedCount, let lhsRequiredCount),
-          .invalidTensorDataCount(let rhsProvidedCount, let rhsRequiredCount)):
-      return lhsProvidedCount == rhsProvidedCount && lhsRequiredCount == rhsRequiredCount
-    case (.failedToResizeInputTensor(let lhsIndex), .failedToResizeInputTensor(let rhsIndex)):
-      return lhsIndex == rhsIndex
-    case (.tensorFlowLiteError(let lhsMessage), .tensorFlowLiteError(let rhsMessage)):
-      return lhsMessage == rhsMessage
-    default:
-      return false
-    }
-  }
-}
-#endif  // swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec
index 5d600f0..3210ccc 100644
--- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec
+++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec
@@ -1,12 +1,10 @@
-# Run `pod lib lint TensorFlowLiteSwift.podspec` to ensure this is a valid spec.
-
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteSwift'
-  s.version          = '0.1.0'
+  s.version          = '0.2.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '2b96dde' }
+  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '37c101d' }
   s.summary          = 'TensorFlow Lite for Swift'
   s.description      = <<-DESC
 
@@ -16,7 +14,6 @@
                        DESC
 
   s.ios.deployment_target = '9.0'
-  s.swift_version = '4.2'
 
   s.module_name = 'TensorFlowLite'
   s.static_framework = true
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/Podfile b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/Podfile
new file mode 100644
index 0000000..9c9fe28
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/Podfile
@@ -0,0 +1,6 @@
+platform :ios, '9.0'
+
+target 'TensorFlowLiteApp' do
+  use_frameworks!
+  pod 'TensorFlowLiteSwift'
+end
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
index fbbf9a1..eb5d65a 100644
--- a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
@@ -7,6 +7,9 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		4A1E2BA0227C8B53006C23E2 /* multi_add.bin in Resources */ = {isa = PBXBuildFile; fileRef = 4A1E2B9D227C8B51006C23E2 /* multi_add.bin */; };
+		4A1E2BA1227C8B53006C23E2 /* add_quantized.bin in Resources */ = {isa = PBXBuildFile; fileRef = 4A1E2B9E227C8B52006C23E2 /* add_quantized.bin */; };
+		4A1E2BA2227C8B53006C23E2 /* add.bin in Resources */ = {isa = PBXBuildFile; fileRef = 4A1E2B9F227C8B52006C23E2 /* add.bin */; };
 		4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */; };
 		4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B722146ED64006C3AEF /* AppDelegate.swift */; };
 		4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B742146ED64006C3AEF /* ViewController.swift */; };
@@ -17,6 +20,9 @@
 /* End PBXBuildFile section */
 
 /* Begin PBXFileReference section */
+		4A1E2B9D227C8B51006C23E2 /* multi_add.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = multi_add.bin; path = ../../../../../testdata/multi_add.bin; sourceTree = "<group>"; };
+		4A1E2B9E227C8B52006C23E2 /* add_quantized.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_quantized.bin; path = ../../../../../testdata/add_quantized.bin; sourceTree = "<group>"; };
+		4A1E2B9F227C8B52006C23E2 /* add.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add.bin; path = ../../../../../testdata/add.bin; sourceTree = "<group>"; };
 		4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+TensorFlowLite.swift"; sourceTree = "<group>"; };
 		4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TensorFlowLiteApp.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		4AA72B722146ED64006C3AEF /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
@@ -66,6 +72,9 @@
 				4AA72B792146ED66006C3AEF /* Assets.xcassets */,
 				4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */,
 				4AA72B7E2146ED66006C3AEF /* Info.plist */,
+				4A1E2B9E227C8B52006C23E2 /* add_quantized.bin */,
+				4A1E2B9F227C8B52006C23E2 /* add.bin */,
+				4A1E2B9D227C8B51006C23E2 /* multi_add.bin */,
 			);
 			path = TensorFlowLiteApp;
 			sourceTree = "<group>";
@@ -102,6 +111,7 @@
 				TargetAttributes = {
 					4AA72B6E2146ED64006C3AEF = {
 						CreatedOnToolsVersion = 9.4.1;
+						LastSwiftMigration = 1020;
 					};
 				};
 			};
@@ -128,9 +138,12 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				4A1E2BA1227C8B53006C23E2 /* add_quantized.bin in Resources */,
 				4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */,
 				4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */,
+				4A1E2BA2227C8B53006C23E2 /* add.bin in Resources */,
 				4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */,
+				4A1E2BA0227C8B53006C23E2 /* multi_add.bin in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -296,7 +309,7 @@
 				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
 				PRODUCT_NAME = "$(TARGET_NAME)";
-				SWIFT_VERSION = 4.0;
+				SWIFT_VERSION = 5.0;
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
 			name = Debug;
@@ -313,7 +326,7 @@
 				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
 				PRODUCT_NAME = "$(TARGET_NAME)";
-				SWIFT_VERSION = 4.0;
+				SWIFT_VERSION = 5.0;
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
 			name = Release;
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
index ffa90a0..45fd697 100644
--- a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
@@ -14,11 +14,3 @@
     return true
   }
 }
-
-// MARK: - Extensions
-
-#if !swift(>=4.2)
-extension UIApplication {
-  typealias LaunchOptionsKey = UIApplicationLaunchOptionsKey
-}
-#endif  // !swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
index 56df1ce..e9fb026 100644
--- a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
@@ -11,12 +11,15 @@
   /// - Parameter unsafeData: The data containing the bytes to turn into an array.
   init?(unsafeData: Data) {
     guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
-    let elements = unsafeData.withUnsafeBytes {
-      UnsafeBufferPointer<Element>(
+    #if swift(>=5.0)
+    self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
+    #else
+    self = unsafeData.withUnsafeBytes {
+      .init(UnsafeBufferPointer<Element>(
         start: $0,
         count: unsafeData.count / MemoryLayout<Element>.stride
-      )
+      ))
     }
-    self.init(elements)
+    #endif  // swift(>=5.0)
   }
 }
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
index b9e8bfb..5eab6d7 100644
--- a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
@@ -1,17 +1,18 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14490.70" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
     <device id="retina4_7" orientation="portrait">
         <adaptation id="fullscreen"/>
     </device>
     <dependencies>
-        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14490.49"/>
         <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
     </dependencies>
     <scenes>
         <!--View Controller-->
         <scene sceneID="tne-QT-ifu">
             <objects>
-                <viewController storyboardIdentifier="viewController" useStoryboardIdentifierAsRestorationIdentifier="YES" id="BYZ-38-t0r" customClass="ViewController" customModule="AppLib" sceneMemberID="viewController">
+                <viewController storyboardIdentifier="viewController" useStoryboardIdentifierAsRestorationIdentifier="YES" id="BYZ-38-t0r" customClass="ViewController" customModule="TensorFlowLiteApp" customModuleProvider="target" sceneMemberID="viewController">
                     <layoutGuides>
                         <viewControllerLayoutGuide type="top" id="y3c-jy-aDJ"/>
                         <viewControllerLayoutGuide type="bottom" id="wfy-db-euE"/>
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
index e98da5f..1a9b898 100644
--- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -287,18 +287,24 @@
 extension Array {
   /// Creates a new array from the bytes of the given unsafe data.
   ///
+  /// - Warning: The array's `Element` type must be trivial in that it can be copied bit for bit
+  ///     with no indirection or reference-counting operations; otherwise, copying the raw bytes in
+  ///     the `unsafeData`'s buffer to a new array returns an unsafe copy.
   /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
   ///     `MemoryLayout<Element>.stride`.
   /// - Parameter unsafeData: The data containing the bytes to turn into an array.
   init?(unsafeData: Data) {
     guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
-    let elements = unsafeData.withUnsafeBytes {
-      UnsafeBufferPointer<Element>(
+    #if swift(>=5.0)
+    self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
+    #else
+    self = unsafeData.withUnsafeBytes {
+      .init(UnsafeBufferPointer<Element>(
         start: $0,
         count: unsafeData.count / MemoryLayout<Element>.stride
-      )
+      ))
     }
-    self.init(elements)
+    #endif  // swift(>=5.0)
   }
 }
 
diff --git a/tensorflow/lite/experimental/swift/Tests/ModelTests.swift b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
index 025db18..c0fc15e 100644
--- a/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
+++ b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
@@ -24,9 +24,9 @@
 
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(
-            forResource: Constant.modelInfo.name,
-            ofType: Constant.modelInfo.extension)
-    else {
+      forResource: Constant.modelInfo.name,
+      ofType: Constant.modelInfo.extension
+    ) else {
       XCTFail("Failed to get the model file path.")
       return
     }
diff --git a/tensorflow/lite/experimental/tensorboard/BUILD b/tensorflow/lite/experimental/tensorboard/BUILD
new file mode 100644
index 0000000..10848e7
--- /dev/null
+++ b/tensorflow/lite/experimental/tensorboard/BUILD
@@ -0,0 +1,25 @@
+# TFLite modules to support TensorBoard plugin.
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "ops_util",
+    srcs = ["ops_util.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/python:wrap_toco",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "ops_util_test",
+    srcs = ["ops_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ops_util",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/lite/experimental/tensorboard/README.md b/tensorflow/lite/experimental/tensorboard/README.md
new file mode 100644
index 0000000..fdbd160
--- /dev/null
+++ b/tensorflow/lite/experimental/tensorboard/README.md
@@ -0,0 +1,4 @@
+This folder contains basic modules to support TFLite plugin for TensorBoard.
+
+Warning: Everything in this directory is experimental and highly subject to
+change.
diff --git a/tensorflow/lite/experimental/tensorboard/ops_util.py b/tensorflow/lite/experimental/tensorboard/ops_util.py
new file mode 100644
index 0000000..3359e86
--- /dev/null
+++ b/tensorflow/lite/experimental/tensorboard/ops_util.py
@@ -0,0 +1,50 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops util to handle ops for Lite."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.lite.python import wrap_toco
+from tensorflow.python.util.tf_export import tf_export
+
+
+class SupportedOp(collections.namedtuple("SupportedOp", ["op"])):
+  """Spec of supported ops.
+
+  Args:
+    op: string of op name.
+  """
+
+
+@tf_export(v1=["lite.experimental.get_potentially_supported_ops"])
+def get_potentially_supported_ops():
+  """Returns operations potentially supported by TensorFlow Lite.
+
+  The potentially supported list contains a list of ops that are partially or
+  fully supported, which is derived by simply scanning op names to check whether
+  they can be handled without real conversion and specific parameters.
+
+  Given that some ops may be partially supported, the optimal way to determine
+  if a model's operations are supported is by converting using the TensorFlow
+  Lite converter.
+
+  Returns:
+    A list of SupportedOp.
+  """
+  ops = wrap_toco.wrapped_get_potentially_supported_ops()
+  return [SupportedOp(o["op"]) for o in ops]
diff --git a/tensorflow/lite/experimental/tensorboard/ops_util_test.py b/tensorflow/lite/experimental/tensorboard/ops_util_test.py
new file mode 100644
index 0000000..3896667
--- /dev/null
+++ b/tensorflow/lite/experimental/tensorboard/ops_util_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ops_util."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.lite.experimental.tensorboard import ops_util
+from tensorflow.python.platform import test
+
+
+class OpsUtilTest(test.TestCase):
+
+  def testGetPotentiallySupportedOps(self):
+    ops = ops_util.get_potentially_supported_ops()
+    # See GetTensorFlowNodeConverterMap() in
+    # tensorflow/lite/toco/import_tensorflow.cc
+    self.assertIsInstance(ops, list)
+    # Test partial ops that surely exist in the list.
+    self.assertIn(ops_util.SupportedOp("Add"), ops)
+    self.assertIn(ops_util.SupportedOp("Log"), ops)
+    self.assertIn(ops_util.SupportedOp("Sigmoid"), ops)
+    self.assertIn(ops_util.SupportedOp("Softmax"), ops)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h
index 4556f74..77f7b26 100644
--- a/tensorflow/lite/experimental/writer/enum_mapping.h
+++ b/tensorflow/lite/experimental/writer/enum_mapping.h
@@ -62,6 +62,8 @@
       return TensorType_FLOAT32;  // TODO(aselle): Consider an error.
     case kTfLiteFloat32:
       return TensorType_FLOAT32;
+    case kTfLiteFloat16:
+      return TensorType_FLOAT16;
     case kTfLiteInt32:
       return TensorType_INT32;
     case kTfLiteUInt8:
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index a5c5dc8..2ea105f 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -174,6 +174,7 @@
     op_to_option_["RELU"] = "";
     op_to_option_["RELU_N1_TO_1"] = "";
     op_to_option_["RELU6"] = "";
+    op_to_option_["ROUND"] = "";
     op_to_option_["TANH"] = "";
     op_to_option_["PRELU"] = "";
     op_to_option_["SIN"] = "";
@@ -272,10 +273,12 @@
   }
 
   fprintf(fp, "  case BuiltinOperator_%s:  {\n", op_name.c_str());
-  fprintf(fp,
-          "    const auto* params = reinterpret_cast<const "
-          "%s*>(builtin_op_data);\n",
-          struct_name.c_str());
+  if (options->num_elems != 0) {
+    fprintf(fp,
+            "    const auto* params = reinterpret_cast<const "
+            "%s*>(builtin_op_data);\n",
+            struct_name.c_str());
+  }
 
   for (size_t i = 0; i < options->num_elems; i++) {
     std::string elem_name = options->names[i];
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index ba31086..3c1b484 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -90,12 +90,15 @@
       - title: "Build for Raspberry Pi"
         path: /lite/guide/build_rpi
 
-      - heading: "Microcontroller"
+      - heading: "Microcontrollers"
       - title: "Overview"
-        path: /lite/guide/microcontroller
-      - title: "TensorFlow Codelab"
-        path: https://g.co/codelabs/sparkfunTF
-        status: external
+        path: /lite/microcontrollers/overview
+      - title: "Get started with microcontrollers"
+        path: /lite/microcontrollers/get_started
+      - title: "Build and convert models"
+        path: /lite/microcontrollers/build_convert
+      - title: "Understand the C++ library"
+        path: /lite/microcontrollers/library
 
     - name: "Examples"
       contents:
diff --git a/tensorflow/lite/g3doc/convert/index.md b/tensorflow/lite/g3doc/convert/index.md
index 45802fe..f9c6d9f 100644
--- a/tensorflow/lite/g3doc/convert/index.md
+++ b/tensorflow/lite/g3doc/convert/index.md
@@ -1,15 +1,23 @@
 # TensorFlow Lite converter
 
-TensorFlow Lite uses the optimized
-[FlatBuffer](https://google.github.io/flatbuffers/) format to represent graphs.
-Therefore, a TensorFlow model
-([protocol buffer](https://developers.google.com/protocol-buffers/)) needs to be
-converted into a `FlatBuffer` file before deploying to clients.
+The TensorFlow Lite converter is used to convert TensorFlow models into an
+optimized [FlatBuffer](https://google.github.io/flatbuffers/) format, so that
+they can be used by the TensorFlow Lite interpreter.
 
 Note: This page contains documentation on the converter API for TensorFlow 1.x.
 The API for TensorFlow 2.0 is available
 [here](https://www.tensorflow.org/lite/r2/convert/).
 
+## FlatBuffers
+
+FlatBuffers is an efficient open-source cross-platform serialization library. It
+is similar to
+[protocol buffers](https://developers.google.com/protocol-buffers), with the
+distinction that FlatBuffers do not need a parsing/unpacking step to a secondary
+representation before data can be accessed, avoiding per-object memory
+allocation. The code footprint of FlatBuffers is an order of magnitude smaller
+than protocol buffers.
+
 ## From model training to device deployment
 
 The TensorFlow Lite converter generates a TensorFlow Lite
@@ -20,14 +28,13 @@
 
 *   [SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators)
 *   Frozen `GraphDef`: Models generated by
-    [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py).
+    [freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py).
 *   `tf.keras` HDF5 models.
 *   Any model taken from a `tf.Session` (Python API only).
 
-The TensorFlow Lite `FlatBuffer` file is then deployed to a client device
-(generally a mobile or embedded device), and the TensorFlow Lite interpreter
-uses the compressed model for on-device inference. This conversion process is
-shown in the diagram below:
+The TensorFlow Lite `FlatBuffer` file is then deployed to a client device, and
+the TensorFlow Lite interpreter uses the compressed model for on-device
+inference. This conversion process is shown in the diagram below:
 
 ![TFLite converter workflow](../images/convert/workflow.svg)
 
diff --git a/tensorflow/lite/g3doc/guide/android.md b/tensorflow/lite/g3doc/guide/android.md
index 4b2f38a..68f1eb5 100644
--- a/tensorflow/lite/g3doc/guide/android.md
+++ b/tensorflow/lite/g3doc/guide/android.md
@@ -1,146 +1,87 @@
 # Android quickstart
 
-An example Android application using TensorFLow Lite is available
-[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo).
-The demo is a sample camera app that classifies images continuously
-using either a quantized Mobilenet model or a floating point Inception-v3 model.
-To run the demo, a device running Android 5.0 ( API 21) or higher is required.
+To get started with TensorFlow Lite on Android, we recommend exploring the
+following example.
 
-In the demo app, inference is done using the TensorFlow Lite Java API. The demo
-app classifies frames in real-time, displaying the top most probable
-classifications. It also displays the time taken to detect the object.
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">Android
+image classification example</a>
 
-There are three ways to get the demo app to your device:
+For an explanation of the source code, you should also read
+[TensorFlow Lite Android image classification](https://www.tensorflow.org/lite/models/image_classification/android).
 
-* Download the [prebuilt binary APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-* Use Android Studio to build the application.
-* Download the source code for TensorFlow Lite and the demo and build it using
-  bazel.
+This example app uses
+[image classification](https://www.tensorflow.org/lite/models/image_classification/overview)
+to continuously classify whatever it sees from the device's rear-facing camera.
+The application can run either on device or emulator.
 
+Inference is performed using the TensorFlow Lite Java API. The demo app
+classifies frames in real-time, displaying the top most probable
+classifications. It allows the user to choose between a floating point or
+[quantized](https://www.tensorflow.org/lite/performance/post_training_quantization)
+model, select the thread count, and decide whether to run on CPU, GPU, or via
+[NNAPI](https://developer.android.com/ndk/guides/neuralnetworks).
 
-## Download the pre-built binary
+Note: Additional Android applications demonstrating TensorFlow Lite in a variety
+of use cases are available in
+[Examples](https://www.tensorflow.org/lite/examples).
 
-The easiest way to try the demo is to download the
-[pre-built binary APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
+## Build in Android Studio
 
-Once the APK is installed, click the app icon to start the program. The first
-time the app is opened, it asks for runtime permissions to access the device
-camera. The demo app opens the back-camera of the device and recognizes objects
-in the camera's field of view. At the bottom of the image (or at the left
-of the image if the device is in landscape mode), it displays top three objects
-classified and the classification latency.
+To build the example in Android Studio, follow the instructions in
+[README.md](https://github.com/tensorflow/examples/blob/master/lite/examples/image_classification/android/README.md).
 
+## Create your own Android app
 
-## Build in Android Studio with TensorFlow Lite AAR from JCenter
+To get started quickly writing your own Android code, we recommend using our
+[Android image classification example](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android)
+as a starting point.
 
-Use Android Studio to try out changes in the project code and compile the demo
-app:
+The following sections contain some useful information for working with
+TensorFlow Lite on Android.
 
-* Install the latest version of
-  [Android Studio](https://developer.android.com/studio/index.html).
-* Make sure the Android SDK version is greater than 26 and NDK version is greater
-  than 14 (in the Android Studio settings).
-* Import the `tensorflow/lite/java/demo` directory as a new
-  Android Studio project.
-* Install all the Gradle extensions it requests.
+### Use the TensorFlow Lite AAR from JCenter
 
-Now you can build and run the demo app. 
+To use TensorFlow Lite in your Android app, we recommend using the
+[TensorFlow Lite AAR hosted at JCenter](https://bintray.com/google/tensorflow/tensorflow-lite).
 
-The build process downloads the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip), and unzips it into the assets directory: `tensorflow/lite/java/demo/app/src/main/assets/`.
+You can specify this in your `build.gradle` dependencies as follows:
 
-Some additional details are available on the
-[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/README.md).
-
-### Using other models
-
-To use a different model:
-* Download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip).
-* Unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets directory. 
-* Change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
-  from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
-  to: `classifier = new ImageClassifierFloatInception(getActivity());`.
-
-
-## Build TensorFlow Lite and the demo app from source
-
-### Clone the TensorFlow repo
-
-```sh
-git clone https://github.com/tensorflow/tensorflow
+```build
+dependencies {
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+}
 ```
 
-### Install Bazel
+This AAR includes binaries for all of the
+[Android ABIs](https://developer.android.com/ndk/guides/abis). You can reduce
+the size of your application's binary by only including the ABIs you need to
+support.
 
-If `bazel` is not installed on your system, see
-[Installing Bazel](https://bazel.build/versions/master/docs/install.html).
+We recommend most developers omit the `x86` and `x86_64` ABIs. This can be
+achieved with the following Gradle configuration, which specifically includes
+only `armeabi-v7a` (32-bit ARM) and `arm64-v8a` (64-bit ARM), which should cover
+most modern Android devices.
 
-Note: Bazel does not currently support Android builds on Windows. Windows users
-should download the
-[prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-
-### Install Android NDK and SDK
-
-The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The
-current recommended version is *14b* and can be found on the
-[NDK Archives](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads)
-page.
-
-The Android SDK and build tools can be
-[downloaded separately](https://developer.android.com/tools/revisions/build-tools.html)
-or used as part of
-[Android Studio](https://developer.android.com/studio/index.html). To build the
-TensorFlow Lite Android demo, build tools require API >= 23 (but it will run on
-devices with API >= 21).
-
-In the root of the TensorFlow repository, update the `WORKSPACE` file with the
-`api_level` and location of the SDK and NDK. If you installed it with
-Android Studio, the SDK path can be found in the SDK manager. The default NDK
-path is:`{SDK path}/ndk-bundle.` For example:
-
-```
-android_sdk_repository (
-    name = "androidsdk",
-    api_level = 23,
-    build_tools_version = "23.0.2",
-    path = "/home/xxxx/android-sdk-linux/",
-)
-
-android_ndk_repository(
-    name = "androidndk",
-    path = "/home/xxxx/android-ndk-r10e/",
-    api_level = 19,
-)
+```build
+android {
+    defaultConfig {
+        ndk {
+            abiFilters 'armeabi-v7a', 'arm64-v8a'
+        }
+    }
+}
 ```
 
-Some additional details are available on the
-[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/README.md).
+To learn more about `abiFilters`, see
+[`NdkOptions`](https://google.github.io/android-gradle-dsl/current/com.android.build.gradle.internal.dsl.NdkOptions.html)
+in the Android Gradle documentation.
 
-### Build the source code
+### Build TensorFlow Lite locally
 
-To build the demo app, run `bazel`:
+In some cases, you might wish to use a local build of TensorFlow Lite. For
+example, you may be building a custom binary that includes
+[operations selected from TensorFlow](https://www.tensorflow.org/lite/guide/ops_select).
 
-```
-bazel build --cxxopt=--std=c++11 //tensorflow/lite/java/demo/app/src/main:TfLiteCameraDemo
-```
-
-Caution: Because of an bazel bug, we only support building the Android demo app
-within a Python 2 environment.
-
-
-## About the demo
-
-The demo app is resizing each camera image frame (224 width * 224 height) to
-match the quantized MobileNets model (299 * 299 for Inception-v3). The resized
-image is converted—row by row—into a
-[ByteBuffer](https://developer.android.com/reference/java/nio/ByteBuffer.html).
-Its size is  1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch.
-224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents
-the 3 colors of a pixel.
-
-This demo uses the TensorFlow Lite Java inference API
-for models which take a single input and provide a single output. This outputs a
-two-dimensional array, with the first dimension being the category index and the
-second dimension being the confidence of classification. Both models have 1001
-unique categories and the app sorts the probabilities of all the categories and
-displays the top three. The model file must be downloaded and bundled within the
-assets directory of the app.
+In this case, follow the
+[custom AAR build instructions](https://www.tensorflow.org/lite/guide/ops_select#android_aar)
+to create your own AAR and include it in your app.
diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md
index cb0cabc..1a438ab 100644
--- a/tensorflow/lite/g3doc/guide/build_rpi.md
+++ b/tensorflow/lite/g3doc/guide/build_rpi.md
@@ -34,7 +34,7 @@
 ```
 
 This should compile a static library in:
-`tensorflow/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
+`tensorflow/lite/tools/make/gen/rpi_armv7l/lib/libtensorflow-lite.a`.
 
 ## Native compiling
 This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1).
diff --git a/tensorflow/lite/g3doc/guide/get_started.md b/tensorflow/lite/g3doc/guide/get_started.md
index 2e42c95..e20dc08 100644
--- a/tensorflow/lite/g3doc/guide/get_started.md
+++ b/tensorflow/lite/g3doc/guide/get_started.md
@@ -1,270 +1,286 @@
 # Get started with TensorFlow Lite
 
-Using a TensorFlow Lite model in your mobile app requires multiple
-considerations: you must choose a pre-trained or custom model, convert the model
-to a TensorFLow Lite format, and finally, integrate the model in your app.
+TensorFlow Lite provides all the tools you need to convert and run TensorFlow
+models on mobile, embedded, and IoT devices. The following guide walks through
+each step of the developer workflow and provides links to further instructions.
 
 ## 1. Choose a model
 
-Depending on the use case, you can choose one of the popular open-sourced models,
-such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
-data set or even build your own custom model.
+<a id="1_choose_a_model"></a>
+
+TensorFlow Lite allows you to run TensorFlow models on a wide range of devices.
+A TensorFlow model is a data structure that contains the logic and knowledge of
+a machine learning network trained to solve a particular problem.
+
+There are many ways to obtain a TensorFlow model, from using pre-trained models
+to training your own. To use a model with TensorFlow Lite it must be converted
+into a special format. This is explained in section 2,
+[Convert the model](#2_convert_the_model_format).
+
+Note: Not all TensorFlow models will work with TensorFlow Lite, since the
+interpreter supports a limited subset of TensorFlow operations. See section 2,
+[Convert the model](#2_convert_the_model_format) to learn about compatibility.
 
 ### Use a pre-trained model
 
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
-is a family of mobile-first computer vision models for TensorFlow designed to
-effectively maximize accuracy, while taking into consideration the restricted
-resources for on-device or embedded applications. MobileNets are small,
-low-latency, low-power models parameterized to meet the resource constraints for
-a variety of uses. They can be used for classification, detection, embeddings, and
-segmentation—similar to other popular large scale models, such as
-[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
-[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
-for MobileNets that can be used in mobile projects of all sizes.
+The TensorFlow Lite team provides a set of pre-trained models that solve a
+variety of machine learning problems. These models have been converted to work
+with TensorFlow Lite and are ready to use in your applications.
 
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
-that achieves fairly high accuracy recognizing general objects with 1000 classes,
-for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
-features from input images using a convolutional neural network and classifies
-them based on those features with fully-connected and softmax layers.
+The pre-trained models include:
 
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-is an on-device model that provides one-touch replies for incoming text messages
-by suggesting contextually relevant messages. The model is built specifically for
-memory constrained devices, such as watches and phones, and has been successfully
-used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+*   [Image classification](../models/image_classification/overview.md)
+*   [Object detection](../models/object_detection/overview.md)
+*   [Smart reply](../models/smart_reply/overview.md)
+*   [Pose estimation](../models/pose_estimation/overview.md)
+*   [Segmentation](../models/segmentation/overview.md)
 
-These pre-trained models are [available for download](hosted_models.md).
+See our full list of pre-trained models in [Models](../models).
 
-### Re-train Inception-V3 or MobileNet for a custom data set
+#### Models from other sources
 
-These pre-trained models were trained on the *ImageNet* data set which contains
-1000 predefined classes. If these classes are not sufficient for your use case,
-the model will need to be re-trained. This technique is called
-*transfer learning* and starts with a model that has been already trained on a
-problem, then retrains the model on a similar problem. Deep learning from
-scratch can take days, but transfer learning is fairly quick. In order to do
-this, you need to generate a custom data set labeled with the relevant classes.
+There are many other places you can obtain pre-trained TensorFlow models,
+including [TensorFlow Hub](https://www.tensorflow.org/hub). In most cases, these
+models will not be provided in the TensorFlow Lite format, and you'll have to
+[convert](#2_convert_the_model_format) them before use.
 
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
-codelab walks through the re-training process step-by-step. The code supports
-both floating point and quantized inference.
+### Re-train a model (transfer learning)
+
+Transfer learning allows you to take a trained model and re-train it to perform
+another task. For example, an
+[image classification](../models/image_classification/overview.md) model could
+be retrained to recognize new categories of image. Re-training takes less time
+and requires less data than training a model from scratch.
+
+You can use transfer learning to customize pre-trained models to your
+application. Learn how to perform transfer learning in the
+<a href="https://codelabs.developers.google.com/codelabs/recognize-flowers-with-tensorflow-on-android">Recognize
+flowers with TensorFlow</a> codelab.
 
 ### Train a custom model
 
-A developer may choose to train a custom model using Tensorflow (see the
-[TensorFlow tutorials](https://www.tensorflow.org/tutorials/) for examples of building and training
-models). If you have already written a model, the first step is to export this
-to a `tf.GraphDef` file. This is required because some formats do not store the
-model structure outside the code, and we must communicate with other parts of
-the framework. See
-[Exporting the Inference Graph](https://www.tensorflow.org/tutorials/keras/save_and_restore_models#save_the_entire_model)
-to create file for the custom model.
+If you have designed and trained your own TensorFlow model, or you have trained
+a model obtained from another source, you should convert it to the TensorFlow
+Lite format before use.
 
-TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to
-the [TensorFlow Lite & TensorFlow Compatibility Guide](ops_compatibility.md)
-for supported operators and their usage. This set of operators will continue to
-grow in future Tensorflow Lite releases.
+## 2. Convert the model
 
-## 2. Convert the model format
+<a id="2_convert_the_model_format"></a>
 
-The [TensorFlow Lite Converter](../convert/index.md) accepts the following file
-formats:
+TensorFlow Lite is designed to execute models efficiently on devices. Some of
+this efficiency comes from the use of a special format for storing models.
+TensorFlow models must be converted into this format before they can be used by
+TensorFlow Lite.
 
-*   `SavedModel` — A `GraphDef` and checkpoint with a signature that labels
-    input and output arguments to a model. See the documentation for converting
-    SavedModels using [Python](../convert/python_api.md#basic_savedmodel) or using
-    the [command line](../convert/cmdline_examples.md#savedmodel).
-*   `tf.keras` - A HDF5 file containing a model with weights and input and
-    output arguments generated by `tf.Keras`. See the documentation for
-    converting HDF5 models using
-    [Python](../convert/python_api.md#basic_keras_file) or using the
-    [command line](../convert/cmdline_examples.md#keras).
-*   `frozen tf.GraphDef` — A subclass of `tf.GraphDef` that does not contain
-    variables. A `GraphDef` can be converted to a `frozen GraphDef` by taking a
-    checkpoint and a `GraphDef`, and converting each variable into a constant
-    using the value retrieved from the checkpoint. Instructions on converting a
-    `tf.GraphDef` to a TensorFlow Lite model are described in the next
-    subsection.
+Converting models reduces their file size and introduces optimizations that do
+not affect accuracy. Developers can opt to further reduce file size and increase
+speed of execution in exchange for some trade-offs. You can use the TensorFlow
+Lite converter to choose which optimizations to apply.
 
-### Converting a tf.GraphDef
+TensorFlow Lite supports a limited subset of TensorFlow operations, so not all
+models can be converted. See [Ops compatibility](#ops_compatibility) for more
+information.
 
-TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
-to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
-frozen. This process involves several file formats including the `frozen
-GraphDef`:
+### TensorFlow Lite converter
 
-*   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
-    training or computation graph. It contains operators, tensors, and variables
-    definitions.
-*   *checkpoint* (.ckpt) — Serialized variables from a TensorFlow graph. Since
-    this does not contain a graph structure, it cannot be interpreted by itself.
-*   *TensorFlow Lite model* (.tflite) — A serialized
-    [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
-    Lite operators and tensors for the TensorFlow Lite interpreter.
+The [TensorFlow Lite converter](../convert) is a tool that converts trained
+TensorFlow models into the TensorFlow Lite format. It can also introduce
+optimizations, which are covered in section 4,
+[Optimize your model](#4_optimize_your_model_optional).
 
-You must have checkpoints that contain trained weights. The `tf.GraphDef` file
-only contains the structure of the graph. The process of merging the checkpoint
-values with the graph structure is called *freezing the graph*.
+The converter is available as a Python API. The following example shows a
+TensorFlow `SavedModel` being converted into the TensorFlow Lite format:
 
-`tf.GraphDef` and checkpoint files for MobileNet models are available
-[here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md).
+```python
+import tensorflow as tf
 
-To freeze the graph, use the following command (changing the arguments):
-
-```
-freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
-  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-  --input_binary=true \
-  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-  --output_node_names=MobileNetV1/Predictions/Reshape_1
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
 ```
 
-Set the `input_binary` flag to `True` when reading a binary protobuf, a `.pb`
-file. Set to `False` for a `.pbtxt` file.
+You can [convert TensorFlow 2.0 models](../r2/convert) in a similar way.
 
-Set `input_graph` and `input_checkpoint` to the respective filenames. The
-`output_node_names` may not be obvious outside of the code that built the model.
-The easiest way to find them is to visualize the graph, either with
-[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) or
-`graphviz`.
+The converter can also be used from the
+[command line](../convert/cmdline_examples), but the Python API is recommended.
 
-The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
-(.tflite) for use on Android or iOS devices. For Android, the TensorFlow Lite
-Converter tool supports both float and quantized models. To convert the frozen
-`GraphDef` to the .tflite format use a command similar to the following:
+### Options
 
-```
-tflite_convert \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
-  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1
-```
+The converter can convert from a variety of input types.
 
-The
-[frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
-file used here is available for download. Setting the `input_array` and
-`output_array` arguments is not straightforward. The easiest way to find these
-values is to explore the graph using
-[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard). Reuse
-the arguments for specifying the output nodes for inference in the
-`freeze_graph` step.
+When [converting TensorFlow 1.x models](../convert/python_api), these are:
 
-### Full converter reference
+*   [SavedModel directories](https://www.tensorflow.org/alpha/guide/saved_model)
+*   Frozen GraphDef (models generated by
+    [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
+*   [Keras](https://keras.io) HDF5 models
+*   Models taken from a `tf.Session`
 
-The [TensorFlow Lite Converter](../convert/index.md) can be
-[Python](../convert/python_api.md) or from the
-[command line](../convert/cmdline_examples.md). This allows you to integrate the
-conversion step into the model design workflow, ensuring the model is easy to
-convert to a mobile inference graph.
+When [converting TensorFlow 2.x models](../r2/convert/python_api), these are:
+
+*   [SavedModel directories](https://www.tensorflow.org/alpha/guide/saved_model)
+*   [`tf.keras` models](https://www.tensorflow.org/alpha/guide/keras/overview)
+*   [Concrete functions](../r2/convert/concrete_function.md)
+
+The converter can be configured to apply various optimizations that can improve
+performance or reduce file size. This is covered in section 4,
+[Optimize your model](#4_optimize_your_model_optional).
 
 ### Ops compatibility
 
-Refer to the [ops compatibility guide](ops_compatibility.md) for
-troubleshooting help, and if that doesn't help, please
-[file an issue](https://github.com/tensorflow/tensorflow/issues).
+TensorFlow Lite currently supports a [limited subset](ops_compatibility.md) of
+TensorFlow operations. The long-term goal is for all TensorFlow operations to be
+supported.
 
-### Graph Visualization tool
+If the model you wish to convert contains unsupported operations, you can use
+[TensorFlow Select](ops_select.md) to include operations from TensorFlow. This
+will result in a larger binary being deployed to devices.
 
-The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
-to visualize TensorFlow Lite models after conversion. To build the
-[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py)
-tool:
+## 3. Run inference with the model
 
-```sh
-bazel run tensorflow/lite/tools:visualize -- model.tflite model_viz.html
+<a id="3_use_the_tensorflow_lite_model_for_inference_in_a_mobile_app"></a>
+
+*Inference* is the process of running data through a model to obtain
+predictions. It requires a model, an interpreter, and input data.
+
+### TensorFlow Lite interpreter
+
+The [TensorFlow Lite interpreter](inference.md) is a library that takes a model
+file, executes the operations it defines on input data, and provides access to
+the output.
+
+The interpreter works across multiple platforms and provides a simple API for
+running TensorFlow Lite models from Java, Swift, Objective-C, C++, and Python.
+
+The following code shows the interpreter being invoked from Java:
+
+```java
+try (Interpreter interpreter = new Interpreter(tensorflow_lite_model_file)) {
+  interpreter.run(input, output);
+}
 ```
 
-This generates an interactive HTML page listing subgraphs, operations, and a
-graph visualization.
+### GPU acceleration and Delegates
 
-## 3. Use the TensorFlow Lite model for inference in a mobile app
+Some devices provide hardware acceleration for machine learning operations. For
+example, most mobile phones have GPUs, which can perform floating point matrix
+operations faster than a CPU.
 
-After completing the prior steps, you should now have a `.tflite` model file.
+The speed-up can be substantial. For example, a MobileNet v1 image
+classification model runs 5.5x faster on a Pixel 3 phone when GPU acceleration
+is used.
 
-### Android
+The TensorFlow Lite interpreter can be configured with
+[Delegates](../performance/delegates.md) to make use of hardware acceleration on
+different devices. The [GPU Delegate](../performance/gpu.md) allows the
+interpreter to run appropriate operations on the device's GPU.
 
-Since Android apps are written in Java and the core TensorFlow library is in C++,
-a JNI library is provided as an interface. This is only meant for inference—it
-provides the ability to load a graph, set up inputs, and run the model to
-calculate outputs.
+The following code shows the GPU Delegate being used from Java:
 
-The open source Android demo app uses the JNI interface and is available
-[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/app).
-You can also download a
-[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-See the <a href="./android.md">Android demo</a> guide for details.
+```java
+GpuDelegate delegate = new GpuDelegate();
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+try (Interpreter interpreter =
+    new Interpreter(tensorflow_lite_model_file, options)) {
+  interpreter.run(input, output);
+}
+```
 
-The <a href="./android.md">Android mobile</a> guide has instructions for
-installing TensorFlow on Android and setting up `bazel` and Android Studio.
+To add support for new hardware accelerators you can
+[define your own delegate](../performance/delegates.md#how_to_add_a_delegate).
 
-### iOS
+### Android and iOS
 
-To integrate a TensorFlow model in an iOS app, see the
-[TensorFlow Lite for iOS](ios.md) guide and <a href="./ios.md">iOS demo</a>
-guide.
+The TensorFlow Lite interpreter is easy to use from both major mobile platforms.
+To get started, explore the [Android quickstart](android.md) and
+[iOS quickstart](ios.md) guides.
+[Example applications](https://www.tensorflow.org/lite/examples) are available
+for both platforms.
 
-#### Core ML support
+To obtain the required libraries, Android developers should use the
+[TensorFlow Lite AAR](android.md#use_the_tensorflow_lite_aar_from_jcenter). iOS
+developers should use the
+[CocoaPods for Swift or Objective-C](ios.md#add_tensorflow_lite_to_your_swift_or_objective-c_project).
 
-Core ML is a machine learning framework used in Apple products. In addition to
-using Tensorflow Lite models directly in your applications, you can convert
-trained Tensorflow models to the
-[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
-devices. To use the converter, refer to the
-[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+### Linux
 
-### ARM32 and ARM64 Linux
+Embedded Linux is an important platform for deploying machine learning. We
+provide build instructions for both [Raspberry Pi](build_rpi.md) and
+[Arm64-based boards](build_arm64.md) such as Odroid C2, Pine64, and NanoPi.
 
-Compile Tensorflow Lite for a Raspberry Pi by following the
-[RPi build instructions](build_rpi.md) Compile Tensorflow Lite for a generic aarch64
-board such as Odroid C2, Pine64, NanoPi, and others by following the
-[ARM64 Linux build instructions](build_arm64.md) This compiles a static
-library file (`.a`) used to build your app. There are plans for Python bindings
-and a demo app.
+### Microcontrollers
 
-## 4. Optimize your model (optional)
+[TensorFlow Lite for Microcontrollers](../microcontrollers/overview.md) is an
+experimental port of TensorFlow Lite aimed at microcontrollers and other devices
+with only kilobytes of memory.
 
-There are two options. If you plan to run on CPU, we recommend that you quantize
-your weights and activation tensors. If the hardware is available, another
-option is to run on GPU for massively parallelizable workloads.
+### Operations
+
+If your model requires TensorFlow operations that are not yet implemented in
+TensorFlow Lite, you can use [TensorFlow Select](ops_select.md) to use them in
+your model. You'll need to build a custom version of the interpreter that
+includes the TensorFlow operations.
+
+You can use [Custom operators](ops_custom.md) to write your own operations, or
+port new operations into TensorFlow Lite.
+
+[Operator versions](ops_version.md) allow you to add new functionality and
+parameters to existing operations.
+
+## 4. Optimize your model
+
+<a id="4_optimize_your_model_optional"></a>
+
+TensorFlow Lite provides tools to optimize the size and performance of your
+models, often with minimal impact on accuracy. Optimized models may require
+slightly more complex training, conversion, or integration.
+
+Machine learning optimization is an evolving field, and TensorFlow Lite's
+[Model Optimization Toolkit](#model_optimization_toolkit) is continually growing
+as new techniques are developed.
+
+### Performance
+
+The goal of model optimization is to reach the ideal balance of performance,
+model size, and accuracy on a given device.
+[Performance best practices](../performance/best_practices.md) can help guide
+you through this process.
 
 ### Quantization
-Compress your model size by lowering the precision of the parameters (i.e.
-neural network weights) from their training-time 32-bit floating-point
-representations into much smaller and efficient 8-bit integer ones.
 
-This will execute the heaviest computations fast in lower precision, but the
-most sensitive ones with higher precision, thus typically resulting in little to
-no final accuracy losses for the task, yet a significant speed-up over pure
-floating-point execution.
+By reducing the precision of values and operations within a model, quantization
+can reduce both the size of a model and the time required for inference. For
+many models, there is only a minimal loss of accuracy.
 
-The post-training quantization technique is integrated into the TensorFlow Lite
-conversion tool. Getting started is easy: after building your TensorFlow model,
-simply enable the ‘post_training_quantize’ flag in the TensorFlow Lite
-conversion tool. Assuming that the saved model is stored in saved_model_dir, the
-quantized tflite flatbuffer can be generated in command line:
+The TensorFlow Lite converter makes it easy to quantize TensorFlow models. The
+following Python code quantizes a `SavedModel` and saves it to disk:
 
-```
+```python
+import tensorflow as tf
+
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
 tflite_quant_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_quant_model)
 ```
 
-Read the full documentation [here](../performance/post_training_quantization.md)
-and see a tutorial
-[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
+To learn more about quantization, see
+[Post-training quantization](../performance/post_training_quantization.md).
 
-### GPU
-Run on GPU GPUs are designed to have high throughput for massively
-parallelizable workloads. Thus, they are well-suited for deep neural nets, which
-consist of a huge number of operators, each working on some input tensor(s) that
-can be easily divided into smaller workloads and carried out in parallel,
-typically resulting in lower latency.
+### Model Optimization Toolkit
 
-Another benefit with GPU inference is its power efficiency. GPUs carry out the
-computations in a very efficient and optimized manner, so that they consume less
-power and generate less heat than when the same task is run on CPUs.
+The [Model Optimization Toolkit](../performance/model_optimization.md) is a set
+of tools and techniques designed to make it easy for developers to optimize
+their models. Many of the techniques can be applied to all TensorFlow models and
+are not specific to TensorFlow Lite, but they are especially valuable when
+running inference on devices with limited resources.
 
-Read the tutorial [here](../performance/gpu.md) and full documentation [here](../performance/gpu_advanced.md).
+## Next steps
+
+Now that you're familiar with TensorFlow Lite, explore some of the following
+resources:
+
+*   If you're a mobile developer, visit [Android quickstart](android.md) or
+    [iOS quickstart](ios.md).
+*   Explore our [pre-trained models](../models).
+*   Try our [example apps](https://www.tensorflow.org/lite/examples).
diff --git a/tensorflow/lite/g3doc/guide/hosted_models.md b/tensorflow/lite/g3doc/guide/hosted_models.md
index 69f1967..323d31b 100644
--- a/tensorflow/lite/g3doc/guide/hosted_models.md
+++ b/tensorflow/lite/g3doc/guide/hosted_models.md
@@ -39,7 +39,7 @@
 Mobilenet_V1_1.0_160_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz)  | 4.3 Mb     | 66.9%          | 86.7%          | 37.4 ms
 Mobilenet_V1_1.0_192_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz)  | 4.3 Mb     | 69.1%          | 88.1%          | 51.9 ms
 Mobilenet_V1_1.0_224_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)  | 4.3 Mb     | 70.0%          | 89.0%          | 70.2 ms
-Mobilenet_V2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
+Mobilenet_V2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 53.4 ms
 Inception_V1_quant          | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz)                          | 6.4 Mb     | 70.1%          | 89.8%          | 154.5 ms
 Inception_V2_quant          | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz)                         | 11 Mb      | 73.5%          | 91.4%          | 235.0 ms
 Inception_V3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
diff --git a/tensorflow/lite/g3doc/guide/index.md b/tensorflow/lite/g3doc/guide/index.md
index 288f7a0..2475c7e 100644
--- a/tensorflow/lite/g3doc/guide/index.md
+++ b/tensorflow/lite/g3doc/guide/index.md
@@ -1,202 +1,121 @@
-
 # TensorFlow Lite guide
 
-TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
-devices. It enables on-device machine learning inference with low latency and a
-small binary size. TensorFlow Lite also supports hardware acceleration with the
-[Android Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+TensorFlow Lite is a set of tools to help developers run TensorFlow models on
+mobile, embedded, and IoT devices. It enables on-device machine learning
+inference with low latency and a small binary size.
 
-TensorFlow Lite uses many techniques for achieving low latency such as
-optimizing the kernels for mobile apps, pre-fused activations, and quantized
-kernels that allow smaller and faster (fixed-point math) models.
+TensorFlow Lite consists of two main components:
 
-Most of our TensorFlow Lite documentation is [on
-GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite)
-for the time being.
+-   The [TensorFlow Lite interpreter](inference.md), which runs specially
+    optimized models on many different hardware types, including mobile phones,
+    embedded Linux devices, and microcontrollers.
+-   The [TensorFlow Lite converter](../convert/index.md), which converts
+    TensorFlow models into an efficient form for use by the interpreter, and can
+    introduce optimizations to improve binary size and performance.
 
-## What does TensorFlow Lite contain?
+### Machine learning at the edge
 
-TensorFlow Lite supports a set of core operators, both quantized and
-float, which have been tuned for mobile platforms. They incorporate pre-fused
-activations and biases to further enhance performance and quantized
-accuracy. Additionally, TensorFlow Lite also supports using custom operations in
-models.
+TensorFlow Lite is designed to make it easy to perform machine learning on
+devices, "at the edge" of the network, instead of sending data back and forth
+from a server. For developers, performing machine learning on-device can help
+improve:
 
-TensorFlow Lite defines a new model file format, based on
-[FlatBuffers](https://google.github.io/flatbuffers/). FlatBuffers is an
-efficient open-source cross-platform serialization library. It is similar to
-[protocol buffers](https://developers.google.com/protocol-buffers/?hl=en), but
-the primary difference is that FlatBuffers does not need a parsing/unpacking
-step to a secondary representation before you can access data, often coupled
-with per-object memory allocation. Also, the code footprint of FlatBuffers is an
-order of magnitude smaller than protocol buffers.
+*   *Latency:* there's no round-trip to a server
+*   *Privacy:* no data needs to leave the device
+*   *Connectivity:* an Internet connection isn't required
+*   *Power consumption:* network connections are power hungry
 
-TensorFlow Lite has a new mobile-optimized interpreter, which has the key goals
-of keeping apps lean and fast. The interpreter uses a static graph ordering and
-a custom (less-dynamic) memory allocator to ensure minimal load, initialization,
-and execution latency.
+TensorFlow Lite works with a huge range of devices, from tiny microcontrollers
+to powerful mobile phones.
 
-TensorFlow Lite provides an interface to leverage hardware acceleration, if
-available on the device. It does so via the
-[Android Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/index.html),
-available on Android 8.1 (API level 27) and higher.
+Key Point: The TensorFlow Lite binary is smaller than 300KB when all supported
+operators are linked, and less than 200KB when using only the operators needed
+for supporting the common image classification models InceptionV3 and MobileNet.
 
-## Why do we need a new mobile-specific library?
+## Get started
 
-Machine Learning is changing the computing paradigm, and we see an emerging
-trend of new use cases on mobile and embedded devices. Consumer expectations are
-also trending toward natural, human-like interactions with their devices, driven
-by the camera and voice interaction models.
+To begin working with TensorFlow Lite, visit [Get started](get_started.md).
 
-There are several factors which are fueling interest in this domain:
+## Key features
 
-- Innovation at the silicon layer is enabling new possibilities for hardware
-  acceleration, and frameworks such as the Android Neural Networks API make it
-  easy to leverage these.
+*   *[Interpreter](inference.md) tuned for on-device ML*, supporting a set of
+    core operators that are optimized for on-device applications, and with a
+    small binary size.
+*   *Diverse platform support*, covering [Android](android.md) and [iOS](ios.md)
+    devices, embedded Linux, and microcontrollers, making use of platform APIs
+    for accelerated inference.
+*   *APIs for multiple languages* including Java, Swift, Objective-C, C++, and
+    Python.
+*   *High performance*, with [hardware acceleration](../performance/gpu.md) on
+    supported devices, device-optimized kernels, and
+    [pre-fused activations and biases](ops_compatibility.md).
+*   *Model optimization tools*, including
+    [quantization](../performance/post_training_quantization.md), that can
+    reduce size and increase performance of models with little loss of accuracy.
+*   *Efficient model format*, using a [FlatBuffer](../convert/index.md) that is
+    optimized for small size and portability.
+*   *[Pre-trained models](../models)* for common machine learning tasks that can
+    be customized to your application.
+*   *[Samples and tutorials](https://www.tensorflow.org/examples)* that show you
+    how to deploy machine learning models on supported platforms.
 
-- Recent advances in real-time computer-vision and spoken language understanding
-  have led to mobile-optimized benchmark models being open sourced
-  (e.g. MobileNets, SqueezeNet).
+## Development workflow
 
-- Widely-available smart appliances create new possibilities for
-  on-device intelligence.
+The workflow for using TensorFlow Lite involves the following steps:
 
-- Interest in stronger user data privacy paradigms where user data does not need
-  to leave the mobile device.
+1.  **Pick a model**
 
-- Ability to serve ‘offline’ use cases, where the device does not need to be
-  connected to a network.
+    Bring your own TensorFlow model, find a model online, or pick a model from
+    our [Pre-trained models](../models) to drop in or retrain.
 
-We believe the next wave of machine learning applications will have significant
-processing on mobile and embedded devices.
+1.  **Convert the model**
 
-## TensorFlow Lite highlights
+    If you're using a custom model, use the
+    [TensorFlow Lite converter](../convert/index.md) and a few lines of Python
+    to convert it to the TensorFlow Lite format.
 
-TensorFlow Lite provides:
+1.  **Deploy to your device**
 
-- A set of core operators, both quantized and float, many of which have been
-  tuned for mobile platforms.  These can be used to create and run custom
-  models.  Developers can also write their own custom operators and use them in
-  models.
+    Run your model on-device with the
+    [TensorFlow Lite interpreter](inference.md), with APIs in many languages.
 
-- A new [FlatBuffers](https://google.github.io/flatbuffers/)-based
-  model file format.
+1.  **Optimize your model**
 
-- On-device interpreter with kernels optimized for faster execution on mobile.
+    Use our [Model Optimization Toolkit](../performance/model_optimization.md)
+    to reduce your model's size and increase its efficiency with minimal impact
+    on accuracy.
 
-- TensorFlow converter to convert TensorFlow-trained models to the TensorFlow
-  Lite format.
+To learn more about using TensorFlow Lite in your project, see
+[Get started](get_started.md).
 
-- Smaller in size: TensorFlow Lite is smaller than 300KB when all supported
-  operators are linked and less than 200KB when using only the operators needed
-  for supporting InceptionV3 and Mobilenet.
+## Technical constraints
 
-- **Pre-tested models:**
+TensorFlow Lite plans to provide high performance on-device inference for any
+TensorFlow model. However, the TensorFlow Lite interpreter currently supports a
+limited subset of TensorFlow operators that have been optimized for on-device
+use. This means that some models require additional steps to work with
+TensorFlow Lite.
 
-    All of the following models are guaranteed to work out of the box:
+To learn which operators are available, see
+[Operator compatibility](ops_compatibility.md).
 
-    - Inception V3, a popular model for detecting the dominant objects
-      present in an image.
+If your model uses operators that are not yet supported by the TensorFlow Lite
+interpreter, you can use [TensorFlow Select](ops_select.md) to include
+TensorFlow operations in your TensorFlow Lite build. However, this will lead to
+an increased binary size.
 
-    - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md),
-      a family of mobile-first computer vision models designed to effectively
-      maximize accuracy while being mindful of the restricted resources for an
-      on-device or embedded application. They are small, low-latency, low-power
-      models parameterized to meet the resource constraints of a variety of use
-      cases. They can be built upon for classification, detection, embeddings
-      and segmentation. MobileNet models are smaller but [lower in
-      accuracy](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
-      than Inception V3.
+TensorFlow Lite does not currently support on-device training, but it is in our
+[Roadmap](roadmap.md), along with other planned improvements.
 
-    - On Device Smart Reply, an on-device model which provides one-touch
-      replies for an incoming text message by suggesting contextually relevant
-      messages. The model was built specifically for memory constrained devices
-      such as watches & phones and it has been successfully used to surface
-      [Smart Replies on Android
-      Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-      to all first-party and third-party apps.
+## Next steps
 
-    Also see the complete list of
-    [TensorFlow Lite's supported models](hosted_models.md),
-    including the model sizes, performance numbers, and downloadable model files.
+Want to keep learning about TensorFlow Lite? Here are some next steps:
 
-- Quantized versions of the MobileNet model, which runs faster than the
-  non-quantized (float) version on CPU.
-
-- New Android demo app to illustrate the use of TensorFlow Lite with a quantized
-  MobileNet model for object classification.
-
-- Java and C++ API support
-
-
-## Getting Started
-
-We recommend you try out TensorFlow Lite with the pre-tested models indicated
-above. If you have an existing model, you will need to test whether your model
-is compatible with both the converter and the supported operator set.  To test
-your model, see the
-[documentation on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-
-The pre-trained models mentioned above have been trained on the ImageNet data
-set, which consists of 1000 predefined classes. If those classes are not
-relevant or useful for your use case, you will need to retrain those
-models. This technique is called transfer learning, which starts with a model
-that has been already trained on a problem and will then be retrained on a
-similar problem. Deep learning from scratch can take days, but transfer learning
-can be done fairly quickly. In order to do this, you'll need to generate your
-custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
-codelab walks through this process step-by-step. The retraining code supports
-retraining for both floating point and quantized inference.
-
-## TensorFlow Lite Architecture
-
-The following diagram shows the architectural design of TensorFlow Lite:
-
-<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
-     alt="TensorFlow Lite architecture diagram"
-     style="max-width:600px;">
-
-Starting with a trained TensorFlow model on disk, you'll convert that model to
-the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
-Converter. Then you can use that converted file in your mobile application.
-
-Deploying the TensorFlow Lite model file uses:
-
-- Java API: A convenience wrapper around the C++ API on Android.
-
-- C++ API: Loads the TensorFlow Lite Model File and invokes the Interpreter. The
-  same library is available on both Android and iOS.
-
-- Interpreter: Executes the model using a set of kernels. The interpreter
-  supports selective kernel loading; without kernels it is only 100KB, and 300KB
-  with all the kernels loaded. This is a significant reduction from the 1.5M
-  required by TensorFlow Mobile.
-
-- On select Android devices, the Interpreter will use the Android Neural
-  Networks API for hardware acceleration, or default to CPU execution if none
-  are available.
-
-You can also implement custom kernels using the C++ API that can be used by the
-Interpreter.
-
-## Future Work
-
-In future releases, TensorFlow Lite will support more models and built-in
-operators, contain performance improvements for both fixed point and floating
-point models, improvements to the tools to enable easier developer workflows and
-support for other smaller devices and more. As we continue development, we hope
-that TensorFlow Lite will greatly simplify the developer experience of targeting
-a model for small devices.
-
-Future plans include using specialized machine learning hardware to get the best
-possible performance for a particular model on a particular device.
-
-## Next Steps
-
-The TensorFlow Lite [GitHub repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite).
-contains additional docs, code samples, and demo applications.
+*   Visit [Get started](get_started.md) to walk through the process of using
+    TensorFlow Lite.
+*   If you're a mobile developer, visit [Android quickstart](android.md) or
+    [iOS quickstart](ios.md).
+*   Learn about
+    [TensorFlow Lite for Microcontrollers](../microcontrollers/overview.md).
+*   Explore our [pre-trained models](../models).
+*   Try our [example apps](https://www.tensorflow.org/lite/examples).
diff --git a/tensorflow/lite/g3doc/guide/inference.md b/tensorflow/lite/g3doc/guide/inference.md
index b0107ec..353a656 100644
--- a/tensorflow/lite/g3doc/guide/inference.md
+++ b/tensorflow/lite/g3doc/guide/inference.md
@@ -1,16 +1,15 @@
 # TensorFlow Lite inference
 
-[TOC]
+The term *inference* refers to the process of executing a TensorFlow Lite model
+on-device in order to make predictions based on input data. Inference is the
+final step in using the model on-device.
 
-## Overview
+Inference for TensorFlow Lite models is run through an interpreter. The
+TensorFlow Lite interpreter is designed to be lean and fast. The interpreter
+uses a static graph ordering and a custom (less-dynamic) memory allocator to
+ensure minimal load, initialization, and execution latency.
 
-TensorFlow Lite inference is the process of executing a TensorFlow Lite
-model on-device and extracting meaningful results from it. Inference is the
-final step in using the model on-device in the
-[architecture](index.md#tensorflow_lite_architecture).
-
-Inference for TensorFlow Lite models is run through an interpreter. This
-document outlines the various APIs for the interpreter along with the
+This document outlines the various APIs for the interpreter, along with the
 [supported platforms](#supported-platforms).
 
 ### Important Concepts
@@ -43,19 +42,27 @@
    present it to their user.
 
 ### Supported Platforms
+
 TensorFlow inference APIs are provided for most common mobile/embedded platforms
 such as Android, iOS and Linux.
 
 #### Android
+
 On Android, TensorFlow Lite inference can be performed using either Java or C++
 APIs. The Java APIs provide convenience and can be used directly within your
-Android Activity classes. The C++ APIs on the other hand may offer more
-flexibility and speed, but may require writing JNI wrappers to move data between
-Java and C++ layers. You can find an example [here](android.md).
+Android Activity classes. The C++ APIs offer more flexibility and speed, but may
+require writing JNI wrappers to move data between Java and C++ layers.
+
+Visit the [Android quickstart](android.md) for a tutorial and example code.
 
 #### iOS
-TensorFlow Lite provides Swift/Objective C++ APIs for inference on iOS. An
-example can be found [here](ios.md).
+
+TensorFlow Lite provides native iOS libraries written in
+[Swift](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift)
+and
+[Objective-C](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc).
+
+Visit the [iOS quickstart](ios.md) for a tutorial and example code.
 
 #### Linux
 On Linux platforms such as [Raspberry Pi](build_rpi.md), TensorFlow Lite C++
diff --git a/tensorflow/lite/g3doc/guide/ios.md b/tensorflow/lite/g3doc/guide/ios.md
index 3565ce7..4c84dbd 100644
--- a/tensorflow/lite/g3doc/guide/ios.md
+++ b/tensorflow/lite/g3doc/guide/ios.md
@@ -1,229 +1,98 @@
 # iOS quickstart
 
-This tutorial provides a simple iOS mobile application to classify images using
-the iOS device camera. In this tutorial, you will download the demo application
-from the Tensorflow repository, build it on your computer, and install it on
-your iOS Device. You will also learn how to customize the application to suit
-your requirements.
+To get started with TensorFlow Lite on iOS, we recommend exploring the following
+example:
 
-## Prerequisites
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios">iOS
+image classification example</a>
 
-*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
-    a valid Apple Developer ID, and have an iOS device set up and linked to your
-    developer account with all of the appropriate certificates. For these
-    instructions, we assume that you have already been able to build and deploy
-    an app to an iOS device with your current developer environment.
+For an explanation of the source code, you should also read
+[TensorFlow Lite iOS image classification](https://www.tensorflow.org/lite/models/image_classification/ios).
 
-*   The demo app requires a camera and must be executed on a real iOS device.
-    You can build it and run with the iPhone Simulator but it won't have any
-    camera information to classify.
+This example app uses
+[image classification](https://www.tensorflow.org/lite/models/image_classification/overview)
+to continuously classify whatever it sees from the device's rear-facing camera,
+displaying the most probable classifications. It allows the user to choose
+between a floating point or
+[quantized](https://www.tensorflow.org/lite/performance/post_training_quantization)
+model and select the number of threads to perform inference on.
 
-*   You don't need to build the entire TensorFlow library to run the demo, but
-    you will need to clone the TensorFlow repository if you haven't already:
+Note: Additional iOS applications demonstrating TensorFlow Lite in a variety of
+use cases are available in [Examples](https://www.tensorflow.org/lite/examples).
 
-        git clone https://github.com/tensorflow/tensorflow
-        cd tensorflow
+## Add TensorFlow Lite to your Swift or Objective-C project
 
-*   You'll also need the Xcode command-line tools:
+TensorFlow Lite offers native iOS libraries written in
+[Swift](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/swift)
+and
+[Objective-C](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/objc).
+To get started quickly writing your own iOS code, we recommend using our
+[Swift image classification example](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios)
+as a starting point.
 
-        xcode-select --install
+The sections below demonstrate how to add the TensorFlow Lite Swift or
+Objective-C library to your project:
 
-    If this is a new install, you will need to run the Xcode application once to
-    agree to the license before continuing.
+### CocoaPods developers
 
-*   Install CocoaPods if you don't have it:
+In your `Podfile`, add the TensorFlow Lite pod. Then, run `pod install`.
 
-        sudo gem install cocoapods
+#### Swift
 
-### Step 1. Clone the TensorFlow source code
-
-First, we clone the GitHub repository on the computer in a folder to get the
-demo application.
-
-```
-git clone https://github.com/tensorflow/tensorflow
+```ruby
+use_frameworks!
+pod 'TensorFlowLiteSwift'
 ```
 
-### Step 2. Download required dependencies
+#### Objective-C
 
-Execute the shell script to download the model files used by the demo app (this
-is done from inside the cloned directory):
-
-```
-    tensorflow/lite/examples/ios/download_models.sh
+```ruby
+pod 'TensorFlowLiteObjC'
 ```
 
-Run the following command to install TensorFlow Lite pod:
+### Bazel developers
 
-```
-    cd tensorflow/lite/examples/ios/camera
-    pod install
+In your `BUILD` file, add the `TensorFlowLite` dependency to your target.
+
+#### Swift
+
+```python
+swift_library(
+  deps = [
+      "//tensorflow/lite/experimental/swift:TensorFlowLite",
+  ],
+)
 ```
 
-If you have installed this pod before and that command doesn't work, try
+#### Objective-C
 
-```
-    pod repo update
+```python
+objc_library(
+  deps = [
+      "//tensorflow/lite/experimental/objc:TensorFlowLite",
+  ],
+)
 ```
 
-### Step 3. Build the XCode project
+### Import the library
 
-Open the `tflite_camera_example.xcworkspace` project file generated in the last
-step:
+For Swift files, import the TensorFlow Lite module:
 
-```
-    open tflite_camera_example.xcworkspace
+```swift
+import TensorFlowLite
 ```
 
-Under `Project navigator -> tflite_camera_example -> Targets ->
-tflite_camera_example -> General` change the bundle identifier by pre-pending
-your name:
+For Objective-C files, import the umbrella header:
 
-![pre-pend your name to the bundle identifier](../images/ios/bundle_identifier.png)
-
-Plug in your iOS device. Note the app must be executed with a real device with
-camera. Select the iOS device from the drop-down menu.
-
-![Device selection](../images/ios/device_selection.png)
-
-Click the "Run" button to build and run the app
-
-![Build and execute](../images/ios/build_and_execute.png)
-
-Note that as mentioned earlier, you must already have a device set up and linked
-to your Apple Developer account in order to deploy the app on a device.
-
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
-
-## Understanding iOS App Code
-
-### Get camera input
-
-The main logic of this app is in the Objective C++ source file
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
-
-The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
-delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
-called for every captured frame. It calls `runModelOnFrame` to run the model for
-every frame.
-
-### Create an interpreter
-
-To create the interpreter, we need to load the model file. The following code
-will load a model and create an interpreter.
-
-```
-model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```objectivec
+#import "TFLTensorFlowLite.h"
 ```
 
-Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
-load times and reduce the dirty pages in memory.
+Or, if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project, import the module:
 
-Construct a `BuiltinOpResolver` to use the TensorFlow Lite buildin ops. Then,
-create the interpreter object using `InterpreterBuilder` that takes the model
-file as argument as shown below.
-
-```
-tflite::ops::builtin::BuiltinOpResolver resolver;
-tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```objectivec
+@import TFLTensorFlowLite;
 ```
 
-### Obtain the input buffer
-
-By default, the app uses quantized model since it's smaller and faster. The
-buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
-following code obtains the input buffer from the interpreter:
-
-```
-// Get the index of first input tensor.
-int input_tensor_index = interpreter->inputs()[0];
-// Get the pointer to the input buffer.
-uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
-```
-
-Throughout this document, it's assumed a quantized model is used.
-
-### Pre-process of bitmap image
-
-The MobileNet model we're using takes 224x224x3 inputs, where the dimensions are
-width, height, and colors (RGB). The images returned from `AVCaptureSession` is
-bigger, and has 4 color channels (RGBA).
-
-Many image classification models (like MobileNet) take fixe-sized inputs. It's
-required to scale or crop the image before feeding it into the model, and change
-the channels from RGBA to RGB.
-
-The code to pre-process the images is in `ProcessInputWithQuantizedModel`
-function in
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
-simple implementation for nearest neighbor color sampling, and it only copies
-the first 3 bytes for each pixel.
-
-```
-void ProcessInputWithQuantizedModel(
-    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
-  for (int y = 0; y < wanted_input_height; ++y) {
-    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
-    for (int x = 0; x < wanted_input_width; ++x) {
-      const int in_x = (y * image_width) / wanted_input_width;
-      const int in_y = (x * image_height) / wanted_input_height;
-      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
-      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
-      for (int c = 0; c < wanted_input_channels; ++c) {
-        out_pixel[c] = in_pixel[c];
-      }
-    }
-  }
-}
-```
-
-Note the code is preprocessing and preparing the model input from the camera
-data. Therefore the first parameter `input` should be the camera buffer. The
-second parameter `output` should be the buffer of model input.
-
-### Run inference and obtain output buffer
-
-After preprocessing and filling the data into the input buffer of the
-interpreter, it's really easy to run the interpreter:
-
-```
-if (interpreter->Invoke() != kTfLiteOk) {
-  NSLog("Failed to invoke!");
-}
-```
-
-The result is stored in the output tensor buffer of the interpreter. The
-following code obtains the pointer to the buffer:
-
-```
-// Get the index of first output tensor.
-const int output_tensor_index = interpreter->outputs()[0];
-// Get the pointer to the output buffer.
-uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
-```
-
-### Post-process values
-
-The output buffer contains an array of `uint8_t`, and the value range is 0-255.
-We need to convert the value to float to get the probabilities with value range
-0.0-1.0. The formula of the quantization value mapping is:
-
-    float_value = (quantized_value - zero_point) * scale
-
-The following code converts quantized values back to float values, using the
-quantizaiton parameters in tensors:
-
-```
-uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
-int32_t zero_point = input_tensor->params.zero_point;
-float scale = input_tensor->params.scale;
-float output[output_size];
-for (int i = 0; i < output_size; ++i) {
-  output[i] = (quantized_output[i] - zero_point) * scale;
-}
-```
-
-Finally, we find the best set of classifications by storing them in a priority
-queue based on their confidence scores. See the `GetTopN` function in
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+Note: For CocoaPods developers who want to import the Objective-C TensorFlow
+Lite module, you must also include `use_frameworks!` in your `Podfile`.
diff --git a/tensorflow/lite/g3doc/guide/microcontroller.md b/tensorflow/lite/g3doc/guide/microcontroller.md
deleted file mode 100644
index 6351320..0000000
--- a/tensorflow/lite/g3doc/guide/microcontroller.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Microcontrollers
-
-## Overview
-
-Microcontrollers are compact integrated circuits with very limited resources. Currently, they only perform simple functions.
-
-With the onset of TensorFlow Lite, hence smaller binary sizes, these devices will be able to support machine learning applications, opening the industry up to a myriad of use cases.
-
-## Getting started
-
-Note: This is an experimental release aimed at microcontrollers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems.
-
-One of the challenges of embedded software development is that there are a lot of different architectures, devices, operating systems, and build systems. We aim to support as many of the popular combinations as we can and make it as easy as possible to add support for others.
-
-Read more about [how to get started](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#getting-started).
-
-## Goals
-
-The design goals are to make the framework readable, easy to modify, well-tested, easy to integrate, and compatible (e.g. consistent file schema, interpreter, API, kernel interface).
-
-Read more about [goals and tradeoffs](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#goals).
diff --git a/tensorflow/lite/g3doc/guide/ops_compatibility.md b/tensorflow/lite/g3doc/guide/ops_compatibility.md
index 56caf7d..e11f34f 100644
--- a/tensorflow/lite/g3doc/guide/ops_compatibility.md
+++ b/tensorflow/lite/g3doc/guide/ops_compatibility.md
@@ -390,10 +390,10 @@
 **CEIL**
 
 ```
-inputs {
-  0: tensor
+Inputs {
+  0: a tensor
 }
-outputs: {
+Outputs {
   0: result of computing element-wise ceil of the input tensor
 }
 ```
@@ -844,6 +844,17 @@
 }
 ```
 
+**ROUND**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: result of computing element-wise round of the input tensor
+}
+```
+
 **SLICE**
 
 ```
diff --git a/tensorflow/lite/g3doc/microcontrollers/build_convert.md b/tensorflow/lite/g3doc/microcontrollers/build_convert.md
new file mode 100644
index 0000000..98d791b
--- /dev/null
+++ b/tensorflow/lite/g3doc/microcontrollers/build_convert.md
@@ -0,0 +1,107 @@
+# Build and convert models
+
+Microcontrollers have limited RAM and storage, which places constraints on the
+sizes of machine learning models. In addition, TensorFlow Lite for
+Microcontrollers currently supports a limited subset of operations, so not all
+model architectures are possible.
+
+This document explains the process of converting a TensorFlow model to run on
+microcontrollers. It also outlines the supported operations and gives some
+guidance on designing and training a model to fit in limited memory.
+
+## Model conversion
+
+To convert a trained TensorFlow model to run on microcontrollers, you should use
+the
+[TensorFlow Lite converter Python API](https://www.tensorflow.org/lite/convert/python_api).
+This will convert the model into a
+[`FlatBuffer`](https://google.github.io/flatbuffers/), reducing the model size,
+and modify it to use TensorFlow Lite operations.
+
+### Quantization
+
+To obtain the smallest possible model size, you should consider using
+[Post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization).
+This will reduce the precision of the numbers in your model, which results in a
+smaller model size. However, this is likely to reduce accuracy, particularly for
+small models. It is important to profile the accuracy of your model before and
+after quantization to confirm that this loss is acceptable.
+
+The following Python snippet shows how to convert a model using post-training
+quantization:
+
+```python
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
+tflite_quant_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_quant_model)
+```
+
+### Convert to a C array
+
+Many microcontroller platforms do not have native filesystem support. The
+easiest way to use a model from your program is to include it as a C array and
+compile it into your program.
+
+The following Unix command will generate a C source file that contains the
+TensorFlow Lite model as a `char` array:
+
+```bash
+xxd -i converted_model.tflite > model_data.cc
+```
+
+The output will look similar to the following:
+
+```C
+unsigned char converted_model_tflite[] = {
+  0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+  // <Lines omitted>
+};
+unsigned int converted_model_tflite_len = 18200;
+```
+
+Once you have generated the file, you can include it in your program. It is
+important to change the array declaration to `const` for better memory
+efficiency on embedded platforms.
+
+For an example of how to include and use a model in your program, see
+[`tiny_conv_micro_features_model_data.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h)
+in the micro speech example.
+
+## Model architecture and training
+
+When designing a model for use on microcontrollers, it is important to consider
+the model size, workload, and the operations that are used.
+
+### Model size
+
+A model must be small enough to fit within your target device's memory alongside
+the rest of your program, both as a binary and at runtime.
+
+To create a smaller model, you can use fewer and smaller layers in your
+architecture. However, small models are more likely to suffer from overfitting.
+This means for many problems, it makes sense to try and use the largest model
+that will fit in memory. However, using larger models will also lead to
+increased processor workload.
+
+Note: The core runtime for TensorFlow Lite for Microcontrollers fits in 16KB on
+a Cortex M3.
+
+### Workload
+
+The size and complexity of the model have an impact on workload. Large, complex
+models might result in a higher duty cycle, which means your device's processor
+is spending more time working and less time idle. This will increase power
+consumption and heat output, which might be an issue depending on your
+application.
+
+### Operation support
+
+TensorFlow Lite for Microcontrollers currently supports a limited subset of
+TensorFlow operations, which impacts the model architectures that it is possible
+to run. We are working on expanding operation support, both in terms of
+reference implementations and optimizations for specific architectures.
+
+The supported operations can be seen in the file
+[`all_ops_resolver.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc).
diff --git a/tensorflow/lite/g3doc/microcontrollers/get_started.md b/tensorflow/lite/g3doc/microcontrollers/get_started.md
new file mode 100644
index 0000000..f5afa01
--- /dev/null
+++ b/tensorflow/lite/g3doc/microcontrollers/get_started.md
@@ -0,0 +1,326 @@
+# Get started with microcontrollers
+
+This document will help you start working with TensorFlow Lite for
+Microcontrollers.
+
+## Sample code
+
+To get started, you can explore the following example:
+
+<a class="button button-primary" href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech">Micro
+speech example</a>
+
+This example uses a simple
+[audio recognition model](https://www.tensorflow.org/tutorials/sequences/audio_recognition)
+to identify keywords in speech. The sample code captures audio from a device's
+microphones. The model classifies this audio in real time, determining whether
+the word "yes" or "no" has been spoken.
+
+The sample works end-to-end (including audio capture and inference) on the
+following platforms:
+
+-   [SparkFun Edge Powered by TensorFlow (Apollo3 Blue)](https://www.sparkfun.com/products/15170)
+-   [STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html)
+-   Mac OS X
+
+### SparkFun Edge
+
+If you need a device to get started, we recommend the
+[SparkFun Edge Powered by TensorFlow](https://www.sparkfun.com/products/15170).
+It was designed in conjunction with the TensorFlow Lite team to offer a flexible
+platform for experimenting with deep learning on microcontrollers.
+
+To get started using the Edge board, we recommend following
+[Machine learning on a microcontroller with SparkFun TensorFlow](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow),
+a codelab that introduces you to the development workflow.
+
+## Workflow
+
+Using TensorFlow Lite for Microcontrollers involves four major steps:
+
+1.  Create or find a model architecture.
+2.  Train a model.
+3.  Convert the model.
+4.  Write code to run inference.
+
+The first three steps are covered in the guide
+[Build and convert models](build_convert.md). The sample code comes with a
+pretrained model, and includes scripts to train a model that recognizes
+different spoken words. Instructions on training are in
+[README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/README.md#creating-your-own-model).
+
+In this document, we will focus on the code that will feed processed audio data
+into the model and execute it, resulting in a prediction of which word was
+spoken. This process is called *inference*.
+
+## Run inference
+
+The sample's
+[main.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc)
+contains the code that runs inference. We'll now walk through the key parts.
+
+### Includes
+
+To use the library, we must include the following header files:
+
+```C++
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+```
+
+-   [`all_ops_resolver.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h)
+    provides the operations used by the interpreter to run the model.
+-   [`micro_error_reporter.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/micro_error_reporter.h)
+    outputs debug information.
+-   [`micro_interpreter.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/micro_interpreter.h)
+    contains code to handle and run models.
+-   [`schema_generated.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h)
+    contains the schema for the TensorFlow Lite
+    [`FlatBuffer`](https://google.github.io/flatbuffers/) model file format.
+-   [`version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/version.h)
+    provides versioning information for the TensorFlow Lite schema.
+
+The sample also includes some other files. These are the most significant:
+
+```C++
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+```
+
+-   [`feature_provider.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h)
+    contains code to extract features from the audio stream to input to the
+    model.
+-   [`tiny_conv_micro_features_model_data.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h)
+    contains the model stored as a `char` array. Read
+    [Build and convert models](build_convert.md) to learn how to convert a
+    TensorFlow Lite model into this format.
+-   [`micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h)
+    defines various constants related to the model.
+
+### Set up logging
+
+To set up logging, a `tflite::ErrorReporter` pointer is created using a pointer
+to a `tflite::MicroErrorReporter` instance:
+
+```C++
+tflite::MicroErrorReporter micro_error_reporter;
+tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+```
+
+This variable will be passed into the interpreter, which allows it to write
+logs. Since microcontrollers often have a variety of mechanisms for logging, the
+implementation of `tflite::MicroErrorReporter` is designed to be customized for
+your particular device.
+
+### Load a model
+
+In the following code, the model is instantiated from a `char` array,
+`g_tiny_conv_micro_features_model_data` (to learn how this is created, see
+[Build and convert models](build_convert.md)). We then check the model to ensure
+its schema version is compatible with the version we are using:
+
+```C++
+const tflite::Model* model =
+    ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
+if (model->version() != TFLITE_SCHEMA_VERSION) {
+  error_reporter->Report(
+      "Model provided is schema version %d not equal "
+      "to supported version %d.\n",
+      model->version(), TFLITE_SCHEMA_VERSION);
+  return 1;
+}
+```
+
+### Instantiate operations resolver
+
+An
+[`AllOpsResolver`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h)
+instance is required by the interpreter to access TensorFlow operations. This
+class can be extended to add custom operations to your project:
+
+```C++
+tflite::ops::micro::AllOpsResolver resolver;
+```
+
+### Allocate memory
+
+We need to preallocate a certain amount of memory for input, output, and
+intermediate arrays. This is provided as a `uint8_t` array of size
+`tensor_arena_size`, which is passed into a `tflite::SimpleTensorAllocator`
+instance:
+
+```C++
+const int tensor_arena_size = 10 * 1024;
+uint8_t tensor_arena[tensor_arena_size];
+tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                               tensor_arena_size);
+```
+
+Note: The size required will depend on the model you are using, and may need to
+be determined by experimentation.
+
+### Instantiate interpreter
+
+We create a `tflite::MicroInterpreter` instance, passing in the variables
+created earlier:
+
+```C++
+tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                     error_reporter);
+```
+
+### Validate input shape
+
+The `MicroInterpreter` instance can provide us with a pointer to the model's
+input tensor by calling `.input(0)`, where `0` represents the first (and only)
+input tensor. We inspect this tensor to confirm that its shape and type are what
+we are expecting:
+
+```C++
+TfLiteTensor* model_input = interpreter.input(0);
+if ((model_input->dims->size != 4) || (model_input->dims->data[0] != 1) ||
+    (model_input->dims->data[1] != kFeatureSliceCount) ||
+    (model_input->dims->data[2] != kFeatureSliceSize) ||
+    (model_input->type != kTfLiteUInt8)) {
+  error_reporter->Report("Bad input tensor parameters in model");
+  return 1;
+}
+```
+
+In this snippet, the variables `kFeatureSliceCount` and `kFeatureSliceSize`
+relate to properties of the input and are defined in
+[`micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h).
+The enum value `kTfLiteUInt8` is a reference to one of the TensorFlow Lite data
+types, and is defined in
+[`c_api_internal.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/c/c_api_internal.h).
+
+### Generate features
+
+The data we input to our model must be generated from the microcontroller's
+audio input. The `FeatureProvider` class defined in
+[`feature_provider.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h)
+captures audio and converts it into a set of features that will be passed into
+the model. When it is instantiated, we use the `TfLiteTensor` obtained earlier
+to pass in a pointer to the input array. This is used by the `FeatureProvider`
+to populate the input data that will be passed into the model:
+
+```C++
+  FeatureProvider feature_provider(kFeatureElementCount,
+                                   model_input->data.uint8);
+```
+
+The following code causes the `FeatureProvider` to generate a set of features
+from the most recent second of audio and populate the input tensor:
+
+```C++
+TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
+    error_reporter, previous_time, current_time, &how_many_new_slices);
+```
+
+In the sample, feature generation and inference happen in a loop, so the device
+is constantly capturing and processing new audio.
+
+If you are writing your own program, you will likely generate features in a
+different way, but you will always populate the input tensor with data before
+running the model.
+
+### Run the model
+
+To run the model, we can call `Invoke()` on our `tflite::MicroInterpreter`
+instance:
+
+```C++
+TfLiteStatus invoke_status = interpreter.Invoke();
+if (invoke_status != kTfLiteOk) {
+  error_reporter->Report("Invoke failed");
+  return 1;
+}
+```
+
+We can check the return value, a `TfLiteStatus`, to determine if the run was
+successful. The possible values of `TfLiteStatus`, defined in
+[`c_api_internal.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/c/c_api_internal.h),
+are `kTfLiteOk` and `kTfLiteError`.
+
+### Obtain the output
+
+The model's output tensor can be obtained by calling `output(0)` on the
+`tflite::MicroInterpreter`, where `0` represents the first (and only) output
+tensor.
+
+In the sample, the output is an array representing the probability of the input
+belonging to various classes (representing "yes", "no", "unknown", and
+"silence"). Since they are in a set order, we can use simple logic to determine
+which class has the highest probability:
+
+```C++
+    TfLiteTensor* output = interpreter.output(0);
+    uint8_t top_category_score = 0;
+    int top_category_index;
+    for (int category_index = 0; category_index < kCategoryCount;
+         ++category_index) {
+      const uint8_t category_score = output->data.uint8[category_index];
+      if (category_score > top_category_score) {
+        top_category_score = category_score;
+        top_category_index = category_index;
+      }
+    }
+```
+
+Elsewhere in the sample, a more sophisticated algorithm is used to smooth
+recognition results across a number of frames. This is defined in
+[recognize_commands.h](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h).
+The same technique can be used to improve reliability when processing any
+continuous stream of data.
+
+## Build the sample
+
+The sample contains build scripts that will download all required dependencies
+and compile a binary that can be run on a device.
+
+Note: The build process has been tested on macOS and Linux, but not on Windows.
+
+To build the sample, take the following steps:
+
+1.  Clone the TensorFlow repository from GitHub to a convenient place.
+
+    ```bash
+    git clone --depth 1 https://github.com/tensorflow/tensorflow.git
+    ```
+
+1.  Enter the directory that was created in the previous step.
+
+    ```bash
+    cd tensorflow
+    ```
+
+1.  If you are using macOS, run the following command. If you are using Linux,
+    you do not need to do this.
+
+    ```bash
+    PATH=tensorflow/lite/experimental/micro/tools/make/downloads/gcc_embedded/bin/:$PATH
+    ```
+
+1.  To download all of the required dependencies and initiate the build process,
+    issue the following command. You can set `TARGET` depending on which
+    platform you want to build for. Explore
+    [`targets/`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/tools/make/targets)
+    for the current options.
+
+    ```bash
+    make -f tensorflow/lite/experimental/micro/tools/make/Makefile \
+    TARGET=sparkfun_edge micro_speech_bin
+    ```
+
+## Next steps
+
+Once you have built and run the sample, read the following documents:
+
+*   Learn how to work with models in
+    [Build and convert models](build_convert.md).
+*   Learn more about the C++ library in
+    [Understand the C++ library](library.md).
diff --git a/tensorflow/lite/g3doc/microcontrollers/library.md b/tensorflow/lite/g3doc/microcontrollers/library.md
new file mode 100644
index 0000000..6dc7261
--- /dev/null
+++ b/tensorflow/lite/g3doc/microcontrollers/library.md
@@ -0,0 +1,110 @@
+# Understand the C++ library
+
+The TensorFlow Lite for Microcontrollers C++ library is part of the
+[TensorFlow repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro).
+It is designed to be readable, easy to modify, well-tested, easy to integrate,
+and compatible with regular TensorFlow Lite.
+
+The following document will outline the basic structure of the C++ library,
+provide the commands required for compilation, and give an overview of how to
+port to new devices.
+
+The
+[README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/README.md#how-to-port-tensorflow-lite-micro-to-a-new-platform)
+contains more in-depth information on all of these topics.
+
+## File structure
+
+The
+[`micro`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+root directory has a relatively simple structure. However, since it is located
+inside of the extensive TensorFlow repository, we have created scripts and
+pre-generated project files that provide the relevant source files in isolation
+within various embedded development environments such as Arduino, Keil, Make,
+and Mbed.
+
+### Key files
+
+The most important files for using the TensorFlow Lite for Microcontrollers
+interpreter are located in the root of the project, accompanied by tests:
+
+-   [`all_ops_resolver.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h)
+    provides the operations used by the interpreter to run the model.
+-   [`micro_error_reporter.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/micro_error_reporter.h)
+    outputs debug information.
+-   [`micro_interpreter.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/micro_interpreter.h)
+    contains code to handle and run models.
+
+See [Get started with microcontrollers](get_started.md) for a walkthrough of
+typical usage.
+
+The build system provides for platform-specific implementations of certain
+files. These are located in a directory with the platform name, for example
+[`sparkfun_edge`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/sparkfun_edge).
+
+Several other directories exist, including:
+
+-   [`kernels`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/kernels),
+    which contains operation implementations and the associated code.
+-   [`tools`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/tools),
+    which contains build tools and their output.
+-   [`examples`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples),
+    which contains sample code.
+
+### Generate project files
+
+The project's `Makefile` is able to generate standalone projects containing all
+necessary source files that can be imported into embedded development
+environments. The current supported environments are Arduino, Keil, Make, and
+Mbed.
+
+Note: We host prebuilt projects for some of these environments. See
+[Supported platforms](overview.md#supported-platforms) to download.
+
+To generate these projects with Make, use the following command:
+
+```bash
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile generate_projects
+```
+
+This will take a few minutes, since it has to download some large toolchains for
+the dependencies. Once it has finished, you should see some folders created
+inside a path like
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/` (the exact
+path depends on your host operating system). These folders contain the generated
+project and source files. For example,
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/keil`
+contains the Keil uVision targets.
+
+## Build the library
+
+If you are using a generated project, see its included README for build
+instructions.
+
+To build the library and run tests from the main TensorFlow repository, run the
+following commands:
+
+1.  Clone the TensorFlow repository from GitHub to a convenient place.
+
+    ```bash
+    git clone --depth 1 https://github.com/tensorflow/tensorflow.git
+    ```
+
+1.  Enter the directory that was created in the previous step.
+
+    ```bash
+    cd tensorflow
+    ```
+
+1.  Invoke the `Makefile` to build the project and run tests. Note that this
+    will download all required dependencies:
+
+    ```bash
+    make -f tensorflow/lite/experimental/micro/tools/make/Makefile test
+    ```
+
+## Port to new devices
+
+Guidance on porting TensorFlow Lite for Microcontrollers to new platforms and
+devices can be found in
+[README.md](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#how-to-port-tensorflow-lite-micro-to-a-new-platform).
diff --git a/tensorflow/lite/g3doc/microcontrollers/overview.md b/tensorflow/lite/g3doc/microcontrollers/overview.md
new file mode 100644
index 0000000..b9a16bd
--- /dev/null
+++ b/tensorflow/lite/g3doc/microcontrollers/overview.md
@@ -0,0 +1,136 @@
+# TensorFlow Lite for Microcontrollers
+
+TensorFlow Lite for Microcontrollers is an experimental port of TensorFlow Lite
+aimed at microcontrollers and other devices with only kilobytes of memory.
+
+It is designed to be portable even to "bare metal" systems, so it doesn't
+require operating system support, any standard C or C++ libraries, or dynamic
+memory allocation. The core runtime fits in 16KB on a Cortex M3, and with enough
+operators to run a speech keyword detection model, takes up a total of 22KB.
+
+## Get started
+
+To quickly get up and running with TensorFlow Lite for Microcontrollers, read
+[Get started with microcontrollers](get_started.md).
+
+## Why microcontrollers are important
+
+Microcontrollers are typically small, low-powered computing devices that are
+often embedded within hardware that requires basic computation, including
+household appliances and Internet of Things devices. Billions of
+microcontrollers are manufactured each year.
+
+Microcontrollers are often optimized for low energy consumption and small size,
+at the cost of reduced processing power, memory, and storage. Some
+microcontrollers have features designed to optimize performance on machine
+learning tasks.
+
+By running machine learning inference on microcontrollers, developers can add AI
+to a vast range of hardware devices without relying on network connectivity,
+which is often subject to bandwidth and power constraints and results in high
+latency. Running inference on-device can also help preserve privacy, since no
+data has to leave the device.
+
+## Features and components
+
+*   C++ API, with runtime that fits in 16KB on a Cortex M3
+*   Uses standard TensorFlow Lite
+    [FlatBuffer](https://google.github.io/flatbuffers/) schema
+*   Pre-generated project files for popular embedded development platforms, such
+    as Arduino, Keil, and Mbed
+*   Optimizations for several embedded platforms
+*   [Sample code](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech)
+    demonstrating spoken hotword detection
+
+## Developer workflow
+
+This is the process for deploying a TensorFlow model to a microcontroller:
+
+1.  **Create or obtain a TensorFlow model**
+
+    The model must be small enough to fit on your target device after
+    conversion, and it can only use
+    [supported operations](build_convert.md#operation-support). If you want to
+    use operations that are not currently supported, you can provide your own
+    implementations.
+
+2.  **Convert the model to a TensorFlow Lite FlatBuffer**
+
+    You will convert your model into the standard TensorFlow Lite format using
+    the [TensorFlow Lite converter](build_convert.md#model-conversion). You may
+    wish to output a quantized model, since these are smaller in size and more
+    efficient to execute.
+
+3.  **Convert the FlatBuffer to a C byte array**
+
+    Models are kept in read-only program memory and provided in the form of a
+    simple C file. Standard tools can be used to
+    [convert the FlatBuffer into a C array](build_convert.md#convert-to-a-c-array).
+
+4.  **Integrate the TensorFlow Lite for Microcontrollers C++ library**
+
+    Write your microcontroller code to perform inference using the
+    [C++ library](library.md).
+
+5.  **Deploy to your device**
+
+    Build and deploy the program to your device.
+
+## Supported platforms
+
+One of the challenges of embedded software development is that there are a lot
+of different architectures, devices, operating systems, and build systems. We
+aim to support as many of the popular combinations as we can, and make it as
+easy as possible to add support for others.
+
+If you're a product developer, we have build instructions or pre-generated
+project files that you can download for the following platforms:
+
+Device                                                                                         | Mbed                                                                           | Keil                                                                           | Make/GCC
+---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ | --------
+[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html)     | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | -                                                                              | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl)
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | -                                                                              | -                                                                              | [Instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/README.md#building-for-the-blue-pill-stm32f103-using-make)
+[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/)  | -                                                                              | -                                                                              | [Instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/README.md#building-for-ambiq-micro-apollo3blue-evb-using-make)
+[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/)                            | -                                                                              | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+[Eta Compute ECM3531 EVB](https://etacompute.com/)                                             | -                                                                              | -                                                                              | [Instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/README.md#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
+If your device is not yet supported, it may not be difficult to add support. You
+can learn about that process in
+[README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/README.md#how-to-port-tensorflow-lite-micro-to-a-new-platform).
+
+### Portable reference code
+
+If you don't have a particular microcontroller platform in mind yet, or just
+want to try out the code before beginning porting, the easiest way to begin is
+by
+[downloading the platform-agnostic reference code](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+
+There is a series of folders inside the archive, with each one containing just
+the source files you need to build one binary. There is a simple Makefile for
+each folder, but you should be able to load the files into almost any IDE and
+build them. There is also a [Visual Studio Code](https://code.visualstudio.com/)
+project file already set up, so you can easily explore the code in a
+cross-platform IDE.
+
+## Goals
+
+Our design goals are to make the framework readable, easy to modify,
+well-tested, easy to integrate, and fully compatible with TensorFlow Lite via a
+consistent file schema, interpreter, API, and kernel interface.
+
+You can read more about the design in
+[goals and tradeoffs](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#goals).
+
+## Limitations
+
+TensorFlow Lite for Microcontrollers is designed for the specific constraints of
+microcontroller development. If you are working on more powerful devices (for
+example, an embedded Linux device like the Raspberry Pi), the standard
+TensorFlow Lite framework might be easier to integrate.
+
+The following limitations should be considered:
+
+*   Support for a [limited subset](build_convert.md#operation-support) of
+    TensorFlow operations
+*   Support for a limited set of devices
+*   Low-level C++ API requiring manual memory management
diff --git a/tensorflow/lite/g3doc/models/image_classification/android.md b/tensorflow/lite/g3doc/models/image_classification/android.md
index 5cca221..51e354e 100644
--- a/tensorflow/lite/g3doc/models/image_classification/android.md
+++ b/tensorflow/lite/g3doc/models/image_classification/android.md
@@ -1,207 +1,317 @@
-# TensorFlow Lite Android Image Classifier App Example
+# TensorFlow Lite Android image classification example
 
-This tutorial provides a simple Android mobile application to classify images
-using the Android device camera. In this tutorial, you will download the demo
-application from the Tensorflow examples repository, build it on your computer,
-and install it on your Android device. You will also learn how to customize the
-application to suit your requirements.
+This document walks through the code of a simple Android mobile application that
+demonstrates [image classification](overview.md) using the device camera.
 
-### Prerequisites
+The application code is located in the
+[TensorFlow examples](https://github.com/tensorflow/examples) repository, along
+with instructions for building and deploying the app.
 
-*   Android Studio 3.2 (installed on a Linux, Mac or Windows machine)
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">Example
+application</a>
 
-*   Android device
+## Explore the code
 
-*   USB cable (to connect Android device to your computer)
-
-### Step 1. Clone the TensorFlow source code
-
-Clone the TensorFlow examples GitHub repository to your computer to get the demo
-application.
-
-```
-
-git clone https://github.com/tensorflow/examples
-
-```
-
-Open the TensorFlow source code in Android Studio. To do this, open Android
-Studio and select `Open an existing project` setting the folder to
-`examples/lite/examples/image_classification/android`
-
-<img src="images/classifydemo_img1.png" />
-
-This folder contains the demo application for image classification, object
-detection, and speech hotword detection.
-
-### Step 2. Build the Android Studio project
-
-Select `Build -> Make Project` and check that the project builds
-successfully. You will need Android SDK configured in the settings. You'll need
-at least SDK version 23. The gradle file will prompt you to download any missing
-libraries.
-
-<img src="images/classifydemo_img4.png" style="width: 40%" />
-
-<img src="images/classifydemo_img2.png" style="width: 60%" />
-
-#### TensorFlow Lite AAR from JCenter:
-
-Note that the `build.gradle` is configured to use TensorFlow Lite's nightly
-build.
-
-If you see a build error related to compatibility with Tensorflow Lite's Java
-API (example: method X is undefined for type Interpreter), there has likely been
-a backwards compatible change to the API. You will need to pull new app code
-that's compatible with the nightly build by running `git pull`.
-
-### Step 3. Install and run the app
-
-Connect the Android device to the computer and be sure to approve any ADB
-permission prompts that appear on your phone. Select `Run -> Run app.` Select
-the deployment target in the connected devices to the device on which the app will
-be installed. This will install the app on the device.
-
-<img src="images/classifydemo_img5.png" style="width: 60%" />
-
-<img src="images/classifydemo_img6.png" style="width: 70%" />
-
-<img src="images/classifydemo_img7.png" style="width: 40%" />
-
-<img src="images/classifydemo_img8.png" style="width: 80%" />
-
-To test the app, open the app called `TFL Classify` on your device. When you run
-the app the first time, the app will request permission to access the camera.
-Re-installing the app may require you to uninstall the previous installations.
-
-## Understanding Android App Code
+We're now going to walk through the most important parts of the sample code.
 
 ### Get camera input
 
 This mobile application gets the camera input using the functions defined in the
-file CameraActivity.java in the folder
-`examples/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/CameraActivity.java.`
-This file depends on `AndroidManifest.xml` in the folder
-`examples/lite/examples/image_classification/android/app/src/main` to set the
-camera orientation.
+file
+[`CameraActivity.java`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/CameraActivity.java).
+This file depends on
+[`AndroidManifest.xml`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/AndroidManifest.xml)
+to set the camera orientation.
 
-### Pre-process bitmap image
+`CameraActivity` also contains code to capture user preferences from the UI and
+make them available to other classes via convenience methods.
 
-The mobile application code that pre-processes the images and runs inference is
-in
-`examples/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/tflite/Classifier.java.`
-Here, we take the input camera bitmap image and convert it to a Bytebuffer
-format for efficient processing. We pre-allocate the memory for ByteBuffer
-object based on the image dimensions because Bytebuffer objects can't infer the
-object shape.
-
-```
-c.imgData =
-ByteBuffer.allocateDirect( DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y *
-DIM_PIXEL_SIZE);
-c.imgData.order(ByteOrder.nativeOrder());
+```java
+model = Model.valueOf(modelSpinner.getSelectedItem().toString().toUpperCase());
+device = Device.valueOf(deviceSpinner.getSelectedItem().toString());
+numThreads = Integer.parseInt(threadsTextView.getText().toString().trim());
 ```
 
-While running the application, we pre-process the incoming bitmap images from the
-camera to a Bytebuffer. Since this model is quantized 8-bit, we will put a
-single byte for each channel. `imgData` will contain an encoded `Color` for each
-pixel in ARGB format, so we need to mask the least significant 8 bits to get
-blue, and next 8 bits to get green and next 8 bits to get blue, and we have an
-opaque image so alpha can be ignored.
+### Classifier
 
-```
- imgData.rewind();
- bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
- // Convert the image to floating point.
- int pixel = 0;
- for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
-   for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
-     final int val = intValues[pixel++];
-     imgData.put((byte) ((val >> 16) & 0xFF));
-     imgData.put((byte) ((val >> 8) & 0xFF));
-     imgData.put((byte) (val & 0xFF));
-     }
+The file
+[`Classifier.java`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/tflite/Classifier.java)
+contains most of the complex logic for processing the camera input and running
+inference.
+
+Two subclasses of `Classifier` exist, in
+[`ClassifierFloatMobileNet.java`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/tflite/ClassifierFloatMobileNet.java)
+and
+[`ClassifierQuantizedMobileNet.java`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/tflite/ClassifierQuantizedMobileNet.java),
+to demonstrate the use of both floating point and
+[quantized](https://www.tensorflow.org/lite/performance/post_training_quantization)
+models.
+
+The `Classifier` class implements a static method, `create`, which is used to
+instantiate the appropriate subclass based on the supplied model type (quantized
+vs floating point).
+
+#### Load model and create interpreter
+
+To perform inference, we need to load a model file and instantiate an
+`Interpreter`. This happens in the constructor of the `Classifier` class, along
+with loading the list of class labels. Information about the device type and
+number of threads is used to configure the `Interpreter` via the
+`Interpreter.Options` instance passed into its constructor. Note that in the
+case of a GPU being available, a
+[`Delegate`](https://www.tensorflow.org/lite/performance/gpu) is created using
+`GpuDelegateHelper`.
+
+```java
+protected Classifier(Activity activity, Device device, int numThreads) throws IOException {
+  tfliteModel = loadModelFile(activity);
+  switch (device) {
+    case NNAPI:
+      tfliteOptions.setUseNNAPI(true);
+      break;
+    case GPU:
+      gpuDelegate = GpuDelegateHelper.createGpuDelegate();
+      tfliteOptions.addDelegate(gpuDelegate);
+      break;
+    case CPU:
+      break;
   }
+  tfliteOptions.setNumThreads(numThreads);
+  tflite = new Interpreter(tfliteModel, tfliteOptions);
+  labels = loadLabelList(activity);
+...
 ```
 
-### Create interpreter
+For Android devices, we recommend pre-loading and memory mapping the model file
+to offer faster load times and reduce the dirty pages in memory. The method
+`loadModelFile` does this, returning a `MappedByteBuffer` containing the model.
 
-To create the interpreter, we need to load the model file. In Android devices,
-we recommend pre-loading and memory mapping the model file as shown below to
-offer faster load times and reduce the dirty pages in memory. If your model file
-is compressed, then you will have to load the model as a `File`, as it cannot be
-directly mapped and used from memory.
-
-```
-// Memory-map the model file
-AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
-FileInputStream inputStream = new
-FileInputStream(fileDescriptor.getFileDescriptor()); FileChannel fileChannel =
-inputStream.getChannel(); long startOffset = fileDescriptor.getStartOffset();
-long declaredLength = fileDescriptor.getDeclaredLength(); return
-fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-```
-
-Then, create the interpreter object using `new Interpreter()` that takes the
-model file as argument as shown below.
-
-```
-// Create Interpreter
-c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
-```
-
-### Run inference
-
-The output of the inference is stored in a byte array `labelprob.` We
-pre-allocate the memory for the output buffer. Then, we run inference on the
-interpreter object using function `run()` that takes input and output buffers as
-arguments.
-
-```
-// Pre-allocate output buffers.
-c.labelProb = new byte[1][c.labels.size()];
-// Run Inference
-tfLite.run(imgData, labelProb);
-```
-
-### Post-process values
-
-Finally, we find the best set of classifications by storing them in a priority
-queue based on their confidence scores.
-
-```
-// Find the best classifications
-PriorityQueue<Recognition> pq = ...
-for (int i = 0; i < labels.size(); ++i)
-{
-  pq.add( new Recognition( ' '+ i,
-  labels.size() > i ? labels.get(i) : unknown,
-  (float) labelProb[0][i], null));
+```java
+private MappedByteBuffer loadModelFile(Activity activity) throws IOException {
+  AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(getModelPath());
+  FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+  FileChannel fileChannel = inputStream.getChannel();
+  long startOffset = fileDescriptor.getStartOffset();
+  long declaredLength = fileDescriptor.getDeclaredLength();
+  return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
 }
 ```
 
-And we display up to MAX_RESULTS number of classifications in the application,
-where Recognition is a generic class defined in `Classifier.java` that contains
-the following information of the classified object: id, title, label, and its
-location when the model is an object detection model.
+Note: If your model file is compressed then you will have to load the model as a
+`File`, as it cannot be directly mapped and used from memory.
 
+The `MappedByteBuffer` is passed into the `Interpreter` constructor, along with
+an `Interpreter.Options` object. This object can be used to configure the
+interpreter, for example by setting the number of threads (`.setNumThreads(1)`)
+or enabling [NNAPI](https://developer.android.com/ndk/guides/neuralnetworks)
+(`.setUseNNAPI(true)`).
+
+#### Pre-process bitmap image
+
+Next in the `Classifier` constructor, we take the input camera bitmap image and
+convert it to a `ByteBuffer` format for efficient processing. We pre-allocate
+the memory for the `ByteBuffer` object based on the image dimensions because
+`ByteBuffer` objects can't infer the object shape.
+
+The `ByteBuffer` represents the image as a 1D array with three channel values
+per pixel (red, green, and blue). We call `order(ByteOrder.nativeOrder())` to
+ensure bytes are stored in the device's native byte order.
+
+```java
+imgData =
+  ByteBuffer.allocateDirect(
+    DIM_BATCH_SIZE
+      * getImageSizeX()
+      * getImageSizeY()
+      * DIM_PIXEL_SIZE
+      * getNumBytesPerChannel());
+imgData.order(ByteOrder.nativeOrder());
 ```
-// Display the best classifications
-final ArrayList<Recognition> recognitions =
-  new ArrayList<Recognition>();
-int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
-for (int i = 0; i < recognitionsSize; ++i) {
-  recognitions.add(pq.poll());
+
+The code in `convertBitmapToByteBuffer` pre-processes the incoming bitmap images
+from the camera to this `ByteBuffer`. It calls the method `addPixelValue` to add
+each set of pixel values to the `ByteBuffer` sequentially.
+
+```java
+imgData.rewind();
+bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+// Convert the image to floating point.
+int pixel = 0;
+for (int i = 0; i < getImageSizeX(); ++i) {
+  for (int j = 0; j < getImageSizeY(); ++j) {
+    final int val = intValues[pixel++];
+    addPixelValue(val);
+  }
 }
 ```
 
-### Load onto display
+In `ClassifierQuantizedMobileNet`, `addPixelValue` is overridden to put a single
+byte for each channel. The bitmap contains an encoded color for each pixel in
+ARGB format, so we need to mask the least significant 8 bits to get blue, the
+next 8 bits to get green, and the next 8 bits to get red. Since we have an opaque
+image, alpha can be ignored.
 
-We render the results on the Android device screen using the following lines in
-`processImage()` function in `ClassifierActivity.java` which uses the UI defined
-in `RecognitionScoreView.java.`
-
+```java
+@Override
+protected void addPixelValue(int pixelValue) {
+  imgData.put((byte) ((pixelValue >> 16) & 0xFF));
+  imgData.put((byte) ((pixelValue >> 8) & 0xFF));
+  imgData.put((byte) (pixelValue & 0xFF));
+}
 ```
-resultsView.setResults(results);
-requestRender();
+
+For `ClassifierFloatMobileNet`, we must provide a floating point number for each
+channel where the value is between `0` and `1`. To do this, we mask out each
+color channel as before, but then divide each resulting value by `255.f`.
+
+```java
+@Override
+protected void addPixelValue(int pixelValue) {
+  imgData.putFloat(((pixelValue >> 16) & 0xFF) / 255.f);
+  imgData.putFloat(((pixelValue >> 8) & 0xFF) / 255.f);
+  imgData.putFloat((pixelValue & 0xFF) / 255.f);
+}
+```
+
+#### Run inference
+
+The method that runs inference, `runInference`, is implemented by each subclass
+of `Classifier`. In `ClassifierQuantizedMobileNet`, the method looks as follows:
+
+```java
+protected void runInference() {
+  tflite.run(imgData, labelProbArray);
+}
+```
+
+The output of the inference is stored in a byte array `labelProbArray`, which is
+allocated in the subclass's constructor. It consists of a single outer element,
+containing one inner element for each label in the classification model.
+
+To run inference, we call `run()` on the interpreter instance, passing the input
+and output buffers as arguments.
+
+#### Recognize image
+
+Rather than call `runInference` directly, the method `recognizeImage` is used.
+It accepts a bitmap, runs inference, and returns a sorted `List` of
+`Recognition` instances, each corresponding to a label. The method will return a
+number of results bounded by `MAX_RESULTS`, which is 3 by default.
+
+`Recognition` is a simple class that contains information about a specific
+recognition result, including its `title` and `confidence`.
+
+A `PriorityQueue` is used for sorting. Each `Classifier` subclass has a
+`getNormalizedProbability` method, which is expected to return a probability
+between 0 and 1 of a given class being represented by the image.
+
+```java
+PriorityQueue<Recognition> pq =
+  new PriorityQueue<Recognition>(
+    3,
+    new Comparator<Recognition>() {
+      @Override
+      public int compare(Recognition lhs, Recognition rhs) {
+        // Intentionally reversed to put high confidence at the head of the queue.
+        return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+      }
+    });
+for (int i = 0; i < labels.size(); ++i) {
+  pq.add(
+    new Recognition(
+      "" + i,
+      labels.size() > i ? labels.get(i) : "unknown",
+      getNormalizedProbability(i),
+      null));
+}
+```
+
+### Display results
+
+The classifier is invoked and inference results are displayed by the
+`processImage()` function in
+[`ClassifierActivity.java`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android/app/src/main/java/org/tensorflow/lite/examples/classification/ClassifierActivity.java).
+
+`ClassifierActivity` is a subclass of `CameraActivity` that contains method
+implementations that render the camera image, run classification, and display
+the results. The method `processImage()` runs classification on a background
+thread as fast as possible, rendering information on the UI thread to avoid
+blocking inference and creating latency.
+
+```java
+protected void processImage() {
+  rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+  final Canvas canvas = new Canvas(croppedBitmap);
+  canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+
+  runInBackground(
+      new Runnable() {
+        @Override
+        public void run() {
+          if (classifier != null) {
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+            LOGGER.v("Detect: %s", results);
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+
+            runOnUiThread(
+                new Runnable() {
+                  @Override
+                  public void run() {
+                    showResultsInBottomSheet(results);
+                    showFrameInfo(previewWidth + "x" + previewHeight);
+                    showCropInfo(cropCopyBitmap.getWidth() + "x" + cropCopyBitmap.getHeight());
+                    showCameraResolution(canvas.getWidth() + "x" + canvas.getHeight());
+                    showRotationInfo(String.valueOf(sensorOrientation));
+                    showInference(lastProcessingTimeMs + "ms");
+                  }
+                });
+          }
+          readyForNextImage();
+        }
+      });
+}
+```
+
+Another important role of `ClassifierActivity` is to determine user preferences
+(by interrogating `CameraActivity`), and instantiate the appropriately
+configured `Classifier` subclass. This happens when the video feed begins (via
+`onPreviewSizeChosen()`) and when options are changed in the UI (via
+`onInferenceConfigurationChanged()`).
+
+```java
+private void recreateClassifier(Model model, Device device, int numThreads) {
+    if (classifier != null) {
+      LOGGER.d("Closing classifier.");
+      classifier.close();
+      classifier = null;
+    }
+    if (device == Device.GPU) {
+      if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
+        LOGGER.d("Not creating classifier: GPU support unavailable.");
+        runOnUiThread(
+            () -> {
+              Toast.makeText(this, "GPU acceleration unavailable.", Toast.LENGTH_LONG).show();
+            });
+        return;
+      } else if (model == Model.QUANTIZED && device == Device.GPU) {
+        LOGGER.d("Not creating classifier: GPU doesn't support quantized models.");
+        runOnUiThread(
+            () -> {
+              Toast.makeText(
+                      this, "GPU does not yet supported quantized models.", Toast.LENGTH_LONG)
+                  .show();
+            });
+        return;
+      }
+    }
+    try {
+      LOGGER.d(
+          "Creating classifier (model=%s, device=%s, numThreads=%d)", model, device, numThreads);
+      classifier = Classifier.create(this, model, device, numThreads);
+    } catch (IOException e) {
+      LOGGER.e(e, "Failed to create classifier.");
+    }
+  }
 ```
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
deleted file mode 100644
index 916639c..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
deleted file mode 100644
index 366ec83..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
deleted file mode 100644
index 360b843..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
deleted file mode 100644
index d6192ae..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
deleted file mode 100644
index 4216153..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
deleted file mode 100644
index 034eedb..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
deleted file mode 100644
index 9403953..0000000
--- a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/ios.md b/tensorflow/lite/g3doc/models/image_classification/ios.md
index 63e3abd..fde965b 100644
--- a/tensorflow/lite/g3doc/models/image_classification/ios.md
+++ b/tensorflow/lite/g3doc/models/image_classification/ios.md
@@ -1,229 +1,224 @@
-# TensorFlow Lite iOS Image Classifier App Example
+# TensorFlow Lite iOS image classification example
 
-This tutorial provides a simple iOS mobile application to classify images using
-the iOS device camera. In this tutorial, you will download the demo application
-from the Tensorflow repository, build it on your computer, and install it on
-your iOS Device. You will also learn how to customize the application to suit
-your needs.
+This document walks through the code of a simple iOS mobile application that
+demonstrates [image classification](overview.md) using the device camera.
 
-## Prerequisites
+The application code is located in the
+[TensorFlow examples](https://github.com/tensorflow/examples) repository, along
+with instructions for building and deploying the app.
 
-*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
-    a valid Apple Developer ID, and have an iOS device set up and linked to your
-    developer account with all of the appropriate certificates. For these
-    instructions, we assume that you have already been able to build and deploy
-    an app to an iOS device with your current developer environment.
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios">Example
+application</a>
 
-*   The demo app requires a camera and must be executed on a real iOS device.
-    You can build it and run with the iPhone Simulator but it won't have any
-    camera information to classify.
+## Explore the code
 
-*   You don't need to build the entire TensorFlow library to run the demo, but
-    you will need to clone the TensorFlow repository if you haven't already:
+The app is written entirely in Swift and uses the TensorFlow Lite
+[Swift library](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/swift)
+for performing image classification.
 
-        git clone https://github.com/tensorflow/tensorflow
-        cd tensorflow
+Note: Objective-C developers should use the TensorFlow Lite
+[Objective-C library](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/objc).
 
-*   You'll also need the Xcode command-line tools:
-
-        xcode-select --install
-
-    If this is a new install, you will need to run the Xcode application once to
-    agree to the license before continuing.
-
-*   Install CocoaPods if you don't have it:
-
-        sudo gem install cocoapods
-
-### Step 1. Clone the TensorFlow source code
-
-lone the GitHub repository onto your computer to get the
-demo application.
-
-```
-git clone https://github.com/tensorflow/tensorflow
-```
-
-### Step 2. Download required dependencies
-
-Execute the shell script to download the model files used by the demo app (this
-is done from inside the cloned directory):
-
-```
-    tensorflow/lite/examples/ios/download_models.sh
-```
-
-Run the following command to install TensorFlow Lite pod:
-
-```
-    cd tensorflow/lite/examples/ios/camera
-    pod install
-```
-
-If you have installed this pod before and that command doesn't work, try
-
-```
-    pod repo update
-```
-
-### Step 3. Build the XCode project
-
-Open the `tflite_camera_example.xcworkspace` project file generated in the last
-step:
-
-```
-    open tflite_camera_example.xcworkspace
-```
-
-Under `Project navigator -> tflite_camera_example -> Targets ->
-tflite_camera_example -> General` change the bundle identifier by pre-pending
-your name:
-
-![pre-pend your name to the bundle identifier](images/bundle_identifier.png)
-
-Plug in your iOS device. Note that the app must be executed with a real device with
-a camera. Select the iOS device from the drop-down menu.
-
-![Device selection](images/device_selection.png)
-
-Click the "Run" button to build and run the app
-
-![Build and execute](images/build_and_execute.png)
-
-Note that, as mentioned earlier, you must already have a device set up and linked
-to your Apple Developer account in order to deploy the app onto a device.
-
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
-
-## Understanding iOS App Code
+We're now going to walk through the most important parts of the sample code.
 
 ### Get camera input
 
-The main logic of this app is in the Objective C++ source file
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+The app's main view is represented by the `ViewController` class in
+[`ViewController.swift`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios/ImageClassification/ViewControllers/ViewController.swift),
+which we extend with functionality from the `CameraFeedManagerDelegate` protocol
+to process frames from a camera feed. To run inference on a given frame, we
+implement the `didOutput` method, which is called whenever a frame is available
+from the camera.
 
-The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
-delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
-called for every captured frame. It calls `runModelOnFrame` to run the model for
-every frame.
+Our implementation of `didOutput` includes a call to the `runModel` method of a
+`ModelDataHandler` instance. As we will see below, this class gives us access to
+the TensorFlow Lite `Interpreter` class for performing image classification.
 
-### Create an interpreter
+```swift
+extension ViewController: CameraFeedManagerDelegate {
 
-To create the interpreter, we need to load the model file. The following code
-will load a model and create an interpreter.
+  func didOutput(pixelBuffer: CVPixelBuffer) {
+    let currentTimeMs = Date().timeIntervalSince1970 * 1000
+    guard (currentTimeMs - previousInferenceTimeMs) >= delayBetweenInferencesMs else { return }
+    previousInferenceTimeMs = currentTimeMs
 
-```
-model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
-```
+    // Pass the pixel buffer to TensorFlow Lite to perform inference.
+    result = modelDataHandler?.runModel(onFrame: pixelBuffer)
 
-Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
-load times and reduce the dirty pages in memory.
-
-Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
-create the interpreter object using `InterpreterBuilder` that takes the model
-file as argument as shown below.
-
-```
-tflite::ops::builtin::BuiltinOpResolver resolver;
-tflite::InterpreterBuilder(*model, resolver)(&interpreter);
-```
-
-### Obtain the input buffer
-
-By default, the app uses a quantized model since it's smaller and faster. The
-buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
-following code obtains the input buffer from the interpreter:
-
-```
-// Get the index of first input tensor.
-int input_tensor_index = interpreter->inputs()[0];
-// Get the pointer to the input buffer.
-uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
-```
-
-Throughout this document, it's assumed that a quantized model is used.
-
-### Pre-process bitmap image
-
-The MobileNet model that we're using takes 224x224x3 inputs, where the dimensions are
-width, height, and colors (RGB). The images returned from `AVCaptureSession` is
-bigger and has 4 color channels (RGBA).
-
-Many image classification models (like MobileNet) take fixe-sized inputs. It's
-required to scale or crop the image before feeding it into the model and change
-the channels from RGBA to RGB.
-
-The code to pre-process the images is in `ProcessInputWithQuantizedModel`
-function in
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
-simple implementation for nearest neighbor color sampling and it only copies
-the first 3 bytes for each pixel.
-
-```
-void ProcessInputWithQuantizedModel(
-    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
-  for (int y = 0; y < wanted_input_height; ++y) {
-    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
-    for (int x = 0; x < wanted_input_width; ++x) {
-      const int in_x = (y * image_width) / wanted_input_width;
-      const int in_y = (x * image_height) / wanted_input_height;
-      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
-      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
-      for (int c = 0; c < wanted_input_channels; ++c) {
-        out_pixel[c] = in_pixel[c];
-      }
+    // Display results by handing off to the InferenceViewController.
+    DispatchQueue.main.async {
+      let resolution = CGSize(width: CVPixelBufferGetWidth(pixelBuffer), height: CVPixelBufferGetHeight(pixelBuffer))
+      self.inferenceViewController?.inferenceResult = self.result
+      self.inferenceViewController?.resolution = resolution
+      self.inferenceViewController?.tableView.reloadData()
     }
   }
+...
+```
+
+### ModelDataHandler
+
+The Swift class `ModelDataHandler`, defined in
+[`ModelDataHandler.swift`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios/ImageClassification/ModelDataHandler/ModelDataHandler.swift),
+handles all data preprocessing and makes calls to run inference on a given frame
+using the TensorFlow Lite [`Interpreter`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/swift/Sources/Interpreter.swift).
+It then formats the inferences obtained from invoking the `Interpreter` and
+returns the top N results for a successful inference.
+
+The following sections show how this works.
+
+#### Initialization
+
+The `init` method creates a new instance of the `Interpreter` and loads the
+specified model and labels files from the app's main bundle.
+
+```swift
+init?(modelFileInfo: FileInfo, labelsFileInfo: FileInfo, threadCount: Int = 1) {
+  let modelFilename = modelFileInfo.name
+
+  // Construct the path to the model file.
+  guard let modelPath = Bundle.main.path(
+    forResource: modelFilename,
+    ofType: modelFileInfo.extension
+  ) else {
+    print("Failed to load the model file with name: \(modelFilename).")
+    return nil
+  }
+
+  // Specify the options for the `Interpreter`.
+  self.threadCount = threadCount
+  var options = InterpreterOptions()
+  options.threadCount = threadCount
+  options.isErrorLoggingEnabled = true
+  do {
+    // Create the `Interpreter`.
+    interpreter = try Interpreter(modelPath: modelPath, options: options)
+  } catch let error {
+    print("Failed to create the interpreter with error: \(error.localizedDescription)")
+    return nil
+  }
+  // Load the classes listed in the labels file.
+  loadLabels(fileInfo: labelsFileInfo)
 }
 ```
 
-Note that the code pre-processes and prepares the model input from the camera
-data. Therefore, the first parameter `input` should be the camera buffer. The
-second parameter `output` should be the buffer of model input.
+#### Process input
 
-### Run inference and obtain output buffer
+The method `runModel` accepts a `CVPixelBuffer` of camera data, which can be
+obtained from the `didOutput` method defined in `ViewController`.
 
-After pre-processing and filling the data into the input buffer of the
-interpreter, it's really easy to run the interpreter:
+We crop the image to the size that the model was trained on. For example,
+`224x224` for the MobileNet v1 model.
 
-```
-if (interpreter->Invoke() != kTfLiteOk) {
-  NSLog("Failed to invoke!");
+The image buffer contains an encoded color for each pixel in `BGRA` format
+(where `A` represents Alpha, or transparency). Our model expects the format to
+be `RGB`, so we use the following helper method to remove the alpha component
+from the image buffer to get the `RGB` data representation:
+
+```swift
+private let alphaComponent = (baseOffset: 4, moduloRemainder: 3)
+private func rgbDataFromBuffer(
+  _ buffer: CVPixelBuffer,
+  byteCount: Int,
+  isModelQuantized: Bool
+) -> Data? {
+  CVPixelBufferLockBaseAddress(buffer, .readOnly)
+  defer { CVPixelBufferUnlockBaseAddress(buffer, .readOnly) }
+  guard let mutableRawPointer = CVPixelBufferGetBaseAddress(buffer) else {
+    return nil
+  }
+  let count = CVPixelBufferGetDataSize(buffer)
+  let bufferData = Data(bytesNoCopy: mutableRawPointer, count: count, deallocator: .none)
+  var rgbBytes = [UInt8](repeating: 0, count: byteCount)
+  var index = 0
+  for component in bufferData.enumerated() {
+    let offset = component.offset
+    let isAlphaComponent = (offset % alphaComponent.baseOffset) == alphaComponent.moduloRemainder
+    guard !isAlphaComponent else { continue }
+    rgbBytes[index] = component.element
+    index += 1
+  }
+  if isModelQuantized { return Data(bytes: rgbBytes) }
+  return Data(copyingBufferOf: rgbBytes.map { Float($0) / 255.0 })
 }
 ```
 
-The result is stored in the output tensor buffer of the interpreter. The
-following code obtains the pointer to the buffer:
+#### Run inference
 
-```
-// Get the index of first output tensor.
-const int output_tensor_index = interpreter->outputs()[0];
-// Get the pointer to the output buffer.
-uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
-```
+Here's the code for getting the `RGB` data representation of the pixel buffer,
+copying that data to the input
+[`Tensor`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/swift/Sources/Tensor.swift),
+and running inference by invoking the `Interpreter`:
 
-### Post-process values
+```swift
+let outputTensor: Tensor
+do {
+  // Allocate memory for the model's input `Tensor`s.
+  try interpreter.allocateTensors()
+  let inputTensor = try interpreter.input(at: 0)
 
-The output buffer contains an array of `uint8_t`, and the value range is from 0-255.
-We need to convert the value to float to get the probabilities with a value range from
-0.0-1.0. The formula of the quantization value mapping is:
+  // Remove the alpha component from the image buffer to get the RGB data.
+  guard let rgbData = rgbDataFromBuffer(
+    thumbnailPixelBuffer,
+    byteCount: batchSize * inputWidth * inputHeight * inputChannels,
+    isModelQuantized: inputTensor.dataType == .uInt8
+  ) else {
+    print("Failed to convert the image buffer to RGB data.")
+    return
+  }
 
-    float_value = (quantized_value - zero_point) * scale
+  // Copy the RGB data to the input `Tensor`.
+  try interpreter.copy(rgbData, toInputAt: 0)
 
-The following code converts quantized values back to float values, using the
-quantizaiton parameters in tensors:
+  // Run inference by invoking the `Interpreter`.
+  try interpreter.invoke()
 
-```
-uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
-int32_t zero_point = input_tensor->params.zero_point;
-float scale = input_tensor->params.scale;
-float output[output_size];
-for (int i = 0; i < output_size; ++i) {
-  output[i] = (quantized_output[i] - zero_point) * scale;
+  // Get the output `Tensor` to process the inference results.
+  outputTensor = try interpreter.output(at: 0)
+} catch let error {
+  print("Failed to invoke the interpreter with error: \(error.localizedDescription)")
+  return
 }
 ```
 
-Finally, we find the best set of classifications by storing them in a priority
-queue based on their confidence scores. See the `GetTopN` function in
-`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+#### Process results
+
+If the model is quantized, the output `Tensor` contains one `UInt8` value per
+class label. Dequantize the results so the values are floats, ranging from 0.0
+to 1.0, where each value represents the confidence that a label is present in
+the image:
+
+```swift
+guard let quantization = outputTensor.quantizationParameters else {
+  print("No results returned because the quantization values for the output tensor are nil.")
+  return
+}
+
+// Get the quantized results from the output tensor's `data` property.
+let quantizedResults = [UInt8](outputTensor.data)
+
+// Dequantize the results using the quantization values.
+let results = quantizedResults.map {
+  quantization.scale * Float(Int($0) - quantization.zeroPoint)
+}
+```
+
+Next, the results are sorted to get the top `N` results (where `N` is
+`resultCount`):
+
+```swift
+// Create a zipped array of tuples [(labelIndex: Int, confidence: Float)].
+let zippedResults = zip(labels.indices, results)
+
+// Sort the zipped results by confidence value in descending order.
+let sortedResults = zippedResults.sorted { $0.1 > $1.1 }.prefix(resultCount)
+
+// Get the top N `Inference` results.
+let topNInferences = sortedResults.map { result in Inference(confidence: result.1, label: labels[result.0]) }
+```
+
+### Display results
+
+The file
+[`InferenceViewController.swift`](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios/ImageClassification/ViewControllers/InferenceViewController.swift)
+defines the app's UI. A `UITableView` is used to display the results.
diff --git a/tensorflow/lite/g3doc/models/image_classification/overview.md b/tensorflow/lite/g3doc/models/image_classification/overview.md
index e02e96f..d4046c9 100644
--- a/tensorflow/lite/g3doc/models/image_classification/overview.md
+++ b/tensorflow/lite/g3doc/models/image_classification/overview.md
@@ -11,21 +11,12 @@
 by reading <a href="#what_is_image_classification">What is image
 classification?</a>
 
-If you understand image classification, you’re new to TensorFlow Lite, and
-you’re working with Android or iOS, we recommend following the corresponding
-tutorial that will walk you through our sample code.
-
-<a class="button button-primary" href="android.md">Android</a>
-<a class="button button-primary" href="ios.md">iOS</a>
-
-We also provide <a href="example_applications">example applications</a> you can
-use to get started.
+To learn how to use image classification in a mobile app, we recommend exploring
+our <a href="example_applications">Example applications and guides</a>.
 
 If you are using a platform other than Android or iOS, or you are already
-familiar with the
-<a href="https://www.tensorflow.org/api_docs/python/tf/lite">TensorFlow Lite
-APIs</a>, you can download our starter image classification model and the
-accompanying labels.
+familiar with the TensorFlow Lite APIs, you can download our starter image
+classification model and the accompanying labels.
 
 <a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_quant_and_labels.zip">Download
 starter model and labels</a>
@@ -35,16 +26,28 @@
 performance, accuracy, and model size. For guidance, see
 <a href="#choose_a_different_model">Choose a different model</a>.
 
-### Example applications
+### Example applications and guides
 
 We have example applications for image classification for both Android and iOS.
+For each example, we provide a guide that explains how it works.
 
-<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">Android
-example</a>
-<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios.md">iOS
-example</a>
+#### Android
 
-The following screenshot shows the Android image classification example:
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">View
+Android example</a>
+
+Read the [Android example guide](android.md) to learn how the app works.
+
+#### iOS
+
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios">View
+iOS example</a>
+
+Read the [iOS example guide](ios.md) to learn how the app works.
+
+#### Screenshot
+
+The following screenshot shows the Android image classification example.
 
 <img src="images/android_banana.png" alt="Screenshot of Android example" width="30%">
 
@@ -199,8 +202,8 @@
 For the following use cases, you should use a different type of model:
 
 <ul>
-  <li>Predicting the type and position of one or more objects within an image (see <a href="../object_detection/overview.md">object detection</a>)</li>
-  <li>Predicting the composition of an image, for example subject versus background (see <a href="../segmentation/overview.md">segmentation</a>)</li>
+  <li>Predicting the type and position of one or more objects within an image (see <a href="../object_detection/overview.md">Object detection</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="../segmentation/overview.md">Segmentation</a>)</li>
 </ul>
 
 Once you have the starter model running on your target device, you can
@@ -226,7 +229,7 @@
 must be faster than 33ms to perform real-time inference on a 30fps video
 stream).
 
-Our quantized Mobilenet models’ performance ranges from 3.7ms to 80.3 ms.
+Our quantized MobileNet models’ performance ranges from 3.7ms to 80.3 ms.
 
 ### Accuracy
 
@@ -240,7 +243,7 @@
 refers to how often the correct label appears in the top 5 highest probabilities
 in the model’s output.
 
-Our quantized Mobilenet models’ Top-5 accuracy ranges from 64.4 to 89.9%.
+Our quantized MobileNet models’ Top-5 accuracy ranges from 64.4 to 89.9%.
 
 ### Size
 
@@ -248,13 +251,13 @@
 be important for mobile development (where it might impact app download sizes)
 or when working with hardware (where available storage might be limited).
 
-Our quantized Mobilenet models’ size ranges from 0.5 to 3.4 Mb.
+Our quantized MobileNet models’ size ranges from 0.5 to 3.4 MB.
 
 ### Architecture
 
 There are several different architectures of models available on
 <a href="../../guide/hosted_models.md">List of hosted models</a>, indicated by
-the model’s name. For example, you can choose between Mobilenet, Inception, and
+the model’s name. For example, you can choose between MobileNet, Inception, and
 others.
 
 The architecture of a model impacts its performance, accuracy, and size. All of
@@ -277,5 +280,5 @@
 images for each of the new labels you wish to train.
 
 Learn how to perform transfer learning in the
-<a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/">TensorFlow
-for Poets</a> codelab.
+<a href="https://codelabs.developers.google.com/codelabs/recognize-flowers-with-tensorflow-on-android/#0">Recognize
+flowers with TensorFlow</a> codelab.
diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md
index 20c359e..b2363ad 100644
--- a/tensorflow/lite/g3doc/models/smart_reply/overview.md
+++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md
@@ -13,12 +13,15 @@
 
 ### Sample application
 
-We have provided a pre-built APK that demonstrates the smart reply model on
-Android.
+There is a TensorFlow Lite sample application that demonstrates the smart reply
+model on Android.
 
-Go to the
-<a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc">GitHub
-page</a> for instructions and list of supported ops and functionalities.
+<a class="button button-primary" href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply">View
+Android example</a>
+
+Read the
+[GitHub page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc)
+to learn how the app works.
 
 ## How it works
 
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
index e4c108e..8798b49 100644
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -44,7 +44,7 @@
 dependencies {
     ...
     implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
-    implementation 'org.tensorflow:tensorflow-lite:0.0.0-gpu-nightly'
+    implementation 'org.tensorflow:tensorflow-lite-gpu:0.0.0-nightly'
 }
 ```
 
@@ -123,14 +123,13 @@
 
 ### Android
 
-Look at the demo to see how to add the
-delegate. In your application, add the AAR as above, import
-`org.tensorflow.lite.experimental.GpuDelegate` module, and use the`addDelegate`
-function to register the GPU delegate to the interpreter:
+Look at the demo to see how to add the delegate. In your application, add the
+AAR as above, import `org.tensorflow.lite.gpu.GpuDelegate` module, and use
+the `addDelegate` function to register the GPU delegate to the interpreter:
 
 ```java
 import org.tensorflow.lite.Interpreter;
-import org.tensorflow.lite.experimental.GpuDelegate;
+import org.tensorflow.lite.gpu.GpuDelegate;
 
 // Initialize interpreter with GPU delegate
 GpuDelegate delegate = new GpuDelegate();
diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
index 6274948..9f47c2e 100644
--- a/tensorflow/lite/g3doc/performance/gpu_advanced.md
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -57,7 +57,7 @@
 
 ## Basic Usage
 
-### Android
+### Android (Java)
 
 Run TensorFlow Lite on GPU with `TfLiteDelegate`. In Java, you can specify the
 GpuDelegate through `Interpreter.Options`.
@@ -79,7 +79,50 @@
 delegate.close();
 ```
 
-### iOS
+### Android (C/C++)
+
+For C/C++ usage of TensorFlow Lite GPU on Android, the GPU delegate can be
+created with `TfLiteGpuDelegateCreate()` and destroyed with
+`TfLiteGpuDelegateDelete()`.
+
+```c++
+// Set up interpreter.
+auto model = FlatBufferModel::BuildFromFile(model_path);
+if (!model) return false;
+ops::builtin::BuiltinOpResolver op_resolver;
+std::unique_ptr<Interpreter> interpreter;
+InterpreterBuilder(*model, op_resolver)(&interpreter);
+
+// NEW: Prepare GPU delegate.
+const TfLiteGpuDelegateOptions options = {
+  .metadata = NULL,
+  .compile_options = {
+    .precision_loss_allowed = 1,  // FP16
+    .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST,
+    .dynamic_batch_enabled = 0,   // Not fully functional yet
+  },
+};
+auto* delegate = TfLiteGpuDelegateCreate(&options);
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference.
+WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
+if (interpreter->Invoke() != kTfLiteOk) return false;
+ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
+
+// NEW: Clean up.
+TfLiteGpuDelegateDelete(delegate);
+```
+
+TFLite GPU for Android C/C++ uses the [Bazel](https://bazel.io) build system.
+The delegate can be built, for example, using the following command:
+
+```sh
+bazel build -c opt --config android_arm64 tensorflow/lite/delegates/gpu:gl_delegate                  # for static library
+bazel build -c opt --config android_arm64 tensorflow/lite/delegates/gpu:libtensorflowlite_gpu_gl.so  # for dynamic library
+```
+
+### iOS (ObjC++)
 
 To use TensorFlow Lite on GPU, get the GPU delegate via `NewGpuDelegate()` and
 then pass it to `Interpreter::ModifyGraphWithDelegate()` (instead of calling
diff --git a/tensorflow/lite/g3doc/r2/convert/cmdline.md b/tensorflow/lite/g3doc/r2/convert/cmdline.md
new file mode 100644
index 0000000..3155ee5
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/cmdline.md
@@ -0,0 +1,47 @@
+# Converter command line reference
+
+This page describes how to use the [TensorFlow Lite converter](index.md) using
+the command line tool in TensorFlow 2.0. The preferred approach for conversion
+is using the [Python API](python_api.md).
+
+[TOC]
+
+## High-level overview
+
+The TensorFlow Lite Converter has a command line tool `tflite_convert` which
+supports basic models. Use the `TFLiteConverter` [Python API](python_api.md) for
+any conversions involving quantization or any additional parameters (e.g.
+signatures in SavedModels or custom objects in Keras models).
+
+## Usage
+
+The following flags specify the input and output files.
+
+*   `--output_file`. Type: string. Specifies the full path of the output file.
+*   `--saved_model_dir`. Type: string. Specifies the full path to the directory
+    containing the SavedModel generated in 1.X or 2.0.
+*   `--keras_model_file`. Type: string. Specifies the full path of the HDF5 file
+    containing the `tf.keras` model generated in 1.X or 2.0.
+
+The following is an example usage.
+
+```
+tflite_convert \
+  --saved_model_dir=/tmp/mobilenet_saved_model \
+  --output_file=/tmp/mobilenet.tflite
+```
+
+## Additional instructions
+
+### Building from source
+
+To run the latest version of the TensorFlow Lite Converter, either
+install the nightly build using [pip](https://www.tensorflow.org/install/pip) or
+[clone the TensorFlow repository](https://www.tensorflow.org/install/source) and
+use `bazel`. An example can be seen below.
+
+```
+bazel run //tensorflow/lite/python:tflite_convert -- \
+  --saved_model_dir=/tmp/mobilenet_saved_model \
+  --output_file=/tmp/mobilenet.tflite
+```
diff --git a/tensorflow/lite/g3doc/r2/convert/index.md b/tensorflow/lite/g3doc/r2/convert/index.md
index bb9a56e..15755dd 100644
--- a/tensorflow/lite/g3doc/r2/convert/index.md
+++ b/tensorflow/lite/g3doc/r2/convert/index.md
@@ -1,8 +1,11 @@
 # TensorFlow Lite converter
 
-The TensorFlow Lite converter takes a TensorFlow model represented as a
-[concrete function](concrete_function.md), and generates a TensorFlow Lite
-[`FlatBuffer`](https://google.github.io/flatbuffers/) file (`.tflite`).
+The TensorFlow Lite converter takes a TensorFlow model and generates a
+TensorFlow Lite [`FlatBuffer`](https://google.github.io/flatbuffers/) file
+(`.tflite`). The converter supports
+[SavedModel directories](https://www.tensorflow.org/alpha/guide/saved_model),
+[`tf.keras` models](https://www.tensorflow.org/alpha/guide/keras/overview), and
+[concrete functions](concrete_function.md).
 
 Note: This page contains documentation on the converter API for TensorFlow 2.0.
 The API for TensorFlow 1.X is available
@@ -18,7 +21,8 @@
 
 ## Converting models
 
-The TensorFlow Lite converter can be used from the [Python API](python_api.md).
-Using the Python API makes it easier to convert models as part of a model
-development pipeline and helps mitigate
+The TensorFlow Lite converter should be used from the
+[Python API](python_api.md). Using the Python API makes it easier to convert
+models as part of a model development pipeline and helps mitigate
 [compatibility](../../guide/ops_compatibility.md) issues early on.
+Alternatively, the [command line tool](cmdline.md) supports basic models.
diff --git a/tensorflow/lite/g3doc/r2/convert/python_api.md b/tensorflow/lite/g3doc/r2/convert/python_api.md
index 078d5d0..0fed408 100644
--- a/tensorflow/lite/g3doc/r2/convert/python_api.md
+++ b/tensorflow/lite/g3doc/r2/convert/python_api.md
@@ -18,6 +18,12 @@
 *   `TFLiteConverter.from_concrete_functions()`: Converts
     [concrete functions](concrete_function.md).
 
+Note: The TensorFlow Lite 2.0 alpha had a different version of the
+`TFLiteConverter` API which only contained the classmethod
+[`from_concrete_function`](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/lite/TFLiteConverter#from_concrete_function).
+The API detailed in this document can be installed using the
+[`tf-nightly-2.0-preview`](#2.0-nightly) pip install.
+
 This document contains [example usages](#examples) of the API, a detailed list
 of [changes in the API between 1.X and 2.0](#differences), and
 [instructions](#versioning) on running the different versions of TensorFlow.
@@ -144,13 +150,13 @@
   np.testing.assert_almost_equal(tf_result, tflite_result, decimal=5)
 ```
 
-## Summary of changes in `TFLiteConverter` between 1.X and 2.0 <a name="differences"></a>
+## Summary of changes in Python API between 1.X and 2.0 <a name="differences"></a>
 
-The following section summarizes the changes in `TFLiteConverter` from 1.X to
-2.0. If any of the changes raise concerns, please file a
+The following section summarizes the changes in the Python API from 1.X to 2.0.
+If any of the changes raise concerns, please file a
 [GitHub issue](https://github.com/tensorflow/tensorflow/issues).
 
-### Supported formats
+### Formats supported by `TFLiteConverter`
 
 `TFLiteConverter` in 2.0 supports SavedModels and Keras model files generated in
 both 1.X and 2.0. However, the conversion process no longer supports frozen
@@ -180,7 +186,7 @@
 want to convert models generated by the rewriter function can use
 `tf.compat.v1.TFLiteConverter`.
 
-### Changes to attributes
+### Changes to `TFLiteConverter` attributes
 
 The `target_ops` attribute has become an attribute of `TargetSpec` and renamed
 to `supported_ops` in line with future additions to the optimization framework.
@@ -199,7 +205,9 @@
     *   `dump_graphviz_dir`
     *   `dump_graphviz_video`
 
-### Deprecated APIs
+### General API changes
+
+#### Conversion methods
 
 The following methods that were previously deprecated in 1.X will no longer be
 exported in 2.0:
@@ -207,6 +215,29 @@
 *   `lite.toco_convert`
 *   `lite.TocoConverter`
 
+#### `lite.constants`
+
+The `lite.constants` API was removed in 2.0 in order to decrease duplication
+between TensorFlow and TensorFlow Lite. The following list maps the
+`lite.constants` type to the TensorFlow type:
+
+*   `lite.constants.FLOAT`: `tf.float32`
+*   `lite.constants.INT8`: `tf.int8`
+*   `lite.constants.INT32`: `tf.int32`
+*   `lite.constants.INT64`: `tf.int64`
+*   `lite.constants.STRING`: `tf.string`
+*   `lite.constants.QUANTIZED_UINT8`: `tf.uint8`
+
+Additionally, `lite.constants.TFLITE` and `lite.constants.GRAPHVIZ_DOT` were
+removed due to the deprecation of the `output_format` flag in `TFLiteConverter`.
+
+#### `lite.OpHint`
+
+The `OpHint` API is currently not available in 2.0 due to an incompatibility
+with the 2.0 APIs. This API enables conversion of LSTM based models. Support for
+LSTMs in 2.0 is being investigated. All related `lite.experimental` APIs have
+been removed due to this issue.
+
 ## Installing TensorFlow <a name="versioning"></a>
 
 ### Installing the TensorFlow 2.0 nightly <a name="2.0-nightly"></a>
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index f6f7972..9edef37 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -26,11 +26,14 @@
 #include "tensorflow/lite/graph_info.h"
 #include "tensorflow/lite/memory_planner.h"
 #include "tensorflow/lite/minimal_logging.h"
-#include "tensorflow/lite/nnapi_delegate.h"
-#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/util.h"
 
+// TODO(b/132087118): move static_assert to c_api_internal when compiled with
+// C++.
+static_assert(sizeof(TfLiteFloat16) == sizeof(uint16_t),
+              "Float 16 type must be 16 bits.");
+
 namespace tflite {
 
 namespace {
@@ -257,11 +260,11 @@
   return kTfLiteOk;
 }
 
-void Interpreter::SetProfiler(profiling::Profiler* profiler) {
+void Interpreter::SetProfiler(Profiler* profiler) {
   for (auto& subgraph : subgraphs_) subgraph->SetProfiler(profiler);
 }
 
-profiling::Profiler* Interpreter::GetProfiler() {
+Profiler* Interpreter::GetProfiler() {
   return primary_subgraph().GetProfiler();
 }
 
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 806b66c..2d72eea 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -25,9 +25,9 @@
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/profiler.h"
 #include "tensorflow/lite/core/subgraph.h"
 #include "tensorflow/lite/memory_planner.h"
-#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/stderr_reporter.h"
 
 namespace tflite {
@@ -74,6 +74,10 @@
   return kTfLiteString;
 }
 
+template <>
+constexpr TfLiteType typeToTfLiteType<TfLiteFloat16>() {
+  return kTfLiteFloat16;
+}
 // An interpreter for a graph of nodes that input and output from tensors.
 // Each node of the graph processes a set of input tensors and produces a
 // set of output Tensors. All inputs/output tensors are referenced by index.
@@ -402,9 +406,14 @@
                                TfLiteBufferHandle* buffer_handle,
                                TfLiteDelegate** delegate);
 
-  void SetProfiler(profiling::Profiler* profiler);
+  // Sets the profiler used to trace execution. The caller retains ownership
+  // of the profiler and must ensure its validity.
+  // WARNING: This is an experimental API and subject to change.
+  void SetProfiler(Profiler* profiler);
 
-  profiling::Profiler* GetProfiler();
+  // Gets the profiler used for op tracing.
+  // WARNING: This is an experimental API and subject to change.
+  Profiler* GetProfiler();
 
   // The default capacity of `tensors_` vector.
   static constexpr int kTensorsReservedCapacity = 128;
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index fe3d6dd..0c0c32b 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -14,8 +14,10 @@
 ==============================================================================*/
 
 #include "tensorflow/lite/interpreter.h"
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -164,7 +166,7 @@
   } cases[] = {
       {kTfLiteFloat32, sizeof(float)}, {kTfLiteInt32, sizeof(int32_t)},
       {kTfLiteUInt8, sizeof(uint8_t)}, {kTfLiteInt64, sizeof(int64_t)},
-      {kTfLiteInt16, sizeof(int16_t)},
+      {kTfLiteInt16, sizeof(int16_t)}, {kTfLiteFloat16, sizeof(TfLiteFloat16)},
   };
 
   for (auto test : cases) {
@@ -237,6 +239,8 @@
   const uint8_t uint8s[] = {3, 4};
   const int64_t int64s[] = {6, -7};
   const int16_t int16s[] = {8, -9};
+  const Eigen::half float16s[] = {Eigen::half_impl::float_to_half_rtne(-3.f),
+                                  Eigen::half_impl::float_to_half_rtne(-4.f)};
 
   struct {
     TfLiteType type;
@@ -248,6 +252,8 @@
       {kTfLiteUInt8, sizeof(uint8_t), reinterpret_cast<const char*>(uint8s)},
       {kTfLiteInt64, sizeof(int64_t), reinterpret_cast<const char*>(int64s)},
       {kTfLiteInt16, sizeof(int16_t), reinterpret_cast<const char*>(int16s)},
+      {kTfLiteFloat16, sizeof(TfLiteFloat16),
+       reinterpret_cast<const char*>(float16s)},
   };
 
   for (auto test : cases) {
@@ -282,10 +288,8 @@
 TEST(BasicInterpreter, CheckAlignment) {
   struct {
     TfLiteType type;
-  } cases[] = {
-      {kTfLiteFloat32}, {kTfLiteInt32}, {kTfLiteUInt8},
-      {kTfLiteInt64},   {kTfLiteInt16},
-  };
+  } cases[] = {{kTfLiteFloat32}, {kTfLiteInt32}, {kTfLiteUInt8},
+               {kTfLiteInt64},   {kTfLiteInt16}, {kTfLiteFloat16}};
 
   for (auto test : cases) {
     Interpreter interpreter;
@@ -733,6 +737,17 @@
   ASSERT_EQ(reporter.num_calls(), 1);
 }
 
+TEST(BasicInterpreter, TestUseNNAPI) {
+  TestErrorReporter reporter;
+  Interpreter interpreter(&reporter);
+  interpreter.UseNNAPI(true);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+  interpreter.UseNNAPI(false);
+  ASSERT_EQ(reporter.error_messages(),
+            "Attempting to disable NNAPI delegate after it's applied.");
+}
+
 TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
   Interpreter interpreter;
   ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index e9b83cf..4bab056 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -13,7 +13,12 @@
 
 JAVA_SRCS = glob([
     "src/main/java/org/tensorflow/lite/*.java",
-])
+]) + select({
+    "//tensorflow:android": [
+        "//tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi:nnapi_delegate_src",
+    ],
+    "//conditions:default": [],
+})
 
 # Building tensorflow-lite.aar including 4 variants of .so
 # To build an aar for release, run below command:
@@ -66,7 +71,7 @@
 # also include the core `tensorflowlite` runtime.
 android_library(
     name = "tensorflowlite_gpu",
-    srcs = ["//tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/experimental:gpu_delegate"],
+    srcs = ["//tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu:gpu_delegate"],
     manifest = "AndroidManifest.xml",
     visibility = ["//visibility:public"],
     deps = [
@@ -261,6 +266,7 @@
 tflite_jni_binary(
     name = "libtensorflowlite_jni.so",
     deps = [
+        "//tensorflow/lite/delegates/nnapi/java/src/main/native",
         "//tensorflow/lite/java/src/main/native",
     ],
 )
@@ -270,6 +276,7 @@
     name = "libtensorflowlite_flex_jni.so",
     deps = [
         "//tensorflow/lite/delegates/flex:delegate",
+        "//tensorflow/lite/delegates/nnapi/java/src/main/native",
         "//tensorflow/lite/java/src/main/native",
         "//tensorflow/lite/java/src/main/native:init_tensorflow",
     ],
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index 8ea16a3..c353b2c 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -54,6 +54,7 @@
 
     // Build off of nightly TensorFlow Lite
     implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite-gpu:0.0.0-nightly'
     // Use local TensorFlow library
     // implementation 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index c6f315b..5330657 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -368,10 +368,7 @@
       classifier.setNumThreads(numThreads);
       if (device.equals(cpu)) {
       } else if (device.equals(gpu)) {
-        if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
-          showToast("gpu not in this build.");
-          classifier = null;
-        } else if (model.equals(mobilenetV1Quant)) {
+        if (model.equals(mobilenetV1Quant)) {
           showToast("gpu requires float model.");
           classifier = null;
         } else {
@@ -405,9 +402,7 @@
     // Build list of devices
     int defaultModelIndex = 0;
     deviceStrings.add(cpu);
-    if (GpuDelegateHelper.isGpuDelegateAvailable()) {
-      deviceStrings.add(gpu);
-    }
+    deviceStrings.add(gpu);
     deviceStrings.add(nnApi);
 
     deviceView.setAdapter(
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
deleted file mode 100644
index 8dca177..0000000
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package com.example.android.tflitecamerademo;
-
-import org.tensorflow.lite.Delegate;
-
-/**
- * Helper class for {@code GpuDelegate}.
- *
- * <p>WARNING: This is an experimental API and subject to change.
- */
-public class GpuDelegateHelper {
-  private GpuDelegateHelper() {}
-
-  /** Checks whether {@code GpuDelegate} is available. */
-  public static boolean isGpuDelegateAvailable() {
-    try {
-      Class.forName("org.tensorflow.lite.experimental.GpuDelegate");
-      return true;
-    } catch (Exception e) {
-      return false;
-    }
-  }
-
-  /** Returns an instance of {@code GpuDelegate} if available. */
-  public static Delegate createGpuDelegate() {
-    try {
-      return Class.forName("org.tensorflow.lite.experimental.GpuDelegate")
-          .asSubclass(Delegate.class)
-          .getDeclaredConstructor()
-          .newInstance();
-    } catch (Exception e) {
-      throw new IllegalStateException(e);
-    }
-  }
-}
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 512f8b6..2e483d8 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -38,8 +38,9 @@
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
-import org.tensorflow.lite.Delegate;
 import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.gpu.GpuDelegate;
+import org.tensorflow.lite.nnapi.NnApiDelegate;
 
 /**
  * Classifies images with Tensorflow Lite.
@@ -95,7 +96,9 @@
           });
 
   /** holds a gpu delegate */
-  Delegate gpuDelegate = null;
+  GpuDelegate gpuDelegate = null;
+  /** holds an nnapi delegate */
+  NnApiDelegate nnapiDelegate = null;
 
   /** Initializes an {@code ImageClassifier}. */
   ImageClassifier(Activity activity) throws IOException {
@@ -163,27 +166,25 @@
   private void recreateInterpreter() {
     if (tflite != null) {
       tflite.close();
-      // TODO(b/120679982)
-      // gpuDelegate.close();
       tflite = new Interpreter(tfliteModel, tfliteOptions);
     }
   }
 
   public void useGpu() {
-    if (gpuDelegate == null && GpuDelegateHelper.isGpuDelegateAvailable()) {
-      gpuDelegate = GpuDelegateHelper.createGpuDelegate();
+    if (gpuDelegate == null) {
+      gpuDelegate = new GpuDelegate();
       tfliteOptions.addDelegate(gpuDelegate);
       recreateInterpreter();
     }
   }
 
   public void useCPU() {
-    tfliteOptions.setUseNNAPI(false);
     recreateInterpreter();
   }
 
   public void useNNAPI() {
-    tfliteOptions.setUseNNAPI(true);
+    nnapiDelegate = new NnApiDelegate();
+    tfliteOptions.addDelegate(nnapiDelegate);
     recreateInterpreter();
   }
 
@@ -196,6 +197,14 @@
   public void close() {
     tflite.close();
     tflite = null;
+    if (gpuDelegate != null) {
+      gpuDelegate.close();
+      gpuDelegate = null;
+    }
+    if (nnapiDelegate != null) {
+      nnapiDelegate.close();
+      nnapiDelegate = null;
+    }
     tfliteModel = null;
   }
 
diff --git a/tensorflow/lite/java/ovic/README.md b/tensorflow/lite/java/ovic/README.md
index b7bf658..0d7d07e 100644
--- a/tensorflow/lite/java/ovic/README.md
+++ b/tensorflow/lite/java/ovic/README.md
@@ -1,6 +1,6 @@
-# OVIC Benchmarker for NIPS 2018
+# OVIC Benchmarker for CVPR 2019
 
-This folder contains the SDK for track one of the [Low Power ImageNet Recognition Challenge workshop at NIPS 2018.](https://lpirc.ecn.purdue.edu/)
+This folder contains the SDK for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2019.](https://lpirc.ecn.purdue.edu/)
 
 ## Pre-requisite
 
@@ -191,3 +191,26 @@
 
 The detection models above are both single-shot models (i.e. no object proposal generation) using TfLite's *fast* version of Non-Max-Suppression (NMS). The fast NMS is significant faster than the regular NMS (used by the ObjectDetectionAPI in training) at the expense of about 1% mAP for the listed models.
 
+
+### Latency table
+
+We have compiled a latency table for common neural network operators such as
+convolutions, separable convolutions, and matrix multiplications.
+The table of results is available here:
+
+* https://storage.cloud.google.com/ovic-data/latency_table.csv
+
+The results were generated by creating a small network containing a single
+operation, and running the op under the test harness. For more details see the
+NetAdapt paper<sup>1</sup>. We plan to expand the table regularly as we test with
+newer OS releases and updates to Tensorflow Lite.
+
+### References
+
+1. **NetAdapt: Platform-Aware Neural Network Adaptation for Mobile
+   Applications**<br />
+   Yang, Tien-Ju, Andrew Howard, Bo Chen, Xiao Zhang, Alec Go, Mark Sandler,
+   Vivienne Sze, and Hartwig Adam. In Proceedings of the European Conference
+   on Computer Vision (ECCV), pp. 285-300. 2018<br />
+  [[link]](https://arxiv.org/abs/1804.03230) arXiv:1804.03230, 2018.
+
diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index e184b8f..e28e6a6 100644
--- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -125,6 +125,15 @@
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
+ *  Signature: (JZ)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_allowBufferHandleOutput(
+    JNIEnv* env, jclass clazz, jlong handle, jboolean allow);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
  *  Signature: (JI)V
  */
 JNIEXPORT void JNICALL
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index ff33256..4aabcd8 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -61,7 +61,13 @@
   @Test
   public void testInterpreterWithOptions() throws Exception {
     Interpreter interpreter =
-        new Interpreter(MODEL_FILE, new Interpreter.Options().setNumThreads(2).setUseNNAPI(true));
+        new Interpreter(
+            MODEL_FILE,
+            new Interpreter.Options()
+                .setNumThreads(2)
+                .setUseNNAPI(true)
+                .setAllowFp16PrecisionForFp32(false)
+                .setAllowBufferHandleOutput(false));
     assertThat(interpreter).isNotNull();
     assertThat(interpreter.getInputTensorCount()).isEqualTo(1);
     assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 2eb2d7f..c2e923c 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -8,6 +8,13 @@
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.bzl", "tf_opts_nortti_if_android")
 
+# Enables usage of ruy in TF Lite kernels.
+# WARNING: This build flag is experimental and subject to change.
+config_setting(
+    name = "tflite_with_ruy",
+    define_values = {"tflite_with_ruy": "true"},
+)
+
 # Suppress warnings that are introduced by Eigen Tensor.
 EXTRA_EIGEN_COPTS = select({
     "//tensorflow:ios": [
@@ -45,6 +52,7 @@
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "//tensorflow/lite/tools/optimize:quantization_utils",
         "@com_google_googletest//:gtest",
@@ -80,15 +88,118 @@
 )
 
 cc_library(
-    name = "gemmlowp_support",
+    name = "cpu_backend_context",
     srcs = [
-        "gemmlowp_support.cc",
+        "cpu_backend_context.cc",
     ],
     hdrs = [
-        "gemmlowp_support.h",
+        "cpu_backend_context.h",
+    ],
+    copts = tflite_copts(),
+    defines = select({
+        "//tensorflow/lite/kernels:tflite_with_ruy": [
+            "TFLITE_WITH_RUY",
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":op_macros",
+        # For now this unconditionally depends on both ruy and gemmlowp.
+        # See the comment inside class CpuBackendContext on the
+        # gemmlowp_context_ and ruy_context_ members.
+        "//tensorflow/lite/experimental/ruy:context",
+        "@gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "cpu_backend_threadpool",
+    hdrs = [
+        "cpu_backend_threadpool.h",
     ],
     copts = tflite_copts(),
     deps = [
+        "//tensorflow/lite/kernels/internal:types",
+        ":cpu_backend_context",
+        # For now this unconditionally depends on both ruy and gemmlowp.
+        # We only need to depend on gemmlowp when tflite_with_ruy
+        # is false, but putting these dependencies in a select() seems to
+        # defeat copybara's rewriting rules.
+        "//tensorflow/lite/experimental/ruy:context",
+        "//tensorflow/lite/experimental/ruy:thread_pool",
+        "@gemmlowp",
+    ],
+)
+
+cc_test(
+    name = "cpu_backend_threadpool_test",
+    srcs = ["cpu_backend_threadpool_test.cc"],
+    deps = [
+        ":cpu_backend_context",
+        ":cpu_backend_threadpool",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "cpu_backend_gemm",
+    srcs = [
+        "cpu_backend_gemm_ruy.h",
+        "cpu_backend_gemm_custom_gemv.h",
+    ] + select({
+        "//tensorflow/lite/kernels:tflite_with_ruy": [],
+        "//conditions:default": [
+            "cpu_backend_gemm_gemmlowp.h",
+            "cpu_backend_gemm_eigen.h",
+            "cpu_backend_gemm_eigen.cc",
+        ],
+    }),
+    hdrs = [
+        "cpu_backend_gemm.h",
+        "cpu_backend_gemm_params.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/kernels/internal:types",
+        "//tensorflow/lite/kernels/internal:common",
+        ":cpu_backend_context",
+        ":cpu_backend_threadpool",
+        # Depend on ruy regardless of `tflite_with_ruy`. See the comment in
+        # cpu_backend_gemm.h about why ruy is the generic path.
+        "//tensorflow/lite/experimental/ruy",
+        # We only need to depend on gemmlowp and Eigen when tflite_with_ruy
+        # is false, but putting these dependencies in a select() seems to
+        # defeat copybara's rewriting rules.
+        "@gemmlowp",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_test(
+    name = "cpu_backend_gemm_test",
+    srcs = ["cpu_backend_gemm_test.cc"],
+    tags = ["notsan"],
+    deps = [
+        ":cpu_backend_context",
+        ":cpu_backend_gemm",
+        "@com_google_googletest//:gtest",
+        # ruy's reference path provides the reference implementation
+        # that this test compares against.
+        "//tensorflow/lite/experimental/ruy",
+    ],
+)
+
+cc_library(
+    name = "cpu_backend_support",
+    srcs = [
+        "cpu_backend_support.cc",
+    ],
+    hdrs = [
+        "cpu_backend_support.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":cpu_backend_context",
         ":op_macros",
         "//tensorflow/lite/c:c_api_internal",
         "@gemmlowp",
@@ -220,6 +331,7 @@
         "resize_nearest_neighbor.cc",
         "reverse.cc",
         "reverse_sequence.cc",
+        "round.cc",
         "select.cc",
         "shape.cc",
         "skip_gram.cc",
@@ -252,8 +364,8 @@
     visibility = ["//visibility:private"],
     deps = [
         ":activation_functor",
+        ":cpu_backend_support",
         ":eigen_support",
-        ":gemmlowp_support",
         ":kernel_util",
         ":lstm_eval",
         ":op_macros",
@@ -279,11 +391,13 @@
     srcs = ["lstm_eval.cc"],
     hdrs = ["lstm_eval.h"],
     deps = [
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels/internal:kernel_utils",
         "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//third_party/eigen3",
+        "@gemmlowp",
     ],
 )
 
@@ -579,6 +693,21 @@
 )
 
 cc_test(
+    name = "round_test",
+    size = "small",
+    srcs = ["round_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "elementwise_test",
     size = "small",
     srcs = ["elementwise_test.cc"],
@@ -803,8 +932,9 @@
     srcs = ["embedding_lookup_test.cc"],
     deps = [
         ":builtin_ops",
+        ":test_util",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:tensor",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 920fbe3..db9433a 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -54,6 +54,12 @@
   int32_t reverse_scaling_right_shift = 0;
 };
 
+struct LeakyReluOpData : public OpData {
+  uint8_t q_alpha = 0;  // Alpha quantized with the input's scale/zero-point.
+  int32_t output_multiplier = 0;
+  int output_shift = 0;
+};
+
 struct PreluOpData : public OpData {
   int32_t output_multiplier = 0;
   int output_shift = 0;
@@ -112,6 +118,42 @@
                                TfLiteIntArrayCopy(input->dims));
 }
 
+void* LeakyReluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  return new LeakyReluOpData;
+}
+
+void LeakyReluFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<LeakyReluOpData*>(buffer);
+}
+
+TfLiteStatus LeakyReluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  LeakyReluOpData* data = reinterpret_cast<LeakyReluOpData*>(node->user_data);
+
+  if (output->type == kTfLiteUInt8) {
+    const auto* params =
+        reinterpret_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+    // Quantize alpha using the same zero-point and scale as the input.
+    data->q_alpha = static_cast<uint8_t>(std::max<float>(
+        std::numeric_limits<uint8_t>::min(),
+        std::min<float>(std::numeric_limits<uint8_t>::max(),
+                        std::round(input->params.zero_point +
+                                   (params->alpha / input->params.scale)))));
+
+    double real_multiplier =
+        input->params.scale * input->params.scale / output->params.scale;
+    QuantizeMultiplierSmallerThanOneExp(
+        real_multiplier, &data->output_multiplier, &data->output_shift);
+  }
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
@@ -345,6 +387,32 @@
   }
 }
 
+namespace {
+template <typename T>
+void QuantizedRelu1(const TfLiteTensor* input, TfLiteTensor* output) {
+  ActivationParams params;
+  int32 kMin = -1;
+  int32 kMax = 1;
+  params.activation_type = FusedActivationFunctionType::kRelu1;
+
+  // Relu1 has a lower bound of -1, which must be quantized.
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(kMin / output->params.scale)));
+
+  // Relu1 has an upper bound of 1, which must be quantized.
+  params.quantized_activation_max =
+      std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(kMax / output->params.scale)));
+
+  // Reuse the optimized function written for ReluX.
+  optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
+                       GetTensorShape(output), GetTensorData<T>(output));
+}
+}  // namespace
+
 TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -355,9 +423,18 @@
                            GetTensorData<float>(output));
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      QuantizedRelu1<uint8_t>(input, output);
+      return kTfLiteOk;
+    } break;
+    case kTfLiteInt8: {
+      QuantizedRelu1<int8_t>(input, output);
+      return kTfLiteOk;
+    } break;
     default:
       context->ReportError(context,
-                           "Only float32 is supported currently, got %s.",
+                           "Only float32, uint8, int8 supported "
+                           "currently, got %s.",
                            TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
@@ -925,11 +1002,31 @@
   }
 }
 
+namespace {
+template <typename T>
+void QLeakyRelu(const TfLiteTensor* input, TfLiteTensor* output, float alpha,
+                const LeakyReluOpData* data) {
+  LeakyReluParams op_params;
+  op_params.input_offset = input->params.zero_point;
+  op_params.alpha_offset = input->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+
+  reference_ops::QuantizeLeakyRelu(
+      op_params, data->q_alpha, GetTensorShape(input), GetTensorData<T>(input),
+      GetTensorShape(output), GetTensorData<T>(output));
+}
+}  // namespace
+
 TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   const auto* params =
       reinterpret_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+  const LeakyReluOpData* data =
+      reinterpret_cast<LeakyReluOpData*>(node->user_data);
 
   LeakyReluParams op_params;
   op_params.alpha = params->alpha;
@@ -940,10 +1037,14 @@
           GetTensorShape(output), GetTensorData<float>(output));
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      QLeakyRelu<uint8_t>(input, output, params->alpha, data);
+      return kTfLiteOk;
+    } break;
     default:
-      context->ReportError(context,
-                           "Only float32 is supported currently, got %s.",
-                           TfLiteTypeGetName(input->type));
+      context->ReportError(
+          context, "Only float32 and uint8 is supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -1054,9 +1155,9 @@
 }
 
 TfLiteRegistration* Register_LEAKY_RELU() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 activations::GenericPrepare,
-                                 activations::LeakyReluEval};
+  static TfLiteRegistration r = {
+      activations::LeakyReluInit, activations::LeakyReluFree,
+      activations::LeakyReluPrepare, activations::LeakyReluEval};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 25b17a9..923f7d1 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
@@ -56,6 +57,21 @@
     BuildInterpreter({GetShape(input_)});
   }
 
+  // A dedicated constructor for LeakyRelu, which takes the alpha option.
+  BaseActivationsOpModel(TensorData input, float alpha) {
+    input_ = AddInput(input);
+    if (input.type == TensorType_UINT8) {
+      output_ = AddOutput({input.type, {}, input.min, input.max});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({TensorType_INT8, {}, input.min, input.max});
+    } else {
+      output_ = AddOutput({input.type, {}});
+    }
+    SetBuiltinOp(BuiltinOperator_LEAKY_RELU, BuiltinOptions_LeakyReluOptions,
+                 CreateLeakyReluOptions(builder_, alpha).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
   BaseActivationsOpModel(BuiltinOperator type, const TensorData& input,
                          const TensorData& output) {
     input_ = AddInput(input);
@@ -111,6 +127,7 @@
   std::vector<T> GetOutput() {
     return ExtractVector<T>(output_);
   }
+
   template <typename T>
   std::vector<float> GetDequantizedOutput() {
     return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
@@ -211,6 +228,80 @@
               ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
 }
 
+TEST(QuantizedActivationsOpTest, LeakyReluUint8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      /*input=*/{TensorType_UINT8, {2, 3}, 8 * kMin, 8 * kMax}, 0.5);
+
+  m.SetInput<uint8_t>({
+      0.0f, 1.0f, 3.0f,    // Row 1
+      1.0f, -1.0f, -2.0f,  // Row 2
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0f, 1.0f, 3.0f,    // Row 1
+                      1.0f, -0.5f, -1.0f,  // Row 2
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          128,
+                                          144,
+                                          176,
+                                          144,
+                                          120,
+                                          112,
+                                      }));
+}
+
+TEST(QuantizedActivationsOpTest, Relu1Int8) {
+  const float kMin = -1;
+  const float kMax = 1;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU_N1_TO_1,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 2 * kMin, kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, 2 * kMin, kMax});
+
+  m.SetInput<int8_t>({
+      0.0, -0.6, 0.2, -0.4,  //
+      0.3, -2.0, 1.1, -0.1,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.6, 0.2, -0.4,  //
+                      0.3, -1.0, 1.0, -0.1,  //
+                  },
+                  kQuantizedTolerance)));
+}
+
+TEST(QuantizedActivationsOpTest, Relu1UInt8) {
+  const float kMin = -1;
+  const float kMax = 1;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU_N1_TO_1,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 2 * kMin, kMax},
+      /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, 2 * kMin, kMax});
+
+  m.SetInput<uint8_t>({
+      0.0, -0.6, 0.2, -0.4,  //
+      0.3, -2.0, 1.1, -0.1,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.6, 0.2, -0.4,  //
+                      0.3, -1.0, 1.0, -0.1,  //
+                  },
+                  kQuantizedTolerance)));
+}
+
 TEST(QuantizedActivationsOpTest, Relu6Int8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
@@ -317,21 +408,33 @@
 TEST(QuantizedActivationsOpTest, SigmoidUint8) {
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
+      /*input=*/{TensorType_UINT8, {1, 6, 4, 1}, -10, 10});
   m.SetInput<uint8_t>({
-      0, -6, 2, 4,   //
+      0, -6, 2,  4,  //
+      3, -2, 10, 1,  //
+      0, -6, 2,  4,  //
+      3, -2, 10, 1,  //
+      0, -6, 2,  4,  //
       3, -2, 10, 1,  //
   });
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
-                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.5,      0.002473, 0.880797, 0.982014,  //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                      0.5,      0.002473, 0.880797, 0.982014,  //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                      0.5,      0.002473, 0.880797, 0.982014,  //
                       0.952574, 0.119203, 0.999955, 0.731059,  //
                   },
                   kQuantizedTolerance)));
   EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+              ElementsAreArray({
+                  128, 1, 227, 251, 244, 32, 255, 188,  //
+                  128, 1, 227, 251, 244, 32, 255, 188,  //
+                  128, 1, 227, 251, 244, 32, 255, 188,  //
+              }));
 }
 
 TEST(QuantizedActivationsOpTest, SigmoidInt8) {
@@ -970,7 +1073,6 @@
                                  1.0f, -0.5f, -1.0f,  // Row 2
                              }));
 }
-
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
index d5795ee..7fc3666 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -185,6 +185,7 @@
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
   TF_LITE_ENSURE(context, (input_to_forget_weights->type == kTfLiteFloat32) ||
+                              (input_to_forget_weights->type == kTfLiteInt8) ||
                               (input_to_forget_weights->type == kTfLiteUInt8));
 
   const TfLiteTensor* input_to_input_weights =
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index 707f06a..89da297 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -187,17 +187,25 @@
 
     if (use_aux_input) {
       aux_input_ = AddInput(TensorType_FLOAT32);
+      fw_aux_input_to_input_weights_ = AddInput(weight_type);
+      fw_aux_input_to_forget_weights_ = AddInput(weight_type);
+      fw_aux_input_to_cell_weights_ = AddInput(weight_type);
+      fw_aux_input_to_output_weights_ = AddInput(weight_type);
+      bw_aux_input_to_input_weights_ = AddInput(weight_type);
+      bw_aux_input_to_forget_weights_ = AddInput(weight_type);
+      bw_aux_input_to_cell_weights_ = AddInput(weight_type);
+      bw_aux_input_to_output_weights_ = AddInput(weight_type);
     } else {
       aux_input_ = AddNullInput();
+      fw_aux_input_to_input_weights_ = AddNullInput();
+      fw_aux_input_to_forget_weights_ = AddNullInput();
+      fw_aux_input_to_cell_weights_ = AddNullInput();
+      fw_aux_input_to_output_weights_ = AddNullInput();
+      bw_aux_input_to_input_weights_ = AddNullInput();
+      bw_aux_input_to_forget_weights_ = AddNullInput();
+      bw_aux_input_to_cell_weights_ = AddNullInput();
+      bw_aux_input_to_output_weights_ = AddNullInput();
     }
-    fw_aux_input_to_input_weights_ = AddNullInput();
-    fw_aux_input_to_forget_weights_ = AddNullInput();
-    fw_aux_input_to_cell_weights_ = AddNullInput();
-    fw_aux_input_to_output_weights_ = AddNullInput();
-    bw_aux_input_to_input_weights_ = AddNullInput();
-    bw_aux_input_to_forget_weights_ = AddNullInput();
-    bw_aux_input_to_cell_weights_ = AddNullInput();
-    bw_aux_input_to_output_weights_ = AddNullInput();
 
     SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
                  BuiltinOptions_BidirectionalSequenceLSTMOptions,
@@ -310,6 +318,26 @@
     PopulateTensor(aux_input_, offset, begin, end);
   }
 
+  void SetAuxInputToInputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_aux_input_to_input_weights_, f);  // forward cell
+    PopulateWeightTensor(bw_aux_input_to_input_weights_, f);  // backward cell
+  }
+
+  void SetAuxInputToForgetWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_aux_input_to_forget_weights_, f);  // forward cell
+    PopulateWeightTensor(bw_aux_input_to_forget_weights_, f);  // backward cell
+  }
+
+  void SetAuxInputToCellWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_aux_input_to_cell_weights_, f);  // forward cell
+    PopulateWeightTensor(bw_aux_input_to_cell_weights_, f);  // backward cell
+  }
+
+  void SetAuxInputToOutputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_aux_input_to_output_weights_, f);  // forward cell
+    PopulateWeightTensor(bw_aux_input_to_output_weights_, f);  // backward cell
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -472,17 +500,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
@@ -637,17 +663,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
@@ -801,17 +825,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
@@ -962,17 +984,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
@@ -1115,17 +1135,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
@@ -1268,17 +1286,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights(
@@ -1974,14 +1990,14 @@
           {n_batch, n_cell},    // cell_state tensor
 
           {n_batch, sequence_length, 0},  // aux_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_input tensor
-          {n_cell, 0},                    // aux_fw_input_to_forget tensor
-          {n_cell, 0},                    // aux_fw_input_to_cell tensor
-          {n_cell, 0},                    // aux_fw_input_to_output tensor
-          {n_cell, 0},                    // aux_bw_input_to_input tensor
-          {n_cell, 0},                    // aux_bw_input_to_forget tensor
-          {n_cell, 0},                    // aux_bw_input_to_cell tensor
-          {n_cell, 0},                    // aux_bw_input_to_output tensor
+          {0},                            // aux_fw_input_to_input tensor
+          {0},                            // aux_fw_input_to_forget tensor
+          {0},                            // aux_fw_input_to_cell tensor
+          {0},                            // aux_fw_input_to_output tensor
+          {0},                            // aux_bw_input_to_input tensor
+          {0},                            // aux_bw_input_to_forget tensor
+          {0},                            // aux_bw_input_to_cell tensor
+          {0},                            // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights(
@@ -2608,6 +2624,177 @@
 // Same as the no cifg no peephole no projection no clipping test, but have an
 // aux input (without aux input weights), this is the case when stacking but no
 // cross-links.
+TEST_P(LSTMOpTest, BlackBoxTestWithAuxInputZeroAuxWeight) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/true, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {sequence_length, n_batch, n_input},  // aux_input tensor
+          {n_cell, n_input},                    // aux_fw_input_to_input tensor
+          {n_cell, n_input},                    // aux_fw_input_to_forget tensor
+          {n_cell, n_input},                    // aux_fw_input_to_cell tensor
+          {n_cell, n_input},                    // aux_fw_input_to_output tensor
+          {n_cell, n_input},                    // aux_bw_input_to_input tensor
+          {n_cell, n_input},                    // aux_bw_input_to_forget tensor
+          {n_cell, n_input},                    // aux_bw_input_to_cell tensor
+          {n_cell, n_input},                    // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+  // The aux input is fed, but every aux-input weight set below is zero, so the
+  // aux path adds nothing and the outputs equal the no-aux-input goldens.
+  lstm.SetAuxInput(0, batch0_start, batch0_end);
+  std::vector<float> zero_aux_weights(n_cell * n_input, 0.0f);
+  lstm.SetAuxInputToInputWeights(zero_aux_weights);
+  lstm.SetAuxInputToForgetWeights(zero_aux_weights);
+  lstm.SetAuxInputToCellWeights(zero_aux_weights);
+  lstm.SetAuxInputToOutputWeights(zero_aux_weights);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(fw_expected, quantize_weights ? 1e-2 : 1e-5)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(bw_expected, quantize_weights ? 1e-2 : 1e-5)));
+}
+
+// Same as the no cifg no peephole no projection no clipping test, but with an
+// aux input whose aux-input weights are non-zero.
 TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -2678,17 +2865,15 @@
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          // TODO(b/121134029): Update tests so tensor shapes after state tensor
-          // are used. They are currently ignored by test_util.
           {sequence_length, n_batch, n_input},  // aux_input tensor
-          {n_cell, 0},                          // aux_fw_input_to_input tensor
-          {n_cell, 0},                          // aux_fw_input_to_forget tensor
-          {n_cell, 0},                          // aux_fw_input_to_cell tensor
-          {n_cell, 0},                          // aux_fw_input_to_output tensor
-          {n_cell, 0},                          // aux_bw_input_to_input tensor
-          {n_cell, 0},                          // aux_bw_input_to_forget tensor
-          {n_cell, 0},                          // aux_bw_input_to_cell tensor
-          {n_cell, 0},                          // aux_bw_input_to_output tensor
+          {n_cell, n_input},                    // aux_fw_input_to_input tensor
+          {n_cell, n_input},                    // aux_fw_input_to_forget tensor
+          {n_cell, n_input},                    // aux_fw_input_to_cell tensor
+          {n_cell, n_input},                    // aux_fw_input_to_output tensor
+          {n_cell, n_input},                    // aux_bw_input_to_input tensor
+          {n_cell, n_input},                    // aux_bw_input_to_forget tensor
+          {n_cell, n_input},                    // aux_bw_input_to_cell tensor
+          {n_cell, n_input},                    // aux_bw_input_to_output tensor
       });
 
   lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
@@ -2738,19 +2923,21 @@
   // Input should have n_input * sequence_length many values.
   static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
   static float lstm_fw_golden_output[] = {
-      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
-      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
-      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+      0.153335, 0.542754, 0.708602, 0.742855, 0.247581, 0.835739,
+      0.947797, 0.958177, 0.410892, 0.672268, 0.761909, 0.829133};
   static float lstm_bw_golden_output[] = {
-      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
-      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+      0.342275, 0.883431, 0.955930, 0.975621, 0.204939, 0.806858,
+      0.914849, 0.934871, 0.123236, 0.373087, 0.465377, 0.517630};
+
+  lstm.SetAuxInputToInputWeights({0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  lstm.SetAuxInputToForgetWeights({0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 1.0});
+  lstm.SetAuxInputToCellWeights({0.5, 0.6, 0.7, 0.8, 0.5, 0.6, 0.7, 0.8});
+  lstm.SetAuxInputToOutputWeights({0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
 
   float* batch0_start = lstm_input;
   float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
 
   lstm.SetInput(0, batch0_start, batch0_end);
-  // Aux input and input are the same, so we should observe the same outputs
-  // as there's no aux input.
   lstm.SetAuxInput(0, batch0_start, batch0_end);
 
   lstm.Invoke();
diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc
index 3ef4404..870a10a 100644
--- a/tensorflow/lite/kernels/concatenation.cc
+++ b/tensorflow/lite/kernels/concatenation.cc
@@ -111,72 +111,64 @@
 // allocate and populate these during Prepare().
 // TODO(ycling): Activation function parameter is ignored. For now we dont have
 // a model with a Concatenation with fused activation function.
-#define TF_LITE_CONCATENATION(type, scalar)                                \
-  {                                                                        \
-    VectorOfTensors<scalar> all_inputs(*context, *node->inputs);           \
-    tflite::ConcatenationParams op_params;                                 \
-    op_params.axis = axis;                                                 \
-    op_params.inputs_count = node->inputs->size;                           \
-    type::Concatenation(op_params, all_inputs.shapes(), all_inputs.data(), \
-                        GetTensorShape(output),                            \
-                        GetTensorData<scalar>(output));                    \
-  }
-
-#define TF_LITE_CONCATENATION_QUANTIZED(type)                                 \
+#define TF_LITE_CONCATENATION(scalar)                                         \
   {                                                                           \
-    VectorOfQuantizedTensors all_inputs(*context, *node->inputs);             \
+    VectorOfTensors<scalar> all_inputs(*context, *node->inputs);              \
     tflite::ConcatenationParams op_params;                                    \
     op_params.axis = axis;                                                    \
-    op_params.input_zeropoint = all_inputs.zero_point();                      \
-    op_params.input_scale = all_inputs.scale();                               \
     op_params.inputs_count = node->inputs->size;                              \
-    op_params.output_zeropoint = output->params.zero_point;                   \
-    op_params.output_scale = output->params.scale;                            \
-    type::ConcatenationWithScaling(op_params, all_inputs.shapes(),            \
+    if (kernel_type == kReference) {                                          \
+      reference_ops::Concatenation(op_params, all_inputs.shapes(),            \
                                    all_inputs.data(), GetTensorShape(output), \
-                                   GetTensorData<uint8>(output));             \
+                                   GetTensorData<scalar>(output));            \
+    } else {                                                                  \
+      optimized_ops::Concatenation(op_params, all_inputs.shapes(),            \
+                                   all_inputs.data(), GetTensorShape(output), \
+                                   GetTensorData<scalar>(output));            \
+    }                                                                         \
+  }
+
+#define TF_LITE_CONCATENATION_QUANTIZED()                         \
+  {                                                               \
+    VectorOfQuantizedTensors all_inputs(*context, *node->inputs); \
+    tflite::ConcatenationParams op_params;                        \
+    op_params.axis = axis;                                        \
+    op_params.input_zeropoint = all_inputs.zero_point();          \
+    op_params.input_scale = all_inputs.scale();                   \
+    op_params.inputs_count = node->inputs->size;                  \
+    op_params.output_zeropoint = output->params.zero_point;       \
+    op_params.output_scale = output->params.scale;                \
+    if (kernel_type == kReference) {                              \
+      reference_ops::ConcatenationWithScaling(                    \
+          op_params, all_inputs.shapes(), all_inputs.data(),      \
+          GetTensorShape(output), GetTensorData<uint8>(output));  \
+    } else {                                                      \
+      optimized_ops::ConcatenationWithScaling(                    \
+          op_params, all_inputs.shapes(), all_inputs.data(),      \
+          GetTensorShape(output), GetTensorData<uint8>(output));  \
+    }                                                             \
   }
 
   switch (output->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
-      if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, float);
-      } else {
-        TF_LITE_CONCATENATION(optimized_ops, float);
-      }
+      TF_LITE_CONCATENATION(float);
       break;
     case kTfLiteInt32:
-      if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, int32);
-      } else {
-        TF_LITE_CONCATENATION(optimized_ops, int32);
-      }
+      TF_LITE_CONCATENATION(int32);
       break;
     case kTfLiteUInt8:
-      if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
-      } else {
-        TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
-      }
+      TF_LITE_CONCATENATION_QUANTIZED();
       break;
-    case kTfLiteInt8: {
-      if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, int8_t);
-      } else {
-        TF_LITE_CONCATENATION(optimized_ops, int8_t);
-      }
-    } break;
+    case kTfLiteInt8:
+      TF_LITE_CONCATENATION(int8_t);
+      break;
     case kTfLiteInt64:
-      if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, int64_t);
-      } else {
-        TF_LITE_CONCATENATION(optimized_ops, int64_t);
-      }
+      TF_LITE_CONCATENATION(int64_t);
       break;
 
     default:
-      context->ReportError(context,
-                           "Only float32 and uint8 are currently supported.");
+      context->ReportError(context, "Type '%s' is not supported currently.",
+                           TfLiteTypeGetName(output->type));
       return kTfLiteError;
   }
 
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index f419a28..f3eb4ab 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -327,6 +327,83 @@
               }));
 }
 
+TEST(ConcatenationOpTest, ThreeDimensionalNonQuantizedOneInput) {
+  QuantizedConcatenationOpModel m0(
+      {TensorType_UINT8, {2, 1, 2}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/1,
+      /*num_inputs=*/1);
+  m0.SetInput<uint8_t>(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.Invoke();
+  // Output is uint8: compare with exact integers, like the sibling tests.
+  EXPECT_THAT(m0.GetOutput<uint8_t>(), ElementsAreArray({1, 3, 4, 7}));
+}
+
+TEST(ConcatenationOpTest, OneTrivialNonQuantizedInput) {
+  QuantizedConcatenationOpModel op(
+      {TensorType_UINT8, {1}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/0,
+      /*num_inputs=*/1);
+  op.SetInput<uint8_t>(0, {5.0f});
+  op.Invoke();
+  EXPECT_THAT(op.GetOutput<uint8_t>(), ::testing::ElementsAre(5));
+}
+
+TEST(ConcatenationOpTest, TwoDimensionalNonQuantizedOneInput) {
+  QuantizedConcatenationOpModel op(
+      {TensorType_UINT8, {2, 3}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/0,
+      /*num_inputs=*/1);
+  op.SetInput<uint8_t>(0, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  op.Invoke();
+  EXPECT_THAT(op.GetOutput<uint8_t>(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(ConcatenationOpTest, TwoInputsTwoAxesNegativeAxesNonQuantized) {
+  // Two 2x3 operands, joined along each axis (positive and negative index).
+  auto first = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  auto second = {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
+
+  QuantizedConcatenationOpModel rows(
+      {TensorType_UINT8, {2, 3}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/0,
+      /*num_inputs=*/2);
+  rows.SetInput<uint8_t>(0, first);
+  rows.SetInput<uint8_t>(1, second);
+  rows.Invoke();
+  EXPECT_THAT(rows.GetOutput<uint8_t>(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
+  QuantizedConcatenationOpModel rows_neg(
+      {TensorType_UINT8, {2, 3}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/-2,
+      /*num_inputs=*/2);
+  rows_neg.SetInput<uint8_t>(0, first);
+  rows_neg.SetInput<uint8_t>(1, second);
+  rows_neg.Invoke();
+  EXPECT_THAT(rows_neg.GetOutput<uint8_t>(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
+  QuantizedConcatenationOpModel cols(
+      {TensorType_UINT8, {2, 3}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/1,
+      /*num_inputs=*/2);
+  cols.SetInput<uint8_t>(0, first);
+  cols.SetInput<uint8_t>(1, second);
+  cols.Invoke();
+  EXPECT_THAT(cols.GetOutput<uint8_t>(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+
+  QuantizedConcatenationOpModel cols_neg(
+      {TensorType_UINT8, {2, 3}, 0, std::numeric_limits<uint8_t>::max()},
+      /*axis=*/-1,
+      /*num_inputs=*/2);
+  cols_neg.SetInput<uint8_t>(0, first);
+  cols_neg.SetInput<uint8_t>(1, second);
+  cols_neg.Invoke();
+  EXPECT_THAT(cols_neg.GetOutput<uint8_t>(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index 8da8bd1..072d6c6 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -24,9 +24,12 @@
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
-#include "tensorflow/lite/kernels/gemmlowp_support.h"
+// b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
+#ifndef TFLITE_WITH_RUY
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
+#endif
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/conv.h"
@@ -111,14 +114,14 @@
   // Instead, we allocate a new object to use as scratch space for im2col, and
   // to carry information from Prepare() to Eval().
   auto* data = new OpData;
-  gemmlowp_support::IncrementUsageCounter(context);
   eigen_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
   return data;
 }
 
 void Free(TfLiteContext* context, void* buffer) {
   eigen_support::DecrementUsageCounter(context);
-  gemmlowp_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -284,28 +287,11 @@
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto compute_out_size = [padding](int image_size, int filter_size, int stride,
-                                    int dilation_rate) -> int {
-    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
-    return padding == kTfLitePaddingSame
-               ? (image_size + stride - 1) / stride
-               : padding == kTfLitePaddingValid
-                     ? (image_size - effective_filter_size + stride) / stride
-                     : 0;
-  };
-
-  int out_width = compute_out_size(width, filter_width, params->stride_width,
-                                   params->dilation_width_factor);
-  int out_height =
-      compute_out_size(height, filter_height, params->stride_height,
-                       params->dilation_height_factor);
-
-  data->padding.height =
-      ComputePadding(params->stride_height, params->dilation_height_factor,
-                     height, filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, params->dilation_width_factor, width,
-                     filter_width, out_width);
+  int out_width, out_height;
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width,
+      params->dilation_height_factor, params->dilation_width_factor, height,
+      width, filter_height, filter_width, padding, &out_height, &out_width);
 
   TF_LITE_ENSURE(context, has_bias);
 
@@ -434,9 +420,6 @@
                    TfLiteTensor* filter, TfLiteTensor* bias,
                    TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
                    TfLiteTensor* output) {
-  gemmlowp::GemmContext* gemmlowp_context =
-      gemmlowp_support::GetFromContext(context);
-
   auto input_offset = -input->params.zero_point;
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -470,26 +453,26 @@
   op_params.quantized_activation_max = data->output_activation_max;
   switch (effective_kernel_type) {
     case kReference: {
-      reference_ops::Conv(op_params, GetTensorShape(input),
-                          GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                          GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                          GetTensorData<int32_t>(bias), GetTensorShape(output),
-                          GetTensorData<uint8_t>(output),
-                          GetTensorShape(im2col),
-                          GetTensorData<uint8_t>(im2col), gemmlowp_context);
+      reference_ops::Conv(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+          GetTensorShape(bias), GetTensorData<int32_t>(bias),
+          GetTensorShape(output), GetTensorData<uint8_t>(output),
+          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
+          /* cpu_backend_context = */ nullptr);
       break;
     }
     case kGenericOptimized:
     case kMultithreadOptimized:
     case kCblasOptimized: {
       // There is only one optimized implementation for Quantized Conv.
-      optimized_ops::Conv(op_params, GetTensorShape(input),
-                          GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                          GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                          GetTensorData<int32_t>(bias), GetTensorShape(output),
-                          GetTensorData<uint8_t>(output),
-                          GetTensorShape(im2col),
-                          GetTensorData<uint8_t>(im2col), gemmlowp_context);
+      optimized_ops::Conv(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+          GetTensorShape(bias), GetTensorData<int32_t>(bias),
+          GetTensorShape(output), GetTensorData<uint8_t>(output),
+          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
+          cpu_backend_support::GetFromContext(context));
       break;
     }
   }
@@ -501,15 +484,6 @@
                              TfLiteTensor* input, TfLiteTensor* filter,
                              TfLiteTensor* bias, TfLiteTensor* output,
                              TfLiteTensor* im2col) {
-  KernelType effective_kernel_type;
-  effective_kernel_type = kernel_type;
-
-// If not running on NEON we force a fallback to the reference kernels, until
-// we have optimized support on other platforms.
-#ifndef GEMMLOWP_NEON
-  effective_kernel_type = kReference;
-#endif
-
   ConvParams op_params;
   op_params.input_offset = -input->params.zero_point;
   op_params.output_offset = output->params.zero_point;
@@ -520,7 +494,7 @@
   op_params.padding_values.height = data->padding.height;
   op_params.padding_values.width = data->padding.width;
 
-  switch (effective_kernel_type) {
+  switch (kernel_type) {
     case kReference: {
       reference_integer_ops::ConvPerChannel(
           op_params, data->per_channel_output_multiplier.data(),
@@ -534,9 +508,6 @@
     case kGenericOptimized:
     case kMultithreadOptimized:
     case kCblasOptimized: {
-#ifdef GEMMLOWP_NEON
-      gemmlowp::GemmContext* gemmlowp_context =
-          gemmlowp_support::GetFromContext(context);
       optimized_integer_ops::ConvPerChannel(
           op_params, data->per_channel_output_multiplier.data(),
           data->per_channel_output_shift.data(), GetTensorShape(input),
@@ -544,8 +515,8 @@
           GetTensorData<int8>(filter), GetTensorShape(bias),
           GetTensorData<int32>(bias), GetTensorShape(output),
           GetTensorData<int8>(output), GetTensorShape(im2col),
-          GetTensorData<int8>(im2col), gemmlowp_context);
-#endif
+          GetTensorData<int8>(im2col),
+          cpu_backend_support::GetFromContext(context));
       break;
     }
   }
@@ -592,10 +563,17 @@
                           GetTensorData<float>(filter), GetTensorShape(bias),
                           GetTensorData<float>(bias), GetTensorShape(output),
                           GetTensorData<float>(output), GetTensorShape(im2col),
-                          GetTensorData<float>(im2col));
+                          GetTensorData<float>(im2col),
+                          cpu_backend_support::GetFromContext(context));
       break;
     }
     case kMultithreadOptimized: {
+#ifdef TFLITE_WITH_RUY
+      // See Register_CONV_2D: we should never be here when tflite_with_ruy
+      // was enabled. We #if out this code in order to get the corresponding
+      // binary size benefits.
+      TFLITE_DCHECK(false);
+#else
       const float* filter_data;
       if (data->need_hwcn_weights) {
         filter_data = GetTensorData<float>(hwcn_weights);
@@ -610,6 +588,7 @@
           GetTensorData<float>(output), GetTensorShape(im2col),
           GetTensorData<float>(im2col));
       break;
+#endif
     }
   }
 }
@@ -719,12 +698,9 @@
       if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
         EvalHybrid<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
-      } else if (data->supports_multithreaded_kernel) {
+      } else {
         EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
                                im2col, hwcn_weights, output);
-      } else {
-        EvalFloat<kGenericOptimized>(context, node, params, data, input, filter,
-                                     bias, im2col, hwcn_weights, output);
       }
       break;
     case kTfLiteUInt8:
@@ -774,8 +750,11 @@
 }
 
 TfLiteRegistration* Register_CONV_2D() {
-#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
+#if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
   return Register_CONVOLUTION_CBLAS_OPT();
+#elif defined TFLITE_WITH_RUY
+  // tflite_with_ruy optimizes the generic kernel type.
+  return Register_CONVOLUTION_GENERIC_OPT();
 #else
   return Register_CONVOLUTION_MULTITHREADED_OPT();
 #endif
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index 319f528..c0b64d6 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -124,8 +124,10 @@
 const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
     {"Reference", ops::builtin::Register_CONVOLUTION_REF()},
     {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
+#ifndef TFLITE_WITH_RUY
     {"MultithreadedOptimized",
      ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+#endif
     {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()},
 });
 
@@ -966,8 +968,8 @@
        255},
       {TensorType_UINT8,
        {depth, filter_size, filter_size, filter_count},
-       0,
-       255},
+       -128,
+       127},
       {TensorType_UINT8, {}, 0, 255}, stride_width, stride_height, padding,
       ActivationFunctionType_NONE, dilation_width_factor,
       dilation_height_factor);
diff --git a/tensorflow/lite/kernels/cpu_backend_context.cc b/tensorflow/lite/kernels/cpu_backend_context.cc
new file mode 100644
index 0000000..15ab1bc
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_context.cc
@@ -0,0 +1,37 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/experimental/ruy/context.h"
+
+namespace tflite {
+
+CpuBackendContext::CpuBackendContext()
+    : ruy_context_(new ruy::Context),
+      gemmlowp_context_(new gemmlowp::GemmContext) {
+  set_max_num_threads(1);
+}
+
+CpuBackendContext::~CpuBackendContext() {}
+
+void CpuBackendContext::set_max_num_threads(int max_num_threads) {
+  max_num_threads_ = max_num_threads;
+  ruy_context_->max_num_threads = max_num_threads;
+  gemmlowp_context_->set_max_num_threads(max_num_threads);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/cpu_backend_context.h b/tensorflow/lite/kernels/cpu_backend_context.h
new file mode 100644
index 0000000..066d4a1
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_context.h
@@ -0,0 +1,70 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_CONTEXT_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_CONTEXT_H_
+
+#include <memory>
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/experimental/ruy/context.h"
+
+namespace tflite {
+
+class CpuBackendContext final {
+ public:
+  CpuBackendContext();
+  ~CpuBackendContext();
+
+  ruy::Context* ruy_context() const { return ruy_context_.get(); }
+
+  gemmlowp::GemmContext* gemmlowp_context() const {
+    return gemmlowp_context_.get();
+  }
+
+  // Sets the maximum-number-of-threads-to-use parameter.
+  // This is only a means of passing around this information.
+  // cpu_backend_threadpool::Execute creates as many threads as it's
+  // asked to, regardless of this. Typically a call site would query
+  // cpu_backend_context->max_num_threads() and use that to determine
+  // the number of tasks to create and to give to
+  // cpu_backend_threadpool::Execute.
+  //
+  // This value also gets propagated to back-ends, where it plays the same
+  // information-only role.
+  void set_max_num_threads(int max_num_threads);
+
+  // Returns the value last set via set_max_num_threads (initially 1).
+  int max_num_threads() const { return max_num_threads_; }
+
+ private:
+  // To enable a smooth transition from the current direct usage
+  // of the underlying gemmlowp context to going through abstractions
+  // (see :cpu_backend_gemm), for now a CpuBackendContext always
+  // stores both a gemmlowp context and a ruy context.
+  // TODO(b/131416458): Once call sites all go through abstractions,
+  // elide what can be elided based on TFLITE_WITH_RUY.
+  const std::unique_ptr<ruy::Context> ruy_context_;
+  const std::unique_ptr<gemmlowp::GemmContext> gemmlowp_context_;
+
+  // See set_max_num_threads.
+  int max_num_threads_;
+
+  CpuBackendContext(const CpuBackendContext&) = delete;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_CONTEXT_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm.h b/tensorflow/lite/kernels/cpu_backend_gemm.h
new file mode 100644
index 0000000..eccf69f
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm.h
@@ -0,0 +1,112 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_ruy.h"
+
+#ifndef TFLITE_WITH_RUY
+#include "tensorflow/lite/kernels/cpu_backend_gemm_eigen.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h"
+#endif
+
+namespace tflite {
+
+namespace cpu_backend_gemm {
+
+/* Generic implementation using ruy.
+ * Non-ruy implementations are provided as specializations of this template.
+ */
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+struct GemmImpl : detail::GemmImplUsingRuy<LhsScalar, RhsScalar, AccumScalar,
+                                           DstScalar, quantization_flavor> {};
+
+#ifndef TFLITE_WITH_RUY
+
+/* Specializations using gemmlowp */
+
+template <typename SrcScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+struct GemmImpl<SrcScalar, SrcScalar, std::int32_t, DstScalar,
+                quantization_flavor>
+    : detail::GemmImplUsingGemmlowp<SrcScalar, SrcScalar, std::int32_t,
+                                    DstScalar, quantization_flavor> {};
+
+// When SrcScalar=int8 or DstScalar=int8, gemmlowp fails to compile
+// outside of NEON. We avoid the compilation failure by subspecializing these
+// cases, rerouting them back to ruy.
+#ifndef GEMMLOWP_NEON
+template <typename SrcScalar, QuantizationFlavor quantization_flavor>
+struct GemmImpl<SrcScalar, SrcScalar, std::int32_t, std::int8_t,
+                quantization_flavor>
+    : detail::GemmImplUsingRuy<SrcScalar, SrcScalar, std::int32_t, std::int8_t,
+                               quantization_flavor> {};
+
+template <typename DstScalar, QuantizationFlavor quantization_flavor>
+struct GemmImpl<std::int8_t, std::int8_t, std::int32_t, DstScalar,
+                quantization_flavor>
+    : detail::GemmImplUsingRuy<std::int8_t, std::int8_t, std::int32_t,
+                               DstScalar, quantization_flavor> {};
+
+template <QuantizationFlavor quantization_flavor>
+struct GemmImpl<std::int8_t, std::int8_t, std::int32_t, std::int8_t,
+                quantization_flavor>
+    : detail::GemmImplUsingRuy<std::int8_t, std::int8_t, std::int32_t,
+                               std::int8_t, quantization_flavor> {};
+#endif  // not GEMMLOWP_NEON
+
+/* Specializations using Eigen */
+
+template <>
+struct GemmImpl<float, float, float, float, QuantizationFlavor::kFloatingPoint>
+    : detail::GemmImplUsingEigen {};
+
+#endif  // not TFLITE_WITH_RUY
+
+/* Public entry point */
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+void Gemm(const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+          const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+          const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+          const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+          CpuBackendContext* context) {
+  ValidateParams(lhs_params, rhs_params, dst_params, params);
+  if (dst_params.cols == 1) {
+    // GEMV case: try a custom fast GEMV path.
+    if (detail::CustomGemv(lhs_params, lhs_data, rhs_params, rhs_data,
+                           dst_params, dst_data, params, context)) {
+      return;
+    }
+  }
+  GemmImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+           quantization_flavor>::Run(lhs_params, lhs_data, rhs_params, rhs_data,
+                                     dst_params, dst_data, params, context);
+}
+
+}  // namespace cpu_backend_gemm
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
new file mode 100644
index 0000000..4d27cc8
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
@@ -0,0 +1,452 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Fast Gemv (i.e. matrix*vector multiplication) paths.
+// TODO(b/132094390): remove when GEMM performance is good enough on GEMV cases.
+
+// TFLite's runtime ops concentrate as much as possible the matrix*vector
+// use cases on the (matrix) * (column-vector) case, as opposed to
+// (row-vector) * (matrix).  So that is what we focus on optimizing here.
+// Accordingly, the public cpu_backend_gemm::Gemm() entry point checks
+// if we are in this (matrix) * (column-vector) case, and if so calls
+// CustomGemv.
+//
+// cpu_backend_gemm::Gemm is also currently restricted (as enforced in
+// ValidateParams) to the case where the left-hand side matrix is row-major.
+//
+// So the current scope of this CustomGemv function really is:
+// (row-major matrix) * (column-vector).
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_
+
+#include <type_traits>
+#include <vector>
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace cpu_backend_gemm {
+namespace detail {
+
+// CustomGemvImpl is what needs to be specialized for each custom GEMV path.
+//
+// It does not deal with any multi-threaded implementation detail. Rather,
+// it provides the single-thread implementation to be run by each thread.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+struct CustomGemvImpl {
+  // The number of rows of the left-hand-side matrix (and equivalently of the
+  // destination column-vector) that the kernel processes at a time.
+  // This will also be the minimum required number of rows for a Gemv shape
+  // to be supported by this path.
+  //
+  // Gemv implementations are expected to be able to deal with numbers of
+  // rows that aren't multiples of kKernelRows by possibly running the kernel
+  // again at an odd row_start, e.g. if kKernelRows==4, Run() should still
+  // support running on 7 rows by running twice: once with row_start=0 and then
+  // another time with row_start=3.
+  //
+  // On the other hand, gemv implementations are not expected to support
+  // running on fewer than kKernelRows rows. There is no interest in
+  // optimizing Gemv's so narrow that they are just a few dot-products.
+  // Supporting that would require custom kernel code only for that case.
+  static constexpr int kKernelRows = 1;
+
+  // Returns true if the Gemv shape is supported by Run(), provided that
+  // (row_end - row_start) >= kKernelRows.
+  static bool IsSupportedGivenSufficientlyManyRows(
+      const MatrixParams<LhsScalar>& lhs_params,
+      const MatrixParams<RhsScalar>& rhs_params,
+      const MatrixParams<DstScalar>& dst_params,
+      const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params) {
+    return false;
+  }
+
+  // Performs the Gemv. No-op in this unspecialized fallback.
+  static void Run(
+      const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+      const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+      int row_start, int row_end) {}
+};
+
+// Threadpool task running CustomGemvImpl on a row range; params held by ref.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+class CustomGemvTask : public cpu_backend_threadpool::Task {
+ public:
+  CustomGemvTask(
+      const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+      const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+      int row_start, int row_end)
+      : lhs_params_(lhs_params),
+        lhs_data_(lhs_data),
+        rhs_params_(rhs_params),
+        rhs_data_(rhs_data),
+        dst_params_(dst_params),
+        dst_data_(dst_data),
+        params_(params),
+        row_start_(row_start),
+        row_end_(row_end) {}
+
+  void Run() override {
+    using Impl = CustomGemvImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                                quantization_flavor>;
+    Impl::Run(lhs_params_, lhs_data_, rhs_params_, rhs_data_, dst_params_,
+              dst_data_, params_, row_start_, row_end_);
+  }
+
+ private:
+  const MatrixParams<LhsScalar>& lhs_params_;
+  const LhsScalar* lhs_data_;
+  const MatrixParams<RhsScalar>& rhs_params_;
+  const RhsScalar* rhs_data_;
+  const MatrixParams<DstScalar>& dst_params_;
+  DstScalar* dst_data_;
+  const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params_;
+  int row_start_;
+  int row_end_;
+};
+
+// Either performs the requested Gemv operation and returns true,
+// or immediately returns false.
+//
+// See the comment at the top of the file for the scope of what this handles.
+// In summary: (row-major matrix) * (column-vector).
+//
+// Only the high-level logic lives here.
+// The actual implementation details are in specializations of
+// CustomGemvImpl.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+bool CustomGemv(
+    const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+    const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+    const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+    const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+    CpuBackendContext* context) {
+  using Impl = CustomGemvImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                              quantization_flavor>;
+  if (lhs_params.rows < Impl::kKernelRows) {
+    return false;
+  }
+  if (!Impl::IsSupportedGivenSufficientlyManyRows(lhs_params, rhs_params,
+                                                  dst_params, params)) {
+    return false;
+  }
+  TFLITE_DCHECK_GE(lhs_params.rows, Impl::kKernelRows);
+  int thread_count = LegacyHowManyThreads<Impl::kKernelRows>(
+      context->max_num_threads(), dst_params.rows, dst_params.cols,
+      lhs_params.cols);
+  if (thread_count == 1) {
+    Impl::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data,
+              params, 0, lhs_params.rows);
+  } else {
+    using Task = CustomGemvTask<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                                quantization_flavor>;
+    std::vector<Task> tasks;
+    tasks.reserve(thread_count);
+    const int kRowsPerThread =
+        RoundUp<Impl::kKernelRows>(CeilQuotient(dst_params.rows, thread_count));
+    int row_start = 0;
+    for (int i = 0; i < thread_count; i++) {
+      int row_end = std::min(dst_params.rows, row_start + kRowsPerThread);
+      tasks.emplace_back(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+                         dst_data, params, row_start, row_end);
+      row_start = row_end;
+    }
+    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), context);
+  }
+  return true;
+}
+
+#ifdef USE_NEON
+
+// Some NEON helper functions used by CustomGemvImpl specializations below,
+// allowing for some type genericity in them.
+
+inline int16x8x2_t LoadAndSubtractZeroPoint(const std::uint8_t* src,
+                                            std::uint8_t zero_point) {
+  uint8x16_t src_u8 = vld1q_u8(src);
+  int16x8_t src_s16_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_u8)));
+  int16x8_t src_s16_1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_u8)));
+  int16x8x2_t result;
+  int16x8_t zero_point_vec = vdupq_n_s16(zero_point);
+  result.val[0] = vsubq_s16(src_s16_0, zero_point_vec);
+  result.val[1] = vsubq_s16(src_s16_1, zero_point_vec);
+  return result;
+}
+
+inline int16x8x2_t LoadAndSubtractZeroPoint(const std::int8_t* src,
+                                            std::int8_t zero_point) {
+  int8x16_t src_s8 = vld1q_s8(src);
+  int16x8_t src_s16_0 = vmovl_s8(vget_low_s8(src_s8));
+  int16x8_t src_s16_1 = vmovl_s8(vget_high_s8(src_s8));
+  int16x8x2_t result;
+  int16x8_t zero_point_vec = vdupq_n_s16(zero_point);
+  result.val[0] = vsubq_s16(src_s16_0, zero_point_vec);
+  result.val[1] = vsubq_s16(src_s16_1, zero_point_vec);
+  return result;
+}
+
+inline void ClampAndStore(int32x4_t src, std::uint8_t clamp_min,
+                          std::uint8_t clamp_max, std::uint8_t* dst) {
+  // Narrow values down to 16 bit signed.
+  const int16x4_t res16 = vqmovn_s32(src);
+  // Narrow values down to 8 bit unsigned, saturating.
+  uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
+  // Apply the clamping from the activation function
+  res8 = vmax_u8(res8, vdup_n_u8(clamp_min));
+  res8 = vmin_u8(res8, vdup_n_u8(clamp_max));
+  // Store results to destination.
+  vst1_lane_u8(dst + 0, res8, 0);
+  vst1_lane_u8(dst + 1, res8, 1);
+  vst1_lane_u8(dst + 2, res8, 2);
+  vst1_lane_u8(dst + 3, res8, 3);
+}
+
+inline void ClampAndStore(int32x4_t src, std::int8_t clamp_min,
+                          std::int8_t clamp_max, std::int8_t* dst) {
+  // Narrow values down to 16 bit signed.
+  const int16x4_t res16 = vqmovn_s32(src);
+  // Narrow values down to 8 bit signed, saturating.
+  int8x8_t res8 = vqmovn_s16(vcombine_s16(res16, res16));
+  // Apply the clamping from the activation function
+  res8 = vmax_s8(res8, vdup_n_s8(clamp_min));
+  res8 = vmin_s8(res8, vdup_n_s8(clamp_max));
+  // Store results to destination.
+  vst1_lane_s8(dst + 0, res8, 0);
+  vst1_lane_s8(dst + 1, res8, 1);
+  vst1_lane_s8(dst + 2, res8, 2);
+  vst1_lane_s8(dst + 3, res8, 3);
+}
+
+inline void ClampAndStore(int32x4_t src, std::int16_t clamp_min,
+                          std::int16_t clamp_max, std::int16_t* dst) {
+  // Narrow values down to 16 bit signed.
+  int16x4_t res16 = vqmovn_s32(src);
+  // Apply the clamping from the activation function
+  res16 = vmax_s16(res16, vdup_n_s16(clamp_min));
+  res16 = vmin_s16(res16, vdup_n_s16(clamp_max));
+  // Store results to destination.
+  vst1_lane_s16(dst + 0, res16, 0);
+  vst1_lane_s16(dst + 1, res16, 1);
+  vst1_lane_s16(dst + 2, res16, 2);
+  vst1_lane_s16(dst + 3, res16, 3);
+}
+
+template <typename LhsScalar, typename RhsScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+struct CustomGemvImpl<LhsScalar, RhsScalar, std::int32_t, DstScalar,
+                      quantization_flavor> {
+  // This partial template specialization is less generic than its declaration
+  // implies: it assumes the following constraints on its free template
+  // parameters. We guard these assumptions in the following static_assert's.
+  static_assert(std::is_same<LhsScalar, std::uint8_t>::value ||
+                    std::is_same<LhsScalar, std::int8_t>::value,
+                "");
+  static_assert(std::is_same<RhsScalar, std::uint8_t>::value ||
+                    std::is_same<RhsScalar, std::int8_t>::value,
+                "");
+  static_assert(std::is_same<DstScalar, std::uint8_t>::value ||
+                    std::is_same<DstScalar, std::int8_t>::value ||
+                    std::is_same<DstScalar, std::int16_t>::value,
+                "");
+  static_assert(quantization_flavor ==
+                        QuantizationFlavor::kIntegerWithUniformMultiplier ||
+                    quantization_flavor ==
+                        QuantizationFlavor::kIntegerWithPerRowMultiplier,
+                "");
+
+  // This implementation's inner loop processes 4 rows of the left-hand side
+  // matrix at a time.
+  static constexpr int kKernelRows = 4;
+
+  static bool IsSupportedGivenSufficientlyManyRows(
+      const MatrixParams<LhsScalar>& lhs_params,
+      const MatrixParams<RhsScalar>& rhs_params,
+      const MatrixParams<DstScalar>& dst_params,
+      const GemmParams<std::int32_t, DstScalar, quantization_flavor>& params) {
+    // There are no further requirements on the applicability of this kernel,
+    // beyond the left-hand-side matrix having at least kKernelRows rows,
+    // and the type requirements implied in this template partial
+    // specialization.
+    return true;
+  }
+
+  static void Run(
+      const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+      const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<std::int32_t, DstScalar, quantization_flavor>& params,
+      int row_start, int row_end) {
+    // Handle kKernelRows ( == 4) rows of the left-hand side matrix at each
+    // iteration of this for loop.
+    TFLITE_DCHECK_GE(row_end - row_start, kKernelRows);
+    for (int row = row_start; row < row_end; row += kKernelRows) {
+      // Here is the magic where we allow this kernel to handle any odd number
+      // of rows as long as it's >= kKernelRows: the last group of `kKernelRows`
+      // rows will be nudged to fit, possibly by starting at an odd value of
+      // `row`.
+      row = std::min(row, row_end - kKernelRows);
+      const LhsScalar* filter_ptr = lhs_data + row * lhs_params.cols;
+      // 4 accumulator registers, one for each row being processed.
+      // Each has 4 int32 lanes that corresponds to columns modulo 4, and
+      // will need to be horizontally reduced at the end.
+      int32x4_t acc0 = vdupq_n_s32(0);
+      int32x4_t acc1 = acc0;
+      int32x4_t acc2 = acc0;
+      int32x4_t acc3 = acc0;
+      int in = 0;
+      // As much as possible, handle 16 columns of the left-hand side matrix
+      // at a time. This allows for decent NEON implementation.
+      for (; in <= lhs_params.cols - 16; in += 16) {
+        int16x8x2_t input_val =
+            LoadAndSubtractZeroPoint(rhs_data + in, rhs_params.zero_point);
+        int16x8x2_t filter_val_0 = LoadAndSubtractZeroPoint(
+            filter_ptr + 0 * lhs_params.cols, lhs_params.zero_point);
+        int16x8x2_t filter_val_1 = LoadAndSubtractZeroPoint(
+            filter_ptr + 1 * lhs_params.cols, lhs_params.zero_point);
+        int16x8x2_t filter_val_2 = LoadAndSubtractZeroPoint(
+            filter_ptr + 2 * lhs_params.cols, lhs_params.zero_point);
+        int16x8x2_t filter_val_3 = LoadAndSubtractZeroPoint(
+            filter_ptr + 3 * lhs_params.cols, lhs_params.zero_point);
+        filter_ptr += 16;
+        acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0.val[0]),
+                         vget_low_s16(input_val.val[0]));
+        acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1.val[0]),
+                         vget_low_s16(input_val.val[0]));
+        acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2.val[0]),
+                         vget_low_s16(input_val.val[0]));
+        acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3.val[0]),
+                         vget_low_s16(input_val.val[0]));
+        acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0.val[1]),
+                         vget_low_s16(input_val.val[1]));
+        acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1.val[1]),
+                         vget_low_s16(input_val.val[1]));
+        acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2.val[1]),
+                         vget_low_s16(input_val.val[1]));
+        acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3.val[1]),
+                         vget_low_s16(input_val.val[1]));
+        acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0.val[0]),
+                         vget_high_s16(input_val.val[0]));
+        acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1.val[0]),
+                         vget_high_s16(input_val.val[0]));
+        acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2.val[0]),
+                         vget_high_s16(input_val.val[0]));
+        acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3.val[0]),
+                         vget_high_s16(input_val.val[0]));
+        acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0.val[1]),
+                         vget_high_s16(input_val.val[1]));
+        acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1.val[1]),
+                         vget_high_s16(input_val.val[1]));
+        acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2.val[1]),
+                         vget_high_s16(input_val.val[1]));
+        acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3.val[1]),
+                         vget_high_s16(input_val.val[1]));
+      }
+      // Leftovers: fewer than 16 columns remain. Very slow code, could be
+      // improved upon if critical in some application.
+      if (in < lhs_params.cols) {
+        int32 buf[16];
+        vst1q_s32(buf + 0, acc0);
+        vst1q_s32(buf + 4, acc1);
+        vst1q_s32(buf + 8, acc2);
+        vst1q_s32(buf + 12, acc3);
+        for (; in < lhs_params.cols; in++) {
+          int lane = (in + 16 - lhs_params.cols) % 4;
+          const int32 input_val = rhs_data[in] - rhs_params.zero_point;
+          for (int k = 0; k < 4; k++) {
+            int32 filter_val = lhs_data[in + (row + k) * lhs_params.cols] -
+                               lhs_params.zero_point;
+            buf[lane + 4 * k] += filter_val * input_val;
+          }
+        }
+        acc0 = vld1q_s32(buf + 0);
+        acc1 = vld1q_s32(buf + 4);
+        acc2 = vld1q_s32(buf + 8);
+        acc3 = vld1q_s32(buf + 12);
+      }
+
+      // Horizontally reduce accumulators
+      int32x2_t pairwise_reduced_acc_0 =
+          vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
+      int32x2_t pairwise_reduced_acc_1 =
+          vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
+      int32x2_t pairwise_reduced_acc_2 =
+          vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
+      int32x2_t pairwise_reduced_acc_3 =
+          vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
+      const int32x2_t reduced_lo =
+          vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+      const int32x2_t reduced_hi =
+          vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+      int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+      // End of horizontal reduction: now `reduced` is a single int32x4
+      // containing the 4 int32 accumulators corresponding to the 4 rows
+      // being processed.
+
+      // Add bias values.
+      int32x4_t bias_vec = vld1q_s32(params.bias + row);
+      reduced = vaddq_s32(reduced, bias_vec);
+
+      // Get multiplier parameters: one lane per row being processed.
+      int32x4_t multiplier_exponent;
+      int32x4_t multiplier_fixedpoint;
+      if (quantization_flavor ==
+          QuantizationFlavor::kIntegerWithPerRowMultiplier) {
+        multiplier_exponent =
+            vld1q_s32(params.multiplier_exponent_perchannel + row);
+        multiplier_fixedpoint =
+            vld1q_s32(params.multiplier_fixedpoint_perchannel + row);
+      } else {
+        multiplier_exponent = vdupq_n_s32(params.multiplier_exponent);
+        multiplier_fixedpoint = vdupq_n_s32(params.multiplier_fixedpoint);
+      }
+      // If positive exponent, shift left. Lanes with exponent <= 0 shift by 0.
+      const int32x4_t zero = vdupq_n_s32(0);
+      reduced = vshlq_s32(reduced, vmaxq_s32(multiplier_exponent, zero));
+      // Multiply by the fixed-point multiplier.
+      reduced = vqrdmulhq_s32(reduced, multiplier_fixedpoint);
+      // If negative exponent, rounding-shift-right. The nudge reproduces the
+      // rounding of gemmlowp::RoundingDivideByPOT for negative values.
+      const int32x4_t right_shift = vminq_s32(multiplier_exponent, zero);
+      const int32x4_t nudge = vshrq_n_s32(vandq_s32(reduced, right_shift), 31);
+      reduced = vrshlq_s32(vqaddq_s32(reduced, nudge), right_shift);
+
+      // Add the output offset.
+      const int32x4_t output_offset_vec = vdupq_n_s32(dst_params.zero_point);
+      reduced = vaddq_s32(reduced, output_offset_vec);
+
+      // Finally, clamp and store to the destination.
+      ClampAndStore(reduced, params.clamp_min, params.clamp_max,
+                    dst_data + row);
+    }
+  }
+};
+#endif
+
+}  // namespace detail
+}  // namespace cpu_backend_gemm
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc
new file mode 100644
index 0000000..9a78ddd
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/cpu_backend_gemm_eigen.h"
+
+// See b/131835803: in TFLite code, because eigen_spatial_convolutions.h does
+// #define Eigen EigenForTFLite, it is difficult to have any #include of Eigen
+// headers in a header file, as that results in name clashes (compilation
+// errors) depending on the order in which these headers are #included.
+// So we have moved the #include of Eigen here, in a .cc file, where we have
+// control over the header #include sequence.
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace cpu_backend_gemm {
+namespace detail {
+
+// This function is out-of-line in a .cc file because of the issue
+// noted above in the comment on the #include for Eigen/Core.
+void GemmImplUsingEigen::Run(
+    const MatrixParams<float>& lhs_params, const float* lhs_data,
+    const MatrixParams<float>& rhs_params, const float* rhs_data,
+    const MatrixParams<float>& dst_params, float* dst_data,
+    const GemmParams<float, float>& params, CpuBackendContext* /* context */) {
+  // The Eigen map types below hard-code the storage orders: row-major LHS,
+  // column-major RHS and destination. The public Gemm entry point has already
+  // asserted these orders before dispatching to this implementation.
+  using RowMajorMatrix =
+      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ColMajorMatrix =
+      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
+
+  Eigen::Map<const RowMajorMatrix> lhs_map(lhs_data, lhs_params.rows,
+                                           lhs_params.cols);
+  Eigen::Map<const ColMajorMatrix> rhs_map(rhs_data, rhs_params.rows,
+                                           rhs_params.cols);
+  Eigen::Map<ColMajorMatrix> dst_map(dst_data, dst_params.rows,
+                                     dst_params.cols);
+
+  // Matrix*vector and vector*matrix shapes get dedicated expressions; the
+  // general matrix*matrix product handles everything else.
+  if (rhs_params.cols == 1) {
+    dst_map.col(0).noalias() = lhs_map * rhs_map.col(0);
+  } else if (lhs_params.rows == 1) {
+    dst_map.row(0).noalias() = lhs_map.row(0) * rhs_map;
+  } else {
+    dst_map.noalias() = lhs_map * rhs_map;
+  }
+
+  if (params.bias == nullptr) {
+    // No bias vector: only the clamping remains to be applied.
+    dst_map = dst_map.cwiseMin(params.clamp_max).cwiseMax(params.clamp_min);
+  } else {
+    // Bias present: hand both the bias and the clamp bounds to BiasAndClamp.
+    BiasAndClamp(params.clamp_min, params.clamp_max, dst_params.rows,
+                 params.bias, dst_params.rows * dst_params.cols, dst_data);
+  }
+}
+
+}  // namespace detail
+}  // namespace cpu_backend_gemm
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h
new file mode 100644
index 0000000..8a87528
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_EIGEN_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_EIGEN_H_
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+
+namespace tflite {
+namespace cpu_backend_gemm {
+namespace detail {
+
+struct GemmImplUsingEigen {  // Float-only Gemm; Run() is only declared here.
+  static void Run(const MatrixParams<float>& lhs_params, const float* lhs_data,
+                  const MatrixParams<float>& rhs_params, const float* rhs_data,
+                  const MatrixParams<float>& dst_params, float* dst_data,
+                  const GemmParams<float, float>& params,
+                  CpuBackendContext* /* context */);
+};
+
+}  // namespace detail
+}  // namespace cpu_backend_gemm
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_EIGEN_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h b/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h
new file mode 100644
index 0000000..7a659c3
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h
@@ -0,0 +1,179 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_GEMMLOWP_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_GEMMLOWP_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/experimental/ruy/ruy.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_ruy.h"
+
+namespace tflite {
+namespace cpu_backend_gemm {
+namespace detail {
+
+template <typename DstScalar>
+struct GemmlowpSaturatingCastStage {};  // No Type for unsupported DstScalar.
+
+template <>
+struct GemmlowpSaturatingCastStage<std::uint8_t> {
+  using Type = gemmlowp::OutputStageSaturatingCastToUint8;
+};
+
+template <>
+struct GemmlowpSaturatingCastStage<std::int8_t> {
+  using Type = gemmlowp::OutputStageSaturatingCastToInt8;
+};
+
+template <>
+struct GemmlowpSaturatingCastStage<std::int16_t> {
+  using Type = gemmlowp::OutputStageSaturatingCastToInt16;
+};
+
+template <typename SrcScalar>
+struct GemmlowpBitDepthParams {};  // No Type for unsupported SrcScalar.
+
+template <>
+struct GemmlowpBitDepthParams<std::uint8_t> {
+  using Type = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
+};
+
+template <>
+struct GemmlowpBitDepthParams<std::int8_t> {
+  using Type = gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams;
+};
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+struct GemmImplUsingGemmlowp {};  // Primary template; see specializations.
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+struct GemmImplUsingGemmlowp<
+    LhsScalar, RhsScalar, AccumScalar, DstScalar,
+    QuantizationFlavor::kIntegerWithUniformMultiplier> {
+  static_assert(std::is_same<LhsScalar, RhsScalar>::value, "");
+  static_assert(std::is_same<AccumScalar, std::int32_t>::value, "");
+  using SrcScalar = LhsScalar;
+
+  static void Run(
+      const MatrixParams<SrcScalar>& lhs_params, const SrcScalar* lhs_data,
+      const MatrixParams<SrcScalar>& rhs_params, const SrcScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<std::int32_t, DstScalar,
+                       QuantizationFlavor::kIntegerWithUniformMultiplier>&
+          params,
+      CpuBackendContext* context) {
+    // Wrap the raw buffers: row-major LHS, column-major RHS and destination.
+    gemmlowp::MatrixMap<const SrcScalar, gemmlowp::MapOrder::RowMajor>
+        lhs_map(lhs_data, lhs_params.rows, lhs_params.cols);
+    gemmlowp::MatrixMap<const SrcScalar, gemmlowp::MapOrder::ColMajor>
+        rhs_map(rhs_data, rhs_params.rows, rhs_params.cols);
+    gemmlowp::MatrixMap<DstScalar, gemmlowp::MapOrder::ColMajor> dst_map(
+        dst_data, dst_params.rows, dst_params.cols);
+
+    // Output pipeline: bias addition, rescaling, clamping, saturating cast.
+    using ColVectorMap =
+        gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>;
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_stage;
+    bias_stage.bias_vector = ColVectorMap(params.bias, lhs_params.rows);
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent rescale_stage;
+    rescale_stage.result_fixedpoint_multiplier = params.multiplier_fixedpoint;
+    rescale_stage.result_exponent = params.multiplier_exponent;
+    rescale_stage.result_offset_after_shift = dst_params.zero_point;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = params.clamp_min;
+    clamp_stage.max = params.clamp_max;
+    typename GemmlowpSaturatingCastStage<DstScalar>::Type cast_stage;
+    auto output_pipeline =
+        std::make_tuple(bias_stage, rescale_stage, clamp_stage, cast_stage);
+
+    using BitDepthParams = typename GemmlowpBitDepthParams<SrcScalar>::Type;
+    gemmlowp::GemmWithOutputPipeline<SrcScalar, DstScalar, BitDepthParams>(
+        context->gemmlowp_context(), lhs_map, rhs_map, &dst_map,
+        -lhs_params.zero_point, -rhs_params.zero_point, output_pipeline);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+struct GemmImplUsingGemmlowp<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                             QuantizationFlavor::kIntegerWithPerRowMultiplier> {
+  static_assert(std::is_same<LhsScalar, RhsScalar>::value, "");
+  static_assert(std::is_same<AccumScalar, std::int32_t>::value, "");
+  using SrcScalar = LhsScalar;
+
+  static void Run(
+      const MatrixParams<SrcScalar>& lhs_params, const SrcScalar* lhs_data,
+      const MatrixParams<SrcScalar>& rhs_params, const SrcScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<std::int32_t, DstScalar,
+                       QuantizationFlavor::kIntegerWithPerRowMultiplier>&
+          params,
+      CpuBackendContext* context) {
+    // gemmlowp only implements this per-channel path on NEON; every other
+    // build delegates to the ruy implementation instead.
+#ifdef GEMMLOWP_NEON
+    gemmlowp::MatrixMap<const SrcScalar, gemmlowp::MapOrder::RowMajor>
+        lhs_map(lhs_data, lhs_params.rows, lhs_params.cols);
+    gemmlowp::MatrixMap<const SrcScalar, gemmlowp::MapOrder::ColMajor>
+        rhs_map(rhs_data, rhs_params.rows, rhs_params.cols);
+    gemmlowp::MatrixMap<DstScalar, gemmlowp::MapOrder::ColMajor> dst_map(
+        dst_data, dst_params.rows, dst_params.cols);
+
+    // Output pipeline, in order: bias addition, per-row rescaling, clamping,
+    // saturating cast to DstScalar.
+    using ColVectorMap =
+        gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>;
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_stage;
+    bias_stage.bias_vector = ColVectorMap(params.bias, lhs_params.rows);
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
+        gemmlowp::VectorShape::Col>
+        rescale_stage;
+    rescale_stage.result_fixedpoint_multiplier =
+        ColVectorMap(params.multiplier_fixedpoint_perchannel, dst_params.rows);
+    rescale_stage.result_exponent =
+        ColVectorMap(params.multiplier_exponent_perchannel, dst_params.rows);
+    rescale_stage.result_offset_after_shift = dst_params.zero_point;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = params.clamp_min;
+    clamp_stage.max = params.clamp_max;
+    typename GemmlowpSaturatingCastStage<DstScalar>::Type cast_stage;
+    auto output_pipeline =
+        std::make_tuple(bias_stage, rescale_stage, clamp_stage, cast_stage);
+
+    using BitDepthParams = typename GemmlowpBitDepthParams<SrcScalar>::Type;
+    gemmlowp::GemmWithOutputPipeline<SrcScalar, DstScalar, BitDepthParams>(
+        context->gemmlowp_context(), lhs_map, rhs_map, &dst_map,
+        -lhs_params.zero_point, -rhs_params.zero_point, output_pipeline);
+#else
+    GemmImplUsingRuy<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                     QuantizationFlavor::kIntegerWithPerRowMultiplier>::
+        Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data,
+            params, context);
+#endif
+  }
+};
+
+}  // namespace detail
+}  // namespace cpu_backend_gemm
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_GEMMLOWP_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_params.h b/tensorflow/lite/kernels/cpu_backend_gemm_params.h
new file mode 100644
index 0000000..40e81dc
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_params.h
@@ -0,0 +1,237 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_PARAMS_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_PARAMS_H_
+
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
+namespace tflite {
+
+namespace cpu_backend_gemm {
+
+// Matrix storage order: column-major or row-major.
+enum class Order { kColMajor, kRowMajor };
+
+// MatrixParams encapsulates the parameters that Gemm needs about each
+// matrix, besides the buffer data pointer.
+// Compare to ruy::Matrix, which also encapsulates the data pointer.
+// Rationale for leaving the data pointer out of here: doing so
+// requires complicated const-correctness mechanics. See
+// ruy::ConstCheckingPtr.
+template <typename Scalar>
+struct MatrixParams {
+  // Storage layout order, column-major by default. Only plain contiguous
+  // (non-strided) layouts for now; supporting a stride would be easy.
+  Order order = Order::kColMajor;
+  // Number of rows of the matrix.
+  int rows = 0;
+  // Number of columns of the matrix.
+  int cols = 0;
+  // The zero_point, i.e. which Scalar value is to be interpreted as zero.
+  // When Scalar is floating-point, this must be 0.
+  Scalar zero_point = 0;
+};
+
+// Enumeration of broad categories of Gemm.
+//
+// The primary reason for this to exist is to allow Gemm to compile
+// only uniform-quantized or only per-channel-quantized code paths.
+// This is unneeded with ruy as the back-end, as this is only a runtime
+// difference in ruy, but with gemmlowp these really are separate code
+// paths and templatizing in a QuantizationFlavor is necessary to avoid
+// compiling unused gemmlowp code. Indeed, TFLite currently uses
+// uint8 with uniform quantization and int8 with per-channel quantization,
+// and does not use uint8 with per-channel. We want to avoid compiling
+// the gemmlowp uint8 per-channel path when gemmlowp is the back-end.
+//
+// It's possible to drop this in the future if gemmlowp goes away and no
+// other then-relevant backend library handles quantized paths in a way that
+// requires knowing this at compile-time.
+enum class QuantizationFlavor {
+  // Floating-point Gemm: the accumulators are not multiplied by any
+  // 'multiplier'.
+  kFloatingPoint,
+  // Quantized Gemm using a single multiplier for all accumulators.
+  kIntegerWithUniformMultiplier,
+  // Quantized Gemm using a separate multiplier for the accumulators of each
+  // row of the destination matrix. This is what is called 'per-channel'
+  // in GemmParams. Here we use the more specific 'per-row' terminology
+  // to allow for the possibility of 'per-column' in the future, and to
+  // allow for that to be a separate code path in some back-end such as
+  // gemmlowp.
+  kIntegerWithPerRowMultiplier
+};
+
+// Additional parameters that Gemm needs, beyond what falls into
+// the MatrixParams that it takes. Compare to ruy::Spec.
+//
+// Decoupling AccumScalar from DstScalar (rather than deducing it from that)
+// is useful future-proofing. Think of a float16 path using float32 accum.
+//
+// QuantizationFlavor is passed here even though it's technically not used
+// in this class. This is so that we retain the ability in the future to
+// specialize this class for quantization flavor, and this allows for
+// Gemm to be templatized in quantization_flavor via the GemmParams that it
+// takes, allowing for automatic template parameter deduction to take place,
+// so that most call sites don't need to specify a QuantizationFlavor
+// (only those that need perchannel quantization do).
+template <typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor =
+              std::is_floating_point<AccumScalar>::value
+                  ? QuantizationFlavor::kFloatingPoint
+                  : QuantizationFlavor::kIntegerWithUniformMultiplier>
+struct GemmParams {
+  // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa)
+  // of the multiplier by which accumulators are multiplied before being cast
+  // to the destination type.
+  AccumScalar multiplier_fixedpoint = 0;
+  // Only for non-floating-point cases. The exponent part of the aforementioned
+  // multiplier.
+  int multiplier_exponent = 0;
+  // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must
+  // point to a buffer of as many values as there are rows in the destination
+  // matrix. Each row of the destination matrix will use the corresponding
+  // buffer element instead of multiplier_fixedpoint.
+  const AccumScalar* multiplier_fixedpoint_perchannel = nullptr;
+  // Per-channel variant of multiplier_exponent. If not nullptr, this must
+  // point to a buffer of as many values as there are rows in the destination
+  // matrix. Each row of the destination matrix will use the corresponding
+  // buffer element instead of multiplier_exponent.
+  //
+  // multiplier_exponent_perchannel and multiplier_fixedpoint_perchannel must
+  // be either both nullptr or both non-null.
+  const int* multiplier_exponent_perchannel = nullptr;
+  // The bias vector data, if not null.
+  const AccumScalar* bias = nullptr;
+  // min clamp bound of destination values.
+  DstScalar clamp_min = std::is_floating_point<DstScalar>::value
+                            ? -std::numeric_limits<DstScalar>::infinity()
+                            : std::numeric_limits<DstScalar>::lowest();
+  // max clamp bound of destination values.
+  DstScalar clamp_max = std::is_floating_point<DstScalar>::value
+                            ? std::numeric_limits<DstScalar>::infinity()
+                            : std::numeric_limits<DstScalar>::max();
+};
+
+/* Convenience typedefs */
+
+template <typename DstScalar>
+using QuantizedGemmParams = GemmParams<std::int32_t, DstScalar>;
+
+using FloatGemmParams = GemmParams<float, float>;
+
+/* Validation functions */
+
+// Note that this uses TFLITE_DCHECK from kernels/internal/compatibility.h
+// and not TF_LITE_ASSERT from op_macros.h. We want these to be explicitly
+// debug-build-only assertions so that there's no reason not to
+// validate generously, and TF_LITE_ASSERT is actually at the moment
+// a release-build assertion. See b/131587258.
+
+// Validates self-consistency of GemmParams.
+template <typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+void ValidateGemmParams(
+    const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params) {
+  // Each flavor dictates which of the quantized multiplier fields may be set.
+  switch (quantization_flavor) {
+    case QuantizationFlavor::kFloatingPoint:
+      TFLITE_DCHECK(!params.multiplier_fixedpoint);
+      TFLITE_DCHECK(!params.multiplier_exponent);
+      TFLITE_DCHECK(!params.multiplier_fixedpoint_perchannel);
+      TFLITE_DCHECK(!params.multiplier_exponent_perchannel);
+      break;
+    case QuantizationFlavor::kIntegerWithUniformMultiplier:
+      // A bias vector is mandatory for now. Ruy does not care, but with
+      // gemmlowp a bias-less Gemm is a whole separate instantiation, so
+      // requiring it saves a lot of binary size; existing gemmlowp usage has
+      // always passed one. multiplier_exponent may take any value.
+      TFLITE_DCHECK(params.bias);
+      TFLITE_DCHECK(params.multiplier_fixedpoint);
+      TFLITE_DCHECK(!params.multiplier_fixedpoint_perchannel);
+      TFLITE_DCHECK(!params.multiplier_exponent_perchannel);
+      break;
+    case QuantizationFlavor::kIntegerWithPerRowMultiplier:
+      // Bias is required here for the same reason as in the uniform case.
+      TFLITE_DCHECK(params.bias);
+      TFLITE_DCHECK(!params.multiplier_fixedpoint);
+      TFLITE_DCHECK(!params.multiplier_exponent);
+      TFLITE_DCHECK(params.multiplier_fixedpoint_perchannel);
+      TFLITE_DCHECK(params.multiplier_exponent_perchannel);
+      break;
+  }
+}
+
+namespace detail {
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+struct ValidateTypes {
+  // This generic implementation covers the quantized flavors;
+  // kFloatingPoint is handled by the specialization below.
+  static_assert(!std::is_floating_point<LhsScalar>::value, "");
+  static_assert(!std::is_floating_point<RhsScalar>::value, "");
+  static_assert(!std::is_floating_point<AccumScalar>::value, "");
+  // No requirement on DstScalar --- we might in the future allow it
+  // to be floating point even in a quantized Gemm.
+};
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+struct ValidateTypes<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                     QuantizationFlavor::kFloatingPoint> {  // All four float.
+  static_assert(std::is_floating_point<LhsScalar>::value, "");
+  static_assert(std::is_floating_point<RhsScalar>::value, "");
+  static_assert(std::is_floating_point<AccumScalar>::value, "");
+  static_assert(std::is_floating_point<DstScalar>::value, "");
+};
+
+}  // namespace detail
+
+// Validates overall consistency of all the parameters taken by a Gemm call:
+// the 3 MatrixParams and the GemmParams.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+void ValidateParams(
+    const MatrixParams<LhsScalar>& lhs_params,
+    const MatrixParams<RhsScalar>& rhs_params,
+    const MatrixParams<DstScalar>& dst_params,
+    const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params) {
+  (void)detail::ValidateTypes<LhsScalar, RhsScalar, AccumScalar, DstScalar,
+                              quantization_flavor>();
+  ValidateGemmParams(params);
+  // For now, Gemm only supports this particular combination of storage orders.
+  // Actually the generic ruy path already supports all combinations (with
+  // various performance penalties). On the other hand, gemmlowp and Eigen
+  // paths would require more source code and larger binary code to handle
+  // other combinations (because orders are template parameters in gemmlowp
+  // and Eigen). Since this is TFLite's own internal Gemm library, there is
+  // no point in supporting more than what TFLite currently uses, and that
+  // is for now this single combination.
+  TFLITE_DCHECK(lhs_params.order == Order::kRowMajor);
+  TFLITE_DCHECK(rhs_params.order == Order::kColMajor);
+  TFLITE_DCHECK(dst_params.order == Order::kColMajor);
+}
+
+}  // namespace cpu_backend_gemm
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_PARAMS_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h b/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h
new file mode 100644
index 0000000..abee370
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_RUY_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_RUY_H_
+
+#include "tensorflow/lite/experimental/ruy/ruy.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+
+namespace tflite {
+namespace cpu_backend_gemm {
+namespace detail {
+
+// Fills *dst with the shape, storage order, stride, data pointer and zero
+// point described by params / data_ptr.
+template <typename Scalar, typename DataPointer>
+void MakeRuyMatrix(const MatrixParams<Scalar>& params, DataPointer data_ptr,
+                   ruy::Matrix<Scalar>* dst) {
+  const bool col_major = params.order == Order::kColMajor;
+  auto& layout = dst->layout;
+  layout.rows = params.rows;
+  layout.cols = params.cols;
+  layout.order = col_major ? ruy::Order::kColMajor : ruy::Order::kRowMajor;
+  // Stride between consecutive columns (resp. rows): no padding is assumed.
+  layout.stride = col_major ? params.rows : params.cols;
+  // ruy::Matrix::data is a ConstCheckingPtr rather than a plain pointer, so
+  // it matters whether a Scalar* or a const Scalar* gets assigned here.
+  dst->data = data_ptr;
+  dst->zero_point = params.zero_point;
+}
+
+template <typename GemmParamsType, typename RuySpecType>
+void MakeRuySpec(const GemmParamsType& params, RuySpecType* ruy_spec) {
+  // Copies multipliers, bias and clamp bounds from params into *ruy_spec.
+  // The Gemm API entry point has already validated params; checking again
+  // here, at the point of use, doesn't hurt.
+  ValidateGemmParams(params);
+
+  ruy_spec->multiplier_fixedpoint = params.multiplier_fixedpoint;
+  ruy_spec->multiplier_exponent = params.multiplier_exponent;
+  ruy_spec->multiplier_fixedpoint_perchannel =
+      params.multiplier_fixedpoint_perchannel;
+  ruy_spec->multiplier_exponent_perchannel =
+      params.multiplier_exponent_perchannel;
+  ruy_spec->bias = params.bias;
+  ruy_spec->clamp_min = params.clamp_min;
+  ruy_spec->clamp_max = params.clamp_max;
+}
+
+// Gemm implementation that forwards the whole computation to ruy::Mul,
+// instantiated with ruy::kAllPaths.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+struct GemmImplUsingRuy {
+  static void Run(
+      const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
+      const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data,
+      const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
+      const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+      CpuBackendContext* context) {
+    // Translate the Gemm parameters first, then wrap the three operands.
+    ruy::BasicSpec<AccumScalar, DstScalar> spec;
+    MakeRuySpec(params, &spec);
+    ruy::Matrix<LhsScalar> lhs;
+    MakeRuyMatrix(lhs_params, lhs_data, &lhs);
+    ruy::Matrix<RhsScalar> rhs;
+    MakeRuyMatrix(rhs_params, rhs_data, &rhs);
+    ruy::Matrix<DstScalar> dst;
+    MakeRuyMatrix(dst_params, dst_data, &dst);
+    ruy::Mul<ruy::kAllPaths>(lhs, rhs, spec, context->ruy_context(), &dst);
+  }
+};
+
+}  // namespace detail
+}  // namespace cpu_backend_gemm
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_RUY_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
new file mode 100644
index 0000000..5f8210f
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
@@ -0,0 +1,628 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
+
+#include <algorithm>
+#include <cstdarg>
+#include <limits>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/ruy/ruy.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+
+namespace tflite {
+
+namespace {
+
+using cpu_backend_gemm::Gemm;
+using cpu_backend_gemm::GemmParams;
+using cpu_backend_gemm::MatrixParams;
+using cpu_backend_gemm::QuantizationFlavor;
+
+// Formats a vector as "{ v0, v1}" ("{}" when empty) for failure messages.
+// Elements are streamed as doubles, so 8-bit scalars print as numbers.
+template <typename Scalar>
+std::string ToString(const std::vector<Scalar>& vector) {
+  if (vector.empty()) return "{}";
+  std::stringstream s;
+  // Range-for avoids comparing a signed index against the unsigned size().
+  const char* separator = "{ ";
+  for (const Scalar& value : vector) {
+    s << separator << static_cast<double>(value);
+    separator = ", ";
+  }
+  s << "}";
+  return s.str();
+}
+
+// Fills *vector with `size` pseudorandom values. A new, default-seeded engine
+// is used on every call, so the result does not depend on how often or in
+// which order this function has been called. Values are clamped to the
+// representable range of Scalar.
+template <typename Scalar>
+void MakeDeterministicPseudoRandomVector(int size,
+                                         std::vector<Scalar>* vector) {
+  using Limits = std::numeric_limits<Scalar>;
+  constexpr bool kIsFloat = std::is_floating_point<Scalar>::value;
+  const double scalar_lowest = static_cast<double>(Limits::lowest());
+  const double scalar_max = static_cast<double>(Limits::max());
+
+  // A fresh engine per call makes the values independent of invocation order;
+  // otherwise test results would be affected by e.g. test filtering.
+  std::default_random_engine engine;
+  // The first output of the engine is discarded.
+  (void)engine();
+
+  // std::uniform*_distribution is deliberately avoided: the values that it
+  // generates are implementation-defined. Scale the raw engine output instead.
+  const double engine_min = static_cast<double>(engine.min());
+  const double engine_max = static_cast<double>(engine.max());
+  const double result_min = kIsFloat ? -1.0 : std::max(-256., scalar_lowest);
+  const double result_max = kIsFloat ? 1.0 : std::min(256., scalar_max);
+  const double scale = (result_max - result_min) / (engine_max - engine_min);
+
+  vector->resize(size);
+  for (Scalar& element : *vector) {
+    // NOTE(review): result_min only enters through the range width above; it
+    // is not added as an offset here, so values start at 0. Confirm intent.
+    const double raw = scale * (engine() - engine_min);
+    element = static_cast<Scalar>(
+        std::min(std::max(raw, scalar_lowest), scalar_max));
+  }
+}
+
+// Sets *vector to {1, 2, ..., size}, each value converted to Scalar.
+template <typename Scalar>
+void MakeVectorFilledWithConsecutiveInts(int size,
+                                         std::vector<Scalar>* vector) {
+  vector->resize(size);
+  EXPECT_LE(size, std::numeric_limits<Scalar>::max());
+  int next_value = 0;
+  for (Scalar& element : *vector) element = static_cast<Scalar>(++next_value);
+}
+
+template <typename Scalar>
+Scalar Median(const std::vector<Scalar>& vector) {  // Upper median if even.
+  EXPECT_GT(vector.size(), 0);
+  std::vector<Scalar> sorted(vector);  // Sort a copy; leave the input intact.
+  std::sort(sorted.begin(), sorted.end());
+  return sorted[sorted.size() / 2];
+}
+
+// Returns the median of the absolute values of the input, computed in
+// double precision: element size() / 2 of the sorted magnitudes (the upper
+// median when the size is even).
+template <typename Scalar>
+double MedianAbs(const std::vector<Scalar>& vector) {
+  EXPECT_GT(vector.size(), 0);
+  std::vector<double> magnitudes(vector.begin(), vector.end());
+  for (double& magnitude : magnitudes) magnitude = std::abs(magnitude);
+  std::sort(magnitudes.begin(), magnitudes.end());
+  return magnitudes[magnitudes.size() / 2];
+}
+
+// Writes to *dst a copy of src with every element limited to the range
+// [clamp_min, clamp_max]; clamp_min takes precedence if the bounds cross.
+template <typename Scalar>
+void Clamp(const std::vector<Scalar>& src, Scalar clamp_min, Scalar clamp_max,
+           std::vector<Scalar>* dst) {
+  *dst = src;
+  for (Scalar& v : *dst) v = std::max(std::min(v, clamp_max), clamp_min);
+}
+
+template <typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+void Clamp(const GemmParams<AccumScalar, DstScalar, quantization_flavor>& src,
+           DstScalar clamp_min, DstScalar clamp_max,
+           GemmParams<AccumScalar, DstScalar, quantization_flavor>* dst) {
+  *dst = src;  // Everything except the clamp bounds is taken from src.
+  dst->clamp_min = clamp_min;
+  dst->clamp_max = clamp_max;
+}
+
+struct ErrorStats {
+  int size;              // Number of elements in the actual vector.
+  double scale_factor;   // Largest |expected| value.
+  double max_abs_diff;   // Largest |actual - expected|.
+  double mean_abs_diff;  // Mean of |actual - expected|.
+  double abs_mean_diff;  // |mean of (actual - expected)|, i.e. the bias.
+};
+
+// Accumulates element-wise error statistics of `actual` vs `expected` into
+// *error_stats. A size mismatch is reported as a non-fatal test failure and
+// only the common prefix is compared, so neither vector is read out of bounds.
+template <typename Scalar>
+void ComputeErrorStats(const std::vector<Scalar>& actual,
+                       const std::vector<Scalar>& expected,
+                       ErrorStats* error_stats) {
+  EXPECT_EQ(actual.size(), expected.size());
+  const int count = static_cast<int>(std::min(actual.size(), expected.size()));
+  double max_abs_expected = 0, max_abs_diff = 0, sum_abs_diff = 0, sum_diff = 0;
+  for (int i = 0; i < count; i++) {
+    const double expected_val = static_cast<double>(expected[i]);
+    const double diff = static_cast<double>(actual[i]) - expected_val;
+    max_abs_expected = std::max(max_abs_expected, std::abs(expected_val));
+    max_abs_diff = std::max(max_abs_diff, std::abs(diff));
+    sum_abs_diff += std::abs(diff);
+    sum_diff += diff;
+  }
+  error_stats->size = actual.size();
+  error_stats->scale_factor = max_abs_expected;
+  error_stats->max_abs_diff = max_abs_diff;
+  error_stats->mean_abs_diff = sum_abs_diff / actual.size();
+  error_stats->abs_mean_diff = std::abs(sum_diff / actual.size());
+}
+
+template <typename AccumScalar, typename DstScalar>
+bool CheckErrorStats(const ErrorStats& error_stats, int accumulation_depth) {
+  double tolerated_relative_max_abs_diff = 0;
+  double tolerated_relative_mean_abs_diff = 0;
+  double tolerated_relative_abs_mean_diff = 0;
+  // Relative tolerances; multiplied by error_stats.scale_factor further down.
+  double inverse_size = 1. / error_stats.size;
+
+  if (std::is_floating_point<AccumScalar>::value) {
+    // Somewhat naive requirement: the worst case should be epsilons
+    // adding up towards the same direction, on values of same magnitude.
+    tolerated_relative_max_abs_diff =
+        accumulation_depth * std::numeric_limits<DstScalar>::epsilon();
+    // Naive interpretation of the Central Limit Theorem is the rationale
+    // for the sqrt here. We haven't even worked out the correct scale factor,
+    // or how applicable that theorem is here (the random variables being added
+    // might not be mutually independent).
+    tolerated_relative_mean_abs_diff =
+        std::sqrt(static_cast<double>(accumulation_depth)) *
+        std::numeric_limits<DstScalar>::epsilon();
+    // Unbiasing requirement: we require the bias, abs_mean_diff, to be much
+    // smaller than the mean_abs_diff, except when there are very few values.
+    tolerated_relative_abs_mean_diff =
+        tolerated_relative_mean_abs_diff * std::sqrt(inverse_size);
+  } else {
+    // In quantized arithmetic, tolerate minor rounding differences, resulting
+    // in off-by-one errors (tolerated_relative_max_abs_diff = 1), as long
+    // as they are rare (tolerated_relative_mean_abs_diff) and unbiased
+    // (tolerated_relative_abs_mean_diff).
+    tolerated_relative_max_abs_diff = 1;
+    // Naively require mean_abs_diff and abs_mean_diff to converge to zero
+    // as size gets large. We don't know at all how quick that convergence
+    // should be: this is just based on trial-and-error and striking a
+    // compromise between something that works and something that's simple
+    // enough code that doesn't feel too ad-hoc. As above in the float path,
+    // abs_mean_diff is subject to a stricter requirement as it is a bias.
+    tolerated_relative_mean_abs_diff = std::sqrt(inverse_size);
+    tolerated_relative_abs_mean_diff = inverse_size;
+  }
+
+  double tolerated_max_abs_diff =
+      tolerated_relative_max_abs_diff * error_stats.scale_factor;
+  double tolerated_mean_abs_diff =
+      tolerated_relative_mean_abs_diff * error_stats.scale_factor;
+  double tolerated_abs_mean_diff =
+      tolerated_relative_abs_mean_diff * error_stats.scale_factor;
+
+  EXPECT_LE(error_stats.max_abs_diff, tolerated_max_abs_diff);
+  EXPECT_LE(error_stats.mean_abs_diff, tolerated_mean_abs_diff);
+  EXPECT_LE(error_stats.abs_mean_diff, tolerated_abs_mean_diff);
+  // Also return the verdict, so that callers can add context to a failure.
+  return error_stats.max_abs_diff <= tolerated_max_abs_diff &&
+         error_stats.mean_abs_diff <= tolerated_mean_abs_diff &&
+         error_stats.abs_mean_diff <= tolerated_abs_mean_diff;
+}
+
+// Expects `actual` to match `expected` within CheckErrorStats' tolerances.
+template <typename AccumScalar, typename DstScalar>
+void CheckErrorForAccumulation(int accumulation_depth,
+                               const std::vector<DstScalar>& actual,
+                               const std::vector<DstScalar>& expected) {
+  ErrorStats stats;
+  ComputeErrorStats(actual, expected, &stats);
+  EXPECT_TRUE((CheckErrorStats<AccumScalar, DstScalar>(stats,
+                                                       accumulation_depth)))
+      << "Actual vector\n" << ToString(actual)
+      << "\ndiffers from expected vector\n" << ToString(expected) << "\n";
+}
+
+// Runs Gemm three times, each time into *dst_data, and checks every result:
+// first with `params` as given, against `expected`; then with clamp_max, and
+// finally with clamp_min, set to the median of `expected`, against `expected`
+// clamped the same way.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar, QuantizationFlavor quantization_flavor>
+void PerformGemmThenCompareResultsThenAgainWithClamping(
+    const MatrixParams<LhsScalar>& lhs_params,
+    const std::vector<LhsScalar>& lhs_data,
+    const MatrixParams<RhsScalar>& rhs_params,
+    const std::vector<RhsScalar>& rhs_data,
+    const MatrixParams<DstScalar>& dst_params, std::vector<DstScalar>* dst_data,
+    const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params,
+    const std::vector<DstScalar>& expected,
+    CpuBackendContext* cpu_backend_context) {
+  const int accumulation_depth = lhs_params.cols;
+  // Baseline run, with `params` exactly as provided by the caller.
+  Gemm(lhs_params, lhs_data.data(), rhs_params, rhs_data.data(), dst_params,
+       dst_data->data(), params, cpu_backend_context);
+  CheckErrorForAccumulation<AccumScalar>(accumulation_depth, *dst_data,
+                                         expected);
+
+  // Repeat with the output clamped at the median of `expected`: first from
+  // above (clamp_max = median), then from below (clamp_min = median).
+  const DstScalar median = Median(expected);
+  const DstScalar clamp_bounds[2][2] = {
+      {std::numeric_limits<DstScalar>::lowest(), median},
+      {median, std::numeric_limits<DstScalar>::max()}};
+  for (const auto& bounds : clamp_bounds) {
+    std::vector<DstScalar> clamped_expected;
+    GemmParams<AccumScalar, DstScalar, quantization_flavor> clamped_params;
+    Clamp(expected, bounds[0], bounds[1], &clamped_expected);
+    Clamp(params, bounds[0], bounds[1], &clamped_params);
+    Gemm(lhs_params, lhs_data.data(), rhs_params, rhs_data.data(), dst_params,
+         dst_data->data(), clamped_params, cpu_backend_context);
+    CheckErrorForAccumulation<AccumScalar>(accumulation_depth, *dst_data,
+                                           clamped_expected);
+  }
+}
+
+// When generating testcases for a quantized GEMM, it's not trivial to
+// pick multiplier exponents: a too low value will result in too many zeros,
+// a too high value will result in too many large clamped values, in both
+// cases testing coverage is harmed. Therefore to ensure good testing coverage
+// we must find a multiplier exponent that's just right.  It would be possible
+// to do so by analysis of the random distribution of values in the result
+// matrix. That however would require some mathematical work that we haven't
+// done so far. Until that is done, the best that we can do is to search for
+// a good exponent value by trial-and-error. This is expensive, as each try
+// requires computing a whole GEMM. This is thus probably a major contribution
+// to the overall latency of this test. To partially mitigate that,
+// we use a bisection to reduce the required number of tries.
+//
+// This function is recursive. The bisect_min and bisect_max arguments
+// are the current bisection bounds. It performs a Gemm with the mid-point,
+// named bisect_mid, as the multiplier exponent. Based on whether the values
+// in the resulting matrix are rather too low or too large in absolute
+// value, it then recurses into the corresponding half of the bisection range.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+int BisectReasonableMultiplierExponent(
+    int bisect_min, int bisect_max, const MatrixParams<LhsScalar>& lhs_params,
+    const std::vector<LhsScalar>& lhs_data,
+    const MatrixParams<RhsScalar>& rhs_params,
+    const std::vector<RhsScalar>& rhs_data,
+    const MatrixParams<DstScalar>& dst_params, std::vector<DstScalar>* dst_data,
+    const GemmParams<AccumScalar, DstScalar>& params,
+    CpuBackendContext* cpu_backend_context) {
+  if (bisect_min == bisect_max) {
+    return bisect_min;
+  }
+  // Compute the midpoint as the floor of the average of bisect_min and
+  // bisect_max. As C++ integer division is rounding towards zero and our values
+  // may be of any sign, it is not trivial to implement this using only integer
+  // arithmetic.
+  int bisect_mid =
+      static_cast<int>(std::floor(0.5 * (bisect_min + bisect_max)));
+  GemmParams<AccumScalar, DstScalar> params_copy(params);
+  params_copy.multiplier_exponent = bisect_mid;
+  double clamp_abs = std::max(std::abs(static_cast<double>(params.clamp_min)),
+                              std::abs(static_cast<double>(params.clamp_max)));
+  Gemm(lhs_params, lhs_data.data(), rhs_params, rhs_data.data(), dst_params,
+       dst_data->data(), params_copy, cpu_backend_context);
+  double median_abs = MedianAbs(*dst_data);
+  if (median_abs < 0.25 * clamp_abs) {
+    return BisectReasonableMultiplierExponent(
+        bisect_mid + 1, bisect_max, lhs_params, lhs_data, rhs_params, rhs_data,
+        dst_params, dst_data, params_copy, cpu_backend_context);
+  } else {
+    return BisectReasonableMultiplierExponent(
+        bisect_min, bisect_mid, lhs_params, lhs_data, rhs_params, rhs_data,
+        dst_params, dst_data, params_copy, cpu_backend_context);
+  }
+}
+
+// Computes a Gemm with ruy's reference code path (ruy::Path::kReference);
+// TestSomeGemm uses the result as expected values when it has no golden data.
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+void ReferenceGemm(const MatrixParams<LhsScalar>& lhs_params,
+                   const LhsScalar* lhs_data,
+                   const MatrixParams<RhsScalar>& rhs_params,
+                   const RhsScalar* rhs_data,
+                   const MatrixParams<DstScalar>& dst_params,
+                   DstScalar* dst_data,
+                   const GemmParams<AccumScalar, DstScalar>& params,
+                   CpuBackendContext* context) {
+  namespace detail = cpu_backend_gemm::detail;
+  ruy::BasicSpec<AccumScalar, DstScalar> spec;
+  detail::MakeRuySpec(params, &spec);
+  ruy::Matrix<LhsScalar> lhs;
+  detail::MakeRuyMatrix(lhs_params, lhs_data, &lhs);
+  ruy::Matrix<RhsScalar> rhs;
+  detail::MakeRuyMatrix(rhs_params, rhs_data, &rhs);
+  ruy::Matrix<DstScalar> dst;
+  detail::MakeRuyMatrix(dst_params, dst_data, &dst);
+  ruy::Mul<ruy::Path::kReference>(lhs, rhs, spec, context->ruy_context(), &dst);
+}
+
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
+          typename DstScalar>
+void TestSomeGemm(int rows, int depth, int cols,
+                  const std::vector<DstScalar>& golden) {
+  CpuBackendContext cpu_backend_context;
+  std::default_random_engine random_engine;
+  cpu_backend_context.set_max_num_threads(1 + (random_engine() % 8));
+  // A non-empty `golden` selects fixed inputs and is used as expected output.
+  const bool use_golden = !golden.empty();
+
+  std::vector<LhsScalar> lhs_data;
+  std::vector<RhsScalar> rhs_data;
+  std::vector<AccumScalar> bias_data;
+  std::vector<DstScalar> dst_data;
+  if (use_golden) {
+    MakeVectorFilledWithConsecutiveInts(rows * depth, &lhs_data);
+    MakeVectorFilledWithConsecutiveInts(depth * cols, &rhs_data);
+    MakeVectorFilledWithConsecutiveInts(rows, &bias_data);
+  } else {
+    MakeDeterministicPseudoRandomVector(rows * depth, &lhs_data);
+    MakeDeterministicPseudoRandomVector(depth * cols, &rhs_data);
+    MakeDeterministicPseudoRandomVector(rows, &bias_data);
+  }
+  MakeDeterministicPseudoRandomVector(rows * cols, &dst_data);
+
+  MatrixParams<LhsScalar> lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.rows = rows;
+  lhs_params.cols = depth;
+  if (!std::is_floating_point<LhsScalar>::value) {
+    lhs_params.zero_point = 1;
+    if (!use_golden) {
+      lhs_params.zero_point += random_engine() % 8;
+    }
+  }
+
+  MatrixParams<RhsScalar> rhs_params;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.rows = depth;
+  rhs_params.cols = cols;
+  if (!std::is_floating_point<RhsScalar>::value) {
+    rhs_params.zero_point = 1;
+    if (!use_golden) {
+      rhs_params.zero_point += random_engine() % 8;
+    }
+  }
+
+  MatrixParams<DstScalar> dst_params;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.rows = rows;
+  dst_params.cols = cols;
+  if (!std::is_floating_point<DstScalar>::value) {
+    dst_params.zero_point = 1;
+    if (!use_golden) {
+      dst_params.zero_point += random_engine() % 8;
+    }
+  }
+
+  GemmParams<AccumScalar, DstScalar> params;
+  if (use_golden || !std::is_floating_point<AccumScalar>::value ||
+      (random_engine() % 2)) {
+    // cpu_backend_gemm supports bias=null only in the float path. Test that
+    // in 50% of float testcases.
+    params.bias = bias_data.data();
+  }
+  if (!std::is_floating_point<AccumScalar>::value) {
+    // Some large int32 value. Not being a multiple of a large
+    // power of two helps testing rounding behavior.
+    params.multiplier_fixedpoint = 1234567890;
+    // Now find a suitable value for multiplier_exponent.
+    // It needs to be low enough for a substantial amount of dst values
+    // to avoid getting clamped.
+    int bisect_min = -8 * static_cast<int>(sizeof(AccumScalar));
+    // We don't increase test coverage by using positive multipliers,
+    // and using very large positive multipliers may at the moment
+    // result in overflow in some paths.
+    // TODO(benoitjacob): fix that.
+    int bisect_max = 0;
+    params.multiplier_exponent = BisectReasonableMultiplierExponent(
+        bisect_min, bisect_max, lhs_params, lhs_data, rhs_params, rhs_data,
+        dst_params, &dst_data, params, &cpu_backend_context);
+  }
+
+  std::vector<DstScalar> expected;
+  if (use_golden) {
+    EXPECT_EQ(golden.size(), dst_data.size());
+    expected = golden;
+  } else {
+    expected.resize(dst_data.size());
+    ReferenceGemm(lhs_params, lhs_data.data(), rhs_params, rhs_data.data(),
+                  dst_params, expected.data(), params, &cpu_backend_context);
+  }
+
+  PerformGemmThenCompareResultsThenAgainWithClamping(
+      lhs_params, lhs_data, rhs_params, rhs_data, dst_params, &dst_data, params,
+      expected, &cpu_backend_context);
+
+  if (!std::is_floating_point<AccumScalar>::value) {
+    // Try with per-channel quantized multipliers. Just a naive check
+    // duplicating the same multiplier --- would already catch most bugs.
+    std::vector<AccumScalar> multiplier_fixedpoint_perchannel(
+        rows, params.multiplier_fixedpoint);
+    std::vector<int> multiplier_exponent_perchannel(rows,
+                                                    params.multiplier_exponent);
+    static constexpr QuantizationFlavor perchannel_flavor =
+        std::is_floating_point<AccumScalar>::value
+            ? QuantizationFlavor::kFloatingPoint
+            : QuantizationFlavor::kIntegerWithPerRowMultiplier;
+    GemmParams<AccumScalar, DstScalar, perchannel_flavor> params_perchannel;
+    params_perchannel.bias = params.bias;
+    params_perchannel.clamp_min = params.clamp_min;
+    params_perchannel.clamp_max = params.clamp_max;
+    params_perchannel.multiplier_fixedpoint_perchannel =
+        multiplier_fixedpoint_perchannel.data();
+    params_perchannel.multiplier_exponent_perchannel =
+        multiplier_exponent_perchannel.data();
+    PerformGemmThenCompareResultsThenAgainWithClamping(
+        lhs_params, lhs_data, rhs_params, rhs_data, dst_params, &dst_data,
+        params_perchannel, expected, &cpu_backend_context);
+  }
+}
+
+TEST(CpuBackendGemmSimpleTestAgainstGolden, Float) {  // 2x3 times 3x4.
+  TestSomeGemm<float, float, float, float>(2, 3, 4,
+                                           {15, 34, 33, 79, 51, 124, 69, 169});
+}
+
+TEST(CpuBackendGemmSimpleTestAgainstGolden, Uint8) {  // 5x2 times 2x3.
+  TestSomeGemm<std::uint8_t, std::uint8_t, std::int32_t, std::uint8_t>(
+      5, 2, 3, {2, 4, 6, 7, 9, 3, 10, 16, 22, 29, 4, 15, 26, 37, 48});
+}
+
+TEST(CpuBackendGemmSimpleTestAgainstGolden, Int8) {  // 2x6 times 6x3.
+  TestSomeGemm<std::int8_t, std::int8_t, std::int32_t, std::int8_t>(
+      2, 6, 3, {13, 32, 31, 81, 50, 127});
+}
+
+TEST(CpuBackendGemmSimpleTestAgainstGolden, Int8Int16) {  // 3x5 times 5x4.
+  TestSomeGemm<std::int8_t, std::int8_t, std::int32_t, std::int16_t>(
+      3, 5, 4, {19, 48, 77, 48, 149, 250, 76, 249, 422, 105, 350, 595});
+}
+
+template <typename tLhsScalar, typename tRhsScalar, typename tAccumScalar,
+          typename tDstScalar>
+struct TypesTuple {  // Bundles the four scalar types of one Gemm testcase.
+  using LhsScalar = tLhsScalar;
+  using RhsScalar = tRhsScalar;
+  using AccumScalar = tAccumScalar;
+  using DstScalar = tDstScalar;
+};
+
+// Runs TestSomeGemm, without golden values, on every (rows, depth, cols)
+// triple in `shapes`, using the scalar types bundled in TypesTupleType.
+template <typename TypesTupleType>
+void TestRandomGemms(const std::vector<std::tuple<int, int, int>>& shapes) {
+  using Lhs = typename TypesTupleType::LhsScalar;
+  using Rhs = typename TypesTupleType::RhsScalar;
+  using Accum = typename TypesTupleType::AccumScalar;
+  using Dst = typename TypesTupleType::DstScalar;
+  for (const std::tuple<int, int, int>& shape : shapes) {
+    int rows, depth, cols;
+    std::tie(rows, depth, cols) = shape;
+    TestSomeGemm<Lhs, Rhs, Accum, Dst>(rows, depth, cols, {});
+  }
+}
+
+template <typename TypesTupleType>
+class CpuBackendGemmTest : public testing::Test {};  // Stateless fixture.
+
+TYPED_TEST_SUITE_P(CpuBackendGemmTest);
+
+typedef ::testing::Types<
+    TypesTuple<float, float, float, float>,
+    TypesTuple<std::uint8_t, std::uint8_t, std::int32_t, std::uint8_t>,
+    TypesTuple<std::int8_t, std::int8_t, std::int32_t, std::int8_t>,
+    TypesTuple<std::int8_t, std::int8_t, std::int32_t, std::int16_t>,
+    TypesTuple<std::uint8_t, std::uint8_t, std::int32_t, std::int8_t>>
+    CpuBackendGemmTestInstantiations;
+// Each TYPED_TEST of this suite runs once per TypesTuple listed above.
+TYPED_TEST_SUITE(CpuBackendGemmTest, CpuBackendGemmTestInstantiations);
+
+// Square problems: rows == depth == cols.
+// Covers every size from 1 to 49.
+TYPED_TEST(CpuBackendGemmTest, Square) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 49; ++n) shapes.emplace_back(n, n, n);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Square problems whose size is a power of two.
+// Covers sizes 64 and 128.
+TYPED_TEST(CpuBackendGemmTest, SquarePowerOfTwo) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n : {64, 128}) shapes.emplace_back(n, n, n);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Square LHS times a single-column RHS.
+// Covers every size from 1 to 199.
+TYPED_TEST(CpuBackendGemmTest, MatrixTimesVector) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 199; ++n) shapes.emplace_back(n, n, 1);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Single-row LHS times a square RHS.
+// Covers every size from 1 to 199.
+TYPED_TEST(CpuBackendGemmTest, VectorTimesMatrix) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 199; ++n) shapes.emplace_back(1, n, n);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Square LHS (sizes 1 to 49) times an RHS with only 2, 3, 4 or 8 columns.
+TYPED_TEST(CpuBackendGemmTest, MatrixTimesNarrow) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 49; ++n) {
+    for (int narrow_cols : {2, 3, 4, 8}) {
+      shapes.emplace_back(n, n, narrow_cols);
+    }
+  }
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+TYPED_TEST(CpuBackendGemmTest, Rectangular) {  // Mildly non-square shapes.
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 49; ++n) {
+    shapes.emplace_back(n, n + 5, n + 1);
+    shapes.emplace_back(n + 10, n + 2, n);
+  }
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+TYPED_TEST(CpuBackendGemmTest, HighlyRectangular) {  // One dimension is long.
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n : {1, 10, 100, 1000, 10000}) {
+    shapes.emplace_back(n, 10, 10);
+    shapes.emplace_back(10, n, 10);
+    shapes.emplace_back(10, 10, n);
+  }
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Single-row LHS times single-column RHS, i.e. a 1x1 result.
+// Covers every depth from 1 to 199.
+TYPED_TEST(CpuBackendGemmTest, InnerProduct) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 199; ++n) shapes.emplace_back(1, n, 1);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+// Single-column LHS times single-row RHS, i.e. depth 1.
+// Covers every result size from 1x1 to 99x99.
+TYPED_TEST(CpuBackendGemmTest, OuterProduct) {
+  std::vector<std::tuple<int, int, int>> shapes;
+  for (int n = 1; n <= 99; ++n) shapes.emplace_back(n, 1, n);
+  TestRandomGemms<TypeParam>(shapes);
+}
+
+}  // namespace
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);  // Parses googletest's flags.
+  return RUN_ALL_TESTS();  // Non-zero if any test failed.
+}
diff --git a/tensorflow/lite/kernels/cpu_backend_support.cc b/tensorflow/lite/kernels/cpu_backend_support.cc
new file mode 100644
index 0000000..5d7f41a
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_support.cc
@@ -0,0 +1,111 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
+
+#include <memory>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace cpu_backend_support {
+
+namespace {
+
+// TODO(b/130950871) we probably shouldn't be using any reference-counting
+// but this is an existing idiom.
+//
+// External context that owns the CpuBackendContext and counts the
+// IncrementUsageCounter() calls not yet matched by DecrementUsageCounter().
+struct RefCountedCpuBackendContext : public TfLiteExternalContext {
+  std::unique_ptr<CpuBackendContext> cpu_backend_context;
+  int num_references = 0;
+};
+
+// Returns the external context registered under kTfLiteCpuBackendContext, cast
+// to our wrapper type. Callers treat nullptr as "nothing registered yet".
+RefCountedCpuBackendContext* GetCpuBackendContext(TfLiteContext* context) {
+  return static_cast<RefCountedCpuBackendContext*>(
+      context->GetExternalContext(context, kTfLiteCpuBackendContext));
+}
+
+// Forwards context->recommended_num_threads to `cpu_backend_context`, unless it
+// is -1, which is treated as "no explicit recommendation" and leaves the
+// current setting untouched.
+void ApplyRecommendedNumThreads(const TfLiteContext* context,
+                                CpuBackendContext* cpu_backend_context) {
+  if (context->recommended_num_threads != -1) {
+    cpu_backend_context->set_max_num_threads(context->recommended_num_threads);
+  }
+}
+
+// Installed as the external context's Refresh hook. Re-applies the recommended
+// thread count; does nothing when no CpuBackendContext is registered.
+TfLiteStatus Refresh(TfLiteContext* context) {
+  auto* refcounted = GetCpuBackendContext(context);
+  if (refcounted != nullptr) {
+    ApplyRecommendedNumThreads(context, refcounted->cpu_backend_context.get());
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Adds one reference to the context's CpuBackendContext. The first call creates
+// the context, applies the recommended thread count and registers it as the
+// kTfLiteCpuBackendContext external context.
+void IncrementUsageCounter(TfLiteContext* context) {
+  RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context);
+  if (refcounted == nullptr) {
+    refcounted = new RefCountedCpuBackendContext;
+    refcounted->type = kTfLiteCpuBackendContext;
+    refcounted->Refresh = Refresh;
+    refcounted->cpu_backend_context.reset(new CpuBackendContext);
+    ApplyRecommendedNumThreads(context, refcounted->cpu_backend_context.get());
+    refcounted->num_references = 0;
+    context->SetExternalContext(context, kTfLiteCpuBackendContext, refcounted);
+  }
+  refcounted->num_references++;
+}
+
+// Drops one reference. When the count reaches zero the CpuBackendContext is
+// destroyed and unregistered. Reports a fatal error via TF_LITE_FATAL when no
+// context is registered, i.e. without a prior IncrementUsageCounter().
+void DecrementUsageCounter(TfLiteContext* context) {
+  RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context);
+  if (refcounted == nullptr) {
+    TF_LITE_FATAL(
+        "Call to DecrementUsageCounter() not preceded by "
+        "IncrementUsageCounter()");
+  }
+  if (--refcounted->num_references == 0) {
+    delete refcounted;
+    context->SetExternalContext(context, kTfLiteCpuBackendContext, nullptr);
+  }
+}
+
+// Returns the CpuBackendContext registered on `context`. The pointer stays
+// owned by the registered wrapper. Reports a fatal error via TF_LITE_FATAL when
+// no context is registered.
+CpuBackendContext* GetFromContext(TfLiteContext* context) {
+  RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context);
+  if (refcounted == nullptr) {
+    TF_LITE_FATAL(
+        "Call to GetFromContext() not preceded by IncrementUsageCounter()");
+  }
+  return refcounted->cpu_backend_context.get();
+}
+
+}  // namespace cpu_backend_support
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/cpu_backend_support.h b/tensorflow/lite/kernels/cpu_backend_support.h
new file mode 100644
index 0000000..e7cec5c
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_support.h
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+
+namespace tflite {
+
+namespace cpu_backend_support {
+// Returns the context's CpuBackendContext; fatal if none has been registered.
+CpuBackendContext* GetFromContext(TfLiteContext* context);
+// Adds a reference; the first call creates and registers the context.
+void IncrementUsageCounter(TfLiteContext* context);
+// Drops a reference; the last one destroys and unregisters the context.
+void DecrementUsageCounter(TfLiteContext* context);
+
+}  // namespace cpu_backend_support
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_threadpool.h b/tensorflow/lite/kernels/cpu_backend_threadpool.h
new file mode 100644
index 0000000..d1e1d14
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_threadpool.h
@@ -0,0 +1,60 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_THREADPOOL_H_
+#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_THREADPOOL_H_
+
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
+#ifdef TFLITE_WITH_RUY
+#include "tensorflow/lite/experimental/ruy/context.h"
+#include "tensorflow/lite/experimental/ruy/thread_pool.h"
+#else
+#include "public/gemmlowp.h"
+#endif
+
+namespace tflite {
+namespace cpu_backend_threadpool {
+
+#ifdef TFLITE_WITH_RUY
+
+using Task = ruy::Task;
+// Runs `tasks` on ruy's workers pool; tasks_count <= max_num_threads().
+template <typename TaskType>
+void Execute(int tasks_count, TaskType* tasks,
+             CpuBackendContext* cpu_backend_context) {
+  TFLITE_DCHECK_LE(tasks_count, cpu_backend_context->max_num_threads());
+  cpu_backend_context->ruy_context()->workers_pool.Execute(tasks_count, tasks);
+}
+
+#else  // not TFLITE_WITH_RUY
+
+using Task = gemmlowp::Task;
+// Runs `tasks` on gemmlowp's workers pool; tasks_count <= max_num_threads().
+template <typename TaskType>
+void Execute(int tasks_count, TaskType* tasks,
+             CpuBackendContext* cpu_backend_context) {
+  TFLITE_DCHECK_LE(tasks_count, cpu_backend_context->max_num_threads());
+  cpu_backend_context->gemmlowp_context()->workers_pool()->Execute(tasks_count,
+                                                                   tasks);
+}
+
+#endif
+
+}  // namespace cpu_backend_threadpool
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_THREADPOOL_H_
diff --git a/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc
new file mode 100644
index 0000000..45208a3
--- /dev/null
+++ b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+
+namespace tflite {
+
+namespace {
+
+// Task that writes buffer[i] = i for every i in [start, end).
+class TestGenerateArrayOfIncrementingIntsTask
+    : public cpu_backend_threadpool::Task {
+ public:
+  TestGenerateArrayOfIncrementingIntsTask(int* buffer, int start, int end)
+      : buffer_(buffer), start_(start), end_(end) {}
+
+  void Run() override {
+    for (int i = start_; i < end_; i++) {
+      buffer_[i] = i;
+    }
+  }
+
+ private:
+  int* buffer_;
+  int start_;
+  int end_;
+};
+
+// Fills a `size`-element buffer with 0, 1, ..., size - 1 by splitting it into
+// `num_threads` contiguous slices, one task per slice, then checks the result.
+// Requires num_threads > 0.
+void TestGenerateArrayOfIncrementingInts(int num_threads, int size) {
+  // The buffer that our threads will write to.
+  std::vector<int> buffer(size);
+
+  // The tasks that our threads will run.
+  std::vector<TestGenerateArrayOfIncrementingIntsTask> tasks;
+  tasks.reserve(num_threads);
+
+  // Create task objects. Each slice has size / num_threads elements, except the
+  // last one, which also takes the remainder.
+  const int rough_size_per_thread = size / num_threads;
+  int start = 0;
+  for (int thread = 0; thread < num_threads; thread++) {
+    int end = start + rough_size_per_thread;
+    if (thread == num_threads - 1) {
+      end = size;
+    }
+    tasks.emplace_back(buffer.data(), start, end);
+    start = end;
+  }
+  // Compare as int: tasks.size() is unsigned and Execute() takes an int count.
+  const int tasks_count = static_cast<int>(tasks.size());
+  ASSERT_EQ(num_threads, tasks_count);
+
+  CpuBackendContext context;
+  // This set_max_num_threads is only to satisfy an assertion in Execute.
+  // What actually determines the number of threads used is the parameter
+  // passed to Execute, since Execute does 1:1 mapping of tasks to threads.
+  context.set_max_num_threads(num_threads);
+
+  // Execute tasks on the threadpool.
+  cpu_backend_threadpool::Execute(tasks_count, tasks.data(), &context);
+
+  // Check contents of the generated buffer.
+  for (int i = 0; i < size; i++) {
+    ASSERT_EQ(buffer[i], i);
+  }
+}
+
+TEST(CpuBackendThreadpoolTest, OneThreadSize100) {
+  TestGenerateArrayOfIncrementingInts(1, 100);
+}
+
+TEST(CpuBackendThreadpoolTest, ThreeThreadsSize1000000) {
+  TestGenerateArrayOfIncrementingInts(3, 1000000);
+}
+
+TEST(CpuBackendThreadpoolTest, TenThreadsSize1234567) {
+  TestGenerateArrayOfIncrementingInts(10, 1234567);
+}
+
+}  // namespace
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc
index 93c5bcb..7946a0e 100644
--- a/tensorflow/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/kernels/depthwise_conv.cc
@@ -24,7 +24,7 @@
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/kernels/gemmlowp_support.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -70,7 +70,7 @@
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  gemmlowp_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
   // Eval().
@@ -78,7 +78,7 @@
 }
 
 void Free(TfLiteContext* context, void* buffer) {
-  gemmlowp_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -138,28 +138,12 @@
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto compute_out_size = [padding](int image_size, int filter_size, int stride,
-                                    int dilation_rate) -> int {
-    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
-    return padding == kTfLitePaddingSame
-               ? (image_size + stride - 1) / stride
-               : padding == kTfLitePaddingValid
-                     ? (image_size - effective_filter_size + stride) / stride
-                     : 0;
-  };
+  int out_width, out_height;
 
-  int out_width = compute_out_size(width, filter_width, params->stride_width,
-                                   params->dilation_width_factor);
-  int out_height =
-      compute_out_size(height, filter_height, params->stride_height,
-                       params->dilation_height_factor);
-
-  data->padding.height =
-      ComputePadding(params->stride_height, params->dilation_height_factor,
-                     height, filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, params->dilation_width_factor, width,
-                     filter_width, out_width);
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width,
+      params->dilation_height_factor, params->dilation_width_factor, height,
+      width, filter_height, filter_width, padding, &out_height, &out_width);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training or
@@ -259,14 +243,12 @@
         GetTensorShape(bias), GetTensorData<int32_t>(bias),
         GetTensorShape(output), GetTensorData<uint8_t>(output));
   } else {
-    gemmlowp::GemmContext* gemmlowp_context =
-        gemmlowp_support::GetFromContext(context);
     optimized_ops::DepthwiseConv(
         op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
         GetTensorShape(filter), GetTensorData<uint8_t>(filter),
         GetTensorShape(bias), GetTensorData<int32_t>(bias),
         GetTensorShape(output), GetTensorData<uint8_t>(output),
-        gemmlowp_context);
+        cpu_backend_support::GetFromContext(context));
   }
 }
 
@@ -301,15 +283,14 @@
         GetTensorData<int32>(bias), GetTensorShape(output),
         GetTensorData<int8>(output));
   } else {
-    gemmlowp::GemmContext* gemmlowp_context =
-        gemmlowp_support::GetFromContext(context);
     optimized_integer_ops::DepthwiseConvPerChannel(
         op_params, data->per_channel_output_multiplier.data(),
         data->per_channel_output_shift.data(), GetTensorShape(input),
         GetTensorData<int8>(input), GetTensorShape(filter),
         GetTensorData<int8>(filter), GetTensorShape(bias),
         GetTensorData<int32>(bias), GetTensorShape(output),
-        GetTensorData<int8>(output), gemmlowp_context);
+        GetTensorData<int8>(output),
+        cpu_backend_support::GetFromContext(context));
   }
 }
 
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
index 2820f36..a0683b6 100644
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -39,14 +39,79 @@
 
 class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
  public:
-  // TODO(ahentz): Also test different activation types, bias, padding types,
-  // stride values.
+  BaseDepthwiseConvolutionOpModel(
+      TfLiteRegistration* registration, const TensorData& input,
+      const TensorData& filter, const TensorData& output, Padding padding_type,
+      int stride_width, int stride_height,
+      const ActivationFunctionType& fused_activation_function,
+      int dilation_factor = 1) {
+    input_ = AddInput(input);
+    filter_ = AddInput(filter);
+
+    int bias_size = GetShape(filter_)[3];  // One bias per output channel.
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      if (filter.per_channel_quantization) {
+        // Per-channel: bias_scale[i] = input.scale * filter scale of channel i.
+        std::vector<float> bias_scale(
+            filter.per_channel_quantization_scales.size());
+        std::vector<int64_t> bias_zero_points(
+            filter.per_channel_quantization_scales.size());
+        for (int i = 0; i < filter.per_channel_quantization_scales.size();
+             ++i) {
+          bias_scale[i] =
+              input.scale * filter.per_channel_quantization_scales[i];
+          bias_zero_points[i] = 0;
+        }
+        TensorData bias{TensorType_INT32,
+                        {bias_size},
+                        /*min=*/0,
+                        /*max=*/0,
+                        /*scale=*/0,
+                        /*zero_point=*/0,
+                        /*per_channel_quantization=*/true,
+                        /*per_channel_scale=*/bias_scale,
+                        /*per_channel_zero_point=*/bias_zero_points,
+                        /*channel_index=*/0};
+        bias_ = AddInput(bias);
+      } else {
+        // Per-tensor: a single bias scale = input scale * filter scale.
+        auto bias_scale = GetScale(input_) * GetScale(filter_);
+        TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+        bias_ = AddInput(bias);
+      }
+    }
+
+    output_ = AddOutput(output);
+
+    int input_depth = GetShape(input_)[3];
+    int output_depth = GetShape(filter_)[3];
+    int depth_mul = output_depth / input_depth;
+
+    SetBuiltinOp(
+        BuiltinOperator_DEPTHWISE_CONV_2D,
+        BuiltinOptions_DepthwiseConv2DOptions,
+        CreateDepthwiseConv2DOptions(
+            builder_, padding_type, stride_width, stride_height, depth_mul,
+            fused_activation_function, dilation_factor, dilation_factor)
+            .Union());
+
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_DEPTHWISE_CONV_2D, registration);
+
+    BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+  }
+
   BaseDepthwiseConvolutionOpModel(TfLiteRegistration* registration,
                                   const TensorData& input,
                                   const TensorData& filter,
                                   const TensorData& output,
-                                  Padding padding_type,
-                                  int dilation_factor = 1) {
+                                  Padding padding_type, int dilation_factor = 1,
+                                  int stride_width = 1, int stride_height = 1) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -97,9 +162,9 @@
     SetBuiltinOp(
         BuiltinOperator_DEPTHWISE_CONV_2D,
         BuiltinOptions_DepthwiseConv2DOptions,
-        CreateDepthwiseConv2DOptions(builder_, padding_type, 1, 1, depth_mul,
-                                     ActivationFunctionType_NONE,
-                                     dilation_factor, dilation_factor)
+        CreateDepthwiseConv2DOptions(
+            builder_, padding_type, stride_width, stride_height, depth_mul,
+            ActivationFunctionType_NONE, dilation_factor, dilation_factor)
             .Union());
 
     resolver_ = absl::make_unique<SingleOpResolver>(
@@ -144,6 +209,215 @@
   }
 };
 
+TEST_P(DepthwiseConvolutionOpTest, ActivationReluTest) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/1,
+      /*stride_height=*/1,
+      /*fused_activation_function=*/ActivationFunctionType_RELU);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+  // RELU clamps the negative channels (-34, -20, -26, -4 un-activated) to 0.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, 0, 99, 0,   //
+                                 91, 0, 127, 0,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, ActivationReluN1Test) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/1,
+      /*stride_height=*/1,
+      /*fused_activation_function=*/ActivationFunctionType_RELU_N1_TO_1);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+  // All un-activated outputs have magnitude > 1, so each saturates at +/-1.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 1, -1, 1, -1,  //
+                                 1, -1, 1, -1,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, ActivationRelu6Test) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/1,
+      /*stride_height=*/1,
+      /*fused_activation_function=*/ActivationFunctionType_RELU6);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+  // Positive outputs all exceed 6 and saturate there; negative ones become 0.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 6, 0, 6, 0,  //
+                                 6, 0, 6, 0,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, ActivationTanhTest) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/1,
+      /*stride_height=*/1,
+      /*fused_activation_function=*/ActivationFunctionType_TANH);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+  m.Invoke();
+  // NOTE(review): the expected values equal the un-activated outputs, so TANH
+  // does not appear to be applied by the kernel under test. Confirm intent.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,  //
+                                 91, -26, 127, -4,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, ActivationSignTest) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/1,
+      /*stride_height=*/1,
+      /*fused_activation_function=*/ActivationFunctionType_SIGN_BIT);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 10, 11,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+  m.Invoke();
+  // NOTE(review): the expected values are the raw convolution results, so
+  // SIGN_BIT does not appear to be applied by the kernel under test. Confirm.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,  //
+                                 78, -12, 112, 12,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, StrideTest) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_VALID,
+      /*stride_width=*/2,
+      /*stride_height=*/2,
+      /*fused_activation_function=*/ActivationFunctionType_NONE);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+  // Stride 2 with VALID padding leaves a single output position.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, PaddingTest) {
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {1, 3, 2, 2}},
+      {TensorType_FLOAT32, {1, 2, 2, 4}}, {TensorType_FLOAT32, {}},
+      Padding_SAME,
+      /*stride_width=*/2,
+      /*stride_height=*/2,
+      /*fused_activation_function=*/ActivationFunctionType_NONE);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+  // SAME padding with stride 2 yields two output positions.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,     //
+                                 -93, 122, -111, 172,  //
+                             }));
+}
+
 TEST_P(DepthwiseConvolutionOpTest, SimpleTest) {
   DepthwiseConvolutionOpModel m(GetRegistration(),
                                 {TensorType_FLOAT32, {1, 3, 2, 2}},
@@ -502,7 +776,7 @@
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
-TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
+TEST_P(QuantizedDepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 28;
@@ -562,7 +836,7 @@
 
   // clang-format off
   EXPECT_THAT(
-      m.GetOutput(),
+      m.GetDequantizedOutput(),
       ElementsAreArray({
           0, 0,    0, 0,    0, 0,    0, 0,
           0, 0,    0, 0,    24, 24,  39, 39,
@@ -591,7 +865,8 @@
   // clang-format on
 }
 
-TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       MultithreadOnBatchUint8GeneralTest) {
   const int depth = 1;
   const int image_width = 8;
   const int image_height = 4;
@@ -645,7 +920,7 @@
 
   // clang-format off
   EXPECT_THAT(
-      m.GetOutput(),
+      m.GetDequantizedOutput(),
       ElementsAreArray({
           39, 39, 39, 39, 39, 39,
           21, 21, 21, 21, 21, 21,
@@ -668,6 +943,532 @@
   // clang-format on
 }
 
+TEST_P(QuantizedDepthwiseConvolutionOpTest, MultithreadOnRowValidPaddingTest) {
+  // This test runs through DepthwiseConv3x3Filter with __aarch64__, and runs
+  // through DepthwiseConvGeneral with other configs.
+  const int input_batch = 1;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      // array of 9 x 8 => [1, 3, 3, 8]
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // Zero bias for each of the 8 output channels.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({
+        9, 18, 0, 0, 46, 55, 0, 0  // the single output position, 8 channels
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, MultithreadOnRowSamePaddingTest) {
+  // This test runs through DepthwiseConv3x3Filter with __aarch64__, and runs
+  // through DepthwiseConvGeneral with other configs.
+  const int input_batch = 1;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_SAME);
+
+  // clang-format off
+  m.SetInput({
+      // array of 9 x 8 => [1, 3, 3, 8]
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0,
+      1, 1, 0, 0,  1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // Zero bias for each of the 8 output channels.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(3);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({
+        // array of 9 x 8 => [1, 3, 3, 8]
+        4, 8, 0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,
+        9, 18, 0, 0, 46, 55, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       MultithreadOnBatchValidPaddingTest) {
+  // This test runs through DepthwiseConv3x3Filter with __aarch64__, and runs
+  // through DepthwiseConvGeneral with other configs.
+  const int input_batch = 2;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      // array of 2 x 3 x 24 => [2, 3, 3, 8]
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // Zero bias for each of the 8 output channels.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(2);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({
+        9, 18, 0, 0, 46, 55, 0, 0,  // batch 0
+        9, 18, 0, 0, 46, 55, 0, 0   // batch 1
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, MultithreadOnBatchSamePaddingTest) {
+  // Two batches on three threads, SAME padding. Runs DepthwiseConv3x3Filter with
+  // __aarch64__, and runs through DepthwiseConvGeneral with other configs.
+  const int input_batch = 2;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_SAME);
+
+  // clang-format off
+  m.SetInput({
+      // array of 2 x 3 x 24 => [2, 3, 3, 8]; both batches hold the same data
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]; every tap holds 1..8 across channels
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // All-zero bias: the output depends only on the input and the filter.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(3);  // one more thread than input_batch
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({
+        // array of 9 x 16 => [2, 3, 3, 8]; each line holds two output positions
+        4, 8,  0, 0, 20, 24, 0, 0,   6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8,  0, 0, 20, 24, 0, 0,   6, 12, 0, 0, 30, 37, 0, 0,
+        9, 18, 0, 0, 46, 55, 0, 0,   6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8,  0, 0, 20, 24, 0, 0,   6, 12, 0, 0, 30, 37, 0, 0,
+        4, 8,  0, 0, 20, 24, 0, 0,   4, 8,  0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,   4, 8,  0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,   9, 18, 0, 0, 46, 55, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,   4, 8,  0, 0, 20, 24, 0, 0,
+        6, 12, 0, 0, 30, 37, 0, 0,   4, 8,  0, 0, 20, 24, 0, 0,
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       MultithreadOnRowSamePaddingStrideTest) {
+  // One batch on four threads, SAME padding, stride 2. Runs DepthwiseConv3x3Filter
+  // with __aarch64__, and runs through DepthwiseConvGeneral with other configs.
+  const int input_batch = 1;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_SAME,
+      /* dilation_factor = */ 1,
+      /* stride_width = */ 2,
+      /* stride_height = */ 2);
+
+  // clang-format off
+  m.SetInput({
+      // array of 3 x 24 => [1, 3, 3, 8]
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]; every tap holds 1..8 across channels
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // All-zero bias: the output depends only on the input and the filter.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({  // 4 x 8 values, one line per output position
+        4, 8, 0, 0, 20, 24, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+        4, 8, 0, 0, 20, 24, 0, 0,
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       MultithreadOnRowValidPaddingStrideTest) {
+  const int input_batch = 1;
+  const int input_width = 5;
+  const int input_height = 5;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 8;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID,
+      /* dilation_factor = */ 1,
+      /* stride_width = */ 2,
+      /* stride_height = */ 2);
+
+  // clang-format off
+  m.SetInput({
+      // array of 8 x 24 + 8 => [1, 5, 5, 8] (200 values)
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 8 => [1, 3, 3, 8]; every tap holds 1..8 across channels
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8
+  });
+  // clang-format on
+
+  // All-zero bias: the output depends only on the input and the filter.
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({  // 4 x 8 values, one line per output position
+        9, 18, 0, 0, 46, 55, 0, 0,
+        9, 18, 0, 0, 46, 55, 0, 0,
+        9, 18, 0, 0, 46, 55, 0, 0,
+        9, 18, 0, 0, 46, 55, 0, 0
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       MultithreadOnRowDepthMultiplierTest) {
+  const int input_batch = 1;
+  const int input_width = 3;
+  const int input_height = 3;
+  const int input_depth = 8;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 16;  // twice input_depth
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      // array of 3 x 24 => [1, 3, 3, 8]
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+      1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0
+  });
+
+  m.SetFilter({
+      // array of 9 x 16 => [1, 3, 3, 16]; every tap holds 1..8 twice
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+      1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+  });
+  // clang-format on
+
+  // All-zero bias, one entry per filter channel (16).
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({
+        9, 18, 27, 37, 0, 0, 0, 0,  // first 8 of the 16 expected values
+        9, 18, 27, 37, 0, 0, 0, 0   // last 8 of the 16 expected values
+      }));
+  // clang-format on
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, MultithreadDifferentPaddingTest) {
+  const int input_batch = 1;
+  const int input_width = 4;
+  const int input_height = 5;  // height differs from width (5 x 4 input)
+  const int input_depth = 2;
+  const int filter_batch = 1;
+  const int filter_size = 3;
+  const int filter_depth = 2;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {input_batch, input_height, input_width, input_depth},
+       0,
+       128},
+      {TensorType_UINT8,
+       {filter_batch, filter_size, filter_size, filter_depth},
+       0,
+       128},
+      {TensorType_UINT8, {}, 0, 255}, Padding_SAME,
+      /* dilation_factor = */ 1,
+      /* stride_width = */ 2,
+      /* stride_height = */ 2);
+
+  // clang-format off
+  m.SetInput({
+      // array of 2 x 20 => [1, 5, 4, 2]
+      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0
+  });
+
+  m.SetFilter({
+      // array of 8 x 2 + 2 => [1, 3, 3, 2]
+      1, 2, 1, 2, 1, 2, 1, 2,
+      1, 2, 1, 2, 1, 2, 1, 2,
+      1, 2
+  });
+  // clang-format on
+
+  // All-zero bias: the output depends only on the input and the filter.
+  m.SetBias({0, 0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray({  // 3 x 4 values: two 2-channel positions per line
+        6, 0, 4, 0,
+        9, 0, 6, 0,
+        6, 0, 4, 0
+      }));
+  // clang-format on
+}
+
 class PerChannelQuantizedDepthwiseConvolutionOpModel
     : public BaseDepthwiseConvolutionOpModel {
  public:
@@ -841,6 +1642,42 @@
               ElementsAreArray({79, 95, 53, 32, -1, -9, -49, -85}));
 }
 
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, Simple3x3FilterTest) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 3, 3, 8}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 3 * 3 * 8] as [input_channel, y, x, output_channel]
+       {1, 3, 3, 8},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/
+       {1, 2, 3, 4, 4, 3, 2, 1},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0, 0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({// array of 9 x 8 => [1, 3, 3, 8]
+              1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+              0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
+              1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+              0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0});
+  m.SetFilter(
+      /*filter data*/
+      {// array of 9 x 8 => [1, 3, 3, 8]
+       1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+       1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,
+       1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8});
+  m.SetBias({0, 0, 0, 0, 0, 0, 0, 0});
+
+  // Invoke and verify output.
+  m.Invoke();
+  // 3x3 input, 3x3 filter, VALID padding => a single 1x1x8 output.
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({9, 18, 0, 0, 36, 54, 0, 0})));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/lite/kernels/embedding_lookup.cc b/tensorflow/lite/kernels/embedding_lookup.cc
index 3f1d623..8a285f6 100644
--- a/tensorflow/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/lite/kernels/embedding_lookup.cc
@@ -69,9 +69,9 @@
   return context->ResizeTensor(context, output, outputSize);
 }
 
-TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       const TfLiteTensor* lookup, const TfLiteTensor* value,
-                       TfLiteTensor* output) {
+TfLiteStatus EvalSimple(TfLiteContext* context, TfLiteNode* node,
+                        const TfLiteTensor* lookup, const TfLiteTensor* value,
+                        TfLiteTensor* output) {
   const int row_size = SizeOfDimension(value, 0);
   const int row_bytes = value->bytes / row_size;
 
@@ -138,10 +138,14 @@
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (value->type) {
     case kTfLiteFloat32:
-      return EvalFloat(context, node, lookup, value, output);
+      return EvalSimple(context, node, lookup, value, output);
     case kTfLiteUInt8:
     case kTfLiteInt8:
-      return EvalHybrid(context, node, lookup, value, output);
+      if (output->type == kTfLiteFloat32) {
+        return EvalHybrid(context, node, lookup, value, output);
+      } else {
+        return EvalSimple(context, node, lookup, value, output);
+      }
     default:
       context->ReportError(context, "Type not currently supported.");
       return kTfLiteError;
diff --git a/tensorflow/lite/kernels/embedding_lookup_test.cc b/tensorflow/lite/kernels/embedding_lookup_test.cc
index 2462ff2..cf90ed0 100644
--- a/tensorflow/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_test.cc
@@ -21,6 +21,7 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
@@ -36,10 +37,11 @@
  public:
   BaseEmbeddingLookupOpModel(std::initializer_list<int> index_shape,
                              std::initializer_list<int> weight_shape,
-                             TensorType weight_type = TensorType_FLOAT32) {
+                             TensorType weight_type = TensorType_FLOAT32,
+                             TensorType output_type = TensorType_FLOAT32) {
     input_ = AddInput(TensorType_INT32);
     weight_ = AddInput(weight_type);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output_type);  // FLOAT32 unless the caller overrides it
     SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0);
     BuildInterpreter({index_shape, weight_shape});
   }
@@ -48,7 +50,10 @@
     PopulateTensor(input_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>  // T: element type used to read the output tensor
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
 
  protected:
   int input_;
@@ -60,15 +65,17 @@
  public:
   using BaseEmbeddingLookupOpModel::BaseEmbeddingLookupOpModel;
 
-  void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
+  template <typename T>
+  void Set3DWeightMatrix(const std::function<T(int, int, int)>& function) {
     TfLiteTensor* tensor = interpreter_->tensor(weight_);
     int rows = tensor->dims->data[0];
     int columns = tensor->dims->data[1];
     int features = tensor->dims->data[2];
+    T* data = GetTensorData<T>(tensor);
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < columns; j++) {
         for (int k = 0; k < features; k++) {
-          tensor->data.f[(i * columns + j) * features + k] = function(i, j, k);
+          data[(i * columns + j) * features + k] = function(i, j, k);
         }
       }
     }
@@ -96,12 +103,12 @@
 TEST(EmbeddingLookupOpTest, SimpleTest) {
   EmbeddingLookupOpModel m({3}, {3, 2, 4});
   m.SetInput({1, 0, 2});
-  m.Set3DWeightMatrix(
-      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+  m.Set3DWeightMatrix<float>(
+      [](int i, int j, int k) -> float { return i + j / 10.0f + k / 100.0f; });
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({
                   1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
                   0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -120,7 +127,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -141,7 +148,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -162,7 +169,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -183,7 +190,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -204,7 +211,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -225,7 +232,7 @@
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
@@ -235,6 +242,22 @@
                   kTestTolerance)));
 }
 
+TEST(EmbeddingLookupHybridOpTest, Simple3DTestQuantized) {
+  EmbeddingLookupOpModel m({3}, {3, 2, 4}, TensorType_UINT8, TensorType_INT8);
+  m.SetInput({1, 0, 2});  // rows are looked up in the order 1, 0, 2
+  m.Set3DWeightMatrix<uint8_t>(
+      [](int i, int j, int k) -> uint8_t { return 100 * i + 10 * j + k; });
+
+  m.Invoke();  // UINT8 weights, INT8 output (see the constructor arguments)
+
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({
+                  100, 101, 102, 103, 110, 111, 112, 113,  // Row 1
+                  0,   1,   2,   3,   10,  11,  12,  13,   // Row 0
+                  // NOTE(review): Row 2 exceeds INT8_MAX; confirm the compare.
+                  200, 201, 202, 203, 210, 211, 212, 213,  // Row 2
+              }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index 95b5bd1..7d943fd 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -25,7 +25,7 @@
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
-#include "tensorflow/lite/kernels/gemmlowp_support.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
@@ -115,7 +115,7 @@
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
   // Eval().
-  gemmlowp_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
   auto* op_data = new OpData();
   context->AddTensors(context, /*tensors_to_add=*/2,
                       &op_data->scratch_tensor_index);
@@ -123,7 +123,7 @@
 }
 
 void Free(TfLiteContext* context, void* buffer) {
-  gemmlowp_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -320,7 +320,7 @@
 void FullyConnectedInt8(const OpData* data, const TfLiteTensor* input,
                         const TfLiteTensor* filter, const TfLiteTensor* bias,
                         TfLiteTensor* output,
-                        gemmlowp::GemmContext* gemmlowp_context) {
+                        CpuBackendContext* cpu_backend_context) {
   FullyConnectedParams op_params;
   op_params.input_offset = -input->params.zero_point;
   op_params.weights_offset = -filter->params.zero_point;
@@ -334,15 +334,14 @@
         op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
         GetTensorShape(filter), GetTensorData<int8_t>(filter),
         GetTensorShape(bias), GetTensorData<int32_t>(bias),
-        GetTensorShape(output), GetTensorData<int8_t>(output),
-        gemmlowp_context);
+        GetTensorShape(output), GetTensorData<int8_t>(output));
   } else {
     optimized_integer_ops::FullyConnected(
         op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
         GetTensorShape(filter), GetTensorData<int8_t>(filter),
         GetTensorShape(bias), GetTensorData<int32_t>(bias),
         GetTensorShape(output), GetTensorData<int8_t>(output),
-        gemmlowp_context);
+        cpu_backend_context);
   }
 }
 }  // namespace
@@ -353,29 +352,9 @@
                            const TfLiteTensor* input,
                            const TfLiteTensor* filter, const TfLiteTensor* bias,
                            TfLiteTensor* output) {
-  gemmlowp::GemmContext* gemmlowp_context =
-      gemmlowp_support::GetFromContext(context);
-
   int32_t input_offset = -input->params.zero_point;
   int32_t filter_offset = -filter->params.zero_point;
   int32_t output_offset = output->params.zero_point;
-#define TF_LITE_FULLY_CONNECTED(type, output_data_type)                  \
-  {                                                                      \
-    FullyConnectedParams op_params;                                      \
-    op_params.input_offset = input_offset;                               \
-    op_params.weights_offset = filter_offset;                            \
-    op_params.output_offset = output_offset;                             \
-    op_params.output_multiplier = data->output_multiplier;               \
-    op_params.output_shift = data->output_shift;                         \
-    op_params.quantized_activation_min = data->output_activation_min;    \
-    op_params.quantized_activation_max = data->output_activation_max;    \
-    type::FullyConnected(                                                \
-        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
-        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
-        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-        GetTensorShape(output), GetTensorData<output_data_type>(output), \
-        gemmlowp_context);                                               \
-  }
   // Only the Pie path supports quantized models and float inputs/outputs.
   if (input->type == kTfLiteFloat32) {
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
@@ -383,23 +362,50 @@
     return EvalHybrid(context, node, params, data, input, filter, bias,
                       input_quantized, scaling_factors, output);
   } else {
+    FullyConnectedParams op_params;
+    op_params.input_offset = input_offset;
+    op_params.weights_offset = filter_offset;
+    op_params.output_offset = output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    op_params.quantized_activation_min = data->output_activation_min;
+    op_params.quantized_activation_max = data->output_activation_max;
     switch (output->type) {
       case kTfLiteUInt8:
         if (kernel_type == kReference) {
-          TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
+          reference_ops::FullyConnected(
+              op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+              GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+              GetTensorShape(bias), GetTensorData<int32_t>(bias),
+              GetTensorShape(output), GetTensorData<uint8_t>(output));
         } else {
-          TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
+          optimized_ops::FullyConnected(
+              op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+              GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+              GetTensorShape(bias), GetTensorData<int32_t>(bias),
+              GetTensorShape(output), GetTensorData<uint8_t>(output),
+              cpu_backend_support::GetFromContext(context));
         }
         break;
       case kTfLiteInt8:
-        FullyConnectedInt8<kernel_type>(data, input, filter, bias, output,
-                                        gemmlowp_context);
+        FullyConnectedInt8<kernel_type>(
+            data, input, filter, bias, output,
+            cpu_backend_support::GetFromContext(context));
         break;
       case kTfLiteInt16:
         if (kernel_type == kReference) {
-          TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
+          reference_ops::FullyConnected(
+              op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+              GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+              GetTensorShape(bias), GetTensorData<int32_t>(bias),
+              GetTensorShape(output), GetTensorData<int16_t>(output));
         } else {
-          TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
+          optimized_ops::FullyConnected(
+              op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+              GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+              GetTensorShape(bias), GetTensorData<int32_t>(bias),
+              GetTensorShape(output), GetTensorData<int16_t>(output),
+              cpu_backend_support::GetFromContext(context));
         }
         break;
       default:
@@ -409,7 +415,6 @@
         return kTfLiteError;
     }
   }
-#undef TF_LITE_FULLY_CONNECTED
 
   return kTfLiteOk;
 }
@@ -422,9 +427,6 @@
                                    const TfLiteTensor* bias,
                                    TfLiteTensor* output,
                                    TfLiteTensor* shuffled_input_workspace) {
-  gemmlowp::GemmContext* gemmlowp_context =
-      gemmlowp_support::GetFromContext(context);
-
   // TODO(b/110697972) decide more consistently if / how / where we want
   // to perform this kind of runtime data type checks.
   if (shuffled_input_workspace->type != kTfLiteUInt8) {
@@ -432,24 +434,36 @@
     return kTfLiteError;
   }
 
-#define TF_LITE_SHUFFLED_FULLY_CONNECTED(type)                               \
-  {                                                                          \
-    FullyConnectedParams op_params;                                          \
-    op_params.output_multiplier = data->output_multiplier;                   \
-    op_params.output_shift = data->output_shift;                             \
-    op_params.quantized_activation_min = data->output_activation_min;        \
-    op_params.quantized_activation_max = data->output_activation_max;        \
-    type::ShuffledFullyConnected(                                            \
-        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),     \
-        GetTensorShape(filter), GetTensorData<uint8_t>(filter),              \
-        GetTensorShape(bias), GetTensorData<int32_t>(bias),                  \
-        GetTensorShape(output), GetTensorData<int16_t>(output),              \
-        GetTensorData<uint8_t>(shuffled_input_workspace), gemmlowp_context); \
+#define TF_LITE_SHUFFLED_FULLY_CONNECTED(type)                           \
+  {                                                                      \
+    type::ShuffledFullyConnected(                                        \
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
+        GetTensorShape(output), GetTensorData<int16_t>(output),          \
+        GetTensorData<uint8_t>(shuffled_input_workspace),                \
+        cpu_backend_support::GetFromContext(context));                   \
   }
+  FullyConnectedParams op_params;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
   if (kernel_type == kReference) {
-    TF_LITE_SHUFFLED_FULLY_CONNECTED(reference_ops);
+    reference_ops::ShuffledFullyConnected(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<int16_t>(output),
+        GetTensorData<uint8_t>(shuffled_input_workspace));
   } else {
-    TF_LITE_SHUFFLED_FULLY_CONNECTED(optimized_ops);
+    optimized_ops::ShuffledFullyConnected(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<int16_t>(output),
+        GetTensorData<uint8_t>(shuffled_input_workspace),
+        cpu_backend_support::GetFromContext(context));
   }
 #undef TF_LITE_SHUFFLED_FULLY_CONNECTED
 
@@ -464,25 +478,28 @@
   float output_activation_min, output_activation_max;
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
-#define TF_LITE_FULLY_CONNECTED(type)                                         \
-  {                                                                           \
-    FullyConnectedParams op_params;                                           \
-    op_params.float_activation_min = output_activation_min;                   \
-    op_params.float_activation_max = output_activation_max;                   \
-    type::FullyConnected(op_params, GetTensorShape(input),                    \
-                         GetTensorData<float>(input), GetTensorShape(filter), \
-                         GetTensorData<float>(filter), GetTensorShape(bias),  \
-                         GetTensorData<float>(bias), GetTensorShape(output),  \
-                         GetTensorData<float>(output));                       \
-  }
   if (kernel_type == kReference) {
-    TF_LITE_FULLY_CONNECTED(reference_ops);
+    FullyConnectedParams op_params;
+    op_params.float_activation_min = output_activation_min;
+    op_params.float_activation_max = output_activation_max;
+    reference_ops::FullyConnected(
+        op_params, GetTensorShape(input), GetTensorData<float>(input),
+        GetTensorShape(filter), GetTensorData<float>(filter),
+        GetTensorShape(bias), GetTensorData<float>(bias),
+        GetTensorShape(output), GetTensorData<float>(output));
   } else if (kernel_type == kLegacyPie) {
     return EvalPie(context, node, params, data, input, filter, bias, output);
   } else {
-    TF_LITE_FULLY_CONNECTED(optimized_ops);
+    FullyConnectedParams op_params;
+    op_params.float_activation_min = output_activation_min;
+    op_params.float_activation_max = output_activation_max;
+    optimized_ops::FullyConnected(
+        op_params, GetTensorShape(input), GetTensorData<float>(input),
+        GetTensorShape(filter), GetTensorData<float>(filter),
+        GetTensorShape(bias), GetTensorData<float>(bias),
+        GetTensorShape(output), GetTensorData<float>(output),
+        cpu_backend_support::GetFromContext(context));
   }
-#undef TF_LITE_FULLY_CONNECTED
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index ff98db4..bf329cd 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -168,10 +168,33 @@
 )
 
 cc_library(
+    name = "common",
+    srcs = [],
+    hdrs = ["common.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":types",
+        "@gemmlowp//:fixedpoint",
+        "@gemmlowp//:profiler",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
     name = "optimized_base",
     srcs = [],
     hdrs = [
         "common.h",
+        "optimized/depthwiseconv_3x3_filter_common.h",
         "optimized/depthwiseconv_float.h",
         "optimized/depthwiseconv_uint8.h",
         "optimized/depthwiseconv_uint8_3x3_filter.h",
@@ -179,6 +202,7 @@
         "optimized/integer_ops/add.h",
         "optimized/integer_ops/conv.h",
         "optimized/integer_ops/depthwise_conv.h",
+        "optimized/integer_ops/depthwise_conv_3x3_filter.h",
         "optimized/integer_ops/fully_connected.h",
         "optimized/integer_ops/mul.h",
         "optimized/integer_ops/pooling.h",
@@ -195,8 +219,12 @@
         ":tensor",
         ":tensor_utils",
         "//third_party/eigen3",
-        "@gemmlowp",
+        "@gemmlowp//:fixedpoint",
+        "@gemmlowp//:profiler",
         "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:cpu_backend_context",
+        "//tensorflow/lite/kernels:cpu_backend_threadpool",
+        "//tensorflow/lite/kernels:cpu_backend_gemm",
     ] + select({
         ":haswell": tflite_deps_intel,
         ":ios_x86_64": tflite_deps_intel,
@@ -215,6 +243,7 @@
     srcs = [],
     hdrs = [
         "common.h",
+        "optimized/depthwiseconv_3x3_filter_common.h",
         "optimized/depthwiseconv_float.h",
         "optimized/depthwiseconv_uint8.h",
         "optimized/depthwiseconv_uint8_3x3_filter.h",
@@ -224,6 +253,7 @@
     ],
     copts = tflite_copts(),
     deps = [
+        ":optimized_base",
         ":quantization_util",
         ":strided_slice_logic",
         ":tensor",
@@ -235,6 +265,9 @@
         "//third_party/eigen3",
         "@gemmlowp",
         "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:cpu_backend_context",
+        "//tensorflow/lite/kernels:cpu_backend_threadpool",
+        "//tensorflow/lite/kernels:cpu_backend_gemm",
     ] + select({
         ":haswell": tflite_deps_intel,
         ":ios_x86_64": tflite_deps_intel,
@@ -350,7 +383,8 @@
         ":strided_slice_logic",
         ":tensor",
         ":types",
-        "@gemmlowp",
+        "@gemmlowp//:fixedpoint",
+        "@gemmlowp//:profiler",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:op_macros",
     ] + select({
@@ -470,7 +504,8 @@
         "//tensorflow/lite/kernels:activation_functor",
         "//tensorflow/lite/kernels:op_macros",
         "@arm_neon_2_x86_sse",
-        "@gemmlowp",
+        "@gemmlowp//:fixedpoint",
+        "@gemmlowp//:profiler",
     ],
 )
 
@@ -526,7 +561,8 @@
         "//tensorflow/lite/c:c_api_internal",
         "@arm_neon_2_x86_sse",
         "//tensorflow/lite/kernels:op_macros",
-        "@gemmlowp",
+        "@gemmlowp//:fixedpoint",
+        "@gemmlowp//:profiler",
     ] + select({
         ":aarch64": [
             ":neon_tensor_utils",
@@ -633,7 +669,6 @@
         ":types",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
-        "@gemmlowp",
     ],
 )
 
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index 2b8226c..71a3060 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -46,6 +46,7 @@
 #endif
 
 #include "fixedpoint/fixedpoint.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -87,6 +88,73 @@
                                       output_activation_max);
 }
 
+inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
+                         const float* bias_data, int array_size,
+                         float* array_data) {
+  // Adds bias to each bias_size-long row of array_data in place, then clamps.
+  // Note: see b/132215220: in May 2019 we thought it would be OK to replace
+  // this with the Eigen one-liner:
+  //   return (array.colwise() + bias).cwiseMax(clamp_min).cwiseMin(clamp_max).
+  // It cost +4ms (8%) on MobileNet v2 / 1.0 / 224, so we keep NEON for now.
+  gemmlowp::ScopedProfilingLabel label("BiasAndClamp");
+  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+#ifdef USE_NEON
+  float* array_ptr = array_data;
+  float* array_end_ptr = array_ptr + array_size;
+  const auto clamp_min_vec = vdupq_n_f32(clamp_min);
+  const auto clamp_max_vec = vdupq_n_f32(clamp_max);
+  for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
+    int i = 0;
+    for (; i <= bias_size - 16; i += 16) {
+      auto b0 = vld1q_f32(bias_data + i);
+      auto b1 = vld1q_f32(bias_data + i + 4);
+      auto b2 = vld1q_f32(bias_data + i + 8);
+      auto b3 = vld1q_f32(bias_data + i + 12);
+      auto a0 = vld1q_f32(array_ptr + i);
+      auto a1 = vld1q_f32(array_ptr + i + 4);
+      auto a2 = vld1q_f32(array_ptr + i + 8);
+      auto a3 = vld1q_f32(array_ptr + i + 12);
+      auto x0 = vaddq_f32(a0, b0);
+      auto x1 = vaddq_f32(a1, b1);
+      auto x2 = vaddq_f32(a2, b2);
+      auto x3 = vaddq_f32(a3, b3);
+      x0 = vmaxq_f32(clamp_min_vec, x0);
+      x1 = vmaxq_f32(clamp_min_vec, x1);
+      x2 = vmaxq_f32(clamp_min_vec, x2);
+      x3 = vmaxq_f32(clamp_min_vec, x3);
+      x0 = vminq_f32(clamp_max_vec, x0);
+      x1 = vminq_f32(clamp_max_vec, x1);
+      x2 = vminq_f32(clamp_max_vec, x2);
+      x3 = vminq_f32(clamp_max_vec, x3);
+      vst1q_f32(array_ptr + i, x0);
+      vst1q_f32(array_ptr + i + 4, x1);
+      vst1q_f32(array_ptr + i + 8, x2);
+      vst1q_f32(array_ptr + i + 12, x3);
+    }
+    for (; i <= bias_size - 4; i += 4) {
+      auto b = vld1q_f32(bias_data + i);
+      auto a = vld1q_f32(array_ptr + i);
+      auto x = vaddq_f32(a, b);
+      x = vmaxq_f32(clamp_min_vec, x);
+      x = vminq_f32(clamp_max_vec, x);
+      vst1q_f32(array_ptr + i, x);
+    }
+    for (; i < bias_size; i++) {
+      array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
+                                                  clamp_min, clamp_max);
+    }
+  }
+#else  // not NEON
+  for (int array_offset = 0; array_offset < array_size;
+       array_offset += bias_size) {
+    for (int i = 0; i < bias_size; i++) {
+      array_data[array_offset + i] = ActivationFunctionWithMinMax(
+          array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
+    }
+  }
+#endif
+}
+
 inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
     int32 x, int32 quantized_multiplier, int left_shift) {
   using gemmlowp::RoundingDivideByPOT;
@@ -547,6 +615,95 @@
   }
 }
 
+// Copied from gemmlowp::RoundDown when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the runtime argument rounded down to the nearest multiple of
+// the fixed Modulus.
+template <unsigned Modulus, typename Integer>
+Integer RoundDown(Integer i) {
+  return i - (i % Modulus);
+}
+
+// Copied from gemmlowp::RoundUp when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the runtime argument rounded up to the nearest multiple of
+// the fixed Modulus.
+template <unsigned Modulus, typename Integer>
+Integer RoundUp(Integer i) {
+  return RoundDown<Modulus>(i + Modulus - 1);
+}
+
+// Copied from gemmlowp::CeilQuotient when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
+template <typename Integer>
+Integer CeilQuotient(Integer a, Integer b) {
+  return (a + b - 1) / b;
+}
+
+// This function is a copy of gemmlowp::HowManyThreads, copied when we dropped
+// the direct dependency of internal/optimized/ on gemmlowp.
+//
+// It computes a reasonable number of threads to use for a GEMM of shape
+// (rows, cols, depth).
+//
+// TODO(b/131910176): get rid of this function by switching each call site
+// to its own more sensible logic for its own workload.
+template <int KernelRows>
+inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols,
+                                int depth) {
+  // Early-exit in the default case where multi-threading is disabled.
+  if (max_num_threads == 1) {
+    return 1;
+  }
+
+  // Ensure that each thread has KernelRows rows to process, if at all possible.
+  int thread_count = std::min(max_num_threads, rows / KernelRows);
+
+  // Limit the number of threads according to the overall size of the problem.
+  if (thread_count > 1) {
+    // Empirically determined value.
+    static constexpr std::uint64_t min_cubic_size_per_thread = 64 * 1024;
+
+    // Widen every factor to 64 bits before multiplying; an int product wraps.
+    const std::uint64_t cubic_size =
+        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);
+    // Clamp in 64 bits: narrowing the quotient to int first can wrap negative.
+    thread_count = static_cast<int>(std::min<std::uint64_t>(
+        thread_count, cubic_size / min_cubic_size_per_thread));
+  }
+
+  if (thread_count < 1) {
+    thread_count = 1;
+  }
+
+  assert(thread_count > 0 && thread_count <= max_num_threads);
+  return thread_count;
+}
+
+template <typename T>
+void optimized_ops_preload_l1_stream(const T* ptr) {
+#ifdef __GNUC__
+  // builtin offered by GCC-compatible compilers including clang
+  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 0 means no locality */ 0);
+#else
+  (void)ptr;
+#endif
+}
+
+template <typename T>
+void optimized_ops_preload_l1_keep(const T* ptr) {
+#ifdef __GNUC__
+  // builtin offered by GCC-compatible compilers including clang
+  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
+#else
+  (void)ptr;
+#endif
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
new file mode 100644
index 0000000..2ccc406
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
@@ -0,0 +1,541 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_
+
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+namespace depthwise_conv {
+
+constexpr int kDepthwiseConvScratchWorkspaceSize = 10 * 10 * 64;
+constexpr int kDepthwiseConvAdjustedBiasLimit = 64;
+// In cases such as depth multiplication, we want to be able to load data from
+// the workspace that is beyond the valid range. Macro-block sizes are adjusted
+// to allow for this.
+constexpr int kWorkspaceExtension = 16;
+
+#ifdef USE_NEON
+
+#ifndef __aarch64__
+inline int8x16_t vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
+  const uint8x16_t mask = vtstq_s8(b, vdupq_n_s8(8));
+
+  // Delete bit 3 from the indices.
+  const int8x16_t high_bits = vshrq_n_s8(b, 4);
+  int8x16_t deleted_bit_3 = b;
+  deleted_bit_3 = vsliq_n_s8(deleted_bit_3, high_bits, 3);
+
+  int8x8x4_t repacked_data;
+
+  // Calculate for lower indices.
+  repacked_data.val[0] = vget_low_s8(a.val[0]);
+  repacked_data.val[1] = vget_low_s8(a.val[1]);
+  repacked_data.val[2] = vget_low_s8(a.val[2]);
+  repacked_data.val[3] = vget_low_s8(a.val[3]);
+  const int8x16_t output_for_lower =
+      vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_s8(deleted_bit_3)));
+
+  // Calculate for high indices.
+  repacked_data.val[0] = vget_high_s8(a.val[0]);
+  repacked_data.val[1] = vget_high_s8(a.val[1]);
+  repacked_data.val[2] = vget_high_s8(a.val[2]);
+  repacked_data.val[3] = vget_high_s8(a.val[3]);
+  const int8x16_t output_for_higher =
+      vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_s8(deleted_bit_3)));
+
+  // Merge.
+  int8x16_t output = vbslq_s8(mask, output_for_higher, output_for_lower);
+  return output;
+}
+#endif  // !__aarch64__
+
+// Convenience-compatibility functions.
+// Compatibility: Intrinsics reflect a mixture of older and newer ARM
+//     instructions. This actually results in ZIP1 / ZIP2 asm instructions, but
+//     one intrinsic is provided. Also older instructions operated in place,
+//     and it seems more defensive to assume that some versions of intrinsics
+//     might reflect this
+// Convenience: Callers in these kernels want both ZIP1 and ZIP2, and we do not
+//     want the calling code to get cluttered with unpacking int8x16x2_t.
+inline void vzipq_s8_in_place(int8x16_t* a, int8x16_t* b) {
+  int8x16x2_t r8x16;
+  r8x16 = vzipq_s8(*a, *b);
+  *a = r8x16.val[0];
+  *b = r8x16.val[1];
+}
+
+inline void vzipq_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vzipq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+  *b = vreinterpretq_s8_s16(r16x8.val[1]);
+}
+
+// Similar rationale to the zip-in_place functions, but callers only actually
+// need the TRN1 asm instruction result.
+inline void vtrn1_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vtrnq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+}
+
+// Similar rationale to the zip-in_place functions, but callers only actually
+// need the ZIP1 or ZIP2 asm instruction results.
+inline int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
+  return vzipq_s8(a, b).val[0];
+}
+inline int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
+  return vzipq_s8(a, b).val[1];
+}
+
+inline void biregister_rotate_8(int8x16_t* left, int8x16_t* right) {
+  *left = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*left), 8));
+  *left = vreinterpretq_s8_u32(vsliq_n_u32(vreinterpretq_u32_s8(*left),
+                                           vreinterpretq_u32_s8(*right), 24));
+  *right = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*right), 8));
+}
+
+#ifndef __aarch64__
+inline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+  int32x4x2_t deinterleaved = vuzpq_s32(a, b);  // val[0]: even, val[1]: odd
+  return vaddq_s32(deinterleaved.val[0], deinterleaved.val[1]);  // wraps
+}
+#endif  // !__aarch64__
+
+#ifdef __ARM_FEATURE_DOTPROD
+// The vdotq_lane_s32 takes int8x8t for the rhs parameter, whereas the actual
+// instruction selects from between 4 32-bit (4x8-bit packed) sub-registers, an
+// unusual interpretation of "lane".
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, const int lane) {
+  switch (lane) {
+    case 0:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);
+    case 1:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);
+    case 2:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            0);
+    case 3:
+    default:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            1);
+  }
+}
+
+#else
+
+inline int32x4_t vdotq_s32(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, int lane) {
+  int8x8_t lane_rhs;
+  if (lane == 0) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));
+  } else if (lane == 1) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 1));
+  } else if (lane == 2) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 0));
+  } else {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 1));
+  }
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), lane_rhs));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), lane_rhs));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+#endif  // !__ARM_FEATURE_DOTPROD
+#endif  // ARM NEON
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DivideByPOT {};
+
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kAwayFromZero> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return RoundingDivideByPOT(x, exponent);
+  }
+};
+
+#ifdef USE_NEON
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kUpward> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return vqrshlq_s32(x, vdupq_n_s32(static_cast<int32>(-exponent)));
+  }
+};
+#endif  // ARM NEON
+
+// See CategorizeDotProductKernel for definitive taxonomy.
+enum class DotProduct3x3KernelType {
+  kNone = 0,  // Parameter combination is not supported for dot product kernels.
+  kPlain,
+  kWithDepthMultiplicationStride1,
+  kWithDepthMultiplicationStride2,
+  kStride2,
+};
+
+inline DotProduct3x3KernelType CategorizeDotProductKernel(
+    const RuntimeShape& input_shape, const RuntimeShape& filter_shape,
+    const DepthwiseParams& params) {
+  // Picks the 3x3 dot-product kernel variant for the given shapes and params;
+  // yields kNone when any of the requirements checked below is not met.
+  constexpr int kSymmetricZeroPoint = 128;
+  const int max_pad =
+      std::max(params.padding_values.width, params.padding_values.height);
+  const int step = params.stride_width;
+  const int32 depth_in = input_shape.Dims(3);
+  const int32 multiplier = params.depth_multiplier;
+  const int32 kernel_rows = filter_shape.Dims(1);
+  const int32 kernel_cols = filter_shape.Dims(2);
+
+  // Filter: 3x3, undilated, with weights_offset equal to -128.
+  const bool filter_ok = params.weights_offset == -kSymmetricZeroPoint &&
+                         kernel_cols == 3 && kernel_rows == 3 &&
+                         params.dilation_width_factor == 1 &&
+                         params.dilation_height_factor == 1;
+  // Geometry: equal strides of at most 2, padding of at most 1, and a
+  // non-positive output_shift.
+  const bool geometry_ok = step == params.stride_height && step <= 2 &&
+                           max_pad <= 1 && params.output_shift <= 0;
+  // Depth: a multiple of 8 with multiplier 1, or one channel multiplied out.
+  const bool depth_ok = ((depth_in % 8) == 0 && multiplier == 1) ||
+                        (depth_in == 1 && multiplier > 1);
+  if (!(filter_ok && geometry_ok && depth_ok)) {
+    return DotProduct3x3KernelType::kNone;
+  }
+
+  // Kernels exist for strides 1 and 2 only; any other value maps to kNone.
+  if (step != 1 && step != 2) {
+    return DotProduct3x3KernelType::kNone;
+  }
+
+  if (multiplier == 1) {
+    return step == 1 ? DotProduct3x3KernelType::kPlain
+                     : DotProduct3x3KernelType::kStride2;
+  }
+  return step == 1 ? DotProduct3x3KernelType::kWithDepthMultiplicationStride1
+                   : DotProduct3x3KernelType::kWithDepthMultiplicationStride2;
+}
+
+// Encapsulates constant parameters used in DepthwiseConv.
+// 64-bit is used for types that will be added to 64-bit addresses in asm.
+struct DepthwiseConvParams {
+  int64_t input_depth;
+  int64_t input_row_size;
+  int64_t output_depth;
+  int64_t output_row_size;
+  int64_t filter_row_size;
+  int32 input_offset;
+  int32 output_offset;
+  int32 filter_offset;
+  int32 output_multiplier;
+  int32 output_activation_min;
+  int32 output_activation_max;
+  int32 output_right_shift;
+  int32 input_width;
+  int32 input_height;
+  int32 stride_width;
+  int32 stride_height;
+  int32 output_width;
+  int32 output_height;
+};
+
+// Encapsulates constant parameters used in DepthwiseConv using dot-product ops.
+// 64-bit is used for types that will be added to 64-bit addresses in asm.
+//
+// This structure is specifically designed for use in asm.
+struct DepthwiseConvDotProdParams {
+  int64_t input_depth;
+  int64_t output_depth;
+  int32 stride;
+  int32 bias_increment;
+  //
+  int32 input_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int32 output_shift;
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  //
+  int32 padding_left;
+  int32 padding_right;
+  int32 padding_top;
+  int32 padding_bottom;
+  //
+  int32 depth_micro_repeats;
+  //
+  int32 width_macro_count;
+  int32 input_width_overall_micro_repeats;
+  int32 input_width_micro_repeats;
+  int32 residual_width;
+  int32 output_width_overall_micro_repeats;
+  int32 output_width_micro_repeats;
+  int32 output_residual_width;
+  int32 workspace_width_micro_repeats;
+  //
+  int32 height_macro_count;
+  int32 inbound_block_height;
+  int32 outbound_block_height;
+  int32 input_height_stride;
+  int32 output_height_stride;
+  int32 workspace_height_stride;
+  //
+  int32 four_over_stride;
+};
+
+template <DepthwiseConvOutputRounding output_rounding, int32 kDepth,
+          int32 kStrideWidth, int32 kStrideHeight>
+struct DepthwiseConvWindow {};
+
+template <DepthwiseConvOutputRounding output_rounding, int32 kDepth,
+          int32 kStrideWidth, int32 kStrideHeight>
+struct DepthwiseConvWindowPerChannel {};
+
+enum class EdgeType { kCorner, kHorizontal, kVertical, kCenter };
+
+template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType,
+          int kPadWidth, int kPadHeight>
+struct DepthwiseConvPartial {};
+
+template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType,
+          int kPadWidth, int kPadHeight>
+struct DepthwiseConvPartialPerChannel {};
+
+// Copies a subset of the input designated by |input_ptr| into |output_ptr|
+// with the specified output dimensions; depths are element counts of T.
+// Supports output depths of 64 only as this is the cache line size.
+template <typename T>
+inline void ShuffleInput(const T* input_ptr, int64_t input_depth,
+                         int32 input_width, int32 input_height,
+                         int64_t output_depth, int32 output_width,
+                         int32 output_height, T* output_ptr) {
+  const int64_t input_row_size = input_depth * input_width;
+  for (int32 y = 0; y < output_height; y++) {
+    const T* ptr = input_ptr;
+    for (int32 x = 0; x < output_width; x++) {
+      memcpy(output_ptr, ptr, output_depth * sizeof(T));  // bytes, not elements
+      output_ptr += output_depth;
+      ptr += input_depth;
+    }
+    input_ptr += input_row_size;
+  }
+}
+
+// Calculates the input size depending on stride and output.
+inline int32 get_shuffle_input_size(int32 stride, int32 output) {
+  return stride * (output - 1) + 3;
+}
+
+// Indicates the input and output dimensions used when shuffling input
+// activations.
+struct ShuffleParams {
+  int32 output_width;
+  int32 output_height;
+  int32 input_width;
+  int32 input_height;
+
+  ShuffleParams() = default;
+  ShuffleParams(int32 output_width, int32 output_height, int32 stride_width,
+                int32 stride_height)
+      : output_width(output_width),
+        output_height(output_height),
+        input_width(get_shuffle_input_size(stride_width, output_width)),
+        input_height(get_shuffle_input_size(stride_height, output_height)) {}
+};
+
+enum class QuantizationType {
+  kNonPerChannelUint8 = 0,
+  kPerChannelInt8 = 1,
+};
+
+// Returns true when the fast 3x3 depthwise kernels support this configuration:
+// see the checks below for the exact shape, stride, padding and shift limits.
+template <
+    QuantizationType quantization_type = QuantizationType::kNonPerChannelUint8>
+inline bool Fast3x3FilterKernelSupported(
+    const RuntimeShape& input_shape, const RuntimeShape& filter_shape,
+    int32 stride_width, int32 stride_height, int32 dilation_width_factor,
+    int32 dilation_height_factor, int32 pad_width, int32 pad_height,
+    int32 depth_multiplier, const RuntimeShape& output_shape,
+    int32 output_shift, const int32* output_shift_ptr = nullptr) {
+  const int32 input_height = input_shape.Dims(1);
+  const int32 input_width = input_shape.Dims(2);
+  const int32 input_depth = input_shape.Dims(3);
+  const int32 filter_height = filter_shape.Dims(1);
+  const int32 filter_width = filter_shape.Dims(2);
+  const int32 output_height = output_shape.Dims(1);
+  const int32 output_width = output_shape.Dims(2);
+  const int32 output_depth = output_shape.Dims(3);
+
+  bool supported =
+      filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
+      (stride_width == 1 || stride_width == 2) &&
+      (stride_height == 1 || stride_height == 2) &&
+      (stride_width == stride_height) && (pad_width == 0 || pad_width == 1) &&
+      (pad_height == 0 || pad_height == 1) && (pad_width == pad_height) &&
+      (input_depth % 8) == 0 && (output_shift <= 0) &&
+      dilation_width_factor == 1 && dilation_height_factor == 1;
+
+  if (!supported) return false;
+
+  if (quantization_type == QuantizationType::kPerChannelInt8) {
+    // Per-channel shifts must satisfy the same "<= 0" rule as output_shift.
+    if (output_shift_ptr == nullptr) return false;
+    for (int i = 0; i < output_depth; ++i) {
+      if (output_shift_ptr[i] > 0) return false;
+    }
+  }
+
+  // Handle case where padding is zero but padding type is not kValid.
+  // This would require special boundary case handling that is not supported.
+
+  const int32 out_x = output_width - 1;
+  const int32 out_y = output_height - 1;
+
+  const int32 in_x_origin = (out_x * stride_width) - pad_width;
+  const int32 in_y_origin = (out_y * stride_height) - pad_height;
+
+  const int32 in_x_end = in_x_origin + filter_width;
+  const int32 in_y_end = in_y_origin + filter_height;
+
+  // Supported only if filter on the right and bottom boundary lies completely
+  // within the input if padding is zero.
+  if (pad_width == 0 && pad_height == 0) {
+    return in_x_end <= input_width && in_y_end <= input_height;
+  }
+
+  // Else if padding is 1, supported if bottom right filter lies +1 past input
+  // width and height.
+  supported = in_x_end <= (input_width + 1) && in_y_end <= (input_height + 1);
+
+  if (!supported) {
+    return false;
+  }
+
+  // Shapes with width 1 and height > 1, and vice versa are not supported yet.
+  if (input_width == 1) {
+    supported = (input_width == input_height);
+  } else if (input_height == 1) {
+    supported = (input_width == input_height);
+  }
+  return supported;
+}
+
+// Permute filter data, and adjust bias data to account for symmetric input
+// offset. Details are provided in the implementation of the
+// kUseCModel3x3DotProduct version.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation>
+struct ProcessPerDepth {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Copy a macro block of data from the input buffer into the workspace,
+// permuting data within each micro block.
+//
+// (a) Copy a macro block of data, padding as required along the width and
+//     height.
+// (b) Transpose the data within each micro block.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication,
+          int32 max_padding>
+struct PackMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Apply filter to macro block of input data and store results. Details are
+// provided in the implementation of the kUseCModel3x3DotProduct version.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication, int32 stride>
+struct KernelMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+#if defined(USE_NEON) && defined(__aarch64__)
+// Experiments suggest that a modest performance improvement is seen, at least
+// on 855 chipset big cores, with cache hints.
+template <typename T>
+inline void PreloadInputBlock(
+    const T* input_block_data,
+    const DepthwiseConvDotProdParams* function_params) {
+  // Preload.
+  const int input_width_micro_repeats =
+      function_params->input_width_micro_repeats;
+  const int block_height = function_params->inbound_block_height;
+  const int residual_width = function_params->residual_width;
+  const int input_height_stride = function_params->input_height_stride;
+  const int input_depth = function_params->input_depth;
+
+  const int total_width = 4 * input_width_micro_repeats + residual_width;
+  const T* row_ptr = input_block_data;
+  for (int k_height = 0; k_height < block_height; ++k_height) {
+    const T* ptr = row_ptr;
+    for (int j = 0; j < total_width; ++j) {
+      // Input data is loaded once.
+      asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+      ptr += input_depth;
+    }
+    row_ptr += input_height_stride;
+  }
+}
+#endif  // USE_NEON && __aarch64__
+
+}  // namespace depthwise_conv
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
index c77715d..64e5898 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -15,7 +15,7 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 802c022..e100660 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -15,8 +15,9 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
 
-#include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
@@ -2045,7 +2046,7 @@
 }
 
 template <typename T, typename TS>
-struct DepthwiseConvWorkerTask : public gemmlowp::Task {
+struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {
   DepthwiseConvWorkerTask(const DepthwiseParams& params,
                           const RuntimeShape& input_shape, const T* input_data,
                           const RuntimeShape& filter_shape,
@@ -2106,7 +2107,7 @@
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+    uint8* output_data, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
 
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -2128,9 +2129,8 @@
     thread_count = thread_count_row;
   }
 
-  // TODO(b/130555917): Allow multi-threading after fixing ARM accuracy issues.
-  constexpr int kMaxThreads = 1;
-  thread_count = std::max(1, std::min(thread_count, kMaxThreads));
+  const int max_threads = cpu_backend_context->max_num_threads();
+  thread_count = std::max(1, std::min(thread_count, max_threads));
 
   if (thread_count == 1) {
     DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
@@ -2138,18 +2138,21 @@
                       output_data, /*thread_start=*/0,
                       /*thread_end=*/output_rows, /*thread_dim=*/1);
   } else {
-    std::vector<gemmlowp::Task*> tasks(thread_count);
+    std::vector<DepthwiseConvWorkerTask<uint8, int32>> tasks;
+    // TODO(b/131746020) don't create new heap allocations every time.
+    // At least we make it a single heap allocation by using reserve().
+    tasks.reserve(thread_count);
     int thread_start = 0;
     for (int i = 0; i < thread_count; ++i) {
       int thread_end =
           thread_start + (thread_dim_size - thread_start) / (thread_count - i);
-      tasks[i] = new DepthwiseConvWorkerTask<uint8, int32>(
-          params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data, thread_start,
-          thread_end, thread_dim);
+      tasks.emplace_back(params, input_shape, input_data, filter_shape,
+                         filter_data, bias_shape, bias_data, output_shape,
+                         output_data, thread_start, thread_end, thread_dim);
       thread_start = thread_end;
     }
-    gemmlowp_context->workers_pool()->Execute(tasks);
+    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                    cpu_backend_context);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 3121fd0..eff9f24 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -17,9 +17,9 @@
 
 #include <memory>
 
-#include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
@@ -27,13 +27,6 @@
 namespace optimized_ops {
 namespace depthwise_conv {
 
-constexpr int kDepthwiseConvScratchWorkspaceSize = 10 * 10 * 64;
-constexpr int kDepthwiseConvAdjustedBiasLimit = 64;
-// In cases such as depth multiplication, we want to be able to load data from
-// the workspace that is beyond the valid range. Macro-block sizes are adjusted
-// to allow for this.
-constexpr int kWorkspaceExtension = 16;
-
 #ifdef USE_NEON
 // Lane operations are for clarity and convenience. We want to load and store
 // 4 8-bit lanes together. So these are treated much like 32-bit loads and
@@ -46,6 +39,9 @@
   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
   vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
 
+// Important! Most compilation configurations will compile and run without
+// reinterpret_cast. Sanitizers may fail silently on lane-loading, probably due
+// to an obscure bug or mis-feature in unhygienic macro expansion.
 #define vld1q_lane_s8x8(src, reg, lane_num) \
   vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
 #define vld1_lane_8x4(src, reg, lane_num) \
@@ -54,224 +50,6 @@
   vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
 #define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
 
-#ifndef __aarch64__
-inline int8x16_t vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
-  const uint8x16_t mask = vtstq_s8(b, vdupq_n_s8(8));
-
-  // Delete bit 3 from the indices.
-  const int8x16_t high_bits = vshrq_n_s8(b, 4);
-  int8x16_t deleted_bit_3 = b;
-  deleted_bit_3 = vsliq_n_s8(deleted_bit_3, high_bits, 3);
-
-  int8x8x4_t repacked_data;
-
-  // Calculate for lower indices.
-  repacked_data.val[0] = vget_low_s8(a.val[0]);
-  repacked_data.val[1] = vget_low_s8(a.val[1]);
-  repacked_data.val[2] = vget_low_s8(a.val[2]);
-  repacked_data.val[3] = vget_low_s8(a.val[3]);
-  const int8x16_t output_for_lower =
-      vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),
-                  vtbl4_s8(repacked_data, vget_high_s8(deleted_bit_3)));
-
-  // Calculate for high indices.
-  repacked_data.val[0] = vget_high_s8(a.val[0]);
-  repacked_data.val[1] = vget_high_s8(a.val[1]);
-  repacked_data.val[2] = vget_high_s8(a.val[2]);
-  repacked_data.val[3] = vget_high_s8(a.val[3]);
-  const int8x16_t output_for_higher =
-      vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),
-                  vtbl4_s8(repacked_data, vget_high_s8(deleted_bit_3)));
-
-  // Merge.
-  int8x16_t output = vbslq_s8(mask, output_for_higher, output_for_lower);
-  return output;
-}
-#endif  // !__aarch64__
-
-// Convenience-compatibility functions.
-// Compatibility: Intrinsics reflect a mixture of older and newer ARM
-//     instructions. This actually results in ZIP1 / ZIP2 asm instructions, but
-//     one intrinsic is provided. Also older instructions operated in place,
-//     and it seems more defensive to assume that some versions of intrinsics
-//     might reflect this
-// Convenience: Callers in these kernels want both ZIP1 and ZIP2, and we do not
-//     want the calling code to get cluttered with unpacking int8x16x2_t.
-inline void vzipq_s8_in_place(int8x16_t* a, int8x16_t* b) {
-  int8x16x2_t r8x16;
-  r8x16 = vzipq_s8(*a, *b);
-  *a = r8x16.val[0];
-  *b = r8x16.val[1];
-}
-
-inline void vzipq_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
-  int16x8x2_t r16x8;
-  r16x8 = vzipq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
-  *a = vreinterpretq_s8_s16(r16x8.val[0]);
-  *b = vreinterpretq_s8_s16(r16x8.val[1]);
-}
-
-// Similar rationale to the zip-in_place functions, but callers only actually
-// need the TRN1 asm instruction result.
-inline void vtrn1_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
-  int16x8x2_t r16x8;
-  r16x8 = vtrnq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
-  *a = vreinterpretq_s8_s16(r16x8.val[0]);
-}
-
-// Similar rationale to the zip-in_place functions, but callers only actually
-// need the ZIP1 or ZIP2 asm instruction results.
-inline int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
-  return vzipq_s8(a, b).val[0];
-}
-inline int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
-  return vzipq_s8(a, b).val[1];
-}
-
-inline void biregister_rotate_8(int8x16_t* left, int8x16_t* right) {
-  *left = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*left), 8));
-  *left = vreinterpretq_s8_u32(vsliq_n_u32(vreinterpretq_u32_s8(*left),
-                                           vreinterpretq_u32_s8(*right), 24));
-  *right = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*right), 8));
-}
-
-#ifndef __aarch64__
-inline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-  int32x4x2_t deinterleaved = vuzpq_s32(a, b);
-  return vqaddq_s32(deinterleaved.val[0], deinterleaved.val[1]);
-}
-#endif  // !__aarch64__
-
-#ifdef __ARM_FEATURE_DOTPROD
-// The vdotq_lane_s32 takes int8x8t for the rhs parameter, whereas the actual
-// instruction selects from between 4 32-bit (4x8-bit packed) sub-registers, an
-// unusual interpretation of "lane".
-inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
-                                     int8x16_t rhs, const int lane) {
-  switch (lane) {
-    case 0:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);
-    case 1:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);
-    case 2:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
-                            0);
-    case 3:
-    default:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
-                            1);
-  }
-}
-
-#else
-
-inline int32x4_t vdotq_s32(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
-  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
-  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
-  int32x4_t sum = vpaddq_s32(sum0, sum1);
-  return vaddq_s32(acc, sum);
-}
-
-inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
-                                     int8x16_t rhs, int lane) {
-  int8x8_t lane_rhs;
-  if (lane == 0) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));
-  } else if (lane == 1) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 1));
-  } else if (lane == 2) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 0));
-  } else {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 1));
-  }
-  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), lane_rhs));
-  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), lane_rhs));
-  int32x4_t sum = vpaddq_s32(sum0, sum1);
-  return vaddq_s32(acc, sum);
-}
-
-#endif  // !__ARM_FEATURE_DOTPROD
-#endif  // ARM NEON
-
-template <DepthwiseConvOutputRounding output_rounding>
-struct DivideByPOT {};
-
-template <>
-struct DivideByPOT<DepthwiseConvOutputRounding::kAwayFromZero> {
-  template <typename IntegerType>
-  static inline IntegerType Run(IntegerType x, int exponent) {
-    return RoundingDivideByPOT(x, exponent);
-  }
-};
-
-#ifdef USE_NEON
-template <>
-struct DivideByPOT<DepthwiseConvOutputRounding::kUpward> {
-  template <typename IntegerType>
-  static inline IntegerType Run(IntegerType x, int exponent) {
-    return vqrshlq_s32(x, vdupq_n_s32(static_cast<int32>(-exponent)));
-  }
-};
-#endif  // ARM NEON
-
-// See CategorizeDotProductKernel for definitive taxonomy.
-enum class DotProduct3x3KernelType {
-  kNone = 0,  // Parameter combination is not supported for dot product kernels.
-  kPlain,
-  kWithDepthMultiplicationStride1,
-  kWithDepthMultiplicationStride2,
-  kStride2,
-};
-
-inline DotProduct3x3KernelType CategorizeDotProductKernel(
-    const RuntimeShape& input_shape, const RuntimeShape& filter_shape,
-    const DepthwiseParams& params) {
-  constexpr int kSymmetricZeroPoint = 128;
-  const int padding =
-      std::max(params.padding_values.width, params.padding_values.height);
-  const int stride = params.stride_width;
-  const int32 input_depth = input_shape.Dims(3);
-  const int32 depth_multiplier = params.depth_multiplier;
-  const int32 filter_height = filter_shape.Dims(1);
-  const int32 filter_width = filter_shape.Dims(2);
-
-  bool supported =
-      params.weights_offset == -kSymmetricZeroPoint &&
-      stride == params.stride_height && stride <= 2 && padding <= 1 &&
-      filter_width == 3 && filter_height == 3 && params.output_shift <= 0 &&
-      params.dilation_width_factor == 1 && params.dilation_height_factor == 1 &&
-      (((input_depth % 8) == 0 && depth_multiplier == 1) ||
-       (input_depth == 1 && depth_multiplier > 1));
-
-  if (!supported) {
-    return DotProduct3x3KernelType::kNone;
-  }
-
-  if (params.depth_multiplier == 1) {
-    if (stride == 1) {
-      return DotProduct3x3KernelType::kPlain;
-    } else if (stride == 2) {
-      return DotProduct3x3KernelType::kStride2;
-    } else {
-      return DotProduct3x3KernelType::kNone;
-    }
-  } else {
-    if (stride == 1) {
-      return DotProduct3x3KernelType::kWithDepthMultiplicationStride1;
-    } else if (stride == 2) {
-      return DotProduct3x3KernelType::kWithDepthMultiplicationStride2;
-    } else {
-      return DotProduct3x3KernelType::kNone;
-    }
-  }
-}
-
-#ifdef USE_NEON
-
 #define STR(s) STR_UNEXPANDED(s)
 #define STR_UNEXPANDED(s) #s
 
@@ -280,29 +58,6 @@
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 #include <stddef.h>
 
-// Encapsulates constant parameters used in DepthwiseConv.
-// 64-bit is used for types that will be added to 64-bit addresses in asm.
-struct DepthwiseConvParams {
-  int64_t input_depth;
-  int64_t input_row_size;
-  int64_t output_depth;
-  int64_t output_row_size;
-  int64_t filter_row_size;
-  int32 input_offset;
-  int32 output_offset;
-  int32 filter_offset;
-  int32 output_multiplier;
-  int32 output_activation_min;
-  int32 output_activation_max;
-  int32 output_right_shift;
-  int32 input_width;
-  int32 input_height;
-  int32 stride_width;
-  int32 stride_height;
-  int32 output_width;
-  int32 output_height;
-};
-
 // Represents the number of bytes offset from the start of the
 // DepthwiseConvParams struct. This is used in the asm to load parameters.
 // Keep these values in sync with the static_asserts below.
@@ -380,49 +135,6 @@
 #endif  // __aarch64__
 #endif  // ARM NEON
 
-// Encapsulates constant parameters used in DepthwiseConv using dot-product ops.
-// 64-bit is used for types that will be added to 64-bit addresses in asm.
-//
-// This structure is specifically designed for use in asm.
-struct DepthwiseConvDotProdParams {
-  int64_t input_depth;
-  int64_t output_depth;
-  int32 stride;
-  int32 bias_increment;
-  //
-  int32 input_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int32 output_shift;
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  //
-  int32 padding_left;
-  int32 padding_right;
-  int32 padding_top;
-  int32 padding_bottom;
-  //
-  int32 depth_micro_repeats;
-  //
-  int32 width_macro_count;
-  int32 input_width_overall_micro_repeats;
-  int32 input_width_micro_repeats;
-  int32 residual_width;
-  int32 output_width_overall_micro_repeats;
-  int32 output_width_micro_repeats;
-  int32 output_residual_width;
-  int32 workspace_width_micro_repeats;
-  //
-  int32 height_macro_count;
-  int32 inbound_block_height;
-  int32 outbound_block_height;
-  int32 input_height_stride;
-  int32 output_height_stride;
-  int32 workspace_height_stride;
-  //
-  int32 four_over_stride;
-};
-
 #ifdef USE_NEON
 #if defined(__ARM_FEATURE_DOTPROD) && !defined(GOOGLE_L4T)
 // Represents the number of bytes offset from the start of the
@@ -574,9 +286,6 @@
 #endif  // __ARM_FEATURE_DOTPROD && !GOOGLE_L4T
 
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
-template <DepthwiseConvOutputRounding output_rounding, int32 kDepth,
-          int32 kStrideWidth, int32 kStrideHeight>
-struct DepthwiseConvWindow {};
 
 template <>
 struct DepthwiseConvWindow<DepthwiseConvOutputRounding::kAwayFromZero, 8, 1,
@@ -652,7 +361,6 @@
         "dup v30.16b, w4\n"
         "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v31.16b, w0\n"
-        "neg w9, w9\n"
         "dup v28.4s, w9\n"
         "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
         "add x10, %[bias_ptr], #16\n"
@@ -1586,7 +1294,6 @@
         "dup v30.16b, w4\n"
         "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v31.16b, w0\n"
-        "neg w9, w9\n"
         "dup v28.4s, w9\n"
         "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
         "add x10, %[bias_ptr], #16\n"
@@ -2403,7 +2110,6 @@
         "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
         "cmp %w[output_window_height], #2\n"
         "dup v28.8h, w0\n"
-        "neg w9, w9\n"
         "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
         "dup v26.4s, w9\n"
         "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
@@ -3436,7 +3142,6 @@
         "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
         "cmp %w[output_window_height], #2\n"
         "dup v28.8h, w0\n"
-        "neg w9, w9\n"
         "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
         "dup v26.4s, w9\n"
         "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
@@ -4296,12 +4001,6 @@
   }
 };
 
-enum class EdgeType { kCorner, kHorizontal, kVertical, kCenter };
-
-template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType,
-          int kPadWidth, int kPadHeight>
-struct DepthwiseConvPartial {};
-
 template <>
 struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kAwayFromZero,
                             EdgeType::kCenter, 1, 1> {
@@ -4326,7 +4025,6 @@
         "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w9\n"
         "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w10, w10\n"
         "dup v29.4s, w10\n"
         "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.16b, w9\n"
@@ -4440,7 +4138,6 @@
         "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w9\n"
         "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w10, w10\n"
         "dup v29.4s, w10\n"
         "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.16b, w9\n"
@@ -4562,7 +4259,6 @@
         "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w6\n"
         "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w7, w7\n"
         "dup v29.4s, w7\n"
         "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.16b, w6\n"
@@ -4728,7 +4424,6 @@
         "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w6\n"
         "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w7, w7\n"
         "dup v29.4s, w7\n"
         "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.16b, w6\n"
@@ -4888,7 +4583,6 @@
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w12\n"
         "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w13, w13\n"
         "dup v29.4s, w13\n"
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.8b, w12\n"
@@ -5088,7 +4782,6 @@
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w12\n"
         "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w13, w13\n"
         "dup v29.4s, w13\n"
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.8b, w12\n"
@@ -5278,7 +4971,6 @@
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w12\n"
         "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w13, w13\n"
         "dup v29.4s, w13\n"
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.8b, w12\n"
@@ -5483,7 +5175,6 @@
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "dup v28.8h, w12\n"
         "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
-        "neg w13, w13\n"
         "dup v29.4s, w13\n"
         "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
         "dup v30.8b, w12\n"
@@ -5640,47 +5331,6 @@
 #undef OFFSET_OUTPUT_WIDTH
 #undef OFFSET_OUTPUT_HEIGHT
 
-// Copies a subset of the input designated by |input_ptr| into |output_ptr|
-// with the specified output dimensions. Supports output depths of 64 only as
-// this is the cache line size.
-inline void ShuffleInput(const uint8* input_ptr, int64_t input_depth,
-                         int32 input_width, int32 input_height,
-                         int64_t output_depth, int32 output_width,
-                         int32 output_height, uint8* output_ptr) {
-  const int64_t input_row_size = input_depth * input_width;
-  for (int32 y = 0; y < output_height; y++) {
-    const uint8* ptr = input_ptr;
-    for (int32 x = 0; x < output_width; x++) {
-      memcpy(output_ptr, ptr, output_depth);
-      output_ptr += output_depth;
-      ptr += input_depth;
-    }
-    input_ptr += input_row_size;
-  }
-}
-
-// Calculates the input size depending on stride and output.
-inline int32 get_shuffle_input_size(int32 stride, int32 output) {
-  return stride * (output - 1) + 3;
-}
-
-// Indicates the input and output dimensions used when shuffling input
-// activations.
-struct ShuffleParams {
-  int32 output_width;
-  int32 output_height;
-  int32 input_width;
-  int32 input_height;
-
-  ShuffleParams() = default;
-  ShuffleParams(int32 output_width, int32 output_height, int32 stride_width,
-                int32 stride_height)
-      : output_width(output_width),
-        output_height(output_height),
-        input_width(get_shuffle_input_size(stride_width, output_width)),
-        input_height(get_shuffle_input_size(stride_height, output_height)) {}
-};
-
 template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
           int32 kStrideHeight>
 struct DepthwiseConvThroughDepth {
@@ -5688,8 +5338,8 @@
   // |start_depth| to |end_depth|. Keep this not inlined to maintain a small
   // binary size. We use a DepthwiseConvParams struct for read only params
   // to minimize call overhead.
-  static __attribute__((noinline)) void Run(
-      const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
+  static void __attribute__((noinline))
+  Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
       uint8* output_ptr, int64_t start_depth, int64_t end_depth,
       int64_t input_depth, int64_t input_row_size, int32 output_window_height,
       int32 output_window_width, const DepthwiseConvParams& params) {
@@ -5904,68 +5554,6 @@
       input_ptr, filter_ptr, bias_data, output_ptr, &params);
 }
 
-inline bool Fast3x3FilterKernelSupported(
-    const RuntimeShape& input_shape, const RuntimeShape& filter_shape,
-    int32 stride_width, int32 stride_height, int32 dilation_width_factor,
-    int32 dilation_height_factor, int32 pad_width, int32 pad_height,
-    int32 depth_multiplier, const RuntimeShape& output_shape,
-    int32 output_shift) {
-  const int32 input_height = input_shape.Dims(1);
-  const int32 input_width = input_shape.Dims(2);
-  const int32 input_depth = input_shape.Dims(3);
-  const int32 filter_height = filter_shape.Dims(1);
-  const int32 filter_width = filter_shape.Dims(2);
-  const int32 output_height = output_shape.Dims(1);
-  const int32 output_width = output_shape.Dims(2);
-
-  bool supported =
-      filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
-      (stride_width == 1 || stride_width == 2) &&
-      (stride_height == 1 || stride_height == 2) &&
-      (stride_width == stride_height) && (pad_width == 0 || pad_width == 1) &&
-      (pad_height == 0 || pad_height == 1) && (pad_width == pad_height) &&
-      (input_depth % 8) == 0 && (output_shift <= 0) &&
-      dilation_width_factor == 1 && dilation_height_factor == 1;
-
-  if (!supported) {
-    return false;
-  }
-
-  // Handle case where padding is zero but padding type is not kValid.
-  // This would require special boundary case handling that is not supported.
-
-  const int32 out_x = output_width - 1;
-  const int32 out_y = output_height - 1;
-
-  const int32 in_x_origin = (out_x * stride_width) - pad_width;
-  const int32 in_y_origin = (out_y * stride_height) - pad_height;
-
-  const int32 in_x_end = in_x_origin + filter_width;
-  const int32 in_y_end = in_y_origin + filter_height;
-
-  // Supported only if filter on the right and bottom boundary lies completely
-  // within the input if padding is zero.
-  if (pad_width == 0 && pad_height == 0) {
-    return in_x_end <= input_width && in_y_end <= input_height;
-  }
-
-  // Else if padding is 1, supported if bottom right filter lies +1 past input
-  // width and height.
-  supported = in_x_end <= (input_width + 1) && in_y_end <= (input_height + 1);
-
-  if (!supported) {
-    return false;
-  }
-
-  // Shapes with width 1 and height > 1, and vice versa are not supported yet.
-  if (input_width == 1) {
-    supported = (input_width == input_height);
-  } else if (input_height == 1) {
-    supported = (input_width == input_height);
-  }
-  return supported;
-}
-
 template <DepthwiseConvOutputRounding output_rounding>
 inline void DepthwiseConv3x3Filter(
     const DepthwiseParams& rt_params, const RuntimeShape& input_shape,
@@ -6002,7 +5590,7 @@
   params.output_offset = output_offset;
   params.filter_offset = filter_offset;
   params.output_multiplier = output_multiplier;
-  params.output_right_shift = -output_shift;
+  params.output_right_shift = output_shift;
   params.output_activation_min = output_activation_min;
   params.output_activation_max = output_activation_max;
 
@@ -6079,9 +5667,9 @@
   }
 
   for (int32 b = batch_start; b < batch_end; ++b) {
+    // input_ptr and output_ptr point to the start of each batch.
     const uint8* input_ptr = input_data + b * input_batch_size;
-    uint8* output_ptr = output_data + b * output_batch_size +
-                        row_start * params.output_width * params.output_depth;
+    uint8* output_ptr = output_data + b * output_batch_size;
 
     int32 out_x = 0;
     int32 out_y = row_start;
@@ -6097,13 +5685,19 @@
       end_x = params.output_width - 1;
       out_y = std::max(1, out_y);
       end_y = std::min(params.output_height - 1, end_y);
-      const int in_x = (out_x * stride_width) - pad_width;
-      const int in_y = (out_y * stride_height) - pad_height;
-      input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
-      output_ptr +=
-          out_y * params.output_row_size + out_x * params.output_depth;
     }
 
+    // pad_width and pad_height can both be 0 or 1, depending on padding option,
+    // such as Padding_VALID / Padding_SAME.
+    const int in_x = (out_x * stride_width) - pad_width;
+    const int in_y = (out_y * stride_height) - pad_height;
+
+    // input_ptr and output_ptr point to (in_y, in_x) and (out_y, out_x),
+    // respectively. (in_y, in_x) and (out_y, out_x) change along with
+    // row_start.
+    input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
+    output_ptr += out_y * params.output_row_size + out_x * params.output_depth;
+
     // Shuffling shapes that maximize width over the shuffle workspace size
     // perform better since the inputs are closer together, minimizing
     // shuffling time.
@@ -6160,209 +5754,145 @@
 
 #endif
 
-// Permute filter data, and adjust bias data to account for symmetric input
-// offset. Details are provided in the implementation of the
-// kUseCModel3x3DotProduct version.
-//
-// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+// Workspace cache hinting and pre-writing. The default Run() does nothing.
 template <DepthwiseConvImplementation implementation>
-struct ProcessPerDepth {
-  // Routine is contained in a static Run() method. No default template version
-  // is supplied, so that all implementations are deliberate choices of template
-  // specialization.
-  //
-  // Note that the signature of the Run() method will be designed for the asm
-  // implementation rather than conforming to style.
-};
-
-// Copy a macro block of data from the input buffer into the workspace,
-// permuting data within each micro block.
-//
-// (a) Copy a macro block of data, padding as required along the width and
-//     height.
-// (b) Transpose the data within each micro block.
-//
-// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
-template <DepthwiseConvImplementation implementation,
-          DepthwiseConvDepthMultiplication depth_multiplication,
-          int32 max_padding>
-struct PackMacroBlock {
-  // Routine is contained in a static Run() method. No default template version
-  // is supplied, so that all implementations are deliberate choices of template
-  // specialization.
-  //
-  // Note that the signature of the Run() method will be designed for the asm
-  // implementation rather than conforming to style.
-};
-
-// Apply filter to macro block of input data and store results. Details are
-// provided in the implementation of the kUseCModel3x3DotProduct version.
-//
-// Parameters for repeats and residual sizes are in terms of outputs.
-//
-// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
-template <DepthwiseConvImplementation implementation,
-          DepthwiseConvDepthMultiplication depth_multiplication, int32 stride>
-struct KernelMacroBlock {
-  // Routine is contained in a static Run() method. No default template version
-  // is supplied, so that all implementations are deliberate choices of template
-  // specialization.
-  //
-  // Note that the signature of the Run() method will be designed for the asm
-  // implementation rather than conforming to style.
+struct WorkspacePrefetchWrite {
+  static inline void Run(int8 fill_data, int size, int8* workspace) {}
 };
 
 #if defined(USE_NEON) && defined(__aarch64__)
-// Experiments suggest that a modest performance improvement is seen, at least
-// on 855 chipset big cores, with cache hints.
-inline void PreloadInputBlock(
-    const uint8* input_block_data,
-    const DepthwiseConvDotProdParams* function_params) {
-  // Preload.
-  const int input_width_micro_repeats =
-      function_params->input_width_micro_repeats;
-  const int block_height = function_params->inbound_block_height;
-  const int residual_width = function_params->residual_width;
-  const int input_height_stride = function_params->input_height_stride;
-  const int input_depth = function_params->input_depth;
-
-  const int total_width = 4 * input_width_micro_repeats + residual_width;
-  const uint8* row_ptr = input_block_data;
-  for (int k_height = 0; k_height < block_height; ++k_height) {
-    const uint8* ptr = row_ptr;
-    for (int j = 0; j < total_width; ++j) {
-      // Input data is loaded once.
-      asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-      ptr += input_depth;
+// Encourage the processor to keep the workspace in cache. Both the cache hint
+// and some memory writes are required: every 64 bytes Run() issues a prefetch
+// hint (prfm pstl1keep) plus a 4-byte store, then one 4-byte store at the end.
+// This code is extremely fragile.
+// Do not edit without extensive comparative performance testing.
+// Do not inline without great care.
+// Do not rely on results before and after getting coffee: non-thermal changes
+//    of more than 10% can occur with hidden underlying processor state changes.
+template <>
+struct WorkspacePrefetchWrite<
+    DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
+  static void __attribute__((noinline))
+  Run(int8 fill_data, int size, int8* workspace) {
+    const int8x8_t fill_data_vec = vdup_n_s8(fill_data);
+    int i = 0;
+    for (; i < (size - 15); i += 64) {
+      int8* ptr = workspace + i;
+      asm volatile("prfm pstl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+      vst1_lane_u32(reinterpret_cast<uint32_t*>(ptr), fill_data_vec, 0);
     }
-    row_ptr += input_height_stride;
+    vst1_lane_u32(reinterpret_cast<uint32_t*>(workspace + size - 4),
+                  fill_data_vec, 0);
   }
-}
+};
 #endif  // USE_NEON &&__aarch64__
 
 #if defined(__ARM_FEATURE_DOTPROD) && !defined(GOOGLE_L4T)
 
 template <>
 struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
-  static void ProcessPerDepthNeon(
+  static inline void ProcessPerDepthNeon(
       const uint8* filter_data, const int32* bias_data,
       int8* shuffled_filter_data, int32* adjusted_bias_data,
       const DepthwiseConvDotProdParams* function_params) {
-    const int depth = function_params->output_depth;
-    const int depth_micro_repeats = function_params->depth_micro_repeats;
-    const int bias_increment = function_params->bias_increment;
+    // Hand-written AArch64 version of the intrinsics loop it replaces: for
+    // each 8-channel micro block, interleave the 3x3 filter taps with the
+    // sign bit flipped, and add the filter sums scaled by w11 + 128 to the
+    // bias (w11 is presumably input_offset; confirm against DP_OFFSET_*).
+    // Operands are always referenced by name, so the asm does not depend on
+    // x0-x4 still holding the five arguments in declaration order.
 
-    constexpr int kSymmetricZeroPoint = 128;
-    constexpr uint8 kSignBit = 0x80;
-    const int32 input_offset = function_params->input_offset;
-    TFLITE_DCHECK_GE(input_offset, -255);
-    TFLITE_DCHECK_LE(input_offset, 0);
-    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
-    const int8x16_t ones_vector = vdupq_n_s8(1);
-
-    // Simulate NEON-register transposition of subset of filter.
-    int8x16_t input_0_a;
-    int8x16_t input_0_b;
-    int8x16_t input_0_c;
-    int8x16_t input_1_a;
-    int8x16_t input_1_b;
-    int8x16_t input_1_c;
-    int8x16_t input_2_a;
-    int8x16_t input_2_b;
-    int8x16_t input_2_c;
-
-    int8x16_t filter_0_a;
-    int8x16_t filter_0_b;
-    int8x16_t filter_1_a;
-    int8x16_t filter_1_b;
-    int8x16_t filter_2_a;
-    int8x16_t filter_2_b;
-
-    // Register pairs for each height.
-    // Effect subtraction of zero-point = 128 by XOR of sign bit.
-    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
-
-    const uint8* filter_block = filter_data;
-    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
-      // Filter data is provided as filter_block[3][3][depth/8][2][4].
-      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
-      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
-
-      const uint8* filter_block_ptr = filter_block;
-      input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
-      filter_block_ptr += depth;
-      input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
-      filter_block_ptr += depth;
-      input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
-      filter_block_ptr += depth;
-      input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
-      filter_block_ptr += depth;
-      input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
-      filter_block_ptr += depth;
-      input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
-      filter_block_ptr += depth;
-      input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
-      filter_block_ptr += depth;
-      input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
-      filter_block_ptr += depth;
-      input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);
-
-      filter_0_a = vzip1q_s8(input_0_a, input_0_b);
-      filter_0_b = vzip1q_s8(input_0_c, sign_bit);
-      filter_1_a = vzip1q_s8(input_1_a, input_1_b);
-      filter_1_b = vzip1q_s8(input_1_c, sign_bit);
-      filter_2_a = vzip1q_s8(input_2_a, input_2_b);
-      filter_2_b = vzip1q_s8(input_2_c, sign_bit);
-      filter_0_a = veorq_s8(filter_0_a, sign_bit);
-      filter_0_b = veorq_s8(filter_0_b, sign_bit);
-      filter_1_a = veorq_s8(filter_1_a, sign_bit);
-      filter_1_b = veorq_s8(filter_1_b, sign_bit);
-      filter_2_a = veorq_s8(filter_2_a, sign_bit);
-      filter_2_b = veorq_s8(filter_2_b, sign_bit);
-      vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
-      vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
-      vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);
-
-      vst1q_s8(shuffled_filter_data, filter_0_a);
-      shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_0_b);
-      shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_1_a);
-      shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_1_b);
-      shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_2_a);
-      shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_2_b);
-      shuffled_filter_data += 16;
-
-      int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
-      bias_data += bias_increment;
-      int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
-      bias_data += bias_increment;
-      // For instance, if input_offset == 128, no adjustment is needed.
-
-      int32x4_t filter_sum_a = vdupq_n_s32(0);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
-      int32x4_t filter_sum_b = vdupq_n_s32(0);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);
-
-      adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
-                                         input_offset_difference);
-      adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
-                                         input_offset_difference);
-
-      vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
-      adjusted_bias_data += 4;
-      vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
-      adjusted_bias_data += 4;
-
-      filter_block += 8;
-    }
+    asm volatile(
+        // %bb.0:
+        "ldp    w12, w11, [%[function_params], #" STR(DP_OFFSET_BIAS_INCREMENT) "]\n"
+        "ldrsw  x9, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
+        "ldr    w10, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
+        "mov    x8, xzr\n"
+        "add    w11, w11, #128\n"  // =128
+        "sxtw   x12, w12\n"
+        "movi   v0.16b, #128\n"
+        "dup    v1.4s, w11\n"
+        "lsl    x11, x12, #3\n"
+        "lsl    x12, x12, #2\n"
+        "movi   v2.16b, #1\n"
+        // implicit-def: $q3
+        // implicit-def: $q4
+        // implicit-def: $q5
+        // implicit-def: $q6
+        // implicit-def: $q7
+        // implicit-def: $q16
+        // implicit-def: $q17
+        // implicit-def: $q18
+        // implicit-def: $q19
+        "b      DC_PER_DEPTH_2\n"
+        "   DC_PER_DEPTH_1:\n"  // in Loop: Header=BB177_2 Depth=1
+        "add    x13, %[filter_data], x8, lsl #3\n"
+        "ld1    { v19.d }[0], [x13], x9\n"
+        "movi   v21.2d, #0\n"
+        "movi   v20.2d, #0\n"
+        "add    x8, x8, #1\n"  // =1
+        "ld1    { v18.d }[0], [x13], x9\n"
+        "ld1    { v17.d }[0], [x13], x9\n"
+        "zip1   v22.16b, v19.16b, v18.16b\n"
+        "eor    v22.16b, v22.16b, v0.16b\n"
+        "ld1    { v16.d }[0], [x13], x9\n"
+        "zip1   v23.16b, v17.16b, v0.16b\n"
+        "eor    v23.16b, v23.16b, v0.16b\n"
+        "zip1   v24.8h, v22.8h, v23.8h\n"
+        "ld1    { v7.d }[0], [x13], x9\n"
+        "zip2   v22.8h, v22.8h, v23.8h\n"
+        "sdot   v21.4s, v22.16b, v2.16b\n"
+        "sdot   v20.4s, v24.16b, v2.16b\n"
+        "ld1    { v6.d }[0], [x13], x9\n"
+        "zip1   v23.16b, v16.16b, v7.16b\n"
+        "eor    v23.16b, v23.16b, v0.16b\n"
+        "ld1    { v5.d }[0], [x13], x9\n"
+        "zip1   v25.16b, v6.16b, v0.16b\n"
+        "eor    v25.16b, v25.16b, v0.16b\n"
+        "zip1   v26.8h, v23.8h, v25.8h\n"
+        "ld1    { v4.d }[0], [x13], x9\n"
+        "zip2   v23.8h, v23.8h, v25.8h\n"
+        "sdot   v21.4s, v23.16b, v2.16b\n"
+        "sdot   v20.4s, v26.16b, v2.16b\n"
+        "ld1    { v3.d }[0], [x13]\n"
+        "zip1   v25.16b, v5.16b, v4.16b\n"
+        "stp    q26, q23, [%[shuffled_filter_data], #32]\n"
+        "stp    q24, q22, [%[shuffled_filter_data]]\n"
+        "zip1   v23.16b, v3.16b, v0.16b\n"
+        "eor    v22.16b, v25.16b, v0.16b\n"
+        "eor    v23.16b, v23.16b, v0.16b\n"
+        "zip1   v24.8h, v22.8h, v23.8h\n"
+        "zip2   v22.8h, v22.8h, v23.8h\n"
+        "stp    q24, q22, [%[shuffled_filter_data], #64]\n"
+        "sdot   v21.4s, v22.16b, v2.16b\n"
+        "ldr    q22, [%[bias_data]]\n"
+        "ldr    q23, [%[bias_data], x12]\n"
+        "sdot   v20.4s, v24.16b, v2.16b\n"
+        "add    %[shuffled_filter_data], %[shuffled_filter_data], #96\n"  // =96
+        "mla    v22.4s, v20.4s, v1.4s\n"
+        "mla    v23.4s, v21.4s, v1.4s\n"
+        "add    %[bias_data], %[bias_data], x11\n"
+        "stp    q22, q23, [%[adjusted_bias_data]], #32\n"
+        "   DC_PER_DEPTH_2:\n"  // =>This Inner Loop Header: Depth=1
+        "cmp    w8, w10\n"
+        "b.lt   DC_PER_DEPTH_1\n"
+        :
+        // Outputs.
+        [ filter_data ] "+r"(filter_data),
+        [ bias_data ] "+r"(bias_data),
+        [ shuffled_filter_data ] "+r"(shuffled_filter_data),
+        [ adjusted_bias_data ] "+r"(adjusted_bias_data)
+        :
+        // Inputs.
+        [ function_params ] "r"(function_params)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        // We use these general-purpose registers.
+        "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x15", "x16");
   }
 
   static inline void Run(const uint8* filter_data, const int32* bias_data,
@@ -6571,11 +6101,11 @@
         scratch_block_data + block_height * workspace_height_stride);
   }
 
-  static inline void Run(int32 height_block_number, int32 width_block_number,
-                         const uint8* input_block_data,
-                         int8* scratch_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock(input_block_data, function_params);
+  static void __attribute__((noinline))
+  Run(int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock<uint8>(input_block_data, function_params);
     PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
   }
 };
@@ -6962,11 +6492,11 @@
         scratch_block_data + block_height * workspace_height_stride);
   }
 
-  static inline void Run(int32 height_block_number, int32 width_block_number,
-                         const uint8* input_block_data,
-                         int8* scratch_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock(input_block_data, function_params);
+  static void __attribute__((noinline))
+  Run(int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock<uint8>(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
@@ -7165,10 +6695,6 @@
 
         // Main copy loop.
         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
-          // Important! Most compilation configurations will compile and run
-          // without the reinterpret_cast. Sanitizers may fail silently on
-          // lane-loading, with a obscure bug or mis-feature probably in
-          // unhygienic macro expansion.
           half_work_reg =
               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                             half_work_reg, 0);
@@ -7225,10 +6751,9 @@
       TFLITE_DCHECK_EQ(start_width, 1);
       TFLITE_DCHECK(leading_width_padding);
       TFLITE_DCHECK(trailing_width_padding);
-      // ASM should use MOVI 64-bit set.
-      padding_mask = vcreate_u64(~0xffffff00L);
 
       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vdup_n_u8(-input_offset);
         half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
                                          input_block_data + input_block_offset),
                                      half_work_reg, 1);
@@ -7240,8 +6765,6 @@
             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
                                                        input_block_offset + 2),
                          half_work_reg, 3);
-        half_work_reg =
-            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
 
         half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
         TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
@@ -7315,11 +6838,11 @@
         scratch_block_data + block_height * workspace_height_stride);
   }
 
-  static inline void Run(int32 height_block_number, int32 width_block_number,
-                         const uint8* input_block_data,
-                         int8* scratch_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock(input_block_data, function_params);
+  static void __attribute__((noinline))
+  Run(int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock<uint8>(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
@@ -7451,10 +6974,6 @@
 
         // Main copy loop.
         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
-          // Important! Most compilation configurations will compile and run
-          // without the reinterpret_cast. Sanitizers may fail silently on
-          // lane-loading, with a obscure bug or mis-feature probably in
-          // unhygienic macro expansion.
           half_work_reg =
               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                             half_work_reg, 0);
@@ -7536,11 +7055,11 @@
         scratch_block_data + block_height * workspace_height_stride);
   }
 
-  static inline void Run(int32 height_block_number, int32 width_block_number,
-                         const uint8* input_block_data,
-                         int8* scratch_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock(input_block_data, function_params);
+  static void __attribute__((noinline))
+  Run(int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock<uint8>(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
@@ -7949,7 +7468,7 @@
             const bool no_right_block = output_width < 3;
 
             if (no_right_block) {
-              // Only needed for santizer checks.
+              // Only needed for sanitizer checks.
               right_bank_0_reg = vdupq_n_s8(0);
               right_bank_1_reg = vdupq_n_s8(0);
               right_bank_2_reg = vdupq_n_s8(0);
@@ -8088,7 +7607,7 @@
 
             // Load next sub-micro block of data.
             if (no_right_block) {
-              // Only needed for santizer checks.
+              // Only needed for sanitizer checks.
               right_bank_0_reg_a = vdupq_n_s8(0);
               right_bank_1_reg_a = vdupq_n_s8(0);
               right_bank_2_reg_a = vdupq_n_s8(0);
@@ -8158,10 +7677,10 @@
     }
   }  // NOLINT(readability/fn_size) Manually unrolled.
 
-  static inline void Run(const int8* scratch_block_data,
-                         const int8* filter_workspace, const int32* bias_data,
-                         uint8* output_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
+  static void __attribute__((noinline))
+  Run(const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
     KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                          output_block_data, function_params);
   }
@@ -8426,6 +7945,7 @@
           bias_data += kBiasIncrement;
         }
       } else {
+        // block_height == 1.
         int8x16_t filter_reg_0_a;
         int8x16_t filter_reg_1_a;
         int8x16_t filter_reg_2_a;
@@ -8584,10 +8104,10 @@
     }
   }  // NOLINT(readability/fn_size) Manually unrolled.
 
-  static inline void Run(const int8* scratch_block_data,
-                         const int8* filter_workspace, const int32* bias_data,
-                         uint8* output_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
+  static void __attribute__((noinline))
+  Run(const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
     KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                          output_block_data, function_params);
   }
@@ -9227,10 +8747,10 @@
     }
   }  // NOLINT(readability/fn_size) Manually unrolled.
 
-  static inline void Run(const int8* scratch_block_data,
-                         const int8* filter_workspace, const int32* bias_data,
-                         uint8* output_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
+  static void __attribute__((noinline))
+  Run(const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
     KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                          output_block_data, function_params);
   }
@@ -9753,10 +9273,10 @@
     }
   }
 
-  static inline void Run(const int8* scratch_block_data,
-                         const int8* filter_workspace, const int32* bias_data,
-                         uint8* output_block_data,
-                         const DepthwiseConvDotProdParams* function_params) {
+  static void __attribute__((noinline))
+  Run(const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
     KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                          output_block_data, function_params);
   }
@@ -10231,6 +9751,15 @@
   function_params.output_height_stride = output_height_stride;
   function_params.residual_width = residual_micro_width;
 
+  // Prefetch workspace for write, along with any necessary dummy writes.
+  const int max_workspace_height_stride =
+      16 * ((workspace_width_micro_repeats + 3) >> 2) * largest_macro_depth;
+  const int workspace_fill_size = std::min(
+      kDepthwiseConvScratchWorkspaceSize,
+      height_block_size * max_workspace_height_stride + kWorkspaceExtension);
+  WorkspacePrefetchWrite<implementation>::Run(
+      params.weights_offset, workspace_fill_size, macroblock_workspace);
+
   // Main process.
   //
   // Most kernels are nested batch-height-width-depth. Here we proceed over
@@ -10274,6 +9803,15 @@
               : function_params.output_width_micro_repeats + 1;
 
       for (int j_depth = 0; j_depth < depth_overall_macro_count; ++j_depth) {
+        // Process filter and bias data.
+        //
+        function_params.depth_micro_repeats =
+            j_depth == depth_macro_count ? depth_trailing_micro_repeats : 8;
+        ProcessPerDepth<implementation>::Run(
+            filter_data + 64 * j_depth,
+            bias_data + 8 * 2 * bias_increment * j_depth,
+            filter_workspace[0][0][0][0], adjusted_bias_data, &function_params);
+
         const uint8* input_data_block =
             input_data + b * input_batch_stride +
             j_depth * input_depth_macro_stride +
@@ -10284,15 +9822,6 @@
                                    j_depth * 64 +
                                    k_width * output_width_macro_stride;
 
-        // Process filter and bias data.
-        //
-        function_params.depth_micro_repeats =
-            j_depth == depth_macro_count ? depth_trailing_micro_repeats : 8;
-        ProcessPerDepth<implementation>::Run(
-            filter_data + 64 * j_depth,
-            bias_data + 8 * 2 * bias_increment * j_depth,
-            filter_workspace[0][0][0][0], adjusted_bias_data, &function_params);
-
         // Under depth multiplication the workspace_height_stride does not have
         // to depend on input_width_overall_micro_repeats, but this improves the
         // compactness of workspace use.
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
index e7fafa0..d23b88c 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
@@ -24,7 +24,6 @@
 
 #include <algorithm>
 
-#include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
@@ -48,6 +47,9 @@
   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
   vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
 
+// Important! Most compilation configurations will compile and run without
+// reinterpret_cast. Sanitizers may fail silently on lane-loading, with an
+// obscure bug or mis-feature probably in unhygienic macro expansion.
 #define vld1q_lane_s8x8(src, reg, lane_num) \
   vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
 #define vld1_lane_8x4(src, reg, lane_num) \
@@ -291,12 +293,22 @@
     const int8x16_t ones_vector = vdupq_n_s8(1);
 
     // Simulate NEON-register transposition of subset of filter.
-    int8x16_t filter_reg_0_a;
-    int8x16_t filter_reg_0_b;
-    int8x16_t filter_reg_1_a;
-    int8x16_t filter_reg_1_b;
-    int8x16_t filter_reg_2_a;
-    int8x16_t filter_reg_2_b;
+    int8x16_t input_0_a;
+    int8x16_t input_0_b;
+    int8x16_t input_0_c;
+    int8x16_t input_1_a;
+    int8x16_t input_1_b;
+    int8x16_t input_1_c;
+    int8x16_t input_2_a;
+    int8x16_t input_2_b;
+    int8x16_t input_2_c;
+
+    int8x16_t filter_0_a;
+    int8x16_t filter_0_b;
+    int8x16_t filter_1_a;
+    int8x16_t filter_1_b;
+    int8x16_t filter_2_a;
+    int8x16_t filter_2_b;
 
     // Register pairs for each height.
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
@@ -308,56 +320,52 @@
       // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
       // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
 
-      // Load zero-point into effective position of zero-padding of filter
-      // (register B, upper part).
-      filter_reg_0_b = vdupq_n_u8(kSignBit);
-      filter_reg_1_b = vdupq_n_u8(kSignBit);
-      filter_reg_2_b = vdupq_n_u8(kSignBit);
-
       const uint8* filter_block_ptr = filter_block;
-      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 0);
+      input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
       filter_block_ptr += depth;
-      filter_reg_0_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_b, 0);
+      input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
       filter_block_ptr += depth;
-      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 1);
+      input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
       filter_block_ptr += depth;
-      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 0);
+      input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
       filter_block_ptr += depth;
-      filter_reg_1_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_b, 0);
+      input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
       filter_block_ptr += depth;
-      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 1);
+      input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
       filter_block_ptr += depth;
-      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 0);
+      input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
       filter_block_ptr += depth;
-      filter_reg_2_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_b, 0);
+      input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
       filter_block_ptr += depth;
-      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 1);
+      input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);
 
-      filter_reg_0_a = veorq_s8(filter_reg_0_a, sign_bit);
-      filter_reg_0_b = veorq_s8(filter_reg_0_b, sign_bit);
-      filter_reg_1_a = veorq_s8(filter_reg_1_a, sign_bit);
-      filter_reg_1_b = veorq_s8(filter_reg_1_b, sign_bit);
-      filter_reg_2_a = veorq_s8(filter_reg_2_a, sign_bit);
-      filter_reg_2_b = veorq_s8(filter_reg_2_b, sign_bit);
+      filter_0_a = vzip1q_s8(input_0_a, input_0_b);
+      filter_0_b = vzip1q_s8(input_0_c, sign_bit);
+      filter_1_a = vzip1q_s8(input_1_a, input_1_b);
+      filter_1_b = vzip1q_s8(input_1_c, sign_bit);
+      filter_2_a = vzip1q_s8(input_2_a, input_2_b);
+      filter_2_b = vzip1q_s8(input_2_c, sign_bit);
+      filter_0_a = veorq_s8(filter_0_a, sign_bit);
+      filter_0_b = veorq_s8(filter_0_b, sign_bit);
+      filter_1_a = veorq_s8(filter_1_a, sign_bit);
+      filter_1_b = veorq_s8(filter_1_b, sign_bit);
+      filter_2_a = veorq_s8(filter_2_a, sign_bit);
+      filter_2_b = veorq_s8(filter_2_b, sign_bit);
+      vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
+      vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
+      vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);
 
-      vzipq_s8_in_place(&filter_reg_0_a, &filter_reg_0_b);
-      vzipq_s8_in_place(&filter_reg_1_a, &filter_reg_1_b);
-      vzipq_s8_in_place(&filter_reg_2_a, &filter_reg_2_b);
-      vzipq_s8x2_in_place(&filter_reg_0_a, &filter_reg_0_b);
-      vzipq_s8x2_in_place(&filter_reg_1_a, &filter_reg_1_b);
-      vzipq_s8x2_in_place(&filter_reg_2_a, &filter_reg_2_b);
-
-      vst1q_s8(shuffled_filter_data, filter_reg_0_a);
+      vst1q_s8(shuffled_filter_data, filter_0_a);
       shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_reg_0_b);
+      vst1q_s8(shuffled_filter_data, filter_0_b);
       shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_reg_1_a);
+      vst1q_s8(shuffled_filter_data, filter_1_a);
       shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_reg_1_b);
+      vst1q_s8(shuffled_filter_data, filter_1_b);
       shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_reg_2_a);
+      vst1q_s8(shuffled_filter_data, filter_2_a);
       shuffled_filter_data += 16;
-      vst1q_s8(shuffled_filter_data, filter_reg_2_b);
+      vst1q_s8(shuffled_filter_data, filter_2_b);
       shuffled_filter_data += 16;
 
       int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
@@ -367,13 +375,13 @@
       // For instance, if input_offset == 128, no adjustment is needed.
 
       int32x4_t filter_sum_a = vdupq_n_s32(0);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_0_a, ones_vector);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_1_a, ones_vector);
-      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_2_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
       int32x4_t filter_sum_b = vdupq_n_s32(0);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_0_b, ones_vector);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_1_b, ones_vector);
-      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_2_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);
 
       adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
                                          input_offset_difference);
@@ -1213,12 +1221,6 @@
     const int input_height_stride = function_params->input_height_stride;
     const int input_depth = function_params->input_depth;
 
-    static const uint8 perm_data[64] = {
-        0,  16, 32, 48, 1,  17, 33, 49, 2,  18, 34, 50, 3,  19, 35, 51,  //
-        4,  20, 36, 52, 5,  21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
-        8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
-        12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
-
     TFLITE_DCHECK_GE(depth_micro_repeats, 0);
     constexpr uint8 kSignBit = 0x80;
     const int micro_block_size = 4 * 8;
@@ -1234,10 +1236,6 @@
     // code. Note the blocks of 4x4 are still interleaved down the depth.
     int8x16_t work_reg_a;
     int8x16_t work_reg_b;
-    const int8x16_t perm_data_0 = vld1q_u8(perm_data);
-    const int8x16_t perm_data_1 = vld1q_u8(perm_data + 16);
-    const int8x16_t perm_data_2 = vld1q_u8(perm_data + 32);
-    const int8x16_t perm_data_3 = vld1q_u8(perm_data + 48);
 
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
     const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
@@ -1247,9 +1245,10 @@
 
     for (int k_height = 0; k_height < block_height; ++k_height) {
       const uint8* input_data_0 = input_block_data;
-      const uint8* input_data_1 = input_block_data + input_depth;
-      const uint8* input_data_2 = input_block_data + 2 * input_depth;
-      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+      int8x16_t input_data_a;
+      int8x16_t input_data_b;
+      int8x16_t input_data_c;
+      int8x16_t input_data_d;
 
       // Traverse the width one point at a time, but the depth in (micro) blocks
       // of size 8.
@@ -1258,68 +1257,103 @@
       // larger than is strictly needed to calculate output. This is because the
       // conv calculation is performed across complete micro blocks.
       for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        int8x16_t work_reg_a_sp;
+        int8x16_t work_reg_b_sp;
+
         int i_depth = 0;
-        for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
-          int8x16x4_t input_data;
-          input_data.val[0] = vld1q_u8(input_data_0);
-          input_data.val[1] = vld1q_u8(input_data_1);
-          input_data.val[2] = vld1q_u8(input_data_2);
-          input_data.val[3] = vld1q_u8(input_data_3);
-          input_data_1 += 16;
+
+        if (depth_micro_repeats >= 2) {
+          i_depth += 2;
+
+          //
+
+          input_data_a = vld1q_u8(input_data_0);
+          input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+          input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+          input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
           input_data_0 += 16;
 
-          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
-          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
-          work_reg_a = veorq_s8(tmp_0, sign_bit);
-          work_reg_b = veorq_s8(tmp_1, sign_bit);
+          //
 
+          for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+            work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+            work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+            vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+            work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+            work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+            vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+
+            input_data_a = vld1q_u8(input_data_0);
+            input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+            vst1q_s8(scratch_data_0, work_reg_a);
+            vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+            scratch_data_0 += depth_advance;
+
+            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+
+            input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+            input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
+            vst1q_s8(scratch_data_0, work_reg_a_sp);
+            vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
+
+            scratch_data_0 += depth_advance;
+
+            //
+
+            input_data_0 += 16;
+          }
+
+          work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+          work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+          vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
           vst1q_s8(scratch_data_0, work_reg_a);
           vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
           scratch_data_0 += depth_advance;
-          input_data_2 += 16;
-          input_data_3 += 16;
+          //
 
-          tmp_0 = vqtbl4q_s8(input_data, perm_data_2);
-          tmp_1 = vqtbl4q_s8(input_data, perm_data_3);
-          work_reg_a = veorq_s8(tmp_0, sign_bit);
-          work_reg_b = veorq_s8(tmp_1, sign_bit);
+          work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+          work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+          vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+          work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+          work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
 
-          vst1q_s8(scratch_data_0, work_reg_a);
-          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+          vst1q_s8(scratch_data_0, work_reg_a_sp);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
 
           scratch_data_0 += depth_advance;
         }
         for (; i_depth < depth_micro_repeats; ++i_depth) {
-          int8x16x4_t input_data;
-          input_data.val[0] =
-              vld1q_lane_s8x8(input_data_0, input_data.val[0], 0);
-          input_data.val[1] =
-              vld1q_lane_s8x8(input_data_1, input_data.val[1], 0);
-          input_data.val[2] =
-              vld1q_lane_s8x8(input_data_2, input_data.val[2], 0);
-          input_data.val[3] =
-              vld1q_lane_s8x8(input_data_3, input_data.val[3], 0);
-          input_data_1 += 8;
+          input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
+          input_data_b =
+              vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
+          input_data_c =
+              vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
+          input_data_d =
+              vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
+          work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+          work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
           input_data_0 += 8;
 
-          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
-          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
-          work_reg_a = veorq_s8(tmp_0, sign_bit);
-          work_reg_b = veorq_s8(tmp_1, sign_bit);
+          vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
 
           vst1q_s8(scratch_data_0, work_reg_a);
           vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
           scratch_data_0 += depth_advance;
-          input_data_2 += 8;
-          input_data_3 += 8;
         }
         scratch_data_0 += width_advance;
         input_data_0 += input_depth_skip;
-        input_data_1 += input_depth_skip;
-        input_data_2 += input_depth_skip;
-        input_data_3 += input_depth_skip;
       }
       if (width_overall_micro_repeats > input_width_micro_repeats) {
         TFLITE_DCHECK_EQ(width_overall_micro_repeats,
@@ -1327,21 +1361,22 @@
         TFLITE_DCHECK_GT(residual_width, 0);
         TFLITE_DCHECK_LT(residual_width, 4);
         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-          work_reg_a = vdupq_n_u8(kSignBit);
-          work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
-          work_reg_b = vdupq_n_u8(kSignBit);
+          input_data_c = vdupq_n_u8(kSignBit);
+          input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
+          input_data_d = vdupq_n_u8(kSignBit);
           if (residual_width > 1) {
-            work_reg_b =
-                vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+            input_data_b =
+                vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
             if (residual_width == 3) {
-              work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
-                                           work_reg_a, 1);
+              input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                             input_data_c, 0);
             }
           }
+          work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+          work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
           work_reg_a = veorq_s8(work_reg_a, sign_bit);
           work_reg_b = veorq_s8(work_reg_b, sign_bit);
-
-          vzipq_s8_in_place(&work_reg_a, &work_reg_b);
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
           vst1q_s8(scratch_data_0, work_reg_a);
@@ -1349,16 +1384,11 @@
 
           scratch_data_0 += depth_advance;
           input_data_0 += 8;
-          input_data_1 += 8;
-          input_data_2 += 8;
-          input_data_3 += 8;
         }
         scratch_data_0 += width_advance;
         input_data_0 += input_depth_skip;
-        input_data_1 += input_depth_skip;
-        input_data_2 += input_depth_skip;
-        input_data_3 += input_depth_skip;
       }
+
       scratch_data_0 += height_advance;
       input_block_data += input_height_stride;
     }
@@ -1374,7 +1404,6 @@
 #ifdef __aarch64__
     PreloadInputBlock(input_block_data, function_params);
 #endif
-
     PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
                              function_params);
   }
@@ -1457,9 +1486,10 @@
 
     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
       const uint8* input_data_0 = input_block_data;
-      const uint8* input_data_1 = input_block_data + input_depth;
-      const uint8* input_data_2 = input_block_data + 2 * input_depth;
-      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+      int8x16_t input_data_a;
+      int8x16_t input_data_b;
+      int8x16_t input_data_c;
+      int8x16_t input_data_d;
 
       // Traverse the width one point at a time, but the depth in (micro) blocks
       // of size 8.
@@ -1482,53 +1512,126 @@
         }
         if (start_width == 0) {
           if (adjusted_residual_width == 4) {
-            // Load, then zero.
-            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
-              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
-              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
-              input_data_1 += 8;
-              work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+            int8x16_t work_reg_a_sp;
+            int8x16_t work_reg_b_sp;
+
+            int i_depth = 0;
+
+            if (depth_micro_repeats >= 2) {
+              i_depth += 2;
+
+              //
+
+              input_data_a = vld1q_u8(input_data_0);
+              input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+              input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+              input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
+              input_data_0 += 16;
+
+              //
+
+              for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+                work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+                work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+                vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+                work_reg_a = veorq_s8(work_reg_a, sign_bit);
+                work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+                work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+                work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+                vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+
+                input_data_a = vld1q_u8(input_data_0);
+                input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+                vst1q_s8(scratch_data_0, work_reg_a);
+                vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+                scratch_data_0 += depth_advance;
+
+                work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+                work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+
+                input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+                input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
+                vst1q_s8(scratch_data_0, work_reg_a_sp);
+                vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
+
+                scratch_data_0 += depth_advance;
+
+                //
+
+                input_data_0 += 16;
+              }
+
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              //
+
+              work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+              work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+              vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+              work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+              work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+
+              vst1q_s8(scratch_data_0, work_reg_a_sp);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
+
+              scratch_data_0 += depth_advance;
+            }
+            for (; i_depth < depth_micro_repeats; ++i_depth) {
+              input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
+              input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
+                                             input_data_b, 0);
+              input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                             input_data_c, 0);
+              input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
+                                             input_data_d, 0);
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
               input_data_0 += 8;
+
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
               work_reg_a = veorq_s8(work_reg_a, sign_bit);
               work_reg_b = veorq_s8(work_reg_b, sign_bit);
 
-              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
-              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-
               vst1q_s8(scratch_data_0, work_reg_a);
-              scratch_data_0 += 16;
-              vst1q_s8(scratch_data_0, work_reg_b);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
-              scratch_data_0 += depth_advance - 16;
-              input_data_2 += 8;
-              input_data_3 += 8;
+              scratch_data_0 += depth_advance;
             }
             scratch_data_0 += width_advance;
             input_data_0 += input_depth_skip;
-            input_data_1 += input_depth_skip;
-            input_data_2 += input_depth_skip;
-            input_data_3 += input_depth_skip;
           } else {
             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-              work_reg_a = vdupq_n_u8(-input_offset);
-              work_reg_b = vdupq_n_u8(-input_offset);
+              input_data_a = vdupq_n_u8(-input_offset);
+              input_data_b = vdupq_n_u8(-input_offset);
+              input_data_c = vdupq_n_u8(-input_offset);
+              input_data_d = vdupq_n_u8(-input_offset);
               if (adjusted_residual_width > 0) {
-                work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+                input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
                 if (adjusted_residual_width > 1) {
-                  work_reg_b = vld1q_lane_s8x8(input_data_0 + input_depth,
-                                               work_reg_b, 0);
+                  input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
+                                                 input_data_b, 0);
                   if (adjusted_residual_width == 3) {
-                    work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
-                                                 work_reg_a, 1);
+                    input_data_c = vld1q_lane_s8x8(
+                        input_data_0 + 2 * input_depth, input_data_c, 0);
                   }
                 }
               }
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
               work_reg_a = veorq_s8(work_reg_a, sign_bit);
               work_reg_b = veorq_s8(work_reg_b, sign_bit);
-
-              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
               vst1q_s8(scratch_data_0, work_reg_a);
@@ -1536,64 +1639,131 @@
 
               scratch_data_0 += depth_advance;
               input_data_0 += 8;
-              input_data_1 += 8;
-              input_data_2 += 8;
-              input_data_3 += 8;
             }
             scratch_data_0 += width_advance;
             input_data_0 += input_depth_skip;
-            input_data_1 += input_depth_skip;
-            input_data_2 += input_depth_skip;
-            input_data_3 += input_depth_skip;
           }
         } else {
           if (adjusted_residual_width == 4) {
-            // Load, then zero.
-            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-              work_reg_a = vdupq_n_u8(-input_offset);
-              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
-              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
-              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
-              input_data_1 += 8;
-              // Skip loading first column.
+            int8x16_t work_reg_a_sp;
+            int8x16_t work_reg_b_sp;
+
+            int i_depth = 0;
+
+            if (depth_micro_repeats >= 2) {
+              i_depth += 2;
+
+              //
+
+              input_data_a = vdupq_n_u8(-input_offset);
+              input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+              input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+              input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
+              input_data_0 += 16;
+
+              //
+
+              for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+                work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+                work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+                vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+                work_reg_a = veorq_s8(work_reg_a, sign_bit);
+                work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+                work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+                work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+                vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+
+                input_data_a = vdupq_n_u8(-input_offset);
+                input_data_b = vld1q_u8(input_data_0 + 1 * input_depth);
+                vst1q_s8(scratch_data_0, work_reg_a);
+                vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+                scratch_data_0 += depth_advance;
+
+                work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+                work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+
+                input_data_c = vld1q_u8(input_data_0 + 2 * input_depth);
+                input_data_d = vld1q_u8(input_data_0 + 3 * input_depth);
+                vst1q_s8(scratch_data_0, work_reg_a_sp);
+                vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
+
+                scratch_data_0 += depth_advance;
+
+                //
+
+                input_data_0 += 16;
+              }
+
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              //
+
+              work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
+              work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
+              vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
+              work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+              work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+
+              vst1q_s8(scratch_data_0, work_reg_a_sp);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
+
+              scratch_data_0 += depth_advance;
+            }
+            for (; i_depth < depth_micro_repeats; ++i_depth) {
+              input_data_a = vdupq_n_u8(-input_offset);
+              input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
+                                             input_data_b, 0);
+              input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                             input_data_c, 0);
+              input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
+                                             input_data_d, 0);
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
               input_data_0 += 8;
+
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
               work_reg_a = veorq_s8(work_reg_a, sign_bit);
               work_reg_b = veorq_s8(work_reg_b, sign_bit);
 
-              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
-              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-
               vst1q_s8(scratch_data_0, work_reg_a);
-              scratch_data_0 += 16;
-              vst1q_s8(scratch_data_0, work_reg_b);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
-              scratch_data_0 += depth_advance - 16;
-              input_data_2 += 8;
-              input_data_3 += 8;
+              scratch_data_0 += depth_advance;
             }
             scratch_data_0 += width_advance;
             input_data_0 += input_depth_skip;
-            input_data_1 += input_depth_skip;
-            input_data_2 += input_depth_skip;
-            input_data_3 += input_depth_skip;
           } else {
             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+
             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-              work_reg_a = vdupq_n_u8(-input_offset);
+              input_data_a = vdupq_n_u8(-input_offset);
+              input_data_b = vdupq_n_u8(-input_offset);
+              input_data_c = vdupq_n_u8(-input_offset);
+              input_data_d = vdupq_n_u8(-input_offset);
               // Skip loading first column.
-              work_reg_b = vdupq_n_u8(-input_offset);
               if (adjusted_residual_width > 1) {
-                work_reg_b =
-                    vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+                input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
+                                               input_data_b, 0);
                 if (adjusted_residual_width == 3) {
-                  work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
-                                               work_reg_a, 1);
+                  input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                                 input_data_c, 0);
                 }
               }
+              work_reg_a = vzip1q_s8(input_data_a, input_data_b);
+              work_reg_b = vzip1q_s8(input_data_c, input_data_d);
+
               work_reg_a = veorq_s8(work_reg_a, sign_bit);
               work_reg_b = veorq_s8(work_reg_b, sign_bit);
-
-              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
               vst1q_s8(scratch_data_0, work_reg_a);
@@ -1601,15 +1771,9 @@
 
               scratch_data_0 += depth_advance;
               input_data_0 += 8;
-              input_data_1 += 8;
-              input_data_2 += 8;
-              input_data_3 += 8;
             }
             scratch_data_0 += width_advance;
             input_data_0 += input_depth_skip;
-            input_data_1 += input_depth_skip;
-            input_data_2 += input_depth_skip;
-            input_data_3 += input_depth_skip;
           }
         }
       }
@@ -1834,10 +1998,6 @@
 
         // Main copy loop.
         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
-          // Important! Most compilation configurations will compile and run
-          // without the reinterpret_cast. Sanitizers may fail silently on
-          // lane-loading, with a obscure bug or mis-feature probably in
-          // unhygienic macro expansion.
           half_work_reg =
               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                             half_work_reg, 0);
@@ -1894,10 +2054,9 @@
       TFLITE_DCHECK_EQ(start_width, 1);
       TFLITE_DCHECK(leading_width_padding);
       TFLITE_DCHECK(trailing_width_padding);
-      // ASM should use MOVI 64-bit set.
-      padding_mask = vcreate_u64(~0xffffff00L);
 
       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vdup_n_u8(-input_offset);
         half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
                                          input_block_data + input_block_offset),
                                      half_work_reg, 1);
@@ -1909,8 +2068,6 @@
             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
                                                        input_block_offset + 2),
                          half_work_reg, 3);
-        half_work_reg =
-            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
 
         half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
         TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
@@ -2124,10 +2281,6 @@
 
         // Main copy loop.
         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
-          // Important! Most compilation configurations will compile and run
-          // without the reinterpret_cast. Sanitizers may fail silently on
-          // lane-loading, with a obscure bug or mis-feature probably in
-          // unhygienic macro expansion.
           half_work_reg =
               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                             half_work_reg, 0);
@@ -2971,7 +3124,7 @@
     const int block_height = function_params->outbound_block_height;
     const int residual_width = function_params->output_residual_width;
     const int output_height_stride = function_params->output_height_stride;
-    const int bias_increment = function_params->bias_increment;
+    constexpr int kBiasIncrement = 4;
 
     TFLITE_DCHECK(depth_micro_repeats > 0);
     const int width_micro_stride = 4 * 8;
@@ -3040,8 +3193,7 @@
           uint8* output_data = output_data_base;
 
           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
-          bias_data += bias_increment;
+          bias_data += kBiasIncrement;
 
           // Load first sub-micro block of data into operational banks.
           int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
@@ -3129,32 +3281,19 @@
             int8x16_t right_bank_3_reg;
             int8x16_t right_bank_4_reg;
             int8x16_t right_bank_5_reg;
-            // Logic: (i_width == output_width_micro_repeats) &&
-            //        ((residual_width - 1) * stride_val < 2)
-            const bool no_right_block =
-                i_width == output_width_micro_repeats && residual_width < 3;
 
-            if (no_right_block) {
-              // Only needed for santizer checks.
-              right_bank_0_reg = vdupq_n_s8(0);
-              right_bank_1_reg = vdupq_n_s8(0);
-              right_bank_2_reg = vdupq_n_s8(0);
-              right_bank_3_reg = vdupq_n_s8(0);
-              right_bank_4_reg = vdupq_n_s8(0);
-              right_bank_5_reg = vdupq_n_s8(0);
-            } else {
-              right_bank_0_reg = vld1q_s8(next_input_data);
-              right_bank_1_reg =
-                  vld1q_s8(next_input_data + workspace_height_stride);
-              right_bank_2_reg =
-                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
-              right_bank_3_reg =
-                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
-              right_bank_4_reg =
-                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
-              right_bank_5_reg =
-                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
-            }
+            // Loading of the next block is always valid here.
+            right_bank_0_reg = vld1q_s8(next_input_data);
+            right_bank_1_reg =
+                vld1q_s8(next_input_data + workspace_height_stride);
+            right_bank_2_reg =
+                vld1q_s8(next_input_data + 2 * workspace_height_stride);
+            right_bank_3_reg =
+                vld1q_s8(next_input_data + 3 * workspace_height_stride);
+            right_bank_4_reg =
+                vld1q_s8(next_input_data + 4 * workspace_height_stride);
+            right_bank_5_reg =
+                vld1q_s8(next_input_data + 5 * workspace_height_stride);
 
             {
               acc0 = adjusted_bias_data;
@@ -3366,7 +3505,7 @@
             const bool no_right_block = output_width < 3;
 
             if (no_right_block) {
-              // Only needed for santizer checks.
+              // Only needed for sanitizer checks.
               right_bank_0_reg = vdupq_n_s8(0);
               right_bank_1_reg = vdupq_n_s8(0);
               right_bank_2_reg = vdupq_n_s8(0);
@@ -3464,94 +3603,110 @@
           filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
         }
       } else {
-        for (int s = 0; s < 2; ++s) {
-          // Work through one slice, by row, at a time.
-          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
-          uint8* output_data_base = output_data_depthwise + 4 * s;
+        const int8* input_data_base = input_data_depthwise;
+        uint8* output_data_base = output_data_depthwise;
 
-          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
-          bias_data += bias_increment;
+        const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
+        const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
 
-          for (int k_height = 0; k_height < block_height; ++k_height) {
-            const int8* next_input_data = input_data_base;
-            uint8* output_data = output_data_base;
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* next_input_data = input_data_base;
+          uint8* output_data = output_data_base;
 
-            // Load first sub-micro block of data into operational banks.
-            int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
-            int8x16_t left_bank_1_reg =
-                vld1q_s8(next_input_data + workspace_height_stride);
-            int8x16_t left_bank_2_reg =
-                vld1q_s8(next_input_data + 2 * workspace_height_stride);
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
+          int8x16_t left_bank_1_reg_a =
+              vld1q_s8(next_input_data + workspace_height_stride);
+          int8x16_t left_bank_2_reg_a =
+              vld1q_s8(next_input_data + 2 * workspace_height_stride);
+          int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
+          int8x16_t left_bank_1_reg_b =
+              vld1q_s8(next_input_data + workspace_height_stride + 16);
+          int8x16_t left_bank_2_reg_b =
+              vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
 
-            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
-                 ++i_width) {
-              next_input_data += width_micro_stride;
-              const int output_width =
-                  i_width == output_width_micro_repeats ? residual_width : 4;
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            next_input_data += width_micro_stride;
+            const int output_width =
+                i_width == output_width_micro_repeats ? residual_width : 4;
 
-              // Load next sub-micro block of data.
-              int8x16_t right_bank_0_reg;
-              int8x16_t right_bank_1_reg;
-              int8x16_t right_bank_2_reg;
-              // Logic: (output_width - 1) * stride_val < 2.
-              const bool no_right_block = output_width < 3;
+            int8x16_t right_bank_0_reg_a;
+            int8x16_t right_bank_1_reg_a;
+            int8x16_t right_bank_2_reg_a;
+            int8x16_t right_bank_0_reg_b;
+            int8x16_t right_bank_1_reg_b;
+            int8x16_t right_bank_2_reg_b;
+            // Logic: (output_width - 1) * stride_val < 2.
+            const bool no_right_block = output_width < 3;
 
-              if (no_right_block) {
-                // Only needed for santizer checks.
-                right_bank_0_reg = vdupq_n_s8(0);
-                right_bank_1_reg = vdupq_n_s8(0);
-                right_bank_2_reg = vdupq_n_s8(0);
-              } else {
-                right_bank_0_reg = vld1q_s8(next_input_data);
-                right_bank_1_reg =
-                    vld1q_s8(next_input_data + workspace_height_stride);
-                right_bank_2_reg =
-                    vld1q_s8(next_input_data + 2 * workspace_height_stride);
-              }
-              // Load next sub-micro block of data.
-
-              // Iterate over input width shifts within 4x4 blocks.
-              for (int x = 0; x < output_width; ++x) {
-                int32x4_t acc = adjusted_bias_data;
-                acc = vdotq_s32(acc, filter_reg_0_a, left_bank_0_reg);
-                acc = vdotq_s32(acc, filter_reg_1_a, left_bank_1_reg);
-                acc = vdotq_s32(acc, filter_reg_2_a, left_bank_2_reg);
-
-                // Fixed-point multiplication.
-                acc = vqrdmulhq_n_s32(acc, output_multiplier);
-                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                    acc, -output_shift);
-                // Add the output offset.
-                // Note that we need to fill the top half with vcombine, but can
-                // drop the instruction in ASM code.
-                int16x8_t acc_s16_0_0 =
-                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
-                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
-                // Apply the activation function.
-                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
-                acc_u8_0_0 =
-                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
-                acc_u8_0_0 =
-                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
-
-                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
-
-                biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
-                biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
-                biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
-
-                output_data += depth;
-              }
+            // Load next sub-micro block of data.
+            if (no_right_block) {
+              // Only needed for sanitizer checks.
+              right_bank_0_reg_a = vdupq_n_s8(0);
+              right_bank_1_reg_a = vdupq_n_s8(0);
+              right_bank_2_reg_a = vdupq_n_s8(0);
+              right_bank_0_reg_b = vdupq_n_s8(0);
+              right_bank_1_reg_b = vdupq_n_s8(0);
+              right_bank_2_reg_b = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg_a = vld1q_s8(next_input_data);
+              right_bank_1_reg_a =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg_a =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
+              right_bank_1_reg_b =
+                  vld1q_s8(next_input_data + workspace_height_stride + 16);
+              right_bank_2_reg_b =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
             }
-            input_data_base += workspace_height_stride;
-            output_data_base += output_height_stride;
-          }
 
-          // Move to next sub-block: advance to second set of filters.
-          filter_reg_0_a = filter_reg_0_b;
-          filter_reg_1_a = filter_reg_1_b;
-          filter_reg_2_a = filter_reg_2_b;
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              int32x4_t acc_a = adjusted_bias_data_a;
+              int32x4_t acc_b = adjusted_bias_data_b;
+              acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
+              acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
+              acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
+              acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
+              acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
+              acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
+
+              // Fixed-point multiplication.
+              acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
+              acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
+              acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc_a, -output_shift);
+              acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc_b, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_0 =
+                  vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
+              acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+              acc_u8_0_0 =
+                  vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+              acc_u8_0_0 =
+                  vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+              vst1_u8(output_data, acc_u8_0_0);
+
+              biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
+              biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
+              biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
+              biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
+              biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
+              biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
+
+              output_data += depth;
+            }
+          }
+          input_data_base += workspace_height_stride;
+          output_data_base += output_height_stride;
         }
       }
       input_data_depthwise += depth_micro_stride;
@@ -3585,8 +3740,10 @@
         function_params->output_width_micro_repeats;
     const int depth_micro_repeats = function_params->depth_micro_repeats;
     const int depth = function_params->input_depth;
-    const int stride_val = function_params->stride;
-    const int four_over_stride = function_params->four_over_stride;
+    constexpr int kStrideVal = 2;
+    constexpr int kFourOverStride = 2;
+    TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
+    TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
 
     const int workspace_width_micro_repeats =
         function_params->workspace_width_micro_repeats;
@@ -3595,7 +3752,7 @@
     const int block_height = function_params->outbound_block_height;
     const int residual_width = function_params->output_residual_width;
     const int output_height_stride = function_params->output_height_stride;
-    const int bias_increment = function_params->bias_increment;
+    constexpr int kBiasIncrement = 4;
 
     TFLITE_DCHECK(depth_micro_repeats > 0);
     const int width_micro_stride = 4 * 8;
@@ -3626,7 +3783,6 @@
 
     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
 
-    TFLITE_DCHECK_EQ(stride_val, 2);
     TFLITE_DCHECK_LE(block_height, 2);
 
     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
@@ -3650,7 +3806,6 @@
           const int8* input_data_0 = scratch_data + s * 2 * 8;
 
           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
 
           // Load first sub-micro block of data into operational banks.
           int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
@@ -3671,31 +3826,114 @@
 
           int32x4_t acc0;
           int32x4_t acc1;
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8;
 
-          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
-               ++i_width) {
-            const int output_width = i_width == output_width_micro_repeats
-                                         ? residual_width
-                                         : four_over_stride;
-            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          int i_width = 0;
+
+          // When output_width_micro_repeats <
+          // output_width_overall_micro_repeats, 0 < residual_width <= 2, and so
+          // residual_width == 1 is then true iff residual_width < 2.
+          const int adjusted_width_micro_repeats =
+              (output_width_micro_repeats <
+               output_width_overall_micro_repeats) &&
+                      (residual_width == 1)
+                  ? output_width_micro_repeats
+                  : output_width_overall_micro_repeats;
+
+          for (; i_width < adjusted_width_micro_repeats; ++i_width) {
+            const int output_width = kFourOverStride;
+            TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
             const int8* input_data =
                 input_data_0 + width_micro_stride * i_width;
-            const bool no_right_block = i_width == output_width_micro_repeats &&
-                                        output_width_overall_micro_repeats ==
-                                            workspace_width_micro_repeats;
+            acc0 = adjusted_bias_data;
+            acc1 = adjusted_bias_data;
+            right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+            right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                        workspace_height_stride);
 
-            if (!no_right_block) {
-              // Load next sub-micro block of data.
-              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
-              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
-                                          workspace_height_stride);
-              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
-                                          2 * workspace_height_stride);
-              right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
-                                          3 * workspace_height_stride);
-              right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
-                                          4 * workspace_height_stride);
-            }
+            acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                        2 * workspace_height_stride);
+            right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
+                                        3 * workspace_height_stride);
+            acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+            acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+            right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
+                                        4 * workspace_height_stride);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8 = vqmovun_s16(acc_s16_0_1);
+            acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+            acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+            left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+            left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+            left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+            left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+            left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+            acc0 = adjusted_bias_data;
+            acc1 = adjusted_bias_data;
+            vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+            vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+            vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+            vst1_lane_8x4(output_data_base, acc_u8, 0);
+            vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
+
+            vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+            vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+
+            acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+            acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+            acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+            acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8 = vqmovun_s16(acc_s16_0_1);
+            acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+            acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+            vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+            vst1_lane_8x4(output_data_base + depth + output_height_stride,
+                          acc_u8, 1);
+
+            left_bank_0_reg = right_bank_0_reg;
+            left_bank_1_reg = right_bank_1_reg;
+            left_bank_2_reg = right_bank_2_reg;
+            left_bank_3_reg = right_bank_3_reg;
+            left_bank_4_reg = right_bank_4_reg;
+          }
+          for (; i_width < output_width_overall_micro_repeats; ++i_width) {
+            TFLITE_DCHECK_NE(residual_width, kFourOverStride);
+
+            // No need to load next ("right") block of data.
 
             uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
 
@@ -3741,161 +3979,164 @@
               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
             }
-
-            if (output_width > 1) {
-              acc0 = adjusted_bias_data;
-              acc1 = adjusted_bias_data;
-
-              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
-              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
-              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
-              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
-
-              // Fixed-point multiplication.
-              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                  acc0, -output_shift);
-              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
-              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                  acc1, -output_shift);
-              // Add the output offset.
-              int16x8_t acc_s16_0_1 =
-                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
-              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
-              // Apply the activation function.
-              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
-              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
-              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
-
-              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
-              vst1_lane_8x4(output_data_base + depth + output_height_stride,
-                            acc_u8, 1);
-
-              left_bank_0_reg = right_bank_0_reg;
-              left_bank_1_reg = right_bank_1_reg;
-              left_bank_2_reg = right_bank_2_reg;
-              left_bank_3_reg = right_bank_3_reg;
-              left_bank_4_reg = right_bank_4_reg;
-            }
           }
-          bias_data += bias_increment;
+          bias_data += kBiasIncrement;
         }
       } else {
-        for (int s = 0; s < 2; ++s) {
-          // Simulate NEON-register transposition of subset of filter.
-          int8x16_t filter_reg_0_a;
-          int8x16_t filter_reg_1_a;
-          int8x16_t filter_reg_2_a;
+        // block_height == 1.
+        int8x16_t filter_reg_0_a;
+        int8x16_t filter_reg_1_a;
+        int8x16_t filter_reg_2_a;
+        int8x16_t filter_reg_0_b;
+        int8x16_t filter_reg_1_b;
+        int8x16_t filter_reg_2_b;
 
-          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
-          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
-          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+        filter_reg_0_a = vld1q_s8(filter_block);
+        filter_reg_1_a = vld1q_s8(filter_block + 32);
+        filter_reg_2_a = vld1q_s8(filter_block + 64);
+        filter_reg_0_b = vld1q_s8(filter_block + 16);
+        filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
+        filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
 
-          const int8* scratch_data =
-              scratch_block_data + depth_micro_stride * j_depth;
-          uint8* output_data = output_block_data + 8 * j_depth;
-          const int8* input_data_0 = scratch_data + s * 2 * 8;
+        const int8* scratch_data =
+            scratch_block_data + depth_micro_stride * j_depth;
+        uint8* output_data = output_block_data + 8 * j_depth;
+        const int8* input_data_0 = scratch_data;
 
-          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
+        const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
+        const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
 
-          // Load first sub-micro block of data into operational banks.
-          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
-          int8x16_t left_bank_1_reg =
-              vld1q_s8(input_data_0 + workspace_height_stride);
-          int8x16_t left_bank_2_reg =
-              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+        // Load first sub-micro block of data into operational banks.
+        int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
+        int8x16_t left_bank_1_reg_a =
+            vld1q_s8(input_data_0 + workspace_height_stride);
+        int8x16_t left_bank_2_reg_a =
+            vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+        int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
+        int8x16_t left_bank_1_reg_b =
+            vld1q_s8(input_data_0 + workspace_height_stride + 16);
+        int8x16_t left_bank_2_reg_b =
+            vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
 
-          int8x16_t right_bank_0_reg;
-          int8x16_t right_bank_1_reg;
-          int8x16_t right_bank_2_reg;
+        int8x16_t right_bank_0_reg_a;
+        int8x16_t right_bank_1_reg_a;
+        int8x16_t right_bank_2_reg_a;
+        int8x16_t right_bank_0_reg_b;
+        int8x16_t right_bank_1_reg_b;
+        int8x16_t right_bank_2_reg_b;
 
-          int32x4_t acc0;
+        int32x4_t acc0_a;
+        int32x4_t acc0_b;
 
-          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
-               ++i_width) {
-            const int output_width = i_width == output_width_micro_repeats
-                                         ? residual_width
-                                         : four_over_stride;
-            TFLITE_DCHECK_LE(output_width * stride_val, 4);
-            const int8* input_data =
-                input_data_0 + width_micro_stride * i_width;
-            const bool no_right_block = i_width == output_width_micro_repeats &&
-                                        output_width_overall_micro_repeats ==
-                                            workspace_width_micro_repeats;
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : kFourOverStride;
+          TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
+          const int8* input_data = input_data_0 + width_micro_stride * i_width;
+          const bool no_right_block = i_width == output_width_micro_repeats &&
+                                      output_width_overall_micro_repeats ==
+                                          workspace_width_micro_repeats;
 
-            if (!no_right_block) {
-              // Load next sub-micro block of data.
-              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
-              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+          if (!no_right_block) {
+            // Load next sub-micro block of data.
+            right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
+            right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
                                           workspace_height_stride);
-              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+            right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
                                           2 * workspace_height_stride);
-            }
-
-            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
-
-            // Iterate over input width shifts within 4x4 blocks.
-            {
-              acc0 = adjusted_bias_data;
-
-              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
-
-              // Fixed-point multiplication.
-              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                  acc0, -output_shift);
-              // Add the output offset.
-              int16x8_t acc_s16_0_1 =
-                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
-              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
-              // Apply the activation function.
-              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
-              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
-              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
-
-              vst1_lane_8x4(output_data_base, acc_u8, 0);
-
-              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
-              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
-              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
-              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
-              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
-              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
-            }
-
-            if (output_width > 1) {
-              acc0 = adjusted_bias_data;
-
-              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
-              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
-
-              // Fixed-point multiplication.
-              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                  acc0, -output_shift);
-              // Add the output offset.
-              int16x8_t acc_s16_0_1 =
-                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
-              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
-              // Apply the activation function.
-              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
-              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
-              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
-
-              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
-
-              left_bank_0_reg = right_bank_0_reg;
-              left_bank_1_reg = right_bank_1_reg;
-              left_bank_2_reg = right_bank_2_reg;
-            }
+            right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
+            right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride + 16);
+            right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride + 16);
           }
-          bias_data += bias_increment;
+
+          uint8* output_data_base = output_data + depth * 2 * i_width;
+
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0_a = adjusted_bias_data_a;
+            acc0_b = adjusted_bias_data_b;
+
+            acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
+            acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
+            acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
+
+            // Fixed-point multiplication.
+            acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
+            acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
+            acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0_a, -output_shift);
+            acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0_b, -output_shift);
+            // Add the output offset.
+            int16x8_t acc_s16_0_1 =
+                vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+            acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+            acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+            vst1_u8(output_data_base, acc_u8);
+
+            left_bank_0_reg_a = vrev32q_u16(left_bank_0_reg_a);
+            left_bank_1_reg_a = vrev32q_u16(left_bank_1_reg_a);
+            left_bank_2_reg_a = vrev32q_u16(left_bank_2_reg_a);
+            left_bank_0_reg_b = vrev32q_u16(left_bank_0_reg_b);
+            left_bank_1_reg_b = vrev32q_u16(left_bank_1_reg_b);
+            left_bank_2_reg_b = vrev32q_u16(left_bank_2_reg_b);
+            vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
+            vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
+            vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
+            vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
+            vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
+            vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
+          }
+
+          if (output_width > 1) {
+            acc0_a = adjusted_bias_data_a;
+            acc0_b = adjusted_bias_data_b;
+
+            acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
+            acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
+            acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
+            acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
+
+            // Fixed-point multiplication.
+            acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
+            acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
+            acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0_a, -output_shift);
+            acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0_b, -output_shift);
+            // Add the output offset.
+            int16x8_t acc_s16_0_1 =
+                vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+            acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+            acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+            vst1_u8(output_data_base + depth, acc_u8);
+
+            left_bank_0_reg_a = right_bank_0_reg_a;
+            left_bank_1_reg_a = right_bank_1_reg_a;
+            left_bank_2_reg_a = right_bank_2_reg_a;
+            left_bank_0_reg_b = right_bank_0_reg_b;
+            left_bank_1_reg_b = right_bank_1_reg_b;
+            left_bank_2_reg_b = right_bank_2_reg_b;
+          }
         }
       }
     }
@@ -3932,12 +4173,10 @@
     const int block_height = function_params->outbound_block_height;
     const int residual_width = function_params->output_residual_width;
     const int output_height_stride = function_params->output_height_stride;
-    const int bias_increment = function_params->bias_increment;
+    constexpr int kBiasIncrement = 4;
 
     TFLITE_DCHECK(depth_micro_repeats > 0);
 
-    TFLITE_DCHECK_EQ(bias_increment, 4);
-
     const int32 output_activation_min =
         function_params->quantized_activation_min;
     const int32 output_activation_max =
@@ -3989,6 +4228,15 @@
       filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
       filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
 
+      // When output_width_micro_repeats < output_width_overall_micro_repeats,
+      // a residual micro block remains. Full micro blocks are 4 wide here, so
+      // the residual is peeled out of the main loop when residual_width < 4.
+      const int adjusted_width_micro_repeats =
+          (output_width_micro_repeats < output_width_overall_micro_repeats) &&
+                  (residual_width < 4)
+              ? output_width_micro_repeats
+              : output_width_overall_micro_repeats;
+
       if (block_height == 4) {
         for (int s = 0; s < 2; ++s) {
           // Work through one slice, by row, at a time.
@@ -3998,8 +4246,7 @@
           uint8* output_data = output_data_base;
 
           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
-          bias_data += bias_increment;
+          bias_data += kBiasIncrement;
 
           int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
           int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
@@ -4041,8 +4288,8 @@
           acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
           acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
 
-          for (int i_width = 0; i_width < output_width_micro_repeats;
-               ++i_width) {
+          int i_width = 0;
+          for (; i_width < adjusted_width_micro_repeats; ++i_width) {
             next_input_data += 4;
 
             // Iterate over input width shifts within 4x4 blocks.
@@ -4340,7 +4587,7 @@
             }
           }
 
-          if (residual_width > 0) {
+          if (i_width < output_width_overall_micro_repeats) {
             next_input_data += 4;
             const int output_width = residual_width;
 
@@ -4449,90 +4696,90 @@
         }
       } else {
         // Block height < 4.
-        for (int s = 0; s < 2; ++s) {
-          // Work through one slice, by row, at a time.
-          uint8* output_data_base = output_data_depthwise + 4 * s;
+        uint8* output_data_base = output_data_depthwise;
 
-          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
-          TFLITE_DCHECK_EQ(bias_increment, 4);
-          bias_data += bias_increment;
+        const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
+        const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+        bias_data += kBiasIncrement;
 
-          for (int k_height = 0; k_height < block_height; ++k_height) {
-            const int8* next_input_data =
-                scratch_block_data + k_height * workspace_height_stride;
-            uint8* output_data = output_data_base;
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* next_input_data =
+              scratch_block_data + k_height * workspace_height_stride;
+          uint8* output_data = output_data_base;
 
-            int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
-            int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+          int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
+          int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.
 
-            // Load first sub-micro block of data into operational banks.
-            input_bank_a_reg =
-                vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
-                                                  // uninitialized variable.
-            input_bank_a_reg = vld1q_lane_8x4(
-                next_input_data + workspace_height_stride, input_bank_a_reg, 2);
-            input_bank_b_reg = vld1q_dup_s8x4(
-                next_input_data +
-                2 * workspace_height_stride);  // Load lane 0, avoiding
-                                               // uninitialized variable.
+          // Load first sub-micro block of data into operational banks.
+          input_bank_p_reg =
+              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                // uninitialized variable.
+          input_bank_p_reg = vld1q_lane_8x4(
+              next_input_data + workspace_height_stride, input_bank_p_reg, 2);
+          input_bank_q_reg = vld1q_dup_s8x4(
+              next_input_data +
+              2 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
 
-            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
-                 ++i_width) {
-              next_input_data += 4;
-              const int output_width =
-                  i_width == output_width_micro_repeats ? residual_width : 4;
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            next_input_data += 4;
+            const int output_width =
+                i_width == output_width_micro_repeats ? residual_width : 4;
 
-              // Load next sub-micro block of data.
-              input_bank_a_reg =
-                  vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
-              input_bank_a_reg =
-                  vld1q_lane_8x4(next_input_data + workspace_height_stride,
-                                 input_bank_a_reg, 3);
-              input_bank_b_reg =
-                  vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
-                                 input_bank_b_reg, 1);
-              // Iterate over input width shifts within 4x4 blocks.
-              for (int x = 0; x < output_width; ++x) {
-                int32x4_t acc = adjusted_bias_data;
-                acc = vdotq_four_lane_s32(acc, filter_reg_0_a, input_bank_a_reg,
-                                          0);
-                acc = vdotq_four_lane_s32(acc, filter_reg_1_a, input_bank_a_reg,
-                                          2);
-                acc = vdotq_four_lane_s32(acc, filter_reg_2_a, input_bank_b_reg,
-                                          0);
+            // Load next sub-micro block of data.
+            input_bank_p_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
+            input_bank_p_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_p_reg, 3);
+            input_bank_q_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_q_reg, 1);
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              int32x4_t acc_a = adjusted_bias_data_a;
+              int32x4_t acc_b = adjusted_bias_data_b;
+              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
+                                          input_bank_p_reg, 0);
+              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
+                                          input_bank_p_reg, 2);
+              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
+                                          input_bank_q_reg, 0);
+              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
+                                          input_bank_p_reg, 0);
+              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
+                                          input_bank_p_reg, 2);
+              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
+                                          input_bank_q_reg, 0);
 
-                // Fixed-point multiplication.
-                acc = vqrdmulhq_n_s32(acc, output_multiplier);
-                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
-                    acc, -output_shift);
-                // Add the output offset.
-                // Note that we need to fill the top half with vcombine, but can
-                // drop the instruction in ASM code.
-                int16x8_t acc_s16_0_0 =
-                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
-                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
-                // Apply the activation function.
-                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
-                acc_u8_0_0 =
-                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
-                acc_u8_0_0 =
-                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+              // Fixed-point multiplication.
+              acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
+              acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
+              acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc_a, -output_shift);
+              acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc_b, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_0 =
+                  vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
+              acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+              acc_u8_0_0 =
+                  vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+              acc_u8_0_0 =
+                  vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
 
-                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+              vst1_u8(output_data, acc_u8_0_0);
 
-                input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
-                input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+              input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
+              input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);
 
-                output_data += output_depth;
-              }
+              output_data += output_depth;
             }
-            output_data_base += output_height_stride;
           }
-
-          // Move to next sub-block: advance to second set of filters.
-          filter_reg_0_a = filter_reg_0_b;
-          filter_reg_1_a = filter_reg_1_b;
-          filter_reg_2_a = filter_reg_2_b;
+          output_data_base += output_height_stride;
         }
       }
       output_data_depthwise += 8;
@@ -4563,15 +4810,15 @@
         function_params->output_width_micro_repeats;
     const int depth_micro_repeats = function_params->depth_micro_repeats;
     const int output_depth = function_params->output_depth;
-    const int stride_val = function_params->stride;
-    const int four_over_stride = function_params->four_over_stride;
+    constexpr int kStrideVal = 2;
+    TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
 
     const int output_width_overall_micro_repeats =
         function_params->output_width_overall_micro_repeats;
     const int block_height = function_params->outbound_block_height;
     const int residual_width = function_params->output_residual_width;
     const int output_height_stride = function_params->output_height_stride;
-    const int bias_increment = function_params->bias_increment;
+    constexpr int kBiasIncrement = 4;
 
     const int32 output_activation_min =
         function_params->quantized_activation_min;
@@ -4588,7 +4835,6 @@
     TFLITE_DCHECK_LT(output_offset, 32768);
 
     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
-    TFLITE_DCHECK_EQ(bias_increment, 4);
 
     const int16x8_t output_offset_vec =
         vdupq_n_s16(static_cast<int16>(output_offset));
@@ -4618,11 +4864,10 @@
       filter_reg_2_b = vld1q_s8(filter_workspace);
       filter_workspace += 16;
 
-      TFLITE_DCHECK_EQ(bias_increment, 4);
       const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
-      bias_data += bias_increment;
+      bias_data += kBiasIncrement;
       const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
-      bias_data += bias_increment;
+      bias_data += kBiasIncrement;
 
       if (block_height == 2) {
         const int8* scratch_data = scratch_block_data;
@@ -4652,15 +4897,17 @@
         int32x4_t acc0;
         int32x4_t acc1;
 
-        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
-             ++i_width) {
-          const int output_width = i_width == output_width_micro_repeats
-                                       ? residual_width
-                                       : four_over_stride;
+        // When output_width_micro_repeats < output_width_overall_micro_repeats,
+        // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
+        // residual_width < 2.
+        const int adjusted_width_micro_repeats =
+            (output_width_micro_repeats < output_width_overall_micro_repeats) &&
+                    (residual_width < 2)
+                ? output_width_micro_repeats
+                : output_width_overall_micro_repeats;
 
-          TFLITE_DCHECK_LE(output_width, 2);
-          TFLITE_DCHECK_GE(output_width, 1);
-          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+        int i_width = 0;
+        for (; i_width < adjusted_width_micro_repeats; ++i_width) {
           const int8* input_data = scratch_data + 4 + 4 * i_width;
 
           // Load next sub-micro block of data.
@@ -4757,7 +5004,93 @@
 
             output_data += output_depth;
           }
-          if (output_width == 2) {
+
+          // output_width == four_over_stride.
+          acc0 = adjusted_bias_data_s_0;
+          acc1 = adjusted_bias_data_s_0;
+
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+          // Fixed-point multiplication.
+          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+              acc0, -output_shift);
+          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+              acc1, -output_shift);
+          // Add the output offset.
+          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+          // Apply the activation function.
+          acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+          acc_u8_0_1 =
+              vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+          acc_u8_0_1 =
+              vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+          vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+          vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+          acc0 = adjusted_bias_data_s_1;
+          acc1 = adjusted_bias_data_s_1;
+
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+          // Fixed-point multiplication.
+          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+              acc0, -output_shift);
+          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+              acc1, -output_shift);
+          // Add the output offset.
+          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+          // Apply the activation function.
+          acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+          acc_u8_0_1 =
+              vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+          acc_u8_0_1 =
+              vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+          vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+          vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
+
+          input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+          input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+          input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+          output_data += output_depth;
+        }
+        for (; i_width < output_width_overall_micro_repeats; ++i_width) {
+          // output_width == 1.
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
+          input_bank_c_reg = vld1q_lane_8x4(
+              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+          // Iterate over input width shifts within 4x4 blocks.
+          {
             acc0 = adjusted_bias_data_s_0;
             acc1 = adjusted_bias_data_s_0;
 
@@ -4869,7 +5202,7 @@
 
           TFLITE_DCHECK_LE(output_width, 2);
           TFLITE_DCHECK_GE(output_width, 1);
-          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
           const int8* input_data = scratch_data + 4 + 4 * i_width;
 
           // Load next sub-micro block of data.
diff --git a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h
index e3600a7..e6dd6f8 100644
--- a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h
@@ -15,7 +15,7 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h
index da839ca..2d6362a 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h
@@ -15,7 +15,7 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h
index f0cd4c9..2c67b97 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h
@@ -15,13 +15,10 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
 
-// This must be #included first because it is what defines GEMMLOWP_NEON
-#include "public/gemmlowp.h"
-
-#ifdef GEMMLOWP_NEON
-
-#include "fixedpoint/fixedpoint.h"
-#include "public/map.h"
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/types.h"
@@ -29,43 +26,6 @@
 namespace tflite {
 namespace optimized_integer_ops {
 
-struct GemmlowpOutputPipelineFixedPointPCLhs {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
-                         gemmlowp::VectorShape::Col>,
-                     gemmlowp::OutputStageClamp,
-                     gemmlowp::OutputStageSaturatingCastToInt8>
-      Pipeline;
-  static Pipeline MakeExp(const int32* bias_data, int output_rows,
-                          const int32 output_offset,
-                          const int32* output_multiplier,
-                          const int* output_left_shift,
-                          int32 output_activation_min,
-                          int32 output_activation_max) {
-    ColVectorMap bias_vector(bias_data, output_rows);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-
-    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
-        gemmlowp::VectorShape::Col>
-        quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_offset;
-    quantize_down_stage.result_fixedpoint_multiplier =
-        ColVectorMap(output_multiplier, output_rows);
-    quantize_down_stage.result_exponent =
-        ColVectorMap(output_left_shift, output_rows);
-
-    gemmlowp::OutputStageClamp clamp_stage;
-    clamp_stage.min = output_activation_min;
-    clamp_stage.max = output_activation_max;
-    gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           clamp_stage, saturating_cast_stage);
-  }
-};
-
 // Fixed-point per-channel-quantization convolution reference kernel.
 inline void ConvPerChannel(
     const ConvParams& params, const int32* output_multiplier,
@@ -74,7 +34,7 @@
     const int8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
     const RuntimeShape& im2col_shape, int8* im2col_data,
-    gemmlowp::GemmContext* gemmlowp_context) {
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("Conv/8bit");
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -136,26 +96,37 @@
   TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
   TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
-  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, filter_rows, filter_cols);
-  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      gemm_input_data, gemm_input_rows, gemm_input_cols);
-  gemmlowp::MatrixMap<int8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, output_cols);
 
-  const auto& output_pipeline = GemmlowpOutputPipelineFixedPointPCLhs::MakeExp(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-
-  gemmlowp::GemmWithOutputPipeline<
-      int8, int8, gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams>(
-      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
-      /*filter_offset*/ 0, input_offset, output_pipeline);
+  cpu_backend_gemm::MatrixParams<int8> lhs_params;
+  lhs_params.rows = filter_rows;
+  lhs_params.cols = filter_cols;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.zero_point = 0;  // filter is symmetric-quantized
+  cpu_backend_gemm::MatrixParams<int8> rhs_params;
+  rhs_params.rows = gemm_input_rows;
+  rhs_params.cols = gemm_input_cols;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.zero_point = -input_offset;
+  cpu_backend_gemm::MatrixParams<int8> dst_params;
+  dst_params.rows = output_rows;
+  dst_params.cols = output_cols;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.zero_point = output_offset;
+  cpu_backend_gemm::GemmParams<
+      int32, int8,
+      cpu_backend_gemm::QuantizationFlavor::kIntegerWithPerRowMultiplier>
+      gemm_params;
+  gemm_params.bias = bias_data;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  gemm_params.multiplier_fixedpoint_perchannel = output_multiplier;
+  gemm_params.multiplier_exponent_perchannel = output_shift;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 }  // namespace optimized_integer_ops
 }  // namespace tflite
 
-#endif  // GEMMLOWP_NEON
-
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h
index 363b996..1edf736 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h
@@ -15,9 +15,12 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
 
-#include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
@@ -1912,7 +1915,7 @@
 }
 
 template <typename T, typename TS>
-struct DepthwiseConvWorkerTask : public gemmlowp::Task {
+struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {
   DepthwiseConvWorkerTask(const DepthwiseParams& params,
                           const int32* output_multiplier,
                           const int32* output_shift,
@@ -1981,9 +1984,8 @@
     const int8* input_data, const RuntimeShape& filter_shape,
     const int8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
-    gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("DepthwiseConvInt8");
-
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@@ -2003,8 +2005,7 @@
     thread_count = thread_count_row;
   }
 
-  const int max_threads =
-      gemmlowp_context ? gemmlowp_context->max_num_threads() : 1;
+  const int max_threads = cpu_backend_context->max_num_threads();
   thread_count = std::max(1, std::min(thread_count, max_threads));
 
   if (thread_count == 1) {
@@ -2013,18 +2014,22 @@
                       bias_data, output_shape, output_data, /*thread_start=*/0,
                       /*thread_end=*/output_rows, /*thread_dim=*/1);
   } else {
-    std::vector<gemmlowp::Task*> tasks(thread_count);
+    std::vector<DepthwiseConvWorkerTask<int8, int32>> tasks;
+    // TODO(b/131746020) don't create new heap allocations every time.
+    // At least we make it a single heap allocation by using reserve().
+    tasks.reserve(thread_count);
     int thread_start = 0;
     for (int i = 0; i < thread_count; ++i) {
       int thread_end =
           thread_start + (thread_dim_size - thread_start) / (thread_count - i);
-      tasks[i] = new DepthwiseConvWorkerTask<int8, int32>(
-          params, output_multiplier, output_shift, input_shape, input_data,
-          filter_shape, filter_data, bias_shape, bias_data, output_shape,
-          output_data, thread_start, thread_end, thread_dim);
+      tasks.emplace_back(params, output_multiplier, output_shift, input_shape,
+                         input_data, filter_shape, filter_data, bias_shape,
+                         bias_data, output_shape, output_data, thread_start,
+                         thread_end, thread_dim);
       thread_start = thread_end;
     }
-    gemmlowp_context->workers_pool()->Execute(tasks);
+    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                    cpu_backend_context);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
new file mode 100644
index 0000000..b14371f
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
@@ -0,0 +1,2957 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_
+
+#include <memory>
+
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+namespace depthwise_conv {
+
+#ifdef USE_NEON
+// Two-level stringification: STR(X) expands macro X before applying #.
+#define STR(s) STR_UNEXPANDED(s)
+#define STR_UNEXPANDED(s) #s
+
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. That compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+#include <stddef.h>
+
+// Byte offsets of each field from the start of the DepthwiseConvParams
+// struct. The inline asm uses them, stringified via STR(), to load parameters.
+// Each value is checked against offsetof() by the static_asserts below.
+#define OFFSET_INPUT_DEPTH 0
+#define OFFSET_INPUT_ROW_SIZE 8
+#define OFFSET_OUTPUT_DEPTH 16
+#define OFFSET_OUTPUT_ROW_SIZE 24
+#define OFFSET_FILTER_ROW_SIZE 32
+#define OFFSET_INPUT_OFFSET 40
+#define OFFSET_OUTPUT_OFFSET 44
+#define OFFSET_FILTER_OFFSET 48
+#define OFFSET_OUTPUT_MULTIPLIER 52
+#define OFFSET_OUTPUT_ACTIVATION_MIN 56
+#define OFFSET_OUTPUT_ACTIVATION_MAX 60
+#define OFFSET_OUTPUT_RIGHT_SHIFT 64
+#define OFFSET_INPUT_WIDTH 68
+#define OFFSET_INPUT_HEIGHT 72
+#define OFFSET_STRIDE_WIDTH 76
+#define OFFSET_STRIDE_HEIGHT 80
+#define OFFSET_OUTPUT_WIDTH 84
+#define OFFSET_OUTPUT_HEIGHT 88
+
+static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
+                  OFFSET_INPUT_ROW_SIZE,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_depth) ==
+                  OFFSET_OUTPUT_DEPTH,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
+                  OFFSET_OUTPUT_ROW_SIZE,
+              "");
+static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
+                  OFFSET_FILTER_ROW_SIZE,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_offset) ==
+                  OFFSET_INPUT_OFFSET,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_offset) ==
+                  OFFSET_OUTPUT_OFFSET,
+              "");
+static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
+                  OFFSET_FILTER_OFFSET,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
+                  OFFSET_OUTPUT_MULTIPLIER,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
+                  OFFSET_OUTPUT_ACTIVATION_MIN,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
+                  OFFSET_OUTPUT_ACTIVATION_MAX,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_right_shift) ==
+                  OFFSET_OUTPUT_RIGHT_SHIFT,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_height) ==
+                  OFFSET_INPUT_HEIGHT,
+              "");
+static_assert(offsetof(DepthwiseConvParams, stride_width) ==
+                  OFFSET_STRIDE_WIDTH,
+              "");
+static_assert(offsetof(DepthwiseConvParams, stride_height) ==
+                  OFFSET_STRIDE_HEIGHT,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_width) ==
+                  OFFSET_OUTPUT_WIDTH,
+              "");
+static_assert(offsetof(DepthwiseConvParams, output_height) ==
+                  OFFSET_OUTPUT_HEIGHT,
+              "");
+#endif  // defined(__aarch64__) && !defined(GOOGLE_L4T)
+#endif  // USE_NEON
+
+#ifdef USE_NEON
+
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+
+template <>
+struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
+                                     1> {
+ public:
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
+    const int64_t input_width_increment = 2 * input_depth;
+    const int64_t input_height_increment = 2 * input_row_size;
+    const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+
+#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6"
+#define DEPTHWISECONV_LABEL_HEIGHT_1 "7"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11"
+
+    asm volatile(
+        // Performs depthwise convolutions for a window specified by
+        // |output_window_height| and |output_window_width|. The inner-most loop
+        // processes 2x2 outputs, and any leftovers at the end.
+        //
+        // Algorithm works as follows:
+        //
+        //   1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter
+        //      values.
+        //   2. For 2 output heights at a time:
+        //        i.  For 2 output widths at a time, load inputs for a 2x1 (2
+        //            height, 1 width) output window (4x3 input window).
+        //            Registers v9--v20 hold input values. Mul-add with
+        //            accumulators v21--v24. Then run activation, downquantize
+        //            and store. Repeat for the next 2x1 output window,
+        //            leveraging overlapping inputs.
+        //        ii. Handle single leftover width if exists.
+        //   3. Handle single leftover height if exists.
+        //        i.  For 2 output widths at a time, load inputs for a 1x2 (1
+        //            height, 2 width) output window (3x4 input window).
+        //            Registers v9--v20 hold input values. Mul-add with
+        //            accumulators v21--v24. Then run activation, downquantize
+        //            and store. Repeat for the next 1x2 output window,
+        //            leveraging overlapping inputs.
+        //        ii. Handle single leftover width if exists.
+        //
+        // Loads are placed as soon as the register is no longer needed and
+        // interleaved with arithmetic operations to take advantage of
+        // dual-issue pipelines. We also add input offsets as far from the loads
+        // as possible to give loads enough cycles to fetch data from memory.
+
+        // Set "constant" registers. These registers may be replaced with temp
+        // values from time to time when there are not enough NEON registers.
+        // We use x9--x15 general purpose registers as they are caller-saved
+        // temporary registers (see
+        // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "cmp %w[output_window_height], #2\n"
+        "dup v26.8h, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "dup v29.8h, w2\n"
+        "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v30.16b, w4\n"
+        "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v31.16b, w0\n"
+        "dup v28.4s, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "add x10, %[bias_ptr], #16\n"
+        "ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
+        "dup v9.8h, w9\n"
+
+        // Load filters and add offsets.
+        "ld1 {v0.8b}, [%[filter_ptr]], x3\n"
+        "ld1 {v1.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v0.8h, v9.8h, v0.8b\n"
+        "ld1 {v2.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v1.8h, v9.8h, v1.8b\n"
+        "ld1 {v3.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v2.8h, v9.8h, v2.8b\n"
+        "ld1 {v4.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v3.8h, v9.8h, v3.8b\n"
+        "ld1 {v5.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v4.8h, v9.8h, v4.8b\n"
+        "ld1 {v6.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v5.8h, v9.8h, v5.8b\n"
+        "ld1 {v7.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v6.8h, v9.8h, v6.8b\n"
+        "ld1 {v8.8b}, [%[filter_ptr]], x3\n"
+        "uaddw v7.8h, v9.8h, v7.8b\n"
+        "uaddw v8.8h, v9.8h, v8.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
+          // This loop processes 2x2 outputs. To avoid register exhaustion,
+          // inputs for the left 2 outputs are loaded first, then the right
+          // two outputs.
+          "mov x11, %[input_ptr]\n"
+          "mov x12, x11\n"
+          "ld1 {v9.8b}, [x12], %[input_depth]\n"
+          "add x13, x11, %[input_row_size]\n"
+          "ld1 {v10.8b}, [x12], %[input_depth]\n"
+          "add x14, x13, %[input_row_size]\n"
+          "ld1 {v11.8b}, [x12], %[input_depth]\n"
+          "add x15, x14, %[input_row_size]\n"
+          "ld1 {v12.8b}, [x13], %[input_depth]\n"
+          "mov w5, %w[output_window_width]\n"
+          "ld1 {v13.8b}, [x13], %[input_depth]\n"
+          "mov x6, %[output_ptr]\n"
+          "ld1 {v14.8b}, [x13], %[input_depth]\n"
+          "add x7, %[output_ptr], x1\n"
+          "ld1 {v15.8b}, [x14], %[input_depth]\n"
+          // The height 2 / width 2 loop loads an extra 2x1 output window (2
+          // height, 1 width) in anticipation of the next iteration. Make sure
+          // |output_window_width| is large enough to handle the additional
+          // loads, otherwise jump to the appropriate label to handle
+          // smaller widths.
+          "cmp w5, #2\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "ld1 {v16.8b}, [x14], %[input_depth]\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "ld1 {v17.8b}, [x14], %[input_depth]\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "ld1 {v18.8b}, [x15], %[input_depth]\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "ld1 {v19.8b}, [x15], %[input_depth]\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+          "ld1 {v20.8b}, [x15], %[input_depth]\n"
+          "uaddw v14.8h, v26.8h, v14.8b\n"
+          "ld1 {v21.4s}, [%[bias_ptr]]\n"
+          "uaddw v15.8h, v26.8h, v15.8b\n"
+          "ld1 {v22.4s}, [x10]\n"
+          "uaddw v16.8h, v26.8h, v16.8b\n"
+          "ld1 {v23.4s}, [%[bias_ptr]]\n"
+          "uaddw v17.8h, v26.8h, v17.8b\n"
+          "ld1 {v24.4s}, [x10]\n"
+          "uaddw v18.8h, v26.8h, v18.8b\n"
+          "uaddw v19.8h, v26.8h, v19.8b\n"
+          "uaddw v20.8h, v26.8h, v20.8b\n"
+
+          "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
+          "cmp w5, #1\n"
+          "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+          //"loop_%=:\n"
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
+            // Mul-add left outputs.
+            "smlal v21.4s, v0.4h, v9.4h\n"
+            "subs w5, w5, #2\n"
+            "smlal2 v22.4s, v0.8h, v9.8h\n"
+            "cmp w5, #3\n"
+            "smlal v23.4s, v0.4h, v12.4h\n"
+            "ld1 {v9.8b}, [x12]\n"
+            "smlal2 v24.4s, v0.8h, v12.8h\n"
+            "smlal v21.4s, v1.4h, v10.4h\n"
+            "smlal2 v22.4s, v1.8h, v10.8h\n"
+            "smlal v23.4s, v1.4h, v13.4h\n"
+            "smlal2 v24.4s, v1.8h, v13.8h\n"
+            "smlal v21.4s, v2.4h, v11.4h\n"
+            "smlal2 v22.4s, v2.8h, v11.8h\n"
+            "smlal v23.4s, v2.4h, v14.4h\n"
+            "smlal2 v24.4s, v2.8h, v14.8h\n"
+            "smlal v21.4s, v3.4h, v12.4h\n"
+            "smlal2 v22.4s, v3.8h, v12.8h\n"
+            "ld1 {v12.8b}, [x13]\n"
+            "smlal v23.4s, v3.4h, v15.4h\n"
+            "smlal2 v24.4s, v3.8h, v15.8h\n"
+            "smlal v21.4s, v4.4h, v13.4h\n"
+            "smlal2 v22.4s, v4.8h, v13.8h\n"
+            "smlal v23.4s, v4.4h, v16.4h\n"
+            "smlal2 v24.4s, v4.8h, v16.8h\n"
+            "smlal v21.4s, v5.4h, v14.4h\n"
+            "smlal2 v22.4s, v5.8h, v14.8h\n"
+            "smlal v23.4s, v5.4h, v17.4h\n"
+            "smlal2 v24.4s, v5.8h, v17.8h\n"
+            "smlal v21.4s, v6.4h, v15.4h\n"
+            "smlal2 v22.4s, v6.8h, v15.8h\n"
+            "ld1 {v15.8b}, [x14]\n"
+            "smlal v23.4s, v6.4h, v18.4h\n"
+            "smlal2 v24.4s, v6.8h, v18.8h\n"
+            "ld1 {v18.8b}, [x15]\n"
+            "smlal v21.4s, v7.4h, v16.4h\n"
+            "smlal2 v22.4s, v7.8h, v16.8h\n"
+            "smlal v23.4s, v7.4h, v19.4h\n"
+            "smlal2 v24.4s, v7.8h, v19.8h\n"
+            "smlal v21.4s, v8.4h, v17.4h\n"
+            "smlal2 v22.4s, v8.8h, v17.8h\n"
+            "smlal v23.4s, v8.4h, v20.4h\n"
+            "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+            "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+            "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+            "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+            "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+            "sqrshl v21.4s, v21.4s, v28.4s\n"
+            "sqrshl v22.4s, v22.4s, v28.4s\n"
+            "sqrshl v23.4s, v23.4s, v28.4s\n"
+            "sqrshl v24.4s, v24.4s, v28.4s\n"
+            "sqxtn v21.4h, v21.4s\n"
+            "sqxtn2 v21.8h, v22.4s\n"
+            "sqxtn v23.4h, v23.4s\n"
+            "sqxtn2 v23.8h, v24.4s\n"
+            "sqadd v21.8h, v21.8h, v29.8h\n"
+            "sqadd v23.8h, v23.8h, v29.8h\n"
+            "sqxtun v21.8b, v21.8h\n"
+            "sqxtun2 v21.16b, v23.8h\n"
+            "ld1 {v22.4s}, [x10]\n"
+            "umax v21.16b, v21.16b, v30.16b\n"
+            "umin v21.16b, v21.16b, v31.16b\n"
+            "ld1 {v24.4s}, [x10]\n"
+            "uaddw v9.8h, v26.8h, v9.8b\n"
+            "st1 {v21.8b}, [x6], x3\n"
+            "uaddw v12.8h, v26.8h, v12.8b\n"
+            "mov v23.d[0], v21.d[1]\n"
+            "st1 {v23.8b}, [x7], x3\n"
+            "uaddw v15.8h, v26.8h, v15.8b\n"
+            "ld1 {v21.4s}, [%[bias_ptr]]\n"
+            "uaddw v18.8h, v26.8h, v18.8b\n"
+            "ld1 {v23.4s}, [%[bias_ptr]]\n"
+
+            // Mul-add right outputs.
+            "smlal v21.4s, v0.4h, v10.4h\n"
+            "add x11, x11, %[input_width_increment]\n"
+            "smlal2 v22.4s, v0.8h, v10.8h\n"
+            "mov x12, x11\n"
+            "smlal v23.4s, v0.4h, v13.4h\n"
+            "add x13, x11, %[input_row_size]\n"
+            "smlal2 v24.4s, v0.8h, v13.8h\n"
+            "add x14, x13, %[input_row_size]\n"
+            "smlal v21.4s, v1.4h, v11.4h\n"
+            "add x15, x14, %[input_row_size]\n"
+            "smlal2 v22.4s, v1.8h, v11.8h\n"
+            "smlal v23.4s, v1.4h, v14.4h\n"
+            "smlal2 v24.4s, v1.8h, v14.8h\n"
+            "smlal v21.4s, v2.4h, v9.4h\n"
+            "smlal2 v22.4s, v2.8h, v9.8h\n"
+            "ld1 {v9.8b}, [x12], %[input_depth]\n"
+            "smlal v23.4s, v2.4h, v12.4h\n"
+            "ld1 {v10.8b}, [x12], %[input_depth]\n"
+            "smlal2 v24.4s, v2.8h, v12.8h\n"
+            "ld1 {v11.8b}, [x12], %[input_depth]\n"
+            "smlal v21.4s, v3.4h, v13.4h\n"
+            "smlal2 v22.4s, v3.8h, v13.8h\n"
+            "smlal v23.4s, v3.4h, v16.4h\n"
+            "smlal2 v24.4s, v3.8h, v16.8h\n"
+            "smlal v21.4s, v4.4h, v14.4h\n"
+            "smlal2 v22.4s, v4.8h, v14.8h\n"
+            "smlal v23.4s, v4.4h, v17.4h\n"
+            "smlal2 v24.4s, v4.8h, v17.8h\n"
+            "smlal v21.4s, v5.4h, v12.4h\n"
+            "smlal2 v22.4s, v5.8h, v12.8h\n"
+            "ld1 {v12.8b}, [x13], %[input_depth]\n"
+            "smlal v23.4s, v5.4h, v15.4h\n"
+            "ld1 {v13.8b}, [x13], %[input_depth]\n"
+            "smlal2 v24.4s, v5.8h, v15.8h\n"
+            "ld1 {v14.8b}, [x13], %[input_depth]\n"
+            "smlal v21.4s, v6.4h, v16.4h\n"
+            "smlal2 v22.4s, v6.8h, v16.8h\n"
+            "smlal v23.4s, v6.4h, v19.4h\n"
+            "smlal2 v24.4s, v6.8h, v19.8h\n"
+            "smlal v21.4s, v7.4h, v17.4h\n"
+            "smlal2 v22.4s, v7.8h, v17.8h\n"
+            "smlal v23.4s, v7.4h, v20.4h\n"
+            "smlal2 v24.4s, v7.8h, v20.8h\n"
+            "smlal v21.4s, v8.4h, v15.4h\n"
+            "smlal2 v22.4s, v8.8h, v15.8h\n"
+            "ld1 {v15.8b}, [x14], %[input_depth]\n"
+            "smlal v23.4s, v8.4h, v18.4h\n"
+            "ld1 {v16.8b}, [x14], %[input_depth]\n"
+            "smlal2 v24.4s, v8.8h, v18.8h\n"
+            "ld1 {v17.8b}, [x14], %[input_depth]\n"
+
+            "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+            "ld1 {v18.8b}, [x15], %[input_depth]\n"
+            "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+            "ld1 {v19.8b}, [x15], %[input_depth]\n"
+            "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+            "ld1 {v20.8b}, [x15], %[input_depth]\n"
+            "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+            "sqrshl v21.4s, v21.4s, v28.4s\n"
+            "sqrshl v22.4s, v22.4s, v28.4s\n"
+            "sqrshl v23.4s, v23.4s, v28.4s\n"
+            "sqrshl v24.4s, v24.4s, v28.4s\n"
+            "sqxtn v21.4h, v21.4s\n"
+            "sqxtn2 v21.8h, v22.4s\n"
+            "sqxtn v23.4h, v23.4s\n"
+            "sqxtn2 v23.8h, v24.4s\n"
+            "sqadd v21.8h, v21.8h, v29.8h\n"
+            "sqadd v23.8h, v23.8h, v29.8h\n"
+            "sqxtun v21.8b, v21.8h\n"
+            "sqxtun2 v21.16b, v23.8h\n"
+            "ld1 {v22.4s}, [x10]\n"
+            "umax v21.16b, v21.16b, v30.16b\n"
+            "umin v21.16b, v21.16b, v31.16b\n"
+            "ld1 {v24.4s}, [x10]\n"
+            "uaddw v9.8h, v26.8h, v9.8b\n"
+            "st1 {v21.8b}, [x6], x3\n"
+            "uaddw v10.8h, v26.8h, v10.8b\n"
+            "mov v23.d[0], v21.d[1]\n"
+            "st1 {v23.8b}, [x7], x3\n"
+            "uaddw v11.8h, v26.8h, v11.8b\n"
+            "uaddw v12.8h, v26.8h, v12.8b\n"
+            "uaddw v13.8h, v26.8h, v13.8b\n"
+            "uaddw v14.8h, v26.8h, v14.8b\n"
+            "uaddw v15.8h, v26.8h, v15.8b\n"
+            "ld1 {v21.4s}, [%[bias_ptr]]\n"
+            "uaddw v16.8h, v26.8h, v16.8b\n"
+            "ld1 {v23.4s}, [%[bias_ptr]]\n"
+            "uaddw v17.8h, v26.8h, v17.8b\n"
+            "uaddw v18.8h, v26.8h, v18.8b\n"
+            "uaddw v19.8h, v26.8h, v19.8b\n"
+            "uaddw v20.8h, v26.8h, v20.8b\n"
+
+            "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
+
+          // At this point, there will be one of 2 width or 1 width leftover,
+          // not both.
+          "cmp w5, #2\n"
+          "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+          // Handle the last 2 columns if they exist.
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
+          // Mul-add left outputs.
+          "smlal v21.4s, v0.4h, v9.4h\n"
+          "smlal2 v22.4s, v0.8h, v9.8h\n"
+          "smlal v23.4s, v0.4h, v12.4h\n"
+          "ld1 {v9.8b}, [x12]\n"
+          "smlal2 v24.4s, v0.8h, v12.8h\n"
+          "smlal v21.4s, v1.4h, v10.4h\n"
+          "smlal2 v22.4s, v1.8h, v10.8h\n"
+          "smlal v23.4s, v1.4h, v13.4h\n"
+          "smlal2 v24.4s, v1.8h, v13.8h\n"
+          "smlal v21.4s, v2.4h, v11.4h\n"
+          "smlal2 v22.4s, v2.8h, v11.8h\n"
+          "smlal v23.4s, v2.4h, v14.4h\n"
+          "smlal2 v24.4s, v2.8h, v14.8h\n"
+          "smlal v21.4s, v3.4h, v12.4h\n"
+          "smlal2 v22.4s, v3.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x13]\n"
+          "smlal v23.4s, v3.4h, v15.4h\n"
+          "smlal2 v24.4s, v3.8h, v15.8h\n"
+          "smlal v21.4s, v4.4h, v13.4h\n"
+          "smlal2 v22.4s, v4.8h, v13.8h\n"
+          "smlal v23.4s, v4.4h, v16.4h\n"
+          "smlal2 v24.4s, v4.8h, v16.8h\n"
+          "smlal v21.4s, v5.4h, v14.4h\n"
+          "smlal2 v22.4s, v5.8h, v14.8h\n"
+          "smlal v23.4s, v5.4h, v17.4h\n"
+          "smlal2 v24.4s, v5.8h, v17.8h\n"
+          "smlal v21.4s, v6.4h, v15.4h\n"
+          "smlal2 v22.4s, v6.8h, v15.8h\n"
+          "ld1 {v15.8b}, [x14]\n"
+          "smlal v23.4s, v6.4h, v18.4h\n"
+          "smlal2 v24.4s, v6.8h, v18.8h\n"
+          "ld1 {v18.8b}, [x15]\n"
+          "smlal v21.4s, v7.4h, v16.4h\n"
+          "smlal2 v22.4s, v7.8h, v16.8h\n"
+          "smlal v23.4s, v7.4h, v19.4h\n"
+          "smlal2 v24.4s, v7.8h, v19.8h\n"
+          "smlal v21.4s, v8.4h, v17.4h\n"
+          "smlal2 v22.4s, v8.8h, v17.8h\n"
+          "smlal v23.4s, v8.4h, v20.4h\n"
+          "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "sqrshl v22.4s, v22.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "sqxtun2 v21.16b, v23.8h\n"
+          "ld1 {v22.4s}, [x10]\n"
+          "umax v21.16b, v21.16b, v30.16b\n"
+          "umin v21.16b, v21.16b, v31.16b\n"
+          "ld1 {v24.4s}, [x10]\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "st1 {v21.8b}, [x6], x3\n"
+          "mov v23.d[0], v21.d[1]\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "st1 {v23.8b}, [x7], x3\n"
+          "uaddw v15.8h, v26.8h, v15.8b\n"
+          "ld1 {v21.4s}, [%[bias_ptr]]\n"
+          "uaddw v18.8h, v26.8h, v18.8b\n"
+          "ld1 {v23.4s}, [%[bias_ptr]]\n"
+
+          // Mul-add right outputs.
+          "smlal v21.4s, v0.4h, v10.4h\n"
+          "smlal2 v22.4s, v0.8h, v10.8h\n"
+          "smlal v23.4s, v0.4h, v13.4h\n"
+          "smlal2 v24.4s, v0.8h, v13.8h\n"
+          "smlal v21.4s, v1.4h, v11.4h\n"
+          "smlal2 v22.4s, v1.8h, v11.8h\n"
+          "smlal v23.4s, v1.4h, v14.4h\n"
+          "smlal2 v24.4s, v1.8h, v14.8h\n"
+          "smlal v21.4s, v2.4h, v9.4h\n"
+          "smlal2 v22.4s, v2.8h, v9.8h\n"
+          "smlal v23.4s, v2.4h, v12.4h\n"
+          "smlal2 v24.4s, v2.8h, v12.8h\n"
+          "smlal v21.4s, v3.4h, v13.4h\n"
+          "smlal2 v22.4s, v3.8h, v13.8h\n"
+          "smlal v23.4s, v3.4h, v16.4h\n"
+          "smlal2 v24.4s, v3.8h, v16.8h\n"
+          "smlal v21.4s, v4.4h, v14.4h\n"
+          "smlal2 v22.4s, v4.8h, v14.8h\n"
+          "smlal v23.4s, v4.4h, v17.4h\n"
+          "smlal2 v24.4s, v4.8h, v17.8h\n"
+          "smlal v21.4s, v5.4h, v12.4h\n"
+          "smlal2 v22.4s, v5.8h, v12.8h\n"
+          "smlal v23.4s, v5.4h, v15.4h\n"
+          "smlal2 v24.4s, v5.8h, v15.8h\n"
+          "smlal v21.4s, v6.4h, v16.4h\n"
+          "smlal2 v22.4s, v6.8h, v16.8h\n"
+          "smlal v23.4s, v6.4h, v19.4h\n"
+          "smlal2 v24.4s, v6.8h, v19.8h\n"
+          "smlal v21.4s, v7.4h, v17.4h\n"
+          "smlal2 v22.4s, v7.8h, v17.8h\n"
+          "smlal v23.4s, v7.4h, v20.4h\n"
+          "smlal2 v24.4s, v7.8h, v20.8h\n"
+          "smlal v21.4s, v8.4h, v15.4h\n"
+          "smlal2 v22.4s, v8.8h, v15.8h\n"
+          "smlal v23.4s, v8.4h, v18.4h\n"
+          "smlal2 v24.4s, v8.8h, v18.8h\n"
+
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "sqrshl v22.4s, v22.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
+
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "sqxtun2 v21.16b, v23.8h\n"
+          "umax v21.16b, v21.16b, v30.16b\n"
+          "umin v21.16b, v21.16b, v31.16b\n"
+          "st1 {v21.8b}, [x6], x3\n"
+          "mov v23.d[0], v21.d[1]\n"
+          "st1 {v23.8b}, [x7], x3\n"
+          "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
+
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
+          "smlal v21.4s, v0.4h, v9.4h\n"
+          "smlal2 v22.4s, v0.8h, v9.8h\n"
+          "smlal v23.4s, v0.4h, v12.4h\n"
+          "smlal2 v24.4s, v0.8h, v12.8h\n"
+          "smlal v21.4s, v1.4h, v10.4h\n"
+          "smlal2 v22.4s, v1.8h, v10.8h\n"
+          "smlal v23.4s, v1.4h, v13.4h\n"
+          "smlal2 v24.4s, v1.8h, v13.8h\n"
+          "smlal v21.4s, v2.4h, v11.4h\n"
+          "smlal2 v22.4s, v2.8h, v11.8h\n"
+          "smlal v23.4s, v2.4h, v14.4h\n"
+          "smlal2 v24.4s, v2.8h, v14.8h\n"
+          "smlal v21.4s, v3.4h, v12.4h\n"
+          "smlal2 v22.4s, v3.8h, v12.8h\n"
+          "smlal v23.4s, v3.4h, v15.4h\n"
+          "smlal2 v24.4s, v3.8h, v15.8h\n"
+          "smlal v21.4s, v4.4h, v13.4h\n"
+          "smlal2 v22.4s, v4.8h, v13.8h\n"
+          "smlal v23.4s, v4.4h, v16.4h\n"
+          "smlal2 v24.4s, v4.8h, v16.8h\n"
+          "smlal v21.4s, v5.4h, v14.4h\n"
+          "smlal2 v22.4s, v5.8h, v14.8h\n"
+          "smlal v23.4s, v5.4h, v17.4h\n"
+          "smlal2 v24.4s, v5.8h, v17.8h\n"
+          "smlal v21.4s, v6.4h, v15.4h\n"
+          "smlal2 v22.4s, v6.8h, v15.8h\n"
+          "smlal v23.4s, v6.4h, v18.4h\n"
+          "smlal2 v24.4s, v6.8h, v18.8h\n"
+          "smlal v21.4s, v7.4h, v16.4h\n"
+          "smlal2 v22.4s, v7.8h, v16.8h\n"
+          "smlal v23.4s, v7.4h, v19.4h\n"
+          "smlal2 v24.4s, v7.8h, v19.8h\n"
+          "smlal v21.4s, v8.4h, v17.4h\n"
+          "smlal2 v22.4s, v8.8h, v17.8h\n"
+          "smlal v23.4s, v8.4h, v20.4h\n"
+          "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "sqrshl v22.4s, v22.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "sqxtun2 v21.16b, v23.8h\n"
+          "umax v21.16b, v21.16b, v30.16b\n"
+          "umin v21.16b, v21.16b, v31.16b\n"
+          "st1 {v21.8b}, [x6], x3\n"
+          "mov v23.d[0], v21.d[1]\n"
+          "st1 {v23.8b}, [x7], x3\n"
+
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
+          "subs %w[output_window_height], %w[output_window_height], #2\n"
+          "add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
+          "cmp %w[output_window_height], #2\n"
+          "add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
+          "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
+
+        DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
+        "cmp %w[output_window_height], #1\n"
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+        DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
+        "mov x12, %[input_ptr]\n"
+        "ld1 {v9.8b}, [x12], %[input_depth]\n"
+        "add x13, %[input_ptr], %[input_row_size]\n"
+        "ld1 {v10.8b}, [x12], %[input_depth]\n"
+        "add x14, x13, %[input_row_size]\n"
+        "ld1 {v11.8b}, [x12], %[input_depth]\n"
+        "add x15, x14, %[input_row_size]\n"
+        "mov w5, %w[output_window_width]\n"
+        "ld1 {v13.8b}, [x13], %[input_depth]\n"
+        "mov x6, %[output_ptr]\n"
+        "ld1 {v14.8b}, [x13], %[input_depth]\n"
+        "add x7, %[output_ptr], x1\n"
+        "ld1 {v15.8b}, [x13], %[input_depth]\n"
+        // The height 1 / width 2 loop loads an extra 1x1 output in anticipation
+        // of the next iteration. Make sure |output_window_width| is large
+        // enough to handle the additional load, otherwise jump to the
+        // appropriate label to handle smaller widths.
+        "cmp w5, #2\n"
+        "ld1 {v17.8b}, [x14], %[input_depth]\n"
+        "ld1 {v18.8b}, [x14], %[input_depth]\n"
+        "ld1 {v19.8b}, [x14], %[input_depth]\n"
+        "ld1 {v21.4s}, [%[bias_ptr]]\n"
+        "ld1 {v22.4s}, [x10]\n"
+        "ld1 {v23.4s}, [%[bias_ptr]]\n"
+        "ld1 {v24.4s}, [x10]\n"
+
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+        "uaddw v13.8h, v26.8h, v13.8b\n"
+        "uaddw v14.8h, v26.8h, v14.8b\n"
+        "uaddw v15.8h, v26.8h, v15.8b\n"
+        "uaddw v17.8h, v26.8h, v17.8b\n"
+        "uaddw v18.8h, v26.8h, v18.8b\n"
+        "uaddw v19.8h, v26.8h, v19.8b\n"
+
+        "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
+        "cmp w5, #1\n"
+        "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
+          // Load inputs for 3x4 input window which corresponds to a 1x2 output
+          // window.
+          "smlal v21.4s, v0.4h, v9.4h\n"
+          "ld1 {v12.8b}, [x12]\n"
+          "smlal2 v22.4s, v0.8h, v9.8h\n"
+          "ld1 {v16.8b}, [x13]\n"
+          "smlal v23.4s, v0.4h, v10.4h\n"
+          "ld1 {v20.8b}, [x14]\n"
+          "smlal2 v24.4s, v0.8h, v10.8h\n"
+          "subs w5, w5, #2\n"
+          "smlal v21.4s, v1.4h, v10.4h\n"
+          "cmp w5, #3\n"
+          "smlal2 v22.4s, v1.8h, v10.8h\n"
+          "add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
+          "smlal v23.4s, v1.4h, v11.4h\n"
+          "mov x12, %[input_ptr]\n"
+          "smlal2 v24.4s, v1.8h, v11.8h\n"
+          "ld1 {v9.8b}, [x12], %[input_depth]\n"
+          "smlal v21.4s, v2.4h, v11.4h\n"
+          "ld1 {v10.8b}, [x12], %[input_depth]\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "smlal2 v22.4s, v2.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x12], %[input_depth]\n"
+          "add x13, %[input_ptr], %[input_row_size]\n"
+          "smlal v23.4s, v2.4h, v12.4h\n"
+          "add x14, x13, %[input_row_size]\n"
+          "smlal2 v24.4s, v2.8h, v12.8h\n"
+          "smlal v21.4s, v3.4h, v13.4h\n"
+          "add x15, x14, %[input_row_size]\n"
+          "smlal2 v22.4s, v3.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v3.4h, v14.4h\n"
+          "smlal2 v24.4s, v3.8h, v14.8h\n"
+          "smlal v21.4s, v4.4h, v14.4h\n"
+          "smlal2 v22.4s, v4.8h, v14.8h\n"
+          "ld1 {v14.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v4.4h, v15.4h\n"
+          "smlal2 v24.4s, v4.8h, v15.8h\n"
+          "smlal v21.4s, v5.4h, v15.4h\n"
+          "uaddw v16.8h, v26.8h, v16.8b\n"
+          "smlal2 v22.4s, v5.8h, v15.8h\n"
+          "ld1 {v15.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v5.4h, v16.4h\n"
+          "smlal2 v24.4s, v5.8h, v16.8h\n"
+          "smlal v21.4s, v6.4h, v17.4h\n"
+          "smlal2 v22.4s, v6.8h, v17.8h\n"
+          "ld1 {v17.8b}, [x14], %[input_depth]\n"
+          "smlal v23.4s, v6.4h, v18.4h\n"
+          "smlal2 v24.4s, v6.8h, v18.8h\n"
+          "smlal v21.4s, v7.4h, v18.4h\n"
+          "smlal2 v22.4s, v7.8h, v18.8h\n"
+          "ld1 {v18.8b}, [x14], %[input_depth]\n"
+          "smlal v23.4s, v7.4h, v19.4h\n"
+          "smlal2 v24.4s, v7.8h, v19.8h\n"
+          "smlal v21.4s, v8.4h, v19.4h\n"
+          "uaddw v20.8h, v26.8h, v20.8b\n"
+          "smlal2 v22.4s, v8.8h, v19.8h\n"
+          "ld1 {v19.8b}, [x14], %[input_depth]\n"
+          "smlal v23.4s, v8.4h, v20.4h\n"
+          "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "sqrshl v22.4s, v22.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "sqxtun2 v21.16b, v23.8h\n"
+          "ld1 {v22.4s}, [x10]\n"
+          "umax v21.16b, v21.16b, v30.16b\n"
+          "umin v21.16b, v21.16b, v31.16b\n"
+          "ld1 {v24.4s}, [x10]\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "st1 {v21.8b}, [%[output_ptr]], x3\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "mov v23.d[0], v21.d[1]\n"
+          "st1 {v23.8b}, [%[output_ptr]], x3\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+          "uaddw v14.8h, v26.8h, v14.8b\n"
+          "uaddw v15.8h, v26.8h, v15.8b\n"
+          "ld1 {v21.4s}, [%[bias_ptr]]\n"
+          "uaddw v16.8h, v26.8h, v16.8b\n"
+          "ld1 {v23.4s}, [%[bias_ptr]]\n"
+          "uaddw v17.8h, v26.8h, v17.8b\n"
+          "uaddw v18.8h, v26.8h, v18.8b\n"
+          "uaddw v19.8h, v26.8h, v19.8b\n"
+          "uaddw v20.8h, v26.8h, v20.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
+
+        // At this point, either a 2-width or a 1-width leftover remains,
+        // not both.
+        "cmp w5, #2\n"
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+        // Handle the last two horizontal outputs, if they exist.
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
+        "smlal v21.4s, v0.4h, v9.4h\n"
+        "ld1 {v12.8b}, [x12], %[input_depth]\n"
+        "smlal2 v22.4s, v0.8h, v9.8h\n"
+        "ld1 {v16.8b}, [x13], %[input_depth]\n"
+        "smlal v23.4s, v0.4h, v10.4h\n"
+        "ld1 {v20.8b}, [x14], %[input_depth]\n"
+        "smlal2 v24.4s, v0.8h, v10.8h\n"
+        "smlal v21.4s, v1.4h, v10.4h\n"
+        "smlal2 v22.4s, v1.8h, v10.8h\n"
+        "smlal v23.4s, v1.4h, v11.4h\n"
+        "smlal2 v24.4s, v1.8h, v11.8h\n"
+        "smlal v21.4s, v2.4h, v11.4h\n"
+        "uaddw v12.8h, v26.8h, v12.8b\n"
+        "smlal2 v22.4s, v2.8h, v11.8h\n"
+        "smlal v23.4s, v2.4h, v12.4h\n"
+        "smlal2 v24.4s, v2.8h, v12.8h\n"
+        "smlal v21.4s, v3.4h, v13.4h\n"
+        "smlal2 v22.4s, v3.8h, v13.8h\n"
+        "smlal v23.4s, v3.4h, v14.4h\n"
+        "smlal2 v24.4s, v3.8h, v14.8h\n"
+        "smlal v21.4s, v4.4h, v14.4h\n"
+        "smlal2 v22.4s, v4.8h, v14.8h\n"
+        "smlal v23.4s, v4.4h, v15.4h\n"
+        "smlal2 v24.4s, v4.8h, v15.8h\n"
+        "smlal v21.4s, v5.4h, v15.4h\n"
+        "uaddw v16.8h, v26.8h, v16.8b\n"
+        "smlal2 v22.4s, v5.8h, v15.8h\n"
+        "smlal v23.4s, v5.4h, v16.4h\n"
+        "smlal2 v24.4s, v5.8h, v16.8h\n"
+        "smlal v21.4s, v6.4h, v17.4h\n"
+        "smlal2 v22.4s, v6.8h, v17.8h\n"
+        "smlal v23.4s, v6.4h, v18.4h\n"
+        "smlal2 v24.4s, v6.8h, v18.8h\n"
+        "smlal v21.4s, v7.4h, v18.4h\n"
+        "smlal2 v22.4s, v7.8h, v18.8h\n"
+        "smlal v23.4s, v7.4h, v19.4h\n"
+        "smlal2 v24.4s, v7.8h, v19.8h\n"
+        "smlal v21.4s, v8.4h, v19.4h\n"
+        "uaddw v20.8h, v26.8h, v20.8b\n"
+        "smlal2 v22.4s, v8.8h, v19.8h\n"
+        "smlal v23.4s, v8.4h, v20.4h\n"
+        "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+        "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+        "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+        "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+        "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+        "sqrshl v21.4s, v21.4s, v28.4s\n"
+        "sqrshl v22.4s, v22.4s, v28.4s\n"
+        "sqrshl v23.4s, v23.4s, v28.4s\n"
+        "sqrshl v24.4s, v24.4s, v28.4s\n"
+        "sqxtn v21.4h, v21.4s\n"
+        "sqxtn2 v21.8h, v22.4s\n"
+        "sqxtn v23.4h, v23.4s\n"
+        "sqxtn2 v23.8h, v24.4s\n"
+        "sqadd v21.8h, v21.8h, v29.8h\n"
+        "sqadd v23.8h, v23.8h, v29.8h\n"
+        "sqxtun v21.8b, v21.8h\n"
+        "sqxtun2 v21.16b, v23.8h\n"
+        "umax v21.16b, v21.16b, v30.16b\n"
+        "umin v21.16b, v21.16b, v31.16b\n"
+        "st1 {v21.8b}, [%[output_ptr]], x3\n"
+        "mov v23.d[0], v21.d[1]\n"
+        "st1 {v23.8b}, [%[output_ptr]], x3\n"
+        "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+        // Handle the bottom right output, if it exists.
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
+        "smlal v21.4s, v0.4h, v9.4h\n"
+        "smlal2 v22.4s, v0.8h, v9.8h\n"
+        "smlal v21.4s, v1.4h, v10.4h\n"
+        "smlal2 v22.4s, v1.8h, v10.8h\n"
+        "smlal v21.4s, v2.4h, v11.4h\n"
+        "smlal2 v22.4s, v2.8h, v11.8h\n"
+        "smlal v21.4s, v3.4h, v13.4h\n"
+        "smlal2 v22.4s, v3.8h, v13.8h\n"
+        "smlal v21.4s, v4.4h, v14.4h\n"
+        "smlal2 v22.4s, v4.8h, v14.8h\n"
+        "smlal v21.4s, v5.4h, v15.4h\n"
+        "smlal2 v22.4s, v5.8h, v15.8h\n"
+        "smlal v21.4s, v6.4h, v17.4h\n"
+        "smlal2 v22.4s, v6.8h, v17.8h\n"
+        "smlal v21.4s, v7.4h, v18.4h\n"
+        "smlal2 v22.4s, v7.8h, v18.8h\n"
+        "smlal v21.4s, v8.4h, v19.4h\n"
+        "smlal2 v22.4s, v8.8h, v19.8h\n"
+
+        "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+        "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+        "sqrshl v21.4s, v21.4s, v28.4s\n"
+        "sqrshl v22.4s, v22.4s, v28.4s\n"
+        "sqxtn v21.4h, v21.4s\n"
+        "sqxtn2 v21.8h, v22.4s\n"
+        "sqadd v21.8h, v21.8h, v29.8h\n"
+        "sqxtun v21.8b, v21.8h\n"
+        "umax v21.8b, v21.8b, v30.8b\n"
+        "umin v21.8b, v21.8b, v31.8b\n"
+        "st1 {v21.8b}, [%[output_ptr]]\n"
+        DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
+    :
+    // Outputs.
+    [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+    [output_ptr] "+r"(output_ptr),
+    [output_window_height] "+r"(output_window_height)
+    :
+    // Inputs.
+    [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
+    [input_depth] "r"(input_depth),
+    [output_window_width] "r"(output_window_width),
+    [input_width_increment] "r"(input_width_increment),
+    [input_height_increment] "r"(input_height_increment),
+    [output_height_increment] "r"(output_height_increment),
+    [params_ptr] "r"(params_ptr)
+    :
+    // Clobbers.
+    "cc", "memory",
+    // We use these NEON registers.
+    "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+    "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+    "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+    "v30", "v31",
+    // We use these general-purpose registers.
+    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+    "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
+  }
+};
+
+template <>
+struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
+                                     2> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
+    const int64_t input_width_increment = 4 * input_depth;
+    const int64_t input_height_increment = 4 * input_row_size;
+    const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+
+#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6"
+#define DEPTHWISECONV_LABEL_HEIGHT_1 "7"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11"
+
+    asm volatile(
+        // Performs depthwise convolutions for a window specified by
+        // |output_window_height| and |output_window_width|. The inner-most loop
+        // processes 2x2 outputs, and any leftovers at the end.
+        //
+        // Algorithm works as follows:
+        //
+        //   1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter
+        //      values.
+        //   2. For 2 output heights at a time:
+        //        i.  For 2 output widths at a time at stride 2, a 5x5 input
+        //            window is required. To avoid register exhaustion, we load
+        //            the first 2 rows of the 5x5 input window into registers
+        //            v9--v18, and use the same registers to load the next 2
+        //            rows, and finally v9--v13 to load the last row.
+        //            Accumulators for all 2x2 outputs are reserved by registers
+        //            v21-v22 (top left output), v23-v24 (top right output),
+        //            v19-v20 (bottom left output), v25-v26 (bottom right
+        //            output).
+        //        ii. Handle single leftover width if exists.
+        //   3. Handle single leftover height if exists.
+        //        i.  For 2 output widths at a time at stride 2, load inputs for
+        //            a 1x2 (1 height, 2 width) output window (3x5 input
+        //            window). Registers v9--v24 hold input values. Mul-add with
+        //            accumulators v24--v27.
+        //        ii. Handle single leftover width if exists.
+        //
+        // Loads are placed as soon as the register is no longer needed and
+        // interleaved with arithmetic operations to take advantage of
+        // dual-issue pipelines. We also add input offsets as far from the loads
+        // as possible to give loads enough cycles to fetch data from memory.
+
+        // Set "constant" registers. These registers may be replaced with temp
+        // values from time to time when there are not enough NEON registers.
+        // x9--x15 are caller-saved temporaries; x19/x20, also used below, are
+        // callee-saved and must be clobber-listed (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "cmp %w[output_window_height], #2\n"
+        "dup v28.8h, w0\n"
+        "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.4s, w9\n"
+        "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w1\n"
+        "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v29.8h, w2\n"
+        "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.16b, w3\n"
+        "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "dup v31.16b, w4\n"
+        "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
+        "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+
+        // Load filters and add offsets.
+        "add x10, %[bias_ptr], #16\n"
+        "ld1 {v0.8b}, [%[filter_ptr]], x5\n"
+        "dup v9.8h, w20\n"
+        "ld1 {v1.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v0.8h, v9.8h, v0.8b\n"
+        "ld1 {v2.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v1.8h, v9.8h, v1.8b\n"
+        "ld1 {v3.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v2.8h, v9.8h, v2.8b\n"
+        "ld1 {v4.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v3.8h, v9.8h, v3.8b\n"
+        "ld1 {v5.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v4.8h, v9.8h, v4.8b\n"
+        "ld1 {v6.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v5.8h, v9.8h, v5.8b\n"
+        "ld1 {v7.8b}, [%[filter_ptr]], x5\n"
+        "uaddw v6.8h, v9.8h, v6.8b\n"
+        "ld1 {v8.8b}, [%[filter_ptr]]\n"
+        "uaddw v7.8h, v9.8h, v7.8b\n"
+        "uaddw v8.8h, v9.8h, v8.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
+          // Load the first two rows of the 5x5 input window, then reuse the
+          // same registers to load subsequent rows as they become available.
+          "mov x11, %[input_ptr]\n"
+          "mov x12, x11\n"
+          "add x13, x12, %[input_row_size]\n"
+          "ld1 {v9.8b}, [x12], %[input_depth]\n"
+          "mov w14, %w[output_window_width]\n"
+          "ld1 {v10.8b}, [x12], %[input_depth]\n"
+          // The height 2 / width 2 loop loads an extra 1 output horizontally in
+          // anticipation of the next iteration. Make sure
+          // |output_window_width| is large enough to handle the additional
+          // load; otherwise, jump to the appropriate label to handle smaller
+          // widths.
+          "cmp w14, #2\n"
+          "ld1 {v11.8b}, [x12], %[input_depth]\n"
+          "add x15, x13, %[input_row_size]\n"
+          "ld1 {v14.8b}, [x13], %[input_depth]\n"
+          "mov x6, %[output_ptr]\n"
+          "ld1 {v15.8b}, [x13], %[input_depth]\n"
+          "add x7, %[output_ptr], x19\n"
+          "ld1 {v16.8b}, [x13], %[input_depth]\n"
+          "ld1 {v21.4s}, [%[bias_ptr]]\n"
+          "ld1 {v22.4s}, [x10]\n"
+          "ld1 {v23.4s}, [%[bias_ptr]]\n"
+          "uaddw v9.8h, v28.8h, v9.8b\n"
+          "ld1 {v24.4s}, [x10]\n"
+          "uaddw v10.8h, v28.8h, v10.8b\n"
+          "ld1 {v19.4s}, [%[bias_ptr]]\n"
+          "uaddw v11.8h, v28.8h, v11.8b\n"
+          "ld1 {v20.4s}, [x10]\n"
+          "uaddw v14.8h, v28.8h, v14.8b\n"
+          "ld1 {v25.4s}, [%[bias_ptr]]\n"
+          "uaddw v15.8h, v28.8h, v15.8b\n"
+          "ld1 {v26.4s}, [x10]\n"
+          "uaddw v16.8h, v28.8h, v16.8b\n"
+
+          "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
+          "cmp w14, #1\n"
+          "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+          //"loop_%=:\n"
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
+            "smlal v21.4s, v0.4h, v9.4h\n"
+            "ld1 {v12.8b}, [x12], %[input_depth]\n"
+            "smlal2 v22.4s, v0.8h, v9.8h\n"
+            "ld1 {v13.8b}, [x12]\n"
+            "add x12, x15, %[input_row_size]\n"
+            "smlal v23.4s, v0.4h, v11.4h\n"
+            "ld1 {v17.8b}, [x13], %[input_depth]\n"
+            "smlal2 v24.4s, v0.8h, v11.8h\n"
+            "ld1 {v18.8b}, [x13]\n"
+            "add x13, x12, %[input_row_size]\n"
+            "smlal v21.4s, v1.4h, v10.4h\n"
+            "ld1 {v9.8b}, [x15], %[input_depth]\n"
+            "smlal2 v22.4s, v1.8h, v10.8h\n"
+            "ld1 {v10.8b}, [x15], %[input_depth]\n"
+            "smlal v21.4s, v2.4h, v11.4h\n"
+            "smlal2 v22.4s, v2.8h, v11.8h\n"
+            "ld1 {v11.8b}, [x15], %[input_depth]\n"
+            "smlal v21.4s, v3.4h, v14.4h\n"
+            "smlal2 v22.4s, v3.8h, v14.8h\n"
+            "ld1 {v14.8b}, [x12], %[input_depth]\n"
+            "smlal v23.4s, v3.4h, v16.4h\n"
+            "subs w14, w14, #2\n"
+            "smlal2 v24.4s, v3.8h, v16.8h\n"
+            "cmp w14, #3\n"
+            "smlal v21.4s, v4.4h, v15.4h\n"
+            "uaddw v12.8h, v28.8h, v12.8b\n"
+            "smlal2 v22.4s, v4.8h, v15.8h\n"
+            "ld1 {v15.8b}, [x12], %[input_depth]\n"
+            "smlal v21.4s, v5.4h, v16.4h\n"
+            "uaddw v13.8h, v28.8h, v13.8b\n"
+            "smlal2 v22.4s, v5.8h, v16.8h\n"
+            "ld1 {v16.8b}, [x12], %[input_depth]\n"
+            "smlal v23.4s, v1.4h, v12.4h\n"
+            "uaddw v17.8h, v28.8h, v17.8b\n"
+            "smlal2 v24.4s, v1.8h, v12.8h\n"
+            "ld1 {v12.8b}, [x15], %[input_depth]\n"
+            "smlal v23.4s, v2.4h, v13.4h\n"
+            "uaddw v18.8h, v28.8h, v18.8b\n"
+            "smlal2 v24.4s, v2.8h, v13.8h\n"
+            "ld1 {v13.8b}, [x15]\n"
+            "smlal v23.4s, v4.4h, v17.4h\n"
+            "uaddw v9.8h, v28.8h, v9.8b\n"
+            "smlal2 v24.4s, v4.8h, v17.8h\n"
+            "ld1 {v17.8b}, [x12], %[input_depth]\n"
+            "smlal v23.4s, v5.4h, v18.4h\n"
+            "uaddw v10.8h, v28.8h, v10.8b\n"
+            "smlal2 v24.4s, v5.8h, v18.8h\n"
+            "ld1 {v18.8b}, [x12]\n"
+
+            "smlal v21.4s, v6.4h, v9.4h\n"
+            "smlal2 v22.4s, v6.8h, v9.8h\n"
+            "smlal v19.4s, v0.4h, v9.4h\n"
+            "uaddw v11.8h, v28.8h, v11.8b\n"
+            "smlal2 v20.4s, v0.8h, v9.8h\n"
+            "ld1 {v9.8b}, [x13], %[input_depth]\n"
+            "smlal v23.4s, v6.4h, v11.4h\n"
+            "smlal2 v24.4s, v6.8h, v11.8h\n"
+            "smlal v21.4s, v7.4h, v10.4h\n"
+            "smlal2 v22.4s, v7.8h, v10.8h\n"
+            "uaddw v12.8h, v28.8h, v12.8b\n"
+            "smlal v19.4s, v1.4h, v10.4h\n"
+            "smlal2 v20.4s, v1.8h, v10.8h\n"
+            "ld1 {v10.8b}, [x13], %[input_depth]\n"
+            "smlal v23.4s, v7.4h, v12.4h\n"
+            "smlal2 v24.4s, v7.8h, v12.8h\n"
+            "smlal v25.4s, v1.4h, v12.4h\n"
+            "smlal2 v26.4s, v1.8h, v12.8h\n"
+            "smlal v21.4s, v8.4h, v11.4h\n"
+            "smlal2 v22.4s, v8.8h, v11.8h\n"
+            "add x11, x11, %[input_width_increment]\n"
+            "smlal v19.4s, v2.4h, v11.4h\n"
+            "mov x12, x11\n"
+            "smlal2 v20.4s, v2.8h, v11.8h\n"
+            "uaddw v13.8h, v28.8h, v13.8b\n"
+            "smlal v25.4s, v0.4h, v11.4h\n"
+            "smlal2 v26.4s, v0.8h, v11.8h\n"
+            "ld1 {v11.8b}, [x13], %[input_depth]\n"
+            "smlal v23.4s, v8.4h, v13.4h\n"
+            "ld1 {v12.8b}, [x13], %[input_depth]\n"
+            "smlal2 v24.4s, v8.8h, v13.8h\n"
+            "smlal v25.4s, v2.4h, v13.4h\n"
+            "smlal2 v26.4s, v2.8h, v13.8h\n"
+            "ld1 {v13.8b}, [x13]\n"
+            "add x13, x12, %[input_row_size]\n"
+            "add x15, x13, %[input_row_size]\n"
+
+            "dup v28.4s, w9\n"
+            "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+            "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+            "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+            "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+            "sqrshl v21.4s, v21.4s, v28.4s\n"
+            "sqrshl v22.4s, v22.4s, v28.4s\n"
+            "sqrshl v23.4s, v23.4s, v28.4s\n"
+            "sqrshl v24.4s, v24.4s, v28.4s\n"
+            "dup v28.8h, w0\n"
+            "sqxtn v21.4h, v21.4s\n"
+            "sqxtn2 v21.8h, v22.4s\n"
+            "sqxtn v23.4h, v23.4s\n"
+            "sqxtn2 v23.8h, v24.4s\n"
+            "sqadd v21.8h, v21.8h, v29.8h\n"
+            "sqadd v23.8h, v23.8h, v29.8h\n"
+            "sqxtun v21.8b, v21.8h\n"
+            "sqxtun2 v21.16b, v23.8h\n"
+            "ld1 {v22.4s}, [x10]\n"
+            "umax v21.16b, v21.16b, v30.16b\n"
+            "umin v21.16b, v21.16b, v31.16b\n"
+            "ld1 {v24.4s}, [x10]\n"
+            "uaddw v9.8h, v28.8h, v9.8b\n"
+            "st1 {v21.8b}, [x6], x5\n"
+            "uaddw v10.8h, v28.8h, v10.8b\n"
+            "mov v23.d[0], v21.d[1]\n"
+            "st1 {v23.8b}, [x6], x5\n"
+            "uaddw v11.8h, v28.8h, v11.8b\n"
+
+            "smlal v19.4s, v6.4h, v9.4h\n"
+            "smlal2 v20.4s, v6.8h, v9.8h\n"
+            "ld1 {v9.8b}, [x12], %[input_depth]\n"
+            "smlal v25.4s, v6.4h, v11.4h\n"
+            "smlal2 v26.4s, v6.8h, v11.8h\n"
+            "smlal v19.4s, v7.4h, v10.4h\n"
+            "uaddw v12.8h, v28.8h, v12.8b\n"
+            "smlal2 v20.4s, v7.8h, v10.8h\n"
+            "ld1 {v10.8b}, [x12], %[input_depth]\n"
+            "smlal v25.4s, v7.4h, v12.4h\n"
+            "smlal2 v26.4s, v7.8h, v12.8h\n"
+            "smlal v19.4s, v8.4h, v11.4h\n"
+            "uaddw v13.8h, v28.8h, v13.8b\n"
+            "smlal2 v20.4s, v8.8h, v11.8h\n"
+            "ld1 {v11.8b}, [x12], %[input_depth]\n"
+            "smlal v25.4s, v8.4h, v13.4h\n"
+            "uaddw v14.8h, v28.8h, v14.8b\n"
+            "smlal2 v26.4s, v8.8h, v13.8h\n"
+            "uaddw v16.8h, v28.8h, v16.8b\n"
+            "smlal v19.4s, v3.4h, v14.4h\n"
+            "uaddw v15.8h, v28.8h, v15.8b\n"
+            "smlal2 v20.4s, v3.8h, v14.8h\n"
+            "ld1 {v14.8b}, [x13], %[input_depth]\n"
+            "smlal v25.4s, v3.4h, v16.4h\n"
+            "ld1 {v21.4s}, [%[bias_ptr]]\n"
+            "smlal2 v26.4s, v3.8h, v16.8h\n"
+            "ld1 {v23.4s}, [%[bias_ptr]]\n"
+            "smlal v19.4s, v4.4h, v15.4h\n"
+            "uaddw v17.8h, v28.8h, v17.8b\n"
+            "smlal2 v20.4s, v4.8h, v15.8h\n"
+            "ld1 {v15.8b}, [x13], %[input_depth]\n"
+            "smlal v25.4s, v4.4h, v17.4h\n"
+            "smlal2 v26.4s, v4.8h, v17.8h\n"
+            "smlal v19.4s, v5.4h, v16.4h\n"
+            "uaddw v18.8h, v28.8h, v18.8b\n"
+            "smlal2 v20.4s, v5.8h, v16.8h\n"
+            "ld1 {v16.8b}, [x13], %[input_depth]\n"
+            "smlal v25.4s, v5.4h, v18.4h\n"
+            "smlal2 v26.4s, v5.8h, v18.8h\n"
+
+            "dup v28.4s, w9\n"
+            "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+            "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+            "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+            "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+            "sqrshl v19.4s, v19.4s, v28.4s\n"
+            "sqrshl v20.4s, v20.4s, v28.4s\n"
+            "sqrshl v25.4s, v25.4s, v28.4s\n"
+            "sqrshl v26.4s, v26.4s, v28.4s\n"
+            "dup v28.8h, w0\n"
+            "sqxtn v19.4h, v19.4s\n"
+            "sqxtn2 v19.8h, v20.4s\n"
+            "sqxtn v25.4h, v25.4s\n"
+            "sqxtn2 v25.8h, v26.4s\n"
+            "sqadd v19.8h, v19.8h, v29.8h\n"
+            "sqadd v25.8h, v25.8h, v29.8h\n"
+            "sqxtun v19.8b, v19.8h\n"
+            "sqxtun2 v19.16b, v25.8h\n"
+            "ld1 {v20.4s}, [x10]\n"
+            "umax v19.16b, v19.16b, v30.16b\n"
+            "umin v19.16b, v19.16b, v31.16b\n"
+            "ld1 {v26.4s}, [x10]\n"
+            "uaddw v9.8h, v28.8h, v9.8b\n"
+            "st1 {v19.8b}, [x7], x5\n"
+            "uaddw v10.8h, v28.8h, v10.8b\n"
+            "mov v25.d[0], v19.d[1]\n"
+            "st1 {v25.8b}, [x7], x5\n"
+            "uaddw v11.8h, v28.8h, v11.8b\n"
+            "ld1 {v19.4s}, [%[bias_ptr]]\n"
+            "uaddw v14.8h, v28.8h, v14.8b\n"
+            "ld1 {v25.4s}, [%[bias_ptr]]\n"
+            "uaddw v15.8h, v28.8h, v15.8b\n"
+            "uaddw v16.8h, v28.8h, v16.8b\n"
+
+            "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
+
+          // At this point, either a 2-width or a 1-width leftover remains,
+          // not both.
+          "cmp w14, #2\n"
+          "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+          // Handle the last 2 columns, if they exist.
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
+          "smlal v21.4s, v0.4h, v9.4h\n"
+          "ld1 {v12.8b}, [x12], %[input_depth]\n"
+          "smlal2 v22.4s, v0.8h, v9.8h\n"
+          "ld1 {v13.8b}, [x12]\n"
+          "add x12, x15, %[input_row_size]\n"
+          "smlal v23.4s, v0.4h, v11.4h\n"
+          "ld1 {v17.8b}, [x13], %[input_depth]\n"
+          "smlal2 v24.4s, v0.8h, v11.8h\n"
+          "ld1 {v18.8b}, [x13]\n"
+          "add x13, x12, %[input_row_size]\n"
+          "smlal v21.4s, v1.4h, v10.4h\n"
+          "ld1 {v9.8b}, [x15], %[input_depth]\n"
+          "smlal2 v22.4s, v1.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x15], %[input_depth]\n"
+          "smlal v21.4s, v2.4h, v11.4h\n"
+          "smlal2 v22.4s, v2.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x15], %[input_depth]\n"
+          "smlal v21.4s, v3.4h, v14.4h\n"
+          "smlal2 v22.4s, v3.8h, v14.8h\n"
+          "ld1 {v14.8b}, [x12], %[input_depth]\n"
+          "smlal v23.4s, v3.4h, v16.4h\n"
+          "smlal2 v24.4s, v3.8h, v16.8h\n"
+          "smlal v21.4s, v4.4h, v15.4h\n"
+          "uaddw v12.8h, v28.8h, v12.8b\n"
+          "smlal2 v22.4s, v4.8h, v15.8h\n"
+          "ld1 {v15.8b}, [x12], %[input_depth]\n"
+          "smlal v21.4s, v5.4h, v16.4h\n"
+          "uaddw v13.8h, v28.8h, v13.8b\n"
+          "smlal2 v22.4s, v5.8h, v16.8h\n"
+          "ld1 {v16.8b}, [x12], %[input_depth]\n"
+          "smlal v23.4s, v1.4h, v12.4h\n"
+          "uaddw v17.8h, v28.8h, v17.8b\n"
+          "smlal2 v24.4s, v1.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x15], %[input_depth]\n"
+          "smlal v23.4s, v2.4h, v13.4h\n"
+          "uaddw v18.8h, v28.8h, v18.8b\n"
+          "smlal2 v24.4s, v2.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x15]\n"
+          "smlal v23.4s, v4.4h, v17.4h\n"
+          "uaddw v9.8h, v28.8h, v9.8b\n"
+          "smlal2 v24.4s, v4.8h, v17.8h\n"
+          "ld1 {v17.8b}, [x12], %[input_depth]\n"
+          "smlal v23.4s, v5.4h, v18.4h\n"
+          "uaddw v10.8h, v28.8h, v10.8b\n"
+          "smlal2 v24.4s, v5.8h, v18.8h\n"
+          "ld1 {v18.8b}, [x12]\n"
+
+          "smlal v21.4s, v6.4h, v9.4h\n"
+          "smlal2 v22.4s, v6.8h, v9.8h\n"
+          "smlal v19.4s, v0.4h, v9.4h\n"
+          "uaddw v11.8h, v28.8h, v11.8b\n"
+          "smlal2 v20.4s, v0.8h, v9.8h\n"
+          "ld1 {v9.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v6.4h, v11.4h\n"
+          "smlal2 v24.4s, v6.8h, v11.8h\n"
+          "smlal v21.4s, v7.4h, v10.4h\n"
+          "smlal2 v22.4s, v7.8h, v10.8h\n"
+          "uaddw v12.8h, v28.8h, v12.8b\n"
+          "smlal v19.4s, v1.4h, v10.4h\n"
+          "smlal2 v20.4s, v1.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v7.4h, v12.4h\n"
+          "smlal2 v24.4s, v7.8h, v12.8h\n"
+          "smlal v25.4s, v1.4h, v12.4h\n"
+          "smlal2 v26.4s, v1.8h, v12.8h\n"
+          "smlal v21.4s, v8.4h, v11.4h\n"
+          "smlal2 v22.4s, v8.8h, v11.8h\n"
+          "smlal v19.4s, v2.4h, v11.4h\n"
+          "smlal2 v20.4s, v2.8h, v11.8h\n"
+          "uaddw v13.8h, v28.8h, v13.8b\n"
+          "smlal v25.4s, v0.4h, v11.4h\n"
+          "smlal2 v26.4s, v0.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x13], %[input_depth]\n"
+          "smlal v23.4s, v8.4h, v13.4h\n"
+          "ld1 {v12.8b}, [x13], %[input_depth]\n"
+          "smlal2 v24.4s, v8.8h, v13.8h\n"
+          "smlal v25.4s, v2.4h, v13.4h\n"
+          "smlal2 v26.4s, v2.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x13]\n"
+
+          "dup v28.4s, w9\n"
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "sqrshl v22.4s, v22.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
+          "dup v28.8h, w0\n"
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "sqxtun2 v21.16b, v23.8h\n"
+          "ld1 {v22.4s}, [x10]\n"
+          "umax v21.16b, v21.16b, v30.16b\n"
+          "umin v21.16b, v21.16b, v31.16b\n"
+          "ld1 {v24.4s}, [x10]\n"
+          "uaddw v9.8h, v28.8h, v9.8b\n"
+          "st1 {v21.8b}, [x6], x5\n"
+          "uaddw v10.8h, v28.8h, v10.8b\n"
+          "mov v23.d[0], v21.d[1]\n"
+          "st1 {v23.8b}, [x6]\n"
+          "uaddw v11.8h, v28.8h, v11.8b\n"
+
+          "smlal v19.4s, v6.4h, v9.4h\n"
+          "smlal2 v20.4s, v6.8h, v9.8h\n"
+          "smlal v25.4s, v6.4h, v11.4h\n"
+          "smlal2 v26.4s, v6.8h, v11.8h\n"
+          "smlal v19.4s, v7.4h, v10.4h\n"
+          "uaddw v12.8h, v28.8h, v12.8b\n"
+          "smlal2 v20.4s, v7.8h, v10.8h\n"
+          "smlal v25.4s, v7.4h, v12.4h\n"
+          "smlal2 v26.4s, v7.8h, v12.8h\n"
+          "smlal v19.4s, v8.4h, v11.4h\n"
+          "uaddw v13.8h, v28.8h, v13.8b\n"
+          "smlal2 v20.4s, v8.8h, v11.8h\n"
+          "smlal v25.4s, v8.4h, v13.4h\n"
+          "uaddw v14.8h, v28.8h, v14.8b\n"
+          "smlal2 v26.4s, v8.8h, v13.8h\n"
+          "uaddw v16.8h, v28.8h, v16.8b\n"
+          "smlal v19.4s, v3.4h, v14.4h\n"
+          "uaddw v15.8h, v28.8h, v15.8b\n"
+          "smlal2 v20.4s, v3.8h, v14.8h\n"
+          "smlal v25.4s, v3.4h, v16.4h\n"
+          "smlal2 v26.4s, v3.8h, v16.8h\n"
+          "smlal v19.4s, v4.4h, v15.4h\n"
+          "uaddw v17.8h, v28.8h, v17.8b\n"
+          "smlal2 v20.4s, v4.8h, v15.8h\n"
+          "smlal v25.4s, v4.4h, v17.4h\n"
+          "smlal2 v26.4s, v4.8h, v17.8h\n"
+          "smlal v19.4s, v5.4h, v16.4h\n"
+          "uaddw v18.8h, v28.8h, v18.8b\n"
+          "smlal2 v20.4s, v5.8h, v16.8h\n"
+          "smlal v25.4s, v5.4h, v18.4h\n"
+          "smlal2 v26.4s, v5.8h, v18.8h\n"
+
+          "dup v28.4s, w9\n"
+          "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+          "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+          "sqrshl v19.4s, v19.4s, v28.4s\n"
+          "sqrshl v20.4s, v20.4s, v28.4s\n"
+          "sqrshl v25.4s, v25.4s, v28.4s\n"
+          "sqrshl v26.4s, v26.4s, v28.4s\n"
+          "dup v28.8h, w0\n"
+          "sqxtn v19.4h, v19.4s\n"
+          "sqxtn2 v19.8h, v20.4s\n"
+          "sqxtn v25.4h, v25.4s\n"
+          "sqxtn2 v25.8h, v26.4s\n"
+          "sqadd v19.8h, v19.8h, v29.8h\n"
+          "sqadd v25.8h, v25.8h, v29.8h\n"
+          "sqxtun v19.8b, v19.8h\n"
+          "sqxtun2 v19.16b, v25.8h\n"
+          "umax v19.16b, v19.16b, v30.16b\n"
+          "umin v19.16b, v19.16b, v31.16b\n"
+          "st1 {v19.8b}, [x7], x5\n"
+          "mov v25.d[0], v19.d[1]\n"
+          "st1 {v25.8b}, [x7]\n"
+          "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
+
+          // Handle the last column, if it exists.
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
+          // Registers v9, v10, v11, v14, v15, and v16 have already been loaded
+          // with the correct values at this point. This corresponds to the
+          // first two input rows of the top left output. Now load the last
+          // input row for this output. Once these inputs are no longer needed,
+          // load the input rows for the bottom left output.
+          "add x12, x15, %[input_row_size]\n"
+          "add x13, x12, %[input_row_size]\n"
+
+          "ld1 {v12.8b}, [x15], %[input_depth]\n"
+          "smlal v21.4s, v0.4h, v9.4h\n"
+          "ld1 {v13.8b}, [x15], %[input_depth]\n"
+          "smlal2 v22.4s, v0.8h, v9.8h\n"
+          "ld1 {v17.8b}, [x15]\n"
+          "smlal v21.4s, v1.4h, v10.4h\n"
+          "ld1 {v9.8b}, [x12], %[input_depth]\n"
+          "smlal2 v22.4s, v1.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x12], %[input_depth]\n"
+          "smlal v21.4s, v2.4h, v11.4h\n"
+          "smlal2 v22.4s, v2.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x12]\n"
+          "smlal v21.4s, v3.4h, v14.4h\n"
+          "smlal2 v22.4s, v3.8h, v14.8h\n"
+          "ld1 {v14.8b}, [x13], %[input_depth]\n"
+          "smlal v21.4s, v4.4h, v15.4h\n"
+          "smlal2 v22.4s, v4.8h, v15.8h\n"
+          "ld1 {v15.8b}, [x13], %[input_depth]\n"
+          "smlal v21.4s, v5.4h, v16.4h\n"
+          "uaddw v12.8h, v28.8h, v12.8b\n"
+          "smlal2 v22.4s, v5.8h, v16.8h\n"
+          "uaddw v13.8h, v28.8h, v13.8b\n"
+          "ld1 {v16.8b}, [x13]\n"
+
+          "smlal v21.4s, v6.4h, v12.4h\n"
+          "smlal2 v22.4s, v6.8h, v12.8h\n"
+          "smlal v23.4s, v0.4h, v12.4h\n"
+          "uaddw v17.8h, v28.8h, v17.8b\n"
+          "smlal2 v24.4s, v0.8h, v12.8h\n"
+          "smlal v21.4s, v7.4h, v13.4h\n"
+          "smlal2 v22.4s, v7.8h, v13.8h\n"
+          "smlal v23.4s, v1.4h, v13.4h\n"
+          "smlal2 v24.4s, v1.8h, v13.8h\n"
+          "smlal v21.4s, v8.4h, v17.4h\n"
+          "smlal2 v22.4s, v8.8h, v17.8h\n"
+          "smlal v23.4s, v2.4h, v17.4h\n"
+          "smlal2 v24.4s, v2.8h, v17.8h\n"
+
+          "dup v26.4s, w9\n"
+          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "sqrshl v21.4s, v21.4s, v26.4s\n"
+          "sqrshl v22.4s, v22.4s, v26.4s\n"
+          "sqxtn v21.4h, v21.4s\n"
+          "sqxtn2 v21.8h, v22.4s\n"
+          "sqadd v21.8h, v21.8h, v29.8h\n"
+          "sqxtun v21.8b, v21.8h\n"
+          "umax v21.8b, v21.8b, v30.8b\n"
+          "umin v21.8b, v21.8b, v31.8b\n"
+          "uaddw v9.8h, v28.8h, v9.8b\n"
+          "st1 {v21.8b}, [x6]\n"
+          "uaddw v10.8h, v28.8h, v10.8b\n"
+
+          "smlal v23.4s, v3.4h, v9.4h\n"
+          "uaddw v11.8h, v28.8h, v11.8b\n"
+          "smlal2 v24.4s, v3.8h, v9.8h\n"
+          "uaddw v14.8h, v28.8h, v14.8b\n"
+          "smlal v23.4s, v4.4h, v10.4h\n"
+          "uaddw v15.8h, v28.8h, v15.8b\n"
+          "smlal2 v24.4s, v4.8h, v10.8h\n"
+          "uaddw v16.8h, v28.8h, v16.8b\n"
+          "smlal v23.4s, v5.4h, v11.4h\n"
+          "smlal2 v24.4s, v5.8h, v11.8h\n"
+
+          "smlal v23.4s, v6.4h, v14.4h\n"
+          "smlal2 v24.4s, v6.8h, v14.8h\n"
+          "smlal v23.4s, v7.4h, v15.4h\n"
+          "smlal2 v24.4s, v7.8h, v15.8h\n"
+          "smlal v23.4s, v8.4h, v16.4h\n"
+          "smlal2 v24.4s, v8.8h, v16.8h\n"
+
+          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "sqrshl v23.4s, v23.4s, v26.4s\n"
+          "sqrshl v24.4s, v24.4s, v26.4s\n"
+          "sqxtn v23.4h, v23.4s\n"
+          "sqxtn2 v23.8h, v24.4s\n"
+          "sqadd v23.8h, v23.8h, v29.8h\n"
+          "sqxtun v23.8b, v23.8h\n"
+          "umax v23.8b, v23.8b, v30.8b\n"
+          "umin v23.8b, v23.8b, v31.8b\n"
+          "st1 {v23.8b}, [x7]\n"
+
+          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
+          "subs %w[output_window_height], %w[output_window_height], #2\n"
+          "add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
+          "cmp %w[output_window_height], #2\n"
+          "add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
+          "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
+
+        DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
+        "cmp %w[output_window_height], #1\n"
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+        DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
+        "mov x11, %[input_ptr]\n"
+        "mov x12, x11\n"
+        "add x13, x12, %[input_row_size]\n"
+        "ld1 {v9.8b}, [x12], %[input_depth]\n"
+        "add x15, x13, %[input_row_size]\n"
+        "ld1 {v10.8b}, [x12], %[input_depth]\n"
+        "mov x6, %[output_ptr]\n"
+        "ld1 {v11.8b}, [x12], %[input_depth]\n"
+        "mov w14, %w[output_window_width]\n"
+        // The height 1 / width 2 loop loads an extra 1x1 output in anticipation
+        // for the next iteration. Make sure |output_window_width| is large
+        // enough to handle the additional load, otherwise jump to the
+        // appropriate label to handle smaller widths.
+        "cmp w14, #2\n"
+        "ld1 {v12.8b}, [x13], %[input_depth]\n"
+        "ld1 {v13.8b}, [x13], %[input_depth]\n"
+        "ld1 {v14.8b}, [x13], %[input_depth]\n"
+        "ld1 {v15.8b}, [x15], %[input_depth]\n"
+        "ld1 {v16.8b}, [x15], %[input_depth]\n"
+        "ld1 {v17.8b}, [x15], %[input_depth]\n"
+
+        "uaddw v9.8h, v28.8h, v9.8b\n"
+        "ld1 {v24.4s}, [%[bias_ptr]]\n"
+        "uaddw v10.8h, v28.8h, v10.8b\n"
+        "ld1 {v25.4s}, [x10]\n"
+        "uaddw v11.8h, v28.8h, v11.8b\n"
+        "ld1 {v26.4s}, [%[bias_ptr]]\n"
+        "ld1 {v27.4s}, [x10]\n"
+        "uaddw v12.8h, v28.8h, v12.8b\n"
+        "uaddw v13.8h, v28.8h, v13.8b\n"
+        "uaddw v14.8h, v28.8h, v14.8b\n"
+        "uaddw v15.8h, v28.8h, v15.8b\n"
+        "uaddw v16.8h, v28.8h, v16.8b\n"
+        "uaddw v17.8h, v28.8h, v17.8b\n"
+
+        "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
+        "cmp w14, #1\n"
+        "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
+          "smlal v24.4s, v0.4h, v9.4h\n"
+          "ld1 {v18.8b}, [x12], %[input_depth]\n"
+          "smlal2 v25.4s, v0.8h, v9.8h\n"
+          "ld1 {v19.8b}, [x12]\n"
+          "smlal v26.4s, v0.4h, v11.4h\n"
+          "ld1 {v20.8b}, [x13], %[input_depth]\n"
+          "smlal2 v27.4s, v0.8h, v11.8h\n"
+          "ld1 {v21.8b}, [x13]\n"
+          "smlal v24.4s, v1.4h, v10.4h\n"
+          "ld1 {v22.8b}, [x15], %[input_depth]\n"
+          "smlal2 v25.4s, v1.8h, v10.8h\n"
+          "ld1 {v23.8b}, [x15]\n"
+          "smlal v24.4s, v2.4h, v11.4h\n"
+          "subs w14, w14, #2\n"
+          "smlal2 v25.4s, v2.8h, v11.8h\n"
+          "cmp w14, #3\n"
+          "smlal v24.4s, v3.4h, v12.4h\n"
+          "add x11, x11, %[input_width_increment]\n"
+          "smlal2 v25.4s, v3.8h, v12.8h\n"
+          "mov x12, x11\n"
+          "smlal v26.4s, v3.4h, v14.4h\n"
+          "add x13, x12, %[input_row_size]\n"
+          "smlal2 v27.4s, v3.8h, v14.8h\n"
+          "add x15, x13, %[input_row_size]\n"
+          "smlal v24.4s, v4.4h, v13.4h\n"
+          "ld1 {v9.8b}, [x12], %[input_depth]\n"
+          "smlal2 v25.4s, v4.8h, v13.8h\n"
+          "ld1 {v10.8b}, [x12], %[input_depth]\n"
+          "smlal v24.4s, v5.4h, v14.4h\n"
+          "ld1 {v11.8b}, [x12], %[input_depth]\n"
+          "smlal2 v25.4s, v5.8h, v14.8h\n"
+          "ld1 {v12.8b}, [x13], %[input_depth]\n"
+          "smlal v24.4s, v6.4h, v15.4h\n"
+          "ld1 {v13.8b}, [x13], %[input_depth]\n"
+          "smlal2 v25.4s, v6.8h, v15.8h\n"
+          "ld1 {v14.8b}, [x13], %[input_depth]\n"
+          "smlal v26.4s, v6.4h, v17.4h\n"
+          "ld1 {v15.8b}, [x15], %[input_depth]\n"
+          "smlal2 v27.4s, v6.8h, v17.8h\n"
+          "smlal v24.4s, v7.4h, v16.4h\n"
+          "smlal2 v25.4s, v7.8h, v16.8h\n"
+          "ld1 {v16.8b}, [x15], %[input_depth]\n"
+          "smlal v24.4s, v8.4h, v17.4h\n"
+          "uaddw v18.8h, v28.8h, v18.8b\n"
+          "smlal2 v25.4s, v8.8h, v17.8h\n"
+          "ld1 {v17.8b}, [x15], %[input_depth]\n"
+          "uaddw v19.8h, v28.8h, v19.8b\n"
+
+          "smlal v26.4s, v1.4h, v18.4h\n"
+          "uaddw v20.8h, v28.8h, v20.8b\n"
+          "smlal2 v27.4s, v1.8h, v18.8h\n"
+          "smlal v26.4s, v2.4h, v19.4h\n"
+          "uaddw v21.8h, v28.8h, v21.8b\n"
+          "smlal2 v27.4s, v2.8h, v19.8h\n"
+          "smlal v26.4s, v4.4h, v20.4h\n"
+          "smlal v26.4s, v5.4h, v21.4h\n"
+          "smlal2 v27.4s, v4.8h, v20.8h\n"
+          "uaddw v22.8h, v28.8h, v22.8b\n"
+          "smlal2 v27.4s, v5.8h, v21.8h\n"
+          "uaddw v23.8h, v28.8h, v23.8b\n"
+          "smlal v26.4s, v7.4h, v22.4h\n"
+          "smlal2 v27.4s, v7.8h, v22.8h\n"
+          "smlal v26.4s, v8.4h, v23.4h\n"
+          "smlal2 v27.4s, v8.8h, v23.8h\n"
+
+          "dup v28.4s, w1\n"
+          "dup v29.4s, w9\n"
+          "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+          "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+          "dup v28.8h, w2\n"
+          "sqrshl v24.4s, v24.4s, v29.4s\n"
+          "sqrshl v25.4s, v25.4s, v29.4s\n"
+          "sqrshl v26.4s, v26.4s, v29.4s\n"
+          "sqrshl v27.4s, v27.4s, v29.4s\n"
+          "sqxtn v24.4h, v24.4s\n"
+          "sqxtn2 v24.8h, v25.4s\n"
+          "sqxtn v26.4h, v26.4s\n"
+          "sqxtn2 v26.8h, v27.4s\n"
+          "sqadd v24.8h, v24.8h, v28.8h\n"
+          "sqadd v26.8h, v26.8h, v28.8h\n"
+          "sqxtun v24.8b, v24.8h\n"
+          "sqxtun2 v24.16b, v26.8h\n"
+          "dup v28.8h, w0\n"
+          "ld1 {v25.4s}, [x10]\n"
+          "umax v24.16b, v24.16b, v30.16b\n"
+          "umin v24.16b, v24.16b, v31.16b\n"
+          "ld1 {v27.4s}, [x10]\n"
+          "uaddw v9.8h, v28.8h, v9.8b\n"
+          "st1 {v24.8b}, [x6], x5\n"
+          "uaddw v10.8h, v28.8h, v10.8b\n"
+          "mov v26.d[0], v24.d[1]\n"
+          "st1 {v26.8b}, [x6], x5\n"
+          "uaddw v11.8h, v28.8h, v11.8b\n"
+          "uaddw v12.8h, v28.8h, v12.8b\n"
+          "uaddw v13.8h, v28.8h, v13.8b\n"
+          "uaddw v14.8h, v28.8h, v14.8b\n"
+          "ld1 {v24.4s}, [%[bias_ptr]]\n"
+          "uaddw v15.8h, v28.8h, v15.8b\n"
+          "ld1 {v26.4s}, [%[bias_ptr]]\n"
+          "uaddw v16.8h, v28.8h, v16.8b\n"
+          "uaddw v17.8h, v28.8h, v17.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
+
+        // At this point, there will be one of 2 width or 1 width leftover,
+        // not both.
+        "cmp w14, #2\n"
+        "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+        // Handle last two horizontal outputs if exists.
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
+        "smlal v24.4s, v0.4h, v9.4h\n"
+        "ld1 {v18.8b}, [x12], %[input_depth]\n"
+        "smlal2 v25.4s, v0.8h, v9.8h\n"
+        "ld1 {v19.8b}, [x12]\n"
+        "smlal v26.4s, v0.4h, v11.4h\n"
+        "ld1 {v20.8b}, [x13], %[input_depth]\n"
+        "smlal2 v27.4s, v0.8h, v11.8h\n"
+        "ld1 {v21.8b}, [x13]\n"
+        "smlal v24.4s, v1.4h, v10.4h\n"
+        "ld1 {v22.8b}, [x15], %[input_depth]\n"
+        "smlal2 v25.4s, v1.8h, v10.8h\n"
+        "ld1 {v23.8b}, [x15]\n"
+        "smlal v24.4s, v2.4h, v11.4h\n"
+        "smlal2 v25.4s, v2.8h, v11.8h\n"
+        "smlal v24.4s, v3.4h, v12.4h\n"
+        "smlal2 v25.4s, v3.8h, v12.8h\n"
+        "smlal v26.4s, v3.4h, v14.4h\n"
+        "smlal2 v27.4s, v3.8h, v14.8h\n"
+        "smlal v24.4s, v4.4h, v13.4h\n"
+        "smlal2 v25.4s, v4.8h, v13.8h\n"
+        "smlal v24.4s, v5.4h, v14.4h\n"
+        "smlal2 v25.4s, v5.8h, v14.8h\n"
+        "smlal v24.4s, v6.4h, v15.4h\n"
+        "smlal2 v25.4s, v6.8h, v15.8h\n"
+        "smlal v26.4s, v6.4h, v17.4h\n"
+        "smlal2 v27.4s, v6.8h, v17.8h\n"
+        "smlal v24.4s, v7.4h, v16.4h\n"
+        "smlal2 v25.4s, v7.8h, v16.8h\n"
+        "smlal v24.4s, v8.4h, v17.4h\n"
+        "uaddw v18.8h, v28.8h, v18.8b\n"
+        "smlal2 v25.4s, v8.8h, v17.8h\n"
+        "uaddw v19.8h, v28.8h, v19.8b\n"
+
+        "smlal v26.4s, v1.4h, v18.4h\n"
+        "uaddw v20.8h, v28.8h, v20.8b\n"
+        "smlal2 v27.4s, v1.8h, v18.8h\n"
+        "smlal v26.4s, v2.4h, v19.4h\n"
+        "uaddw v21.8h, v28.8h, v21.8b\n"
+        "smlal2 v27.4s, v2.8h, v19.8h\n"
+        "smlal v26.4s, v4.4h, v20.4h\n"
+        "smlal v26.4s, v5.4h, v21.4h\n"
+        "smlal2 v27.4s, v4.8h, v20.8h\n"
+        "uaddw v22.8h, v28.8h, v22.8b\n"
+        "smlal2 v27.4s, v5.8h, v21.8h\n"
+        "uaddw v23.8h, v28.8h, v23.8b\n"
+        "smlal v26.4s, v7.4h, v22.4h\n"
+        "smlal2 v27.4s, v7.8h, v22.8h\n"
+        "smlal v26.4s, v8.4h, v23.4h\n"
+        "smlal2 v27.4s, v8.8h, v23.8h\n"
+
+        "dup v28.4s, w1\n"
+        "dup v29.4s, w9\n"
+        "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+        "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+        "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+        "dup v28.8h, w2\n"
+        "sqrshl v24.4s, v24.4s, v29.4s\n"
+        "sqrshl v25.4s, v25.4s, v29.4s\n"
+        "sqrshl v26.4s, v26.4s, v29.4s\n"
+        "sqrshl v27.4s, v27.4s, v29.4s\n"
+        "sqxtn v24.4h, v24.4s\n"
+        "sqxtn2 v24.8h, v25.4s\n"
+        "sqxtn v26.4h, v26.4s\n"
+        "sqxtn2 v26.8h, v27.4s\n"
+        "sqadd v24.8h, v24.8h, v28.8h\n"
+        "sqadd v26.8h, v26.8h, v28.8h\n"
+        "sqxtun v24.8b, v24.8h\n"
+        "sqxtun2 v24.16b, v26.8h\n"
+        "dup v28.8h, w0\n"
+        "umax v24.16b, v24.16b, v30.16b\n"
+        "umin v24.16b, v24.16b, v31.16b\n"
+        "st1 {v24.8b}, [x6], x5\n"
+        "mov v26.d[0], v24.d[1]\n"
+        "st1 {v26.8b}, [x6]\n"
+        "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+        // Handle bottom right output if exists.
+        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
+        "dup v26.4s, w9\n"
+        "dup v27.4s, w1\n"
+        "dup v29.8h, w2\n"
+
+        "smlal v24.4s, v0.4h, v9.4h\n"
+        "smlal2 v25.4s, v0.8h, v9.8h\n"
+        "smlal v24.4s, v1.4h, v10.4h\n"
+        "smlal2 v25.4s, v1.8h, v10.8h\n"
+        "smlal v24.4s, v2.4h, v11.4h\n"
+        "smlal2 v25.4s, v2.8h, v11.8h\n"
+        "smlal v24.4s, v3.4h, v12.4h\n"
+        "smlal2 v25.4s, v3.8h, v12.8h\n"
+        "smlal v24.4s, v4.4h, v13.4h\n"
+        "smlal2 v25.4s, v4.8h, v13.8h\n"
+        "smlal v24.4s, v5.4h, v14.4h\n"
+        "smlal2 v25.4s, v5.8h, v14.8h\n"
+        "smlal v24.4s, v6.4h, v15.4h\n"
+        "smlal2 v25.4s, v6.8h, v15.8h\n"
+        "smlal v24.4s, v7.4h, v16.4h\n"
+        "smlal2 v25.4s, v7.8h, v16.8h\n"
+        "smlal v24.4s, v8.4h, v17.4h\n"
+        "smlal2 v25.4s, v8.8h, v17.8h\n"
+
+        "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+        "sqrshl v24.4s, v24.4s, v26.4s\n"
+        "sqrshl v25.4s, v25.4s, v26.4s\n"
+        "sqxtn v24.4h, v24.4s\n"
+        "sqxtn2 v24.8h, v25.4s\n"
+        "sqadd v24.8h, v24.8h, v29.8h\n"
+        "sqxtun v24.8b, v24.8h\n"
+        "umax v24.8b, v24.8b, v30.8b\n"
+        "umin v24.8b, v24.8b, v31.8b\n"
+        "st1 {v24.8b}, [x6]\n"
+
+        DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
+    :
+    // Outputs.
+    [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+    [output_ptr] "+r"(output_ptr),
+    [output_window_height] "+r"(output_window_height)
+    :
+    // Inputs.
+    [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
+    [input_depth] "r"(input_depth),
+    [output_window_width] "r"(output_window_width),
+    [input_width_increment] "r"(input_width_increment),
+    [input_height_increment] "r"(input_height_increment),
+    [output_height_increment] "r"(output_height_increment),
+    [params_ptr] "r"(params_ptr)
+    :
+    // Clobbers.
+    "cc", "memory",
+    // We use these NEON registers.
+    "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+    "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+    "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+    "v30", "v31",
+    // We use these general-purpose registers.
+    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+    "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+    "x19", "x20");
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
+  }
+};
+
+template <>  // kUpward-rounding kernel for a 1x1 input window (EdgeType::kCenter).
+struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
+                                      EdgeType::kCenter, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Depthwise convolution of a 1x1 input window (padding of 1) across the
+        // full depth, 8 channels per step. Expects |input_ptr| and |filter_ptr|
+        // to be pointing to the 1x1 input and filter values.
+        "ld1 {v8.8b}, [%[input_ptr]], #8\n"  // v8 = first 8 input bytes.
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"  // x11 = depth left.
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w9\n"  // v26 = input offset.
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w10\n"  // v27 = output multiplier.
+        "ld1 {v0.8b}, [%[filter_ptr]], #8\n"  // v0 = first 8 filter bytes.
+        "cmp x11, #16\n"  // Flags are consumed by the blt below.
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "dup v28.8h, w9\n"  // v28 = output offset.
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v29.4s, w10\n"  // v29 = shift operand for sqrshl.
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.16b, w9\n"  // v30 = activation min.
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.16b, w10\n"  // v31 = activation max.
+        "dup v25.8h, w9\n"  // v25 = filter offset.
+        // v16/v17 start as the bias; v8/v0 are widened to 16 bits plus offset.
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        // NOTE(review): presumably depth is a multiple of 8 and >= 8 -- confirm.
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"  // Depth < 16: only the tail runs.
+
+        // Main loop: every iteration starts with at least 16 channels left.
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "smlal v16.4s, v0.4h, v8.4h\n"  // Accumulate filter * input (lanes 0-3).
+          "subs x11, x11, #8\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"  // Accumulate filter * input (lanes 4-7).
+          "ld1 {v8.8b}, [%[input_ptr]], #8\n"  // Load the next 8 inputs early.
+          "cmp x11, #16\n"  // Flags are consumed by the bge at the loop end.
+          "ld1 {v0.8b}, [%[filter_ptr]], #8\n"  // Load the next 8 filter values.
+          // Scale by multiplier and shift, narrow, add output offset, clamp.
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "sqrshl v16.4s, v16.4s, v29.4s\n"
+          "sqrshl v17.4s, v17.4s, v29.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqadd v16.8h, v16.8h, v28.8h\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "umax v16.8b, v16.8b, v30.8b\n"
+          "umin v16.8b, v16.8b, v31.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"  // Store 8 uint8 outputs.
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"  // Bias for the next 8 channels.
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"  // Repeat while >= 16 remain.
+
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"  // Tail: final 8 channels, operands already loaded.
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "sqrshl v16.4s, v16.4s, v29.4s\n"
+        "sqrshl v17.4s, v17.4s, v29.4s\n"
+
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqadd v16.8h, v16.8h, v28.8h\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "umax v16.8b, v16.8b, v30.8b\n"
+        "umin v16.8b, v16.8b, v31.8b\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"  // Last store: no post-increment.
+        :
+        // Outputs (read-write: the asm advances these local pointer copies).
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs (read only; source of every quantization constant above).
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers: condition flags and memory (the asm stores via output_ptr).
+        "cc", "memory",
+        // NEON registers (v18 and v19 are listed but not referenced above).
+        "v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28",
+        "v29", "v30", "v31",
+        // General-purpose scratch: w9/w10 for constants, x11 as depth counter.
+        "x9", "x10", "x11");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+template <>  // kUpward-rounding kernel for a 2x2 input window (EdgeType::kCorner).
+struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
+                                      EdgeType::kCorner, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 2x2 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the beginning of the 2x2 input and
+        // filter values.
+        // Channels are handled 8 per step; the four taps are summed per channel.
+        // Load input and filter values.
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"  // x15 = depth left.
+        "ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"  // x9 = input row size.
+        "cmp x15, #16\n"  // Flags are consumed by the blt before the loop.
+        "add x12, %[input_ptr], x15\n"  // x12 = input_ptr + depth.
+        "add x13, %[input_ptr], x9\n"  // x13 = input_ptr + row size.
+        "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+        "add x14, x13, x15\n"  // x14 = input_ptr + row size + depth.
+        "ld1 {v9.8b}, [x12], #8\n"
+        "ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"  // x6 = filter row size.
+        // NOTE(review): x15 is also the pixel stride, i.e. input depth == output depth is assumed -- confirm.
+        "add x9, %[filter_ptr], x15\n"  // x9 (reused) = filter_ptr + depth.
+        "ld1 {v10.8b}, [x13], #8\n"
+        "add x10, %[filter_ptr], x6\n"  // x10 = filter_ptr + filter row size.
+        "ld1 {v11.8b}, [x14], #8\n"
+        "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+        "add x11, x10, x15\n"  // x11 = filter_ptr + filter row size + depth.
+        "ld1 {v1.8b}, [x9], #8\n"
+        "ld1 {v2.8b}, [x10], #8\n"
+        "ld1 {v3.8b}, [x11], #8\n"
+
+        // Load constants (w6 and w7 alternate as scratch registers).
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w6\n"  // v26 = input offset.
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w7\n"  // v27 = output multiplier.
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "dup v28.8h, w6\n"  // v28 = output offset.
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v29.4s, w7\n"  // v29 = shift operand for sqrshl.
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.16b, w6\n"  // v30 = activation min.
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.16b, w7\n"  // v31 = activation max.
+        "dup v25.8h, w6\n"  // v25 = filter offset.
+
+        // Add input and filter offsets (widening to 16 bits); preload the bias.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"  // v16/v17 = accumulators, seeded with bias.
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"  // Depth < 16: only the tail runs.
+
+        // Main loop: accumulate the 4 taps while loading the next 8 channels.
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "subs x15, x15, #8\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+          "cmp x15, #16\n"  // Flags are consumed by the bge at the loop end.
+          "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "ld1 {v9.8b}, [x12], #8\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "ld1 {v1.8b}, [x9], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x13], #8\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v2.8b}, [x10], #8\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x14], #8\n"
+          "ld1 {v3.8b}, [x11], #8\n"
+          // Scale by multiplier and shift, narrow, add output offset, clamp.
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "sqrshl v16.4s, v16.4s, v29.4s\n"
+          "sqrshl v17.4s, v17.4s, v29.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqadd v16.8h, v16.8h, v28.8h\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "umax v16.8b, v16.8b, v30.8b\n"
+          "umin v16.8b, v16.8b, v31.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"  // Store 8 uint8 outputs.
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"  // Bias for the next 8 channels.
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"  // Repeat while >= 16 remain.
+
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"  // Tail: final 8 channels, operands already loaded.
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "sqrshl v16.4s, v16.4s, v29.4s\n"
+        "sqrshl v17.4s, v17.4s, v29.4s\n"
+
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqadd v16.8h, v16.8h, v28.8h\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "umax v16.8b, v16.8b, v30.8b\n"
+        "umin v16.8b, v16.8b, v31.8b\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"  // Last store: no post-increment.
+        :
+        // Outputs (read-write: the asm advances these local pointer copies).
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs (read only; source of the sizes and quantization constants).
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers: condition flags and memory (the asm stores via output_ptr).
+        "cc", "memory",
+        // NEON registers (v18 and v19 are listed but not referenced above).
+        "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
+        "v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+        // General-purpose scratch: constants, tap pointers and the depth counter.
+        "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+template <>
+struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
+                                      EdgeType::kHorizontal, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 2x3 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the beginning of the 2x3 input and
+        // filter values.
+
+        // Load input and filter values.
+        "ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
+        "mov x12, %[input_ptr]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
+        "mov x9, %[filter_ptr]\n"
+        "ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
+        "add x13, x12, x11\n"
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+
+        "ld1 {v8.8b}, [x12], x7\n"
+        "add x10, x9, x14\n"
+        "ld1 {v9.8b}, [x12], x7\n"
+        "cmp x15, #16\n"
+        "ld1 {v10.8b}, [x12]\n"
+        "add %[input_ptr], %[input_ptr], #8\n"
+        "ld1 {v11.8b}, [x13], x7\n"
+        "add %[filter_ptr], %[filter_ptr], #8\n"
+        "ld1 {v12.8b}, [x13], x7\n"
+        "ld1 {v13.8b}, [x13]\n"
+
+        "ld1 {v0.8b}, [x9], x7\n"
+        "ld1 {v1.8b}, [x9], x7\n"
+        "ld1 {v2.8b}, [x9]\n"
+        "ld1 {v3.8b}, [x10], x7\n"
+        "ld1 {v4.8b}, [x10], x7\n"
+        "ld1 {v5.8b}, [x10]\n"
+
+        // Load constants.
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "dup v28.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v29.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.8b, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.8b, w13\n"
+        "dup v25.8h, w12\n"
+
+        // Add input and filter offsets.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+        "uaddw v12.8h, v26.8h, v12.8b\n"
+        "uaddw v13.8h, v26.8h, v13.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+        "uaddw v4.8h, v25.8h, v4.8b\n"
+        "uaddw v5.8h, v25.8h, v5.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "mov x12, %[input_ptr]\n"
+          "subs x15, x15, #8\n"
+          "add x13, x12, x11\n"
+          "cmp x15, #16\n"
+          "add %[input_ptr], %[input_ptr], #8\n"
+
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "mov x9, %[filter_ptr]\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [x12], x7\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "add x10, x9, x14\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "ld1 {v9.8b}, [x12], x7\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "add %[filter_ptr], %[filter_ptr], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x12]\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v0.8b}, [x9], x7\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x13], x7\n"
+          "smlal v16.4s, v4.4h, v12.4h\n"
+          "ld1 {v1.8b}, [x9], x7\n"
+          "smlal2 v17.4s, v4.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x13], x7\n"
+          "smlal v16.4s, v5.4h, v13.4h\n"
+          "ld1 {v2.8b}, [x9]\n"
+          "smlal2 v17.4s, v5.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x13]\n"
+
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "ld1 {v3.8b}, [x10], x7\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "ld1 {v4.8b}, [x10], x7\n"
+          "sqrshl v16.4s, v16.4s, v29.4s\n"
+          "ld1 {v5.8b}, [x10]\n"
+          "sqrshl v17.4s, v17.4s, v29.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqadd v16.8h, v16.8h, v28.8h\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "umax v16.8b, v16.8b, v30.8b\n"
+          "umin v16.8b, v16.8b, v31.8b\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v4.8h, v25.8h, v4.8b\n"
+          "uaddw v5.8h, v25.8h, v5.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+        "smlal v16.4s, v4.4h, v12.4h\n"
+        "smlal2 v17.4s, v4.8h, v12.8h\n"
+        "smlal v16.4s, v5.4h, v13.4h\n"
+        "smlal2 v17.4s, v5.8h, v13.8h\n"
+
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "sqrshl v16.4s, v16.4s, v29.4s\n"
+        "sqrshl v17.4s, v17.4s, v29.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqadd v16.8h, v16.8h, v28.8h\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "umax v16.8b, v16.8b, v30.8b\n"
+        "umin v16.8b, v16.8b, v31.8b\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
+        "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
+        "v30", "v31",
+        // We use these general-purpose registers.
+        "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+template <>
+struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
+                                      EdgeType::kVertical, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Computes one output pixel on a vertical edge (padding of 1): a 3-row
+        // by 2-column input window times the matching 3x2 filter taps, 8
+        // channels per iteration over the full depth. |input_ptr| and
+        // |filter_ptr| must point at the first of those 3x2 values.
+
+        // Load input (v8-v13) and filter (v0-v5) for the first 8 channels.
+        "ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
+        "mov x12, %[input_ptr]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
+        "mov x7, %[filter_ptr]\n"
+        "ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
+        "add x13, x12, x11\n"
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "add x14, x13, x11\n"
+
+        "ld1 {v8.8b}, [x12], x6\n"
+        "add x9, x7, x5\n"
+        "ld1 {v9.8b}, [x12]\n"
+        "cmp x15, #16\n"  // Flags are consumed by the blt below.
+        "add x10, x9, x5\n"
+        "ld1 {v10.8b}, [x13], x6\n"
+        "add %[input_ptr], %[input_ptr], #8\n"
+        "ld1 {v11.8b}, [x13]\n"
+        "add %[filter_ptr], %[filter_ptr], #8\n"
+        "ld1 {v12.8b}, [x14], x6\n"
+        "ld1 {v13.8b}, [x14]\n"
+
+        "ld1 {v0.8b}, [x7], x6\n"
+        "ld1 {v1.8b}, [x7]\n"
+        "ld1 {v2.8b}, [x9], x6\n"
+        "ld1 {v3.8b}, [x9]\n"
+        "ld1 {v4.8b}, [x10], x6\n"
+        "ld1 {v5.8b}, [x10]\n"
+
+        // Load constants into v25-v31 (offsets, multiplier, shift, clamp).
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+        "dup v28.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "dup v29.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.8b, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.8b, w13\n"
+        "dup v25.8h, w12\n"
+
+        // Widen to 16 bits, adding input (v26) and filter (v25) offsets.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+        "uaddw v12.8h, v26.8h, v12.8b\n"
+        "uaddw v13.8h, v26.8h, v13.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+        "uaddw v4.8h, v25.8h, v4.8b\n"
+        "uaddw v5.8h, v25.8h, v5.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+
+        // Main loop: 8 channels per pass; next pass's loads are interleaved.
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "mov x12, %[input_ptr]\n"
+          "subs x15, x15, #8\n"
+          "add x13, x12, x11\n"
+          "cmp x15, #16\n"  // Flags are consumed by the bge below.
+          "add x14, x13, x11\n"
+          "add %[input_ptr], %[input_ptr], #8\n"
+
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "mov x7, %[filter_ptr]\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [x12], x6\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "add x9, x7, x5\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "add x10, x9, x5\n"
+          "ld1 {v9.8b}, [x12]\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "add %[filter_ptr], %[filter_ptr], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x13], x6\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v0.8b}, [x7], x6\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x13]\n"
+          "smlal v16.4s, v4.4h, v12.4h\n"
+          "ld1 {v1.8b}, [x7]\n"
+          "smlal2 v17.4s, v4.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x14], x6\n"
+          "smlal v16.4s, v5.4h, v13.4h\n"
+          "ld1 {v2.8b}, [x9], x6\n"
+          "smlal2 v17.4s, v5.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x14]\n"
+
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "ld1 {v3.8b}, [x9]\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "ld1 {v4.8b}, [x10], x6\n"
+          "sqrshl v16.4s, v16.4s, v29.4s\n"
+          "ld1 {v5.8b}, [x10]\n"
+          "sqrshl v17.4s, v17.4s, v29.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqadd v16.8h, v16.8h, v28.8h\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "umax v16.8b, v16.8b, v30.8b\n"
+          "umin v16.8b, v16.8b, v31.8b\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v4.8h, v25.8h, v4.8b\n"
+          "uaddw v5.8h, v25.8h, v5.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+        "smlal v16.4s, v4.4h, v12.4h\n"
+        "smlal2 v17.4s, v4.8h, v12.8h\n"
+        "smlal v16.4s, v5.4h, v13.4h\n"
+        "smlal2 v17.4s, v5.8h, v13.8h\n"
+
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "sqrshl v16.4s, v16.4s, v29.4s\n"
+        "sqrshl v17.4s, v17.4s, v29.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqadd v16.8h, v16.8h, v28.8h\n"
+        "sqxtun v16.8b, v16.8h\n"
+        // TODO(b/129852264): Improve testing coverage.
+        "umax v16.8b, v16.8b, v30.8b\n"
+        "umin v16.8b, v16.8b, v31.8b\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
+        "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
+        "v30", "v31",
+        // We use these general-purpose registers.
+        "x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+#undef OFFSET_INPUT_DEPTH
+#undef OFFSET_INPUT_ROW_SIZE
+#undef OFFSET_OUTPUT_DEPTH
+#undef OFFSET_OUTPUT_ROW_SIZE
+#undef OFFSET_INPUT_OFFSET
+#undef OFFSET_OUTPUT_OFFSET
+#undef OFFSET_FILTER_OFFSET
+#undef OFFSET_OUTPUT_MULTIPLIER
+#undef OFFSET_OUTPUT_ACTIVATION_MIN
+#undef OFFSET_OUTPUT_ACTIVATION_MAX
+#undef OFFSET_OUTPUT_RIGHT_SHIFT
+#undef OFFSET_INPUT_WIDTH
+#undef OFFSET_INPUT_HEIGHT
+#undef OFFSET_OUTPUT_WIDTH
+#undef OFFSET_OUTPUT_HEIGHT
+
+template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
+          int32 kStrideHeight>
+struct DepthwiseConvThroughDepthPerChannel {
+  // Runs the DepthwiseConvWindowPerChannel kernel over [start_depth, end_depth)
+  // in steps of 8 channels; a trailing remainder of fewer than 8 channels is
+  // not processed here. Kept noinline to limit binary size; read-only settings
+  // travel in one DepthwiseConvParams struct to minimize call overhead.
+  static void __attribute__((noinline))
+  Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
+      uint8* output_ptr, int64_t start_depth, int64_t end_depth,
+      int64_t input_depth, int64_t input_row_size, int32 output_window_height,
+      int32 output_window_width, const DepthwiseConvParams& params) {
+    for (; start_depth <= end_depth - 8; start_depth += 8) {
+      DepthwiseConvWindowPerChannel<output_rounding, 8, kStrideWidth,
+                                    kStrideHeight>::Run(input_ptr, filter_ptr,
+                                                        bias_ptr, output_ptr,
+                                                        input_depth,
+                                                        input_row_size,
+                                                        output_window_height,
+                                                        output_window_width,
+                                                        &params);
+      input_ptr += 8;
+      output_ptr += 8;
+      filter_ptr += 8;
+      bias_ptr += 8;
+    }
+  }
+};
+
+template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
+          int32 kStrideHeight>
+struct DepthwiseConvMultiRowPerChannel {
+  using ConvKernel =
+      DepthwiseConvThroughDepthPerChannel<output_rounding, kStrideWidth,
+                                          kStrideHeight>;
+
+  static inline void Run(const uint8* input_data, int32 start_x, int32 end_x,
+                         const uint8* filter_data, const int32* bias_data,
+                         uint8* output_data, const DepthwiseConvParams& params,
+                         const ShuffleParams& shuffle_params,
+                         uint8* shuffle_workspace) {
+    TFLITE_DCHECK(
+        shuffle_params.input_height ==
+        get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
+    TFLITE_DCHECK(
+        shuffle_params.input_width ==
+        get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
+    TFLITE_DCHECK_LE(
+        64 * shuffle_params.input_width * shuffle_params.input_height,
+        kDepthwiseConvScratchWorkspaceSize);
+
+    int32 out_x = start_x;
+
+    // Take the shuffled path when output depth exceeds 64 or input width
+    // exceeds 150. For such inputs more time goes into loading data from
+    // memory, so it pays to prefetch the input and pre-shuffle it into
+    // |shuffle_workspace| to maximize locality.
+    if (params.output_depth > 64 ||
+        (params.output_depth <= 64 && params.input_width > 150)) {
+      for (; out_x <= (end_x - shuffle_params.output_width);
+           out_x += shuffle_params.output_width) {
+        const uint8* input_ptr = input_data;
+        const int32* bias_ptr = bias_data;
+        const uint8* filter_ptr = filter_data;
+        uint8* output_ptr = output_data;
+        int64_t depth = 0;
+        const int64_t shuffle_row_size = 64 * shuffle_params.input_width;
+
+        for (; depth <= params.output_depth - 64; depth += 64) {
+          // Prefetch this depth slice of every input pixel in the window.
+          const uint8* h_ptr = input_ptr;
+          for (int32 i = 0; i < shuffle_params.input_height; i++) {
+            const uint8* ptr = h_ptr;
+            for (int32 j = 0; j < shuffle_params.input_width; j++) {
+              asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+              ptr += params.input_depth;
+            }
+            h_ptr += params.input_row_size;
+          }
+
+          // For a large enough input, shuffle into buckets.
+          ShuffleInput(input_ptr, params.input_depth, params.input_width,
+                       params.input_height, 64, shuffle_params.input_width,
+                       shuffle_params.input_height, shuffle_workspace);
+          ConvKernel::Run(shuffle_workspace, filter_ptr, bias_ptr, output_ptr,
+                          0, 64, 64, shuffle_row_size,
+                          shuffle_params.output_height,
+                          shuffle_params.output_width, params);
+          input_ptr += 64;
+          output_ptr += 64;
+          filter_ptr += 64;
+          bias_ptr += 64;
+        }
+
+        // Prefetch the leftover depth of every input pixel in the window.
+        const uint8* h_ptr = input_ptr;
+        for (int32 i = 0; i < shuffle_params.input_height; i++) {
+          const uint8* ptr = h_ptr;
+          for (int32 j = 0; j < shuffle_params.input_width; j++) {
+            asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+            ptr += params.input_depth;
+          }
+          h_ptr += params.input_row_size;
+        }
+
+        // Handle leftover depth directly from the unshuffled input.
+        ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr, depth,
+                        params.output_depth, params.input_depth,
+                        params.input_row_size, shuffle_params.output_height,
+                        shuffle_params.output_width, params);
+
+        input_data +=
+            shuffle_params.output_width * kStrideWidth * params.input_depth;
+        output_data += shuffle_params.output_width * params.output_depth;
+      }
+    }
+
+    const int32 output_leftover_width = end_x - out_x;
+    if (output_leftover_width > 0) {
+      ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0,
+                      params.output_depth, params.input_depth,
+                      params.input_row_size, shuffle_params.output_height,
+                      output_leftover_width, params);
+    }
+  }
+};
+
+// Computes the border of the output (top row, left and right columns, bottom
+// row) for pad_width and pad_height = 1, using 4 asm kernels:
+//   * 1x1 input shape (kCenter).
+//   * Corner edges (kCorner).
+//   * Horizontal edges (kHorizontal).
+//   * Vertical edges (kVertical).
+template <DepthwiseConvOutputRounding output_rounding>
+inline void DepthwiseConvHandlePaddingPerChannel(
+    const uint8* input_data, const uint8* filter_data, const int32* bias_data,
+    uint8* output_data, const DepthwiseConvParams& params) {
+  if (params.input_width == 1 && params.input_height == 1) {
+    const uint8* filter_ptr =
+        filter_data + params.filter_row_size + params.output_depth;
+    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCenter, 1,
+                                   1>::Run(input_data, filter_ptr, bias_data,
+                                           output_data, &params);
+    return;
+  }
+
+  const int32 out_x_start_corner = 0;
+  const int32 out_x_end_corner = params.output_width - 1;
+  const int32 out_y_start_corner = 0;
+  const int32 out_y_end_corner = params.output_height - 1;
+
+  // Handle top row. Filter row 0 is skipped for every pixel of this row.
+  const uint8* input_ptr = input_data;
+  const uint8* filter_ptr =
+      filter_data + params.filter_row_size + params.output_depth;
+  uint8* output_ptr = output_data;
+
+  DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
+
+  input_ptr += (params.stride_width - 1) * params.input_depth;
+  filter_ptr = filter_data + params.filter_row_size;
+  output_ptr += params.output_depth;
+
+  for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
+       out_x++) {
+    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1,
+                                   1>::Run(input_ptr, filter_ptr, bias_data,
+                                           output_ptr, &params);
+    input_ptr += params.stride_width * params.input_depth;
+    output_ptr += params.output_depth;
+  }
+
+  DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
+
+  // Handle left side. Filter column 0 is skipped.
+  input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
+  filter_ptr = filter_data + params.input_depth;
+  output_ptr = output_data + params.output_row_size;
+
+  for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
+       out_y++) {
+    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1,
+                                   1>::Run(input_ptr, filter_ptr, bias_data,
+                                           output_ptr, &params);
+    input_ptr += params.stride_width * params.input_row_size;
+    output_ptr += params.output_row_size;
+  }
+
+  // Handle right side. Input starts at the second-to-last column.
+  input_ptr = input_data + (params.input_width - 2) * params.input_depth +
+              (params.stride_width - 1) * params.input_row_size;
+  filter_ptr = filter_data;
+  output_ptr = output_data + params.output_row_size +
+               (params.output_width - 1) * params.output_depth;
+
+  for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
+       out_y++) {
+    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1,
+                                   1>::Run(input_ptr, filter_ptr, bias_data,
+                                           output_ptr, &params);
+    input_ptr += params.stride_width * params.input_row_size;
+    output_ptr += params.output_row_size;
+  }
+
+  // Handle bottom row. Input starts at the second-to-last row.
+  input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
+  filter_ptr = filter_data + params.output_depth;
+  output_ptr =
+      output_data + (params.output_height - 1) * params.output_row_size;
+
+  DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
+
+  input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
+  filter_ptr = filter_data;
+  output_ptr += params.output_depth;
+
+  for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
+       out_x++) {
+    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1,
+                                   1>::Run(input_ptr, filter_ptr, bias_data,
+                                           output_ptr, &params);
+    input_ptr += params.stride_width * params.input_depth;
+    output_ptr += params.output_depth;
+  }
+
+  DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+inline void DepthwiseConv3x3FilterPerChannel(
+    const DepthwiseParams& rt_params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
+  DepthwiseConvParams params;
+
+  const int32 stride_width = rt_params.stride_width;
+  const int32 stride_height = rt_params.stride_height;
+  const int32 pad_width = rt_params.padding_values.width;
+  const int32 pad_height = rt_params.padding_values.height;
+  const int32 depth_multiplier = rt_params.depth_multiplier;
+  const int32 output_activation_min = rt_params.quantized_activation_min;
+  const int32 output_activation_max = rt_params.quantized_activation_max;
+  const int32 input_offset = rt_params.input_offset;
+  const int32 filter_offset = rt_params.weights_offset;
+  const int32 output_offset = rt_params.output_offset;
+  const int32 output_multiplier = rt_params.output_multiplier;
+  const int32 output_shift = rt_params.output_shift;
+
+  params.input_depth = input_shape.Dims(3);
+  params.input_width = input_shape.Dims(2);
+  params.input_height = input_shape.Dims(1);
+  params.input_row_size = params.input_depth * params.input_width;
+  params.input_offset = input_offset;
+  params.stride_width = stride_width;
+  params.stride_height = stride_height;
+  params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  params.output_width = output_shape.Dims(2);
+  params.output_height = output_shape.Dims(1);
+  params.output_row_size = params.output_depth * params.output_width;
+  params.output_offset = output_offset;
+  params.filter_offset = filter_offset;
+  params.output_multiplier = output_multiplier;
+  params.output_right_shift = output_shift;
+  params.output_activation_min = output_activation_min;
+  params.output_activation_max = output_activation_max;
+
+  const int32 filter_height = filter_shape.Dims(1);
+  const int32 filter_width = filter_shape.Dims(2);
+  params.filter_row_size = params.output_depth * filter_width;
+
+  // Algorithm assumes below constraints. It is optimized for depth multiplier
+  // of 1, 3x3 filter, padding of 0 or 1 and equal strides of 1 or 2.
+  TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier);
+  TFLITE_DCHECK(depth_multiplier == 1);
+  TFLITE_DCHECK(filter_height == 3);
+  TFLITE_DCHECK(filter_width == 3);
+  TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
+  TFLITE_DCHECK(stride_width == stride_height);
+  TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
+  TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
+  TFLITE_DCHECK(pad_width == pad_height);
+  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
+
+  const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int64_t input_batch_size = params.input_row_size * params.input_height;
+  const int64_t output_batch_size =
+      params.output_row_size * params.output_height;
+
+  ShuffleParams one_row_shuffle_params, two_row_shuffle_params,
+      four_row_shuffle_params, eight_row_shuffle_params;
+  if (stride_width == 1) {
+    one_row_shuffle_params = ShuffleParams(30, 1, 1, 1);
+    two_row_shuffle_params = ShuffleParams(22, 2, 1, 1);
+    four_row_shuffle_params = ShuffleParams(14, 4, 1, 1);
+    eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1);
+  } else {
+    one_row_shuffle_params = ShuffleParams(14, 1, 2, 2);
+    two_row_shuffle_params = ShuffleParams(8, 2, 2, 2);
+    four_row_shuffle_params = ShuffleParams(4, 4, 2, 2);
+    eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2);
+  }
+
+  using conv_multirow_func_t =
+      decltype(&DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run);
+  conv_multirow_func_t conv_multirow_func =
+      DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run;
+  if (stride_width == 2) {
+    conv_multirow_func =
+        DepthwiseConvMultiRowPerChannel<output_rounding, 2, 2>::Run;
+  }
+
+  // Allocate maximum memory needed for shuffled input.
+  // TODO(mariewhite): The size of this workspace is small enough to be
+  // allocated on the stack. Eventually we will want to move it to the heap
+  // and have it allocated outside of this function, like the im2col_array
+  // used in gemmlowp.
+  uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
+
+  int batch_start = 0;
+  int batch_end = batches;
+  int row_start = 0;
+  int row_end = params.output_height;
+
+  switch (thread_dim) {  // 0: split by batch; 1: split by output row.
+    case 0:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, batches);
+      batch_start = thread_start;
+      batch_end = thread_end;
+      break;
+    case 1:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, params.output_height);
+      row_start = thread_start;
+      row_end = thread_end;
+      break;
+  }
+
+  for (int32 b = batch_start; b < batch_end; ++b) {
+    // input_ptr and output_ptr point to the start of each batch
+    const uint8* input_ptr = input_data + b * input_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size;
+
+    int32 out_x = 0;
+    int32 out_y = row_start;
+    int32 end_x = params.output_width;
+    int32 end_y = row_end;
+
+    if (pad_width == 1 && pad_height == 1) {
+      DepthwiseConvHandlePaddingPerChannel<output_rounding>(
+          input_ptr, filter_data, bias_data, output_ptr, params);
+
+      // Update extents now that the full border has been handled above.
+      out_x = 1;
+      end_x = params.output_width - 1;
+      out_y = std::max(1, out_y);
+      end_y = std::min(params.output_height - 1, end_y);
+    }
+
+    // pad_width and pad_height can both be 0 or 1, depending on padding option,
+    // such as Padding_VALID / Padding_SAME.
+    const int in_x = (out_x * stride_width) - pad_width;
+    const int in_y = (out_y * stride_height) - pad_height;
+
+    // input_ptr and output_ptr point to (in_y, in_x) and (out_y, out_x),
+    // respectively. (in_y, in_x) and (out_y, out_x) change along with
+    // row_start.
+    input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
+    output_ptr += out_y * params.output_row_size + out_x * params.output_depth;
+
+    // Shuffling shapes that maximize width over the shuffle workspace size
+    // perform better since the inputs are closer together, minimizing
+    // shuffling time.
+    //
+    // If the input shape has width large enough for the 2 row kernels,
+    // we prefer to use this. The innermost loop of the kernels handle
+    // 2 height x 2 width so this is the fastest path.
+    //
+    // If the input shape has smaller width but larger height, shuffling is
+    // still useful and can benefit from the 4 row and 8 row kernels.
+
+    // Handle 8 rows at a time.
+    if (params.input_width < four_row_shuffle_params.input_width) {
+      for (; out_y <= end_y - 8; out_y += 8) {
+        conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
+                           output_ptr, params, eight_row_shuffle_params,
+                           shuffle_workspace);
+        input_ptr += 8 * stride_height * params.input_row_size;
+        output_ptr += 8 * params.output_row_size;
+      }
+    }
+
+    // Handle 4 rows at a time.
+    if (params.input_width < two_row_shuffle_params.input_width) {
+      for (; out_y <= end_y - 4; out_y += 4) {
+        conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
+                           output_ptr, params, four_row_shuffle_params,
+                           shuffle_workspace);
+        input_ptr += 4 * stride_height * params.input_row_size;
+        output_ptr += 4 * params.output_row_size;
+      }
+    }
+
+    // Handle 2 rows at a time.
+    for (; out_y <= end_y - 2; out_y += 2) {
+      conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
+                         output_ptr, params, two_row_shuffle_params,
+                         shuffle_workspace);
+      input_ptr += 2 * stride_height * params.input_row_size;
+      output_ptr += 2 * params.output_row_size;
+    }
+
+    // Handle one row at a time.
+    for (; out_y < end_y; out_y++) {
+      conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
+                         output_ptr, params, one_row_shuffle_params,
+                         shuffle_workspace);
+      input_ptr += stride_height * params.input_row_size;
+      output_ptr += params.output_row_size;
+    }
+  }
+}
+#endif  // __aarch64__
+
+#endif
+
+#undef STR
+#undef STR_UNEXPANDED
+
+}  // namespace depthwise_conv
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h
index 3b01e00..e45f250 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h
@@ -15,29 +15,16 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 
 namespace tflite {
 namespace optimized_integer_ops {
 
-inline void optimized_ops_preload_l1_stream(const int8_t* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
-}
-
-inline void optimized_ops_preload_l1_keep(const int8_t* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
-}
-
 #ifdef USE_NEON
 inline void FullyConnectedAsGEMVWorkerImpl(
     const RuntimeShape& input_shape, const int8_t* input_data,
@@ -256,7 +243,7 @@
   }
 }
 
-struct FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+struct FullyConnectedAsGEMVWorkerTask : public cpu_backend_threadpool::Task {
   FullyConnectedAsGEMVWorkerTask(
       const RuntimeShape& input_shape, const int8_t* input_data,
       int32 input_offset, const RuntimeShape& filter_shape,
@@ -319,14 +306,14 @@
     const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
     int32 output_multiplier, int output_shift, int32 output_activation_min,
     int32 output_activation_max, const RuntimeShape& output_shape,
-    int8_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    int8_t* output_data, CpuBackendContext* cpu_backend_context) {
   const int output_dim_count = output_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
   const int output_rows = output_shape.Dims(output_dim_count - 1);
   const int input_size = FlatSizeSkipDim(input_shape, 0);
   static constexpr int kKernelRows = 4;
-  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
-      gemmlowp_context->max_num_threads(), output_rows, batches, input_size);
+  const int thread_count = LegacyHowManyThreads<kKernelRows>(
+      cpu_backend_context->max_num_threads(), output_rows, batches, input_size);
   if (thread_count == 1) {
     // Single-thread case: do the computation on the current thread, don't
     // use a threadpool
@@ -340,61 +327,36 @@
 
   // Multi-threaded case: use the gemmlowp context's threadpool.
   TFLITE_DCHECK_GT(thread_count, 1);
-  std::vector<gemmlowp::Task*> tasks(thread_count);
-  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
-      gemmlowp::CeilQuotient(output_rows, thread_count));
+  std::vector<FullyConnectedAsGEMVWorkerTask> tasks;
+  // TODO(b/131746020) don't create new heap allocations every time.
+  // At least we make it a single heap allocation by using reserve().
+  tasks.reserve(thread_count);
+  const int kRowsPerWorker =
+      RoundUp<kKernelRows>(CeilQuotient(output_rows, thread_count));
   int row_start = 0;
   for (int i = 0; i < thread_count; ++i) {
     int row_end = std::min(output_rows, row_start + kRowsPerWorker);
-    tasks[i] = new FullyConnectedAsGEMVWorkerTask(
-        input_shape, input_data, input_offset, filter_shape, filter_data,
-        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max,
-        output_shape, output_data, row_start, row_end);
+    tasks.emplace_back(input_shape, input_data, input_offset, filter_shape,
+                       filter_data, filter_offset, bias_shape, bias_data,
+                       output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_shape, output_data, row_start, row_end);
     row_start = row_end;
   }
   TFLITE_DCHECK_EQ(row_start, output_rows);
-  gemmlowp_context->workers_pool()->Execute(tasks);
+  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                  cpu_backend_context);
 }
 #endif  // USE_NEON
 
-struct GemmlowpOutputPipeline {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
-                     gemmlowp::OutputStageClamp,
-                     gemmlowp::OutputStageSaturatingCastToInt8>
-      Pipeline;
-  static Pipeline MakeExp(const int32* bias_data, int output_rows,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_left_shift, int32 output_activation_min,
-                          int32 output_activation_max) {
-    ColVectorMap bias_vector(bias_data, output_rows);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_offset;
-    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
-    quantize_down_stage.result_exponent = output_left_shift;
-    gemmlowp::OutputStageClamp clamp_stage;
-    clamp_stage.min = output_activation_min;
-    clamp_stage.max = output_activation_max;
-    gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           clamp_stage, saturating_cast_stage);
-  }
-};
-
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const int8* input_data, const RuntimeShape& filter_shape,
     const int8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
-    gemmlowp::GemmContext* gemmlowp_context) {
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("FullyConnectedInt8/8bit");
 
-#ifdef USE_NEON
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
   const int32 output_offset = params.output_offset;
@@ -412,6 +374,8 @@
   const int output_dim_count = output_shape.DimensionsCount();
   const int filter_dim_count = filter_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+
+#ifdef USE_NEON
   if (batches == 1) {
     const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
                                         output_shape, output_dim_count - 1);
@@ -420,12 +384,12 @@
           input_shape, input_data, input_offset, filter_shape, filter_data,
           filter_offset, bias_shape, bias_data, output_offset,
           output_multiplier, output_shift, output_activation_min,
-          output_activation_max, output_shape, output_data, gemmlowp_context);
+          output_activation_max, output_shape, output_data,
+          cpu_backend_context);
     }
   }
 #endif  // USE_NEON
 
-#ifdef GEMMLOWP_NEON
   const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
   const int filter_cols = filter_shape.Dims(filter_dim_count - 1);
   TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols);
@@ -433,28 +397,30 @@
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
 
-  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, filter_cols, batches, filter_cols);
-  gemmlowp::MatrixMap<int8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, batches, output_rows);
-  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-
-  gemmlowp::GemmWithOutputPipeline<
-      int8, int8, gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams>(
-      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
-      filter_offset, input_offset, output_pipeline);
-  return;
-#endif  // GEMMLOWP_NEON
-
-  // If both GEMMLOWP_NEON && NEON paths are skipped, fallback to reference
-  // implementation.
-  reference_integer_ops::FullyConnected(
-      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
-      bias_data, output_shape, output_data, gemmlowp_context);
+  cpu_backend_gemm::MatrixParams<int8> lhs_params;
+  lhs_params.rows = filter_rows;
+  lhs_params.cols = filter_cols;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.zero_point = -filter_offset;
+  cpu_backend_gemm::MatrixParams<int8> rhs_params;
+  rhs_params.rows = filter_cols;
+  rhs_params.cols = batches;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.zero_point = -input_offset;
+  cpu_backend_gemm::MatrixParams<int8> dst_params;
+  dst_params.rows = filter_rows;
+  dst_params.cols = batches;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.zero_point = output_offset;
+  cpu_backend_gemm::GemmParams<int32, int8> gemm_params;
+  gemm_params.bias = bias_data;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  gemm_params.multiplier_fixedpoint = output_multiplier;
+  gemm_params.multiplier_exponent = output_shift;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 }  // namespace optimized_integer_ops
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h
index a16d39b..ff261a8 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h
@@ -15,7 +15,7 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h
index 5037328..beabd61 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h
@@ -18,6 +18,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include <sys/types.h>
+
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -27,7 +28,7 @@
 #include <type_traits>
 
 #include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h
index 6e6cf2e..dc29fcb 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h
@@ -15,7 +15,8 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_
 
-#include "public/gemmlowp.h"
+#include "fixedpoint/fixedpoint.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
index 639c9ab..7e2dc10 100644
--- a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -18,9 +18,12 @@
 #include <stdint.h>
 #include <sys/types.h>
 
+#include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h"
 #include "tensorflow/lite/kernels/internal/types.h"
@@ -304,6 +307,202 @@
                     output_dims);
 }
 
+template <typename T, typename TS>  // T: data element type; TS: bias type.
+struct LegacyDepthwiseConvWorkerTask : public gemmlowp::Task {
+  LegacyDepthwiseConvWorkerTask(
+      const DepthwiseParams& params, const RuntimeShape& input_shape,
+      const T* input_data, const RuntimeShape& filter_shape,
+      const T* filter_data, const RuntimeShape& bias_shape, const TS* bias_data,
+      const RuntimeShape& output_shape, T* output_data, int thread_start,
+      int thread_end, int thread_dim)
+      : params_(params),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        thread_start_(thread_start),
+        thread_end_(thread_end),
+        thread_dim_(thread_dim) {}
+
+  void Run() override {  // Forwards every captured argument unchanged.
+    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
+                      filter_data_, bias_shape_, bias_data_, output_shape_,
+                      output_data_, thread_start_, thread_end_, thread_dim_);
+  }
+
+ private:  // Params and shapes are references: they must outlive Run().
+  const DepthwiseParams& params_;
+  const RuntimeShape& input_shape_;
+  const T* input_data_;
+  const RuntimeShape& filter_shape_;
+  const T* filter_data_;
+  const RuntimeShape& bias_shape_;
+  const TS* bias_data_;
+  const RuntimeShape& output_shape_;
+  T* output_data_;
+  int thread_start_;  // Passed to DepthwiseConvImpl as its thread_start.
+  int thread_end_;    // Passed to DepthwiseConvImpl as its thread_end.
+  int thread_dim_;    // Output dimension index the start/end range refers to.
+};
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+  // uint8 depthwise conv on a gemmlowp context; shapes are DCHECKed to be 4-D.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  // Shard along output dim 0 or 1, whichever HowManyConvThreads rates higher.
+  const int output_batches = output_shape.Dims(0);
+  const int output_rows = output_shape.Dims(1);
+  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
+  int thread_dim, thread_count, thread_dim_size;
+  if (thread_count_batch > thread_count_row) {
+    thread_dim = 0;
+    thread_dim_size = output_batches;
+    thread_count = thread_count_batch;
+  } else {  // Ties go to dimension 1.
+    thread_dim = 1;
+    thread_dim_size = output_rows;
+    thread_count = thread_count_row;
+  }
+  // A null context caps the thread count at 1, so it is never dereferenced.
+  const int max_threads =
+      gemmlowp_context ? gemmlowp_context->max_num_threads() : 1;
+  thread_count = std::max(1, std::min(thread_count, max_threads));
+  // One thread: call the implementation directly on the calling thread.
+  if (thread_count == 1) {
+    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
+                      filter_data, bias_shape, bias_data, output_shape,
+                      output_data, /*thread_start=*/0,
+                      /*thread_end=*/output_rows, /*thread_dim=*/1);
+  } else {  // Split [0, thread_dim_size) into thread_count contiguous chunks.
+    std::vector<gemmlowp::Task*> tasks(thread_count);
+    int thread_start = 0;
+    for (int i = 0; i < thread_count; ++i) {
+      int thread_end =
+          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+      tasks[i] = new LegacyDepthwiseConvWorkerTask<uint8, int32>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data, thread_start,
+          thread_end, thread_dim);
+      thread_start = thread_end;
+    }  // NOTE(review): no delete here; presumably the call below frees tasks.
+    gemmlowp_context->workers_pool()->LegacyExecuteAndDestroyTasks(tasks);
+  }
+}
+
+template <typename T, typename TS>  // T: data element type; TS: bias type.
+struct LegacyPerChannelDepthwiseConvWorkerTask : public gemmlowp::Task {
+  LegacyPerChannelDepthwiseConvWorkerTask(
+      const DepthwiseParams& params, const int32* output_multiplier,
+      const int32* output_shift, const RuntimeShape& input_shape,
+      const T* input_data, const RuntimeShape& filter_shape,
+      const T* filter_data, const RuntimeShape& bias_shape, const TS* bias_data,
+      const RuntimeShape& output_shape, T* output_data, int thread_start,
+      int thread_end, int thread_dim)
+      : params_(params),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        thread_start_(thread_start),
+        thread_end_(thread_end),
+        thread_dim_(thread_dim) {}
+
+  void Run() override {  // Forwards every captured argument unchanged.
+    optimized_integer_ops::DepthwiseConvImpl(
+        params_, output_multiplier_, output_shift_, input_shape_, input_data_,
+        filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_,
+        output_data_, thread_start_, thread_end_, thread_dim_);
+  }
+
+ private:  // Params and shapes are references: they must outlive Run().
+  const DepthwiseParams& params_;
+  const int32* output_multiplier_;  // Array pointer; not owned by the task.
+  const int32* output_shift_;       // Array pointer; not owned by the task.
+  const RuntimeShape& input_shape_;
+  const T* input_data_;
+  const RuntimeShape& filter_shape_;
+  const T* filter_data_;
+  const RuntimeShape& bias_shape_;
+  const TS* bias_data_;
+  const RuntimeShape& output_shape_;
+  T* output_data_;
+  int thread_start_;  // Passed to DepthwiseConvImpl as its thread_start.
+  int thread_end_;    // Passed to DepthwiseConvImpl as its thread_end.
+  int thread_dim_;    // Output dimension index the start/end range refers to.
+};
+
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
+    gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvInt8");
+  // int8 depthwise conv on a gemmlowp context; shapes are DCHECKed to be 4-D.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  // Shard along output dim 0 or 1, whichever HowManyConvThreads rates higher.
+  const int output_batches = output_shape.Dims(0);
+  const int output_rows = output_shape.Dims(1);
+  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
+  int thread_dim, thread_count, thread_dim_size;
+  if (thread_count_batch > thread_count_row) {
+    thread_dim = 0;
+    thread_dim_size = output_batches;
+    thread_count = thread_count_batch;
+  } else {  // Ties go to dimension 1.
+    thread_dim = 1;
+    thread_dim_size = output_rows;
+    thread_count = thread_count_row;
+  }
+  // A null context caps the thread count at 1, so it is never dereferenced.
+  const int max_threads =
+      gemmlowp_context ? gemmlowp_context->max_num_threads() : 1;
+  thread_count = std::max(1, std::min(thread_count, max_threads));
+  // One thread: call the implementation directly on the calling thread.
+  if (thread_count == 1) {
+    optimized_integer_ops::DepthwiseConvImpl(
+        params, output_multiplier, output_shift, input_shape, input_data,
+        filter_shape, filter_data, bias_shape, bias_data, output_shape,
+        output_data, /*thread_start=*/0,
+        /*thread_end=*/output_rows, /*thread_dim=*/1);
+  } else {  // Split [0, thread_dim_size) into thread_count contiguous chunks.
+    std::vector<gemmlowp::Task*> tasks(thread_count);
+    int thread_start = 0;
+    for (int i = 0; i < thread_count; ++i) {
+      int thread_end =
+          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+      tasks[i] = new LegacyPerChannelDepthwiseConvWorkerTask<int8, int32>(
+          params, output_multiplier, output_shift, input_shape, input_data,
+          filter_shape, filter_data, bias_shape, bias_data, output_shape,
+          output_data, thread_start, thread_end, thread_dim);
+      thread_start = thread_end;
+    }  // NOTE(review): no delete here; presumably the call below frees tasks.
+    gemmlowp_context->workers_pool()->LegacyExecuteAndDestroyTasks(tasks);
+  }
+}
+
 inline void AddBiasAndEvalActivationFunction(const float* bias_data,
                                              const Dims<4>& bias_dims,
                                              float* array_data,
@@ -328,6 +527,60 @@
                                    output_activation_max);
 }
 
+template <typename Lhs, typename Rhs, typename Result>  // Eigen expressions.
+void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
+          Eigen::MatrixBase<Result>* result) {  // Writes lhs * rhs to *result.
+  if (rhs.cols() == 1) {  // Single column: matrix-vector product.
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    result->col(0).noalias() = lhs * rhs.col(0);
+  } else {  // General matrix-matrix product.
+    gemmlowp::ScopedProfilingLabel label("GEMM");
+    result->noalias() = lhs * rhs;  // noalias(): result must not alias inputs.
+  }
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& weights_shape,
+    const float* weights_data, const RuntimeShape& bias_shape,
+    const float* optional_bias_data, const RuntimeShape& output_shape,
+    float* output_data) {  // Float path; optional_bias_data may be nullptr.
+  gemmlowp::ScopedProfilingLabel label("FullyConnected");
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+
+  // TODO(b/62193649): this convoluted shape computation (determining
+  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
+  // is because the current --variable_batch hack consists in overwriting the
+  // 3rd dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  // When that is fixed, this should become:
+  // const auto input_matrix_map =
+  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const int dims_count = weights_shape.DimensionsCount();
+  const int input_rows = weights_shape.Dims(dims_count - 1);
+  const auto input_matrix_map =
+      MapAsMatrixWithGivenNumberOfRows(input_data, input_shape, input_rows);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsRows(weights_data, weights_shape);
+  auto output_matrix_map =
+      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // output = transpose(weights) * input.
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+  // In place: bias helper when a bias is given, else only the min/max step.
+  if (optional_bias_data != nullptr) {
+    AddBiasAndEvalActivationFunction(
+        output_activation_min, output_activation_max, bias_shape,
+        optional_bias_data, output_shape, output_data);
+  } else {
+    const int flat_size = output_shape.FlatSize();
+    for (int i = 0; i < flat_size; ++i) {
+      output_data[i] = ActivationFunctionWithMinMax(
+          output_data[i], output_activation_min, output_activation_max);
+    }
+  }
+}
+
 inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                            const float* weights_data,
                            const Dims<4>& weights_dims, const float* bias_data,
@@ -358,6 +611,311 @@
                  output_data, output_dims);
 }
 
+struct GemmlowpOutputPipeline {  // gemmlowp output stages, uint8 result.
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
+                     gemmlowp::OutputStageClamp,
+                     gemmlowp::OutputStageSaturatingCastToUint8>
+      Pipeline;  // Stage order: bias, rescale, clamp, saturating cast.
+  static Pipeline MakeExp(const int32* bias_data, int output_rows,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_left_shift, int32 output_activation_min,
+                          int32 output_activation_max) {
+    ColVectorMap bias_vector(bias_data, output_rows);  // One bias per row.
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_offset;
+    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+    quantize_down_stage.result_exponent = output_left_shift;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = output_activation_min;
+    clamp_stage.max = output_activation_max;
+    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           clamp_stage, saturating_cast_stage);
+  }
+};
+
+struct GemmlowpOutputPipelineInt8 {  // gemmlowp output stages, int8 result.
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
+                     gemmlowp::OutputStageClamp,
+                     gemmlowp::OutputStageSaturatingCastToInt8>
+      Pipeline;  // Stage order: bias, rescale, clamp, saturating cast.
+  static Pipeline MakeExp(const int32* bias_data, int output_rows,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_left_shift, int32 output_activation_min,
+                          int32 output_activation_max) {
+    ColVectorMap bias_vector(bias_data, output_rows);  // One bias per row.
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_offset;
+    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+    quantize_down_stage.result_exponent = output_left_shift;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = output_activation_min;
+    clamp_stage.max = output_activation_max;
+    gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           clamp_stage, saturating_cast_stage);
+  }
+};
+
+#ifdef USE_NEON
+struct LegacyFullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+  LegacyFullyConnectedAsGEMVWorkerTask(
+      const RuntimeShape& input_shape, const uint8* input_data,
+      int32 input_offset, const RuntimeShape& filter_shape,
+      const uint8* filter_data, int32 filter_offset,
+      const RuntimeShape& bias_shape, const int32* bias_data,
+      int32 output_offset, int32 output_multiplier, int output_shift,
+      int32 output_activation_min, int32 output_activation_max,
+      const RuntimeShape& output_shape, uint8* output_data, int row_start,
+      int row_end)
+      : input_shape_(input_shape),
+        input_data_(input_data),
+        input_offset_(input_offset),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        filter_offset_(filter_offset),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_offset_(output_offset),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_activation_min_(output_activation_min),
+        output_activation_max_(output_activation_max),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        row_start_(row_start),
+        row_end_(row_end) {}
+
+  void Run() override {  // Forwards every captured argument unchanged.
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape_, input_data_, input_offset_, filter_shape_, filter_data_,
+        filter_offset_, bias_shape_, bias_data_, output_offset_,
+        output_multiplier_, output_shift_, output_activation_min_,
+        output_activation_max_, output_shape_, output_data_, row_start_,
+        row_end_);
+  }
+
+  const RuntimeShape& input_shape_;  // Reference: must outlive Run().
+  const uint8* input_data_;
+  int32 input_offset_;
+  const RuntimeShape& filter_shape_;  // Reference: must outlive Run().
+  const uint8* filter_data_;
+  int32 filter_offset_;
+  const RuntimeShape& bias_shape_;  // Reference: must outlive Run().
+  const int32* bias_data_;
+  int32 output_offset_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int32 output_activation_min_;
+  int32 output_activation_max_;
+  const RuntimeShape& output_shape_;  // Reference: must outlive Run().
+  uint8* output_data_;
+  int row_start_;  // Passed to the worker impl as the start of its row range.
+  int row_end_;    // Passed to the worker impl as the end of its row range.
+};
+
+inline void FullyConnectedAsGEMV(
+    const RuntimeShape& input_shape, const uint8* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const uint8* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kKernelRows = 4;  // Row-count granularity per worker.
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemmlowp_context->max_num_threads(), output_rows, batches, input_size);
+  if (thread_count == 1) {  // NOTE: the context was already dereferenced above.
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, 0, output_rows);
+    return;
+  }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
+      gemmlowp::CeilQuotient(output_rows, thread_count));
+  int row_start = 0;
+  for (int i = 0; i < thread_count; ++i) {  // Contiguous row ranges per task.
+    int row_end = std::min(output_rows, row_start + kRowsPerWorker);
+    tasks[i] = new LegacyFullyConnectedAsGEMVWorkerTask(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, row_start, row_end);
+    row_start = row_end;
+  }  // NOTE(review): no delete here; presumably the call below frees tasks.
+  TFLITE_DCHECK_EQ(row_start, output_rows);
+  gemmlowp_context->workers_pool()->LegacyExecuteAndDestroyTasks(tasks);
+}
+#endif  // USE_NEON
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");  // uint8 in/out.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+#ifdef USE_NEON
+  if (batches == 1) {
+    const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
+                                        output_shape, output_dim_count - 1);
+    if (output_size >= 4) {  // Single batch, >= 4 outputs: take the GEMV path.
+      return FullyConnectedAsGEMV(
+          input_shape, input_data, input_offset, filter_shape, filter_data,
+          filter_offset, bias_shape, bias_data, output_offset,
+          output_multiplier, output_shift, output_activation_min,
+          output_activation_max, output_shape, output_data, gemmlowp_context);
+    }
+  }
+#endif  // USE_NEON
+  const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
+  const int filter_cols = filter_shape.Dims(filter_dim_count - 1);
+  TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);  // GEMM LHS.
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, batches, filter_cols);  // GEMM RHS.
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, batches, output_rows);  // GEMM destination.
+  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
+      filter_offset, input_offset, output_pipeline);
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data_int32, const RuntimeShape& output_shape,
+    int16* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected/Uint8Int16");
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  // uint8 inputs and weights, int16 output. Uses the NEON GEMV kernels when
+  // GEMMLOWP_NEON is defined and the guards below hold, else a gemmlowp GEMM.
+  (void)gemmlowp_context;  // Redundant: used by the GEMM call below.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);  // Not forwarded to any path below.
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+
+  // Implementation of the fully connected node suited to the inside of an LSTM
+  // cell. The operands are 8-bit integers, the accumulators are internally
+  // 32bit integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. NOTE(review): the rationale is
+  // said to be in a function comment, but none sits directly above this one.
+#ifdef GEMMLOWP_NEON
+  if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
+      output_activation_max == 32767) {
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) {
+      GEMVForLstmCellWithSymmetricRange(
+          input_shape, input_data, filter_shape, filter_data, bias_shape,
+          bias_data_int32, output_multiplier, output_shift, output_shape,
+          output_data);
+      return;
+    }
+    if (!(output_depth % 4) && !(accum_depth % 8)) {
+      GEMVForLstmCell(input_shape, input_data, filter_shape, filter_data,
+                      filter_offset, bias_shape, bias_data_int32,
+                      output_multiplier, output_shift, output_shape,
+                      output_data);
+      return;
+    }
+  }
+#endif
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
+      filter_data, output_depth, accum_depth);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, accum_depth, batches);
+  gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_depth, batches);
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  ColVectorMap bias_vector(bias_data_int32, output_depth);
+  gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+  bias_addition_stage.bias_vector = bias_vector;
+  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
+  scale_stage.result_offset_after_shift = 0;
+  scale_stage.result_fixedpoint_multiplier = output_multiplier;
+  // Note that this shift is negated wrt ordinary FC.
+  scale_stage.result_exponent = output_shift;
+  gemmlowp::OutputStageClamp clamp_stage;
+  clamp_stage.min = output_activation_min;
+  clamp_stage.max = output_activation_max;
+  gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
+  auto output_pipeline =
+      std::make_tuple(bias_addition_stage, scale_stage, clamp_stage,
+                      saturating_cast_int16_stage);
+  gemmlowp::GemmWithOutputPipeline<uint8, int16,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
+      filter_offset, input_offset, output_pipeline);
+}
+
 inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_offset, const uint8* filter_data,
                            const Dims<4>& filter_dims, int32 filter_offset,
@@ -429,6 +987,552 @@
                  gemmlowp_context);
 }
 
+#ifdef USE_NEON
+inline void LegacyInt8FullyConnectedAsGEMVWorkerImpl(
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    int8_t* output_data, int row_start, int row_end) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMVInt8/8bit");
+  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  const int output_dim_count = output_shape.DimensionsCount();
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kPeel = 4;
+  const bool shift_left = (output_shift > 0);
+  TFLITE_DCHECK_GE(row_end - row_start, kPeel);
+  // Each iteration produces kPeel consecutive rows of [row_start, row_end).
+  for (int out = row_start; out < row_end; out += kPeel) {
+    out = std::min(out, row_end - kPeel);  // Final group may overlap the last.
+    int32x4_t acc0 = vdupq_n_s32(0);
+    int32x4_t acc1 = acc0;
+    int32x4_t acc2 = acc0;
+    int32x4_t acc3 = acc0;
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    int in = 0;
+    for (; in <= input_size - 16; in += 16) {  // 16 inputs per step.
+      const int8x16_t input_val_s8 = vld1q_s8(input_data + in);
+      const int8_t* filter_ptr = filter_data + in + out * input_size;
+      int8x16_t filter_val_s8_0 = vld1q_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x16_t filter_val_s8_1 = vld1q_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x16_t filter_val_s8_2 = vld1q_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x16_t filter_val_s8_3 = vld1q_s8(filter_ptr);
+      int16x8_t input_val_0, input_val_1;
+      int8x8_t low = vget_low_s8(input_val_s8);
+      int8x8_t high = vget_high_s8(input_val_s8);
+      input_val_0 = vmovl_s8(low);
+      input_val_1 = vmovl_s8(high);
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      low = vget_low_s8(filter_val_s8_0);
+      high = vget_high_s8(filter_val_s8_0);
+      int16x8_t filter_val_0_0 = vmovl_s8(low);
+      int16x8_t filter_val_0_1 = vmovl_s8(high);
+      filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec);
+      filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec);
+      low = vget_low_s8(filter_val_s8_1);
+      high = vget_high_s8(filter_val_s8_1);
+      int16x8_t filter_val_1_0 = vmovl_s8(low);
+      int16x8_t filter_val_1_1 = vmovl_s8(high);
+      filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec);
+      filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec);
+      low = vget_low_s8(filter_val_s8_2);
+      high = vget_high_s8(filter_val_s8_2);
+      int16x8_t filter_val_2_0 = vmovl_s8(low);
+      int16x8_t filter_val_2_1 = vmovl_s8(high);
+      filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec);
+      filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec);
+      low = vget_low_s8(filter_val_s8_3);
+      high = vget_high_s8(filter_val_s8_3);
+      int16x8_t filter_val_3_0 = vmovl_s8(low);
+      int16x8_t filter_val_3_1 = vmovl_s8(high);
+      filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec);
+      filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec);
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0),
+                       vget_low_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0),
+                       vget_low_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0),
+                       vget_low_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0),
+                       vget_low_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1),
+                       vget_low_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1),
+                       vget_low_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1),
+                       vget_low_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1),
+                       vget_low_s16(input_val_1));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0),
+                       vget_high_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0),
+                       vget_high_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0),
+                       vget_high_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0),
+                       vget_high_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1),
+                       vget_high_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1),
+                       vget_high_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1),
+                       vget_high_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1),
+                       vget_high_s16(input_val_1));
+    }
+    for (; in <= input_size - 8; in += 8) {  // 8 inputs per step.
+      const int8x8_t input_val_s8 = vld1_s8(input_data + in);
+      const int8_t* filter_ptr = filter_data + in + out * input_size;
+      int8x8_t filter_val_s8_0 = vld1_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x8_t filter_val_s8_1 = vld1_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x8_t filter_val_s8_2 = vld1_s8(filter_ptr);
+      filter_ptr += input_size;
+      int8x8_t filter_val_s8_3 = vld1_s8(filter_ptr);
+      int16x8_t input_val = vmovl_s8(input_val_s8);
+      input_val = vaddq_s16(input_val, input_offset_vec);
+      int16x8_t filter_val_0 = vmovl_s8(filter_val_s8_0);
+      filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec);
+      int16x8_t filter_val_1 = vmovl_s8(filter_val_s8_1);
+      filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec);
+      int16x8_t filter_val_2 = vmovl_s8(filter_val_s8_2);
+      filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec);
+      int16x8_t filter_val_3 = vmovl_s8(filter_val_s8_3);
+      filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec);
+      acc0 =
+          vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val));
+      acc1 =
+          vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val));
+      acc2 =
+          vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val));
+      acc3 =
+          vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0),
+                       vget_high_s16(input_val));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1),
+                       vget_high_s16(input_val));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2),
+                       vget_high_s16(input_val));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3),
+                       vget_high_s16(input_val));
+    }
+    if (in < input_size) {  // Scalar tail for the remaining inputs.
+      int32 buf[16];
+      vst1q_s32(buf + 0, acc0);
+      vst1q_s32(buf + 4, acc1);
+      vst1q_s32(buf + 8, acc2);
+      vst1q_s32(buf + 12, acc3);
+      for (; in < input_size; in++) {
+        int lane = (in + 8 - input_size) % 4;  // Lanes are summed below.
+        const int32 input_val = input_data[in] + input_offset;
+        for (int k = 0; k < kPeel; k++) {
+          int32 filter_val =
+              filter_data[in + (out + k) * input_size] + filter_offset;
+          buf[lane + 4 * k] += filter_val * input_val;
+        }
+      }
+      acc0 = vld1q_s32(buf + 0);
+      acc1 = vld1q_s32(buf + 4);
+      acc2 = vld1q_s32(buf + 8);
+      acc3 = vld1q_s32(buf + 12);
+    }
+
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
+    int32x2_t pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
+    int32x2_t pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
+    int32x2_t pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_data + out);
+    reduced = vaddq_s32(reduced, bias_vec);
+    if (shift_left) {  // Positive shift: scale up, then apply multiplier.
+      const int32 multiplier_power_of_two = 1 << output_shift;
+      reduced = vmulq_n_s32(reduced, multiplier_power_of_two);
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    } else {
+      // Multiply by the fixed-point multiplier.
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+      // Rounding-shift-right.
+      using gemmlowp::RoundingDivideByPOT;
+      reduced = RoundingDivideByPOT(reduced, -output_shift);
+    }
+    // Add the output offset.
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    reduced = vaddq_s32(reduced, output_offset_vec);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    // Narrow values down to 8 bit signed, saturating.
+    int8x8_t res8 = vqmovn_s16(vcombine_s16(res16, res16));
+    // Apply the clamping from the activation function
+    res8 = vmax_s8(res8, vdup_n_s8(output_activation_min));
+    res8 = vmin_s8(res8, vdup_n_s8(output_activation_max));
+    // Store results to destination.
+    vst1_lane_s8(output_data + out + 0, res8, 0);
+    vst1_lane_s8(output_data + out + 1, res8, 1);
+    vst1_lane_s8(output_data + out + 2, res8, 2);
+    vst1_lane_s8(output_data + out + 3, res8, 3);
+  }
+}
+
+struct LegacyInt8FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+  LegacyInt8FullyConnectedAsGEMVWorkerTask(
+      const RuntimeShape& input_shape, const int8_t* input_data,
+      int32 input_offset, const RuntimeShape& filter_shape,
+      const int8_t* filter_data, int32 filter_offset,
+      const RuntimeShape& bias_shape, const int32* bias_data,
+      int32 output_offset, int32 output_multiplier, int output_shift,
+      int32 output_activation_min, int32 output_activation_max,
+      const RuntimeShape& output_shape, int8_t* output_data, int row_start,
+      int row_end)
+      : input_shape_(input_shape),
+        input_data_(input_data),
+        input_offset_(input_offset),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        filter_offset_(filter_offset),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_offset_(output_offset),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_activation_min_(output_activation_min),
+        output_activation_max_(output_activation_max),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        row_start_(row_start),
+        row_end_(row_end) {}
+  // Forwards the stored arguments to the GEMV worker implementation.
+  void Run() override {
+    LegacyInt8FullyConnectedAsGEMVWorkerImpl(
+        input_shape_, input_data_, input_offset_, filter_shape_, filter_data_,
+        filter_offset_, bias_shape_, bias_data_, output_offset_,
+        output_multiplier_, output_shift_, output_activation_min_,
+        output_activation_max_, output_shape_, output_data_, row_start_,
+        row_end_);
+  }
+  // Shapes are stored as references, so they must outlive this task.
+  const RuntimeShape& input_shape_;
+  const int8_t* input_data_;
+  int32 input_offset_;
+  const RuntimeShape& filter_shape_;
+  const int8_t* filter_data_;
+  int32 filter_offset_;
+  const RuntimeShape& bias_shape_;
+  const int32* bias_data_;
+  int32 output_offset_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int32 output_activation_min_;
+  int32 output_activation_max_;
+  const RuntimeShape& output_shape_;
+  int8_t* output_data_;
+  int row_start_;  // First output row handled by this task.
+  int row_end_;  // One past the last output row handled by this task.
+};
+
+inline void LegacyInt8FullyConnectedAsGEMV(
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    int8_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kKernelRows = 4;  // Row granularity of the work split.
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemmlowp_context->max_num_threads(), output_rows, batches, input_size);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    LegacyInt8FullyConnectedAsGEMVWorkerImpl(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, 0, output_rows);
+    return;
+  }
+  // Multi-threaded case: split rows over the gemmlowp context's threadpool.
+  // NOTE(review): tail ranges may hold < kKernelRows rows; the worker DCHECKs.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<LegacyInt8FullyConnectedAsGEMVWorkerTask> tasks;
+  // TODO(b/131746020) don't create new heap allocations every time.
+  // At least we make it a single heap allocation by using reserve().
+  tasks.reserve(thread_count);
+  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
+      gemmlowp::CeilQuotient(output_rows, thread_count));
+  int row_start = 0;
+  for (int i = 0; i < thread_count; ++i) {
+    int row_end = std::min(output_rows, row_start + kRowsPerWorker);
+    tasks.emplace_back(input_shape, input_data, input_offset, filter_shape,
+                       filter_data, filter_offset, bias_shape, bias_data,
+                       output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_shape, output_data, row_start, row_end);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_rows);
+  gemmlowp_context->workers_pool()->Execute(tasks.size(), tasks.data());
+}
+#endif  // USE_NEON
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
+    gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnectedInt8/8bit");
+  // int8 op: NEON GEMV for one batch, else gemmlowp GEMM, else reference.
+#ifdef USE_NEON
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  if (batches == 1) {
+    const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
+                                        output_shape, output_dim_count - 1);
+    if (output_size >= 4) {  // The GEMV worker needs at least kPeel (4) rows.
+      return LegacyInt8FullyConnectedAsGEMV(
+          input_shape, input_data, input_offset, filter_shape, filter_data,
+          filter_offset, bias_shape, bias_data, output_offset,
+          output_multiplier, output_shift, output_activation_min,
+          output_activation_max, output_shape, output_data, gemmlowp_context);
+    }
+  }
+#endif  // USE_NEON
+  // NOTE(review): this section uses locals declared only under USE_NEON.
+#ifdef GEMMLOWP_NEON
+  const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
+  const int filter_cols = filter_shape.Dims(filter_dim_count - 1);
+  TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, batches, filter_cols);
+  gemmlowp::MatrixMap<int8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, batches, output_rows);
+  const auto& output_pipeline = GemmlowpOutputPipelineInt8::MakeExp(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+
+  gemmlowp::GemmWithOutputPipeline<
+      int8, int8, gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams>(
+      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
+      filter_offset, input_offset, output_pipeline);
+  return;  // The reference fallback below is unreachable in this build.
+#endif  // GEMMLOWP_NEON
+
+  // If both GEMMLOWP_NEON && NEON paths are skipped, fallback to reference
+  // implementation.
+  reference_integer_ops::FullyConnected(params, input_shape, input_data,
+                                        filter_shape, filter_data, bias_shape,
+                                        bias_data, output_shape, output_data);
+}
+
+struct LegacyShuffledFullyConnectedWorkerTask : gemmlowp::Task {
+  LegacyShuffledFullyConnectedWorkerTask(const uint8* input_data,
+                                         const int8* shuffled_weights_data,
+                                         int batches, int output_depth,
+                                         int output_stride, int accum_depth,
+                                         const int32* bias_data,
+                                         int32 output_multiplier,
+                                         int output_shift, int16* output_data)
+      : input_data_(input_data),
+        shuffled_weights_data_(shuffled_weights_data),
+        batches_(batches),
+        output_depth_(output_depth),
+        output_stride_(output_stride),
+        accum_depth_(accum_depth),
+        bias_data_(bias_data),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_data_(output_data) {}
+  // Forwards the stored arguments to ShuffledFullyConnectedWorkerImpl.
+  void Run() override {
+    ShuffledFullyConnectedWorkerImpl(
+        input_data_, shuffled_weights_data_, batches_, output_depth_,
+        output_stride_, accum_depth_, bias_data_, output_multiplier_,
+        output_shift_, output_data_);
+  }
+  // Arguments captured at construction; this struct never frees the pointers.
+  const uint8* input_data_;
+  const int8* shuffled_weights_data_;
+  int batches_;
+  int output_depth_;
+  int output_stride_;
+  int accum_depth_;
+  const int32* bias_data_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int16* output_data_;
+};
+
+inline void ShuffledFullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& weights_shape,
+    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int16* output_data, uint8* shuffled_input_workspace_data,
+    gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label("ShuffledFullyConnected/8bit");
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  (void)gemmlowp_context;  // Redundant: dereferenced unconditionally below.
+  TFLITE_DCHECK_EQ(output_activation_min, -32768);
+  TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* int8_shuffled_weights_data =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  if (batches == 1) {
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (int i = 0; i < accum_depth; i += 16) {  // Needs accum_depth % 16 == 0.
+      uint8x16_t val = vld1q_u8(input_data + i);
+      val = veorq_u8(val, signbit);
+      vst1q_u8(shuffled_input_workspace_data + i, val);
+    }
+#else
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+#endif
+  } else if (batches == 4) {
+    uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+    int c = 0;
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (c = 0; c < accum_depth; c += 16) {
+      const uint8* src_data_ptr = input_data + c;
+      uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth);
+      uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth);
+      uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth);
+      uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth);
+      val0 = veorq_u8(val0, signbit);
+      val1 = veorq_u8(val1, signbit);
+      val2 = veorq_u8(val2, signbit);
+      val3 = veorq_u8(val3, signbit);
+      vst1q_u8(shuffled_input_workspace_ptr + 0, val0);
+      vst1q_u8(shuffled_input_workspace_ptr + 16, val1);
+      vst1q_u8(shuffled_input_workspace_ptr + 32, val2);
+      vst1q_u8(shuffled_input_workspace_ptr + 48, val3);
+      shuffled_input_workspace_ptr += 64;
+    }
+#else
+    for (c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
+        }
+      }
+    }
+#endif
+  } else {
+    TFLITE_DCHECK(false);  // Only batch counts of 1 and 4 are handled.
+    return;  // Without DCHECKs this returns with output_data unwritten.
+  }
+
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemmlowp_context->max_num_threads(), output_depth, batches, accum_depth);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    ShuffledFullyConnectedWorkerImpl(
+        shuffled_input_workspace_data, int8_shuffled_weights_data, batches,
+        output_depth, output_depth, accum_depth, bias_data, output_multiplier,
+        output_shift, output_data);
+    return;
+  }
+  // NOTE(review): tasks are heap-allocated; presumably the pool deletes them.
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
+      gemmlowp::CeilQuotient(output_depth, thread_count));
+  int row_start = 0;
+  for (int i = 0; i < thread_count; i++) {
+    int row_end = std::min(output_depth, row_start + kRowsPerWorker);
+    tasks[i] = new LegacyShuffledFullyConnectedWorkerTask(
+        shuffled_input_workspace_data,
+        int8_shuffled_weights_data + row_start * accum_depth, batches,
+        row_end - row_start, output_depth, accum_depth, bias_data + row_start,
+        output_multiplier, output_shift, output_data + row_start);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_depth);
+  gemmlowp_context->workers_pool()->LegacyExecuteAndDestroyTasks(tasks);
+}
+
 inline void ShuffledFullyConnected(
     const uint8* input_data, const Dims<4>& input_dims,
     const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
@@ -513,6 +1617,109 @@
          kwidth, zero_byte, output_data, output_dims);
 }
 
+inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& filter_shape,
+                 const float* filter_data, const RuntimeShape& bias_shape,
+                 const float* bias_data, const RuntimeShape& output_shape,
+                 float* output_data, const RuntimeShape& im2col_shape,
+                 float* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  (void)im2col_data;
+  (void)im2col_shape;
+  gemmlowp::ScopedProfilingLabel label("Conv");
+
+  // NB: the float 0.0f value is represented by all zero bytes.
+  const uint8 float_zero_byte = 0x00;
+  const float* gemm_input_data = nullptr;
+  const RuntimeShape* gemm_input_shape = nullptr;
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_dilated_im2col) {
+    DilatedIm2col(params, float_zero_byte, input_shape, input_data,
+                  filter_shape, output_shape, im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    Im2col(params, filter_height, filter_width, float_zero_byte, input_shape,
+           input_data, im2col_shape, im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else {
+    // TODO(aselle): We need to make sure to not send im2col if it is not
+    // needed.
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_shape = &input_shape;
+  }
+
+  // The following code computes matrix multiplication c = a * transpose(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The variable names are aligned with the CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+  int n = output_shape.Dims(3);
+  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
+
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+  // The stride of matrix a, b and c respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+#else
+  // When an optimized CBLAS implementation is not available, fall back
+  // to using Eigen.
+  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
+      Matrix;
+  typedef Eigen::Map<Matrix> MatrixRef;
+  typedef Eigen::Map<const Matrix> ConstMatrixRef;
+
+  MatrixRef matrix_c(c, m, n);
+  ConstMatrixRef matrix_a(a, m, k);
+  ConstMatrixRef matrix_b(b, n, k);
+
+  // The following special casing for when a or b is a vector is required
+  // as Eigen seems to fail to make this optimization on its own.
+  if (n == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose();
+  } else if (m == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose();
+  } else {
+    gemmlowp::ScopedProfilingLabel label("GEMM");
+    matrix_c.noalias() = matrix_a * matrix_b.transpose();
+  }
+
+#endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      output_activation_min, output_activation_max, bias_shape, bias_data,
+      output_shape, output_data);
+}
+
 inline void Conv(const float* input_data, const Dims<4>& input_dims,
                  const float* filter_data, const Dims<4>& filter_dims,
                  const float* bias_data, const Dims<4>& bias_dims,
@@ -608,6 +1815,112 @@
            output_dims, im2col_data, im2col_dims);
 }
 
+inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
+                 const uint8* input_data, const RuntimeShape& filter_shape,
+                 const uint8* filter_data, const RuntimeShape& bias_shape,
+                 const int32* bias_data, const RuntimeShape& output_shape,
+                 uint8* output_data, const RuntimeShape& im2col_shape,
+                 uint8* im2col_data, gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label("Conv/8bit");
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const uint8* gemm_input_data = nullptr;
+  const RuntimeShape* gemm_input_shape = nullptr;
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_dilated_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    DilatedIm2col(params, input_zero_point, input_shape, input_data,
+                  filter_shape, output_shape, im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    Im2col(params, filter_height, filter_width, input_zero_point, input_shape,
+           input_data, im2col_shape, im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_shape = &input_shape;
+  }
+
+  const int gemm_input_rows = gemm_input_shape->Dims(3);
+  // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
+  // The root cause has not yet been identified though. Same applies below for
+  // the other calls commented out. This is a partial rollback of cl/196819423.
+  // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
+  const int gemm_input_cols = gemm_input_shape->Dims(0) *
+                              gemm_input_shape->Dims(1) *
+                              gemm_input_shape->Dims(2);
+  const int filter_rows = filter_shape.Dims(0);
+  // See b/79927784.
+  // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
+  const int filter_cols =
+      filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
+  const int output_rows = output_shape.Dims(3);
+  // See b/79927784.
+  // const int output_cols = FlatSizeSkipDim(output_shape, 3);
+  const int output_cols =
+      output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+#ifdef USE_NEON
+  if (gemm_input_cols == 1 && output_rows >= 4) {
+    RuntimeShape fc_filter_shape{
+        filter_shape.Dims(0),
+        filter_shape.Dims(filter_shape.DimensionsCount() - 1)};
+
+    return FullyConnectedAsGEMV(
+        *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape,
+        filter_data, filter_offset, bias_shape, bias_data, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_shape, output_data, gemmlowp_context);
+  }
+#endif
+
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, filter_rows, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      gemm_input_data, gemm_input_rows, gemm_input_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols);
+  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
+      filter_offset, input_offset, output_pipeline);
+}
+
 inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_offset, const uint8* filter_data,
                  const Dims<4>& filter_dims, int32 filter_offset,
@@ -786,6 +2099,28 @@
       filter_offset, input_offset, output_pipeline);
 }
 
+inline void TransposeConv(
+    const ConvParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& output_shape,
+    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
+  gemmlowp::ScopedProfilingLabel label("TransposeConv");
+  // Note we could use transposed weights with forward conv for unstrided
+  // cases. But we are already getting good performance with this code as-is.
+  TFLITE_DCHECK(im2col_data);
+  TransposeIm2col(params, 0, input_shape, input_data, filter_shape,
+                  output_shape, im2col_data);
+
+  const auto im2col_matrix_map =
+      MapAsMatrixWithLastDimAsRows(im2col_data, im2col_shape);
+  const auto filter_matrix_map =
+      MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape);
+  auto output_matrix_map =
+      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+
+  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
+}
+
 inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
                           const float* filter_data, const Dims<4>& filter_dims,
                           int stride_width, int stride_height, int pad_width,
@@ -824,6 +2159,123 @@
                   im2col_data);
 }
 
+inline void LstmCell(
+    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
+    const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
+    const float* prev_activ_data, const RuntimeShape& weights_shape,
+    const float* weights_data, const RuntimeShape& unextended_bias_shape,
+    const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
+    const float* prev_state_data,
+    const RuntimeShape& unextended_output_state_shape, float* output_state_data,
+    const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
+    const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
+    const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
+  gemmlowp::ScopedProfilingLabel label("LstmCell");
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape =
+      RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  MatchingDim(  // batches
+      input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+      output_state_shape, 0, output_activ_shape, 0);
+  MatchingDim(  // height
+      input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+      output_state_shape, 1, output_activ_shape, 1);
+  MatchingDim(  // width
+      input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+      output_state_shape, 2, output_activ_shape, 2);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
+                   total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  const int intern_activ_depth =
+      MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
+                   intern_activ_depth * total_input_depth);
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                  3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate prev_activ and input data together
+  std::vector<float const*> concat_input_arrays_data;
+  std::vector<RuntimeShape const*> concat_input_arrays_shapes;
+  concat_input_arrays_data.push_back(input_data);
+  concat_input_arrays_data.push_back(prev_activ_data);
+  concat_input_arrays_shapes.push_back(&input_shape);
+  concat_input_arrays_shapes.push_back(&prev_activ_shape);
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = concat_input_arrays_data.size();
+  Concatenation(concat_params, &(concat_input_arrays_shapes[0]),
+                &(concat_input_arrays_data[0]), concat_temp_shape,
+                concat_temp_data);
+
+  // Fully connected
+  tflite::FullyConnectedParams fc_params;
+  fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+  fc_params.float_activation_max = std::numeric_limits<float>::max();
+  FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
+                 weights_data, bias_shape, bias_data, activ_temp_shape,
+                 activ_temp_data);
+
+  // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
+  // operations.
+  ArrayMap<float> activ_temp_map =
+      MapAsArrayWithLastDimAsRows(activ_temp_data, activ_temp_shape);
+  auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
+                                            activ_temp_map.cols());
+  auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
+                                           activ_temp_map.cols());
+  auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  ArrayMap<const float> prev_state_map =
+      MapAsArrayWithLastDimAsRows(prev_state_data, prev_state_shape);
+  ArrayMap<float> output_state_map =
+      MapAsArrayWithLastDimAsRows(output_state_data, output_state_shape);
+  ArrayMap<float> output_activ_map =
+      MapAsArrayWithLastDimAsRows(output_activ_data, output_activ_shape);
+
+  // Combined memory state and final output calculation
+  gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
+  output_state_map =
+      input_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
+          new_input_sm.tanh() +
+      forget_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
+          prev_state_map;
+  output_activ_map =
+      output_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
+      output_state_map.tanh();
+}
+
 inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                      const float* prev_activ_data,
                      const Dims<4>& prev_activ_dims, const float* weights_data,
@@ -848,6 +2300,293 @@
 }
 
 template <int StateIntegerBits>
+inline void LstmCell(
+    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
+    const uint8* input_data_uint8,
+    const RuntimeShape& unextended_prev_activ_shape,
+    const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
+    const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
+    const int32* bias_data_int32,
+    const RuntimeShape& unextended_prev_state_shape,
+    const int16* prev_state_data_int16,
+    const RuntimeShape& unextended_output_state_shape,
+    int16* output_state_data_int16,
+    const RuntimeShape& unextended_output_activ_shape,
+    uint8* output_activ_data_uint8,
+    const RuntimeShape& unextended_concat_temp_shape,
+    uint8* concat_temp_data_uint8,
+    const RuntimeShape& unextended_activ_temp_shape,
+    int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) {
+  gemmlowp::ScopedProfilingLabel label(
+      "LstmCell/quantized (8bit external, 16bit internal)");
+  int32 weights_zero_point = params.weights_zero_point;
+  int32 accum_multiplier = params.accum_multiplier;
+  int accum_shift = params.accum_shift;
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape =
+      RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  // Gather dimensions information, and perform consistency checks.
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int outer_size = MatchingFlatSizeSkipDim(
+      input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
+      output_activ_shape);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
+                   total_input_depth);
+  const int intern_activ_depth =
+      MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
+                   intern_activ_depth * total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                  3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+  const int fc_output_depth =
+      MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+  const int fc_accum_depth = total_input_depth;
+  TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
+                                              prev_activ_data_uint8};
+  const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
+                                                       &prev_activ_shape};
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = 2;
+  Concatenation(concat_params, concat_input_arrays_shapes,
+                concat_input_arrays_data, concat_temp_shape,
+                concat_temp_data_uint8);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  bool gemm_already_performed = false;
+#ifdef GEMMLOWP_NEON
+  if (fc_batches == 1 && !(fc_output_depth % 4) && !(fc_accum_depth % 8)) {
+    GEMVForLstmCell(concat_temp_shape, concat_temp_data_uint8, weights_shape,
+                    weights_data_uint8, weights_zero_point, bias_shape,
+                    bias_data_int32, accum_multiplier, accum_shift,
+                    activ_temp_shape, activ_temp_data_int16);
+    gemm_already_performed = true;
+  }
+#endif
+  if (!gemm_already_performed) {
+    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor>
+        weights_matrix(weights_data_uint8, fc_output_depth, fc_accum_depth);
+    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+        concat_temp_data_uint8, fc_accum_depth, fc_batches);
+    gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+        activ_temp_data_int16, fc_output_depth, fc_batches);
+    typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+        ColVectorMap;
+    ColVectorMap bias_vector(bias_data_int32, fc_output_depth);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
+    scale_stage.result_offset_after_shift = 0;
+    scale_stage.result_fixedpoint_multiplier = accum_multiplier;
+    scale_stage.result_exponent = accum_shift;
+    gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
+    auto output_pipeline = std::make_tuple(bias_addition_stage, scale_stage,
+                                           saturating_cast_int16_stage);
+    gemmlowp::GemmWithOutputPipeline<
+        uint8, int16, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+        gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
+        -weights_zero_point, -128, output_pipeline);
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  const int16* input_gate_input_ptr = activ_temp_data_int16;
+  const int16* input_modulation_gate_input_ptr =
+      activ_temp_data_int16 + output_depth;
+  const int16* forget_gate_input_ptr = activ_temp_data_int16 + 2 * output_depth;
+  const int16* output_gate_input_ptr = activ_temp_data_int16 + 3 * output_depth;
+  const int16* prev_state_ptr = prev_state_data_int16;
+  int16* output_state_data_ptr = output_state_data_int16;
+  uint8* output_activ_data_ptr = output_activ_data_uint8;
+
+  for (int b = 0; b < outer_size; ++b) {
+    int c = 0;
+#ifdef GEMMLOWP_NEON
+    for (; c <= output_depth - 8; c += 8) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<int16x8_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(vld1q_s16(input_gate_input_ptr));
+      input_gate_input_ptr += 8;
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input =
+          F3::FromRaw(vld1q_s16(input_modulation_gate_input_ptr));
+      input_modulation_gate_input_ptr += 8;
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(vld1q_s16(forget_gate_input_ptr));
+      forget_gate_input_ptr += 8;
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(vld1q_s16(output_gate_input_ptr));
+      output_gate_input_ptr += 8;
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(vld1q_s16(prev_state_ptr));
+      prev_state_ptr += 8;
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+      // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
+      vst1q_s16(output_state_data_ptr, new_state.raw());
+      output_state_data_ptr += 8;
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16x8_t rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int8x8_t int8_output_activ = vqmovn_s16(rescaled_output_activ);
+      uint8x8_t uint8_output_activ =
+          vadd_u8(vdup_n_u8(128), vreinterpret_u8_s8(int8_output_activ));
+      vst1_u8(output_activ_data_ptr, uint8_output_activ);
+      output_activ_data_ptr += 8;
+    }
+#endif
+    for (; c < output_depth; ++c) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(*input_gate_input_ptr++);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input =
+          F3::FromRaw(*input_modulation_gate_input_ptr++);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(*forget_gate_input_ptr++);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(*output_gate_input_ptr++);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(*prev_state_ptr++);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+      // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
+      *output_state_data_ptr++ = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16 rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      *output_activ_data_ptr++ = 128 + clamped_output_activ;
+    }
+    input_gate_input_ptr += 3 * output_depth;
+    input_modulation_gate_input_ptr += 3 * output_depth;
+    forget_gate_input_ptr += 3 * output_depth;
+    output_gate_input_ptr += 3 * output_depth;
+  }
+}
+
+template <int StateIntegerBits>
 void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
               const uint8* prev_activ_data_uint8,
               const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index fda0828..46bba41 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -18,6 +18,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include <sys/types.h>
+
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -26,6 +27,8 @@
 #include <tuple>
 #include <type_traits>
 
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
 #if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
 #include <Accelerate/Accelerate.h>
 #endif
@@ -33,8 +36,12 @@
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
+#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -87,6 +94,7 @@
 using reference_ops::Relu1;
 using reference_ops::Relu6;
 using reference_ops::ReluX;
+using reference_ops::Round;
 using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
@@ -194,99 +202,9 @@
                                              const float* bias_data,
                                              const RuntimeShape& array_shape,
                                              float* array_data) {
-#ifdef USE_NEON
-  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
-  const int bias_size = bias_shape.FlatSize();
-  const int array_size = array_shape.FlatSize();
-  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
-  float* array_ptr = array_data;
-  float* array_end_ptr = array_ptr + array_size;
-  const auto activation_min = vdupq_n_f32(output_activation_min);
-  const auto activation_max = vdupq_n_f32(output_activation_max);
-  for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
-    int i = 0;
-    for (; i <= bias_size - 16; i += 16) {
-      auto b0 = vld1q_f32(bias_data + i);
-      auto b1 = vld1q_f32(bias_data + i + 4);
-      auto b2 = vld1q_f32(bias_data + i + 8);
-      auto b3 = vld1q_f32(bias_data + i + 12);
-      auto a0 = vld1q_f32(array_ptr + i);
-      auto a1 = vld1q_f32(array_ptr + i + 4);
-      auto a2 = vld1q_f32(array_ptr + i + 8);
-      auto a3 = vld1q_f32(array_ptr + i + 12);
-      auto x0 = vaddq_f32(a0, b0);
-      auto x1 = vaddq_f32(a1, b1);
-      auto x2 = vaddq_f32(a2, b2);
-      auto x3 = vaddq_f32(a3, b3);
-      x0 = vmaxq_f32(activation_min, x0);
-      x1 = vmaxq_f32(activation_min, x1);
-      x2 = vmaxq_f32(activation_min, x2);
-      x3 = vmaxq_f32(activation_min, x3);
-      x0 = vminq_f32(activation_max, x0);
-      x1 = vminq_f32(activation_max, x1);
-      x2 = vminq_f32(activation_max, x2);
-      x3 = vminq_f32(activation_max, x3);
-      vst1q_f32(array_ptr + i, x0);
-      vst1q_f32(array_ptr + i + 4, x1);
-      vst1q_f32(array_ptr + i + 8, x2);
-      vst1q_f32(array_ptr + i + 12, x3);
-    }
-    for (; i <= bias_size - 4; i += 4) {
-      auto b = vld1q_f32(bias_data + i);
-      auto a = vld1q_f32(array_ptr + i);
-      auto x = vaddq_f32(a, b);
-      x = vmaxq_f32(activation_min, x);
-      x = vminq_f32(activation_max, x);
-      vst1q_f32(array_ptr + i, x);
-    }
-    for (; i < bias_size; i++) {
-      array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
-                                                  output_activation_min,
-                                                  output_activation_max);
-    }
-  }
-#else  // not NEON
-  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
-  const int bias_size = bias_shape.FlatSize();
-  const int array_size = array_shape.FlatSize();
-  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
-  for (int array_offset = 0; array_offset < array_size;
-       array_offset += bias_size) {
-    for (int i = 0; i < bias_size; i++) {
-      array_data[array_offset + i] = ActivationFunctionWithMinMax(
-          array_data[array_offset + i] + bias_data[i], output_activation_min,
-          output_activation_max);
-    }
-  }
-#endif
-}
-
-template <typename Lhs, typename Rhs, typename Result>
-void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
-          Eigen::MatrixBase<Result>* result) {
-  if (rhs.cols() == 1) {
-    gemmlowp::ScopedProfilingLabel label("GEMV");
-    result->col(0).noalias() = lhs * rhs.col(0);
-  } else {
-    gemmlowp::ScopedProfilingLabel label("GEMM");
-    result->noalias() = lhs * rhs;
-  }
-}
-
-inline void optimized_ops_preload_l1_stream(const uint8* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
-}
-
-inline void optimized_ops_preload_l1_keep(const uint8* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
+  BiasAndClamp(output_activation_min, output_activation_max,
+               bias_shape.FlatSize(), bias_data, array_shape.FlatSize(),
+               array_data);
 }
 
 #ifdef GEMMLOWP_NEON
@@ -779,41 +697,31 @@
     const float* input_data, const RuntimeShape& weights_shape,
     const float* weights_data, const RuntimeShape& bias_shape,
     const float* optional_bias_data, const RuntimeShape& output_shape,
-    float* output_data) {
+    float* output_data, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("FullyConnected");
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-
-  // TODO(b/62193649): this convoluted shape computation (determining
-  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
-  // is because the current --variable_batch hack consists in overwriting the
-  // 3rd dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  // When that is fixed, this should become:
-  // const auto input_matrix_map =
-  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
   const int dims_count = weights_shape.DimensionsCount();
   const int input_rows = weights_shape.Dims(dims_count - 1);
-  const auto input_matrix_map =
-      MapAsMatrixWithGivenNumberOfRows(input_data, input_shape, input_rows);
-  const auto filter_matrix_map =
-      MapAsMatrixWithLastDimAsRows(weights_data, weights_shape);
-  auto output_matrix_map =
-      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
-
-  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-
-  if (optional_bias_data != nullptr) {
-    AddBiasAndEvalActivationFunction(
-        output_activation_min, output_activation_max, bias_shape,
-        optional_bias_data, output_shape, output_data);
-  } else {
-    const int flat_size = output_shape.FlatSize();
-    for (int i = 0; i < flat_size; ++i) {
-      output_data[i] = ActivationFunctionWithMinMax(
-          output_data[i], output_activation_min, output_activation_max);
-    }
-  }
+  cpu_backend_gemm::MatrixParams<float> rhs_params;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.rows = input_rows;
+  rhs_params.cols = input_shape.FlatSize() / input_rows;
+  TFLITE_DCHECK_EQ(input_shape.FlatSize(), rhs_params.rows * rhs_params.cols);
+  cpu_backend_gemm::MatrixParams<float> lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.cols = weights_shape.Dims(dims_count - 1);
+  lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
+  cpu_backend_gemm::MatrixParams<float> dst_params;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
+  dst_params.cols =
+      FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
+  cpu_backend_gemm::GemmParams<float, float> gemm_params;
+  gemm_params.bias = optional_bias_data;
+  gemm_params.clamp_min = params.float_activation_min;
+  gemm_params.clamp_max = params.float_activation_max;
+  cpu_backend_gemm::Gemm(lhs_params, weights_data, rhs_params, input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 #ifdef USE_NEON
@@ -1034,7 +942,7 @@
   }
 }
 
-struct FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+struct FullyConnectedAsGEMVWorkerTask : cpu_backend_threadpool::Task {
   FullyConnectedAsGEMVWorkerTask(const RuntimeShape& input_shape,
                                  const uint8* input_data, int32 input_offset,
                                  const RuntimeShape& filter_shape,
@@ -1099,14 +1007,14 @@
     const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
     int32 output_multiplier, int output_shift, int32 output_activation_min,
     int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    uint8* output_data, CpuBackendContext* cpu_backend_context) {
   const int output_dim_count = output_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
   const int output_rows = output_shape.Dims(output_dim_count - 1);
   const int input_size = FlatSizeSkipDim(input_shape, 0);
   static constexpr int kKernelRows = 4;
-  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
-      gemmlowp_context->max_num_threads(), output_rows, batches, input_size);
+  const int thread_count = LegacyHowManyThreads<kKernelRows>(
+      cpu_backend_context->max_num_threads(), output_rows, batches, input_size);
   if (thread_count == 1) {
     // Single-thread case: do the computation on the current thread, don't
     // use a threadpool
@@ -1120,58 +1028,34 @@
 
   // Multi-threaded case: use the gemmlowp context's threadpool.
   TFLITE_DCHECK_GT(thread_count, 1);
-  std::vector<gemmlowp::Task*> tasks(thread_count);
-  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
-      gemmlowp::CeilQuotient(output_rows, thread_count));
+  std::vector<FullyConnectedAsGEMVWorkerTask> tasks;
+  // TODO(b/131746020) don't create new heap allocations every time.
+  // At least we make it a single heap allocation by using reserve().
+  tasks.reserve(thread_count);
+  const int kRowsPerWorker =
+      RoundUp<kKernelRows>(CeilQuotient(output_rows, thread_count));
   int row_start = 0;
   for (int i = 0; i < thread_count; ++i) {
     int row_end = std::min(output_rows, row_start + kRowsPerWorker);
-    tasks[i] = new FullyConnectedAsGEMVWorkerTask(
-        input_shape, input_data, input_offset, filter_shape, filter_data,
-        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max,
-        output_shape, output_data, row_start, row_end);
+    tasks.emplace_back(input_shape, input_data, input_offset, filter_shape,
+                       filter_data, filter_offset, bias_shape, bias_data,
+                       output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_shape, output_data, row_start, row_end);
     row_start = row_end;
   }
   TFLITE_DCHECK_EQ(row_start, output_rows);
-  gemmlowp_context->workers_pool()->Execute(tasks);
+  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                  cpu_backend_context);
 }
 #endif  // USE_NEON
 
-struct GemmlowpOutputPipeline {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
-                     gemmlowp::OutputStageClamp,
-                     gemmlowp::OutputStageSaturatingCastToUint8>
-      Pipeline;
-  static Pipeline MakeExp(const int32* bias_data, int output_rows,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_left_shift, int32 output_activation_min,
-                          int32 output_activation_max) {
-    ColVectorMap bias_vector(bias_data, output_rows);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_offset;
-    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
-    quantize_down_stage.result_exponent = output_left_shift;
-    gemmlowp::OutputStageClamp clamp_stage;
-    clamp_stage.min = output_activation_min;
-    clamp_stage.max = output_activation_max;
-    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           clamp_stage, saturating_cast_stage);
-  }
-};
-
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    uint8* output_data, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
@@ -1199,7 +1083,8 @@
           input_shape, input_data, input_offset, filter_shape, filter_data,
           filter_offset, bias_shape, bias_data, output_offset,
           output_multiplier, output_shift, output_activation_min,
-          output_activation_max, output_shape, output_data, gemmlowp_context);
+          output_activation_max, output_shape, output_data,
+          cpu_backend_context);
     }
   }
 #endif  // USE_NEON
@@ -1210,19 +1095,30 @@
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
 
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, filter_cols, batches, filter_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, batches, output_rows);
-  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
-      filter_offset, input_offset, output_pipeline);
+  cpu_backend_gemm::MatrixParams<uint8> lhs_params;
+  lhs_params.rows = filter_rows;
+  lhs_params.cols = filter_cols;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.zero_point = -filter_offset;
+  cpu_backend_gemm::MatrixParams<uint8> rhs_params;
+  rhs_params.rows = filter_cols;
+  rhs_params.cols = batches;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.zero_point = -input_offset;
+  cpu_backend_gemm::MatrixParams<uint8> dst_params;
+  dst_params.rows = filter_rows;
+  dst_params.cols = batches;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.zero_point = output_offset;
+  cpu_backend_gemm::GemmParams<int32, uint8> gemm_params;
+  gemm_params.bias = bias_data;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  gemm_params.multiplier_fixedpoint = output_multiplier;
+  gemm_params.multiplier_exponent = output_shift;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 inline void FullyConnected(
@@ -1230,7 +1126,7 @@
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data_int32, const RuntimeShape& output_shape,
-    int16* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    int16* output_data, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("FullyConnected/Uint8Int16");
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
@@ -1239,9 +1135,6 @@
   const int output_shift = params.output_shift;
   const int32 output_activation_min = params.quantized_activation_min;
   const int32 output_activation_max = params.quantized_activation_max;
-  // This is a copy of the reference implementation. We do not currently have a
-  // properly optimized version.
-  (void)gemmlowp_context;  // only used in properly optimized code.
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   TFLITE_DCHECK_EQ(output_offset, 0);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
@@ -1283,33 +1176,31 @@
     }
   }
 #endif
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
-      filter_data, output_depth, accum_depth);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, accum_depth, batches);
-  gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_depth, batches);
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  ColVectorMap bias_vector(bias_data_int32, output_depth);
-  gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-  bias_addition_stage.bias_vector = bias_vector;
-  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
-  scale_stage.result_offset_after_shift = 0;
-  scale_stage.result_fixedpoint_multiplier = output_multiplier;
-  // Note that this shift is negated wrt ordinary FC.
-  scale_stage.result_exponent = output_shift;
-  gemmlowp::OutputStageClamp clamp_stage;
-  clamp_stage.min = output_activation_min;
-  clamp_stage.max = output_activation_max;
-  gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
-  auto output_pipeline =
-      std::make_tuple(bias_addition_stage, scale_stage, clamp_stage,
-                      saturating_cast_int16_stage);
-  gemmlowp::GemmWithOutputPipeline<uint8, int16,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
-      filter_offset, input_offset, output_pipeline);
+
+  cpu_backend_gemm::MatrixParams<uint8> lhs_params;
+  lhs_params.rows = output_depth;
+  lhs_params.cols = accum_depth;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.zero_point = -filter_offset;
+  cpu_backend_gemm::MatrixParams<uint8> rhs_params;
+  rhs_params.rows = accum_depth;
+  rhs_params.cols = batches;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.zero_point = -input_offset;
+  cpu_backend_gemm::MatrixParams<int16> dst_params;
+  dst_params.rows = output_depth;
+  dst_params.cols = batches;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.zero_point = 0;
+  cpu_backend_gemm::GemmParams<int32, int16> gemm_params;
+  gemm_params.bias = bias_data_int32;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  gemm_params.multiplier_fixedpoint = output_multiplier;
+  gemm_params.multiplier_exponent = output_shift;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 // Internal function doing the actual arithmetic work for
@@ -1593,7 +1484,7 @@
 
 // Wraps ShuffledFullyConnectedWorkerImpl into a Task class
 // to allow using gemmlowp's threadpool.
-struct ShuffledFullyConnectedWorkerTask : gemmlowp::Task {
+struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task {
   ShuffledFullyConnectedWorkerTask(const uint8* input_data,
                                    const int8* shuffled_weights_data,
                                    int batches, int output_depth,
@@ -1637,13 +1528,12 @@
     const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
     int16* output_data, uint8* shuffled_input_workspace_data,
-    gemmlowp::GemmContext* gemmlowp_context) {
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("ShuffledFullyConnected/8bit");
   const int32 output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
   const int32 output_activation_min = params.quantized_activation_min;
   const int32 output_activation_max = params.quantized_activation_max;
-  (void)gemmlowp_context;  // only used in optimized code.
   TFLITE_DCHECK_EQ(output_activation_min, -32768);
   TFLITE_DCHECK_EQ(output_activation_max, 32767);
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
@@ -1725,8 +1615,9 @@
   }
 
   static constexpr int kKernelRows = 4;
-  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
-      gemmlowp_context->max_num_threads(), output_depth, batches, accum_depth);
+  const int thread_count =
+      LegacyHowManyThreads<kKernelRows>(cpu_backend_context->max_num_threads(),
+                                        output_depth, batches, accum_depth);
   if (thread_count == 1) {
     // Single-thread case: do the computation on the current thread, don't
     // use a threadpool
@@ -1739,21 +1630,25 @@
 
   // Multi-threaded case: use the gemmlowp context's threadpool.
   TFLITE_DCHECK_GT(thread_count, 1);
-  std::vector<gemmlowp::Task*> tasks(thread_count);
-  const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
-      gemmlowp::CeilQuotient(output_depth, thread_count));
+  std::vector<ShuffledFullyConnectedWorkerTask> tasks;
+  // TODO(b/131746020) don't create new heap allocations every time.
+  // At least we make it a single heap allocation by using reserve().
+  tasks.reserve(thread_count);
+  const int kRowsPerWorker =
+      RoundUp<kKernelRows>(CeilQuotient(output_depth, thread_count));
   int row_start = 0;
   for (int i = 0; i < thread_count; i++) {
     int row_end = std::min(output_depth, row_start + kRowsPerWorker);
-    tasks[i] = new ShuffledFullyConnectedWorkerTask(
-        shuffled_input_workspace_data,
-        int8_shuffled_weights_data + row_start * accum_depth, batches,
-        row_end - row_start, output_depth, accum_depth, bias_data + row_start,
-        output_multiplier, output_shift, output_data + row_start);
+    tasks.emplace_back(shuffled_input_workspace_data,
+                       int8_shuffled_weights_data + row_start * accum_depth,
+                       batches, row_end - row_start, output_depth, accum_depth,
+                       bias_data + row_start, output_multiplier, output_shift,
+                       output_data + row_start);
     row_start = row_end;
   }
   TFLITE_DCHECK_EQ(row_start, output_depth);
-  gemmlowp_context->workers_pool()->Execute(tasks);
+  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                  cpu_backend_context);
 }
 
 inline void MeanImpl(const tflite::MeanParams& op_params,
@@ -1877,7 +1772,7 @@
   }
 }
 
-struct MeanWorkerTask : public gemmlowp::Task {
+struct MeanWorkerTask : cpu_backend_threadpool::Task {
   MeanWorkerTask(const tflite::MeanParams& op_params,
                  const RuntimeShape& input_shape, const uint8_t* input_data,
                  int32 input_zero_point, float input_scale,
@@ -1921,9 +1816,8 @@
                  const uint8_t* input_data, int32 input_zero_point,
                  float input_scale, const RuntimeShape& unextended_output_shape,
                  uint8_t* output_data, int32 output_zero_point,
-                 float output_scale, gemmlowp::GemmContext* gemmlowp_context) {
+                 float output_scale, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
-
   // Current implementation only supports dimension equals 4 and simultaneous
   // reduction over width and height.
   TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
@@ -1946,28 +1840,32 @@
   int thread_count = output_depth / kMinDepthPerThread;
   thread_count = thread_count > 0 ? thread_count : 1;
   const int capped_thread_count =
-      std::min(thread_count, gemmlowp_context->max_num_threads());
+      std::min(thread_count, cpu_backend_context->max_num_threads());
 
-  if (thread_count == 1) {
+  if (capped_thread_count == 1) {
     MeanImpl(op_params, input_shape, input_data, input_zero_point, input_scale,
              output_shape, output_data, output_zero_point, output_scale, 0,
              output_depth);
   } else {
     // Instead parrallel for batch, we loop for the output_depth since batch
     // is typical 1.
-    std::vector<gemmlowp::Task*> tasks(capped_thread_count);
+    std::vector<MeanWorkerTask> tasks;
+    // TODO(b/131746020) don't create new heap allocations every time.
+    // At least we make it a single heap allocation by using reserve().
+    tasks.reserve(capped_thread_count);
     int depth_start = 0;
     for (int i = 0; i < capped_thread_count; ++i) {
       // Try to distribute the tasks as even as possible.
       int depth_end = depth_start +
                       (output_depth - depth_start) / (capped_thread_count - i);
-      tasks[i] = new MeanWorkerTask(op_params, input_shape, input_data,
-                                    input_zero_point, input_scale, output_shape,
-                                    output_data, output_zero_point,
-                                    output_scale, depth_start, depth_end);
+      tasks.emplace_back(op_params, input_shape, input_data, input_zero_point,
+                         input_scale, output_shape, output_data,
+                         output_zero_point, output_scale, depth_start,
+                         depth_end);
       depth_start = depth_end;
     }
-    gemmlowp_context->workers_pool()->Execute(tasks);
+    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                    cpu_backend_context);
   }
 }
 
@@ -1976,7 +1874,7 @@
                  const float* filter_data, const RuntimeShape& bias_shape,
                  const float* bias_data, const RuntimeShape& output_shape,
                  float* output_data, const RuntimeShape& im2col_shape,
-                 float* im2col_data) {
+                 float* im2col_data, CpuBackendContext* cpu_backend_context) {
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
@@ -1991,7 +1889,7 @@
   (void)im2col_shape;
   gemmlowp::ScopedProfilingLabel label("Conv");
 
-  // NB: static_cast<float>(0x00000000h) == 0.0f
+  // NB: the float 0.0f value is represented by all zero bytes.
   const uint8 float_zero_byte = 0x00;
   const float* gemm_input_data = nullptr;
   const RuntimeShape* gemm_input_shape = nullptr;
@@ -2020,6 +1918,12 @@
     gemm_input_shape = &input_shape;
   }
 
+  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+  int n = output_shape.Dims(3);
+  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
+
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
   // The following code computes matrix multiplication c = a * transponse(b)
   // with CBLAS, where:
   // * `a` is a matrix with dimensions (m, k).
@@ -2029,12 +1933,6 @@
   const float* a = gemm_input_data;
   const float* b = filter_data;
   float* c = output_data;
-  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
-  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
-  int n = output_shape.Dims(3);
-  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
-
-#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
   // The stride of matrix a, b and c respectively.
   int stride_a = k;
   int stride_b = k;
@@ -2042,36 +1940,32 @@
 
   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
               stride_a, b, stride_b, 0.0f, c, stride_c);
-#else
-  // When an optimized CBLAS implementation is not available, fall back
-  // to using Eigen.
-  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
-      Matrix;
-  typedef Eigen::Map<Matrix> MatrixRef;
-  typedef Eigen::Map<const Matrix> ConstMatrixRef;
-
-  MatrixRef matrix_c(c, m, n);
-  ConstMatrixRef matrix_a(a, m, k);
-  ConstMatrixRef matrix_b(b, n, k);
-
-  // The following special casing for when a or b is a vector is required
-  // as Eigen seem to fail to make this optimization on its own.
-  if (n == 1) {
-    gemmlowp::ScopedProfilingLabel label("GEMV");
-    matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose();
-  } else if (m == 1) {
-    gemmlowp::ScopedProfilingLabel label("GEMV");
-    matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose();
-  } else {
-    gemmlowp::ScopedProfilingLabel label("GEMM");
-    matrix_c.noalias() = matrix_a * matrix_b.transpose();
-  }
-
-#endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
-
   optimized_ops::AddBiasAndEvalActivationFunction(
       output_activation_min, output_activation_max, bias_shape, bias_data,
       output_shape, output_data);
+#else
+  // When an optimized CBLAS implementation is not available, fall back
+  // to using cpu_backend_gemm.
+  cpu_backend_gemm::MatrixParams<float> lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.rows = n;
+  lhs_params.cols = k;
+  cpu_backend_gemm::MatrixParams<float> rhs_params;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.rows = k;
+  rhs_params.cols = m;
+  cpu_backend_gemm::MatrixParams<float> dst_params;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.rows = n;
+  dst_params.cols = m;
+  cpu_backend_gemm::GemmParams<float, float> gemm_params;
+  gemm_params.bias = bias_data;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
+#endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
 }
 
 inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
@@ -2159,7 +2053,7 @@
                  const uint8* filter_data, const RuntimeShape& bias_shape,
                  const int32* bias_data, const RuntimeShape& output_shape,
                  uint8* output_data, const RuntimeShape& im2col_shape,
-                 uint8* im2col_data, gemmlowp::GemmContext* gemmlowp_context) {
+                 uint8* im2col_data, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("Conv/8bit");
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -2241,23 +2135,34 @@
         *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape,
         filter_data, filter_offset, bias_shape, bias_data, output_offset,
         output_multiplier, output_shift, output_activation_min,
-        output_activation_max, output_shape, output_data, gemmlowp_context);
+        output_activation_max, output_shape, output_data, cpu_backend_context);
   }
 #endif
 
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, filter_rows, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      gemm_input_data, gemm_input_rows, gemm_input_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, output_cols);
-  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
-      filter_offset, input_offset, output_pipeline);
+  cpu_backend_gemm::MatrixParams<uint8> lhs_params;
+  lhs_params.rows = filter_rows;
+  lhs_params.cols = filter_cols;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.zero_point = -filter_offset;
+  cpu_backend_gemm::MatrixParams<uint8> rhs_params;
+  rhs_params.rows = gemm_input_rows;
+  rhs_params.cols = gemm_input_cols;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.zero_point = -input_offset;
+  cpu_backend_gemm::MatrixParams<uint8> dst_params;
+  dst_params.rows = output_rows;
+  dst_params.cols = output_cols;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.zero_point = output_offset;
+  cpu_backend_gemm::GemmParams<int32, uint8> gemm_params;
+  gemm_params.bias = bias_data;
+  gemm_params.clamp_min = output_activation_min;
+  gemm_params.clamp_max = output_activation_max;
+  gemm_params.multiplier_fixedpoint = output_multiplier;
+  gemm_params.multiplier_exponent = output_shift;
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
+                         dst_params, output_data, gemm_params,
+                         cpu_backend_context);
 }
 
 template <typename T>
@@ -3356,7 +3261,8 @@
     const RuntimeShape& unextended_output_state_shape, float* output_state_data,
     const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
     const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
-    const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
+    const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data,
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("LstmCell");
   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
@@ -3430,7 +3336,7 @@
   fc_params.float_activation_max = std::numeric_limits<float>::max();
   FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
                  weights_data, bias_shape, bias_data, activ_temp_shape,
-                 activ_temp_data);
+                 activ_temp_data, cpu_backend_context);
 
   // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
   // operations.
@@ -3463,9 +3369,6 @@
       output_state_map.tanh();
 }
 
-// Quantized LSTM cell. Currently just a copy of the reference impl in
-// reference_ops.h. See the big function comment there, not replicating it
-// here.
 template <int StateIntegerBits>
 inline void LstmCell(
     const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
@@ -3483,7 +3386,7 @@
     const RuntimeShape& unextended_concat_temp_shape,
     uint8* concat_temp_data_uint8,
     const RuntimeShape& unextended_activ_temp_shape,
-    int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) {
+    int16* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label(
       "LstmCell/quantized (8bit external, 16bit internal)");
   int32 weights_zero_point = params.weights_zero_point;
@@ -3569,28 +3472,28 @@
   }
 #endif
   if (!gemm_already_performed) {
-    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor>
-        weights_matrix(weights_data_uint8, fc_output_depth, fc_accum_depth);
-    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-        concat_temp_data_uint8, fc_accum_depth, fc_batches);
-    gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
-        activ_temp_data_int16, fc_output_depth, fc_batches);
-    typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-        ColVectorMap;
-    ColVectorMap bias_vector(bias_data_int32, fc_output_depth);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
-    scale_stage.result_offset_after_shift = 0;
-    scale_stage.result_fixedpoint_multiplier = accum_multiplier;
-    scale_stage.result_exponent = accum_shift;
-    gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
-    auto output_pipeline = std::make_tuple(bias_addition_stage, scale_stage,
-                                           saturating_cast_int16_stage);
-    gemmlowp::GemmWithOutputPipeline<
-        uint8, int16, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-        gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
-        -weights_zero_point, -128, output_pipeline);
+    cpu_backend_gemm::MatrixParams<uint8> lhs_params;
+    lhs_params.rows = fc_output_depth;
+    lhs_params.cols = fc_accum_depth;
+    lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+    lhs_params.zero_point = weights_zero_point;
+    cpu_backend_gemm::MatrixParams<uint8> rhs_params;
+    rhs_params.rows = fc_accum_depth;
+    rhs_params.cols = fc_batches;
+    rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+    rhs_params.zero_point = 128;
+    cpu_backend_gemm::MatrixParams<int16> dst_params;
+    dst_params.rows = fc_output_depth;
+    dst_params.cols = fc_batches;
+    dst_params.order = cpu_backend_gemm::Order::kColMajor;
+    dst_params.zero_point = 0;
+    cpu_backend_gemm::GemmParams<int32, int16> gemm_params;
+    gemm_params.bias = bias_data_int32;
+    gemm_params.multiplier_fixedpoint = accum_multiplier;
+    gemm_params.multiplier_exponent = accum_shift;
+    cpu_backend_gemm::Gemm(
+        lhs_params, weights_data_uint8, rhs_params, concat_temp_data_uint8,
+        dst_params, activ_temp_data_int16, gemm_params, cpu_backend_context);
   }
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
@@ -3819,10 +3722,11 @@
   }
 }
 
-inline void AveragePool(const PoolParams& params,
-                        const RuntimeShape& input_shape,
-                        const uint8* input_data,
-                        const RuntimeShape& output_shape, uint8* output_data) {
+inline void AveragePool16(const PoolParams& params,
+                          const RuntimeShape& input_shape,
+                          const uint8* input_data,
+                          const RuntimeShape& output_shape,
+                          uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
 
   // Here, and in other pooling ops, in order to maintain locality of reference,
@@ -3950,6 +3854,155 @@
   }
 }
 
+inline void AveragePool32(const PoolParams& params,
+                          const RuntimeShape& input_shape,
+                          const uint8* input_data,
+                          const RuntimeShape& output_shape,
+                          uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+
+  // uint8 average pooling whose per-channel window sums are kept in uint32.
+  // As in other pooling ops, to maintain locality of reference, to minimize
+  // some recalculations, and to load into NEON vector registers, the inner
+  // loop runs down the depth. Depths can be large, so to bound the temporary
+  // storage the work is split into depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  uint32 acc[kPoolingAccTrancheSize];  // Per-channel sums for one window.
+  for (int batch = 0; batch < batches; ++batch) {
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          const int filter_count =  // Window elements inside the input.
+              (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));  // Zero the sums.
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
+#ifdef USE_NEON
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint16x4_t acc_reg[4];
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg)));
+                acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg)));
+                acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg)));
+                acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg)));
+                for (int i = 0; i < 4; i++) {
+                  vst1q_u32(
+                      acc + channel + 4 * i,
+                      vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
+                }
+              }
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint16x4_t acc_reg[2];
+                uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr));
+                input_channel_ptr += 8;
+                acc_reg[0] = vget_low_u16(input_reg);
+                acc_reg[1] = vget_high_u16(input_reg);
+                for (int i = 0; i < 2; i++) {
+                  vst1q_u32(
+                      acc + channel + 4 * i,
+                      vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
+                }
+              }
+#endif
+              for (; channel < tranche_depth; ++channel) {  // Scalar tail.
+                acc[channel] += *input_channel_ptr++;
+              }
+              input_row_ptr += depth;  // Advance to the next x position.
+            }
+          }
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
+#ifdef USE_NEON
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                               \
+  if (filter_count == FILTER_COUNT) {                                   \
+    for (; channel <= tranche_depth - 8; channel += 8) {                \
+      uint16 buf[8];                                                    \
+      for (int i = 0; i < 8; i++) {                                     \
+        buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT;  \
+      }                                                                 \
+      uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));                      \
+      buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
+      buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
+      vst1_u8(output_ptr + channel, buf8);                              \
+    }                                                                   \
+  }
+          AVGPOOL_DIVIDING_BY(9)  // Literal divisor for 9-element windows.
+          AVGPOOL_DIVIDING_BY(15)  // Literal divisor for 15-element windows.
+#undef AVGPOOL_DIVIDING_BY
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint16 buf[8];
+            for (int i = 0; i < 8; i++) {
+              buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+            }
+            uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+            buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+            buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, buf8);
+          }
+#endif
+          for (; channel < tranche_depth; ++channel) {  // Scalar tail.
+            uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+            a = std::max<uint16>(a, params.quantized_activation_min);
+            a = std::min<uint16>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const uint8* input_data,
+                        const RuntimeShape& output_shape, uint8* output_data) {
+  if (params.filter_height * params.filter_width <= 16 * 16) {
+    AveragePool16(params, input_shape, input_data, output_shape, output_data);
+  } else {  // Pooling window holds more than 16 * 16 elements.
+    AveragePool32(params, input_shape, input_data, output_shape, output_data);
+  }
+}
+
 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                     const float* input_data, const RuntimeShape& output_shape,
                     float* output_data) {
@@ -5971,19 +6024,19 @@
   // We front-pad the begin and size vectors.
   const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
   const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
-                         ? ext_shape.Dims(0) - start_b
+                         ? ext_shape.Dims(0)
                          : start_b + op_params.size[0];
   const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
   const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
-                         ? ext_shape.Dims(1) - start_h
+                         ? ext_shape.Dims(1)
                          : start_h + op_params.size[size_count - 3];
   const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
   const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
-                         ? ext_shape.Dims(2) - start_w
+                         ? ext_shape.Dims(2)
                          : start_w + op_params.size[size_count - 2];
   const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
   const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
-                         ? ext_shape.Dims(3) - start_d
+                         ? ext_shape.Dims(3)
                          : start_d + op_params.size[size_count - 1];
 
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
@@ -6170,7 +6223,8 @@
     const ConvParams& params, const RuntimeShape& input_shape,
     const float* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
     const float* hwoi_ordered_filter_data, const RuntimeShape& output_shape,
-    float* output_data, const RuntimeShape& col2im_shape, float* col2im_data) {
+    float* output_data, const RuntimeShape& col2im_shape, float* col2im_data,
+    CpuBackendContext* cpu_backend_context) {
   gemmlowp::ScopedProfilingLabel label("TransposeConvV2");
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4);
@@ -6203,21 +6257,25 @@
   const int hwoi_ordered_filter_total_size =
       filter_height * filter_width * output_depth;
 
-  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
-      Matrix;
-  typedef Eigen::Map<Matrix> MatrixRef;
-  typedef Eigen::Map<const Matrix> ConstMatrixRef;
-  ConstMatrixRef hwoi_ordered_filter_matrix_map(
-      hwoi_ordered_filter_data, hwoi_ordered_filter_total_size, input_depth);
+  cpu_backend_gemm::MatrixParams<float> lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.rows = hwoi_ordered_filter_total_size;
+  lhs_params.cols = input_depth;
   float* output_data_p = output_data;
   tensor_utils::ZeroVector(output_data, output_offset * batch_size);
   for (int i = 0; i < batch_size; ++i) {
-    ConstMatrixRef input_matrix_map(input_data + input_offset * i,
-                                    input_image_size, input_depth);
-    MatrixRef output_matrix_map(col2im_data, input_image_size,
-                                hwoi_ordered_filter_total_size);
-    Gemm(input_matrix_map, hwoi_ordered_filter_matrix_map.transpose(),
-         &output_matrix_map);
+    cpu_backend_gemm::MatrixParams<float> rhs_params;
+    rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+    rhs_params.rows = input_depth;
+    rhs_params.cols = input_image_size;
+    cpu_backend_gemm::MatrixParams<float> dst_params;
+    dst_params.order = cpu_backend_gemm::Order::kColMajor;
+    dst_params.rows = hwoi_ordered_filter_total_size;
+    dst_params.cols = input_image_size;
+    cpu_backend_gemm::GemmParams<float, float> gemm_params;
+    cpu_backend_gemm::Gemm(lhs_params, hwoi_ordered_filter_data, rhs_params,
+                           input_data + input_offset * i, dst_params,
+                           col2im_data, gemm_params, cpu_backend_context);
 
     Col2im(col2im_data, output_depth, output_height, output_width,
            filter_height, filter_width, padding_top, padding_left,
@@ -6227,29 +6285,6 @@
   }
 }
 
-// TODO(renjieliu): Investigate whether we need to keep this.
-inline void TransposeConv(
-    const ConvParams& params, const RuntimeShape& input_shape,
-    const float* input_data, const RuntimeShape& filter_shape,
-    const float* filter_data, const RuntimeShape& output_shape,
-    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
-  gemmlowp::ScopedProfilingLabel label("TransposeConv");
-  // Note we could use transposed weights with forward conv for unstrided
-  // cases. But we are already getting good performance with this code as-is.
-  TFLITE_DCHECK(im2col_data);
-  TransposeIm2col(params, 0, input_shape, input_data, filter_shape,
-                  output_shape, im2col_data);
-
-  const auto im2col_matrix_map =
-      MapAsMatrixWithLastDimAsRows(im2col_data, im2col_shape);
-  const auto filter_matrix_map =
-      MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape);
-  auto output_matrix_map =
-      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
-
-  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
-}
-
 // Integer-only version of ResizeNearestNeighbor. Since scales are represented
 // in fixed-point and thus approximated, |in_x| or |in_y| may differ from the
 // reference version. Debug checks are in place to test if this occurs.
diff --git a/tensorflow/lite/kernels/internal/reference/conv.h b/tensorflow/lite/kernels/internal/reference/conv.h
index 2185389..d23d97c 100644
--- a/tensorflow/lite/kernels/internal/reference/conv.h
+++ b/tensorflow/lite/kernels/internal/reference/conv.h
@@ -103,11 +103,10 @@
                  const uint8* filter_data, const RuntimeShape& bias_shape,
                  const int32* bias_data, const RuntimeShape& output_shape,
                  uint8* output_data, const RuntimeShape& im2col_shape,
-                 uint8* im2col_data, void* gemmlowp_context) {
-  (void)gemmlowp_context;  // only used in optimized code.
+                 uint8* im2col_data, void* cpu_backend_context) {
+  (void)cpu_backend_context;  // only used in optimized code.
   (void)im2col_data;   // only used in optimized code.
   (void)im2col_shape;  // only used in optimized code.
-  (void)gemmlowp_context;  // only used in optimized code.
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
diff --git a/tensorflow/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h
index 705adf8..51c1def 100644
--- a/tensorflow/lite/kernels/internal/reference/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -15,7 +15,6 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 
-#include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/round.h"
@@ -67,8 +66,7 @@
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, void* gemmlowp_context) {
-  (void)gemmlowp_context;  // only used in optimized code.
+    uint8* output_data) {
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
   const int32 output_offset = params.output_offset;
@@ -116,8 +114,7 @@
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, void* gemmlowp_context) {
-  (void)gemmlowp_context;  // only used in optimized code.
+    int16* output_data) {
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
   const int32 output_offset = params.output_offset;
@@ -170,9 +167,7 @@
     const uint8* input_data, const RuntimeShape& weights_shape,
     const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, uint8* shuffled_input_workspace_data,
-    void* gemmlowp_context) {
-  (void)gemmlowp_context;  // only used in optimized code.
+    int16* output_data, uint8* shuffled_input_workspace_data) {
   const int32 output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
   const int32 output_activation_min = params.quantized_activation_min;
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
index a694ba2..e3138e8 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -16,7 +16,8 @@
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
 
 #include <limits>
-#include "public/gemmlowp.h"
+
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
index b424a3e..737e9d2 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -15,7 +15,7 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
 
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index 301994f..6431384 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -25,8 +25,7 @@
     const int8_t* input_data, const RuntimeShape& filter_shape,
     const int8_t* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    int8_t* output_data, void* gemmlowp_context) {
-  (void)gemmlowp_context;  // only used in optimized code.
+    int8_t* output_data) {
   const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
   const int32 output_offset = params.output_offset;
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
index 5e33d08..dad17fb 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -15,7 +15,8 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
 
-#include "public/gemmlowp.h"
+#include "fixedpoint/fixedpoint.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
index 081928b..cc70438 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -16,6 +16,8 @@
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
 
 #include <limits>
+
+#include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index a782821..0c399ba 100644
--- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -18,6 +18,7 @@
 #include <stdint.h>
 #include <sys/types.h>
 
+#include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/legacy_types.h"
 #include "tensorflow/lite/kernels/internal/reference/conv.h"
@@ -420,6 +421,26 @@
                  output_data, output_dims);
 }
 
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext*) {  // Context is ignored.
+  FullyConnected(params, input_shape, input_data, filter_shape, filter_data,
+                 bias_shape, bias_data, output_shape, output_data);
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int16* output_data, gemmlowp::GemmContext*) {  // Context is ignored.
+  FullyConnected(params, input_shape, input_data, filter_shape, filter_data,
+                 bias_shape, bias_data, output_shape, output_data);
+}
+
 inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_offset, const uint8* filter_data,
                            const Dims<4>& filter_dims, int32 filter_offset,
@@ -471,6 +492,19 @@
 }
 
 inline void ShuffledFullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& weights_shape,
+    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int16* output_data, uint8* shuffled_input_workspace_data,
+    gemmlowp::GemmContext*) {  // Unnamed: the context is not used.
+  ShuffledFullyConnected(params, input_shape, input_data, weights_shape,
+                         shuffled_weights_data, bias_shape, bias_data,
+                         output_shape, output_data,
+                         shuffled_input_workspace_data);
+}
+
+inline void ShuffledFullyConnected(
     const uint8* input_data, const Dims<4>& input_dims,
     const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
     const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
diff --git a/tensorflow/lite/kernels/internal/reference/pooling.h b/tensorflow/lite/kernels/internal/reference/pooling.h
index 847fac7..2cb2347 100644
--- a/tensorflow/lite/kernels/internal/reference/pooling.h
+++ b/tensorflow/lite/kernels/internal/reference/pooling.h
@@ -15,7 +15,6 @@
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
 
-#include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/round.h"
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index 5886740..94a4c35 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -27,7 +27,7 @@
 #include <type_traits>
 
 #include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
+#include "profiling/instrumentation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -318,6 +318,34 @@
   }
 }
 
+// Quantized LeakyRelu: values at or above params.input_offset are copied
+// through; the rest are scaled by alpha, requantized and clamped to T.
+template <typename T>
+inline void QuantizeLeakyRelu(const LeakyReluParams& params, T q_alpha,
+                              const RuntimeShape& input_shape,
+                              const T* input_data,
+                              const RuntimeShape& output_shape,
+                              T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  static const int32 quantized_min = std::numeric_limits<T>::min();
+  static const int32 quantized_max = std::numeric_limits<T>::max();
+  const int32 alpha_value = q_alpha - params.alpha_offset;  // per call
+  for (int i = 0; i < flat_size; ++i) {
+    const int32 input_value = input_data[i] - params.input_offset;
+    if (input_value >= 0) {
+      output_data[i] = input_data[i];
+      continue;
+    }
+    const int32 rescaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+        input_value * alpha_value, params.output_multiplier,
+        params.output_shift);
+    const int32 unclamped_output = params.output_offset + rescaled;
+    output_data[i] = static_cast<T>(  // T, not uint8: T may be signed.
+        std::min(quantized_max, std::max(quantized_min, unclamped_output)));
+  }
+}
+
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const float* input_data,
@@ -1886,23 +1914,25 @@
 // aiming for 16-bit fixed-point quantization of these internal nodes here.
 //
 template <int StateIntegerBits>
-inline void LstmCell(
-    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
-    const uint8* input_data_uint8,
-    const RuntimeShape& unextended_prev_activ_shape,
-    const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
-    const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
-    const int32* bias_data_int32,
-    const RuntimeShape& unextended_prev_state_shape,
-    const int16* prev_state_data_int16,
-    const RuntimeShape& unextended_output_state_shape,
-    int16* output_state_data_int16,
-    const RuntimeShape& unextended_output_activ_shape,
-    uint8* output_activ_data_uint8,
-    const RuntimeShape& unextended_concat_temp_shape,
-    uint8* concat_temp_data_uint8,
-    const RuntimeShape& unextended_activ_temp_shape,
-    int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) {
+inline void LstmCell(const LstmCellParams& params,
+                     const RuntimeShape& unextended_input_shape,
+                     const uint8* input_data_uint8,
+                     const RuntimeShape& unextended_prev_activ_shape,
+                     const uint8* prev_activ_data_uint8,
+                     const RuntimeShape& weights_shape,
+                     const uint8* weights_data_uint8,
+                     const RuntimeShape& unextended_bias_shape,
+                     const int32* bias_data_int32,
+                     const RuntimeShape& unextended_prev_state_shape,
+                     const int16* prev_state_data_int16,
+                     const RuntimeShape& unextended_output_state_shape,
+                     int16* output_state_data_int16,
+                     const RuntimeShape& unextended_output_activ_shape,
+                     uint8* output_activ_data_uint8,
+                     const RuntimeShape& unextended_concat_temp_shape,
+                     uint8* concat_temp_data_uint8,
+                     const RuntimeShape& unextended_activ_temp_shape,
+                     int16* activ_temp_data_int16, void* gemmlowp_context) {
   (void)gemmlowp_context;  // only used in optimized code.
   int32 weights_zero_point = params.weights_zero_point;
   int32 accum_multiplier = params.accum_multiplier;
@@ -2591,6 +2621,29 @@
   }
 }
 
+// Rounds to the nearest integer value; exact halves go to the even neighbor.
+inline float RoundToNearest(float value) {
+  const float lower = std::floor(value);
+  const float fraction = value - lower;
+  if (fraction < 0.5f) return lower;
+  // The int cast below is only evaluated for exact halves.
+  const bool tie_to_lower =
+      (fraction == 0.5f) && (static_cast<int>(lower) % 2 == 0);
+  return tie_to_lower ? lower : lower + 1.0f;
+}
+
+// Element-wise rounding of a float tensor. Ties go to the nearest even
+// value (bankers rounding), which matches the behavior of TensorFlow's
+// tf.round. cfenv (for fesetround) is not yet supported universally on
+// Android, so the RoundToNearest helper is used as a work around.
+inline void Round(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
+  const int num_elements = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < num_elements; ++idx) {
+    output_data[idx] = RoundToNearest(input_data[idx]);
+  }
+}
+
 template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
@@ -3051,19 +3104,19 @@
   // We front-pad the begin and size vectors.
   const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
   const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
-                         ? ext_shape.Dims(0) - start_b
+                         ? ext_shape.Dims(0)
                          : start_b + op_params.size[0];
   const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
   const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
-                         ? ext_shape.Dims(1) - start_h
+                         ? ext_shape.Dims(1)
                          : start_h + op_params.size[size_count - 3];
   const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
   const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
-                         ? ext_shape.Dims(2) - start_w
+                         ? ext_shape.Dims(2)
                          : start_w + op_params.size[size_count - 2];
   const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
   const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
-                         ? ext_shape.Dims(3) - start_d
+                         ? ext_shape.Dims(3)
                          : start_d + op_params.size[size_count - 1];
 
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h
index f77fae2..8ee95d4 100644
--- a/tensorflow/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -67,6 +67,11 @@
 }
 
 template <>
+inline const TfLiteFloat16* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.f16 : nullptr;
+}
+
+template <>
 inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.uint8 : nullptr;
 }
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 102525b..4ae704c 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -1022,6 +1022,11 @@
 
 struct LeakyReluParams {
   float alpha;
+  int32 input_offset;  // QuantizeLeakyRelu subtracts this from each input.
+  int32 alpha_offset;  // QuantizeLeakyRelu subtracts this from q_alpha.
+  int32 output_offset;  // Added to the rescaled negative-input result.
+  int32 output_multiplier;  // Requantization multiplier (negative inputs).
+  int output_shift;  // Requantization shift (negative inputs).
 };
 
 template <typename P>
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index 7ff61ac..7f5ab19 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -110,11 +110,14 @@
                                               TfLiteTensor* output,
                                               double* multiplier) {
   const double input_product_scale = input->params.scale * filter->params.scale;
-  const double bias_scale = bias->params.scale;
   // TODO(ahentz): The following conditions must be guaranteed by the training
   // pipeline.
-  TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <=
-                              1e-6 * std::min(input_product_scale, bias_scale));
+  if (bias) {
+    const double bias_scale = bias->params.scale;
+    TF_LITE_ENSURE(context,
+                   std::abs(input_product_scale - bias_scale) <=
+                       1e-6 * std::min(input_product_scale, bias_scale));
+  }
   return GetQuantizedConvolutionMultipler(context, input, filter, output,
                                           multiplier);
 }
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 6e58c2a..04d559a 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -468,6 +468,185 @@
   TfLiteTensorFree(&output);
 }
 
+TEST_F(KernelUtilTest, CheckAndPopulateUint8) {
+  // Create input: uint8 tensor of shape [2], scale 1, zero point 5.
+  TfLiteTensor input;
+  input.type = kTfLiteUInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {1, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 1;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter: 4-D uint8 tensor [3, 4, 5, 6] with affine scale 2^-31.
+  TfLiteTensor filter;
+  filter.type = kTfLiteUInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {4.6566129e-10, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(1);
+  int32_t two_pow_neg_31 = 0x30000000;  // 2^-31 so shift = -30.
+  filter_params->scale->data[0] = *reinterpret_cast<float*>(&two_pow_neg_31);
+  filter_params->zero_point = TfLiteIntArrayCreate(1);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create bias: int32 tensor whose scale matches input scale * filter scale.
+  TfLiteTensor bias;
+  bias.type = kTfLiteInt32;
+  bias.allocation_type = kTfLiteArenaRw;
+  bias.dims = TfLiteIntArrayCreate(4);
+  TfLiteQuantizationParams bias_quant = {4.6566129e-10, 9};
+  bias.params = bias_quant;
+  bias.quantization.type = kTfLiteAffineQuantization;
+  auto* bias_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  bias_params->scale = TfLiteFloatArrayCreate(1);
+  bias_params->scale->data[0] = 4.6566129e-10;  // 2^-31
+  bias_params->zero_point = TfLiteIntArrayCreate(1);
+  bias_params->zero_point->data[0] = 11;
+  bias.quantization.params = reinterpret_cast<void*>(bias_params);
+
+  // Create output: uint8 tensor without dims, scale 1, zero point -128.
+  TfLiteTensor output;
+  output.type = kTfLiteUInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {1, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 1;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters (the per-channel vectors hold one entry each).
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(1);
+  std::vector<int> per_channel_shift(1);
+
+  // Call and verify per-channel results: multiplier 2^30, shift -30.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, &bias, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier, ::testing::ElementsAre(1073741824));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-30));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&bias);
+  TfLiteTensorFree(&output);
+}
+
+TEST_F(KernelUtilTest, CheckAndPopulateWithoutBias) {
+  // Create input: uint8 tensor of shape [2], scale 1, zero point 5.
+  TfLiteTensor input;
+  input.type = kTfLiteUInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {1, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 1;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter: 4-D uint8 tensor [3, 4, 5, 6] with affine scale 2^-31.
+  TfLiteTensor filter;
+  filter.type = kTfLiteUInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {4.6566129e-10, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(1);
+  int32_t two_pow_neg_31 = 0x30000000;  // 2^-31 so shift = -30.
+  filter_params->scale->data[0] = *reinterpret_cast<float*>(&two_pow_neg_31);
+  filter_params->zero_point = TfLiteIntArrayCreate(1);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create output: uint8 tensor without dims, scale 1, zero point -128.
+  TfLiteTensor output;
+  output.type = kTfLiteUInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {1, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 1;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters (the per-channel vectors hold one entry each).
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(1);
+  std::vector<int> per_channel_shift(1);
+
+  // Call with a null bias; expect per-channel multiplier 2^30, shift -30.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, nullptr, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier, ::testing::ElementsAre(1073741824));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-30));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&output);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index f356288..4dd98e0 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -23,7 +23,7 @@
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
-#include "tensorflow/lite/kernels/gemmlowp_support.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
@@ -125,6 +125,15 @@
   TF_LITE_ENSURE(context, params->cell_clip >= 0);
   TF_LITE_ENSURE(context, params->proj_clip >= 0);
 
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+  TF_LITE_ENSURE(context, (input_to_forget_weights->type == kTfLiteFloat32) ||
+                              (input_to_forget_weights->type == kTfLiteUInt8) ||
+                              (input_to_forget_weights->type == kTfLiteInt8));
+
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   const bool use_cifg = (input_to_input_weights == nullptr);
@@ -132,19 +141,17 @@
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->type,
+                      input_to_forget_weights->type);
   }
 
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
   const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, kInputToCellWeightsTensor);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->type,
+                    input_to_forget_weights->type);
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
@@ -154,6 +161,8 @@
                       n_cell);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
                       n_output);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->type,
+                      input_to_forget_weights->type);
   }
 
   const TfLiteTensor* recurrent_to_forget_weights =
@@ -163,6 +172,8 @@
                     n_cell);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
                     n_output);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->type,
+                    input_to_forget_weights->type);
 
   const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, kRecurrentToCellWeightsTensor);
@@ -170,6 +181,8 @@
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
                     n_output);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->type,
+                    input_to_forget_weights->type);
 
   // We make sure the input-gate's parameters are either both present (regular
   // LSTM) or not at all (CIFG-LSTM).
@@ -185,6 +198,8 @@
   if (cell_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->type,
+                      input_to_forget_weights->type);
   }
 
   const TfLiteTensor* cell_to_forget_weights =
@@ -192,6 +207,8 @@
   if (cell_to_forget_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->type,
+                      input_to_forget_weights->type);
   }
 
   const TfLiteTensor* cell_to_output_weights =
@@ -199,6 +216,8 @@
   if (cell_to_output_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->type,
+                      input_to_forget_weights->type);
   }
 
   // Making sure the peephole weights are there all or none.
@@ -219,21 +238,25 @@
   } else {
     TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->type, kTfLiteFloat32);
   }
 
   const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->type, kTfLiteFloat32);
 
   const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, cell_bias->type, kTfLiteFloat32);
 
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->type, kTfLiteFloat32);
 
   const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
@@ -241,6 +264,8 @@
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+    TF_LITE_ENSURE_EQ(context, projection_weights->type,
+                      input_to_forget_weights->type);
   }
 
   const TfLiteTensor* projection_bias =
@@ -248,6 +273,7 @@
   if (projection_bias != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_bias->type, kTfLiteFloat32);
   }
 
   // Making sure the projection tensors are consistent:
@@ -269,6 +295,8 @@
       TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->size, 1);
       TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->data[0],
                         n_cell);
+      TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->type,
+                        kTfLiteFloat32);
     }
 
     const TfLiteTensor* forget_layer_norm_coefficients =
@@ -277,6 +305,8 @@
     TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0],
                       n_cell);
+    TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->type,
+                      kTfLiteFloat32);
 
     const TfLiteTensor* cell_layer_norm_coefficients =
         GetInput(context, node, kCellLayerNormCoefficientsTensor);
@@ -284,6 +314,8 @@
     TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0],
                       n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->type,
+                      kTfLiteFloat32);
 
     const TfLiteTensor* output_layer_norm_coefficients =
         GetInput(context, node, kOutputLayerNormCoefficientsTensor);
@@ -291,6 +323,8 @@
     TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0],
                       n_cell);
+    TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->type,
+                      kTfLiteFloat32);
   }
 
   return kTfLiteOk;
@@ -571,7 +605,6 @@
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  // TODO(mirkov): add a check that weights are all uint8s or all floats.
   switch (input_to_output_weights->type) {
     case kTfLiteFloat32: {
       return lstm_eval::EvalFloat(
@@ -762,7 +795,8 @@
         GetTensorShape(state_out), GetTensorData<float>(state_out),
         GetTensorShape(activation_out), GetTensorData<float>(activation_out),
         GetTensorShape(concat_temp), GetTensorData<float>(concat_temp),
-        GetTensorShape(activation_temp), GetTensorData<float>(activation_temp));
+        GetTensorShape(activation_temp), GetTensorData<float>(activation_temp),
+        cpu_backend_support::GetFromContext(context));
   } else if (input->type == kTfLiteUInt8 &&
              prev_activation->type == kTfLiteUInt8 &&
              weights->type == kTfLiteUInt8 && bias->type == kTfLiteInt32 &&
@@ -771,8 +805,6 @@
              activation_out->type == kTfLiteUInt8 &&
              concat_temp->type == kTfLiteUInt8 &&
              activation_temp->type == kTfLiteInt16) {
-    gemmlowp::GemmContext* gemmlowp_context =
-        gemmlowp_support::GetFromContext(context);
     int state_scale_log2_rounded;
     if (!CheckedLog2(state_out->params.scale, &state_scale_log2_rounded)) {
       context->ReportError(
@@ -811,7 +843,8 @@
         GetTensorShape(activation_out), GetTensorData<uint8_t>(activation_out),
         GetTensorShape(concat_temp), GetTensorData<uint8_t>(concat_temp),
         GetTensorShape(activation_temp),
-        GetTensorData<int16_t>(activation_temp), gemmlowp_context);
+        GetTensorData<int16_t>(activation_temp),
+        cpu_backend_support::GetFromContext(context));
   } else {
     context->ReportError(context,
                          "Unsupported combination of data types for LstmCell");
@@ -830,7 +863,7 @@
 }  // namespace basic
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  gemmlowp_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
 
   const auto* params = reinterpret_cast<const TfLiteLSTMParams*>(buffer);
   switch (params->kernel_type) {
@@ -844,7 +877,7 @@
   return nullptr;
 }
 void Free(TfLiteContext* context, void* buffer) {
-  gemmlowp_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
 
   delete reinterpret_cast<OpData*>(buffer);
 }
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 816b78c..a518daf 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -16,6 +16,12 @@
 
 #include <cstdint>
 
+#ifdef GEMMLOWP_PROFILING
+#include "profiling/profiler.h"
+#endif
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -41,7 +47,7 @@
 //  - n_batch: size of batch,
 //  - n_cell: number of cells (or units),
 //  - n_input: the input size,
-//  - n_aux_input: the auxilary input size.
+//  - n_aux_input: the auxiliary input size.
 //  - n_output: the output size.
 //  - output_batch_leading_dim: the leading dimension of the output buffer.
 //
@@ -51,7 +57,7 @@
 //   input_to_forget_weights
 //   input_to_cell_weights
 //   input_to_output_weights
-// Auxilary input weights of size 'n_cell * n_aux_input':
+// Auxiliary input weights of size 'n_cell * n_aux_input':
 //   aux_input_to_input_weights        - optional
 //   aux_input_to_forget_weights       - optional
 //   aux_input_to_cell_weights         - optional
@@ -119,6 +125,9 @@
     float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label("LstmStepWithAuxInputFloat");
+#endif
   // Since we have already checked that weights are all there or none, we can
   // check the existence of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
@@ -362,6 +371,28 @@
   }
 }
 
+// Applies `activation_type` element-wise to `input[0..input_size)` and writes
+// the result to `output`. Sigmoid and tanh are evaluated through Eigen array
+// expressions; any other activation is delegated to tensor_utils.
+void ApplyActivationsToVector(float* input, int input_size,
+                              TfLiteFusedActivation activation_type,
+                              float* output) {
+  using ColumnVector = Eigen::Matrix<float, Eigen::Dynamic, 1>;
+  Eigen::Map<ColumnVector> src(input, input_size, 1);
+  Eigen::Map<ColumnVector> dst(output, input_size, 1);
+  if (activation_type == kTfLiteActSigmoid) {
+    dst.array() = src.array().logistic();
+    return;
+  }
+  if (activation_type == kTfLiteActTanh) {
+    dst.array() = src.array().tanh();
+    return;
+  }
+  // Not one of the Eigen-backed activations above.
+  tensor_utils::ApplyActivationToVector(input, input_size, activation_type,
+                                        output);
+}
+
 // Same as above but with quantized weight matrices. In detail:
 // Input of size 'n_batch * n_input':
 //   input_ptr_batch
@@ -372,7 +403,7 @@
 //   input_to_forget_weights
 //   input_to_cell_weights
 //   input_to_input_weights
-// Quantized auxilary input weights of size 'n_cell * n_aux_input':
+// Quantized auxiliary input weights of size 'n_cell * n_aux_input':
 //   aux_input_to_input_weights        - optional
 //   aux_input_to_forget_weights       - optional
 //   aux_input_to_cell_weights         - optional
@@ -473,6 +504,9 @@
     int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr,
     int8_t* quantized_cell_state_ptr, float* output_state_ptr,
     float* cell_state_ptr, float* output_ptr_batch) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label("LstmStepWithAuxInputHybrid");
+#endif
   // Since we have already checked that weights are all there or none, we
   // can check the existence of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
@@ -674,8 +708,8 @@
       tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
                                          input_gate_scratch);
     }
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
+    ApplyActivationsToVector(input_gate_scratch, n_cell * n_batch,
+                             kTfLiteActSigmoid, input_gate_scratch);
   }
 
   // For each batch and cell: update forget gate.
@@ -697,8 +731,8 @@
     tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
                                        forget_gate_scratch);
   }
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
+  ApplyActivationsToVector(forget_gate_scratch, n_cell * n_batch,
+                           kTfLiteActSigmoid, forget_gate_scratch);
 
   // For each batch and cell: update the cell.
   tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
@@ -712,8 +746,8 @@
     tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
                                        cell_scratch);
   }
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        params->activation, cell_scratch);
+  ApplyActivationsToVector(cell_scratch, n_batch * n_cell, params->activation,
+                           cell_scratch);
   if (use_cifg) {
     tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
                              forget_gate_scratch);
@@ -749,10 +783,10 @@
     tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
                                        output_gate_scratch);
   }
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_scratch);
+  ApplyActivationsToVector(output_gate_scratch, n_batch * n_cell,
+                           kTfLiteActSigmoid, output_gate_scratch);
+  ApplyActivationsToVector(cell_state_ptr, n_batch * n_cell, params->activation,
+                           cell_scratch);
   tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
                                          n_batch * n_cell, output_gate_scratch);
 
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index 04c2f18..35a98c5 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -2555,6 +2555,82 @@
                 &layer_norm_lstm);
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
+TEST(LSTMOpModel, InvalidTypeTest) {  // Invalid weight types must die.
+  const int n_batch = 1;
+  const int n_input = 2;
+  const int n_cell = 4;
+  const int n_output = 4;
+  // Building the model with INT32 weights is expected to abort the process.
+  EXPECT_DEATH(LSTMOpModel lstm(
+                   n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/false, /*use_peephole=*/false,
+                   /*use_projection_weights=*/false,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {n_cell, n_input},  // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {n_cell, n_output},  // recurrent_to_input_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight_tensor
+
+                       {0},  // cell_to_input_weight tensor
+                       {0},  // cell_to_forget_weight tensor
+                       {0},  // cell_to_output_weight tensor
+
+                       {n_cell},  // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {0, 0},  // projection_weight tensor
+                       {0},     // projection_bias tensor
+                   },
+                   /*weight_type=*/TensorType_INT32),
+               "");  // Empty regex: any death message is accepted.
+  // Same shapes with COMPLEX64 weights: construction must die as well.
+  EXPECT_DEATH(LSTMOpModel lstm(
+                   n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/false, /*use_peephole=*/false,
+                   /*use_projection_weights=*/false,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {n_cell, n_input},  // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {n_cell, n_output},  // recurrent_to_input_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight_tensor
+
+                       {0},  // cell_to_input_weight tensor
+                       {0},  // cell_to_forget_weight tensor
+                       {0},  // cell_to_output_weight tensor
+
+                       {n_cell},  // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {0, 0},  // projection_weight tensor
+                       {0},     // projection_bias tensor
+                   },
+                   /*weight_type=*/TensorType_COMPLEX64),
+               "");
+}
+#endif  // GTEST_HAS_DEATH_TEST
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/padding.h b/tensorflow/lite/kernels/padding.h
index 1aecf10..1116b1d 100644
--- a/tensorflow/lite/kernels/padding.h
+++ b/tensorflow/lite/kernels/padding.h
@@ -42,31 +42,36 @@
 
 // Matching GetWindowedOutputSize in TensorFlow.
 inline int ComputeOutSize(TfLitePadding padding, int image_size,
-                          int filter_size, int stride) {
+                          int filter_size, int stride, int dilation_rate = 1) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
   switch (padding) {
     case kTfLitePaddingSame:
       return (image_size + stride - 1) / stride;
     case kTfLitePaddingValid:
-      return (image_size + stride - filter_size) / stride;
+      return (image_size + stride - effective_filter_size) / stride;
     default:
       return 0;
   }
 }
 
 inline TfLitePaddingValues ComputePaddingHeightWidth(
-    int stride_height, int stride_width, int dilation_rate, int in_height,
-    int in_width, int filter_height, int filter_width, TfLitePadding padding) {
-  int out_width = ComputeOutSize(padding, in_width, filter_width, stride_width);
-  int out_height =
-      ComputeOutSize(padding, in_height, filter_height, stride_height);
+    int stride_height, int stride_width, int dilation_rate_height,
+    int dilation_rate_width, int in_height, int in_width, int filter_height,
+    int filter_width, TfLitePadding padding, int* out_height, int* out_width) {
+  *out_width = ComputeOutSize(padding, in_width, filter_width, stride_width,
+                              dilation_rate_width);
+  *out_height = ComputeOutSize(padding, in_height, filter_height, stride_height,
+                               dilation_rate_height);
 
   TfLitePaddingValues padding_values;
   int offset = 0;
-  padding_values.height = ComputePaddingWithOffset(
-      stride_height, 1, in_height, filter_height, out_height, &offset);
+  padding_values.height =
+      ComputePaddingWithOffset(stride_height, dilation_rate_height, in_height,
+                               filter_height, *out_height, &offset);
   padding_values.height_offset = offset;
-  padding_values.width = ComputePaddingWithOffset(
-      stride_width, 1, in_width, filter_width, out_width, &offset);
+  padding_values.width =
+      ComputePaddingWithOffset(stride_width, dilation_rate_width, in_width,
+                               filter_width, *out_width, &offset);
   padding_values.width_offset = offset;
   return padding_values;
 }
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index 9a90fa3..71dd349 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -81,24 +81,12 @@
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto compute_out_size = [padding](int image_size, int filter_size,
-                                    int stride) -> int {
-    return padding == kTfLitePaddingSame
-               ? (image_size + stride - 1) / stride
-               : padding == kTfLitePaddingValid
-                     ? (image_size - filter_size + stride) / stride
-                     : 0;
-  };
+  int out_width, out_height;
 
-  int out_width =
-      compute_out_size(width, params->filter_width, params->stride_width);
-  int out_height =
-      compute_out_size(height, params->filter_height, params->stride_height);
-
-  data->padding.height = ComputePadding(params->stride_height, 1, height,
-                                        params->filter_height, out_height);
-  data->padding.width = ComputePadding(params->stride_width, 1, width,
-                                       params->filter_width, out_width);
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width, 1, 1, height, width,
+      params->filter_height, params->filter_width, padding, &out_height,
+      &out_width);
 
   if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     if (pool_type == kAverage || pool_type == kMax) {
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index b46b648..56e4b3a 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -558,8 +558,7 @@
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({44 - 128, 80 - 128, 92 - 128}));
 }
 
-// Send in a white image, expect something other than a white pixel, due to
-// overflow.
+// Send in a white image and expect a white pixel.
 TEST(QuantizedPoolingOpTest, AveragePoolImageSize17) {
   int image_size = 17;
   QuantizedPoolingOpModel m(
@@ -573,10 +572,7 @@
   m.SetInput(input);
   m.Invoke();
 
-  // Ordinarily we would see '255' here. However, the optimized version of
-  // AveragePool uses a uint16 accumulator which causes it to overflow for
-  // images this large.
-  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(28));
+  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(255));
 }
 
 TEST(FloatPoolingOpTest, MaxPool) {
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index f1a3d20..eee45aa 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -19,7 +19,7 @@
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/kernels/gemmlowp_support.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h"
@@ -61,7 +61,7 @@
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  gemmlowp_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
   auto* op_data = new OpData();
@@ -70,7 +70,7 @@
 }
 
 void Free(TfLiteContext* context, void* buffer) {
-  gemmlowp_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -297,15 +297,14 @@
         ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
          (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
       if (op_context.input->type == kTfLiteUInt8) {
-        gemmlowp::GemmContext* gemmlowp_context =
-            gemmlowp_support::GetFromContext(context);
         optimized_ops::Mean(
             op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
             op_context.input->params.zero_point, op_context.input->params.scale,
             GetTensorShape(op_context.output),
             GetTensorData<uint8_t>(op_context.output),
             op_context.output->params.zero_point,
-            op_context.output->params.scale, gemmlowp_context);
+            op_context.output->params.scale,
+            cpu_backend_support::GetFromContext(context));
       } else {
         reference_ops::Mean(op_params, GetTensorShape(input),
                             GetTensorData<float>(input),
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index aa49b2e..d8a6d5d 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/lite/kernels/register.h"
+
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
@@ -95,6 +96,7 @@
 TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
 TfLiteRegistration* Register_CEIL();
+TfLiteRegistration* Register_ROUND();
 TfLiteRegistration* Register_TILE();
 TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SUM();
@@ -210,7 +212,7 @@
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
              Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
@@ -233,7 +235,8 @@
   AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_MUL, Register_MUL());
+  AddBuiltin(BuiltinOperator_MUL, Register_MUL(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION(),
              /* min_version */ 1,
              /* max_version */ 2);
@@ -269,7 +272,9 @@
   AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
+  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
   AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
              /* min_version */ 1,
@@ -320,6 +325,7 @@
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
   AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
+  AddBuiltin(BuiltinOperator_ROUND, Register_ROUND());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
   AddBuiltin(BuiltinOperator_SELECT, Register_SELECT(),
              /* min_version */ 1,
@@ -363,7 +369,9 @@
   AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
   AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
   AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
-  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
+  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV());
   AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
   AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
diff --git a/tensorflow/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
index 7383d03..c9e8aa9 100644
--- a/tensorflow/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -40,9 +40,12 @@
                                 const TfLiteTensor* input,
                                 const TfLiteTensor* size,
                                 TfLiteTensor* output) {
+  const int32* size_data = GetTensorData<int32>(size);
+  // Sanity check, the up/down sampling size should always be positive.
+  TF_LITE_ENSURE(context, size_data[0] > 0);
+  TF_LITE_ENSURE(context, size_data[1] > 0);
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
   output_size->data[0] = input->dims->data[0];
-  const int32* size_data = GetTensorData<int32>(size);
   output_size->data[1] = size_data[0];
   output_size->data[2] = size_data[1];
   output_size->data[3] = input->dims->data[3];
diff --git a/tensorflow/lite/kernels/round.cc b/tensorflow/lite/kernels/round.cc
new file mode 100644
index 0000000..908e355
--- /dev/null
+++ b/tensorflow/lite/kernels/round.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace round {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);   // Check before GetInput.
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);  // Check before GetOutput.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);  // float32 only.
+  output->type = input->type;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);  // Same shape.
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  // Prepare() has already restricted the input to float32 and sized the
+  // output to the input's dims, so both buffers are accessed as float.
+  const TfLiteTensor* src = GetInput(context, node, kInputTensor);
+  TfLiteTensor* dst = GetOutput(context, node, kOutputTensor);
+  optimized_ops::Round(GetTensorShape(src), GetTensorData<float>(src),
+                       GetTensorShape(dst), GetTensorData<float>(dst));
+  return kTfLiteOk;
+}
+}  // namespace round
+
+TfLiteRegistration* Register_ROUND() {
+  static TfLiteRegistration registration = {
+      /*init=*/nullptr, /*free=*/nullptr, round::Prepare, round::Eval};
+  return &registration;  // Same function-local static on every call.
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc
new file mode 100644
index 0000000..37304fb
--- /dev/null
+++ b/tensorflow/lite/kernels/round_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class RoundOpModel : public SingleOpModel {  // Single-op model for ROUND.
+ public:
+  RoundOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(input_type);  // Use the type the test asked for.
+    output_ = AddOutput(TensorType_FLOAT32);  // GetOutput() reads floats.
+    SetBuiltinOp(BuiltinOperator_ROUND, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+  // Index of the input tensor, for use with PopulateTensor().
+  int input() { return input_; }
+  // Contents and shape of the output tensor.
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(RoundOpTest, SingleDim) {
+  RoundOpModel m({6}, TensorType_FLOAT32);
+  m.PopulateTensor<float>(m.input(), {8.5, 0.0, 3.5, 4.2, -3.5, -4.5});
+  m.Invoke();  // Expected values below follow round-half-to-even.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({8, 0, 4, 4, -4, -4}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6}));
+}
+
+TEST(RoundOpTest, MultiDims) {
+  RoundOpModel m({2, 1, 1, 6}, TensorType_FLOAT32);
+  m.PopulateTensor<float>(m.input(),
+                          {0.0001, 8.0001, 0.9999, 9.9999, 0.5, -0.0001,
+                           -8.0001, -0.9999, -9.9999, -0.5, -2.5, 1.5});
+  m.Invoke();  // Ties (0.5, -0.5, -2.5, 1.5) are expected to round to even.
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 8, 1, 10, 0, 0, -8, -1, -10, -0, -2, 2}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 1, 6}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();  // Called before gtest parses the command line.
+  ::testing::InitGoogleTest(&argc, argv);  // May consume gtest flags in argv.
+  return RUN_ALL_TESTS();  // Exit status is the gtest run result.
+}
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index b9b8821..4147802 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -166,6 +166,39 @@
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
 }
 
+TEST(SliceOpTest, BeginNonZeroSizeMinus1Axis1) {
+  SliceOpModel<int32_t, int32_t> slicer(
+      {3, 3, 2, 1}, {4}, {4}, TensorType_INT32, TensorType_INT32);
+  slicer.SetInput({1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9});
+  slicer.SetBegin({1, 1, 0, 0});
+  slicer.SetSize({2, -1, 1, 1});  // -1 on axis 1: from begin to the axis end.
+  slicer.Invoke();
+  EXPECT_THAT(slicer.GetOutputShape(), ElementsAreArray({2, 2, 1, 1}));
+  EXPECT_THAT(slicer.GetOutput(), ElementsAreArray({5, 6, 8, 9}));
+}
+
+TEST(SliceOpTest, BeginNonZeroSizeMinus1Axis2) {
+  SliceOpModel<int32_t, int32_t> slicer(
+      {3, 2, 3, 1}, {4}, {4}, TensorType_INT32, TensorType_INT32);
+  slicer.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  slicer.SetBegin({1, 0, 1, 0});
+  slicer.SetSize({2, 1, -1, 1});  // -1 on axis 2: from begin to the axis end.
+  slicer.Invoke();
+  EXPECT_THAT(slicer.GetOutputShape(), ElementsAreArray({2, 1, 2, 1}));
+  EXPECT_THAT(slicer.GetOutput(), ElementsAreArray({3, 3, 5, 5}));
+}
+
+TEST(SliceOpTest, BeginNonZeroSizeMinus1Axis3) {
+  SliceOpModel<int32_t, int32_t> slicer(
+      {3, 1, 2, 3}, {4}, {4}, TensorType_INT32, TensorType_INT32);
+  slicer.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  slicer.SetBegin({1, 0, 0, 1});
+  slicer.SetSize({2, 1, 1, -1});  // -1 on axis 3: from begin to the axis end.
+  slicer.Invoke();
+  EXPECT_THAT(slicer.GetOutputShape(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(slicer.GetOutput(), ElementsAreArray({3, 3, 5, 5}));
+}
+
 TEST(SliceOpTest, SliceUint8) {
   SliceOpModel<uint8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
                                    TensorType_UINT8);
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 87434ab..44f8aa3 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -20,7 +20,6 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -343,20 +342,74 @@
   std::unique_ptr<OpResolver> resolver_;
 
  private:
-  // TODO(gavinbelson): sync this method with
-  // //tensorflow/lite/kernels/internal/quantization_util.h?l=31
   template <typename T>
   std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
-    // These are required by many quantized operations.
+    int32_t zero_point = 0;
+    float scale = 0;
+    const T qmin = std::numeric_limits<T>::min();
+    const T qmax = std::numeric_limits<T>::max();
+    const float qmin_double = qmin;
+    const float qmax_double = qmax;
+    // 0 should always be a representable value. Let's assume that the initial
+    // min,max range contains 0.
     CHECK_LE(f_min, 0);
     CHECK_GE(f_max, 0);
-    T q_min = std::numeric_limits<T>::min();
-    T q_max = std::numeric_limits<T>::max();
-    float range = q_max - q_min;
-    float scale = (f_max - f_min) / range;
-    int32_t zero_point = std::min(
-        q_max,
-        std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
+    if (f_min == f_max) {
+      // Special case where the min,max range is a point. Should be {0}.
+      CHECK_EQ(f_min, 0);
+      CHECK_EQ(f_max, 0);
+      return {scale, zero_point};
+    }
+
+    // General case.
+    //
+    // First determine the scale.
+    scale = (f_max - f_min) / (qmax_double - qmin_double);
+
+    // Zero-point computation.
+    // First the initial floating-point computation. The zero-point can be
+    // determined from solving an affine equation for any known pair
+    // (real value, corresponding quantized value).
+    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+    // The arithmetic error on the zero point computed from either pair
+    // will be roughly machine_epsilon * (sum of absolute values of terms)
+    // so we want to use the variant that adds the smaller terms.
+    const float zero_point_from_min = qmin_double - f_min / scale;
+    const float zero_point_from_max = qmax_double - f_max / scale;
+
+    const float zero_point_from_min_error =
+        std::abs(qmin_double) + std::abs(f_min / scale);
+
+    const float zero_point_from_max_error =
+        std::abs(qmax_double) + std::abs(f_max / scale);
+
+    const float zero_point_double =
+        zero_point_from_min_error < zero_point_from_max_error
+            ? zero_point_from_min
+            : zero_point_from_max;
+
+    // Now we need to nudge the zero point to be an integer
+    // (our zero points are integer, and this is motivated by the requirement
+    // to be able to represent the real value "0" exactly as a quantized value,
+    // which is required in multiple places, for example in Im2col with SAME
+    //  padding).
+
+    T nudged_zero_point = 0;
+    if (zero_point_double < qmin_double) {
+      nudged_zero_point = qmin;
+    } else if (zero_point_double > qmax_double) {
+      nudged_zero_point = qmax;
+    } else {
+      nudged_zero_point = static_cast<T>(round(zero_point_double));
+    }
+
+    // The zero point should always be in the range of quantized values,
+    // [qmin, qmax].
+    CHECK_GE(nudged_zero_point, qmin);
+    CHECK_LE(nudged_zero_point, qmax);
+
+    zero_point = nudged_zero_point;
+    // finally, return the values
     return {scale, zero_point};
   }
 
@@ -514,6 +567,7 @@
 template <typename T>
 TensorType GetTensorType() {
   if (std::is_same<T, float>::value) return TensorType_FLOAT32;
+  if (std::is_same<T, TfLiteFloat16>::value) return TensorType_FLOAT16;
   if (std::is_same<T, int32_t>::value) return TensorType_INT32;
   if (std::is_same<T, uint8_t>::value) return TensorType_UINT8;
   if (std::is_same<T, string>::value) return TensorType_STRING;
diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc
index cc55671..8bca828 100644
--- a/tensorflow/lite/kernels/transpose_conv.cc
+++ b/tensorflow/lite/kernels/transpose_conv.cc
@@ -21,6 +21,7 @@
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/cpu_backend_support.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
@@ -85,11 +86,13 @@
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* data = new OpData;
   eigen_support::IncrementUsageCounter(context);
+  cpu_backend_support::IncrementUsageCounter(context);
   return data;
 }
 
 void Free(TfLiteContext* context, void* buffer) {
   eigen_support::DecrementUsageCounter(context);
+  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -306,8 +309,9 @@
 }
 
 template <KernelType kernel_type>
-void EvalFloat(const TfLiteTransposeConvParams* params, const OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* weights,
+void EvalFloat(TfLiteContext* context, const TfLiteTransposeConvParams* params,
+               const OpData* data, const TfLiteTensor* input,
+               const TfLiteTensor* weights,
                const TfLiteTensor* transposed_weights, TfLiteTensor* col2im,
                TfLiteTensor* output) {
   tflite::ConvParams op_params;
@@ -333,7 +337,8 @@
           GetTensorShape(transposed_weights),
           GetTensorData<float>(transposed_weights), GetTensorShape(output),
           GetTensorData<float>(output), GetTensorShape(col2im),
-          GetTensorData<float>(col2im));
+          GetTensorData<float>(col2im),
+          cpu_backend_support::GetFromContext(context));
       break;
     }
   }
@@ -404,9 +409,11 @@
   const int filter_width = SizeOfDimension(weights, 2);
   const int filter_height = SizeOfDimension(weights, 1);
 
+  int unused_output_height, unused_output_width;
   data->padding = ComputePaddingHeightWidth(
-      params->stride_height, params->stride_width, 1, height, width,
-      filter_height, filter_width, params->padding);
+      params->stride_height, params->stride_width, 1, 1, height, width,
+      filter_height, filter_width, params->padding, &unused_output_height,
+      &unused_output_width);
 
   // Currently support float32 and uint8.
   switch (input->type) {
@@ -417,8 +424,8 @@
           ResizeAndTransposeWeights(context, weights, transposed_weights);
         }
       }
-      EvalFloat<kernel_type>(params, data, input, weights, transposed_weights,
-                             col2im, output);
+      EvalFloat<kernel_type>(context, params, data, input, weights,
+                             transposed_weights, col2im, output);
       break;
     }
     case kTfLiteUInt8: {
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index bc35d90..c9f9f15 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -331,6 +331,7 @@
   std::vector<float> cell_to_forget_weights_;
   std::vector<float> cell_to_output_weights_;
   std::vector<float> projection_weights_;
+  std::vector<float> projection_bias_;
 
   // LSTM input is stored as num_batch x num_inputs vector.
   std::vector<std::vector<float>> lstm_input_;
@@ -1733,6 +1734,675 @@
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
 }
 
+class NoCifgPeepholeProjectionAndBiasClippingLstmTest : public BaseLstmTest {
+  void SetUp() override {
+    input_to_input_weights_ = {
+        0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+        0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+        -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+        -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+        -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+        -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+        -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+        0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+        0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+        0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+        -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+        0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+        -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+        -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+        -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+        0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+        -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+        -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+        -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+        -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677};
+
+    input_to_forget_weights_ = {
+        -0.0018401089, -0.004852237, 0.03698424,    0.014181704,
+        0.028273236,   -0.016726194, -0.05249759,   -0.10204261,
+        0.00861066,    -0.040979505, -0.009899187,  0.01923892,
+        -0.028177269,  -0.08535103,  -0.14585495,   0.10662567,
+        -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+        0.0030784295,  0.076784775,  0.07463696,    0.094531395,
+        0.0814421,     -0.12257899,  -0.033945758,  -0.031303465,
+        0.045630626,   0.06843887,   -0.13492945,   -0.012480007,
+        -0.0811829,    -0.07224499,  -0.09628791,   0.045100946,
+        0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+        0.06958324,    0.034257296,  0.0482646,     0.06267997,
+        0.052625068,   0.12784666,   0.07077897,    0.025725935,
+        0.04165009,    0.07241905,   0.018668644,   -0.037377294,
+        -0.06277783,   -0.08833636,  -0.040120605,  -0.011405586,
+        -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+        0.05483423,    0.11449111,   0.11289652,    0.10939839,
+        0.13396506,    -0.08402166,  -0.01901462,   -0.044678304,
+        -0.07720565,   0.014350063,  -0.11757958,   -0.0652038,
+        -0.08185733,   -0.076754324, -0.092614375,  0.10405491,
+        0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+        0.036881298,   0.02913376,   0.03420159,    0.05448447,
+        -0.054523353,  0.02582715,   0.02327355,    -0.011857179,
+        -0.0011980024, -0.034641717, -0.026125094,  -0.17582615,
+        -0.15923657,   -0.27486774,  -0.0006143371, 0.0001771948,
+        -8.470171e-05, 0.02651807,   0.045790765,   0.06956496};
+
+    input_to_cell_weights_ = {
+        -0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+        -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+        -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+        -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+        -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+        0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+        -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+        0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+        -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+        -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+        -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+        0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+        0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+        0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+        -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+        -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+        -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+        -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+        -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+        -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+        0.05453865,    0.091149814,   0.06387331,    0.007518393,
+        0.055960953,   0.069779344,   0.046411168,   0.10509911,
+        0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+        0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+        0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042};
+
+    input_to_output_weights_ = {
+        -0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+        -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+        0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+        -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+        -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+        0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+        -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+        -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+        -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+        -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+        0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+        0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+        0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+        -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+        0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+        0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+        -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+        0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+        -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+        -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956};
+
+    input_gate_bias_ = {0.02234832,   0.14757581,  0.18176508,  0.10380666,
+                        0.053110216,  -0.06928846, -0.13942584, -0.11816189,
+                        0.19483899,   0.03652339,  -0.10250295, 0.036714908,
+                        -0.18426876,  0.036065217, 0.21810818,  0.02383196,
+                        -0.043370757, 0.08690144,  -0.04444982, 0.00030581196};
+
+    forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                         0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                         0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                         -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                         0.40694186,  0.06030037,   0.012413437, -0.06108739};
+
+    cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                       -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                       -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                       -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                       0.016178843,  0.1749513,    0.13975595,   0.92058027};
+
+    output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469,   0.12648113,
+                         0.027195795, 0.35373217,    -0.018957434, 0.008907322,
+                         -0.0762701,  0.12018895,    0.04216877,   0.0022856654,
+                         0.040952638, 0.3147856,     0.08225149,   -0.057416286,
+                         -0.14995944, -0.008040261,  0.13208859,   0.029760877};
+
+    recurrent_to_input_weights_ = {
+        -0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+        -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+        -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+        -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+        0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+        0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+        -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+        0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+        -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+        0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+        -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+        0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+        -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+        0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+        -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+        -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+        -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+        -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+        -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+        0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+        0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+        0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+        0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+        0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+        -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+        -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+        0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+        -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+        -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+        -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+        -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+        -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+        -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+        0.0365468,      0.07590991,     0.08838724,    0.021681072,
+        -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+        0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+        -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+        -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+        0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+        -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+        -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+        0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+        -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+        0.015963363,    0.00871737,     0.060130805,   0.028611384,
+        0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+        0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+        0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+        0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+        0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+        -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+        -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+        -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+        -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+        -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+        0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+        0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+        -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+        0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+        0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+        0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+        -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+        -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+        0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+        -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+        -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+        0.06358255,     0.18531723,     0.07759293,    0.12006465,
+        0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+        -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+        -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+        0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+        0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+        0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+        0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+        -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+        -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+        -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+        -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+        0.026351685,    0.012641483,    0.07466548,    0.044301085,
+        -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+        -0.04106223,    -0.028126027,   0.028473156,   0.10467447};
+
+    recurrent_to_cell_weights_ = {
+        -0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+        0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+        0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+        -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+        0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+        0.08089997,     0.05143358,    0.038261272,   0.03339287,
+        -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+        -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+        -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+        -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+        0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+        -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+        -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+        0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+        0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+        0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+        -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+        0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+        0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+        -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+        0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+        0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+        0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+        -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+        0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+        -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+        0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+        -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+        0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+        -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+        0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+        0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+        -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+        0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+        -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+        0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+        -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+        -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+        -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+        -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+        0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+        0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+        -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+        0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+        0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+        0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+        -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+        0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+        0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+        0.02295182,     0.030739572,   0.056506045,   0.004612461,
+        0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+        -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+        0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+        -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+        0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+        -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+        -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+        -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+        -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+        0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+        0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+        -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+        -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+        -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+        -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+        -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+        0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+        0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+        -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+        0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+        0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+        -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+        -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+        0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+        -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+        -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+        0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+        -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+        -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+        -0.008799762,   0.056595087,   0.0022273948,  0.055752404};
+
+    recurrent_to_forget_weights_ = {
+        -0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+        0.14811787,    0.10826372,    0.09471067,     0.03987225,
+        -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+        0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+        0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+        -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+        -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+        0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+        -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+        -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+        0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+        -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+        -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+        -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+        0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+        0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+        -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+        0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+        0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+        -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+        -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+        0.060212336,   0.055259194,   0.06974018,     0.049454916,
+        -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+        0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+        -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+        0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+        -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+        0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+        0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+        0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+        -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+        -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+        -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+        0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+        0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+        0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+        0.052958444,   0.07558703,    0.04817258,     0.044462286,
+        -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+        0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+        0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+        -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+        -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+        -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+        0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+        0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+        0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+        0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+        -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+        -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+        0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+        -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+        -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+        -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+        -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+        0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+        -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+        -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+        0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+        -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+        0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+        0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+        0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+        0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+        0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+        0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+        -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+        0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+        -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+        -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+        0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+        -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+        -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+        0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+        0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+        0.014410365,   0.020995233,   0.17040324,     0.11511526,
+        0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+        -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+        -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+        0.007076659,   0.10964551,    0.0409152,      0.008275321,
+        -0.07283536,   0.07937492,    0.04192024,     -0.1075027};
+
+    recurrent_to_output_weights_ = {
+        0.025825322,   -0.05813119,   0.09495884,     -0.045984812,
+        -0.01255415,   -0.0026479573, -0.08196161,    -0.054914974,
+        -0.0046604523, -0.029587349,  -0.044576716,   -0.07480124,
+        -0.082868785,  0.023254942,   0.027502948,    -0.0039728214,
+        -0.08683098,   -0.08116779,   -0.014675607,   -0.037924774,
+        -0.023314456,  -0.007401714,  -0.09255757,    0.029460307,
+        -0.08829125,   -0.005139627,  -0.08989442,    -0.0555066,
+        0.13596267,    -0.025062224,  -0.048351806,   -0.03850004,
+        0.07266485,    -0.022414139,  0.05940088,     0.075114764,
+        0.09597592,    -0.010211725,  -0.0049794707,  -0.011523867,
+        -0.025980417,  0.072999895,   0.11091378,     -0.081685916,
+        0.014416728,   0.043229222,   0.034178585,    -0.07530371,
+        0.035837382,   -0.085607,     -0.007721233,   -0.03287832,
+        -0.043848954,  -0.06404588,   -0.06632928,    -0.073643476,
+        0.008214239,   -0.045984086,  0.039764922,    0.03474462,
+        0.060612556,   -0.080590084,  0.049127717,    0.04151091,
+        -0.030063879,  0.008801774,   -0.023021035,   -0.019558564,
+        0.05158114,    -0.010947698,  -0.011825728,   0.0075720972,
+        0.0699727,     -0.0039981045, 0.069350146,    0.08799282,
+        0.016156472,   0.035502106,   0.11695009,     0.006217345,
+        0.13392477,    -0.037875112,  0.025745004,    0.08940699,
+        -0.00924166,   0.0046702605,  -0.036598757,   -0.08811812,
+        0.10522024,    -0.032441203,  0.008176899,    -0.04454919,
+        0.07058152,    0.0067963637,  0.039206743,    0.03259838,
+        0.03725492,    -0.09515802,   0.013326398,    -0.052055415,
+        -0.025676316,  0.03198509,    -0.015951829,   -0.058556724,
+        0.036879618,   0.043357447,   0.028362012,    -0.05908629,
+        0.0059240665,  -0.04995891,   -0.019187413,   0.0276265,
+        -0.01628143,   0.0025863599,  0.08800015,     0.035250366,
+        -0.022165963,  -0.07328642,   -0.009415526,   -0.07455109,
+        0.11690406,    0.0363299,     0.07411125,     0.042103454,
+        -0.009660886,  0.019076364,   0.018299393,    -0.046004917,
+        0.08891175,    0.0431396,     -0.026327137,   -0.051502608,
+        0.08979574,    -0.051670972,  0.04940282,     -0.07491107,
+        -0.021240504,  0.022596184,   -0.034280192,   0.060163025,
+        -0.058211457,  -0.051837247,  -0.01349775,    -0.04639988,
+        -0.035936575,  -0.011681591,  0.064818054,    0.0073146066,
+        -0.021745546,  -0.043124277,  -0.06471268,    -0.07053354,
+        -0.029321948,  -0.05330136,   0.016933719,    -0.053782392,
+        0.13747959,    -0.1361751,    -0.11569455,    0.0033329215,
+        0.05693899,    -0.053219706,  0.063698,       0.07977434,
+        -0.07924483,   0.06936997,    0.0034815092,   -0.007305279,
+        -0.037325785,  -0.07251102,   -0.033633437,   -0.08677009,
+        0.091591336,   -0.14165086,   0.021752775,    0.019683983,
+        0.0011612234,  -0.058154266,  0.049996935,    0.0288841,
+        -0.0024567875, -0.14345716,   0.010955264,    -0.10234828,
+        0.1183656,     -0.0010731248, -0.023590032,   -0.072285876,
+        -0.0724771,    -0.026382286,  -0.0014920527,  0.042667855,
+        0.0018776858,  0.02986552,    0.009814309,    0.0733756,
+        0.12289186,    0.018043943,   -0.0458958,     0.049412545,
+        0.033632483,   0.05495232,    0.036686596,    -0.013781798,
+        -0.010036754,  0.02576849,    -0.08307328,    0.010112348,
+        0.042521734,   -0.05869831,   -0.071689695,   0.03876447,
+        -0.13275425,   -0.0352966,    -0.023077697,   0.10285965,
+        0.084736146,   0.15568255,    -0.00040734606, 0.027835453,
+        -0.10292561,   -0.032401145,  0.10053256,     -0.026142767,
+        -0.08271222,   -0.0030240538, -0.016368777,   0.1070414,
+        0.042672627,   0.013456989,   -0.0437609,     -0.022309763,
+        0.11576483,    0.04108048,    0.061026827,    -0.0190714,
+        -0.0869359,    0.037901703,   0.0610107,      0.07202949,
+        0.01675338,    0.086139716,   -0.08795751,    -0.014898893,
+        -0.023771819,  -0.01965048,   0.007955471,    -0.043740474,
+        0.03346837,    -0.10549954,   0.090567775,    0.042013682,
+        -0.03176985,   0.12569028,    -0.02421228,    -0.029526481,
+        0.023851605,   0.031539805,   0.05292009,     -0.02344001,
+        -0.07811758,   -0.08834428,   0.10094801,     0.16594367,
+        -0.06861939,   -0.021256343,  -0.041093912,   -0.06669611,
+        0.035498552,   0.021757556,   -0.09302526,    -0.015403468,
+        -0.06614931,   -0.051798206,  -0.013874718,   0.03630673,
+        0.010412845,   -0.08077351,   0.046185967,    0.0035662893,
+        0.03541868,    -0.094149634,  -0.034814864,   0.003128424,
+        -0.020674974,  -0.03944324,   -0.008110165,   -0.11113267,
+        0.08484226,    0.043586485,   0.040582247,    0.0968012,
+        -0.065249965,  -0.028036479,  0.0050708856,   0.0017462453,
+        0.0326779,     0.041296225,   0.09164146,     -0.047743853,
+        -0.015952192,  -0.034451712,  0.084197424,    -0.05347844,
+        -0.11768019,   0.085926116,   -0.08251791,    -0.045081906,
+        0.0948852,     0.068401024,   0.024856757,    0.06978981,
+        -0.057309967,  -0.012775832,  -0.0032452994,  0.01977615,
+        -0.041040014,  -0.024264973,  0.063464895,    0.05431621,
+    };
+
+    cell_to_input_weights_ = {
+        0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+        -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+        -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+        0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175};
+
+    cell_to_forget_weights_ = {
+        -0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+        -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+        -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+        0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355};
+
+    cell_to_output_weights_ = {
+        0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+        -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+        -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+        0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733};
+
+    projection_weights_ = {
+        -0.009802181, 0.09401916,   0.0717386,     -0.13895074,
+        0.09641832,   0.060420845,  0.08539281,    0.054285463,
+        0.061395317,  0.034448683,  -0.042991187,  0.019801661,
+        -0.16840284,  -0.015726732, -0.23041931,   -0.024478018,
+        -0.10959692,  -0.013875541, 0.18600968,    -0.061274476,
+        0.0138165,    -0.08160894,  -0.07661644,   0.032372914,
+        0.16169067,   0.22465782,   -0.03993472,   -0.004017731,
+        0.08633481,   -0.28869787,  0.08682067,    0.17240396,
+        0.014975425,  0.056431185,  0.031037588,   0.16702051,
+        0.0077946745, 0.15140012,   0.29405436,    0.120285,
+        -0.188994,    -0.027265169, 0.043389652,   -0.022061434,
+        0.014777949,  -0.20203483,  0.094781205,   0.19100232,
+        0.13987629,   -0.036132768, -0.06426278,   -0.05108664,
+        0.13221376,   0.009441198,  -0.16715929,   0.15859416,
+        -0.040437475, 0.050779544,  -0.022187516,  0.012166504,
+        0.027685808,  -0.07675938,  -0.0055694645, -0.09444123,
+        0.0046453946, 0.050794356,  0.10770313,    -0.20790008,
+        -0.07149004,  -0.11425117,  0.008225835,   -0.035802525,
+        0.14374903,   0.15262283,   0.048710253,   0.1847461,
+        -0.007487823, 0.11000021,   -0.09542012,   0.22619456,
+        -0.029149994, 0.08527916,   0.009043713,   0.0042746216,
+        0.016261552,  0.022461696,  0.12689082,    -0.043589946,
+        -0.12035478,  -0.08361797,  -0.050666027,  -0.1248618,
+        -0.1275799,   -0.071875185, 0.07377272,    0.09944291,
+        -0.18897448,  -0.1593054,   -0.06526116,   -0.040107165,
+        -0.004618631, -0.067624845, -0.007576253,  0.10727444,
+        0.041546922,  -0.20424393,  0.06907816,    0.050412357,
+        0.00724631,   0.039827548,  0.12449835,    0.10747581,
+        0.13708383,   0.09134148,   -0.12617786,   -0.06428341,
+        0.09956831,   0.1208086,    -0.14676677,   -0.0727722,
+        0.1126304,    0.010139365,  0.015571211,   -0.038128063,
+        0.022913318,  -0.042050496, 0.16842307,    -0.060597885,
+        0.10531834,   -0.06411776,  -0.07451711,   -0.03410368,
+        -0.13393489,  0.06534304,   0.003620307,   0.04490757,
+        0.05970546,   0.05197996,   0.02839995,    0.10434969,
+        -0.013699693, -0.028353551, -0.07260381,   0.047201227,
+        -0.024575593, -0.036445823, 0.07155557,    0.009672501,
+        -0.02328883,  0.009533515,  -0.03606021,   -0.07421458,
+        -0.028082801, -0.2678904,   -0.13221288,   0.18419984,
+        -0.13012612,  -0.014588381, -0.035059117,  -0.04824723,
+        0.07830115,   -0.056184657, 0.03277091,    0.025466874,
+        0.14494097,   -0.12522776,  -0.098633975,  -0.10766018,
+        -0.08317623,  0.08594209,   0.07749552,    0.039474737,
+        0.1776665,    -0.07409566,  -0.0477268,    0.29323658,
+        0.10801441,   0.1154011,    0.013952499,   0.10739139,
+        0.10708251,   -0.051456142, 0.0074137426,  -0.10430189,
+        0.10034707,   0.045594677,  0.0635285,     -0.0715442,
+        -0.089667566, -0.10811871,  0.00026344223, 0.08298446,
+        -0.009525053, 0.006585689,  -0.24567553,   -0.09450807,
+        0.09648481,   0.026996298,  -0.06419476,   -0.04752702,
+        -0.11063944,  -0.23441927,  -0.17608605,   -0.052156363,
+        0.067035615,  0.19271925,   -0.0032889997, -0.043264326,
+        0.09663576,   -0.057112187, -0.10100678,   0.0628376,
+        0.04447668,   0.017961001,  -0.10094388,   -0.10190601,
+        0.18335468,   0.10494553,   -0.052095775,  -0.0026118709,
+        0.10539724,   -0.04383912,  -0.042349473,  0.08438151,
+        -0.1947263,   0.02251204,   0.11216432,    -0.10307853,
+        0.17351969,   -0.039091777, 0.08066188,    -0.00561982,
+        0.12633002,   0.11335965,   -0.0088127935, -0.019777594,
+        0.06864014,   -0.059751723, 0.016233567,   -0.06894641,
+        -0.28651384,  -0.004228674, 0.019708522,   -0.16305895,
+        -0.07468996,  -0.0855457,   0.099339016,   -0.07580735,
+        -0.13775392,  0.08434318,   0.08330512,    -0.12131499,
+        0.031935584,  0.09180414,   -0.08876437,   -0.08049874,
+        0.008753825,  0.03498998,   0.030215185,   0.03907079,
+        0.089751154,  0.029194152,  -0.03337423,   -0.019092513,
+        0.04331237,   0.04299654,   -0.036394123,  -0.12915532,
+        0.09793732,   0.07512415,   -0.11319543,   -0.032502122,
+        0.15661901,   0.07671967,   -0.005491124,  -0.19379048,
+        -0.218606,    0.21448623,   0.017840758,   0.1416943,
+        -0.07051762,  0.19488361,   0.02664691,    -0.18104725,
+        -0.09334311,  0.15026465,   -0.15493552,   -0.057762887,
+        -0.11604192,  -0.262013,    -0.01391798,   0.012185008,
+        0.11156489,   -0.07483202,  0.06693364,    -0.26151478,
+        0.046425626,  0.036540434,  -0.16435726,   0.17338543,
+        -0.21401681,  -0.11385144,  -0.08283257,   -0.069031075,
+        0.030635102,  0.010969227,  0.11109743,    0.010919218,
+        0.027526086,  0.13519906,   0.01891392,    -0.046839405,
+        -0.040167913, 0.017953383,  -0.09700955,   0.0061885654,
+        -0.07000971,  0.026893595,  -0.038844477,  0.14543656};
+
+    projection_bias_ = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
+                        0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6};
+
+    lstm_input_ = {
+        {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+         0.787926, 0.151646, 0.071352, 0.118426, 0.458058,   // step 0
+         0.596268, 0.998386, 0.568695, 0.864524, 0.571277,   // step 1
+         0.073204, 0.296072, 0.743333, 0.069199, 0.045348,   // step 2
+         0.867394, 0.291279, 0.013714, 0.482521, 0.626339},  // step 3
+
+        {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+         0.295743, 0.544053, 0.690064, 0.858138, 0.497181,  // step 0
+         0.642421, 0.524260, 0.134799, 0.003639, 0.162482,  // step 1
+         0.640394, 0.930399, 0.050782, 0.432485, 0.988078,  // step 2
+         0.082922, 0.563329, 0.865614, 0.333232, 0.259916}  // step 3
+    };
+
+    lstm_golden_output_ = {
+        {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+         0.0960319489, 0.229351997,  0.297207743,  0.415997744,  0.491644233,
+         0.578822136,  0.728351235,  0.788540304,  0.909073055,  0.975599587,
+         1.08478093,   1.17409372,   1.30914319,   1.4041512,    1.51714694,
+         1.61342025,   0.0634541437, 0.190279216,  0.317923307,  0.415168911,
+         0.458113253,  0.609743774,  0.731511116,  0.795806408,  0.876155913,
+         0.960330188,  1.12396312,   1.22149014,   1.33917773,   1.43213499,
+         1.54139447,   1.65451813,   0.0485293195, 0.160991609,  0.337073475,
+         0.428976893,  0.459505379,  0.617044866,  0.743735075,  0.790821671,
+         0.85271728,   0.946818829,  1.12779701,   1.23345077,   1.35309088,
+         1.44595909,   1.56173062,   1.67839324,   0.0445971154, 0.156434938,
+         0.341761589,  0.425259203,  0.449760497,  0.633765697,  0.745093822,
+         0.791106999,  0.84820503,   0.952787101,  1.13438797,   1.24063754,
+         1.34668994,   1.44879568,   1.57038593,   1.67956686},
+        {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+         0.0861309841, 0.228726774,  0.296653062,  0.40733397,   0.47120741,
+         0.581307411,  0.719366193,  0.788456261,  0.904226124,  0.965476751,
+         1.10223258,   1.19042683,   1.32106233,   1.41333091,   1.51509535,
+         1.62168002,   0.0652779415, 0.18218407,   0.324066937,  0.42611438,
+         0.47292757,   0.602282405,  0.739310443,  0.791508496,  0.870626807,
+         0.955534995,  1.10976851,   1.21598971,   1.34197009,   1.43256509,
+         1.54804492,   1.65581059,   0.0492607877, 0.169714347,  0.332315415,
+         0.419173867,  0.44699502,   0.630063772,  0.737177074,  0.792844594,
+         0.858417571,  0.956391335,  1.13453305,   1.23976779,   1.34693861,
+         1.4410423,    1.55988359,   1.67204297,   0.0390465111, 0.15099439,
+         0.3439475,    0.424439192,  0.444207728,  0.632501483,  0.742233515,
+         0.791400731,  0.845713973,  0.944575012,  1.14116096,   1.24791968,
+         1.35954499,   1.45086145,   1.56633317,   1.68943977}};
+  }
+};
+
+TEST_F(NoCifgPeepholeProjectionAndBiasClippingLstmTest, LstmBlackBoxTest) {
+  const int n_batch = 2;  // Batch0 and Batch1 in lstm_input_
+  const int n_input = 5;  // values per step in lstm_input_
+  const int n_cell = 20;  // entries in each cell_to_*_weights_ vector
+  const int n_output = 16;  // entries in projection_bias_
+  const int sequence_length = 4;  // steps 0..3 per batch in lstm_input_
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/true,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,  // presumably disables clipping
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {n_output},          // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      });
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
+  lstm.SetProjectionBias(projection_bias_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc
index 99ad4bb..3af2e96 100644
--- a/tensorflow/lite/kernels/unpack.cc
+++ b/tensorflow/lite/kernels/unpack.cc
@@ -27,24 +27,9 @@
 
 constexpr int kInputTensor = 0;
 
-// Op data for unpack op.
-struct OpData {
-  int num;
-  int axis;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-  data->axis = 0;
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+  const TfLiteUnpackParams* data =
+      reinterpret_cast<TfLiteUnpackParams*>(node->builtin_data);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), data->num);
@@ -57,9 +42,10 @@
     axis += NumDimensions(input);
   }
   TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input));
-  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) {
-    context->ReportError(context,
-                         "Currently pack only supports int32 and float32.");
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32 &&
+      input->type != kTfLiteUInt8 && input->type != kTfLiteInt8) {
+    context->ReportError(context, "Type '%s' is not supported by unpack.",
+                         TfLiteTypeGetName(input->type));
     return kTfLiteError;
   }
 
@@ -79,6 +65,11 @@
     TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape);
     TfLiteTensor* output = GetOutput(context, node, i);
     TF_LITE_ENSURE_EQ(context, output->type, input->type);
+    // Input/output quantization must match (no rescaling on unpack). scale is
+    // a float, so use TF_LITE_ENSURE: TF_LITE_ENSURE_EQ reports values as ints.
+    TF_LITE_ENSURE_EQ(context, input->params.zero_point,
+                      output->params.zero_point);
+    TF_LITE_ENSURE(context, input->params.scale == output->params.scale);
     TF_LITE_ENSURE_OK(
         context, context->ResizeTensor(context, output, copied_output_shape));
   }
@@ -100,7 +91,8 @@
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+  const TfLiteUnpackParams* data =
+      reinterpret_cast<TfLiteUnpackParams*>(node->builtin_data);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   switch (input->type) {
@@ -112,9 +104,17 @@
       UnpackImpl<int32_t>(context, node, input, data->num, data->axis);
       break;
     }
+    case kTfLiteUInt8: {
+      UnpackImpl<uint8_t>(context, node, input, data->num, data->axis);
+      break;
+    }
+    case kTfLiteInt8: {
+      UnpackImpl<int8_t>(context, node, input, data->num, data->axis);
+      break;
+    }
     default: {
-      context->ReportError(context,
-                           "Currently pack only supports int32 and float32.");
+      context->ReportError(context, "Type '%s' is not supported by unpack.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
     }
   }
@@ -125,7 +125,7 @@
 }  // namespace unpack
 
 TfLiteRegistration* Register_UNPACK() {
-  static TfLiteRegistration r = {unpack::Init, unpack::Free, unpack::Prepare,
+  static TfLiteRegistration r = {nullptr, nullptr, unpack::Prepare,
                                  unpack::Eval};
   return &r;
 }
diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc
index 365970d..487fc95 100644
--- a/tensorflow/lite/kernels/unpack_test.cc
+++ b/tensorflow/lite/kernels/unpack_test.cc
@@ -159,6 +159,104 @@
                  /*type=*/TensorType_INT32);
 }
 
+// uint8 tests: unpack 2-D and 3-D uint8 inputs along positive/negative axes.
+TEST(UnpackOpTest, Uint8ThreeOutputs) {
+  Check<uint8_t>(/*axis=*/0, /*input_shape=*/{3, 2},
+                 /*input_data=*/{1, 2, 3, 4, 5, 6},
+                 /*expected_output_shape=*/{{2}, {2}, {2}},
+                 /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}},
+                 /*type=*/TensorType_UINT8);
+}
+
+TEST(UnpackOpTest, Uint8ThreeOutputsAxisOne) {
+  Check<uint8_t>(/*axis=*/1, /*input_shape=*/{3, 2},
+                 /*input_data=*/{1, 2, 3, 4, 5, 6},
+                 /*expected_output_shape=*/{{3}, {3}},
+                 /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}},
+                 /*type=*/TensorType_UINT8);
+}
+
+TEST(UnpackOpTest, Uint8ThreeOutputsNegativeAxisOne) {
+  Check<uint8_t>(/*axis=*/-1, /*input_shape=*/{3, 2},  // same as axis=1
+                 /*input_data=*/{1, 2, 3, 4, 5, 6},
+                 /*expected_output_shape=*/{{3}, {3}},
+                 /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}},
+                 /*type=*/TensorType_UINT8);
+}
+
+TEST(UnpackOpTest, Uint8ThreeOutputsNegativeAxisTwo) {
+  Check<uint8_t>(/*axis=*/-2, /*input_shape=*/{3, 2},  // same as axis=0
+                 /*input_data=*/{1, 2, 3, 4, 5, 6},
+                 /*expected_output_shape=*/{{2}, {2}, {2}},
+                 /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}},
+                 /*type=*/TensorType_UINT8);
+}
+
+TEST(UnpackOpTest, Uint8OneOutput) {
+  Check<uint8_t>(/*axis=*/0, /*input_shape=*/{1, 6},
+                 /*input_data=*/{1, 2, 3, 4, 5, 6},
+                 /*expected_output_shape=*/{{6}},
+                 /*expected_output_data=*/{{1, 2, 3, 4, 5, 6}},
+                 /*type=*/TensorType_UINT8);
+}
+
+TEST(UnpackOpTest, Uint8ThreeDimensionsOutputs) {
+  Check<uint8_t>(/*axis=*/2, /*input_shape=*/{2, 2, 2},
+                 /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8},
+                 /*expected_output_shape=*/{{2, 2}, {2, 2}},
+                 /*expected_output_data=*/{{1, 3, 5, 7}, {2, 4, 6, 8}},
+                 /*type=*/TensorType_UINT8);
+}
+
+// int8 tests: unpack 2-D and 3-D int8 inputs along positive/negative axes.
+TEST(UnpackOpTest, Int8ThreeOutputs) {
+  Check<int8_t>(/*axis=*/0, /*input_shape=*/{3, 2},
+                /*input_data=*/{1, 2, 3, 4, 5, 6},
+                /*expected_output_shape=*/{{2}, {2}, {2}},
+                /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}},
+                /*type=*/TensorType_INT8);
+}
+
+TEST(UnpackOpTest, Int8ThreeOutputsAxisOne) {
+  Check<int8_t>(/*axis=*/1, /*input_shape=*/{3, 2},
+                /*input_data=*/{1, 2, 3, 4, 5, 6},
+                /*expected_output_shape=*/{{3}, {3}},
+                /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}},
+                /*type=*/TensorType_INT8);
+}
+
+TEST(UnpackOpTest, Int8ThreeOutputsNegativeAxisOne) {
+  Check<int8_t>(/*axis=*/-1, /*input_shape=*/{3, 2},  // same as axis=1
+                /*input_data=*/{1, 2, 3, 4, 5, 6},
+                /*expected_output_shape=*/{{3}, {3}},
+                /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}},
+                /*type=*/TensorType_INT8);
+}
+
+TEST(UnpackOpTest, Int8ThreeOutputsNegativeAxisTwo) {
+  Check<int8_t>(/*axis=*/-2, /*input_shape=*/{3, 2},  // same as axis=0
+                /*input_data=*/{1, 2, 3, 4, 5, 6},
+                /*expected_output_shape=*/{{2}, {2}, {2}},
+                /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}},
+                /*type=*/TensorType_INT8);
+}
+
+TEST(UnpackOpTest, Int8OneOutput) {
+  Check<int8_t>(/*axis=*/0, /*input_shape=*/{1, 6},
+                /*input_data=*/{1, 2, 3, 4, 5, 6},
+                /*expected_output_shape=*/{{6}},
+                /*expected_output_data=*/{{1, 2, 3, 4, 5, 6}},
+                /*type=*/TensorType_INT8);
+}
+
+TEST(UnpackOpTest, Int8ThreeDimensionsOutputs) {
+  Check<int8_t>(/*axis=*/2, /*input_shape=*/{2, 2, 2},
+                /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8},
+                /*expected_output_shape=*/{{2, 2}, {2, 2}},
+                /*expected_output_data=*/{{1, 3, 5, 7}, {2, 4, 6, 8}},
+                /*type=*/TensorType_INT8);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index d11c5cd..4992095 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -25,9 +25,6 @@
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 #include "tensorflow/lite/model.h"
-#ifndef TFLITE_MCU
-#include "tensorflow/lite/nnapi_delegate.h"
-#endif
 #include "tensorflow/lite/version.h"
 
 namespace tflite {
@@ -69,9 +66,6 @@
                                                   bool use_nnapi) {
   std::unique_ptr<Allocation> allocation;
   if (mmap_file && MMAPAllocation::IsSupported()) {
-    if (use_nnapi && NNAPIDelegate::IsSupported())
-      allocation.reset(new NNAPIAllocation(filename, error_reporter));
-    else
-      allocation.reset(new MMAPAllocation(filename, error_reporter));
+    allocation.reset(new MMAPAllocation(filename, error_reporter));
   } else {
     allocation.reset(new FileCopyAllocation(filename, error_reporter));
diff --git a/tensorflow/lite/models/smartreply/ops/predict.cc b/tensorflow/lite/models/smartreply/ops/predict.cc
index 24b7d54..38ebe8b 100644
--- a/tensorflow/lite/models/smartreply/ops/predict.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict.cc
@@ -29,6 +29,7 @@
 
 #include <algorithm>
 #include <cstdlib>
+#include <cstdio>
 #include <unordered_map>
 #include <vector>
 
diff --git a/tensorflow/lite/mutable_op_resolver_test.cc b/tensorflow/lite/mutable_op_resolver_test.cc
index 64fc68a..22641eb 100644
--- a/tensorflow/lite/mutable_op_resolver_test.cc
+++ b/tensorflow/lite/mutable_op_resolver_test.cc
@@ -40,11 +40,21 @@
   return kTfLiteOk;
 }
 
+TfLiteStatus Dummy2Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;  // stub: always reports success
+}
+
+void* Dummy2Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;  // stub: allocates no per-node state
+}
+
+void Dummy2Free(TfLiteContext* context, void* buffer) {}  // stub: no-op
+
 TfLiteRegistration* GetDummy2Registration() {
   static TfLiteRegistration registration = {
-      .init = nullptr,
-      .free = nullptr,
-      .prepare = nullptr,
+      .init = Dummy2Init,
+      .free = Dummy2Free,
+      .prepare = Dummy2Prepare,
       .invoke = Dummy2Invoke,
   };
   return &registration;
@@ -112,8 +122,41 @@
   EXPECT_EQ(found_registration->builtin_code, BuiltinOperator_CUSTOM);
   EXPECT_TRUE(found_registration->invoke == DummyInvoke);
   EXPECT_EQ(found_registration->version, 1);
-  // TODO(ycling): The `custom_name` in TfLiteRegistration isn't properly
-  // filled yet. Fix this and add tests.
+}
+
+TEST(MutableOpResolverTest, FindCustomName) {
+  MutableOpResolver resolver;
+  TfLiteRegistration* reg = GetDummyRegistration();
+
+  reg->custom_name = "UPDATED";
+  resolver.AddCustom(reg->custom_name, reg);
+  const TfLiteRegistration* found_registration =
+      resolver.FindOp(reg->custom_name, 1);
+
+  ASSERT_NE(found_registration, nullptr);
+  EXPECT_EQ(found_registration->builtin_code, BuiltinOperator_CUSTOM);
+  EXPECT_EQ(found_registration->invoke, GetDummyRegistration()->invoke);
+  EXPECT_EQ(found_registration->version, 1);
+  EXPECT_STREQ(found_registration->custom_name, "UPDATED");  // compare text
+}
+
+TEST(MutableOpResolverTest, FindBuiltinName) {
+  MutableOpResolver resolver1;
+  TfLiteRegistration* reg = GetDummy2Registration();
+
+  reg->custom_name = "UPDATED";
+  resolver1.AddBuiltin(BuiltinOperator_ADD, reg);
+
+  // Look the op up once and fail cleanly (instead of dereferencing a null
+  // pointer) if the resolver does not return a registration.
+  const TfLiteRegistration* found = resolver1.FindOp(BuiltinOperator_ADD, 1);
+  ASSERT_NE(found, nullptr);
+  EXPECT_EQ(found->invoke, GetDummy2Registration()->invoke);
+  EXPECT_EQ(found->prepare, GetDummy2Registration()->prepare);
+  EXPECT_EQ(found->init, GetDummy2Registration()->init);
+  EXPECT_EQ(found->free, GetDummy2Registration()->free);
+  // custom_name for builtin ops will be nullptr
+  EXPECT_EQ(found->custom_name, nullptr);
 }
 
 TEST(MutableOpResolverTest, FindMissingCustomOp) {
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
index 109c6b0..d476422 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -86,6 +86,13 @@
   ANEURALNETWORKS_STRIDED_SLICE = 35,
   ANEURALNETWORKS_SUB = 36,
   ANEURALNETWORKS_TRANSPOSE = 37,
+  ANEURALNETWORKS_ABS = 38,
+  ANEURALNETWORKS_EXP = 49,
+  ANEURALNETWORKS_LOG = 60,
+  ANEURALNETWORKS_PRELU = 71,
+  ANEURALNETWORKS_RSQRT = 83,
+  ANEURALNETWORKS_SIN = 85,
+  ANEURALNETWORKS_SQRT = 88,
 };
 
 /**
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
deleted file mode 100644
index c8807e3..0000000
--- a/tensorflow/lite/nnapi_delegate.cc
+++ /dev/null
@@ -1,852 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/nnapi_delegate.h"
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/model.h"
-#include "tensorflow/lite/nnapi/nnapi_implementation.h"
-
-#ifdef __ANDROID__
-#include <android/log.h>
-#include <sys/system_properties.h>
-#endif
-
-namespace tflite {
-
-void logError(const char* format, ...) {
-  // stderr is convenient for native tests, but is not captured for apps
-  va_list args_for_stderr;
-  va_start(args_for_stderr, format);
-  vfprintf(stderr, format, args_for_stderr);
-  va_end(args_for_stderr);
-  fprintf(stderr, "\n");
-  fflush(stderr);
-#ifdef __ANDROID__
-  // produce logcat output for general consumption
-  va_list args_for_log;
-  va_start(args_for_log, format);
-  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
-  va_end(args_for_log);
-#endif
-}
-
-#define FATAL(...)       \
-  logError(__VA_ARGS__); \
-  exit(1);
-
-// TODO(aselle): Change the error model to use status codes.
-#define CHECK_TFLITE_SUCCESS(x)                                           \
-  if (x != kTfLiteOk) {                                                   \
-    FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \
-          __LINE__);                                                      \
-  }
-
-#define CHECK_NN(x)                                                     \
-  if (x != ANEURALNETWORKS_NO_ERROR) {                                  \
-    FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \
-          __LINE__);                                                    \
-  }
-
-#define RETURN_ERROR_IF_TFLITE_FAILED(x)                                       \
-  if (x != kTfLiteOk) {                                                        \
-    logError(                                                                  \
-        "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \
-        __LINE__);                                                             \
-    return kTfLiteError;                                                       \
-  }
-
-#define RETURN_ERROR_IF_NN_FAILED(x)                                          \
-  if (x != ANEURALNETWORKS_NO_ERROR) {                                        \
-    logError(                                                                 \
-        "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \
-        __LINE__);                                                            \
-    return kTfLiteError;                                                      \
-  }
-
-// Tracking of NNAPI operand ids
-static const int64_t kOperandIdNotSet = -1;
-static const int64_t kOperandNotNeeded = -2;
-
-NNAPIAllocation::NNAPIAllocation(const char* filename,
-                                 ErrorReporter* error_reporter)
-    : MMAPAllocation(filename, error_reporter) {
-  if (mmapped_buffer_ != MAP_FAILED)
-    CHECK_NN(NnApiImplementation()->ANeuralNetworksMemory_createFromFd(
-        buffer_size_bytes_, PROT_READ, mmap_fd_, 0, &handle_));
-}
-
-NNAPIAllocation::~NNAPIAllocation() {
-  if (handle_) {
-    NnApiImplementation()->ANeuralNetworksMemory_free(handle_);
-  }
-}
-
-NNAPIDelegate::~NNAPIDelegate() {
-  if (nn_compiled_model_) {
-    NnApiImplementation()->ANeuralNetworksCompilation_free(nn_compiled_model_);
-    nn_compiled_model_ = nullptr;
-  }
-  if (nn_model_) {
-    NnApiImplementation()->ANeuralNetworksModel_free(nn_model_);
-    nn_model_ = nullptr;
-    // TODO(aselle): Is this thread-safe and callable multiple times?
-  }
-  // ANeuralNetworksShutdown();
-}
-
-// Adds the tensors of the subgraph to the NN API model.
-TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
-                               ANeuralNetworksModel* nn_model,
-                               uint32_t* no_of_operands_added,
-                               std::vector<int64_t>* nnapi_ids) {
-  const NnApi* nnapi = NnApiImplementation();
-  uint32_t next_id = 0;
-  for (size_t i = 0; i < subgraph->tensors_size(); i++) {
-    // Skip temporaries and RNN back-edges.
-    if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
-
-    (*nnapi_ids)[i] = int64_t(next_id);
-
-    int32_t nn_type = 0;
-    // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
-    float scale = 0.0f;
-    int32_t zeroPoint = 0;
-    TfLiteTensor* tensor = subgraph->tensor(i);
-    switch (tensor->type) {
-      case kTfLiteNoType:
-        // Tensors added during initialization of Ops don't have a type yet and
-        // should not be registered with the NNAPI.
-        continue;
-      case kTfLiteFloat32:
-        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
-        break;
-      case kTfLiteUInt8:
-        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        break;
-      case kTfLiteInt32:
-        nn_type = ANEURALNETWORKS_TENSOR_INT32;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        break;
-      default:
-        logError("Unsupported tensor type %d", tensor->type);
-        return kTfLiteError;
-    }
-    if (tensor->dims->size == 0) {
-      logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)",
-               i, tensor->name);
-      return kTfLiteError;
-    }
-    if (tensor->dims->size > 4) {
-      logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)",
-               i, tensor->name);
-      return kTfLiteError;
-    }
-    // TODO(aselle): Note, many of these are intermediate results. Do I need
-    // to ever specify these sizes. I am currently below doing setValue
-    // on all of them, but I shouldn't in the future.
-    // Answer(jeanluc): If all the operators can set the dimension correctly,
-    // you won't need to.
-    ANeuralNetworksOperandType operand_type{
-        nn_type, static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    RETURN_ERROR_IF_NN_FAILED(
-        nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-    // TODO(aselle): Based on Michael's suggestion, limiting this to read
-    // only memory
-    if (tensor->allocation_type == kTfLiteMmapRo) {
-      if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
-              static_cast<const Allocation*>(tensor->allocation))) {
-        RETURN_ERROR_IF_NN_FAILED(
-            nnapi->ANeuralNetworksModel_setOperandValueFromMemory(
-                nn_model, next_id, alloc->memory(),
-                alloc->offset(tensor->data.raw), tensor->bytes));
-      } else {
-        RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
-            nn_model, next_id, tensor->data.raw, tensor->bytes));
-      }
-    } else if (tensor->bytes == 0) {
-      // These size 0 tensors are optional tensors reserved.
-      RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
-          nn_model, next_id, nullptr, 0));
-    }
-
-    ++next_id;
-  }
-  *no_of_operands_added = next_id;
-  return kTfLiteOk;
-}
-
-void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
-                        std::vector<uint32_t>* into,
-                        const std::vector<int64_t>& map) {
-  for (size_t i = 0; i < from_ids_count; i++) {
-    int from_id = from_ids_buf[i];
-    if (from_id == kOptionalTensor) {
-      into->push_back(from_id);
-    } else {
-      into->push_back(map[from_id]);
-    }
-  }
-}
-
-// Adds the operations and their parameters to the NN API model.
-// 'next-id' is the operand ID of the next operand of the model.
-TfLiteStatus AddOpsAndParams(
-    tflite::Subgraph* subgraph, ANeuralNetworksModel* nn_model,
-    uint32_t next_id, std::vector<int>* model_state_inputs,
-    std::vector<int>* model_state_outputs,
-    const std::vector<int64_t>& tensor_id_to_nnapi_id) {
-  const NnApi* nnapi = NnApiImplementation();
-  for (size_t i = 0; i < subgraph->nodes_size(); i++) {
-    const auto* node_and_registration = subgraph->node_and_registration(i);
-    const TfLiteNode& node = node_and_registration->first;
-    const TfLiteRegistration& registration = node_and_registration->second;
-    tflite::BuiltinOperator builtin =
-        static_cast<tflite::BuiltinOperator>(registration.builtin_code);
-
-    // Add the parameters.
-    std::vector<uint32_t> augmented_inputs, augmented_outputs;
-    MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs,
-                       tensor_id_to_nnapi_id);
-    MapAndAddTensorIds(node.outputs->data, node.outputs->size,
-                       &augmented_outputs, tensor_id_to_nnapi_id);
-
-    auto add_scalar_int32 = [nnapi, &nn_model, &augmented_inputs,
-                             &next_id](int value) {
-      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
-      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
-          nn_model, next_id, &value, sizeof(int32_t)))
-      augmented_inputs.push_back(next_id++);
-    };
-
-    auto add_scalar_float32 = [nnapi, &nn_model, &augmented_inputs,
-                               &next_id](float value) {
-      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
-      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
-          nn_model, next_id, &value, sizeof(float)))
-      augmented_inputs.push_back(next_id++);
-    };
-
-    auto add_vector_int32 = [&](const int* values, uint32_t num_values) {
-      ANeuralNetworksOperandType operand_type{
-          .type = ANEURALNETWORKS_TENSOR_INT32,
-          .dimensionCount = 1,
-          .dimensions = &num_values};
-      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
-          nn_model, next_id, values, sizeof(int32_t) * num_values));
-      augmented_inputs.push_back(next_id++);
-    };
-
-    // Handle state tensors of RNN, LSTM, SVDF.
-    // For each state_out tensor, a corresponding state_in operand needs to be
-    // created for NNAPI.
-    auto duplicate_state_tensor_float32 =
-        [nnapi, subgraph, &nn_model, &next_id, &augmented_inputs,
-         &model_state_inputs, &model_state_outputs](int tensor_id) {
-          const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
-          ANeuralNetworksOperandType operand_type{
-              ANEURALNETWORKS_TENSOR_FLOAT32,
-              static_cast<uint32_t>(tensor->dims->size),
-              reinterpret_cast<uint32_t*>(tensor->dims->data),
-              tensor->params.scale, tensor->params.zero_point};
-          CHECK_NN(
-              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-          augmented_inputs.push_back(next_id);
-          model_state_inputs->push_back(next_id);
-          model_state_outputs->push_back(tensor_id);
-          next_id++;
-        };
-    auto check_and_add_activation = [&add_scalar_int32](int activation) {
-      if (activation > kTfLiteActRelu6) {
-        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
-        return kTfLiteError;
-      }
-      add_scalar_int32(activation);
-      return kTfLiteOk;
-    };
-
-    auto add_add_params = [&add_scalar_int32](void* data) {
-      auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
-      if (builtin->activation > kTfLiteActRelu6) {
-        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
-        return kTfLiteError;
-      }
-      add_scalar_int32(builtin->activation);
-      return kTfLiteOk;
-    };
-
-    auto add_pooling_params = [&add_scalar_int32,
-                               &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      add_scalar_int32(builtin->filter_width);
-      add_scalar_int32(builtin->filter_height);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_convolution_params = [&add_scalar_int32,
-                                   &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_depthwise_conv_params = [&add_scalar_int32,
-                                      &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      add_scalar_int32(builtin->depth_multiplier);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_fully_connected_params = [&check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_concatenation_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
-      add_scalar_int32(builtin->axis);
-      if (builtin->activation != kTfLiteActNone) {
-        logError("Concatenation does not support fused activation in NNAPI");
-        return kTfLiteError;
-      }
-      return kTfLiteOk;
-    };
-
-    auto add_softmax_params = [&add_scalar_float32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
-      add_scalar_float32(builtin->beta);
-    };
-
-    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
-      add_scalar_int32(builtin->block_size);
-    };
-
-    auto add_lstm_params = [&add_scalar_int32,
-                            &add_scalar_float32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
-      add_scalar_int32(builtin->activation);
-      add_scalar_float32(builtin->cell_clip);
-      add_scalar_float32(builtin->proj_clip);
-    };
-
-    // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [nnapi, subgraph, &node, &nn_model,
-                                            &next_id, &augmented_outputs]() {
-      if (node.temporaries->size == 0) return;
-      int scratch_buffer_index = node.temporaries->data[0];
-      const TfLiteTensor* tensor = subgraph->tensor(scratch_buffer_index);
-      ANeuralNetworksOperandType operand_type{
-          ANEURALNETWORKS_TENSOR_FLOAT32,
-          static_cast<uint32_t>(tensor->dims->size),
-          reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-          tensor->params.zero_point};
-      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-      augmented_outputs.insert(augmented_outputs.begin(), next_id++);
-    };
-
-    auto add_mean_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteReducerParams*>(data);
-      add_scalar_int32(builtin->keep_dims);
-    };
-
-    auto add_svdf_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
-      add_scalar_int32(builtin->rank);
-      add_scalar_int32(builtin->activation);
-    };
-
-    auto add_rnn_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
-      add_scalar_int32(builtin->activation);
-    };
-
-    auto add_squeeze_params = [&](void* data) {
-      const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data);
-      // Note that we add the squeeze dimensions even if the dimensions were
-      // unspecified (empty), as NNAPI requires the operand.
-      add_vector_int32(builtin->squeeze_dims,
-                       static_cast<uint32_t>(builtin->num_squeeze_dims));
-    };
-
-    // Handle optional input tensors.
-    auto add_optional_tensors = [nnapi, &nn_model, &augmented_inputs,
-                                 &next_id](int nn_type) {
-      for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
-        if (augmented_inputs[idx] == kOptionalTensor) {
-          const std::vector<uint32_t> dim = {0, 0};
-          ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
-          CHECK_NN(
-              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-          CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
-              nn_model, next_id, nullptr, 0))
-          augmented_inputs[idx] = next_id++;
-        }
-      }
-    };
-
-    int nnapi_version = 10;
-    ANeuralNetworksOperationType nn_op_type;
-
-    switch (builtin) {
-      case tflite::BuiltinOperator_ADD:
-        nn_op_type = ANEURALNETWORKS_ADD;
-        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
-        break;
-      case tflite::BuiltinOperator_MUL:
-        nn_op_type = ANEURALNETWORKS_MUL;
-        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
-        break;
-      case tflite::BuiltinOperator_AVERAGE_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_MAX_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_L2_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_CONV_2D: {
-        auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data);
-        if (builtin->dilation_width_factor != 1 ||
-            builtin->dilation_height_factor != 1 || node.inputs->size != 3) {
-          logError("NNAPI does not support dilated Conv2D.");
-          return kTfLiteError;
-        }
-      }
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_convolution_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_CONV_2D;
-        break;
-      case tflite::BuiltinOperator_RELU:
-        nn_op_type = ANEURALNETWORKS_RELU;
-        break;
-      case tflite::BuiltinOperator_RELU6:
-        nn_op_type = ANEURALNETWORKS_RELU6;
-        break;
-      case tflite::BuiltinOperator_TANH:
-        nn_op_type = ANEURALNETWORKS_TANH;
-        break;
-      case tflite::BuiltinOperator_FLOOR:
-        nn_op_type = ANEURALNETWORKS_FLOOR;
-        break;
-      case tflite::BuiltinOperator_LOGISTIC:
-        nn_op_type = ANEURALNETWORKS_LOGISTIC;
-        break;
-      case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_depthwise_conv_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
-        break;
-      case tflite::BuiltinOperator_CONCATENATION:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_concatenation_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_CONCATENATION;
-        break;
-      case tflite::BuiltinOperator_SOFTMAX:
-        add_softmax_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SOFTMAX;
-        break;
-      case tflite::BuiltinOperator_FULLY_CONNECTED:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_fully_connected_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
-        break;
-      case tflite::BuiltinOperator_RESHAPE:
-        if (node.inputs->size != 2) {
-          logError("NNAPI only supports 2-input RESHAPE");
-          return kTfLiteError;
-        }
-        nn_op_type = ANEURALNETWORKS_RESHAPE;
-        // add_reshape_params(node.builtin_data);
-        break;
-      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
-        add_space_to_depth_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
-        break;
-      case tflite::BuiltinOperator_LSTM: {
-        if (node.inputs->size + /* no of params */ 3 != 21) {
-          logError("NNAPI only supports 21-input LSTMs");
-          return kTfLiteError;
-        }
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kOutputStateTensor*/ 0]);
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kCellStateTensor*/ 1]);
-        add_lstm_params(node.builtin_data);
-        add_lstm_scratch_tensor_float32();
-        add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
-        nn_op_type = ANEURALNETWORKS_LSTM;
-        break;
-      }
-      case tflite::BuiltinOperator_SVDF: {
-        duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
-        add_svdf_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SVDF;
-        break;
-      }
-      case tflite::BuiltinOperator_RNN: {
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kHiddenStateTensor*/ 0]);
-        add_rnn_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_RNN;
-        break;
-      }
-      case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
-        nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
-        break;
-      case tflite::BuiltinOperator_PAD:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_PAD;
-        break;
-      case tflite::BuiltinOperator_MEAN:
-        nnapi_version = 11;  // require NNAPI 1.1
-        add_mean_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_MEAN;
-        break;
-      case tflite::BuiltinOperator_DIV:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_DIV;
-        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
-            reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation));
-        break;
-      case tflite::BuiltinOperator_SUB:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_SUB;
-        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
-            reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation));
-        break;
-      case tflite::BuiltinOperator_SQUEEZE:
-        nnapi_version = 11;  // requires NNAPI 1.1
-        add_squeeze_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SQUEEZE;
-        break;
-      case tflite::BuiltinOperator_TRANSPOSE:
-        // The permutation input tensor value dictates the output dimensions.
-        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
-        if ((node.inputs->size > 1) &&
-            (subgraph->tensor(node.inputs->data[1])->allocation_type !=
-             kTfLiteMmapRo)) {
-          logError("NNAPI does not yet support dynamic tensors.");
-          return kTfLiteError;
-        }
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_TRANSPOSE;
-        break;
-      case tflite::BuiltinOperator_L2_NORMALIZATION:
-        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
-        if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
-                ->activation != kTfLiteActNone) {
-          logError(
-              "NNAPI does not support L2Normalization with fused activations");
-          return kTfLiteError;
-        }
-        if ((node.inputs->size > 0) &&
-            (subgraph->tensor(node.inputs->data[0])->dims->size != 4)) {
-          logError("NNAPI only supports input rank 4 for L2Normalization");
-          return kTfLiteError;
-        }
-        break;
-      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
-        if (subgraph->tensor(node.outputs->data[0])->type != kTfLiteFloat32) {
-          logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
-                   builtin);
-          return kTfLiteError;
-        }
-        nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
-        break;
-      case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
-      case tflite::BuiltinOperator_LSH_PROJECTION:
-      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
-      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
-      case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
-      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
-      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
-      case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
-      case tflite::BuiltinOperator_PADV2:
-      case tflite::BuiltinOperator_RESIZE_BILINEAR:
-      case tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR:
-      case tflite::BuiltinOperator_CALL:
-      case tflite::BuiltinOperator_SKIP_GRAM:
-      case tflite::BuiltinOperator_RELU_N1_TO_1:
-      case tflite::BuiltinOperator_GATHER:
-      case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
-      case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
-      case tflite::BuiltinOperator_TOPK_V2:
-      case tflite::BuiltinOperator_SPLIT:
-      case tflite::BuiltinOperator_STRIDED_SLICE:
-      case tflite::BuiltinOperator_EXP:
-      case tflite::BuiltinOperator_COS:
-      case tflite::BuiltinOperator_LOG_SOFTMAX:
-      case tflite::BuiltinOperator_DEQUANTIZE:
-      case tflite::BuiltinOperator_DELEGATE:
-      case tflite::BuiltinOperator_CAST:
-      case tflite::BuiltinOperator_PRELU:
-      case tflite::BuiltinOperator_MAXIMUM:
-      case tflite::BuiltinOperator_MINIMUM:
-      case tflite::BuiltinOperator_ARG_MAX:
-      case tflite::BuiltinOperator_ARG_MIN:
-      case tflite::BuiltinOperator_GREATER:
-      case tflite::BuiltinOperator_GREATER_EQUAL:
-      case tflite::BuiltinOperator_LESS:
-      case tflite::BuiltinOperator_LESS_EQUAL:
-      case tflite::BuiltinOperator_NEG:
-      case tflite::BuiltinOperator_SELECT:
-      case tflite::BuiltinOperator_SLICE:
-      case tflite::BuiltinOperator_SIN:
-      case tflite::BuiltinOperator_LOG:
-      case tflite::BuiltinOperator_TRANSPOSE_CONV:
-      case tflite::BuiltinOperator_TILE:
-      case tflite::BuiltinOperator_EXPAND_DIMS:
-      case tflite::BuiltinOperator_SPARSE_TO_DENSE:
-      case tflite::BuiltinOperator_EQUAL:
-      case tflite::BuiltinOperator_NOT_EQUAL:
-      case tflite::BuiltinOperator_SUM:
-      case tflite::BuiltinOperator_REDUCE_MAX:
-      case tflite::BuiltinOperator_REDUCE_MIN:
-      case tflite::BuiltinOperator_REDUCE_PROD:
-      case tflite::BuiltinOperator_SQRT:
-      case tflite::BuiltinOperator_RSQRT:
-      case tflite::BuiltinOperator_SHAPE:
-      case tflite::BuiltinOperator_POW:
-      case tflite::BuiltinOperator_FAKE_QUANT:
-      case tflite::BuiltinOperator_PACK:
-      case tflite::BuiltinOperator_LOGICAL_OR:
-      case tflite::BuiltinOperator_ONE_HOT:
-      case tflite::BuiltinOperator_LOGICAL_AND:
-      case tflite::BuiltinOperator_LOGICAL_NOT:
-      case tflite::BuiltinOperator_UNPACK:
-      case tflite::BuiltinOperator_FLOOR_DIV:
-      case tflite::BuiltinOperator_REDUCE_ANY:
-      case tflite::BuiltinOperator_SQUARE:
-      case tflite::BuiltinOperator_ZEROS_LIKE:
-      case tflite::BuiltinOperator_FILL:
-      case tflite::BuiltinOperator_FLOOR_MOD:
-      case tflite::BuiltinOperator_RANGE:
-      case tflite::BuiltinOperator_LEAKY_RELU:
-      case tflite::BuiltinOperator_SQUARED_DIFFERENCE:
-      case tflite::BuiltinOperator_MIRROR_PAD:
-      case tflite::BuiltinOperator_ABS:
-      case tflite::BuiltinOperator_SPLIT_V:
-      case tflite::BuiltinOperator_UNIQUE:
-      case tflite::BuiltinOperator_CEIL:
-      case tflite::BuiltinOperator_REVERSE_V2:
-      case tflite::BuiltinOperator_ADD_N:
-      case tflite::BuiltinOperator_GATHER_ND:
-      case tflite::BuiltinOperator_WHERE:
-      case tflite::BuiltinOperator_RANK:
-      case tflite::BuiltinOperator_ELU:
-      case tflite::BuiltinOperator_REVERSE_SEQUENCE:
-      case tflite::BuiltinOperator_MATRIX_DIAG:
-      case tflite::BuiltinOperator_QUANTIZE:
-      case tflite::BuiltinOperator_MATRIX_SET_DIAG:
-        logError("Op code %d is currently not delegated to NNAPI", builtin);
-        return kTfLiteError;
-        break;
-      case tflite::BuiltinOperator_CUSTOM:
-        logError("Custom operations are not supported when using NNAPI.");
-        return kTfLiteError;
-        break;
-    }
-
-    if (nnapi_version == 11 && nnapi->android_sdk_version < 28) {
-      logError("Op %d needs NNAPI1.1", builtin);
-      return kTfLiteError;
-    }
-
-    // Add the operation.
-    RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_addOperation(
-        nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
-        augmented_inputs.data(),
-        static_cast<uint32_t>(augmented_outputs.size()),
-        reinterpret_cast<uint32_t*>(augmented_outputs.data())));
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
-  if (nn_model_ && nn_compiled_model_) return model_status_;
-
-  const NnApi* nnapi = NnApiImplementation();
-  // TODO(aselle): This is not correct. need to handle resize invalidation.
-  if (!nn_model_) {
-    CHECK_NN(nnapi->ANeuralNetworksModel_create(&nn_model_));
-
-    // Find which tensors should be added to NNAPI. TFLite has temporaries
-    // and RNN back-edges which are are not valid for NNAPI. We look through all
-    // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
-    // kOperandIdNotSet. addTensorOperands will replace those with the
-    // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
-    std::vector<int64_t> tensor_id_to_nnapi_id(subgraph->tensors_size(),
-                                               kOperandNotNeeded);
-    auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
-                                                       size_t count) {
-      for (int j = 0; j < count; j++) {
-        auto tensor_id = buf[j];
-        if (tensor_id != kOptionalTensor) {
-          tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet;
-        }
-      }
-    };
-    for (size_t i = 0; i < subgraph->nodes_size(); i++) {
-      const auto* node_and_registration = subgraph->node_and_registration(i);
-      const TfLiteNode& node = node_and_registration->first;
-      set_ids_to_not_set(node.inputs->data, node.inputs->size);
-      set_ids_to_not_set(node.outputs->data, node.outputs->size);
-    }
-    set_ids_to_not_set(subgraph->inputs().data(), subgraph->inputs().size());
-    set_ids_to_not_set(subgraph->outputs().data(), subgraph->outputs().size());
-
-    uint32_t next_id = 0;
-    RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
-        subgraph, nn_model_, &next_id, &tensor_id_to_nnapi_id));
-    RETURN_ERROR_IF_TFLITE_FAILED(
-        AddOpsAndParams(subgraph, nn_model_, next_id, &model_states_inputs_,
-                        &model_states_outputs_, tensor_id_to_nnapi_id));
-
-    std::vector<uint32_t> augmented_inputs;
-    MapAndAddTensorIds(subgraph->inputs().data(), subgraph->inputs().size(),
-                       &augmented_inputs, tensor_id_to_nnapi_id);
-    augmented_inputs.insert(augmented_inputs.end(),
-                            model_states_inputs_.begin(),
-                            model_states_inputs_.end());
-    std::vector<uint32_t> augmented_outputs;
-    MapAndAddTensorIds(subgraph->outputs().data(), subgraph->outputs().size(),
-                       &augmented_outputs, tensor_id_to_nnapi_id);
-    MapAndAddTensorIds(model_states_outputs_.data(),
-                       model_states_outputs_.size(), &augmented_outputs,
-                       tensor_id_to_nnapi_id);
-
-    CHECK_NN(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs(
-        nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
-        reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
-        static_cast<uint32_t>(augmented_outputs.size()),
-        reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
-
-    if (nnapi->android_sdk_version >= 28) {
-      CHECK_NN(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-          nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
-    }
-    CHECK_NN(nnapi->ANeuralNetworksModel_finish(nn_model_));
-  }
-  if (!nn_compiled_model_) {
-    CHECK_NN(nnapi->ANeuralNetworksCompilation_create(nn_model_,
-                                                      &nn_compiled_model_));
-    CHECK_NN(nnapi->ANeuralNetworksCompilation_finish(nn_compiled_model_));
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
-  if (!nn_model_) {
-    model_status_ = BuildGraph(subgraph);
-    if (model_status_ != kTfLiteOk) {
-      logError("Failed to build graph for NNAPI");
-    }
-  }
-  if (model_status_ != kTfLiteOk) {
-    return model_status_;
-  }
-
-  const NnApi* nnapi = NnApiImplementation();
-  ANeuralNetworksExecution* execution = nullptr;
-  CHECK_NN(
-      nnapi->ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
-
-  // Currently perform deep copy of input buffer
-  for (size_t i = 0; i < subgraph->inputs().size(); i++) {
-    int input = subgraph->inputs()[i];
-    // TODO(aselle): Is this what we want or do we want input instead?
-    // TODO(aselle): This should be called setInputValue maybe to be cons.
-    TfLiteTensor* tensor = subgraph->tensor(input);
-    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
-        execution, i, nullptr, tensor->data.raw, tensor->bytes));
-  }
-
-  // Tell nn api where to place final data.
-  for (size_t i = 0; i < subgraph->outputs().size(); i++) {
-    int output = subgraph->outputs()[i];
-    TfLiteTensor* tensor = subgraph->tensor(output);
-    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
-        execution, i, nullptr, tensor->data.raw, tensor->bytes));
-  }
-
-  // The state_out of previous invocation need to be mapped to state_in of
-  // current invocation.
-  for (size_t i = 0; i < model_states_outputs_.size(); i++) {
-    int state_tensor_idx = model_states_outputs_[i];
-    TfLiteTensor* tensor = subgraph->tensor(state_tensor_idx);
-    // Here we are using a deep copy for state_in tensors so that we are not
-    // reading and writing into the same buffer during a invocation.
-    // TODO(miaowang): using double shared buffer to minimize the copies.
-    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
-        execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
-        tensor->bytes));
-    // Tell NNAPI where to output the state_out.
-    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
-        execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
-        tensor->bytes));
-  }
-
-  // Currently use blocking compute.
-  ANeuralNetworksEvent* event = nullptr;
-  CHECK_NN(nnapi->ANeuralNetworksExecution_startCompute(execution, &event));
-  CHECK_NN(nnapi->ANeuralNetworksEvent_wait(event));
-  nnapi->ANeuralNetworksEvent_free(event);
-  nnapi->ANeuralNetworksExecution_free(execution);
-
-#if 0
-  printf("From the NN API:\n");
-  TfLiteTensor* tensor = subgraph->tensor(subgraph->outputs()[0]);
-  if (float* data =
-          subgraph->typed_tensor<float>(subgraph->outputs()[0])) {
-    size_t num = tensor->bytes / sizeof(float);
-    for (float* p = data; p < data + num; p++) {
-      printf(" %f", *p);
-    }
-    printf("\n");
-  }
-#endif
-
-  return kTfLiteOk;
-}
-
-bool NNAPIDelegate::IsSupported() {
-  return NnApiImplementation()->nnapi_exists;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/lite/nnapi_delegate.h b/tensorflow/lite/nnapi_delegate.h
deleted file mode 100644
index 2fc4bc8..0000000
--- a/tensorflow/lite/nnapi_delegate.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_NNAPI_DELEGATE_H_
-#define TENSORFLOW_LITE_NNAPI_DELEGATE_H_
-
-#include "tensorflow/lite/allocation.h"
-#include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/core/subgraph.h"
-#include "tensorflow/lite/interpreter.h"
-
-struct ANeuralNetworksModel;
-struct ANeuralNetworksMemory;
-struct ANeuralNetworksCompilation;
-
-namespace tflite {
-
-class NNAPIAllocation : public MMAPAllocation {
- public:
-  NNAPIAllocation(const char* filename, ErrorReporter* error_reporter);
-  ~NNAPIAllocation();
-
-  size_t offset(const void* ptr) const {
-    auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) -
-                         reinterpret_cast<const uint8_t*>(mmapped_buffer_);
-
-    return static_cast<size_t>(signed_offset);
-  }
-
-  ANeuralNetworksMemory* memory() const { return handle_; }
-  bool valid() const override { return handle_ != nullptr; }
-
- private:
-  mutable ANeuralNetworksMemory* handle_ = nullptr;
-};
-
-class NNAPIDelegate {
- public:
-  ~NNAPIDelegate();
-
-  // Convert a tflite graph to NNAPI
-  TfLiteStatus BuildGraph(Subgraph* subgraph);
-
-  // Run
-  TfLiteStatus Invoke(Subgraph* subgraph);
-
-  // Whether the current platform supports NNAPI delegation.
-  static bool IsSupported();
-
- private:
-  // The NN API model handle
-  ANeuralNetworksModel* nn_model_ = nullptr;
-  // The NN API compilation handle
-  ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
-  // Model status
-  TfLiteStatus model_status_ = kTfLiteOk;
-
-  // List of state tensors for LSTM, RNN, SVDF.
-  // NN API does not allow ops to maintain states across multiple
-  // invocations. We need to manually create state input tensors from
-  // corresponding state output tensors of TFLite operations, and map them
-  // correctly.
-  std::vector<int> model_states_inputs_;   // holds NNAPI operand ids
-  std::vector<int> model_states_outputs_;  // holds TFLite tensor ids
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/lite/nnapi_delegate_disabled.cc b/tensorflow/lite/nnapi_delegate_disabled.cc
deleted file mode 100644
index a8f2c0b..0000000
--- a/tensorflow/lite/nnapi_delegate_disabled.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/nnapi_delegate.h"
-
-#include <cassert>
-
-namespace tflite {
-
-NNAPIAllocation::NNAPIAllocation(const char* filename,
-                                 ErrorReporter* error_reporter)
-    : MMAPAllocation(filename, error_reporter) {
-  // The disabled variant should never be created.
-  assert(false);
-}
-
-NNAPIAllocation::~NNAPIAllocation() {}
-
-NNAPIDelegate::~NNAPIDelegate() {
-#define UNUSED_MEMBER(x) (void)(x)
-  UNUSED_MEMBER(nn_model_);
-  UNUSED_MEMBER(nn_compiled_model_);
-  UNUSED_MEMBER(model_status_);
-#undef UNUSED_MEMBER
-}
-
-TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
-  return kTfLiteError;
-}
-
-TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) { return kTfLiteError; }
-
-bool NNAPIDelegate::IsSupported() { return false; }
-
-}  // namespace tflite
diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc
index 1113bf0..a59af3d 100644
--- a/tensorflow/lite/optional_debug_tools.cc
+++ b/tensorflow/lite/optional_debug_tools.cc
@@ -56,6 +56,8 @@
       return "kTfLiteInt16";
     case kTfLiteComplex64:
       return "kTfLiteComplex64";
+    case kTfLiteFloat16:
+      return "kTfLiteFloat16";
   }
   return "(invalid)";
 }
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index aa5638f..452c53a 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -10,16 +10,21 @@
 
 cc_library(
     name = "profiler",
-    hdrs = ["profiler.h"],
+    hdrs = [
+        "buffered_profiler.h",
+        "noop_profiler.h",
+        "profiler.h",
+    ],
     copts = common_copts,
-    deps = [":profile_buffer"],
+    deps = [
+        ":profile_buffer",
+        "//tensorflow/lite/core/api",
+    ],
 )
 
 cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
-    copts = ["-DTFLITE_PROFILING_ENABLED"],
-    defines = ["TFLITE_PROFILING_ENABLED"],
     deps = [
         ":profiler",
         "//tensorflow/lite/testing:util",
@@ -31,7 +36,10 @@
     name = "profile_buffer",
     hdrs = ["profile_buffer.h"],
     copts = common_copts,
-    deps = [":time"],
+    deps = [
+        ":time",
+        "//tensorflow/lite/core/api",
+    ],
 )
 
 cc_library(
@@ -58,7 +66,7 @@
     hdrs = ["profile_summarizer.h"],
     copts = common_copts,
     deps = [
-        ":profiler",
+        ":profile_buffer",
         "//tensorflow/core:stats_calculator_portable",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/schema:schema_fbs",
@@ -71,6 +79,7 @@
     copts = common_copts,
     deps = [
         ":profile_summarizer",
+        ":profiler",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/kernels:kernel_util",
@@ -83,8 +92,6 @@
 cc_test(
     name = "profile_buffer_test",
     srcs = ["profile_buffer_test.cc"],
-    copts = ["-DTFLITE_PROFILING_ENABLED"],
-    defines = ["TFLITE_PROFILING_ENABLED"],
     deps = [
         ":profile_buffer",
         "//tensorflow/lite/testing:util",
diff --git a/tensorflow/lite/profiling/buffered_profiler.h b/tensorflow/lite/profiling/buffered_profiler.h
new file mode 100644
index 0000000..74acfe3
--- /dev/null
+++ b/tensorflow/lite/profiling/buffered_profiler.h
@@ -0,0 +1,108 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_
+#define TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_
+
+#include <vector>
+
+#include "tensorflow/lite/core/api/profiler.h"
+#include "tensorflow/lite/profiling/profile_buffer.h"
+
+namespace tflite {
+namespace profiling {
+
+// Controls whether profiling is enabled or disabled and collects profiles.
+// TFLite is used on platforms that don't have posix threads, so the profiler is
+// kept as simple as possible. It is designed to be used only on a single
+// thread.
+//
+// Profiles are collected using Scoped*Profile objects that begin and end a
+// profile event.
+// An example usage is shown in the example below:
+//
+// Say Worker class has a DoWork method and we are interested in profiling
+// the overall execution time for DoWork and time spent in Task1 and Task2
+// functions.
+//
+// class Worker {
+//  public:
+//   void DoWork() {
+//    ScopedProfile profile(&profiler, "DoWork");
+//    Task1();
+//    Task2();
+//    .....
+//   }
+//
+//   void Task1() {
+//    ScopedProfile profile(&profiler, "Task1");
+//    ....
+//   }
+//
+//   void Task2() {
+//    ScopedProfile profile(&profiler, "Task2");
+//   }
+//
+//    BufferedProfiler profiler;
+// };
+//
+// We instrument the functions that need to be profiled.
+//
+// Profiles can be collected by starting profiling and then retrieving the
+// profile events.
+//
+//  void ProfileWorker() {
+//    Worker worker;
+//    worker.profiler.StartProfiling();
+//    worker.DoWork();
+//    worker.profiler.StopProfiling();
+//    // Profiling is complete, extract profiles.
+//    auto profile_events = worker.profiler.GetProfileEvents();
+//  }
+//
+//
+class BufferedProfiler : public tflite::Profiler {
+ public:
+  // NOTE(review): args are presumably (capacity, enabled) - confirm.
+  BufferedProfiler() : buffer_(1024, false) {}
+
+  // Forwards the event to the underlying buffer and returns its handle.
+  uint32_t BeginEvent(const char* tag, EventType event_type,
+                      uint32_t event_metadata) override {
+    return buffer_.BeginEvent(tag, event_type, event_metadata);
+  }
+
+  // Ends the event previously returned by BeginEvent().
+  void EndEvent(uint32_t event_handle) override {
+    buffer_.EndEvent(event_handle);
+  }
+
+  void StartProfiling() { buffer_.SetEnabled(true); }   // Enables the buffer.
+  void StopProfiling() { buffer_.SetEnabled(false); }   // Disables the buffer.
+  void Reset() { buffer_.Reset(); }                     // Resets the buffer.
+  // Returns one pointer per buffered event, in buffer index order.
+  std::vector<const ProfileEvent*> GetProfileEvents() {
+    std::vector<const ProfileEvent*> profile_events;
+    profile_events.reserve(buffer_.Size());
+    for (size_t i = 0; i < buffer_.Size(); i++) {
+      // Pointers refer to buffer entries that later events may overwrite.
+      profile_events.push_back(buffer_.At(i));
+    }
+    return profile_events;
+  }
+
+ private:
+  // NOTE(review): private and not called anywhere in this class.
+  ProfileBuffer* GetProfileBuffer() { return &buffer_; }
+  ProfileBuffer buffer_;
+};
+
+}  // namespace profiling
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_
diff --git a/tensorflow/lite/profiling/noop_profiler.h b/tensorflow/lite/profiling/noop_profiler.h
new file mode 100644
index 0000000..18c12e1
--- /dev/null
+++ b/tensorflow/lite/profiling/noop_profiler.h
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_
+#define TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_
+
+#include <vector>
+
+#include "tensorflow/lite/core/api/profiler.h"
+#include "tensorflow/lite/profiling/profile_buffer.h"
+
+namespace tflite {
+namespace profiling {
+
+// No-op profiler for when profiling is disabled; it records no events.
+class NoopProfiler : public tflite::Profiler {
+ public:
+  NoopProfiler() {}
+
+  uint32_t BeginEvent(const char*, EventType, uint32_t) override { return 0; }
+  void EndEvent(uint32_t) override {}
+
+  void StartProfiling() {}
+  void StopProfiling() {}
+  void Reset() {}
+  // Always returns an empty list.
+  std::vector<const ProfileEvent*> GetProfileEvents() { return {}; }
+};
+
+}  // namespace profiling
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_
diff --git a/tensorflow/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h
index 2202df2..8e4aab4 100644
--- a/tensorflow/lite/profiling/profile_buffer.h
+++ b/tensorflow/lite/profiling/profile_buffer.h
@@ -18,24 +18,22 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <vector>
 
+#include "tensorflow/lite/core/api/profiler.h"
 #include "tensorflow/lite/profiling/time.h"
 
 namespace tflite {
 namespace profiling {
 
+constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1;
+
 // A profiling event.
 struct ProfileEvent {
   // Describes the type of event.
   // The event_metadata field may contain additional data for interpreting
   // the event.
-  enum class EventType {
-    // Default event type, the metadata field has no special significance.
-    DEFAULT = 0,
-    // The event is an operator invocation and the event_metadata field is the
-    // index of operator node.
-    OPERATOR_INVOKE_EVENT = 1
-  };
+  using EventType = tflite::Profiler::EventType;
 
   // Label of the event. This usually describes the event.
   const char* tag;
@@ -49,17 +47,6 @@
   // Extra data describing the details of the event.
   uint32_t event_metadata;
 };
-}  // namespace profiling
-}  // namespace tflite
-
-#ifdef TFLITE_PROFILING_ENABLED
-
-#include <sys/time.h>
-#include <vector>
-
-namespace tflite {
-namespace profiling {
-constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1;
 
 // A ring buffer of profile events.
 // This class is not thread safe.
@@ -128,7 +115,7 @@
   // Returns the profile event at the given index. If the index is invalid a
   // nullptr is returned. The return event may get overwritten if more events
   // are added to buffer.
-  const struct ProfileEvent* const At(size_t index) const {
+  const struct ProfileEvent* At(size_t index) const {
     size_t size = Size();
     if (index >= size) {
       return nullptr;
@@ -145,7 +132,8 @@
   uint32_t current_index_;
   std::vector<ProfileEvent> event_buffer_;
 };
+
 }  // namespace profiling
 }  // namespace tflite
-#endif  // TFLITE_PROFILING_ENABLED
+
 #endif  // TENSORFLOW_LITE_PROFILING_PROFILE_BUFFER_H_
diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h
index d4f5da7..d75269e 100644
--- a/tensorflow/lite/profiling/profile_summarizer.h
+++ b/tensorflow/lite/profiling/profile_summarizer.h
@@ -18,9 +18,9 @@
 
 #include <vector>
 
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/core/util/stats_calculator.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/profiling/profile_buffer.h"
 
 namespace tflite {
 namespace profiling {
diff --git a/tensorflow/lite/profiling/profile_summarizer_test.cc b/tensorflow/lite/profiling/profile_summarizer_test.cc
index bbb64b8..8891ac5 100644
--- a/tensorflow/lite/profiling/profile_summarizer_test.cc
+++ b/tensorflow/lite/profiling/profile_summarizer_test.cc
@@ -13,6 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/profiling/profile_summarizer.h"
+
 #include <string>
 #include <vector>
 
@@ -22,7 +24,7 @@
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/profiling/profile_summarizer.h"
+#include "tensorflow/lite/profiling/buffered_profiler.h"
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/version.h"
 
@@ -33,7 +35,6 @@
 
 const char* kOpName = "SimpleOpEval";
 
-#ifdef TFLITE_PROFILING_ENABLED
 TfLiteStatus SimpleOpEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input1 = tflite::GetInput(context, node, /*index=*/0);
   const TfLiteTensor* input2 = tflite::GetInput(context, node, /*index=*/1);
@@ -69,7 +70,6 @@
                                             1};
   return &registration;
 }
-#endif
 
 class SimpleOpModel : public SingleOpModel {
  public:
@@ -101,9 +101,8 @@
   EXPECT_GT(output.size(), 0);
 }
 
-#ifdef TFLITE_PROFILING_ENABLED
 TEST(ProfileSummarizerTest, Interpreter) {
-  Profiler profiler;
+  BufferedProfiler profiler;
   SimpleOpModel m;
   m.Init(RegisterSimpleOp);
   auto interpreter = m.GetInterpreter();
@@ -124,7 +123,7 @@
 }
 
 TEST(ProfileSummarizerTest, InterpreterPlusProfilingDetails) {
-  Profiler profiler;
+  BufferedProfiler profiler;
   SimpleOpModel m;
   m.Init(RegisterSimpleOpWithProfilingDetails);
   auto interpreter = m.GetInterpreter();
@@ -145,8 +144,6 @@
       << output;
 }
 
-#endif
-
 }  // namespace
 }  // namespace profiling
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/profiler.h b/tensorflow/lite/profiling/profiler.h
index dd45518..e75c90b 100644
--- a/tensorflow/lite/profiling/profiler.h
+++ b/tensorflow/lite/profiling/profiler.h
@@ -15,168 +15,23 @@
 #ifndef TENSORFLOW_LITE_PROFILING_PROFILER_H_
 #define TENSORFLOW_LITE_PROFILING_PROFILER_H_
 
-#include <vector>
+#include "tensorflow/lite/profiling/buffered_profiler.h"
+#include "tensorflow/lite/profiling/noop_profiler.h"
 
-#include "tensorflow/lite/profiling/profile_buffer.h"
+namespace tflite {
+namespace profiling {
 
+// TODO(b/131688504): Remove this and use runtime flags for profiler selection.
 #ifdef TFLITE_PROFILING_ENABLED
-
-namespace tflite {
-namespace profiling {
-class ScopedProfile;
-class ScopedOperatorProfile;
-
-// Controls whether profiling is enabled or disabled and collects profiles.
-// TFLite is used on platforms that don't have posix threads, so the profiler is
-// kept as simple as possible. It is designed to be used only on a single
-// thread.
-//
-// Profiles are collected using Scoped*Profile objects that begin and end a
-// profile event.
-// An example usage is shown in the example below:
-//
-// Say Worker class has a DoWork method and we are interested in profiling
-// the overall execution time for DoWork and time spent in Task1 and Task2
-// functions.
-//
-// class Worker {
-//  public:
-//   void DoWork() {
-//    ScopedProfile(&controller, "DoWork");
-//    Task1();
-//    Task2();
-//    .....
-//   }
-//
-//   void Task1() {
-//    ScopedProfile(&controller, "Task1");
-//    ....
-//   }
-//
-//   void Task2() {
-//    ScopedProfile(&controller, "Task2");
-//   }
-//
-//    Profiler profiler;
-// }
-//
-// We instrument the functions that need to be profiled.
-//
-// Profile can be collected by enable profiling and then getting profile
-// events.
-//
-//  void ProfileWorker() {
-//    Worker worker;
-//    worker.profiler.EnableProfiling();
-//    worker.DoWork();
-//    worker.profiler.DisableProfiling();
-//    // Profiling is complete, extract profiles.
-//    auto profile_events = worker.profiler.GetProfiles();
-//  }
-//
-//
-class Profiler {
- public:
-  Profiler() : buffer_(1024, false) {}
-
-  void StartProfiling() { buffer_.SetEnabled(true); }
-  void StopProfiling() { buffer_.SetEnabled(false); }
-  void Reset() { buffer_.Reset(); }
-  std::vector<const ProfileEvent*> GetProfileEvents() {
-    std::vector<const ProfileEvent*> profile_events;
-    profile_events.reserve(buffer_.Size());
-    for (size_t i = 0; i < buffer_.Size(); i++) {
-      profile_events.push_back(buffer_.At(i));
-    }
-    return profile_events;
-  }
-
- private:
-  friend class ScopedProfile;
-  friend class ScopedOperatorProfile;
-  ProfileBuffer* GetProfileBuffer() { return &buffer_; }
-  ProfileBuffer buffer_;
-};
-
-class ScopedProfile {
- public:
-  // Adds a profile event to profile that begins with the construction
-  // of object and ends when the object goes out of scope.
-  // The lifetime of tag should be at least the lifetime of profiler.
-
-  ScopedProfile(Profiler* profiler, const char* tag)
-      : buffer_(nullptr), event_handle_(0) {
-    if (profiler) {
-      buffer_ = profiler->GetProfileBuffer();
-      event_handle_ =
-          buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0);
-    }
-  }
-  ~ScopedProfile() {
-    if (buffer_) {
-      buffer_->EndEvent(event_handle_);
-    }
-  }
-
- private:
-  ProfileBuffer* buffer_;
-  int32_t event_handle_;
-};
-
-class ScopedOperatorProfile {
- public:
-  // Adds a profile event to profile that begins with the construction
-  // of object and ends when the object goes out of scope.
-  // The lifetime of tag should be at least the lifetime of profiler.
-  ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index)
-      : buffer_(nullptr), event_handle_(0) {
-    if (profiler) {
-      buffer_ = profiler->GetProfileBuffer();
-      event_handle_ = buffer_->BeginEvent(
-          tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index);
-    }
-  }
-
-  ~ScopedOperatorProfile() {
-    if (buffer_) {
-      buffer_->EndEvent(event_handle_);
-    }
-  }
-
- private:
-  ProfileBuffer* buffer_;
-  int32_t event_handle_;
-};
-
-}  // namespace profiling
-}  // namespace tflite
-
-#define VARNAME_UNIQ(name, ctr) name##ctr
-
-#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index) \
-  tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ(          \
-      _profile_, __COUNTER__)((profiler), (tag), (node_index))
-#define SCOPED_OPERATOR_PROFILE(profiler, node_index) \
-  SCOPED_TAGGED_OPERATOR_PROFILE((profiler), "OpInvoke", (node_index))
+using Profiler = BufferedProfiler;
 #else
+using Profiler = NoopProfiler;
+#endif  // TFLITE_PROFILING_ENABLED
 
-namespace tflite {
-namespace profiling {
-// A noop version of profiler when profiling is disabled.
-class Profiler {
- public:
-  Profiler() {}
-  void StartProfiling() {}
-  void StopProfiling() {}
-  void Reset() {}
-  std::vector<const ProfileEvent*> GetProfileEvents() { return {}; }
-};
 }  // namespace profiling
 }  // namespace tflite
 
-#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index)
-#define SCOPED_OPERATOR_PROFILE(profiler, node_index)
-
-#endif  // TFLITE_PROFILING_ENABLED
+#define SCOPED_TAGGED_OPERATOR_PROFILE TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE
+#define SCOPED_OPERATOR_PROFILE TFLITE_SCOPED_OPERATOR_PROFILE
 
 #endif  // TENSORFLOW_LITE_PROFILING_PROFILER_H_
diff --git a/tensorflow/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc
index addebab..44dc3a9 100644
--- a/tensorflow/lite/profiling/profiler_test.cc
+++ b/tensorflow/lite/profiling/profiler_test.cc
@@ -31,17 +31,17 @@
   return (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
 }
 
-void SleepForQuarterSecond(Profiler* profiler) {
+void SleepForQuarterSecond(tflite::Profiler* profiler) {
   ScopedProfile profile(profiler, "SleepForQuarter");
   std::this_thread::sleep_for(std::chrono::milliseconds(250));
 }
 
-void ChildFunction(Profiler* profiler) {
+void ChildFunction(tflite::Profiler* profiler) {
   ScopedProfile profile(profiler, "Child");
   SleepForQuarterSecond(profiler);
 }
 
-void ParentFunction(Profiler* profiler) {
+void ParentFunction(tflite::Profiler* profiler) {
   ScopedProfile profile(profiler, "Parent");
   for (int i = 0; i < 2; i++) {
     ChildFunction(profiler);
@@ -49,14 +49,14 @@
 }
 
 TEST(ProfilerTest, NoProfilesAreCollectedWhenDisabled) {
-  Profiler profiler;
+  BufferedProfiler profiler;
   ParentFunction(&profiler);
   auto profile_events = profiler.GetProfileEvents();
   EXPECT_EQ(0, profile_events.size());
 }
 
 TEST(ProfilingTest, ProfilesAreCollected) {
-  Profiler profiler;
+  BufferedProfiler profiler;
   profiler.StartProfiling();
   ParentFunction(&profiler);
   profiler.StopProfiling();
@@ -101,7 +101,7 @@
 }
 
 TEST(ProfilingTest, ScopedProfile) {
-  Profiler profiler;
+  BufferedProfiler profiler;
   profiler.StartProfiling();
   { SCOPED_OPERATOR_PROFILE(&profiler, 1); }
   profiler.StopProfiling();
@@ -109,6 +109,15 @@
   EXPECT_EQ(1, profile_events.size());
 }
 
+TEST(ProfilingTest, NoopProfiler) {
+  NoopProfiler profiler;
+  profiler.StartProfiling();
+  { SCOPED_OPERATOR_PROFILE(&profiler, 1); }
+  profiler.StopProfiling();
+  auto profile_events = profiler.GetProfileEvents();
+  EXPECT_EQ(0, profile_events.size());
+}
+
 }  // namespace
 }  // namespace profiling
 }  // namespace tflite
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index fae6540..72b85d3 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -73,6 +73,7 @@
         ":op_hint",
         ":util",
         "//tensorflow/lite/experimental/examples/lstm:tflite_lstm_ops",
+        "//tensorflow/lite/experimental/tensorboard:ops_util",
         "//tensorflow/lite/python/optimize:calibrator",
         "//tensorflow/python:graph_util",
         "//tensorflow/python/keras",
@@ -156,6 +157,17 @@
 )
 
 py_library(
+    name = "wrap_toco",
+    srcs = [
+        "wrap_toco.py",
+    ],
+    deps = [
+        "//tensorflow/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
     name = "lite_constants",
     srcs = ["lite_constants.py"],
     srcs_version = "PY2AND3",
@@ -173,9 +185,9 @@
     deps = [
         ":lite_constants",
         ":util",
+        ":wrap_toco",
         "//tensorflow/lite/toco:model_flags_proto_py",
         "//tensorflow/lite/toco:toco_flags_proto_py",
-        "//tensorflow/lite/toco/python:tensorflow_wrap_toco",
         "//tensorflow/lite/toco/python:toco_from_protos",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:platform",
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 78ed45bd..d06a5a6 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -27,21 +27,14 @@
 
 from tensorflow.lite.python import lite_constants
 from tensorflow.lite.python import util
+from tensorflow.lite.python import wrap_toco
 from tensorflow.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.lite.toco import types_pb2 as _types_pb2
 from tensorflow.python.platform import resource_loader as _resource_loader
 from tensorflow.python.util import deprecation
-from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies.
-_toco_python = LazyLoader(
-    "tensorflow_wrap_toco", globals(),
-    "tensorflow.lite.toco.python."
-    "tensorflow_wrap_toco")
-del LazyLoader
 
 # Find the toco_from_protos binary using the resource loader if using from
 # bazel, otherwise we are in a pip where console_scripts already has
@@ -81,6 +74,11 @@
   # WARNING: Experimental interface, subject to change.
   SELECT_TF_OPS = "SELECT_TF_OPS"
 
+  # Convert model using only TensorFlow Lite quantized int8 operations.
+  # Specifying this will throw an error for operations that do not yet have
+  # quantized implementations.
+  TFLITE_BUILTINS_INT8 = "TFLITE_BUILTINS_INT8"
+
   def __str__(self):
     return self.value
 
@@ -119,8 +117,8 @@
   # switch this on.
   if not _toco_from_proto_bin:
     try:
-      model_str = _toco_python.TocoConvert(model_flags_str, toco_flags_str,
-                                           input_data_str)
+      model_str = wrap_toco.wrapped_toco_convert(model_flags_str,
+                                                 toco_flags_str, input_data_str)
       return model_str
     except Exception as e:
       raise ConverterError("TOCO failed: %s" % e)
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index 4c07708..693f41c 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -34,7 +34,7 @@
 from tensorflow.python.platform import test
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("Incompatible with 2.0.")
 class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -177,7 +177,7 @@
         "QUANTIZED_UINT8.", str(error.exception))
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("Incompatible with 2.0.")
 class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
diff --git a/tensorflow/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py
index b217792..6d9d37a 100644
--- a/tensorflow/lite/python/interpreter_test.py
+++ b/tensorflow/lite/python/interpreter_test.py
@@ -148,6 +148,20 @@
                                  'Invoke called on model that is not ready'):
       interpreter.invoke()
 
+  def testInvalidModelFileContent(self):
+    with self.assertRaisesRegexp(
+        ValueError, '`model_path` or `model_content` must be specified.'):
+      interpreter_wrapper.Interpreter(model_path=None, model_content=None)
+
+  def testInvalidIndex(self):
+    interpreter = interpreter_wrapper.Interpreter(
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/permute_float.tflite'))
+    interpreter.allocate_tensors()
+    # Invalid tensor index passed.
+    with self.assertRaisesRegexp(ValueError, 'Tensor with no shape found.'):
+      interpreter._get_tensor_details(4)
+
 
 class InterpreterTensorAccessorTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
index 22ec88b..110c3ac 100644
--- a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
@@ -32,6 +32,8 @@
   switch (tf_lite_type) {
     case kTfLiteFloat32:
       return NPY_FLOAT32;
+    case kTfLiteFloat16:
+      return NPY_FLOAT16;
     case kTfLiteInt32:
       return NPY_INT32;
     case kTfLiteInt16:
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 5783f3f..ec8aa1d 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -24,9 +24,11 @@
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
+from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.lite.experimental.examples.lstm.rnn import dynamic_rnn  # pylint: disable=unused-import
 from tensorflow.lite.experimental.examples.lstm.rnn_cell import TFLiteLSTMCell  # pylint: disable=unused-import
 from tensorflow.lite.experimental.examples.lstm.rnn_cell import TfLiteRNNCell  # pylint: disable=unused-import
+from tensorflow.lite.experimental.tensorboard.ops_util import get_potentially_supported_ops  # pylint: disable=unused-import
 from tensorflow.lite.python import lite_constants as constants
 from tensorflow.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.lite.python.convert import ConverterError  # pylint: disable=unused-import
@@ -47,7 +49,6 @@
 from tensorflow.lite.python.util import is_frozen_graph as _is_frozen_graph
 from tensorflow.lite.python.util import run_graph_optimizations as _run_graph_optimizations
 from tensorflow.lite.python.util import set_tensor_shapes as _set_tensor_shapes
-from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
 from tensorflow.python.eager import context
@@ -74,25 +75,29 @@
   Some optimizations may come at the cost of accuracy.
   """
 
+  # Default optimization strategy.
+  #
+  # Converter will do its best to improve size and latency based on the
+  # information provided.
+  # Enhanced optimizations can be gained by providing a representative_dataset.
+  # This is recommended, and is currently equivalent to the modes below.
+  # Currently, weights will be quantized and if representative_dataset is
+  # provided, activations for quantizable operations will also be quantized.
+  DEFAULT = "DEFAULT"
+
   # Optimize for size.
   #
   # Optimizations that reduce the size of the model.
   # The model size will be reduced.
-  # Current behavior:
-  # - If RepresentativeDataset is not provided, weights will be quantized and
-  #   activations will remain float.
-  # - If RepresentativeDataset is provided, weights and activations will be
-  #   quantized.
+  # Currently, weights will be quantized and if representative_dataset is
+  # provided, activations for quantizable operations will also be quantized.
   OPTIMIZE_FOR_SIZE = "OPTIMIZE_FOR_SIZE"
 
   # Optimize for latency.
   #
   # Optimizations that reduce the latency of the model.
-  # Current behavior:
-  # - If RepresentativeDataset is not provided, weights will be quantized and
-  #   activations will remain float.
-  # - If RepresentativeDataset is provided, weights and activations will be
-  #   quantized.
+  # Currently, weights will be quantized and if representative_dataset is
+  # provided, activations for quantizable operations will also be quantized.
   OPTIMIZE_FOR_LATENCY = "OPTIMIZE_FOR_LATENCY"
 
   def __str__(self):
@@ -139,8 +144,64 @@
     self.supported_ops = supported_ops
 
 
+class TFLiteConverterBase(object):
+  """Base class to share functionality between V1 and V2 converters."""
+
+  def __init__(self):
+    self.representative_dataset = None
+    self.optimizations = []
+    self._target_ops = set([OpsSet.TFLITE_BUILTINS])
+
+  def _grappler_config(self):
+    is_only_flex_enabled = set([OpsSet.SELECT_TF_OPS]) == set(self._target_ops)
+    optimizers = ["constfold"]
+    if is_only_flex_enabled:
+      # The layout optimizer turns NHWC to NCHW. This provides performance
+      # optimizations when Flex mode is enabled. However, this is not compatible
+      # with builtin ops.
+      optimizers.append("layout")
+    return _get_grappler_config(optimizers)
+
+  def _validate_representative_dataset(self):
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        self.representative_dataset = RepresentativeDataset(
+            self.representative_dataset)
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+    elif self._int8_target_required():
+      raise ValueError("representative_dataset is required when specifying "
+                       "TFLITE_BUILTINS_INT8 target.")
+
+  def _int8_target_required(self):
+    return set([OpsSet.TFLITE_BUILTINS_INT8]) == set(self._target_ops)
+
+  def _is_post_training_optimize(self):
+    return (self._int8_target_required() or bool(
+        set(self.optimizations).intersection([
+            Optimize.OPTIMIZE_FOR_LATENCY, Optimize.OPTIMIZE_FOR_SIZE,
+            Optimize.DEFAULT
+        ])))
+
+  def _is_weight_only_quantize(self):
+    return (self._is_post_training_optimize() and
+            (self.representative_dataset is None))
+
+  def _is_calibration_quantize(self):
+    return self._is_post_training_optimize() and self.representative_dataset
+
+  def _calibrate_quantize_model(self, result, inference_input_type,
+                                inference_output_type):
+    allow_float = not self._int8_target_required()
+    calibrate_quantize = _calibrator.Calibrator(result)
+    return calibrate_quantize.calibrate_and_quantize(
+        self.representative_dataset.input_gen, inference_input_type,
+        inference_output_type, allow_float)
+
+
 @_tf_export("lite.TFLiteConverter", v1=[])
-class TFLiteConverterV2(object):
+class TFLiteConverterV2(TFLiteConverterBase):
   """Converts a TensorFlow model into TensorFlow Lite model.
 
   Attributes:
@@ -152,12 +213,7 @@
     target_spec: Experimental flag, subject to change. Specification of target
       device.
     optimizations: Experimental flag, subject to change. A list of optimizations
-      to apply when converting the model. The converter applies the
-      optimizations by giving priority to the optimizations specified earlier in
-      the list. E.g. `[optimize.OPTIMIZE_FOR_SIZE,
-      optimize.OPTIMIZE_FOR_LATENCY]` requires the converter to do both size and
-      latency optimizations giving priority to size optimizations over latency
-      optimizations.
+      to apply when converting the model. E.g. `[Optimize.DEFAULT]`
     representative_dataset: A representative dataset that can be used to
       generate input and output samples for the model. The converter can use the
       dataset to evaluate different optimizations.
@@ -191,12 +247,11 @@
         Variables. This is only required when the tf.AutoTrackable object is not
         maintained by the user (e.g. `from_saved_model`).
     """
+    super(TFLiteConverterV2, self).__init__()
     self._funcs = funcs
     self._trackable_obj = trackable_obj
     self.allow_custom_ops = False
     self.target_spec = TargetSpec()
-    self.representative_dataset = None
-    self.optimizations = []
 
   @classmethod
   def from_concrete_functions(cls, funcs):
@@ -240,7 +295,10 @@
     Raises:
       Invalid signature keys.
     """
-    saved_model = _load(saved_model_dir, tags)
+    # Ensures any graphs created in Eager mode are able to run. This is required
+    # in order to create a tf.estimator.Exporter that exports a TFLite model.
+    with context.eager_mode():
+      saved_model = _load(saved_model_dir, tags)
     if not signature_keys:
       signature_keys = saved_model.signatures
 
@@ -280,6 +338,7 @@
         Invalid quantization parameters.
     """
     # TODO(b/130297984): Add support for converting multiple function.
+    self._target_ops = self.target_spec.supported_ops
     if len(self._funcs) != 1:
       raise ValueError("This converter can only convert a single "
                        "ConcreteFunction. Converting multiple functions is "
@@ -294,14 +353,12 @@
     output_tensors = frozen_func.outputs
 
     # Run a Grappler pass.
-    is_only_flex_enabled = set(
-        [OpsSet.SELECT_TF_OPS]) == self.target_spec.supported_ops
-    config = _get_grappler_config(enable_layout_optimizer=is_only_flex_enabled)
+    graph_def = frozen_func.graph.as_graph_def()
     graph_def = _run_graph_optimizations(
-        frozen_func.graph.as_graph_def(),
+        graph_def,
         input_tensors,
         output_tensors,
-        config,
+        config=self._grappler_config(),
         graph=frozen_func.graph)
 
     # Checks dimensions in input tensor.
@@ -318,29 +375,12 @@
         shape[0] = 1
         tensor.set_shape(shape)
 
-    if self.representative_dataset:
-      if not isinstance(self.representative_dataset, RepresentativeDataset):
-        raise TypeError("`representative_dataset` must be an instance of "
-                        "`RepresentativeDataset`")
-      if self.representative_dataset.input_gen is None:
-        raise ValueError(
-            "Provide an input generator for `representative_dataset`")
-
-    # TODO(shashishekhar): For now use optimizations order is ignored.
-    # Both size and latency optimizations decide whether to apply post
-    # training optimizations.
-    post_training_optimize = bool(
-        len(
-            set(self.optimizations)
-            & set([Optimize.OPTIMIZE_FOR_LATENCY, Optimize.OPTIMIZE_FOR_SIZE])))
-    # Do weights only quantization if there is no dataset for calibration.
-    weights_only_quantize_flag = (
-        post_training_optimize and (self.representative_dataset is None))
+    self._validate_representative_dataset()
 
     converter_kwargs = {
         "input_format": constants.TENSORFLOW_GRAPHDEF,
         "allow_custom_ops": self.allow_custom_ops,
-        "post_training_quantize": weights_only_quantize_flag,
+        "post_training_quantize": self._is_weight_only_quantize(),
         "target_ops": self.target_spec.supported_ops,
     }
 
@@ -351,16 +391,15 @@
         output_tensors=output_tensors,
         **converter_kwargs)
 
-    if self.representative_dataset and post_training_optimize:
-      calibrate_quantize = _calibrator.Calibrator(result)
-      result = calibrate_quantize.calibrate_and_quantize(
-          self.representative_dataset.input_gen)
+    if self._is_calibration_quantize():
+      result = self._calibrate_quantize_model(result, constants.FLOAT,
+                                              constants.FLOAT)
 
     return result
 
 
 @_tf_export(v1=["lite.TFLiteConverter"])
-class TFLiteConverter(object):
+class TFLiteConverter(TFLiteConverterBase):
   """Convert a TensorFlow model into `output_format`.
 
   This is used to convert from a TensorFlow GraphDef or SavedModel into either a
@@ -415,7 +454,7 @@
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
     post_training_quantize: deprecated, please specify
-     `[optimize.OPTIMIZE_FOR_SIZE]` for `optimizations` instead. Boolean
+     `[Optimize.DEFAULT]` for `optimizations` instead. Boolean
      indicating whether to quantize the weights of the converted float model.
      Model size will be reduced and there will be latency improvements
      (at the cost of accuracy). (default False)
@@ -428,13 +467,8 @@
     target_ops: Experimental flag, subject to change. Set of OpsSet
       options indicating which converter to use.
       (default set([OpsSet.TFLITE_BUILTINS]))
-    optimizations: Experimental flag, subject to change, A list of
-      optimizations to apply when converting the model. The converter applies
-      the optimizations by giving priority to the optimizations specified
-      earlier in the list. E.g.
-      `[optimize.OPTIMIZE_FOR_SIZE, optimize.OPTIMIZE_FOR_LATENCY]` requires
-      the converter to do both size and latency optimizations giving priority
-      to size optimizations over latency optimizations.
+    optimizations: Experimental flag, subject to change. A list of optimizations
+      to apply when converting the model. E.g. `[Optimize.DEFAULT]`
     representative_dataset: A representative dataset that can be used to
       generate input and output samples for the model. The converter can use
       the dataset to evaluate different optimizations.
@@ -488,6 +522,7 @@
     Raises:
       ValueError: Invalid arguments.
     """
+    super(TFLiteConverter, self).__init__()
     self._graph_def = graph_def
     self._input_tensors = input_tensors
     self._output_tensors = output_tensors
@@ -505,8 +540,6 @@
     self.dump_graphviz_dir = None
     self.dump_graphviz_video = False
     self.target_ops = set([OpsSet.TFLITE_BUILTINS])
-    self.representative_dataset = None
-    self.optimizations = []
 
     # Attributes are used by models that cannot be loaded into TensorFlow.
     if not self._has_valid_tensors():
@@ -732,11 +765,10 @@
   def __setattr__(self, name, value):
     if name == "post_training_quantize":
       warnings.warn("Property %s is deprecated, "
-                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    "please use optimizations=[Optimize.DEFAULT]"
                     " instead." % name)
       if value:
-        # Use OPTIMIZE_FOR_SIZE for post training for now.
-        self.optimizations = [Optimize.OPTIMIZE_FOR_SIZE]
+        self.optimizations = [Optimize.DEFAULT]
       else:
         self.optimizations = []
       return
@@ -745,9 +777,9 @@
   def __getattribute__(self, name):
     if name == "post_training_quantize":
       warnings.warn("Property %s is deprecated, "
-                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    "please use optimizations=[Optimize.DEFAULT]"
                     " instead." % name)
-      return Optimize.OPTIMIZE_FOR_SIZE in set(self.optimizations)
+      return Optimize.DEFAULT in set(self.optimizations)
     return object.__getattribute__(self, name)
 
   def convert(self):
@@ -762,6 +794,7 @@
         Input shape is not specified.
         None value for dimension in input_tensor.
     """
+    self._target_ops = self.target_ops
     # Checks dimensions in input tensor.
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
@@ -795,24 +828,13 @@
                          "tensors '{0}'.".format(",".join(invalid_stats)))
     else:
       quantized_stats = None
-    if self.representative_dataset:
-      if not isinstance(self.representative_dataset, RepresentativeDataset):
-        self.representative_dataset = RepresentativeDataset(
-            self.representative_dataset)
-      if self.representative_dataset.input_gen is None:
-        raise ValueError(
-            "Provide an input generator for representative_dataset")
 
-    post_training_optimize = bool(
-        len(set(self.optimizations) & set([Optimize.OPTIMIZE_FOR_LATENCY,
-                                           Optimize.OPTIMIZE_FOR_SIZE])))
-    # Do weights only quantization if there is no dataset for calibration.
-    weights_only_quantize_flag = (
-        post_training_optimize and (self.representative_dataset is None))
+    self._validate_representative_dataset()
 
     toco_inference_input_type = self.inference_input_type
     inference_input_type = self.inference_input_type
     inference_output_type = self.inference_output_type
+    post_training_optimize = self._is_post_training_optimize()
     if post_training_optimize:
       # Post training optimizations require that TOCO outputs a float model.
       if self.inference_type != constants.FLOAT:
@@ -825,7 +847,8 @@
       if inference_output_type is None:
         inference_output_type = constants.FLOAT
 
-    if weights_only_quantize_flag:
+    weight_only_quantize = self._is_weight_only_quantize()
+    if weight_only_quantize:
       # Currently, weight only quantization requires float inputs and outputs.
       if (inference_input_type != constants.FLOAT or
           inference_output_type != constants.FLOAT):
@@ -849,22 +872,20 @@
         "reorder_across_fake_quant": self.reorder_across_fake_quant,
         "change_concat_input_ranges": self.change_concat_input_ranges,
         "allow_custom_ops": self.allow_custom_ops,
-        "post_training_quantize": weights_only_quantize_flag,
+        "post_training_quantize": weight_only_quantize,
         "target_ops": self.target_ops,
         "dump_graphviz_dir": self.dump_graphviz_dir,
         "dump_graphviz_video": self.dump_graphviz_video
     }
 
-    optimized_graph = None
-    if self.inference_type == constants.QUANTIZED_UINT8:
-      optimized_graph = self._graph_def
-    else:
+    optimized_graph = self._graph_def
+    if self.inference_type != constants.QUANTIZED_UINT8:
       try:
-        is_only_flex_enabled = set([OpsSet.SELECT_TF_OPS]) == self.target_ops
-        config = _get_grappler_config(
-            enable_layout_optimizer=is_only_flex_enabled)
         optimized_graph = _run_graph_optimizations(
-            self._graph_def, self._input_tensors, self._output_tensors, config)
+            self._graph_def,
+            self._input_tensors,
+            self._output_tensors,
+            config=self._grappler_config())
       except Exception:
         optimized_graph = self._graph_def
 
@@ -882,11 +903,9 @@
           output_arrays=self._output_arrays,
           **converter_kwargs)
 
-    if self.representative_dataset and post_training_optimize:
-      calibrate_quantize = _calibrator.Calibrator(result)
-      result = calibrate_quantize.calibrate_and_quantize(
-          self.representative_dataset.input_gen, inference_input_type,
-          inference_output_type)
+    if self._is_calibration_quantize():
+      result = self._calibrate_quantize_model(result, inference_input_type,
+                                              inference_output_type)
 
     return result
 
diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py
index b3418fd..3ca6519 100644
--- a/tensorflow/lite/python/lite_flex_test.py
+++ b/tensorflow/lite/python/lite_flex_test.py
@@ -31,7 +31,7 @@
 from tensorflow.python.training.tracking import tracking
 
 
-@test_util.run_v1_only('b/120545219')
+@test_util.run_v1_only('Incompatible with 2.0.')
 class FromSessionTest(test_util.TensorFlowTestCase):
 
   def testFlexMode(self):
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index e72ce07..d2a82bb 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -20,6 +20,7 @@
 
 import os
 import tempfile
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.lite.python import lite
@@ -27,12 +28,14 @@
 from tensorflow.lite.python.interpreter import Interpreter
 from tensorflow.python import keras
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
@@ -84,7 +87,7 @@
     self.assertTrue(converter._has_valid_tensors())
 
 
-@test_util.run_v1_only('b/120545219')
+@test_util.run_v1_only('Incompatible with 2.0.')
 class FromSessionTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -502,8 +505,7 @@
 
     quantized_converter.post_training_quantize = True
     self.assertTrue(quantized_converter.post_training_quantize)
-    self.assertEqual(quantized_converter.optimizations,
-                     [lite.Optimize.OPTIMIZE_FOR_SIZE])
+    self.assertEqual(quantized_converter.optimizations, [lite.Optimize.DEFAULT])
 
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
@@ -531,17 +533,17 @@
     # Convert quantized weights model.
     quantized_converter = lite.TFLiteConverter.from_session(
         sess, [in_tensor_1], [out_tensor])
-    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
 
     # Ensure that the quantized weights tflite model is smaller.
     self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
-  def testPostTrainingCalibrateAndQuantize(self):
+  def _getCalibrationQuantizeModel(self):
     np.random.seed(0)
-    inp = array_ops.placeholder(dtype=dtypes.float32, shape=(1, 5, 5, 3),
-                                name='input')
+    inp = array_ops.placeholder(
+        dtype=dtypes.float32, shape=(1, 5, 5, 3), name='input')
     conv = nn_ops.conv2d(
         inp,
         filter=array_ops.ones([3, 3, 3, 16]),
@@ -553,6 +555,10 @@
       for _ in range(5):
         yield [np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)]
 
+    return (inp, output, calibration_gen)
+
+  def testPostTrainingCalibrateAndQuantize(self):
+    inp, output, calibration_gen = self._getCalibrationQuantizeModel()
     sess = session.Session()
 
     # Convert float model.
@@ -560,10 +566,41 @@
     float_tflite = float_converter.convert()
     self.assertTrue(float_tflite)
 
-    # Convert quantized weights model.
+    # Convert quantized model.
     quantized_converter = lite.TFLiteConverter.from_session(
         sess, [inp], [output])
-    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
+    quantized_converter.representative_dataset = calibration_gen
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # The default input and output types should be float.
+    interpreter = Interpreter(model_content=quantized_tflite)
+    interpreter.allocate_tensors()
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertLess(len(quantized_tflite), len(float_tflite))
+
+  def testCalibrateAndQuantizeBuiltinInt8(self):
+    inp, output, calibration_gen = self._getCalibrationQuantizeModel()
+    sess = session.Session()
+
+    # Convert float model.
+    float_converter = lite.TFLiteConverter.from_session(sess, [inp], [output])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
+
+    # Convert model by specifying target spec (instead of optimizations), since
+    # when targeting an integer only backend, quantization is mandatory.
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [inp], [output])
+    quantized_converter.target_ops = [lite.OpsSet.TFLITE_BUILTINS_INT8]
     quantized_converter.representative_dataset = calibration_gen
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
@@ -582,20 +619,7 @@
     self.assertLess(len(quantized_tflite), len(float_tflite))
 
   def testPostTrainingCalibrateAndQuantizeInt8Inputs(self):
-    np.random.seed(0)
-    inp = array_ops.placeholder(dtype=dtypes.float32, shape=(1, 5, 5, 3),
-                                name='input')
-    conv = nn_ops.conv2d(
-        inp,
-        filter=array_ops.ones([3, 3, 3, 16]),
-        strides=[1, 1, 1, 1],
-        padding='SAME')
-    output = nn_ops.relu(conv, name='output')
-
-    def calibration_gen():
-      for _ in range(5):
-        yield [np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)]
-
+    inp, output, calibration_gen = self._getCalibrationQuantizeModel()
     sess = session.Session()
 
     # Convert float model.
@@ -608,7 +632,7 @@
         sess, [inp], [output])
     quantized_converter.inference_input_type = lite_constants.INT8
     quantized_converter.inference_output_type = lite_constants.INT8
-    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
     quantized_converter.representative_dataset = calibration_gen
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
@@ -714,8 +738,86 @@
     self.assertTrue(([1] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+  def testInferenceInputOutputTypeFloatDefault(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
 
-@test_util.run_v1_only('b/120545219')
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+
+  def testInferenceInputOutputTypeQuantizedUint8Default(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor + in_tensor, min=0., max=1., name='output')
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    converter.inference_type = lite_constants.QUANTIZED_UINT8
+    converter.quantized_input_stats = {'Placeholder': (0., 1.)}  # mean, std_dev
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.uint8, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.uint8, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+
+  def testReusingConverterWithDifferentPostTrainingQuantization(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor + in_tensor, min=0., max=1., name='output')
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+
+    converter.post_training_quantize = True
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    converter.post_training_quantize = False
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+
+@test_util.run_v1_only('Incompatible with 2.0.')
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -855,7 +957,30 @@
         'Unable to parse input file \'{}\'.'.format(graph_def_file),
         str(error.exception))
 
-  # TODO(nupurgarg): Test model loading in open source.
+  def testFloatTocoConverter(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Write graph to file.
+    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pb')
+    write_graph(sess.graph_def, '', graph_def_file, False)
+    sess.close()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_frozen_graph(graph_def_file,
+                                                     ['Placeholder'], ['add'])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Ensure the model is able to load.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+
+class FromFrozenGraphObjectDetection(test_util.TensorFlowTestCase):
+
   def _initObjectDetectionArgs(self):
     # Initializes the arguments required for the object detection model.
     # Looks for the model file which is saved in a different location internally
@@ -941,29 +1066,8 @@
         'input_shapes must contain a value for each item in input_array.',
         str(error.exception))
 
-  def testFloatTocoConverter(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    _ = in_tensor + in_tensor
-    sess = session.Session()
 
-    # Write graph to file.
-    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pb')
-    write_graph(sess.graph_def, '', graph_def_file, False)
-    sess.close()
-
-    # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_frozen_graph(graph_def_file,
-                                                     ['Placeholder'], ['add'])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    # Ensure the model is able to load.
-    interpreter = Interpreter(model_content=tflite_model)
-    interpreter.allocate_tensors()
-
-
-@test_util.run_v1_only('b/120545219')
+@test_util.run_v1_only('Incompatible with 2.0.')
 class FromSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSavedModel(self, shape):
@@ -1123,63 +1227,71 @@
     return config
 
 
-@test_util.run_v1_only('b/120545219')
-class FromKerasFile(test_util.TensorFlowTestCase):
+@test_util.run_v1_only('Incompatible with 2.0.')
+class FromKerasFile(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def setUp(self):
-    keras.backend.clear_session()
+    super(FromKerasFile, self).setUp()
+    self._keras_file = None
+    self._custom_objects = None
+    if not context.executing_eagerly():
+      keras.backend.clear_session()
+
+  def tearDown(self):
+    if self._keras_file:
+      os.remove(self._keras_file)
+    super(FromKerasFile, self).tearDown()
 
   def _getSequentialModel(self, include_custom_layer=False):
-    with session.Session().as_default():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      if include_custom_layer:
-        model.add(MyAddLayer(1.0))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(),
-          metrics=[keras.metrics.categorical_accuracy],
-          sample_weight_mode='temporal')
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3, 3))
-      model.train_on_batch(x, y)
-      model.predict(x)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_shape=(3,)))
+    if include_custom_layer:
+      model.add(MyAddLayer(1.0))
+    model.add(keras.layers.RepeatVector(3))
+    model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+    model.compile(
+        loss=keras.losses.MSE,
+        optimizer='sgd',
+        metrics=[keras.metrics.categorical_accuracy],
+        sample_weight_mode='temporal')
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3, 3))
+    model.train_on_batch(x, y)
+    model.predict(x)
 
-      try:
-        fd, keras_file = tempfile.mkstemp('.h5')
-        keras.models.save_model(model, keras_file)
-      finally:
-        os.close(fd)
+    fd, self._keras_file = tempfile.mkstemp('.h5')  # Bind fd before try.
+    try:
+      keras.models.save_model(model, self._keras_file)
+    finally:
+      os.close(fd)
 
-      if include_custom_layer:
-        custom_objects = {'MyAddLayer': MyAddLayer}
-        return keras_file, custom_objects
-      return keras_file
+    if include_custom_layer:
+      self._custom_objects = {'MyAddLayer': MyAddLayer}
 
-  def testSequentialModel(self):
+  @parameterized.named_parameters(('_graph', context.graph_mode),
+                                  ('_eager', context.eager_mode))
+  def testSequentialModel(self, test_context):
     """Test a Sequential tf.keras model with default inputs."""
-    keras_file = self._getSequentialModel()
+    with test_context():
+      self._getSequentialModel()
 
-    converter = lite.TFLiteConverter.from_keras_model_file(keras_file)
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+      converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file)
+      tflite_model = converter.convert()
+      self.assertTrue(tflite_model)
 
     # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
+    self.assertLen(input_details, 1)
     self.assertEqual('dense_input', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
     self.assertTrue(([1, 3] == input_details[0]['shape']).all())
     self.assertEqual((0., 0.), input_details[0]['quantization'])
 
     output_details = interpreter.get_output_details()
-    self.assertEqual(1, len(output_details))
-    self.assertEqual('time_distributed/Reshape_1', output_details[0]['name'])
+    self.assertLen(output_details, 1)
     self.assertEqual(np.float32, output_details[0]['dtype'])
     self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
@@ -1190,22 +1302,22 @@
     interpreter.invoke()
     tflite_result = interpreter.get_tensor(output_details[0]['index'])
 
-    keras_model = keras.models.load_model(keras_file)
+    keras_model = keras.models.load_model(self._keras_file)
     keras_result = keras_model.predict(input_data)
 
     np.testing.assert_almost_equal(tflite_result, keras_result, 5)
-    os.remove(keras_file)
 
-  def testCustomLayer(self):
+  @parameterized.named_parameters(('_graph', context.graph_mode),
+                                  ('_eager', context.eager_mode))
+  def testCustomLayer(self, test_context):
     """Test a Sequential tf.keras model with default inputs."""
-    keras_file, custom_objects = self._getSequentialModel(
-        include_custom_layer=True)
+    with test_context():
+      self._getSequentialModel(include_custom_layer=True)
 
-    converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, custom_objects=custom_objects)
-
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+      converter = lite.TFLiteConverter.from_keras_model_file(
+          self._keras_file, custom_objects=self._custom_objects)
+      tflite_model = converter.convert()
+      self.assertTrue(tflite_model)
 
     # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
@@ -1221,47 +1333,44 @@
     tflite_result = interpreter.get_tensor(output_details[0]['index'])
 
     keras_model = keras.models.load_model(
-        keras_file, custom_objects=custom_objects)
+        self._keras_file, custom_objects=self._custom_objects)
     keras_result = keras_model.predict(input_data)
 
     np.testing.assert_almost_equal(tflite_result, keras_result, 5)
-    os.remove(keras_file)
 
   def testSequentialModelInputArray(self):
     """Test a Sequential tf.keras model testing input arrays argument."""
-    keras_file = self._getSequentialModel()
+    self._getSequentialModel()
 
     # Invalid input array raises error.
     with self.assertRaises(ValueError) as error:
       lite.TFLiteConverter.from_keras_model_file(
-          keras_file, input_arrays=['invalid-input'])
+          self._keras_file, input_arrays=['invalid-input'])
     self.assertEqual("Invalid tensors 'invalid-input' were found.",
                      str(error.exception))
 
     # Valid input array.
     converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, input_arrays=['dense_input'])
+        self._keras_file, input_arrays=['dense_input'])
     tflite_model = converter.convert()
-    os.remove(keras_file)
     self.assertTrue(tflite_model)
 
   def testSequentialModelInputShape(self):
     """Test a Sequential tf.keras model testing input shapes argument."""
-    keras_file = self._getSequentialModel()
+    self._getSequentialModel()
 
     # Passing in shape of invalid input array raises error.
     with self.assertRaises(ValueError) as error:
       converter = lite.TFLiteConverter.from_keras_model_file(
-          keras_file, input_shapes={'invalid-input': [2, 3]})
+          self._keras_file, input_shapes={'invalid-input': [2, 3]})
     self.assertEqual(
         "Invalid tensor 'invalid-input' found in tensor shapes map.",
         str(error.exception))
 
     # Passing in shape of valid input array.
     converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, input_shapes={'dense_input': [2, 3]})
+        self._keras_file, input_shapes={'dense_input': [2, 3]})
     tflite_model = converter.convert()
-    os.remove(keras_file)
     self.assertTrue(tflite_model)
 
     # Check input shape from converted model.
@@ -1269,31 +1378,32 @@
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
+    self.assertLen(input_details, 1)
     self.assertEqual('dense_input', input_details[0]['name'])
     self.assertTrue(([2, 3] == input_details[0]['shape']).all())
 
   def testSequentialModelOutputArray(self):
     """Test a Sequential tf.keras model testing output arrays argument."""
-    keras_file = self._getSequentialModel()
+    self._getSequentialModel()
 
     # Invalid output array raises error.
     with self.assertRaises(ValueError) as error:
       lite.TFLiteConverter.from_keras_model_file(
-          keras_file, output_arrays=['invalid-output'])
+          self._keras_file, output_arrays=['invalid-output'])
     self.assertEqual("Invalid tensors 'invalid-output' were found.",
                      str(error.exception))
 
     # Valid output array.
     converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, output_arrays=['time_distributed/Reshape_1'])
+        self._keras_file, output_arrays=['time_distributed/Reshape_1'])
     tflite_model = converter.convert()
-    os.remove(keras_file)
     self.assertTrue(tflite_model)
 
-  def testFunctionalModel(self):
+  @parameterized.named_parameters(('_graph', context.graph_mode),
+                                  ('_eager', context.eager_mode))
+  def testFunctionalModel(self, test_context):
     """Test a Functional tf.keras model with default inputs."""
-    with session.Session().as_default():
+    with test_context():
       inputs = keras.layers.Input(shape=(3,), name='input')
       x = keras.layers.Dense(2)(inputs)
       output = keras.layers.Dense(3)(x)
@@ -1301,38 +1411,37 @@
       model = keras.models.Model(inputs, output)
       model.compile(
           loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(),
+          optimizer='sgd',
           metrics=[keras.metrics.categorical_accuracy])
       x = np.random.random((1, 3))
       y = np.random.random((1, 3))
       model.train_on_batch(x, y)
 
       model.predict(x)
-      fd, keras_file = tempfile.mkstemp('.h5')
+      fd, self._keras_file = tempfile.mkstemp('.h5')
       try:
-        keras.models.save_model(model, keras_file)
+        keras.models.save_model(model, self._keras_file)
       finally:
         os.close(fd)
 
-    # Convert to TFLite model.
-    converter = lite.TFLiteConverter.from_keras_model_file(keras_file)
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+      # Convert to TFLite model.
+      converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file)
+      tflite_model = converter.convert()
+      self.assertTrue(tflite_model)
 
     # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
+    self.assertLen(input_details, 1)
     self.assertEqual('input', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
     self.assertTrue(([1, 3] == input_details[0]['shape']).all())
     self.assertEqual((0., 0.), input_details[0]['quantization'])
 
     output_details = interpreter.get_output_details()
-    self.assertEqual(1, len(output_details))
-    self.assertEqual('dense_1/BiasAdd', output_details[0]['name'])
+    self.assertLen(output_details, 1)
     self.assertEqual(np.float32, output_details[0]['dtype'])
     self.assertTrue(([1, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
@@ -1343,55 +1452,51 @@
     interpreter.invoke()
     tflite_result = interpreter.get_tensor(output_details[0]['index'])
 
-    keras_model = keras.models.load_model(keras_file)
+    keras_model = keras.models.load_model(self._keras_file)
     keras_result = keras_model.predict(input_data)
 
     np.testing.assert_almost_equal(tflite_result, keras_result, 5)
-    os.remove(keras_file)
 
   def testFunctionalModelMultipleInputs(self):
     """Test a Functional tf.keras model with multiple inputs and outputs."""
-    with session.Session().as_default():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      model = keras.models.Model([a, b], [d, e])
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(),
-          metrics=[keras.metrics.mae],
-          loss_weights=[1., 0.5])
+    model = keras.models.Model([a, b], [d, e])
+    model.compile(
+        loss=keras.losses.MSE,
+        optimizer='sgd',
+        metrics=[keras.metrics.mae],
+        loss_weights=[1., 0.5])
 
-      input_a_np = np.random.random((10, 3))
-      input_b_np = np.random.random((10, 3))
-      output_d_np = np.random.random((10, 4))
-      output_e_np = np.random.random((10, 4))
-      model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+    model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
 
-      model.predict([input_a_np, input_b_np], batch_size=5)
-      fd, keras_file = tempfile.mkstemp('.h5')
-      try:
-        keras.models.save_model(model, keras_file)
-      finally:
-        os.close(fd)
+    model.predict([input_a_np, input_b_np], batch_size=5)
+    fd, self._keras_file = tempfile.mkstemp('.h5')
+    try:
+      keras.models.save_model(model, self._keras_file)
+    finally:
+      os.close(fd)
 
     # Convert to TFLite model.
-    converter = lite.TFLiteConverter.from_keras_model_file(keras_file)
+    converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file)
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
-    os.remove(keras_file)
-
     # Check values from converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(2, len(input_details))
+    self.assertLen(input_details, 2)
     self.assertEqual('input_a', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
     self.assertTrue(([1, 3] == input_details[0]['shape']).all())
@@ -1403,7 +1508,7 @@
     self.assertEqual((0., 0.), input_details[1]['quantization'])
 
     output_details = interpreter.get_output_details()
-    self.assertEqual(2, len(output_details))
+    self.assertLen(output_details, 2)
     self.assertEqual('dense_1/BiasAdd', output_details[0]['name'])
     self.assertEqual(np.float32, output_details[0]['dtype'])
     self.assertTrue(([1, 4] == output_details[0]['shape']).all())
@@ -1416,32 +1521,31 @@
 
   def testFunctionalSequentialModel(self):
     """Test a Functional tf.keras model containing a Sequential model."""
-    with session.Session().as_default():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-      model = keras.models.Model(model.input, model.output)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_shape=(3,)))
+    model.add(keras.layers.RepeatVector(3))
+    model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+    model = keras.models.Model(model.input, model.output)
 
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(),
-          metrics=[keras.metrics.categorical_accuracy],
-          sample_weight_mode='temporal')
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3, 3))
-      model.train_on_batch(x, y)
-      model.predict(x)
+    model.compile(
+        loss=keras.losses.MSE,
+        optimizer='sgd',
+        metrics=[keras.metrics.categorical_accuracy],
+        sample_weight_mode='temporal')
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3, 3))
+    model.train_on_batch(x, y)
+    model.predict(x)
 
-      model.predict(x)
-      fd, keras_file = tempfile.mkstemp('.h5')
-      try:
-        keras.models.save_model(model, keras_file)
-      finally:
-        os.close(fd)
+    model.predict(x)
+    fd, self._keras_file = tempfile.mkstemp('.h5')
+    try:
+      keras.models.save_model(model, self._keras_file)
+    finally:
+      os.close(fd)
 
     # Convert to TFLite model.
-    converter = lite.TFLiteConverter.from_keras_model_file(keras_file)
+    converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file)
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -1450,14 +1554,14 @@
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
+    self.assertLen(input_details, 1)
     self.assertEqual('dense_input', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
     self.assertTrue(([1, 3] == input_details[0]['shape']).all())
     self.assertEqual((0., 0.), input_details[0]['quantization'])
 
     output_details = interpreter.get_output_details()
-    self.assertEqual(1, len(output_details))
+    self.assertLen(output_details, 1)
     self.assertEqual('time_distributed/Reshape_1', output_details[0]['name'])
     self.assertEqual(np.float32, output_details[0]['dtype'])
     self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all())
@@ -1469,17 +1573,16 @@
     interpreter.invoke()
     tflite_result = interpreter.get_tensor(output_details[0]['index'])
 
-    keras_model = keras.models.load_model(keras_file)
+    keras_model = keras.models.load_model(self._keras_file)
     keras_result = keras_model.predict(input_data)
 
     np.testing.assert_almost_equal(tflite_result, keras_result, 5)
-    os.remove(keras_file)
 
   def testSequentialModelTocoConverter(self):
     """Test a Sequential tf.keras model with deprecated TocoConverter."""
-    keras_file = self._getSequentialModel()
+    self._getSequentialModel()
 
-    converter = lite.TocoConverter.from_keras_model_file(keras_file)
+    converter = lite.TocoConverter.from_keras_model_file(self._keras_file)
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -1487,17 +1590,23 @@
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
-  def testInferenceInputOutputTypeFloatDefault(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    out_tensor = in_tensor + in_tensor
+
+@test_util.run_v1_only('Incompatible with 2.0.')
+class GrapplerTest(test_util.TensorFlowTestCase):
+
+  def testConstantFolding(self):
+    # Constant folding handles the tf.broadcast_to operation which was not
+    # supported by TFLite at the time this test was added.
+    in_tensor = array_ops.placeholder(shape=[3, 3], dtype=dtypes.float32)
+    y_const = constant_op.constant([1., 2., 3.])
+    y_broadcast = gen_array_ops.broadcast_to(y_const, [3, 3])
+    out_tensor = math_ops.matmul(in_tensor, y_broadcast, name='output')
     sess = session.Session()
 
-    # Convert model and ensure model is not None.
+    # Convert model.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
     tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
 
     # Check values from converted model.
     interpreter = Interpreter(model_content=tflite_model)
@@ -1507,63 +1616,19 @@
     self.assertEqual(1, len(input_details))
     self.assertEqual('Placeholder', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
-    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
-
-    output_details = interpreter.get_output_details()
-    self.assertEqual(1, len(output_details))
-    self.assertEqual('add', output_details[0]['name'])
-    self.assertEqual(np.float32, output_details[0]['dtype'])
-    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
-
-  def testInferenceInputOutputTypeQuantizedUint8Default(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    out_tensor = array_ops.fake_quant_with_min_max_args(
-        in_tensor + in_tensor, min=0., max=1., name='output')
-    sess = session.Session()
-
-    # Convert model and ensure model is not None.
-    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
-                                                  [out_tensor])
-    converter.inference_type = lite_constants.QUANTIZED_UINT8
-    converter.quantized_input_stats = {'Placeholder': (0., 1.)}  # mean, std_dev
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    # Check values from converted model.
-    interpreter = Interpreter(model_content=tflite_model)
-    interpreter.allocate_tensors()
-
-    input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
-    self.assertEqual('Placeholder', input_details[0]['name'])
-    self.assertEqual(np.uint8, input_details[0]['dtype'])
-    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertTrue(([3, 3] == input_details[0]['shape']).all())
 
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('output', output_details[0]['name'])
-    self.assertEqual(np.uint8, output_details[0]['dtype'])
-    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([3, 3] == output_details[0]['shape']).all())
 
-  def testReusingConverterWithDifferentPostTrainingQuantization(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    out_tensor = array_ops.fake_quant_with_min_max_args(
-        in_tensor + in_tensor, min=0., max=1., name='output')
-    sess = session.Session()
 
-    # Convert model and ensure model is not None.
-    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
-                                                  [out_tensor])
+class ImportOpsUtilTest(test_util.TensorFlowTestCase):
 
-    converter.post_training_quantize = True
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    converter.post_training_quantize = False
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+  def testGetPotentiallySupportedOps(self):
+    self.assertIsNotNone(lite.get_potentially_supported_ops())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py
index ff8df5a..fad1462 100644
--- a/tensorflow/lite/python/lite_v2_test.py
+++ b/tensorflow/lite/python/lite_v2_test.py
@@ -26,7 +26,13 @@
 from tensorflow.python import keras
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model.save import save
@@ -149,6 +155,90 @@
     self.assertIn('can only convert a single ConcreteFunction',
                   str(error.exception))
 
+  def _getCalibrationQuantizeModel(self):
+    np.random.seed(0)
+
+    root = tracking.AutoTrackable()
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(shape=[1, 5, 5, 3], dtype=dtypes.float32)
+    ])
+    def func(inp):
+      conv = nn_ops.conv2d(
+          inp,
+          filter=array_ops.ones([3, 3, 3, 16]),
+          strides=[1, 1, 1, 1],
+          padding='SAME')
+      output = nn_ops.relu(conv, name='output')
+      return output
+
+    def calibration_gen():
+      for _ in range(5):
+        yield [np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)]
+
+    root.f = func
+    to_save = root.f.get_concrete_function()
+    return (to_save, calibration_gen)
+
+  def testPostTrainingCalibrateAndQuantize(self):
+    func, calibration_gen = self._getCalibrationQuantizeModel()
+
+    # Convert float model.
+    float_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
+
+    # Convert quantized model.
+    quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
+    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
+    quantized_converter.representative_dataset = calibration_gen
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # The default input and output types should be float.
+    interpreter = Interpreter(model_content=quantized_tflite)
+    interpreter.allocate_tensors()
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertLess(len(quantized_tflite), len(float_tflite))
+
+  def testCalibrateAndQuantizeBuiltinInt8(self):
+    func, calibration_gen = self._getCalibrationQuantizeModel()
+
+    # Convert float model.
+    float_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
+
+    # Convert model by specifying target spec (instead of optimizations), since
+    # when targeting an integer-only backend, quantization is mandatory.
+    quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
+    quantized_converter.target_spec.supported_ops = [
+        lite.OpsSet.TFLITE_BUILTINS_INT8
+    ]
+    quantized_converter.representative_dataset = calibration_gen
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # The default input and output types should be float.
+    interpreter = Interpreter(model_content=quantized_tflite)
+    interpreter.allocate_tensors()
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertLess(len(quantized_tflite), len(float_tflite))
+
 
 class FromSavedModelTest(TestModels):
 
@@ -308,5 +398,34 @@
       np.testing.assert_almost_equal(tf_result[0], tflite_result, 5)
 
 
+class GrapplerTest(TestModels):
+
+  @test_util.run_v2_only
+  def testConstantFolding(self):
+    # Constant folding handles the tf.broadcast_to operation which was not
+    # supported by TFLite at the time this test was added.
+    input_data = constant_op.constant([1., 2., 3., 4., 5., 6., 7., 8., 9.],
+                                      shape=[3, 3])
+
+    @def_function.function
+    def func(x):
+      y_const = constant_op.constant([1., 2., 3.])
+      y_broadcast = gen_array_ops.broadcast_to(y_const, [3, 3])
+      return math_ops.matmul(x, y_broadcast)
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func])
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    np.testing.assert_array_equal(expected_value.numpy(), actual_value)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc
index 3ebc94e..8ea376c 100644
--- a/tensorflow/lite/python/optimize/calibration_wrapper.cc
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc
@@ -61,6 +61,8 @@
       return TensorType_FLOAT32;  // TODO(b/129336260): No schema type for none.
     case kTfLiteFloat32:
       return TensorType_FLOAT32;
+    case kTfLiteFloat16:
+      return TensorType_FLOAT16;
     case kTfLiteInt32:
       return TensorType_INT32;
     case kTfLiteUInt8:
@@ -186,7 +188,8 @@
 }
 
 PyObject* CalibrationWrapper::QuantizeModel(int input_py_type,
-                                            int output_py_type) {
+                                            int output_py_type,
+                                            bool allow_float) {
   TfLiteType input_type = python_utils::TfLiteTypeFromPyType(input_py_type);
   TfLiteType output_type = python_utils::TfLiteTypeFromPyType(output_py_type);
   if (input_type == kTfLiteNoType || output_type == kTfLiteNoType) {
@@ -199,7 +202,7 @@
   flatbuffers::FlatBufferBuilder builder;
   auto status = tflite::optimize::QuantizeModel(
       &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type),
-      TfLiteTypeToSchemaType(output_type), error_reporter_.get());
+      TfLiteTypeToSchemaType(output_type), allow_float, error_reporter_.get());
   if (status != kTfLiteOk) {
     error_reporter_->exception();
     return nullptr;
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.h b/tensorflow/lite/python/optimize/calibration_wrapper.h
index 801a100..3fe1629 100644
--- a/tensorflow/lite/python/optimize/calibration_wrapper.h
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.h
@@ -59,7 +59,8 @@
 
   PyObject* FeedTensor(PyObject* input_value);
 
-  PyObject* QuantizeModel(int input_py_type, int output_py_type);
+  PyObject* QuantizeModel(int input_py_type, int output_py_type,
+                          bool allow_float);
 
  private:
   // CalibrationWrapper is not copyable or assignable. We avoid the use of
diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py
index 665d4a3..a9eb679 100644
--- a/tensorflow/lite/python/optimize/calibrator.py
+++ b/tensorflow/lite/python/optimize/calibrator.py
@@ -54,7 +54,8 @@
     if not self._calibrator:
       raise ValueError("Failed to parse the model.")
 
-  def calibrate_and_quantize(self, dataset_gen, input_type, output_type):
+  def calibrate_and_quantize(self, dataset_gen, input_type, output_type,
+                             allow_float):
     """Calibrates the model with specified generator and then quantizes it.
 
     Returns:
@@ -64,10 +65,14 @@
       dataset_gen: A generator that generates calibration samples.
       input_type: A tf.dtype representing the desired real-value input type.
       output_type: A tf.dtype representing the desired real-value output type.
+      allow_float: A boolean. False if the resulting model cannot perform float
+                   computation, useful when targeting an integer-only backend.
+                   If False, an error will be thrown if an operation cannot be
+                   quantized; otherwise the model will fall back to float ops.
     """
     self._calibrator.Prepare()
     for calibration_sample in dataset_gen():
       self._calibrator.FeedTensor(calibration_sample)
     return self._calibrator.QuantizeModel(
         np.dtype(input_type.as_numpy_dtype()).num,
-        np.dtype(output_type.as_numpy_dtype()).num)
+        np.dtype(output_type.as_numpy_dtype()).num, allow_float)
diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py
index 1bb0175..ca4a86c 100644
--- a/tensorflow/lite/python/optimize/calibrator_test.py
+++ b/tensorflow/lite/python/optimize/calibrator_test.py
@@ -39,8 +39,25 @@
       for _ in range(10):
         yield [np.ones(shape=(1, 5, 5, 3), dtype=np.float32)]
 
-    quantized_model = quantizer.calibrate_and_quantize(
-        input_gen, constants.FLOAT, constants.FLOAT)
+    quantized_model = quantizer.calibrate_and_quantize(input_gen,
+                                                       constants.FLOAT,
+                                                       constants.FLOAT, False)
+    self.assertIsNotNone(quantized_model)
+
+  def test_calibration_with_quantization_allow_float(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator for the model.
+    def input_gen():
+      for _ in range(10):
+        yield [np.ones(shape=(1, 5, 5, 3), dtype=np.float32)]
+
+    quantized_model = quantizer.calibrate_and_quantize(input_gen,
+                                                       constants.FLOAT,
+                                                       constants.FLOAT, True)
     self.assertIsNotNone(quantized_model)
 
   def test_calibration_with_quantization_multiple_inputs(self):
@@ -56,8 +73,9 @@
       for _ in range(10):
         yield [np.ones(shape=(1, 8, 8, 3), dtype=np.float32) for _ in range(4)]
 
-    quantized_model = quantizer.calibrate_and_quantize(
-        input_gen, constants.FLOAT, constants.FLOAT)
+    quantized_model = quantizer.calibrate_and_quantize(input_gen,
+                                                       constants.FLOAT,
+                                                       constants.FLOAT, False)
     self.assertIsNotNone(quantized_model)
 
   def test_invalid_model_buffer(self):
@@ -78,7 +96,7 @@
 
     with self.assertRaises(RuntimeError):
       quantizer.calibrate_and_quantize(empty_input_gen, constants.FLOAT,
-                                       constants.FLOAT)
+                                       constants.FLOAT, False)
 
   def test_invalid_shape_calibrator_gen(self):
     model_path = resource_loader.get_path_to_datafile(
@@ -93,7 +111,7 @@
 
     with self.assertRaisesWithRegexpMatch(ValueError, 'Dimension mismatch'):
       quantizer.calibrate_and_quantize(input_gen, constants.FLOAT,
-                                       constants.FLOAT)
+                                       constants.FLOAT, False)
 
   def test_invalid_type_calibrator_gen(self):
     model_path = resource_loader.get_path_to_datafile(
@@ -108,7 +126,7 @@
 
     with self.assertRaises(ValueError):
       quantizer.calibrate_and_quantize(input_gen, constants.FLOAT,
-                                       constants.FLOAT)
+                                       constants.FLOAT, False)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 65d9ae1..530d8d6 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -205,16 +205,11 @@
     ValueError: Unsupported file format.
   """
   # Load the model.
-  if os.path.isdir(flags.input_file):
-    converter = lite.TFLiteConverterV2.from_saved_model(flags.input_file)
-  elif (flags.input_file.endswith(".h5") or
-        flags.input_file.endswith(".keras") or
-        flags.input_file.endswith(".hdf5")):
-    model = keras.models.load_model(flags.input_file)
+  if flags.saved_model_dir:
+    converter = lite.TFLiteConverterV2.from_saved_model(flags.saved_model_dir)
+  elif flags.keras_model_file:
+    model = keras.models.load_model(flags.keras_model_file)
     converter = lite.TFLiteConverterV2.from_keras_model(model)
-  else:
-    raise ValueError("File format of '{}' is not supported.".format(
-        flags.input_file))
 
   # Convert the model.
   tflite_model = converter.convert()
@@ -468,12 +463,17 @@
       type=str,
       help="Full filepath of the output file.",
       required=True)
-  parser.add_argument(
-      "--input_file",
+
+  # Input file flags.
+  input_file_group = parser.add_mutually_exclusive_group(required=True)
+  input_file_group.add_argument(
+      "--saved_model_dir",
       type=str,
-      help=("Full filepath of input model file. Accepted formats are "
-            "SavedModel and tf.Keras HDF5 model file"),
-      required=True)
+      help="Full path of the directory containing the SavedModel.")
+  input_file_group.add_argument(
+      "--keras_model_file",
+      type=str,
+      help="Full filepath of HDF5 file containing tf.Keras model.")
   return parser
 
 
diff --git a/tensorflow/lite/python/util.py b/tensorflow/lite/python/util.py
index e91cc26..3a0352f 100644
--- a/tensorflow/lite/python/util.py
+++ b/tensorflow/lite/python/util.py
@@ -20,7 +20,6 @@
 
 from tensorflow.core.protobuf import config_pb2 as _config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2 as _meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2 as _rewriter_config_pb2
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
 from tensorflow.lite.toco import types_pb2 as _types_pb2
@@ -32,6 +31,7 @@
 # Map of tf.dtypes to TFLite types_flag_pb2.
 _MAP_TF_TO_TFLITE_TYPES = {
     dtypes.float32: _types_pb2.FLOAT,
+    dtypes.float16: _types_pb2.FLOAT16,
     dtypes.int32: _types_pb2.INT32,
     dtypes.int64: _types_pb2.INT64,
     dtypes.string: _types_pb2.STRING,
@@ -148,33 +148,19 @@
           raise ValueError(message)
 
 
-def get_grappler_config(enable_layout_optimizer=False, function_only=False):
+def get_grappler_config(optimizers_list):
   """Creates a tf.compat.v1.ConfigProto for configuring Grappler.
 
   Args:
-    enable_layout_optimizer: Bool indicating whether to run the layout
-      optimizer. This turns NHCW to NCHW. This provides performance
-      optimizations when Flex mode is enabled. (default False)
-    function_only: Bool indiciating whether to only run the function optimizer.
-      This inlines functions and is required for freezing models with functions.
-      (default False)
+    optimizers_list: List of optimizer names (strings) to add to the config.
 
   Returns:
     tf.ConfigProto.
   """
   config = _config_pb2.ConfigProto()
   rewrite_options = config.graph_options.rewrite_options
-  if function_only:
-    rewrite_options.optimizers.append("function")
-  else:
-    if enable_layout_optimizer:
-      rewrite_options.layout_optimizer = _rewriter_config_pb2.RewriterConfig.ON
-    else:
-      rewrite_options.layout_optimizer = _rewriter_config_pb2.RewriterConfig.OFF
-
-    # Avoid remapping as it creates ops like _FusedConv2D, which are not
-    # supported by TFLite.
-    rewrite_options.remapping = _rewriter_config_pb2.RewriterConfig.OFF
+  for optimizer in optimizers_list:
+    rewrite_options.optimizers.append(optimizer)
   return config
 
 
@@ -242,7 +228,7 @@
     return _convert_op_hints_if_present(sess, output_tensors)
 
   # Runs a Grappler pass in order to inline any functions in the graph.
-  config = get_grappler_config(function_only=True)
+  config = get_grappler_config(["function"])
   graph_def = run_graph_optimizations(
       sess.graph_def, input_tensors, output_tensors, config, graph=sess.graph)
 
diff --git a/tensorflow/lite/python/util_test.py b/tensorflow/lite/python/util_test.py
index 2aff735..65b53bc 100644
--- a/tensorflow/lite/python/util_test.py
+++ b/tensorflow/lite/python/util_test.py
@@ -29,7 +29,7 @@
 
 
 # TODO(nupurgarg): Add test for Grappler and frozen graph related functions.
-@test_util.run_v1_only("")
+@test_util.run_v1_only("Incompatible with 2.0.")
 class UtilTest(test_util.TensorFlowTestCase):
 
   def testConvertDtype(self):
@@ -50,6 +50,8 @@
     self.assertEqual(
         util.convert_dtype_to_tflite_type(dtypes.complex64),
         _types_pb2.COMPLEX64)
+    self.assertEqual(
+        util.convert_dtype_to_tflite_type(dtypes.half), _types_pb2.FLOAT16)
     with self.assertRaises(ValueError):
       util.convert_dtype_to_tflite_type(dtypes.bool)
 
@@ -65,10 +67,9 @@
       self.assertEqual(got_name, expect_names[i])
 
 
-@test_util.run_v1_only("")
+@test_util.run_v1_only("Incompatible with 2.0.")
 class TensorFunctionsTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_v1_only("b/120545219")
   def testGetTensorsValid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -78,7 +79,6 @@
     tensors = util.get_tensors_from_tensor_names(sess.graph, ["Placeholder"])
     self.assertEqual("Placeholder:0", tensors[0].name)
 
-  @test_util.run_v1_only("b/120545219")
   def testGetTensorsInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -90,7 +90,6 @@
     self.assertEqual("Invalid tensors 'invalid-input' were found.",
                      str(error.exception))
 
-  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeValid(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
@@ -98,7 +97,6 @@
     util.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
     self.assertEqual([5, 3, 5], tensor.shape.as_list())
 
-  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeNoneValid(self):
     tensor = array_ops.placeholder(dtype=dtypes.float32)
     self.assertEqual(None, tensor.shape)
@@ -106,7 +104,6 @@
     util.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
     self.assertEqual([1, 3, 5], tensor.shape.as_list())
 
-  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeArrayInvalid(self):
     # Tests set_tensor_shape where the tensor name passed in doesn't exist.
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
@@ -131,7 +128,6 @@
                   str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
-  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeEmpty(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
diff --git a/tensorflow/lite/python/wrap_toco.py b/tensorflow/lite/python/wrap_toco.py
new file mode 100644
index 0000000..7b51480
--- /dev/null
+++ b/tensorflow/lite/python/wrap_toco.py
@@ -0,0 +1,40 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wraps toco interface with python lazy loader."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# TODO(b/131123224): Lazy load since some of the performance benchmark skylark
+# rules and monolithic build break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
+
+
+def wrapped_toco_convert(model_flags_str, toco_flags_str, input_data_str):
+  """Wraps TocoConvert with lazy loader."""
+  return _toco_python.TocoConvert(model_flags_str, toco_flags_str,
+                                  input_data_str)
+
+
+def wrapped_get_potentially_supported_ops():
+  """Wraps TocoGetPotentiallySupportedOps with lazy loader."""
+  return _toco_python.TocoGetPotentiallySupportedOps()
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 3dbdacd..b5fc0f3 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -228,7 +228,8 @@
   REVERSE_SEQUENCE = 112,
   MATRIX_DIAG = 113,
   QUANTIZE = 114,
-  MATRIX_SET_DIAG = 115
+  MATRIX_SET_DIAG = 115,
+  ROUND = 116,
 }
 
 // Options for the builtin operators.
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index 3520eff..6d14eb4 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -566,11 +566,12 @@
   BuiltinOperator_MATRIX_DIAG = 113,
   BuiltinOperator_QUANTIZE = 114,
   BuiltinOperator_MATRIX_SET_DIAG = 115,
+  BuiltinOperator_ROUND = 116,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_MATRIX_SET_DIAG
+  BuiltinOperator_MAX = BuiltinOperator_ROUND
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[115] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[116] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -686,7 +687,8 @@
     BuiltinOperator_REVERSE_SEQUENCE,
     BuiltinOperator_MATRIX_DIAG,
     BuiltinOperator_QUANTIZE,
-    BuiltinOperator_MATRIX_SET_DIAG
+    BuiltinOperator_MATRIX_SET_DIAG,
+    BuiltinOperator_ROUND
   };
   return values;
 }
@@ -809,6 +811,7 @@
     "MATRIX_DIAG",
     "QUANTIZE",
     "MATRIX_SET_DIAG",
+    "ROUND",
     nullptr
   };
   return names;
diff --git a/tensorflow/lite/schema/upgrade_schema.py b/tensorflow/lite/schema/upgrade_schema.py
index d9220ba..dfb38ec 100644
--- a/tensorflow/lite/schema/upgrade_schema.py
+++ b/tensorflow/lite/schema/upgrade_schema.py
@@ -118,7 +118,8 @@
         that lacked file_identifier require this.
 
     Raises:
-      RuntimeError: When flatc cannot be invoked.
+      RuntimeError: 1. When flatc cannot be invoked.
+                    2. When json file does not exist.
       ValueError: When the extension is not json or bin.
 
     Returns:
@@ -235,6 +236,8 @@
         operator_type: String representing the builtin operator data type
           string.
         (see :schema.fbs).
+      Raises:
+        ValueError: When the model has consistency problems.
       Returns:
         Upgraded builtin operator data type as a string.
       """
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index d0910b8..3059bea 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -93,6 +93,7 @@
     data = [
         "//tensorflow/lite/toco",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":generate_examples_lib",
diff --git a/tensorflow/lite/testing/generate_examples_lib.py b/tensorflow/lite/testing/generate_examples_lib.py
index 2a28a0d..31daaf2 100644
--- a/tensorflow/lite/testing/generate_examples_lib.py
+++ b/tensorflow/lite/testing/generate_examples_lib.py
@@ -3585,6 +3585,32 @@
 
 
 @register_make_test_function()
+def make_round_tests(options):
+  """Build the round op testing graph."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the round op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.round(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(options, test_parameters, build_graph, build_inputs)
+
+
+@register_make_test_function()
 def make_neg_tests(options):
   """Make a set of tests to do neg."""
 
@@ -3759,6 +3785,15 @@
           "begin": [[0, 0], [1, 0]],
           "size": [[2, 3], [2, 2]],
       },
+      # 4-D with size -1
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[4, 4, 4, 4]],
+          "begin": [[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0],
+                    [0, 0, 0, 1]],
+          "size": [[-1, 1, 1, 1], [1, -1, 1, 1], [1, 1, -1, 1], [1, 1, 1, -1]],
+      },
   ]
 
   def build_graph(parameters):
@@ -4376,31 +4411,58 @@
 def make_unroll_batch_matmul_tests(options):
   """Make a set of tests to test unroll_batch_matmul."""
 
+  # The test cases below require broadcasting support (BatchMatMulV2 semantic),
+  # which isn't supported as of this change.
+  broadcast_shape_params = [
+      # Simple broadcast.
+      [(1, 2, 3), (3, 5), False, False],
+      # Empty batch broadcast.
+      [(2, 5, 3), (3, 7), False, False],
+      # Single batch with non-empty batch broadcast.
+      [(1, 5, 3), (4, 3, 7), False, False],
+      # Broadcast both operands
+      [(3, 1, 5, 3), (1, 4, 3, 7), False, False],
+  ]
+
   test_parameters = [{
       "dtype": [tf.float32],
-      "shape": [[(2, 2, 3), (2, 3, 2), False, False],
-                [(2, 2, 3), (2, 3, 2), True, True],
-                [(2, 2, 3), (2, 2, 3), False, True],
-                [(2, 2, 3), (2, 2, 3), True, False],
-                [(4, 2, 2, 3), (4, 2, 3, 2), False, False],
-                [(4, 2, 2, 3), (4, 2, 3, 2), True, True],
-                [(4, 2, 2, 3), (4, 2, 2, 3), False, True],
-                [(4, 2, 2, 3), (4, 2, 2, 3), True, False]]
+      "shape": [
+          [(2, 2, 3), (2, 3, 2), False, False],
+          [(2, 2, 3), (2, 3, 2), True, True],
+          [(2, 2, 3), (2, 2, 3), False, True],
+          [(2, 2, 3), (2, 2, 3), True, False],
+          [(4, 2, 2, 3), (4, 2, 3, 2), False, False],
+          [(4, 2, 2, 3), (4, 2, 3, 2), True, True],
+          [(4, 2, 2, 3), (4, 2, 2, 3), False, True],
+          [(4, 2, 2, 3), (4, 2, 2, 3), True, False]
+      ] + broadcast_shape_params,
+      # TODO(b/130887442): Improve the forward compatibility tests for every
+      # op.
+      "forward_compatibility_test": [False, True],
   }]
 
   def build_graph(parameters):
     """Build the batch_matmul op testing graph."""
-    input_tensor1 = tf.placeholder(
-        dtype=parameters["dtype"], shape=parameters["shape"][0])
-    input_tensor2 = tf.placeholder(
-        dtype=parameters["dtype"], shape=parameters["shape"][1])
-    # Should be unrolled and replaced with fully_connected ops in the end.
-    out = tf.matmul(
-        input_tensor1,
-        input_tensor2,
-        transpose_a=parameters["shape"][2],
-        transpose_b=parameters["shape"][3])
-    return [input_tensor1, input_tensor2], [out]
+    def _build_graph():
+      input_tensor1 = tf.placeholder(
+          dtype=parameters["dtype"], shape=parameters["shape"][0])
+      input_tensor2 = tf.placeholder(
+          dtype=parameters["dtype"], shape=parameters["shape"][1])
+      # Should be unrolled and replaced with fully_connected ops in the end.
+      out = tf.matmul(
+          input_tensor1,
+          input_tensor2,
+          transpose_a=parameters["shape"][2],
+          transpose_b=parameters["shape"][3])
+      return [input_tensor1, input_tensor2], [out]
+    if parameters["forward_compatibility_test"]:
+      # This is hardcoded to the date after MatMulV2 is activated.
+      # TODO(b/130887442): Improve the forward compatibility tests for every
+      # op, and remove the hardcoded date.
+      with tf.compat.forward_compatibility_horizon(2019, 4, 26):
+        return _build_graph()
+    else:
+      return _build_graph()
 
   def build_inputs(parameters, sess, inputs, outputs):
     input_value1 = create_tensor_data(
@@ -4410,7 +4472,8 @@
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(options, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      options, test_parameters, build_graph, build_inputs)
 
 
 @register_make_test_function()
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
index ba1534c..b48356f 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
@@ -20,7 +20,10 @@
 
 import os
 import numpy as np
+from six import PY3
 
+from google.protobuf import text_format as _text_format
+from google.protobuf.message import DecodeError
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.lite.python import convert_saved_model as _convert_saved_model
 from tensorflow.lite.python import lite as _lite
@@ -71,24 +74,53 @@
   return img_array
 
 
-def _convert(converter, **kwargs):
+def _convert(converter, version=1, **kwargs):
   """Converts the model.
 
   Args:
     converter: TFLiteConverter object.
+    version: Version of the converter. Only valid values are 1 and 2.
     **kwargs: Additional arguments to be passed into the converter. Supported
       flags are {"target_ops", "post_training_quantize"}.
 
   Returns:
     The converted TFLite model in serialized format.
+
+  Raises:
+    ValueError: Invalid version number.
   """
+  if version not in (1, 2):
+    raise ValueError("Invalid TFLiteConverter version number.")
+
   if "target_ops" in kwargs:
-    converter.target_ops = kwargs["target_ops"]
+    if version == 1:
+      converter.target_ops = kwargs["target_ops"]
+    else:
+      converter.target_spec.supported_ops = kwargs["target_ops"]
   if "post_training_quantize" in kwargs:
     converter.post_training_quantize = kwargs["post_training_quantize"]
   return converter.convert()
 
 
+def _get_input_data_map(tflite_model, input_data):
+  """Generates a map of input data based on the TFLite model.
+
+  Args:
+    tflite_model: Serialized TensorFlow Lite model.
+    input_data: List of np.ndarray.
+
+  Returns:
+    {str: [np.ndarray]}.
+  """
+  interpreter = _lite.Interpreter(model_content=tflite_model)
+  interpreter.allocate_tensors()
+  input_details = interpreter.get_input_details()
+  return {
+      input_tensor["name"]: data
+      for input_tensor, data in zip(input_details, input_data)
+  }
+
+
 def _generate_random_input_data(tflite_model, seed=None):
   """Generates input data based on the input tensors in the TFLite model.
 
@@ -97,7 +129,7 @@
     seed: Integer seed for the random generator. (default None)
 
   Returns:
-    List of np.ndarray.
+    ([np.ndarray], {str : [np.ndarray]}).
   """
   interpreter = _lite.Interpreter(model_content=tflite_model)
   interpreter.allocate_tensors()
@@ -105,11 +137,13 @@
 
   if seed:
     np.random.seed(seed=seed)
-  return [
+  input_data = [
       np.array(
           np.random.random_sample(input_tensor["shape"]),
           dtype=input_tensor["dtype"]) for input_tensor in input_details
   ]
+  input_data_map = _get_input_data_map(tflite_model, input_data)
+  return input_data, input_data_map
 
 
 def _evaluate_tflite_model(tflite_model, input_data):
@@ -136,7 +170,8 @@
       interpreter.get_tensor(output_tensor["index"])
       for output_tensor in output_details
   ]
-  return output_data
+  output_labels = [output_tensor["name"] for output_tensor in output_details]
+  return output_data, output_labels
 
 
 def evaluate_frozen_graph(filename, input_arrays, output_arrays):
@@ -155,7 +190,15 @@
       file_content = f.read()
 
     graph_def = _graph_pb2.GraphDef()
-    graph_def.ParseFromString(file_content)
+    try:
+      graph_def.ParseFromString(file_content)
+    except (_text_format.ParseError, DecodeError):
+      if not isinstance(file_content, str):
+        if PY3:
+          file_content = file_content.decode("utf-8")
+        else:
+          file_content = file_content.encode("utf-8")
+      _text_format.Merge(file_content, graph_def)
     _import_graph_def(graph_def, name="")
 
     inputs = _util.get_tensors_from_tensor_names(sess.graph, input_arrays)
@@ -216,15 +259,14 @@
     tolerance: Decimal place to check accuracy to. (default 5)
   """
   if input_data is None:
-    input_data = _generate_random_input_data(tflite_model)
+    input_data, _ = _generate_random_input_data(tflite_model)
   tf_results = tf_eval_func(input_data)
-  tflite_results = _evaluate_tflite_model(tflite_model, input_data)
+  tflite_results, _ = _evaluate_tflite_model(tflite_model, input_data)
   for tf_result, tflite_result in zip(tf_results, tflite_results):
     np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
 
 
-def compare_models_v2(tflite_model, concrete_func, input_data=None,
-                      tolerance=5):
+def compare_models_v2(tflite_model, tf_eval_func, input_data=None, tolerance=5):
   """Compares TensorFlow and TFLite models for TensorFlow 2.0.
 
   Unless the input data is provided, the models are compared with random data.
@@ -232,19 +274,36 @@
 
   Args:
     tflite_model: Serialized TensorFlow Lite model.
-    concrete_func: TensorFlow ConcreteFunction.
+    tf_eval_func: Function to evaluate TensorFlow model. Either a lambda
+      function that takes in input data and outputs the results or a TensorFlow
+      ConcreteFunction.
     input_data: np.ndarray to pass into models during inference. (default None)
     tolerance: Decimal place to check accuracy to. (default 5)
   """
+  # Convert the input data into a map.
   if input_data is None:
-    input_data = _generate_random_input_data(tflite_model)
-  input_data_func = constant_op.constant(input_data[0])
+    input_data, input_data_map = _generate_random_input_data(tflite_model)
+  else:
+    input_data_map = _get_input_data_map(tflite_model, input_data)
+  input_data_func_map = {
+      input_name: constant_op.constant(input_data)
+      for input_name, input_data in input_data_map.items()
+  }
 
-  # Gets the TensorFlow results as a map from the output names to outputs.
-  # Converts the map into a list that is equivalent to the TFLite list.
-  tf_results_map = concrete_func(input_data_func)
-  tf_results = [tf_results_map[tf_results_map.keys()[0]]]
-  tflite_results = _evaluate_tflite_model(tflite_model, input_data)
+  if len(input_data) > 1:
+    tf_results = tf_eval_func(**input_data_func_map)
+  else:
+    tf_results = tf_eval_func(constant_op.constant(input_data[0]))
+  tflite_results, tflite_labels = _evaluate_tflite_model(
+      tflite_model, input_data)
+
+  # Convert the output TensorFlow results into an ordered list.
+  if isinstance(tf_results, dict):
+    if len(tf_results) == 1:
+      tf_results = [tf_results[tf_results.keys()[0]]]
+    else:
+      tf_results = [tf_results[tflite_label] for tflite_label in tflite_labels]
+
   for tf_result, tflite_result in zip(tf_results, tflite_results):
     np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
 
@@ -402,7 +461,7 @@
   concrete_func = model.signatures[signature_key]
 
   converter = _lite.TFLiteConverterV2.from_concrete_functions([concrete_func])
-  tflite_model = _convert(converter, **kwargs)
+  tflite_model = _convert(converter, version=2, **kwargs)
 
   compare_models_v2(tflite_model, concrete_func, input_data=input_data)
 
@@ -455,7 +514,7 @@
       tensor.set_shape(shape)
 
   converter = _lite.TFLiteConverterV2.from_keras_model(keras_model)
-  tflite_model = _convert(converter, **kwargs)
+  tflite_model = _convert(converter, version=2, **kwargs)
 
   tf_eval_func = evaluate_keras_model(filename)
   compare_models_v2(tflite_model, tf_eval_func, input_data=input_data)
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
index 9cc32d1..d1309b7 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -38,6 +38,7 @@
 from tensorflow.python.training.training_util import write_graph
 
 
+@test_util.run_v1_only('Incompatible with 2.0.')
 class EvaluateFrozenGraph(test.TestCase):
 
   def _saveFrozenGraph(self, sess):
@@ -45,7 +46,6 @@
     write_graph(sess.graph_def, '', graph_def_file, False)
     return graph_def_file
 
-  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     with session.Session().as_default() as sess:
       in_tensor = array_ops.placeholder(
@@ -55,7 +55,6 @@
 
     model_coverage.test_frozen_graph(filename, ['Placeholder'], ['add'])
 
-  @test_util.run_v1_only('b/120545219')
   def testMultipleOutputs(self):
     with session.Session().as_default() as sess:
       in_tensor_1 = array_ops.placeholder(
@@ -73,7 +72,6 @@
                                      ['add', 'Mean'])
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_v1_only('b/120545219')
   def testFunctions(self):
     """Tests functions."""
 
@@ -111,18 +109,15 @@
     filename = self._saveFrozenGraph(sess)
     return filename
 
-  @test_util.run_v1_only('b/120545219')
   def testQuantized(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(filename, ['inputA'], ['output'])
 
-  @test_util.run_v1_only('b/120545219')
   def testQuantizedInputShapes(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
         filename, ['inputA'], ['output'], input_shapes={'inputA': [33, 33]})
 
-  @test_util.run_v1_only('b/120545219')
   def testQuantizedFlexAll(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
@@ -130,9 +125,9 @@
         target_ops=set([lite.OpsSet.SELECT_TF_OPS]))
 
 
+@test_util.run_v1_only('Incompatible with 2.0.')
 class EvaluateSavedModel(test.TestCase):
 
-  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     saved_model_dir = os.path.join(self.get_temp_dir(), 'simple_savedmodel')
     with session.Session().as_default() as sess:
@@ -148,6 +143,7 @@
     model_coverage.test_saved_model(saved_model_dir)
 
 
+@test_util.run_v1_only('Incompatible with 2.0.')
 class EvaluateKerasModel(test.TestCase):
 
   def _getSingleInputKerasModel(self):
@@ -170,21 +166,18 @@
       os.close(fd)
     return keras_file
 
-  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file)
 
-  @test_util.run_v1_only('b/120545219')
   def testPostTrainingQuantize(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file, post_training_quantize=True)
 
-  @test_util.run_v1_only('b/120545219')
   def testTargetOps(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 8481b0b..e24c014 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -283,9 +283,11 @@
         ":runtime",
         ":toco_port",
         ":tooling_util",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/lite/kernels/internal:quantization_util",
         "//tensorflow/lite/kernels/internal:strided_slice_logic",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index d426a69..f9a307a 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -1215,6 +1215,16 @@
   (*ceil_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertRoundOperator(const Model& model, const RoundOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* round_op = tensorflow_graph->add_node();
+  round_op->set_op("Round");
+  round_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *round_op->add_input() = src_op.inputs[0];
+  (*round_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
                            GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* gather_op = tensorflow_graph->add_node();
@@ -2210,6 +2220,9 @@
   } else if (src_op.type == OperatorType::kCeil) {
     ConvertCeilOperator(model, static_cast<const CeilOperator&>(src_op),
                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRound) {
+    ConvertRoundOperator(model, static_cast<const RoundOperator&>(src_op),
+                         tensorflow_graph);
   } else if (src_op.type == OperatorType::kGather) {
     ConvertGatherOperator(model, static_cast<const GatherOperator&>(src_op),
                           tensorflow_graph);
diff --git a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
index 3b7c88a..c8f453c 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -25,20 +25,6 @@
 
 namespace toco {
 
-namespace {
-
-std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
-    Model* model, const Operator* op) {
-  auto it = model->operators.begin();
-  for (; it != model->operators.end(); ++it) {
-    if (it->get() == op) {
-      break;
-    }
-  }
-  return it;
-}
-}  // namespace
-
 ::tensorflow::Status IdentifyL2Normalization::Run(Model* model,
                                                   std::size_t op_index,
                                                   bool* modified) {
@@ -150,7 +136,7 @@
   AddMessageF("Creating %s replacing equivalent subgraph", LogName(*l2norm_op));
 
   // Erase the subgraph that is now replaced by L2Normalization
-  model->operators.erase(FindOperator(model, square_op));
+  model->operators.erase(FindOp(*model, square_op));
   DeleteOpAndArraysIfUnused(model, sum_op);
   if (add_op) {
     DeleteOpAndArraysIfUnused(model, add_op);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index f5704bf..d3d6441 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -2153,6 +2153,7 @@
     case OperatorType::kCast:
     case OperatorType::kFloor:
     case OperatorType::kCeil:
+    case OperatorType::kRound:
     case OperatorType::kExp:
     case OperatorType::kSin:
     case OperatorType::kCos:
@@ -2420,6 +2421,10 @@
       ProcessMatrixSetDiagOperator(model,
                                    static_cast<MatrixSetDiagOperator*>(op));
       break;
+    case OperatorType::kCTCBeamSearchDecoder:
+      // The sizes of the outputs are only known at runtime based on the input.
+      // Ignore shape propagation here and defer that to the interpreter.
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index f142719..62eaba0 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -62,8 +62,9 @@
          type == OperatorType::kLessEqual || type == OperatorType::kSelect ||
          type == OperatorType::kArgMax || type == OperatorType::kRelu ||
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
-         type == OperatorType::kShape || type == OperatorType::kExpandDims ||
-         type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
+         type == OperatorType::kLeakyRelu || type == OperatorType::kShape ||
+         type == OperatorType::kExpandDims || type == OperatorType::kPack ||
+         type == OperatorType::kUnpack || type == OperatorType::kTopK_V2 ||
          type == OperatorType::kRandomUniform ||
          type == OperatorType::kResizeNearestNeighbor ||
          type == OperatorType::kPRelu || type == OperatorType::kReduceMax ||
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
index 98105d3..2f935a6 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -37,6 +37,7 @@
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
+    case OperatorType::kRound:
     case OperatorType::kTanh:
     case OperatorType::kSqrt:
     case OperatorType::kSquare:
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
index 7056ca9..ba26993 100644
--- a/tensorflow/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -54,6 +54,18 @@
 )
 
 tf_cc_test(
+    name = "identify_l2_normalization_test",
+    srcs = ["identify_l2_normalization_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
     name = "fuse_binary_into_following_affine_test",
     srcs = ["fuse_binary_into_following_affine_test.cc"],
     deps = [
diff --git a/tensorflow/lite/toco/graph_transformations/tests/identify_l2_normalization_test.cc b/tensorflow/lite/toco/graph_transformations/tests/identify_l2_normalization_test.cc
new file mode 100644
index 0000000..4c55b7d
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/tests/identify_l2_normalization_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+// Builds a Square -> Sum -> (Rsqrt -> Mul | Sqrt -> Div) graph on "input0" and
+// runs IdentifyL2Normalization on operator index 0, which is the final Mul or
+// Div. `div_square` selects the Sqrt/Div form; the default is Rsqrt/Mul. The
+// pass must report a modification and leave none of the matched op types.
+void RunIdentifyL2Normalization(const std::vector<float>& input,
+                                const std::vector<int>& input_shape,
+                                const std::vector<int>& output_shape,
+                                const bool div_square = false) {
+  Model model;
+  Array& input0 = model.GetOrCreateArray("input0");
+  Array& output = model.GetOrCreateArray("output");
+
+  *input0.mutable_shape()->mutable_dims() = input_shape;
+  input0.data_type = ArrayDataType::kFloat;
+  input0.GetMutableBuffer<ArrayDataType::kFloat>().data = input;
+
+  *output.mutable_shape()->mutable_dims() = output_shape;
+
+  auto sq_op = new TensorFlowSquareOperator;
+  sq_op->inputs = {"input0"};
+  sq_op->outputs = {"output"};
+
+  Array& sumoutput = model.GetOrCreateArray("Sumoutput");
+  *sumoutput.mutable_shape()->mutable_dims() = output_shape;
+
+  auto sum_op = new TensorFlowSumOperator;
+  sum_op->inputs = {sq_op->outputs[0]};
+  sum_op->outputs = {"Sumoutput"};
+
+  if (div_square) {
+    Array& sqrtoutput = model.GetOrCreateArray("squarertoutput");
+    *sqrtoutput.mutable_shape()->mutable_dims() = output_shape;
+    auto sqrt_op = new TensorFlowSqrtOperator;
+    sqrt_op->inputs = {sum_op->outputs[0]};
+    sqrt_op->outputs = {"squarertoutput"};
+
+    Array& divoutput = model.GetOrCreateArray("Divoutput");
+    *divoutput.mutable_shape()->mutable_dims() = output_shape;
+    auto div_op = new DivOperator;
+    div_op->inputs = {"input0", sqrt_op->outputs[0]};
+    div_op->outputs = {"Divoutput"};
+
+    // Hand ownership to the model; the Div goes first so it is operator 0.
+    model.operators.push_back(std::unique_ptr<Operator>(div_op));
+    model.operators.push_back(std::unique_ptr<Operator>(sqrt_op));
+    model.operators.push_back(std::unique_ptr<Operator>(sum_op));
+    model.operators.push_back(std::unique_ptr<Operator>(sq_op));
+  } else {
+    Array& rsqoutput = model.GetOrCreateArray("Rsquareoutput");
+    *rsqoutput.mutable_shape()->mutable_dims() = output_shape;
+    auto rsqrt_op = new TensorFlowRsqrtOperator;
+    rsqrt_op->inputs = {sum_op->outputs[0]};
+    rsqrt_op->outputs = {"Rsquareoutput"};
+
+    Array& muloutput = model.GetOrCreateArray("Muloutput");
+    *muloutput.mutable_shape()->mutable_dims() = output_shape;
+    auto mul_op = new MulOperator;
+    mul_op->inputs = {"input0", rsqrt_op->outputs[0]};
+    mul_op->outputs = {"Muloutput"};
+
+    // Hand ownership to the model; the Mul goes first so it is operator 0.
+    model.operators.push_back(std::unique_ptr<Operator>(mul_op));
+    model.operators.push_back(std::unique_ptr<Operator>(rsqrt_op));
+    model.operators.push_back(std::unique_ptr<Operator>(sum_op));
+    model.operators.push_back(std::unique_ptr<Operator>(sq_op));
+  }
+
+  bool modified = false;
+  ASSERT_TRUE(IdentifyL2Normalization().Run(&model, 0, &modified).ok());
+  EXPECT_TRUE(modified);
+  for (const auto& op : model.operators) {
+    // The matched subgraph must be gone: no Div/Sqrt (or Mul/Rsqrt), and no
+    // Add or Square operator may remain.
+    if (div_square) {
+      EXPECT_FALSE(op->type == OperatorType::kDiv);
+      EXPECT_FALSE(op->type == OperatorType::kSqrt);
+    } else {
+      EXPECT_FALSE(op->type == OperatorType::kMul);
+      EXPECT_FALSE(op->type == OperatorType::kRsqrt);
+    }
+    EXPECT_FALSE(op->type == OperatorType::kAdd);
+    EXPECT_FALSE(op->type == OperatorType::kSquare);
+  }
+}
+
+// Exercises the Mul/Rsqrt form: input0 * Rsqrt(Sum(Square(input0))).
+TEST(IdentifyL2Normalization, MulRsqrtTest) {
+  RunIdentifyL2Normalization(
+      // Input data
+      {3, 1, 4, 1, -5, 9, -2, 6, 5, 3, 5, 8},
+
+      // Input shape
+      {3, 4},
+      // Output shape
+      {3, 4},
+      // div_square
+      false);
+}
+
+// Exercises the Div/Sqrt form: input0 / Sqrt(Sum(Square(input0))).
+TEST(IdentifyL2Normalization, DivSqrtNormTest) {
+  RunIdentifyL2Normalization(
+      // Input data
+      {3, 1, 4, 1, -5, 9, -2, 6, 5, 3, 5, 8},
+      // Input shape
+      {3, 4},
+      // Output shape
+      {3, 4},
+      // div_square
+      true);
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index 7492f3e..50087b1 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -18,147 +18,75 @@
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/matmul_bcast.h"
 #include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
-
 namespace {
 
-void UnrollBatchMatMul3D(
-    const string& input_lhs, const string& input_rhs,
-    const BatchMatMulOperator* batch_op, const std::vector<int> batch,
-    Model* model, std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
-    std::vector<string>* pack_inputs) {
-  const std::string batch_name =
-      absl::StrCat(batch_op->outputs[0], "_b", absl::StrJoin(batch, "-"));
-  const auto& input_array_a = model->GetArray(input_lhs);
-  const auto& input_array_b = model->GetArray(input_rhs);
-  const int dims_count = input_array_a.shape().dimensions_count();
-
-  // tf.slice(a, ...).
-  std::vector<int> begin_indices_a = batch;
-  begin_indices_a.resize(dims_count);
-  std::vector<int> slice_size_a = input_array_a.shape().dims();
-  for (int i = 0; i < batch.size(); ++i) {
-    slice_size_a[i] = 1;
-  }
-  auto* slice_a_op = new SliceOperator;
-  slice_a_op->inputs = {
-      input_lhs,
-      CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
-                       begin_indices_a),
-      CreateInt32Array(model, batch_name + "/slice_a/slice/size", slice_size_a),
-  };
-  slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
-  auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
-  slice_a_op_output.data_type = input_array_a.data_type;
-  *tail_it = model->operators.emplace(*tail_it, slice_a_op) + 1;
-
-  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-  auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
-  slice_a_reshape_op->inputs = {
-      slice_a_op->outputs[0],
-      CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
-                       {-1, input_array_a.shape().dims(dims_count - 1)})};
-  slice_a_reshape_op->outputs = {
-      AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
-  auto& slice_a_reshape_op_output =
-      model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
-  slice_a_reshape_op_output.data_type = input_array_a.data_type;
-  *tail_it = model->operators.emplace(*tail_it, slice_a_reshape_op) + 1;
-
-  // tf.slice(b, ...).
-  std::vector<int> begin_indices_b = batch;
-  begin_indices_b.resize(dims_count);
-  std::vector<int> slice_size_b = input_array_b.shape().dims();
-  for (int i = 0; i < batch.size(); ++i) {
-    slice_size_b[i] = 1;
-  }
-  auto* slice_b_op = new SliceOperator;
-  slice_b_op->inputs = {
-      input_rhs,
-      CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
-                       begin_indices_b),
-      CreateInt32Array(model, batch_name + "/slice_b/slice/size", slice_size_b),
-  };
-  slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
-  auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
-  slice_b_op_output.data_type = input_array_b.data_type;
-  *tail_it = model->operators.emplace(*tail_it, slice_b_op) + 1;
-
-  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-  auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
-  slice_b_reshape_op->inputs = {
-      slice_b_op->outputs[0],
-      CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
-                       {-1, input_array_b.shape().dims(dims_count - 1)})};
-  slice_b_reshape_op->outputs = {
-      AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
-  auto& slice_b_reshape_op_output =
-      model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
-  slice_b_reshape_op_output.data_type = input_array_b.data_type;
-  *tail_it = model->operators.emplace(*tail_it, slice_b_reshape_op) + 1;
-
-  // tf.matmul(slice_a, slice_b).
-  auto* matmul_op = new TensorFlowMatMulOperator;
-  matmul_op->inputs = {slice_a_reshape_op->outputs[0],
-                       slice_b_reshape_op->outputs[0]};
-  matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
-  auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
-  matmul_op_output.data_type = input_array_a.data_type;
-  *tail_it = model->operators.emplace(*tail_it, matmul_op) + 1;
-
-  // Add to stack.
-  pack_inputs->push_back(matmul_op->outputs[0]);
+absl::InlinedVector<int64, 4> ToInlinedVector(const std::vector<int>& vec) {
+  return absl::InlinedVector<int64, 4>(vec.begin(), vec.end());
 }
 
-std::vector<string> UnrollBatchMatMulRecursion(
-    const string& input_lhs, const string& input_rhs,
-    const BatchMatMulOperator* batch_op, Model* model,
-    std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
-    const std::vector<int>& batch_prefix) {
-  const auto& input_array_a = model->GetArray(input_lhs);
-  const auto& dims_vec = input_array_a.shape().dims();
-  const int current_dim_size = dims_vec[batch_prefix.size()];
-  std::vector<string> batch_pack_inputs;
+std::vector<string> SliceInput(
+    const string& input, const string& base_name, const string& input_name,
+    const int batch_size, const Array& input_array, Model* model,
+    std::vector<std::unique_ptr<Operator>>::iterator* tail_it) {
+  int rank = input_array.shape().dimensions_count();
+  int num_rows = input_array.shape().dims(rank - 2);
+  int num_cols = input_array.shape().dims(rank - 1);
+  // Reshape to rank-3 Tensor with first dimension as the batch size.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+  reshape_op->inputs = {
+      input,
+      CreateInt32Array(model, absl::StrCat(base_name, "/reshape_a/shape"),
+                       {batch_size, num_rows, num_cols})};
+  reshape_op->outputs = {AvailableArrayName(
+      *model, absl::StrCat(base_name, "/reshape_", input_name, "/reshape"))};
+  auto& reshape_op_output = model->GetOrCreateArray(reshape_op->outputs[0]);
+  reshape_op_output.data_type = input_array.data_type;
+  *tail_it = model->operators.emplace(*tail_it, reshape_op) + 1;
 
-  if (batch_prefix.size() + 3 == dims_vec.size()) {
-    // Base case
-    for (int batch = 0; batch < current_dim_size; ++batch) {
-      std::vector<int> new_batch_prefix = batch_prefix;
-      new_batch_prefix.emplace_back(batch);
-      UnrollBatchMatMul3D(input_lhs, input_rhs, batch_op, new_batch_prefix,
-                          model, tail_it, &batch_pack_inputs);
-    }
-  } else {
-    // Recursion
-    for (int batch = 0; batch < current_dim_size; ++batch) {
-      std::vector<int> new_batch_prefix = batch_prefix;
-      new_batch_prefix.emplace_back(batch);
-      std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
-          input_lhs, input_rhs, batch_op, model, tail_it, new_batch_prefix);
+  // Slice along each batch index and remember the slice output for future use.
+  std::vector<string> slice_outputs;
+  for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
+    std::string batch_name =
+        absl::StrCat(base_name, "_b", batch_idx, "/slice_", input_name);
+    auto* slice_op = new SliceOperator;
+    slice_op->inputs = {
+        reshape_op->outputs[0],
+        CreateInt32Array(model, absl::StrCat(batch_name, "/slice/begin"),
+                         {batch_idx, 0, 0}),
+        CreateInt32Array(model, absl::StrCat(batch_name, "/slice/size"),
+                         {1, num_rows, num_cols})};
+    slice_op->outputs = {
+        AvailableArrayName(*model, absl::StrCat(batch_name, "/slice"))};
+    auto& slice_op_output = model->GetOrCreateArray(slice_op->outputs[0]);
+    slice_op_output.data_type = input_array.data_type;
+    *tail_it = model->operators.emplace(*tail_it, slice_op) + 1;
 
-      // The pack that will join all the individual matmul results together.
-      auto* pack_op = new PackOperator;
-      std::string batch_name = absl::StrCat(
-          batch_op->outputs[0], "_b", absl::StrJoin(new_batch_prefix, "-"));
-      pack_op->inputs = pack_inputs;
-      pack_op->outputs = {AvailableArrayName(*model, batch_name + "/pack")};
-      auto& pack_op_output = model->GetOrCreateArray(pack_op->outputs[0]);
-      pack_op_output.data_type = input_array_a.data_type;
-      pack_op->axis = 0;
-      pack_op->values_count = pack_inputs.size();
-      *tail_it = model->operators.emplace(*tail_it, pack_op) + 1;
+    // Reshape to rank-2: [1, num_rows, num_cols] -> [num_rows, num_cols].
+    auto* slice_reshape_op = new TensorFlowReshapeOperator;
+    slice_reshape_op->inputs = {
+        slice_op->outputs[0],
+        CreateInt32Array(model, absl::StrCat(batch_name, "/reshape/shape"),
+                         {num_rows, num_cols})};
+    slice_reshape_op->outputs = {
+        AvailableArrayName(*model, absl::StrCat(batch_name, "/reshape"))};
+    auto& slice_reshape_op_output =
+        model->GetOrCreateArray(slice_reshape_op->outputs[0]);
+    slice_reshape_op_output.data_type = input_array.data_type;
+    *tail_it = model->operators.emplace(*tail_it, slice_reshape_op) + 1;
 
-      batch_pack_inputs.push_back(pack_op->outputs[0]);
-    }
+    slice_outputs.push_back(slice_reshape_op->outputs[0]);
   }
-  return batch_pack_inputs;
+  return slice_outputs;
 }
 
 std::vector<int32> GetTransposePerm(const Array& input_array) {
@@ -202,15 +130,6 @@
 // Unrolls a BatchMatMul on the batch dimension.
 // We need to slice each batch out of the inputs, matmul them individually, then
 // stack them all back together at the end.
-//
-// This transform effectively looks like:
-//  result_slices = []
-//  for bat in B:
-//    slice_a = tf.reshape(tf.slice(a, [bat, 0, 0], [1, M, N]), [M, N])
-//    slice_b = tf.reshape(tf.slice(b, [bat, 0, 0], [1, M, N]), [M, N])
-//    slice_c = tf.matmul(slice_a, slice_b)
-//    result_slices[bat] = slice_c
-//  result = tf.stack(result_slices)
 ::tensorflow::Status UnrollBatchMatMul::Run(Model* model, std::size_t op_index,
                                             bool* modified) {
   *modified = false;
@@ -220,7 +139,6 @@
   }
   const auto* batch_op =
       static_cast<const BatchMatMulOperator*>(batch_op_it->get());
-
   auto& tail_it = batch_op_it;
 
   string input_lhs = batch_op->inputs[0];
@@ -246,20 +164,25 @@
   }
   const auto& input_array_b = model->GetArray(input_rhs);
 
-  const int dims = input_array_a.shape().dimensions_count();
-  for (int i = 0; i < dims - 2; ++i) {
-    CHECK_EQ(input_array_a.shape().dims(i), input_array_b.shape().dims(i))
-        << "input array not consistent at index " << i;
-  }
-  CHECK_EQ(input_array_a.shape().dims(dims - 1),
-           input_array_b.shape().dims(dims - 2))
+  // Ensure that input ranks are at least 2 and batch shapes are broadcastable.
+  const int dims_a = input_array_a.shape().dimensions_count();
+  const int dims_b = input_array_b.shape().dimensions_count();
+  CHECK_GE(dims_a, 2) << "First input must have rank >= 2";
+  CHECK_GE(dims_b, 2) << "Second input must have rank >= 2";
+
+  ::tensorflow::MatMulBCast bcast(
+      ToInlinedVector(input_array_a.shape().dims()),
+      ToInlinedVector(input_array_b.shape().dims()));
+  CHECK(bcast.IsValid()) << "Input batch dimensions must be broadcastable";
+
+  CHECK_EQ(input_array_a.shape().dims(dims_a - 1),
+           input_array_b.shape().dims(dims_b - 2))
       << "Input dimensions must be compatible for multipication. shape a = ["
       << absl::StrJoin(input_array_a.shape().dims(), ", ") << "], shape b = ["
       << absl::StrJoin(input_array_b.shape().dims(), ", ") << "]";
 
-  if (dims == 2) {
-    // This is really just a MatMul. This likely means that someone hand-crafted
-    // a graphdef with a BatchMatMul when they really wanted a MatMul.
+  if (dims_a == 2 && dims_b == 2) {
+    // This is really just a MatMul.
     AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
                 LogName(*batch_op));
     auto* matmul_op = new TensorFlowMatMulOperator;
@@ -271,23 +194,65 @@
     *modified = true;
     return ::tensorflow::Status::OK();
   }
-
-  CHECK_GE(input_array_a.shape().dimensions_count(), 3)
-      << "Input arrays must have rank >= 3";
-
-  const auto& dims_vec = input_array_a.shape().dims();
   AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
-              std::accumulate(dims_vec.begin(), dims_vec.end() - 2, 1,
-                              std::multiplies<int>()));
+              bcast.output_batch_size());
+  string base_name = std::string(batch_op->outputs[0]);
 
-  std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
-      input_lhs, input_rhs, batch_op, model, &tail_it, {});
+  // Compute slices for each batch in the LHS and RHS.
+  std::vector<string> slice_a_outputs =
+      SliceInput(input_lhs, base_name, "a", bcast.x_batch_size(), input_array_a,
+                 model, &tail_it);
+  std::vector<string> slice_b_outputs =
+      SliceInput(input_rhs, base_name, "b", bcast.y_batch_size(), input_array_b,
+                 model, &tail_it);
+
+  // Compute (single batch) MatMul for each output batch. The MatMul outputs are
+  // then packed together into one output Tensor.
+  std::vector<string> pack_inputs;
+  for (int batch_idx = 0; batch_idx < bcast.output_batch_size(); ++batch_idx) {
+    std::string batch_name =
+        absl::StrCat(batch_op->outputs[0], "_b", batch_idx);
+    const int a_batch_idx = bcast.IsBroadcastingRequired()
+                                ? bcast.x_batch_indices()[batch_idx]
+                                : batch_idx;
+    const int b_batch_idx = bcast.IsBroadcastingRequired()
+                                ? bcast.y_batch_indices()[batch_idx]
+                                : batch_idx;
+    auto* matmul_op = new TensorFlowMatMulOperator;
+    matmul_op->inputs = {slice_a_outputs[a_batch_idx],
+                         slice_b_outputs[b_batch_idx]};
+    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+    matmul_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+
+    // Add to stack.
+    pack_inputs.push_back(matmul_op->outputs[0]);
+  }
+
+  // Combine the result of each individual MatMul into a rank-3 Tensor.
   auto* pack_op = new PackOperator;
   pack_op->inputs = pack_inputs;
-  pack_op->outputs = {batch_op->outputs[0]};
+  pack_op->outputs = {AvailableArrayName(*model, base_name + "/pack")};
+  auto& pack_op_output = model->GetOrCreateArray(pack_op->outputs[0]);
+  pack_op_output.data_type = input_array_a.data_type;
   pack_op->axis = 0;
   pack_op->values_count = pack_inputs.size();
-  model->operators.emplace(tail_it, pack_op);
+  tail_it = model->operators.emplace(tail_it, pack_op) + 1;
+
+  // Reshape the rank-3 Tensor into the correct output shape.
+  const auto& result_batch_shape = bcast.output_batch_shape().dim_sizes();
+  std::vector<int> result_shape(result_batch_shape.begin(),
+                                result_batch_shape.end());
+  result_shape.push_back(input_array_a.shape().dims(dims_a - 2));
+  result_shape.push_back(input_array_b.shape().dims(dims_b - 1));
+
+  auto* reshape_result_op = new TensorFlowReshapeOperator;
+  reshape_result_op->inputs = {
+      pack_op->outputs[0],
+      CreateInt32Array(model, base_name + "/reshape_out/shape", result_shape)};
+  reshape_result_op->outputs = {batch_op->outputs[0]};
+  model->operators.emplace(tail_it, reshape_result_op);
 
   // Remove the old batch matmul now that we've unrolled.
   batch_op_it = model->operators.begin();
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index 6b5e68c..1c78d35 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -1547,6 +1547,20 @@
   return tensorflow::Status::OK();
 }
 
+// Imports a TensorFlow "Round" node as a toco RoundOperator (float only).
+tensorflow::Status ConvertRoundOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "Round");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  CHECK(GetDataTypeAttr(node, "T") == DT_FLOAT);
+  auto* round_op = new RoundOperator;
+  round_op->inputs.push_back(node.input(0));
+  round_op->outputs.push_back(node.name());
+  model->operators.emplace_back(round_op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertGatherOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -2007,6 +2021,9 @@
   rnn_state->set_discardable(true);
   rnn_state->set_state_array(node.name());
   rnn_state->set_back_edge_source_array(node.input(0));
+  // TODO(tianjuny): Temporarily set the size to 1 to avoid transient array
+  // allocation crash. The real value should depend on the hidden_size of RNN.
+  rnn_state->set_size(1);
   return tensorflow::Status::OK();
 }
 
@@ -2424,6 +2441,7 @@
        ConvertSimpleOperator<TensorFlowAssertOperator, kAnyNumInputs, 1>},
       {"AvgPool", ConvertAvgPoolOperator},
       {"BatchMatMul", ConvertBatchMatMulOperator},
+      {"BatchMatMulV2", ConvertBatchMatMulOperator},
       {"BatchNormWithGlobalNormalization",
        ConvertBatchNormWithGlobalNormalizationOperator},
       {"BatchToSpaceND", ConvertBatchToSpaceNDOperator},
@@ -2507,6 +2525,7 @@
       {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
       {"ReverseSequence", ConvertReverseSequenceOperator},
       {"ReverseV2", ConvertSimpleOperator<ReverseV2Operator, 2, 1>},
+      {"Round", ConvertRoundOperator},
       {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1, 1>},
       {"Select", ConvertSimpleOperator<SelectOperator, 3, 1>},
       {"Shape", ConvertShapeOperator},
@@ -2629,4 +2648,16 @@
   }
   return ImportTensorFlowGraphDef(model_flags, tf_import_flags, *tf_graph);
 }
+
+// Returns the key (op name) of every TensorFlow node converter map entry.
+std::vector<std::string> GetPotentiallySupportedOps() {
+  const internal::ConverterMapType& converters =
+      internal::GetTensorFlowNodeConverterMap();
+  std::vector<std::string> op_names;
+  for (const auto& entry : converters) {
+    op_names.push_back(entry.first);
+  }
+  return op_names;
+}
+
 }  // namespace toco
diff --git a/tensorflow/lite/toco/import_tensorflow.h b/tensorflow/lite/toco/import_tensorflow.h
index 5b74ff2..4ada25e 100644
--- a/tensorflow/lite/toco/import_tensorflow.h
+++ b/tensorflow/lite/toco/import_tensorflow.h
@@ -17,9 +17,9 @@
 
 #include <memory>
 #include <string>
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
 
 namespace toco {
 
@@ -34,14 +34,20 @@
   bool import_all_ops_as_unsupported = false;
 };
 
+// Converts TOCO model from TensorFlow GraphDef with given flags.
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
     const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
     const tensorflow::GraphDef& graph_def);
 
+// Converts TOCO model from the file content of TensorFlow GraphDef with given
+// flags.
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
     const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
     const string& input_file_contents);
 
+// Gets a list of supported ops by their names.
+std::vector<std::string> GetPotentiallySupportedOps();
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_LITE_TOCO_IMPORT_TENSORFLOW_H_
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index e7318e7..67510c2 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -82,6 +82,7 @@
   kTransposeConv,
   kCast,
   kFloor,
+  kRound,
   kGather,
   kResizeBilinear,
   kSin,
@@ -222,6 +223,7 @@
   kUint64,  // 10
   kString,
   kComplex64,
+  kFloat16,
 };
 
 // Compile-time logic to map ArrayDataType to the corresponding C++ scalar type
@@ -971,9 +973,8 @@
   TensorFlowIdentityOperator() : Operator(OperatorType::kIdentity) {}
 };
 
-// Batch matrix multiplication operator. This comes from the (deprecated)
-// tf.batch_matmul or a tf.matmul that has rank 3. dims(0) is the batch count
-// and it can be trivially unrolled into a series of matmuls on each element.
+// Batch matrix multiplication operator. This comes from a tf.matmul where one
+// of the operands has rank 3 or more.
 //
 // Inputs:
 //   inputs[0]: required: the left-hand side matrix
@@ -1716,6 +1717,16 @@
   CeilOperator() : Operator(OperatorType::kCeil) {}
 };
 
+// Round operator. Constructed with OperatorType::kRound.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Round
+struct RoundOperator : Operator {
+  RoundOperator() : Operator(OperatorType::kRound) {}
+};
+
 // Gather operator. It gathers slices from params according to indices.
 // Only 1-D indices are supported at the moment.
 //
diff --git a/tensorflow/lite/toco/python/toco.i b/tensorflow/lite/toco/python/toco.i
index c7dfdc3..de10fca 100644
--- a/tensorflow/lite/toco/python/toco.i
+++ b/tensorflow/lite/toco/python/toco.i
@@ -32,4 +32,7 @@
                         PyObject* input_contents_txt_raw,
                         bool extended_return = false);
 
+// Returns a list of {"op": name} dicts for ops potentially supported by tflite.
+PyObject* TocoGetPotentiallySupportedOps();
+
 } // namespace toco
\ No newline at end of file
diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc
index 6fad092..22557a3 100644
--- a/tensorflow/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/lite/toco/python/toco_python_api.cc
@@ -12,11 +12,13 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <map>
 #include <string>
 #include <vector>
-#include "tensorflow/core/platform/logging.h"
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/toco/import_tensorflow.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/python/toco_python_api.h"
 #include "tensorflow/lite/toco/toco_flags.pb.h"
@@ -49,21 +51,32 @@
   bool error;
   std::string model_flags_proto_txt =
       ConvertArg(model_flags_proto_txt_raw, &error);
-  if (error) return nullptr;
+  if (error) {
+    PyErr_SetString(PyExc_ValueError, "Model flags are invalid.");
+    return nullptr;
+  }
   std::string toco_flags_proto_txt =
       ConvertArg(toco_flags_proto_txt_raw, &error);
-  if (error) return nullptr;
+  if (error) {
+    PyErr_SetString(PyExc_ValueError, "Toco flags are invalid.");
+    return nullptr;
+  }
   std::string input_contents_txt = ConvertArg(input_contents_txt_raw, &error);
-  if (error) return nullptr;
+  if (error) {
+    PyErr_SetString(PyExc_ValueError, "Input GraphDef is invalid.");
+    return nullptr;
+  }
 
   // Use TOCO to produce new outputs.
   toco::ModelFlags model_flags;
   if (!model_flags.ParseFromString(model_flags_proto_txt)) {
-    LOG(FATAL) << "Model proto failed to parse." << std::endl;
+    PyErr_SetString(PyExc_ValueError, "Model proto failed to parse.");
+    return nullptr;
   }
   toco::TocoFlags toco_flags;
   if (!toco_flags.ParseFromString(toco_flags_proto_txt)) {
-    LOG(FATAL) << "Toco proto failed to parse." << std::endl;
+    PyErr_SetString(PyExc_ValueError, "Toco proto failed to parse.");
+    return nullptr;
   }
 
   auto& dump_options = *GraphVizDumpOptions::singleton();
@@ -100,4 +113,16 @@
       output_file_contents_txt.data(), output_file_contents_txt.size());
 }
 
+// Returns a new Python list of {"op": <name>} dicts, one per supported op.
+PyObject* TocoGetPotentiallySupportedOps() {
+  std::vector<std::string> supported_ops = toco::GetPotentiallySupportedOps();
+  PyObject* list = PyList_New(supported_ops.size());
+  for (size_t i = 0; list != nullptr && i < supported_ops.size(); ++i) {
+    PyObject* name = PyUnicode_FromString(supported_ops[i].c_str());
+    // "N" hands the new string's reference to the dict, so it is not leaked.
+    PyList_SetItem(list, i, Py_BuildValue("{s:N}", "op", name));
+  }
+  return list;
+}
+
 }  // namespace toco
diff --git a/tensorflow/lite/toco/python/toco_python_api.h b/tensorflow/lite/toco/python/toco_python_api.h
index 4ab0961..20390c3 100644
--- a/tensorflow/lite/toco/python/toco_python_api.h
+++ b/tensorflow/lite/toco/python/toco_python_api.h
@@ -31,6 +31,9 @@
                       PyObject* input_contents_txt_raw,
                       bool extended_return = false);
 
+// Returns a list of {"op": name} dicts for ops potentially supported by tflite.
+PyObject* TocoGetPotentiallySupportedOps();
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 5054966..09fe72f 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -1203,6 +1203,12 @@
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op takes int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1800,6 +1806,13 @@
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op takes int8/uint8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8 ||
+        input_array.data_type == ArrayDataType::kUint8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -2512,6 +2525,8 @@
   ops.push_back(
       MakeUnique<SimpleOperator<EluOperator>>("ELU", OperatorType::kElu));
   ops.push_back(
+      MakeUnique<SimpleOperator<RoundOperator>>("ROUND", OperatorType::kRound));
+  ops.push_back(
       MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
   ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
       "RELU_N1_TO_1", OperatorType::kRelu1));
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 410e556..eece773 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -17,11 +17,10 @@
 #include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/lite/toco/model.h"
-#include "tensorflow/lite/toco/tooling_util.h"
-
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -114,6 +113,7 @@
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
   CheckSimpleOperator<CeilOperator>("CEIL", OperatorType::kCeil);
   CheckSimpleOperator<EluOperator>("ELU", OperatorType::kElu);
+  CheckSimpleOperator<RoundOperator>("ROUND", OperatorType::kRound);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
@@ -818,6 +818,31 @@
   SimpleVersioningTest<PackOperator>();
 }
 
+TEST_F(OperatorTest, VersioningUnpackTest) {
+  UnpackOperator op;
+  op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
+  // An int32 input array is expected to report version 1.
+  Model int32_model;
+  Array& int32_array = int32_model.GetOrCreateArray(op.inputs[0]);
+  int32_array.data_type = ArrayDataType::kInt32;
+  OperatorSignature int32_signature = {.op = &op, .model = &int32_model};
+  EXPECT_EQ(base_op->GetVersion(int32_signature), 1);
+  // A uint8 input array is expected to report version 2.
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.op = &op, .model = &uint8_model};
+  EXPECT_EQ(base_op->GetVersion(uint8_signature), 2);
+  // An int8 input array is expected to report version 2.
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.op = &op, .model = &int8_model};
+  EXPECT_EQ(base_op->GetVersion(int8_signature), 2);
+}
+
 TEST_F(OperatorTest, VersioningBatchToSpaceNDTest) {
   SimpleVersioningTest<BatchToSpaceNDOperator>();
 }
@@ -866,6 +891,10 @@
   SimpleVersioningTest<TensorFlowMinimumOperator>();
 }
 
+TEST_F(OperatorTest, VersioningMeanTest) {
+  SimpleVersioningTest<MeanOperator>();
+}
+
 TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
 
 TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
@@ -927,6 +956,44 @@
   EXPECT_EQ(op->GetVersion(int8_signature), 4);
 }
 
+TEST_F(OperatorTest, VersioningConv2DTest) {
+  ConvOperator conv_op;
+  conv_op.inputs = {"input", "filter"};
+  conv_op.outputs = {"output"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op = operator_by_type_map.at(conv_op.type).get();
+  // All-uint8 input, filter and output are expected to report version 1.
+  Model uint8_model;
+  Array& input_uint8_array = uint8_model.GetOrCreateArray(conv_op.inputs[0]);
+  input_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& filter_uint8_array = uint8_model.GetOrCreateArray(conv_op.inputs[1]);
+  filter_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& output_uint8_array = uint8_model.GetOrCreateArray(conv_op.outputs[0]);
+  output_uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.op = &conv_op, .model = &uint8_model};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+  // All-int8 input, filter and output are expected to report version 3.
+  Model int8_model;
+  Array& input_int8_array = int8_model.GetOrCreateArray(conv_op.inputs[0]);
+  input_int8_array.data_type = ArrayDataType::kInt8;
+  Array& filter_int8_array = int8_model.GetOrCreateArray(conv_op.inputs[1]);
+  filter_int8_array.data_type = ArrayDataType::kInt8;
+  Array& output_int8_array = int8_model.GetOrCreateArray(conv_op.outputs[0]);
+  output_int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.op = &conv_op, .model = &int8_model};
+  EXPECT_EQ(op->GetVersion(int8_signature), 3);
+  // Float input/output with an int8 filter is expected to report version 2.
+  Model float_model;
+  Array& input_float_array = float_model.GetOrCreateArray(conv_op.inputs[0]);
+  input_float_array.data_type = ArrayDataType::kFloat;
+  Array& filter_int8_array1 = float_model.GetOrCreateArray(conv_op.inputs[1]);
+  filter_int8_array1.data_type = ArrayDataType::kInt8;
+  Array& output_float_array = float_model.GetOrCreateArray(conv_op.outputs[0]);
+  output_float_array.data_type = ArrayDataType::kFloat;
+  OperatorSignature float_signature = {.op = &conv_op, .model = &float_model};
+  EXPECT_EQ(op->GetVersion(float_signature), 2);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index bdae94e..9123b00 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -58,6 +58,7 @@
           "AvgPool3D",
           "AvgPoolGrad",
           "BatchMatMul",
+          "BatchMatMulV2",
           "BatchNormWithGlobalNormalization",
           "BatchNormWithGlobalNormalizationGrad",
           "BatchToSpace",
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index 69ecd5c..626a7be 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -389,6 +389,7 @@
     HANDLE_OPERATORTYPENAME_CASE(Cast)
     HANDLE_OPERATORTYPENAME_CASE(Floor)
     HANDLE_OPERATORTYPENAME_CASE(Ceil)
+    HANDLE_OPERATORTYPENAME_CASE(Round)
     HANDLE_OPERATORTYPENAME_CASE(Gather)
     HANDLE_OPERATORTYPENAME_CASE(GatherNd)
     HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
diff --git a/tensorflow/lite/toco/types.proto b/tensorflow/lite/toco/types.proto
index fa911b8..2c65551 100644
--- a/tensorflow/lite/toco/types.proto
+++ b/tensorflow/lite/toco/types.proto
@@ -46,4 +46,7 @@
 
   // Int8, quantized based on QuantizationParameters in schema.
   INT8 = 9;
+
+  // Half precision float, not quantized.
+  FLOAT16 = 10;
 }
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index f67b3f9..a59a0d3 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -6,6 +6,7 @@
 
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 common_copts = ["-Wall"]
 
@@ -96,4 +97,23 @@
     ],
 )
 
+cc_library(
+    name = "command_line_flags",
+    srcs = ["command_line_flags.cc"],
+    hdrs = ["command_line_flags.h"],
+    copts = tflite_copts(),
+)
+
+cc_test(
+    name = "command_line_flags_test",
+    srcs = ["command_line_flags_test.cc"],
+    copts = tflite_copts(),
+    visibility = ["//visibility:private"],
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/accuracy/BUILD b/tensorflow/lite/tools/accuracy/BUILD
index 71f66dd..26e4cf8 100644
--- a/tensorflow/lite/tools/accuracy/BUILD
+++ b/tensorflow/lite/tools/accuracy/BUILD
@@ -4,93 +4,7 @@
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-
-common_linkopts = tflite_linkopts() + select({
-    "//conditions:default": [],
-    "//tensorflow:android": [
-        "-pie",
-        "-llog",
-    ],
-})
-
-cc_library(
-    name = "utils",
-    srcs = ["utils.cc"],
-    hdrs = ["utils.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:builtin_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "utils_test",
-    srcs = ["utils_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
-    ],
-    data = ["//tensorflow/lite:testdata/multi_add.bin"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":utils",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "run_tflite_model_op",
-    srcs = ["run_tflite_model_op.cc"],
-    copts = tflite_copts(),
-    deps = [
-        ":utils",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:builtin_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-            ],
-        },
-    ),
-    alwayslink = 1,
-)
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 cc_library(
     name = "android_required_build_flags",
@@ -98,234 +12,12 @@
     copts = tflite_copts(),
 )
 
-tf_cc_test(
-    name = "run_tflite_model_op_test",
-    srcs = ["run_tflite_model_op_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
-    ],
-    data = ["//tensorflow/lite:testdata/multi_add.bin"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        ":run_tflite_model_op",
-        ":android_required_build_flags",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "stage",
-    hdrs = ["stage.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/cc:scope",
-    ],
-)
-
-cc_library(
-    name = "file_reader_stage",
-    srcs = ["file_reader_stage.cc"],
-    hdrs = ["file_reader_stage.h"],
-    deps = [
-        ":stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ],
-)
-
-# TODO(b/122597976) Restore portability by avoiding tf_cc_test
-tf_cc_test(
-    name = "file_reader_stage_test",
-    srcs = ["file_reader_stage_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable"],
-    deps = [
-        ":file_reader_stage",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_whole_file_read_ops",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "run_tflite_model_stage",
-    srcs = ["run_tflite_model_stage.cc"],
-    hdrs = ["run_tflite_model_stage.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":run_tflite_model_op",
-        ":stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ],
-)
-
-cc_library(
-    name = "accuracy_eval_stage",
-    hdrs = ["accuracy_eval_stage.h"],
-    copts = tflite_copts(),
-    deps = [
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "eval_pipeline",
-    srcs = ["eval_pipeline.cc"],
-    hdrs = ["eval_pipeline.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":accuracy_eval_stage",
-        ":stage",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-            ],
-        },
-    ),
-)
-
-# TODO(b/122597976) Restore portability by avoiding tf_cc_test
-tf_cc_test(
-    name = "eval_pipeline_test",
-    srcs = ["eval_pipeline_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable"],
-    deps = [
-        ":eval_pipeline",
-        "//tensorflow/cc:cc_ops",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "eval_pipeline_builder",
-    srcs = ["eval_pipeline_builder.cc"],
-    hdrs = ["eval_pipeline_builder.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":eval_pipeline",
-        ":accuracy_eval_stage",
-        ":stage",
-        "@com_google_absl//absl/memory",
-        "//tensorflow/cc:cc_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-# TODO(b/122597976) Restore portability by avoiding tf_cc_test
-tf_cc_test(
-    name = "eval_pipeline_builder_test",
-    srcs = ["eval_pipeline_builder_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable"],
-    deps = [
-        ":eval_pipeline_builder",
-        "//tensorflow/cc:cc_ops",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
 cc_library(
     name = "csv_writer",
     hdrs = ["csv_writer.h"],
     copts = tflite_copts(),
-    deps = select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
+    deps = [
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
 )
-
-tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/accuracy/README.md b/tensorflow/lite/tools/accuracy/README.md
deleted file mode 100644
index 8100cd1..0000000
--- a/tensorflow/lite/tools/accuracy/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-## TFLite accuracy library.
-
-This library provides evaluation pipelines that can be used to evaluate
-accuracy and other metrics of a model. The resulting binary can be run on
-a desktop or on a mobile device.
-
-## Usage
-The tool provides an evaluation pipeline with different stages. Each
-stage outputs a Tensorflow graph.
-A sample usage is shown below.
-
-```C++
-// First build the pipeline.
-EvalPipelineBuilder builder;
-std::unique_ptr<EvalPipeline> eval_pipeline;
-auto status = builder.WithInput("pipeline_input", DT_FLOAT)
-     .WithInputStage(&input_stage)
-     .WithRunModelStage(&run_model_stage)
-     .WithPreprocessingStage(&preprocess_stage)
-     .WithAccuracyEval(&eval)
-     .Build(scope, &eval_pipeline);
-TF_CHECK_OK(status);
-
-// Now run the pipeline with inputs and outputs.
-std::unique_ptr<Session> session(NewSession(SessionOptions()));
-TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
-Tensor input = ... read input for the model ...
-Tensor ground_truth = ... read ground truth for the model ...
-TF_CHECK_OK(eval_pipeline.Run(input1, ground_truth1));
-```
-For further examples, check the usage in [imagenet accuracy evaluation binary](ilsvrc/imagenet_model_evaluator.cc)
-
-## Measuring accuracy of published models.
-
-### ILSVRC (Imagenet Large Scale Visual Recognition Contest) classification task
-For measuring accuracy for [ILSVRC 2012 image classification task](http://www.image-net.org/challenges/LSVRC/2012/), the binary can be built
-using these
-[instructions.](ilsvrc/)
diff --git a/tensorflow/lite/tools/accuracy/csv_writer.h b/tensorflow/lite/tools/accuracy/csv_writer.h
index d74a803..80df895 100644
--- a/tensorflow/lite/tools/accuracy/csv_writer.h
+++ b/tensorflow/lite/tools/accuracy/csv_writer.h
@@ -19,8 +19,8 @@
 #include <fstream>
 #include <vector>
 
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -36,14 +36,17 @@
  public:
   CSVWriter(const std::vector<string>& columns, std::ofstream* output_stream)
       : num_columns_(columns.size()), output_stream_(output_stream) {
-    TF_CHECK_OK(WriteRow(columns, output_stream_));
+    if (WriteRow(columns, output_stream_) != kTfLiteOk) {
+      LOG(ERROR) << "Could not write column names to file";
+    }
   }
 
   template <typename T>
-  Status WriteRow(const std::vector<T>& values) {
+  TfLiteStatus WriteRow(const std::vector<T>& values) {
     if (values.size() != num_columns_) {
-      return errors::InvalidArgument("Invalid size for row:", values.size(),
-                                     " expected: ", num_columns_);
+      LOG(ERROR) << "Invalid size for row:" << values.size()
+                 << " expected: " << num_columns_;
+      return kTfLiteError;
     }
     return WriteRow(values, output_stream_);
   }
@@ -54,8 +57,8 @@
 
  private:
   template <typename T>
-  static Status WriteRow(const std::vector<T>& values,
-                         std::ofstream* output_stream) {
+  static TfLiteStatus WriteRow(const std::vector<T>& values,
+                               std::ofstream* output_stream) {
     bool first = true;
     for (const auto& v : values) {
       if (!first) {
@@ -67,9 +70,10 @@
     }
     (*output_stream) << "\n";
     if (!output_stream->good()) {
-      return errors::Internal("Writing to stream failed.");
+      LOG(ERROR) << "Writing to stream failed.";
+      return kTfLiteError;
     }
-    return Status::OK();
+    return kTfLiteOk;
   }
   const size_t num_columns_;
   std::ofstream* output_stream_;
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline.cc b/tensorflow/lite/tools/accuracy/eval_pipeline.cc
deleted file mode 100644
index 658824a..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
-
-namespace tensorflow {
-namespace metrics {
-
-Status EvalPipeline::AttachSession(std::unique_ptr<Session> session) {
-  session_ = std::move(session);
-  TF_RETURN_IF_ERROR(session_->Create(model_graph_));
-  return Status::OK();
-}
-
-Status EvalPipeline::Run(const Tensor& input, const Tensor& ground_truth) {
-  if (session_ == nullptr) {
-    return errors::Internal("No session is associated with the graph.");
-  }
-  std::vector<Tensor> outputs;
-  TF_RETURN_IF_ERROR(session_->Run({{params_.model_input_node_name, input}},
-                                   {params_.model_output_node_name}, {},
-                                   &outputs));
-  TF_RETURN_IF_ERROR(eval_->ComputeEval(outputs, ground_truth));
-  return Status::OK();
-}
-}  //  namespace metrics
-}  //  namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline.h b/tensorflow/lite/tools/accuracy/eval_pipeline.h
deleted file mode 100644
index 1ec21b0..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
-
-#include <string>
-
-#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
-#include "tensorflow/lite/tools/accuracy/stage.h"
-#include "tensorflow/core/public/session.h"
-
-namespace tensorflow {
-namespace metrics {
-
-// Pipeline for evaluating a model.
-// Runs the graph and passes the output of graph to
-// the provided instance of AccuracyEval.
-// Example usage:
-// AccuracyEval *eval;
-// GraphDef graph_def;
-// ... populate graph_def...
-//
-// EvalPipeline eval_pipeline(&graph_def,
-//    {.model_input_node_name = "model_input",
-//     .model_output_node_name = "model_output"},
-//     eval);
-//  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-//  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
-//  Tensor input = ... read input for the model ...
-//  Tensor ground_truth = ... read ground truth for the model ...
-//  TF_CHECK_OK(eval_pipeline.Run(input, ground_truth));
-//
-class EvalPipeline {
- public:
-  struct Params {
-    string model_input_node_name;
-    string model_output_node_name;
-  };
-
-  // Creates a new `EvalPipeline` object. The ownership of the `accuracy_eval`
-  // is retained by the caller. Lifetime of `accuracy_eval` instance should
-  // be longer than the lifetime of this instance of pipeline.
-  EvalPipeline(const GraphDef& graph, const Params& params,
-               AccuracyEval* accuracy_eval)
-      : model_graph_(graph),
-        params_(params),
-        eval_(accuracy_eval),
-        session_(nullptr) {}
-
-  EvalPipeline(const EvalPipeline&) = delete;
-  EvalPipeline& operator=(const EvalPipeline&) = delete;
-
-  EvalPipeline(const EvalPipeline&&) = delete;
-  EvalPipeline& operator=(const EvalPipeline&&) = delete;
-
-  // Attaches the given session to this instance of pipeline.
-  // The provided session object will be reused for subsequent calls to
-  // EvalPipeline::Run.
-  Status AttachSession(std::unique_ptr<Session> session);
-
-  // Runs the model by feeding `input` and then passes the output of the model
-  // along with provided `ground_truth` to the AccuracyEval instance by calling
-  // AccuracyEval::ComputeEval.
-  Status Run(const Tensor& input, const Tensor& ground_truth);
-
- private:
-  GraphDef model_graph_;
-  Params params_;
-  AccuracyEval* eval_;
-  std::unique_ptr<Session> session_;
-};
-}  //  namespace metrics
-}  //  namespace tensorflow
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc
deleted file mode 100644
index 1b360d3..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
-
-#include "absl/memory/memory.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-
-namespace tensorflow {
-namespace metrics {
-
-EvalPipelineBuilder& EvalPipelineBuilder::WithInputStage(Stage* input_stage) {
-  input_stage_ = input_stage;
-  return *this;
-}
-
-EvalPipelineBuilder& EvalPipelineBuilder::WithPreprocessingStage(
-    Stage* preprocessing_stage) {
-  preprocessing_stage_ = preprocessing_stage;
-  return *this;
-}
-
-EvalPipelineBuilder& EvalPipelineBuilder::WithRunModelStage(
-    Stage* run_model_stage) {
-  run_model_stage_ = run_model_stage;
-  return *this;
-}
-
-EvalPipelineBuilder& EvalPipelineBuilder::WithAccuracyEval(
-    AccuracyEval* accuracy_eval) {
-  accuracy_eval_ = accuracy_eval;
-  return *this;
-}
-
-EvalPipelineBuilder& EvalPipelineBuilder::WithInput(const string& input_name,
-                                                    DataType input_type) {
-  input_name_ = input_name;
-  input_type_ = input_type;
-  return *this;
-}
-
-Status EvalPipelineBuilder::Build(
-    const Scope& scope, std::unique_ptr<EvalPipeline>* eval_pipeline) {
-  if (input_stage_ == nullptr) {
-    return errors::InvalidArgument("Input stage is null.");
-  }
-  if (preprocessing_stage_ == nullptr) {
-    return errors::InvalidArgument("Preprocessing stage is null.");
-  }
-  if (run_model_stage_ == nullptr) {
-    return errors::InvalidArgument("Run model stage is null.");
-  }
-  if (accuracy_eval_ == nullptr) {
-    return errors::InvalidArgument("accuracy_eval is null.");
-  }
-  if (input_name_.empty()) {
-    return errors::InvalidArgument("input name is not set.");
-  }
-  if (input_type_ == DT_INVALID) {
-    return errors::InvalidArgument("input type is not set.");
-  }
-
-  auto input_placeholder =
-      ops::Placeholder(scope.WithOpName(input_name_), input_type_);
-  TF_RETURN_IF_ERROR(scope.status());
-
-  input_stage_->AddToGraph(scope, input_placeholder);
-  TF_RETURN_IF_ERROR(scope.status());
-
-  preprocessing_stage_->AddToGraph(scope, input_stage_->Output());
-  TF_RETURN_IF_ERROR(scope.status());
-
-  run_model_stage_->AddToGraph(scope, preprocessing_stage_->Output());
-  TF_RETURN_IF_ERROR(scope.status());
-
-  GraphDef graph_def;
-  TF_RETURN_IF_ERROR(scope.ToGraphDef(&graph_def));
-  EvalPipeline::Params params;
-  params.model_input_node_name = input_name_;
-  params.model_output_node_name = run_model_stage_->output_name();
-  *eval_pipeline =
-      absl::make_unique<EvalPipeline>(graph_def, params, accuracy_eval_);
-
-  return Status::OK();
-}
-
-}  //  namespace metrics
-}  //  namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline_builder.h b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.h
deleted file mode 100644
index 18b52ac..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline_builder.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
-
-#include <memory>
-#include <string>
-
-#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
-#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
-#include "tensorflow/lite/tools/accuracy/stage.h"
-
-namespace tensorflow {
-namespace metrics {
-
-// A builder to simplify construction of an `EvalPipeline` instance.
-// The `Build` method creates an |EvalPipeline| with the following structure:
-// |input| -> |input_stage|
-//               |--> |preprocessing_stage|
-//                         |--> |run_model_stage| ->  |accuracy_eval_stage|.
-// The stages are chained in the order shown above. Any missing stage results in
-// an error. The ownership of the stage object is retained by the caller. Stage
-// objects need to exist until the |Build| method is called.
-//
-// Currently only single inputs are supported.
-//
-// Example Usage:
-// EvalPipelineBuilder builder;
-// std::unique_ptr<EvalPipeline> eval_pipeline;
-// auto status = builder.WithInput("pipeline_input", DT_FLOAT)
-//      .WithInputStage(&input_stage)
-//      .WithRunModelStage(&run_model_stage)
-//      .WithPreprocessingStage(&preprocess_stage)
-//      .WithAccuracyEval(&eval)
-//      .Build(scope, &eval_pipeline);
-// TF_CHECK_OK(status);
-class EvalPipelineBuilder {
- public:
-  EvalPipelineBuilder() = default;
-  EvalPipelineBuilder(const EvalPipelineBuilder&) = delete;
-  EvalPipeline& operator=(const EvalPipelineBuilder&) = delete;
-
-  EvalPipelineBuilder(const EvalPipelineBuilder&&) = delete;
-  EvalPipeline& operator=(const EvalPipelineBuilder&&) = delete;
-
-  // Sets the input stage for the pipeline.
-  // Input stage converts the input, say filename into appropriate format
-  // that can be consumed by the preprocessing stage.
-  EvalPipelineBuilder& WithInputStage(Stage* input_stage);
-
-  // Sets the preprocessing stage for the pipeline.
-  // Preprocessing stage converts the input into a format that can be used to
-  // run the model.
-  EvalPipelineBuilder& WithPreprocessingStage(Stage* preprocessing_stage);
-
-  // Sets the run model stage for the pipeline.
-  // This stage receives the preprocessing input and output of this stage is
-  // fed to the accuracy eval stage.
-  EvalPipelineBuilder& WithRunModelStage(Stage* run_model_stage);
-
-  // Sets the accuracy eval for the pipeline.
-  // Results of evaluating the pipeline are fed to the `accuracy_eval` instance.
-  EvalPipelineBuilder& WithAccuracyEval(AccuracyEval* accuracy_eval);
-
-  // Sets the name and type of input for the pipeline.
-  // TODO(shashishekhar): Support multiple inputs for the pipeline, use a vector
-  // here.
-  EvalPipelineBuilder& WithInput(const string& input_name, DataType input_type);
-
-  // Builds the pipeline and assigns the pipeline to `eval_pipeline`.
-  // If the pipeline creation fails `eval_pipeline` is untouched.
-  Status Build(const Scope& scope,
-               std::unique_ptr<EvalPipeline>* eval_pipeline);
-
- private:
-  Stage* input_stage_ = nullptr;
-  Stage* preprocessing_stage_ = nullptr;
-  Stage* run_model_stage_ = nullptr;
-  AccuracyEval* accuracy_eval_ = nullptr;
-  string input_name_;
-  DataType input_type_ = DT_INVALID;
-};
-
-}  //  namespace metrics
-}  //  namespace tensorflow
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc
deleted file mode 100644
index 9bf7254..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
-#include <gtest/gtest.h>
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/public/session.h"
-
-namespace tensorflow {
-namespace metrics {
-namespace {
-
-class IdentityStage : public Stage {
- public:
-  IdentityStage(const string& name, const string& output)
-      : name_(name), output_(output) {}
-
-  void AddToGraph(const Scope& scope, const Input& input) override {
-    called_count_++;
-    inputs_.push_back(input.node()->name());
-    stage_output_ = ops::Identity(scope.WithOpName(output_), input);
-  }
-
-  string name() const override { return name_; }
-  string output_name() const override { return output_; }
-
-  int times_called() const { return called_count_; }
-
-  const std::vector<string> input_params() { return inputs_; }
-
- private:
-  string name_;
-  string output_;
-  int called_count_ = 0;
-  std::vector<string> inputs_;
-};
-
-class FailingStage : public Stage {
- public:
-  FailingStage(const string& name, const string& output)
-      : name_(name), output_(output) {}
-
-  void AddToGraph(const Scope& scope, const Input& input) override {
-    called_count_++;
-    scope.UpdateStatus(errors::Internal("Stage failed:", name_));
-  }
-
-  string name() const override { return name_; }
-  string output_name() const override { return output_; }
-
-  int times_called() const { return called_count_; }
-
- private:
-  string name_;
-  string output_;
-  int called_count_ = 0;
-};
-
-class SimpleAccuracyEval : public AccuracyEval {
- public:
-  SimpleAccuracyEval() {}
-
-  Status ComputeEval(const std::vector<Tensor>& model_outputs,
-                     const Tensor& ground_truth) override {
-    return Status::OK();
-  }
-};
-
-TEST(EvalPipelineBuilder, MissingPipelineStages) {
-  IdentityStage input_stage("input_stage", "input_stage_out");
-  IdentityStage run_model_stage("run_model", "run_model_out");
-  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
-  const string pipeline_input = "pipeline_input";
-
-  SimpleAccuracyEval eval;
-
-  Scope scope = Scope::NewRootScope();
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-  EvalPipelineBuilder builder;
-  auto status =
-      builder.WithInputStage(&input_stage).Build(scope, &eval_pipeline);
-  EXPECT_FALSE(status.ok());
-  EXPECT_FALSE(eval_pipeline);
-
-  status =
-      builder.WithRunModelStage(&run_model_stage).Build(scope, &eval_pipeline);
-  EXPECT_FALSE(status.ok());
-  EXPECT_FALSE(eval_pipeline);
-
-  status = builder.WithPreprocessingStage(&preprocess_stage)
-               .Build(scope, &eval_pipeline);
-  EXPECT_FALSE(status.ok());
-  EXPECT_FALSE(eval_pipeline);
-
-  status =
-      builder.WithInput(pipeline_input, DT_FLOAT).Build(scope, &eval_pipeline);
-  EXPECT_FALSE(status.ok());
-  EXPECT_FALSE(eval_pipeline);
-
-  status = builder.WithAccuracyEval(&eval).Build(scope, &eval_pipeline);
-  TF_CHECK_OK(status);
-  EXPECT_TRUE(eval_pipeline);
-}
-
-TEST(EvalPipeline, InputStageFailure) {
-  FailingStage input_stage("input_stage", "input_stage_out");
-  IdentityStage run_model_stage("run_model", "run_model_out");
-  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
-  const string pipeline_input = "pipeline_input";
-
-  SimpleAccuracyEval eval;
-
-  Scope scope = Scope::NewRootScope();
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-  EvalPipelineBuilder builder;
-  auto status = builder.WithInputStage(&input_stage)
-                    .WithRunModelStage(&run_model_stage)
-                    .WithPreprocessingStage(&preprocess_stage)
-                    .WithInput(pipeline_input, DT_FLOAT)
-                    .WithAccuracyEval(&eval)
-                    .Build(scope, &eval_pipeline);
-
-  EXPECT_FALSE(scope.status().ok());
-  // None of the other stages would have been called.
-  EXPECT_EQ(1, input_stage.times_called());
-  EXPECT_EQ(0, preprocess_stage.times_called());
-  EXPECT_EQ(0, run_model_stage.times_called());
-}
-
-TEST(EvalPipeline, PreprocessingFailure) {
-  IdentityStage input_stage("input_stage", "input_stage_out");
-  FailingStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
-  IdentityStage run_model_stage("run_model", "run_model_out");
-  const string pipeline_input = "pipeline_input";
-
-  SimpleAccuracyEval eval;
-
-  Scope scope = Scope::NewRootScope();
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-  EvalPipelineBuilder builder;
-  auto status = builder.WithInputStage(&input_stage)
-                    .WithRunModelStage(&run_model_stage)
-                    .WithPreprocessingStage(&preprocess_stage)
-                    .WithInput(pipeline_input, DT_FLOAT)
-                    .WithAccuracyEval(&eval)
-                    .Build(scope, &eval_pipeline);
-
-  EXPECT_FALSE(status.ok());
-  // None of the other stages would have been called.
-  EXPECT_EQ(1, input_stage.times_called());
-  EXPECT_EQ(1, preprocess_stage.times_called());
-  EXPECT_EQ(0, run_model_stage.times_called());
-}
-
-TEST(EvalPipeline, GraphEvalFailure) {
-  IdentityStage input_stage("input_stage", "input_stage_out");
-  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
-  FailingStage run_model_stage("run_model", "run_model_out");
-  const string pipeline_input = "pipeline_input";
-
-  SimpleAccuracyEval eval;
-
-  Scope scope = Scope::NewRootScope();
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-  EvalPipelineBuilder builder;
-  auto status = builder.WithInputStage(&input_stage)
-                    .WithRunModelStage(&run_model_stage)
-                    .WithPreprocessingStage(&preprocess_stage)
-                    .WithInput(pipeline_input, DT_FLOAT)
-                    .WithAccuracyEval(&eval)
-                    .Build(scope, &eval_pipeline);
-
-  EXPECT_FALSE(status.ok());
-  // None of the other stages would have been called.
-  EXPECT_EQ(1, input_stage.times_called());
-  EXPECT_EQ(1, preprocess_stage.times_called());
-  EXPECT_EQ(1, run_model_stage.times_called());
-}
-
-TEST(EvalPipeline, PipelineHasCorrectSequence) {
-  IdentityStage input_stage("input_stage", "input_stage_out");
-  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
-  IdentityStage run_model_stage("run_model", "run_model_out");
-  const string pipeline_input = "pipeline_input";
-
-  SimpleAccuracyEval eval;
-
-  Scope scope = Scope::NewRootScope();
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-  EvalPipelineBuilder builder;
-  auto status = builder.WithInputStage(&input_stage)
-                    .WithRunModelStage(&run_model_stage)
-                    .WithPreprocessingStage(&preprocess_stage)
-                    .WithInput(pipeline_input, DT_FLOAT)
-                    .WithAccuracyEval(&eval)
-                    .Build(scope, &eval_pipeline);
-  TF_CHECK_OK(status);
-
-  ASSERT_EQ(1, input_stage.times_called());
-  ASSERT_EQ(1, run_model_stage.times_called());
-  ASSERT_EQ(1, preprocess_stage.times_called());
-
-  EXPECT_EQ(pipeline_input, input_stage.input_params()[0]);
-  EXPECT_EQ(input_stage.output_name(), preprocess_stage.input_params()[0]);
-  EXPECT_EQ(preprocess_stage.output_name(), run_model_stage.input_params()[0]);
-}
-
-}  // namespace
-
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/eval_pipeline_test.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_test.cc
deleted file mode 100644
index 53cbf8c..0000000
--- a/tensorflow/lite/tools/accuracy/eval_pipeline_test.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
-#include <gtest/gtest.h>
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/public/session.h"
-
-namespace tensorflow {
-namespace metrics {
-namespace {
-
-Tensor CreateFloatTensor(float value) {
-  Tensor tensor(DT_FLOAT, TensorShape({}));
-  tensor.scalar<float>()() = value;
-  return tensor;
-}
-
-class NoOpAccuracyEval : public AccuracyEval {
- public:
-  explicit NoOpAccuracyEval(const Status& status_to_return)
-      : status_to_return_(status_to_return) {}
-
-  Status ComputeEval(const std::vector<Tensor>& model_outputs,
-                     const Tensor& ground_truth) override {
-    model_outputs_ = model_outputs;
-    ground_truth_ = ground_truth;
-    was_called_ = true;
-    return status_to_return_;
-  }
-
-  bool WasCalled() { return was_called_; }
-  std::vector<Tensor> model_outputs() { return model_outputs_; }
-  Tensor ground_truth() { return ground_truth_; }
-
- private:
-  std::vector<Tensor> model_outputs_;
-  Tensor ground_truth_;
-  Status status_to_return_;
-  bool was_called_ = false;
-};
-
-TEST(EvalPipeline, AccuracyEvalIsCalled) {
-  Scope scope = Scope::NewRootScope();
-  // A graph that adds 1 to input.
-  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
-  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  EvalPipeline::Params params;
-  params.model_input_node_name = "input";
-  params.model_output_node_name = "output";
-  NoOpAccuracyEval accuracy_eval(Status::OK());
-
-  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
-  TF_CHECK_OK(eval_pipeline.Run(CreateFloatTensor(5), CreateFloatTensor(27)));
-
-  EXPECT_TRUE(accuracy_eval.WasCalled());
-  auto outputs = accuracy_eval.model_outputs();
-  ASSERT_EQ(1, outputs.size());
-  EXPECT_EQ(6.0f, outputs[0].scalar<float>()());
-  // Ground truth is unchanged.
-  EXPECT_EQ(27, accuracy_eval.ground_truth().scalar<float>()());
-}
-
-TEST(EvalPipeline, EvalIsNotCalledOnGraphRunFailure) {
-  Scope scope = Scope::NewRootScope();
-  // A graph that adds 1 to input.
-  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
-  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  EvalPipeline::Params params;
-  params.model_input_node_name = "input";
-  params.model_output_node_name = "output";
-  NoOpAccuracyEval accuracy_eval(Status::OK());
-
-  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
-
-  // Pass a string tensor instead of a float tensor.
-  Tensor string_tensor(DT_STRING, TensorShape{});
-  auto status = eval_pipeline.Run(string_tensor, CreateFloatTensor(27));
-  EXPECT_FALSE(accuracy_eval.WasCalled());
-  EXPECT_FALSE(status.ok());
-}
-
-TEST(EvalPipeline, AccuracyEvalFailureResultsInFailure) {
-  Scope scope = Scope::NewRootScope();
-  // A graph that adds 1 to input.
-  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
-  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  EvalPipeline::Params params;
-  params.model_input_node_name = "input";
-  params.model_output_node_name = "output";
-  NoOpAccuracyEval accuracy_eval(errors::Internal("accuracy_fail"));
-
-  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
-  auto status = eval_pipeline.Run(CreateFloatTensor(5), CreateFloatTensor(27));
-
-  EXPECT_TRUE(accuracy_eval.WasCalled());
-  EXPECT_FALSE(status.ok());
-}
-
-}  // namespace
-
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/file_reader_stage.cc b/tensorflow/lite/tools/accuracy/file_reader_stage.cc
deleted file mode 100644
index a106a79..0000000
--- a/tensorflow/lite/tools/accuracy/file_reader_stage.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
-
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-
-namespace tensorflow {
-namespace metrics {
-void FileReaderStage::AddToGraph(const Scope& scope, const Input& input) {
-  if (!scope.ok()) return;
-  Scope s = scope.WithOpName(name());
-  this->stage_output_ = ops::ReadFile(s.WithOpName(output_name()), input);
-}
-}  //  namespace metrics
-}  //  namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/file_reader_stage.h b/tensorflow/lite/tools/accuracy/file_reader_stage.h
deleted file mode 100644
index 19655e9..0000000
--- a/tensorflow/lite/tools/accuracy/file_reader_stage.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
-
-#include <string>
-
-#include "tensorflow/lite/tools/accuracy/stage.h"
-
-namespace tensorflow {
-namespace metrics {
-// A stage for reading a file into |string|.
-// Inputs: a string tensor: |file_name|.
-// Outputs: a string tensor: contents of |file_name|.
-class FileReaderStage : public Stage {
- public:
-  string name() const override { return "stage_filereader"; }
-  string output_name() const override { return "stage_filereader_output"; }
-
-  void AddToGraph(const Scope& scope, const Input& input) override;
-};
-}  //  namespace metrics
-}  //  namespace tensorflow
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
diff --git a/tensorflow/lite/tools/accuracy/file_reader_stage_test.cc b/tensorflow/lite/tools/accuracy/file_reader_stage_test.cc
deleted file mode 100644
index 21be0a7..0000000
--- a/tensorflow/lite/tools/accuracy/file_reader_stage_test.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdio>
-#include <fstream>
-#include <memory>
-
-#include <gtest/gtest.h>
-#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
-#include "tensorflow/core/public/session.h"
-
-namespace tensorflow {
-namespace metrics {
-namespace {
-
-class TempFile {
- public:
-  TempFile() {
-    string file_path;
-    if (Env::Default()->LocalTempFilename(&file_path)) {
-      file_path_ = file_path;
-      created_ = true;
-    }
-  }
-
-  string filepath() { return file_path_; }
-  bool CreateFileWithContents(const std::string& contents) {
-    if (!created_) {
-      return false;
-    }
-    std::fstream file(file_path_, std::ios_base::out);
-    if (file) {
-      file << contents;
-    }
-    return file.good();
-  }
-
-  ~TempFile() {
-    if (created_) {
-      std::remove(file_path_.c_str());
-    }
-  }
-
- private:
-  bool created_ = false;
-  string file_path_;
-};
-
-TEST(FileReaderStageTest, FileIsRead) {
-  TempFile file;
-  const string kFileContents = "Hello world.";
-  ASSERT_TRUE(file.CreateFileWithContents(kFileContents));
-  Scope scope = Scope::NewRootScope();
-  FileReaderStage reader_stage;
-  reader_stage.AddToGraph(scope, file.filepath());
-  TF_CHECK_OK(scope.status());
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-  std::vector<Tensor> outputs;
-  auto run_status =
-      session->Run({},                               /*inputs*/
-                   {reader_stage.output_name()}, {}, /*target node names */
-                   &outputs);
-  TF_CHECK_OK(run_status);
-  EXPECT_EQ(1, outputs.size());
-  string contents = outputs[0].scalar<string>()();
-  EXPECT_EQ(kFileContents, contents);
-}
-
-TEST(FileReaderStageTest, InvalidFile) {
-  Scope scope = Scope::NewRootScope();
-  FileReaderStage reader_stage;
-  reader_stage.AddToGraph(scope, string("non_existent_file"));
-  TF_CHECK_OK(scope.status());
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-  std::vector<Tensor> outputs;
-  auto run_status =
-      session->Run({},                               /*inputs*/
-                   {reader_stage.output_name()}, {}, /*target node names */
-                   &outputs);
-  EXPECT_FALSE(run_status.ok());
-}
-
-}  // namespace
-
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/BUILD b/tensorflow/lite/tools/accuracy/ilsvrc/BUILD
index ee3a9b2..88162ac 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/BUILD
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/BUILD
@@ -4,9 +4,7 @@
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 common_linkopts = tflite_linkopts() + select({
     "//conditions:default": [],
@@ -17,167 +15,37 @@
 })
 
 cc_library(
-    name = "inception_preprocessing",
-    srcs = ["inception_preprocessing.cc"],
-    hdrs = ["inception_preprocessing.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
-        "//tensorflow/lite/tools/accuracy:stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_tensorflow_image_op",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "inception_preprocessing_test",
-    srcs = ["inception_preprocessing_test.cc"],
-    args = [
-        "--test_image=$(location :testdata/grace_hopper.jpg)",
-    ],
-    data = [":testdata/grace_hopper.jpg"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "no_oss",  # b/114307765
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":inception_preprocessing",
-        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "imagenet_topk_eval",
-    srcs = ["imagenet_topk_eval.cc"],
-    hdrs = ["imagenet_topk_eval.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite/tools/accuracy:accuracy_eval_stage",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-# TODO(b/122597976) Restore portability by avoiding tf_cc_test
-tf_cc_test(
-    name = "imagenet_topk_eval_test",
-    srcs = ["imagenet_topk_eval_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable"],
-    deps = [
-        ":imagenet_topk_eval",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-cc_library(
     name = "imagenet_model_evaluator",
     srcs = ["imagenet_model_evaluator.cc"],
     hdrs = ["imagenet_model_evaluator.h"],
     copts = tflite_copts(),
     deps = [
-        ":imagenet_topk_eval",
-        ":inception_preprocessing",
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/tools:command_line_flags",
         "//tensorflow/lite/tools/accuracy:android_required_build_flags",
-        "//tensorflow/lite/tools/accuracy:eval_pipeline",
-        "//tensorflow/lite/tools/accuracy:eval_pipeline_builder",
-        "//tensorflow/lite/tools/accuracy:file_reader_stage",
-        "//tensorflow/lite/tools/accuracy:run_tflite_model_stage",
-        "//tensorflow/lite/tools/accuracy:utils",
+        "//tensorflow/lite/tools/evaluation:utils",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto",
+        "//tensorflow/lite/tools/evaluation/stages:image_classification_stage",
         "@com_google_absl//absl/memory",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_whole_file_read_ops",
-                "//tensorflow/core/kernels:android_tensorflow_image_op",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:lib_internal",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:core_cpu",
-            ],
-        },
-    ),
+    ],
 )
 
-tf_cc_binary(
+cc_binary(
     name = "imagenet_accuracy_eval",
     srcs = ["imagenet_accuracy_eval.cc"],
     copts = tflite_copts(),
     linkopts = common_linkopts,
     deps = [
         ":imagenet_model_evaluator",
-        ":imagenet_topk_eval",
-        "@com_google_absl//absl/memory",
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/profiling:time",
+        "//tensorflow/lite/tools:command_line_flags",
         "//tensorflow/lite/tools/accuracy:android_required_build_flags",
         "//tensorflow/lite/tools/accuracy:csv_writer",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:lib",
-                "//tensorflow/core:framework_internal",
-            ],
-        },
-    ),
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto",
+        "@com_google_absl//absl/memory",
+    ],
 )
-
-tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
index 28ad2e4..7d53fe7 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/README.md
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
@@ -28,7 +28,7 @@
     `mobilenet_labels.txt` where each label is in the same order as the output
     1001 dimension tensor.
 
-*   `output_path`: `string` \
+*   `output_file_path`: `string` \
     This is the path to the output file. The output is a CSV file that has
     top-10 accuracies in each row. Each line of output file is the cumulative
     accuracy after processing images in a sorted order. So first line is
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
index 090a023..48c2720 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
@@ -15,97 +15,125 @@
 
 #include <iomanip>
 #include <memory>
+#include <mutex>  // NOLINT(build/c++11)
+#include <string>
 
 #include "absl/memory/memory.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/profiling/time.h"
 #include "tensorflow/lite/tools/accuracy/csv_writer.h"
 #include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
-#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
 
 namespace tensorflow {
 namespace metrics {
 
 namespace {
 
-std::vector<double> GetAccuracies(
-    const ImagenetTopKAccuracy::AccuracyStats& accuracy_stats) {
-  std::vector<double> results;
-  results.reserve(accuracy_stats.number_of_images);
-  if (accuracy_stats.number_of_images > 0) {
-    for (int n : accuracy_stats.topk_counts) {
-      double accuracy = 0;
-      if (accuracy_stats.number_of_images > 0) {
-        accuracy = (n * 100.0) / accuracy_stats.number_of_images;
-      }
-      results.push_back(accuracy);
-    }
+using ::tflite::evaluation::TopkAccuracyEvalMetrics;
+
+constexpr char kNumThreadsFlag[] = "num_threads";
+constexpr char kOutputFilePathFlag[] = "output_file_path";
+
+// TODO(b/130823599): Move to tools/evaluation/stages/topk_accuracy_eval_stage.
+// Computes total number of images processed & aggregates Top-K accuracies
+// into 'accuracies'.
+void AggregateAccuraciesAndNumImages(
+    int k,
+    const std::unordered_map<uint64_t, TopkAccuracyEvalMetrics>&
+        shard_id_accuracy_metrics_map,
+    const std::unordered_map<uint64_t, int>& shard_id_done_image_count_map,
+    std::vector<double>* accuracies, int* num_done_images) {
+  // Total images done.
+  *num_done_images = 0;
+  for (auto iter = shard_id_done_image_count_map.begin();
+       iter != shard_id_done_image_count_map.end(); ++iter) {
+    *num_done_images += iter->second;
   }
-  return results;
+
+  // Aggregated accuracies.
+  for (int i = 0; i < k; ++i) {
+    double correct_inferences = 0;
+    double total_inferences = 0;
+    for (auto iter = shard_id_done_image_count_map.begin();
+         iter != shard_id_done_image_count_map.end(); ++iter) {
+      const uint64_t shard_id = iter->first;
+      const TopkAccuracyEvalMetrics& accuracy_metrics =
+          shard_id_accuracy_metrics_map.at(shard_id);
+      const int num_images = iter->second;
+      correct_inferences += num_images * accuracy_metrics.topk_accuracies(i);
+      total_inferences += num_images;
+    }
+    // Convert to percentage.
+    accuracies->push_back(100.0 * correct_inferences / total_inferences);
+  }
 }
 
 }  // namespace
 
-// Writes results to a CSV file.
+// Writes results to a CSV file & logs progress to standard output at most
+// once every `kLogDelayUs` microseconds.
 class ResultsWriter : public ImagenetModelEvaluator::Observer {
  public:
-  explicit ResultsWriter(std::unique_ptr<CSVWriter> writer)
-      : writer_(std::move(writer)) {}
+  explicit ResultsWriter(int k, std::unique_ptr<CSVWriter> writer)
+      : k_(k), writer_(std::move(writer)) {}
 
   void OnEvaluationStart(const std::unordered_map<uint64_t, int>&
-                             shard_id_image_count_map) override {}
-
-  void OnSingleImageEvaluationComplete(
-      uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
-      const string& image) override;
-
- private:
-  std::unique_ptr<CSVWriter> writer_ GUARDED_BY(mu_);
-  mutex mu_;
-};
-
-void ResultsWriter::OnSingleImageEvaluationComplete(
-    uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
-    const string& image) {
-  mutex_lock lock(mu_);
-  TF_CHECK_OK(writer_->WriteRow(GetAccuracies(stats)));
-  writer_->Flush();
-}
-
-// Logs results to standard output with `kLogDelayUs` microseconds.
-class ResultsLogger : public ImagenetModelEvaluator::Observer {
- public:
-  void OnEvaluationStart(const std::unordered_map<uint64_t, int>&
                              shard_id_image_count_map) override;
 
-  void OnSingleImageEvaluationComplete(
-      uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
-      const string& image) override;
+  void OnSingleImageEvaluationComplete(uint64_t shard_id,
+                                       const TopkAccuracyEvalMetrics& metrics,
+                                       const string& image) override;
 
  private:
-  uint64_t last_logged_time_us_ GUARDED_BY(mu_) = 0;
-  int total_num_images_ GUARDED_BY(mu_);
+  // For writing to CSV.
+  int k_;
+  std::unordered_map<uint64_t, TopkAccuracyEvalMetrics>
+      shard_id_accuracy_metrics_map_;
+  std::unordered_map<uint64_t, int> shard_id_done_image_count_map_;
+  std::unique_ptr<CSVWriter> writer_;
+
+  // For logging to stdout.
+  uint64_t last_logged_time_us_ = 0;
+  int total_num_images_;
   static constexpr int kLogDelayUs = 500 * 1000;
-  mutex mu_;
+
+  std::mutex mu_;
 };
 
-void ResultsLogger::OnEvaluationStart(
+void ResultsWriter::OnEvaluationStart(
     const std::unordered_map<uint64_t, int>& shard_id_image_count_map) {
   int total_num_images = 0;
   for (const auto& kv : shard_id_image_count_map) {
     total_num_images += kv.second;
   }
   LOG(ERROR) << "Starting model evaluation: " << total_num_images;
-  mutex_lock lock(mu_);
+  std::lock_guard<std::mutex> lock(mu_);
   total_num_images_ = total_num_images;
 }
 
-void ResultsLogger::OnSingleImageEvaluationComplete(
-    uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
+void ResultsWriter::OnSingleImageEvaluationComplete(
+    uint64_t shard_id,
+    const tflite::evaluation::TopkAccuracyEvalMetrics& metrics,
     const string& image) {
-  auto now_us = Env::Default()->NowMicros();
-  int num_evaluated = stats.number_of_images;
-  mutex_lock lock(mu_);
+  std::lock_guard<std::mutex> lock(mu_);
+  shard_id_done_image_count_map_[shard_id] += 1;
+  shard_id_accuracy_metrics_map_[shard_id] = metrics;
+
+  int num_evaluated;
+  std::vector<double> total_accuracies;
+  AggregateAccuraciesAndNumImages(k_, shard_id_accuracy_metrics_map_,
+                                  shard_id_done_image_count_map_,
+                                  &total_accuracies, &num_evaluated);
+  if (writer_->WriteRow(total_accuracies) != kTfLiteOk) {
+    LOG(ERROR) << "Could not write to file";
+    return;
+  }
+  writer_->Flush();
+
+  auto now_us = tflite::profiling::time::NowMicros();
   if ((now_us - last_logged_time_us_) >= kLogDelayUs) {
     last_logged_time_us_ = now_us;
     double current_percent = num_evaluated * 100.0 / total_num_images_;
@@ -116,44 +144,62 @@
 }
 
+// Entry point of the ImageNet accuracy tool: parses the flags, creates the
+// evaluator and attaches a ResultsWriter backed by a CSVWriter on the output
+// file. Returns 0 on success and 1 on any setup or evaluation failure
+// (presumably forwarded as the process exit code; confirm against main()).
 int Main(int argc, char* argv[]) {
-  // TODO(shashishekhar): Make this binary configurable and model
-  // agnostic.
   string output_file_path;
   int num_threads = 4;
-  std::vector<Flag> flag_list = {
-      Flag("output_file_path", &output_file_path, "Path to output file."),
-      Flag("num_threads", &num_threads, "Number of threads."),
+  std::vector<tflite::Flag> flag_list = {
+      tflite::Flag::CreateFlag(kNumThreadsFlag, &num_threads,
+                               "Number of threads."),
+      tflite::Flag::CreateFlag(kOutputFilePathFlag, &output_file_path,
+                               "Path to output file."),
   };
-  Flags::Parse(&argc, argv, flag_list);
+  tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flag_list);
 
   std::unique_ptr<ImagenetModelEvaluator> evaluator;
-  CHECK(!output_file_path.empty()) << "Invalid output file path.";
+  if (output_file_path.empty()) {
+    LOG(ERROR) << "Invalid output file path.";
+    return 1;
+  }
 
-  CHECK(num_threads > 0) << "Invalid number of threads.";
+  if (num_threads <= 0) {
+    LOG(ERROR) << "Invalid number of threads.";
+    return 1;
+  }
 
-  TF_CHECK_OK(
-      ImagenetModelEvaluator::Create(argc, argv, num_threads, &evaluator));
+  if (ImagenetModelEvaluator::Create(argc, argv, num_threads, &evaluator) !=
+      kTfLiteOk) {
+    LOG(ERROR) << "Could not create the model evaluator.";
+    return 1;
+  }
 
   std::ofstream output_stream(output_file_path, std::ios::out);
-  CHECK(output_stream) << "Unable to open output file path: '"
-                       << output_file_path << "'";
+  if (!output_stream) {
+    LOG(ERROR) << "Unable to open output file path: '" << output_file_path
+               << "'";
+    return 1;
+  }
 
   output_stream << std::setprecision(3) << std::fixed;
   std::vector<string> columns;
   columns.reserve(evaluator->params().num_ranks);
   for (int i = 0; i < evaluator->params().num_ranks; i++) {
-    string column_name = "Top ";
-    tensorflow::strings::StrAppend(&column_name, i + 1);
+    std::string column_name = "Top ";
+    column_name = column_name + std::to_string(i + 1);
     columns.push_back(column_name);
   }
 
   ResultsWriter results_writer(
+      evaluator->params().num_ranks,
       absl::make_unique<CSVWriter>(columns, &output_stream));
-  ResultsLogger logger;
   evaluator->AddObserver(&results_writer);
-  evaluator->AddObserver(&logger);
   LOG(ERROR) << "Starting evaluation with: " << num_threads << " threads.";
-  TF_CHECK_OK(evaluator->EvaluateModel());
+  if (evaluator->EvaluateModel() != kTfLiteOk) {
+    LOG(ERROR) << "Model evaluation failed.";
+    return 1;
+  }
   return 0;
 }
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
index 129747f..ecbd8a7 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -15,43 +15,32 @@
 
 #include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
 
+#include <dirent.h>
+
 #include <fstream>
 #include <iomanip>
+#include <mutex>  // NOLINT(build/c++11)
 #include <string>
+#include <thread>  // NOLINT(build/c++11)
 #include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
-#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
-#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
-#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
-#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
-#include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
-#include "tensorflow/lite/tools/accuracy/utils.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+#include "tensorflow/lite/tools/evaluation/stages/image_classification_stage.h"
+#include "tensorflow/lite/tools/evaluation/utils.h"
 
 namespace {
-using tensorflow::string;
 
-string StripTrailingSlashes(const string& path) {
-  int end = path.size();
-  while (end > 0 && path[end - 1] == '/') {
-    end--;
-  }
-  return path.substr(0, end);
-}
-
-tensorflow::Tensor CreateStringTensor(const string& value) {
-  tensorflow::Tensor tensor(tensorflow::DT_STRING, tensorflow::TensorShape({}));
-  tensor.scalar<string>()() = value;
-  return tensor;
-}
+constexpr char kNumImagesFlag[] = "num_images";
+constexpr char kModelOutputLabelsFlag[] = "model_output_labels";
+constexpr char kGroundTruthImagesPathFlag[] = "ground_truth_images_path";
+constexpr char kGroundTruthLabelsFlag[] = "ground_truth_labels";
+constexpr char kBlacklistFilePathFlag[] = "blacklist_file_path";
+constexpr char kModelFileFlag[] = "model_file";
 
 template <typename T>
 std::vector<T> GetFirstN(const std::vector<T>& v, int n) {
@@ -62,7 +51,9 @@
 
 template <typename T>
 std::vector<std::vector<T>> Split(const std::vector<T>& v, int n) {
-  CHECK_GT(n, 0);
+  if (n <= 0) {
+    return std::vector<std::vector<T>>();
+  }
   std::vector<std::vector<T>> vecs(n);
   int input_index = 0;
   int vec_index = 0;
@@ -71,7 +62,6 @@
     vec_index = (vec_index + 1) % n;
     input_index++;
   }
-  CHECK_EQ(vecs.size(), n);
   return vecs;
 }
 
@@ -90,149 +80,124 @@
 
   void OnEvaluationStart(const std::unordered_map<uint64_t, int>&
                              shard_id_image_count_map) override {
-    mutex_lock lock(mu_);
+    std::lock_guard<std::mutex> lock(mu_);
     for (auto observer : observers_) {
       observer->OnEvaluationStart(shard_id_image_count_map);
     }
   }
 
   void OnSingleImageEvaluationComplete(
-      uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
-      const string& image) override {
-    mutex_lock lock(mu_);
+      uint64_t shard_id,
+      const tflite::evaluation::TopkAccuracyEvalMetrics& metrics,
+      const std::string& image) override {
+    std::lock_guard<std::mutex> lock(mu_);
     for (auto observer : observers_) {
-      observer->OnSingleImageEvaluationComplete(shard_id, stats, image);
+      observer->OnSingleImageEvaluationComplete(shard_id, metrics, image);
     }
   }
 
  private:
-  const std::vector<ImagenetModelEvaluator::Observer*>& observers_
-      GUARDED_BY(mu_);
-  mutex mu_;
+  const std::vector<ImagenetModelEvaluator::Observer*>& observers_;
+  std::mutex mu_;
 };
 
-/*static*/ Status ImagenetModelEvaluator::Create(
+/*static*/ TfLiteStatus ImagenetModelEvaluator::Create(
     int argc, char* argv[], int num_threads,
     std::unique_ptr<ImagenetModelEvaluator>* model_evaluator) {
   Params params;
-  const std::vector<Flag> flag_list = {
-      Flag("model_output_labels", &params.model_output_labels_path,
-           "Path to labels that correspond to output of model."
-           " E.g. in case of mobilenet, this is the path to label "
-           "file where each label is in the same order as the output"
-           " of the model."),
-      Flag("ground_truth_images_path", &params.ground_truth_images_path,
-           "Path to ground truth images."),
-      Flag("ground_truth_labels", &params.ground_truth_labels_path,
-           "Path to ground truth labels."),
-      Flag("num_images", &params.number_of_images,
-           "Number of examples to evaluate, pass 0 for all "
-           "examples. Default: 100"),
-      Flag("blacklist_file_path", &params.blacklist_file_path,
-           "Path to blacklist file (optional)."
-           "Path to blacklist file where each line is a single integer that is "
-           "equal to number of blacklisted image."),
-      Flag("model_file", &params.model_file_path,
-           "Path to test tflite model file."),
-  };
-  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result)
-    return errors::InvalidArgument("Invalid command line flags");
-  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  params.number_of_images = 100;
+  std::vector<tflite::Flag> flag_list = {
+      tflite::Flag::CreateFlag(kNumImagesFlag, &params.number_of_images,
+                               "Number of examples to evaluate, pass 0 for all "
+                               "examples. Default: 100"),
+      tflite::Flag::CreateFlag(
+          kModelOutputLabelsFlag, &params.model_output_labels_path,
 
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      Env::Default()->IsDirectory(params.ground_truth_images_path),
-      "Invalid ground truth data path.");
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      Env::Default()->FileExists(params.ground_truth_labels_path),
-      "Invalid ground truth labels path.");
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      Env::Default()->FileExists(params.model_output_labels_path),
-      "Invalid model output labels path.");
+          "Path to labels that correspond to output of model."
+          " E.g. in case of mobilenet, this is the path to label "
+          "file where each label is in the same order as the output"
+          " of the model."),
+      tflite::Flag::CreateFlag(
+          kGroundTruthImagesPathFlag, &params.ground_truth_images_path,
 
-  if (!params.blacklist_file_path.empty()) {
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        Env::Default()->FileExists(params.blacklist_file_path),
-        "Invalid blacklist path.");
-  }
+          "Path to ground truth images. These will be evaluated in "
+          "alphabetical order of filename"),
+      tflite::Flag::CreateFlag(
+          kGroundTruthLabelsFlag, &params.ground_truth_labels_path,
+          "Path to ground truth labels, corresponding to alphabetical ordering "
+          "of ground truth images."),
+      tflite::Flag::CreateFlag(
+          kBlacklistFilePathFlag, &params.blacklist_file_path,
+          "Path to blacklist file (optional) where each line is a single "
+          "integer that is "
+          "equal to index number of blacklisted image."),
+      tflite::Flag::CreateFlag(kModelFileFlag, &params.model_file_path,
+                               "Path to test tflite model file.")};
+  tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flag_list);
 
   if (params.number_of_images < 0) {
-    return errors::InvalidArgument("Invalid: num_examples");
+    LOG(ERROR) << "Invalid: num_examples";
+    return kTfLiteError;
   }
 
-  utils::ModelInfo model_info;
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      utils::GetTFliteModelInfo(params.model_file_path, &model_info),
-      "Invalid TFLite model.");
-
-  *model_evaluator = absl::make_unique<ImagenetModelEvaluator>(
-      model_info, params, num_threads);
-  return Status::OK();
+  *model_evaluator =
+      absl::make_unique<ImagenetModelEvaluator>(params, num_threads);
+  return kTfLiteOk;
 }
 
 struct ImageLabel {
-  string image;
-  string label;
+  std::string image;
+  std::string label;
 };
 
-Status EvaluateModelForShard(const uint64_t shard_id,
-                             const std::vector<ImageLabel>& image_labels,
-                             const std::vector<string>& model_labels,
-                             const utils::ModelInfo& model_info,
-                             const ImagenetModelEvaluator::Params& params,
-                             ImagenetModelEvaluator::Observer* observer,
-                             ImagenetTopKAccuracy* eval) {
-  const TensorShape& input_shape = model_info.input_shapes[0];
-  const int image_height = input_shape.dim_size(1);
-  const int image_width = input_shape.dim_size(2);
+TfLiteStatus EvaluateModelForShard(const uint64_t shard_id,
+                                   const std::vector<ImageLabel>& image_labels,
+                                   const std::vector<std::string>& model_labels,
+                                   const ImagenetModelEvaluator::Params& params,
+                                   ImagenetModelEvaluator::Observer* observer,
+                                   int num_ranks) {
+  tflite::evaluation::EvaluationStageConfig eval_config;
+  eval_config.set_name("image_classification");
+  auto* classification_params = eval_config.mutable_specification()
+                                    ->mutable_image_classification_params();
+  auto* inference_params = classification_params->mutable_inference_params();
+  inference_params->set_model_file_path(params.model_file_path);
+  classification_params->mutable_topk_accuracy_eval_params()->set_k(num_ranks);
 
-  RunTFLiteModelStage::Params tfl_model_params;
-  tfl_model_params.model_file_path = params.model_file_path;
-
-  tfl_model_params.input_type = {model_info.input_types[0]};
-  tfl_model_params.output_type = {model_info.input_types[0]};
-
-  Scope root = Scope::NewRootScope();
-  FileReaderStage reader;
-  InceptionPreprocessingStage inc(image_height, image_width,
-                                  model_info.input_types[0]);
-  RunTFLiteModelStage tfl_model_stage(tfl_model_params);
-  EvalPipelineBuilder builder;
-
-  std::unique_ptr<EvalPipeline> eval_pipeline;
-
-  auto build_status = builder.WithInputStage(&reader)
-                          .WithPreprocessingStage(&inc)
-                          .WithRunModelStage(&tfl_model_stage)
-                          .WithAccuracyEval(eval)
-                          .WithInput("input_file", DT_STRING)
-                          .Build(root, &eval_pipeline);
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(build_status,
-                                  "Failure while building eval pipeline.");
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-
-  TF_RETURN_IF_ERROR(eval_pipeline->AttachSession(std::move(session)));
+  tflite::evaluation::ImageClassificationStage eval(eval_config);
+  eval.SetAllLabels(model_labels);
+  TF_LITE_ENSURE_STATUS(eval.Init());
 
   for (const auto& image_label : image_labels) {
-    TF_CHECK_OK(eval_pipeline->Run(CreateStringTensor(image_label.image),
-                                   CreateStringTensor(image_label.label)));
+    eval.SetInputs(image_label.image, image_label.label);
+
+    TF_LITE_ENSURE_STATUS(eval.Run());
     observer->OnSingleImageEvaluationComplete(
-        shard_id, eval->GetTopKAccuracySoFar(), image_label.image);
+        shard_id,
+        eval.LatestMetrics()
+            .process_metrics()
+            .image_classification_metrics()
+            .topk_accuracy_metrics(),
+        image_label.image);
   }
-  return Status::OK();
+  return kTfLiteOk;
 }
 
-Status FilterBlackListedImages(const string& blacklist_file_path,
-                               std::vector<ImageLabel>* image_labels) {
+// TODO(b/130823599): Move to tools/evaluation/utils.
+TfLiteStatus FilterBlackListedImages(const std::string& blacklist_file_path,
+                                     std::vector<ImageLabel>* image_labels) {
   if (!blacklist_file_path.empty()) {
-    std::vector<string> lines;
-    TF_RETURN_IF_ERROR(utils::ReadFileLines(blacklist_file_path, &lines));
+    std::vector<std::string> lines;
+    if (!tflite::evaluation::ReadFileLines(blacklist_file_path, &lines)) {
+      LOG(ERROR) << "Could not read: " << blacklist_file_path;
+      return kTfLiteError;
+    }
     std::vector<int> blacklist_ids;
     blacklist_ids.reserve(lines.size());
     // Populate blacklist_ids with indices of images.
     std::transform(lines.begin(), lines.end(),
                    std::back_inserter(blacklist_ids),
-                   [](const string& val) { return std::stoi(val) - 1; });
+                   [](const std::string& val) { return std::stoi(val) - 1; });
 
     std::vector<ImageLabel> filtered_images;
     std::sort(blacklist_ids.begin(), blacklist_ids.end());
@@ -251,38 +216,29 @@
     }
 
     if (filtered_images.size() != size_post_filtering) {
-      return errors::Internal("Invalid number of filtered images");
+      LOG(ERROR) << "Invalid number of filtered images";
+      return kTfLiteError;
     }
     *image_labels = filtered_images;
   }
-  return Status::OK();
+  return kTfLiteOk;
 }
 
-Status ImagenetModelEvaluator::EvaluateModel() const {
-  if (model_info_.input_shapes.size() != 1) {
-    return errors::InvalidArgument("Invalid input shape");
-  }
-
-  const TensorShape& input_shape = model_info_.input_shapes[0];
-  // Input should be of the shape {1, height, width, 3}
-  if (input_shape.dims() != 4 || input_shape.dim_size(3) != 3) {
-    return errors::InvalidArgument("Invalid input shape for the model.");
-  }
-
-  string data_path =
-      StripTrailingSlashes(params_.ground_truth_images_path) + "/";
-
-  const string imagenet_file_pattern = data_path + kImagenetFilePattern;
-  std::vector<string> image_files;
-  TF_CHECK_OK(
-      Env::Default()->GetMatchingPaths(imagenet_file_pattern, &image_files));
+TfLiteStatus ImagenetModelEvaluator::EvaluateModel() const {
+  const std::string data_path = tflite::evaluation::StripTrailingSlashes(
+                                    params_.ground_truth_images_path) +
+                                "/";
+  std::vector<std::string> image_files;
+  TF_LITE_ENSURE_STATUS(
+      tflite::evaluation::GetSortedFileNames(data_path, &image_files));
   std::vector<string> ground_truth_image_labels;
-  TF_CHECK_OK(utils::ReadFileLines(params_.ground_truth_labels_path,
-                                   &ground_truth_image_labels));
-  CHECK_EQ(image_files.size(), ground_truth_image_labels.size());
-
-  // Process files in filename sorted order.
-  std::sort(image_files.begin(), image_files.end());
+  if (!tflite::evaluation::ReadFileLines(params_.ground_truth_labels_path,
+                                         &ground_truth_image_labels))
+    return kTfLiteError;
+  if (image_files.size() != ground_truth_image_labels.size()) {
+    LOG(ERROR) << "Images and ground truth labels don't match";
+    return kTfLiteError;
+  }
 
   std::vector<ImageLabel> image_labels;
   image_labels.reserve(image_files.size());
@@ -291,56 +247,72 @@
   }
 
   // Filter any blacklisted images.
-  TF_CHECK_OK(
-      FilterBlackListedImages(params_.blacklist_file_path, &image_labels));
+  if (FilterBlackListedImages(params_.blacklist_file_path, &image_labels) !=
+      kTfLiteOk) {
+    LOG(ERROR) << "Could not filter by blacklist";
+    return kTfLiteError;
+  }
 
   if (params_.number_of_images > 0) {
     image_labels = GetFirstN(image_labels, params_.number_of_images);
   }
 
   std::vector<string> model_labels;
-  TF_RETURN_IF_ERROR(
-      utils::ReadFileLines(params_.model_output_labels_path, &model_labels));
-  if (model_labels.size() != 1001) {
-    return errors::InvalidArgument("Invalid number of labels: ",
-                                   model_labels.size());
+  if (!tflite::evaluation::ReadFileLines(params_.model_output_labels_path,
+                                         &model_labels)) {
+    LOG(ERROR) << "Could not read: " << params_.model_output_labels_path;
+    return kTfLiteError;
   }
-
-  ImagenetTopKAccuracy eval(model_labels, params_.num_ranks);
+  if (model_labels.size() != 1001) {
+    LOG(ERROR) << "Invalid number of labels: " << model_labels.size();
+    return kTfLiteError;
+  }
 
   auto img_labels = Split(image_labels, num_threads_);
 
-  BlockingCounter counter(num_threads_);
-
   CompositeObserver observer(observers_);
 
-  ::tensorflow::thread::ThreadPool pool(Env::Default(), "evaluation_pool",
-                                        num_threads_);
+  // Tally the images owned by each shard before any worker is started.
   std::unordered_map<uint64_t, int> shard_id_image_count_map;
-  std::vector<std::function<void()>> thread_funcs;
-  thread_funcs.reserve(num_threads_);
   for (int i = 0; i < num_threads_; i++) {
     const auto& image_label = img_labels[i];
     const uint64_t shard_id = i + 1;
     shard_id_image_count_map[shard_id] = image_label.size();
-    auto func = [shard_id, &image_label, &model_labels, this, &observer, &eval,
-                 &counter]() {
-      TF_CHECK_OK(EvaluateModelForShard(shard_id, image_label, model_labels,
-                                        model_info_, params_, &observer,
-                                        &eval));
-      counter.DecrementCount();
-    };
-    thread_funcs.push_back(func);
   }
 
+  // Announce the start first so observers know the per-shard totals before
+  // the first per-image callback can arrive from a worker thread.
   observer.OnEvaluationStart(shard_id_image_count_map);
-  for (const auto& func : thread_funcs) {
-    pool.Schedule(func);
+
+  // One status slot per shard: each worker writes only its own element, so
+  // no extra locking is needed, and join() makes the writes visible here.
+  std::vector<TfLiteStatus> shard_status(img_labels.size(), kTfLiteOk);
+  std::vector<std::thread> thread_pool;
+  thread_pool.reserve(img_labels.size());
+  for (int i = 0; i < num_threads_; i++) {
+    const auto& shard_images = img_labels[i];
+    const uint64_t shard_id = i + 1;
+    TfLiteStatus* status = &shard_status[i];
+    thread_pool.push_back(std::thread(
+        [shard_id, &shard_images, &model_labels, this, &observer, status]() {
+          *status =
+              EvaluateModelForShard(shard_id, shard_images, model_labels,
+                                    params_, &observer, params_.num_ranks);
+        }));
+  }
+
+  for (auto& thread : thread_pool) {
+    thread.join();
   }
 
-  counter.Wait();
-
-  return Status::OK();
+  // Report failure if any shard failed instead of always returning success.
+  for (size_t i = 0; i < shard_status.size(); ++i) {
+    if (shard_status[i] != kTfLiteOk) {
+      LOG(ERROR) << "Evaluation failed for shard: " << (i + 1);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
 }
 
 }  // namespace metrics
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
index c3c49e9..3b8d193 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
@@ -13,15 +13,14 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_MODEL_EVALUATOR_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_MODEL_EVALUATOR_H_
+
 #include <string>
 #include <vector>
 
-#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -42,26 +41,26 @@
  public:
   struct Params {
     // Path to ground truth images.
-    string ground_truth_images_path;
+    std::string ground_truth_images_path;
 
     // Path to labels file for ground truth image.
     // This file should be generated with the scripts.
-    string ground_truth_labels_path;
+    std::string ground_truth_labels_path;
 
     // This is word labels generated by the model. The category
     // indices of output probabilities generated by the model maybe different
     // from the indices in the imagenet dataset.
-    string model_output_labels_path;
+    std::string model_output_labels_path;
 
     // Path to the model file.
-    string model_file_path;
+    std::string model_file_path;
 
     // Path to black list file. 1762 images were blacklisted from
     // original ILSVRC dataset. This black list file is present in
     // ILSVRC2014 devkit. Please refer to readme.txt of the ILSVRC2014
     // devkit for details.
     // This file is a list of image indices in a sorted order.
-    string blacklist_file_path;
+    std::string blacklist_file_path;
 
     // The maximum number of images to calculate accuracy.
     // 0 means all images, a positive number means only the specified
@@ -90,19 +89,20 @@
 
     // Called when evaluation was complete for `image`.
     virtual void OnSingleImageEvaluationComplete(
-        uint64_t shard_id, const ImagenetTopKAccuracy::AccuracyStats& stats,
+        uint64_t shard_id,
+        const tflite::evaluation::TopkAccuracyEvalMetrics& metrics,
         const string& image) = 0;
 
     virtual ~Observer() = default;
   };
 
-  ImagenetModelEvaluator(const utils::ModelInfo& model_info,
-                         const Params& params, const int num_threads)
-      : model_info_(model_info), params_(params), num_threads_(num_threads) {}
+  ImagenetModelEvaluator(const Params& params, const int num_threads)
+      : params_(params), num_threads_(num_threads) {}
 
   // Factory method to create the evaluator by parsing command line arguments.
-  static Status Create(int argc, char* argv[], int num_threads,
-                       std::unique_ptr<ImagenetModelEvaluator>* evaluator);
+  static TfLiteStatus Create(
+      int argc, char* argv[], int num_threads,
+      std::unique_ptr<ImagenetModelEvaluator>* evaluator);
 
   // Adds an observer that can observe evaluation events..
   void AddObserver(Observer* observer) { observers_.push_back(observer); }
@@ -110,10 +110,9 @@
   const Params& params() const { return params_; }
 
   // Evaluates the provided model over the dataset.
-  Status EvaluateModel() const;
+  TfLiteStatus EvaluateModel() const;
 
  private:
-  const utils::ModelInfo model_info_;
   const Params params_;
   const int num_threads_;
   std::vector<Observer*> observers_;
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
deleted file mode 100644
index f5642d5..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-
-#include <numeric>
-
-namespace {
-constexpr int kNumCategories = 1001;
-std::vector<int> GetTopK(const std::vector<float>& values, int k) {
-  CHECK_LE(k, values.size());
-  std::vector<int> indices(values.size());
-
-  std::iota(indices.begin(), indices.end(), 0);
-  std::sort(indices.begin(), indices.end(),
-            [&values](int a, int b) { return values[a] > values[b]; });
-
-  indices.resize(k);
-  return indices;
-}
-}  // namespace
-
-namespace tensorflow {
-namespace metrics {
-ImagenetTopKAccuracy::ImagenetTopKAccuracy(
-    const std::vector<string>& ground_truth_labels, int k)
-    : ground_truth_labels_(ground_truth_labels),
-      k_(k),
-      accuracy_counts_(k_, 0),
-      num_samples_(0) {
-  CHECK_EQ(kNumCategories, ground_truth_labels.size());
-}
-
-Status ImagenetTopKAccuracy::ComputeEval(
-    const std::vector<Tensor>& model_outputs, const Tensor& ground_truth) {
-  if (model_outputs.size() != 1) {
-    return errors::InvalidArgument("Invalid model output: ",
-                                   model_outputs.size());
-  }
-  const Tensor& output = model_outputs[0];
-  if (!output.shape().IsSameSize({1, kNumCategories})) {
-    return errors::InvalidArgument("Invalid shape of model output: ",
-                                   output.shape().DebugString());
-  }
-  if (ground_truth.dtype() != DT_STRING && ground_truth.dims() != 0) {
-    return errors::InvalidArgument("Invalid ground truth type: ",
-                                   ground_truth.DebugString());
-  }
-  string ground_truth_label = ground_truth.scalar<string>()();
-
-  std::vector<float> probabilities;
-  probabilities.reserve(kNumCategories);
-  if (output.dtype() == DT_FLOAT) {
-    auto probs = output.flat<float>();
-    for (size_t i = 0; i < probs.size(); i++) {
-      probabilities.push_back(probs(i));
-    }
-  } else if (output.dtype() == DT_UINT8) {
-    auto probs = output.flat<uint8>();
-    for (size_t i = 0; i < probs.size(); i++) {
-      probabilities.push_back(probs(i));
-    }
-  } else if (output.dtype() == DT_INT8) {
-    auto probs = output.flat<int8>();
-    for (size_t i = 0; i < probs.size(); i++) {
-      probabilities.push_back(probs(i));
-    }
-  } else {
-    return errors::InvalidArgument("Invalid datatype");
-  }
-
-  CHECK_EQ(kNumCategories, probabilities.size());
-  std::vector<int> topK = GetTopK(probabilities, k_);
-  int ground_truth_index = GroundTruthIndex(ground_truth_label);
-  UpdateSamples(topK, ground_truth_index);
-  return Status::OK();
-}
-
-const ImagenetTopKAccuracy::AccuracyStats
-ImagenetTopKAccuracy::GetTopKAccuracySoFar() const {
-  mutex_lock lock(mu_);
-  AccuracyStats stats;
-  stats.number_of_images = num_samples_;
-  stats.topk_counts = accuracy_counts_;
-  return stats;
-}
-
-void ImagenetTopKAccuracy::UpdateSamples(const std::vector<int>& counts,
-                                         int ground_truth_index) {
-  mutex_lock lock(mu_);
-  for (size_t i = 0; i < counts.size(); ++i) {
-    if (ground_truth_index == counts[i]) {
-      for (size_t j = i; j < counts.size(); j++) {
-        accuracy_counts_[j] += 1;
-      }
-      break;
-    }
-  }
-  num_samples_++;
-}
-
-int ImagenetTopKAccuracy::GroundTruthIndex(const string& label) const {
-  auto index = std::find(ground_truth_labels_.cbegin(),
-                         ground_truth_labels_.cend(), label);
-  CHECK(index != ground_truth_labels_.end()) << "Invalid label: " << label;
-  return std::distance(ground_truth_labels_.cbegin(), index);
-}
-}  //  namespace metrics
-}  //  namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
deleted file mode 100644
index e1fc445..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
-
-#include <string>
-#include <vector>
-
-#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace metrics {
-// An |AccuracyEval| stage that calculates the top K error rate for model
-// evaluations on imagenet like datasets.
-// Inputs: A {1, 1001} shaped tensor that contains the probabilities for objects
-// predicted by the model.
-// Ground truth: A |string| label for the image.
-// From the input object probabilities, the stage computes the predicted labels
-// and finds the top K error rates by comparing the predictions with ground
-// truths.
-class ImagenetTopKAccuracy : public AccuracyEval {
- public:
-  // Accuracy statistics.
-  struct AccuracyStats {
-    // Number of images evaluated.
-    int number_of_images;
-    // A vector of size |k| that contains the number of images
-    // that have correct labels in top K.
-    // E.g. topk_counts[0] contains number of images for which
-    // model returned the correct label as the first result.
-    // Similarly topk_counts[4] contains the number of images for which
-    // model returned the correct label in top 5 results.
-    // This can be used to compute the top K error-rate for the model.
-    std::vector<int> topk_counts;
-  };
-
-  // Creates a new instance of |ImagenetTopKAccuracy| with the given
-  // |ground_truth_labels| and |k|.
-  // Args:
-  // |ground_truth_labels| : an ordered vector of labels for images. This is
-  // used to compute the index for the predicted labels and ground_truth label.
-  ImagenetTopKAccuracy(const std::vector<string>& ground_truth_labels, int k);
-
-  // Computes accuracy for a given  image. The |model_outputs| should
-  // be a vector containing exactly one Tensor of shape: {1, 1001} where each
-  // item is a probability of the predicted object representing the image as
-  // output by the model.
-  // Uses |ground_truth_labels| to compute the index of |model_outputs| and
-  // |ground_truth| and computes the top K error rate.
-  Status ComputeEval(const std::vector<Tensor>& model_outputs,
-                     const Tensor& ground_truth) override;
-
-  // Gets the topK accuracy for images that have been evaluated till now.
-  const AccuracyStats GetTopKAccuracySoFar() const;
-
- private:
-  int GroundTruthIndex(const string& label) const;
-  void UpdateSamples(const std::vector<int>& counts, int ground_truth_index);
-  const std::vector<string> ground_truth_labels_;
-  const int k_;
-  std::vector<int> accuracy_counts_ GUARDED_BY(mu_);
-  int num_samples_ GUARDED_BY(mu_);
-  mutable mutex mu_;
-};
-}  //  namespace metrics
-}  //  namespace tensorflow
-
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_TOPK_EVAL_H_
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
deleted file mode 100644
index 61b7afc..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include <gtest/gtest.h>
-
-namespace tensorflow {
-namespace metrics {
-namespace {
-
-const int kNumCategories = 1001;
-
-Tensor CreateStringTensor(const string& value) {
-  Tensor tensor(DT_STRING, TensorShape({}));
-  tensor.scalar<string>()() = value;
-  return tensor;
-}
-
-Tensor CreateOutputTensor() {
-  Tensor tensor(DT_FLOAT, TensorShape({1, kNumCategories}));
-  for (int i = 0; i < kNumCategories; i++) {
-    tensor.flat<float>()(i) = 0;
-  }
-  return tensor;
-}
-
-std::vector<string> CreateGroundTruth() {
-  std::vector<string> ground_truth;
-  ground_truth.reserve(kNumCategories);
-  for (int i = 0; i < kNumCategories; i++) {
-    string category;
-    strings::StrAppend(&category, i);
-    ground_truth.push_back(category);
-  }
-  return ground_truth;
-}
-
-TEST(ImagenetTopKAccuracy, AllCorrect) {
-  ImagenetTopKAccuracy acc_top_5(CreateGroundTruth(), 5);
-  auto accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(0, accuracies.number_of_images);
-  EXPECT_EQ(5, accuracies.topk_counts.size());
-
-  for (int i : accuracies.topk_counts) {
-    EXPECT_EQ(0, i);
-  }
-  // First image was correctly identified as "0".
-  Tensor tensor = CreateOutputTensor();
-  tensor.flat<float>()(0) = 0.8;
-
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("0")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(1, accuracies.number_of_images);
-
-  for (int i : accuracies.topk_counts) {
-    EXPECT_EQ(1, i);
-  }
-  tensor.flat<float>()(1) = 0.9;
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("1")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(2, accuracies.number_of_images);
-
-  for (int i : accuracies.topk_counts) {
-    EXPECT_EQ(2, i);
-  }
-}
-
-TEST(ImagenetTopKAccuracy, Top5) {
-  ImagenetTopKAccuracy acc_top_5(CreateGroundTruth(), 5);
-  auto accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(0, accuracies.number_of_images);
-  EXPECT_EQ(5, accuracies.topk_counts.size());
-
-  // For first image, with ground truth "0" probabilities were
-  // 0.5 for "0",
-  // "0.6" for 1,
-  // "0.7" for 2,
-  // "0.8" for 3,
-  // "0.9" for 4.
-  // remaining all zeroes.
-
-  // First image was correctly identified as "0".
-  Tensor tensor = CreateOutputTensor();
-  tensor.flat<float>()(0) = 0.5;
-  tensor.flat<float>()(1) = 0.6;
-  tensor.flat<float>()(2) = 0.7;
-  tensor.flat<float>()(3) = 0.8;
-  tensor.flat<float>()(4) = 0.9;
-
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("0")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(1, accuracies.number_of_images);
-  EXPECT_EQ(1, accuracies.topk_counts[4]);
-
-  for (int i = 0; i < 4; i++) {
-    EXPECT_EQ(0, accuracies.topk_counts[i]);
-  }
-
-  // Now for "1" only last two buckets are going to be affected.
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("1")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(2, accuracies.number_of_images);
-  EXPECT_EQ(1, accuracies.topk_counts[3]);
-  EXPECT_EQ(2, accuracies.topk_counts[4]);
-  for (int i = 0; i < 3; i++) {
-    EXPECT_EQ(0, accuracies.topk_counts[i]);
-  }
-
-  // All buckets will be affected.
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("4")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(3, accuracies.number_of_images);
-  EXPECT_EQ(1, accuracies.topk_counts[0]);
-  EXPECT_EQ(1, accuracies.topk_counts[1]);
-  EXPECT_EQ(1, accuracies.topk_counts[2]);
-  EXPECT_EQ(2, accuracies.topk_counts[3]);
-  EXPECT_EQ(3, accuracies.topk_counts[4]);
-
-  // No buckets will be affected
-  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("10")));
-  accuracies = acc_top_5.GetTopKAccuracySoFar();
-  EXPECT_EQ(4, accuracies.number_of_images);
-  EXPECT_EQ(1, accuracies.topk_counts[0]);
-  EXPECT_EQ(1, accuracies.topk_counts[1]);
-  EXPECT_EQ(1, accuracies.topk_counts[2]);
-  EXPECT_EQ(2, accuracies.topk_counts[3]);
-  EXPECT_EQ(3, accuracies.topk_counts[4]);
-}
-
-}  // namespace
-
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
deleted file mode 100644
index b730b08..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
-
-#include <memory>
-
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/public/session.h"
-
-namespace tensorflow {
-namespace metrics {
-
-namespace {
-void CentralCropImage(const Scope& s, const tensorflow::Output& decoded_image,
-                      double crop_fraction, tensorflow::Output* cropped_image) {
-  auto image_dims = ops::Slice(s, ops::Shape(s, decoded_image), {0}, {2});
-  auto height_width = ops::Cast(s, image_dims, DT_DOUBLE);
-  auto cropped_begin = ops::Div(
-      s, ops::Sub(s, height_width, ops::Mul(s, height_width, crop_fraction)),
-      2.0);
-  auto bbox_begin = ops::Cast(s, cropped_begin, DT_INT32);
-  auto bbox_size = ops::Sub(s, image_dims, ops::Mul(s, bbox_begin, 2));
-  auto slice_begin = ops::Concat(s, {bbox_begin, Input({0})}, 0);
-  auto slice_size = ops::Concat(s, {bbox_size, {-1}}, 0);
-  *cropped_image = ops::Slice(s, decoded_image, slice_begin, slice_size);
-}
-
-}  // namespace
-
-void InceptionPreprocessingStage::AddToGraph(const Scope& scope,
-                                             const Input& input) {
-  if (!scope.ok()) return;
-  Scope s = scope.WithOpName(name());
-  ops::DecodeJpeg::Attrs attrs;
-  attrs.channels_ = 3;
-  auto decoded_jpeg = ops::DecodeJpeg(s, input, attrs);
-  tensorflow::Output cropped_image;
-  CentralCropImage(s, decoded_jpeg, params_.cropping_fraction, &cropped_image);
-  auto dims_expander = ops::ExpandDims(s, cropped_image, 0);
-  auto resized_image =
-      ops::ResizeBilinear(s.WithOpName("resize"), dims_expander,
-                          ops::Const(s, {image_height_, image_width_}));
-
-  ::tensorflow::Output preprocessed_image = resized_image;
-
-  if (!params_.input_means.empty()) {
-    preprocessed_image =
-        ops::Sub(s.WithOpName("sub"), preprocessed_image,
-                 {params_.input_means[0], params_.input_means[1],
-                  params_.input_means[2]});
-  }
-
-  if (std::abs(params_.scale) > 1e-7f) {
-    auto squeezed_image = ops::Squeeze(s, preprocessed_image);
-    preprocessed_image = ops::Div(s, squeezed_image, {params_.scale});
-    preprocessed_image = ops::ExpandDims(s, preprocessed_image, {0});
-  }
-
-  // Cast the output from float to output datatype.
-  if (output_datatype_ != DT_FLOAT) {
-    preprocessed_image =
-        ops::Cast(s.WithOpName("cast"), preprocessed_image, output_datatype_);
-  }
-
-  this->stage_output_ =
-      ops::Identity(s.WithOpName(output_name()), preprocessed_image);
-}
-
-}  // namespace metrics
-}  // namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
deleted file mode 100644
index 371feb3..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
-
-#include <utility>
-
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/lite/tools/accuracy/stage.h"
-
-namespace tensorflow {
-namespace metrics {
-
-// A stage that does inception preprocessing.
-// Inputs: A tensor containing bytes of a JPEG image.
-// Outputs: A tensor containing rescaled and preprocessed image that has
-// shape {1, image_height, image_width, 3}, where 3 is the number of channels.
-class InceptionPreprocessingStage : public Stage {
- public:
-  // Preprocessing params that govern scaling and normalization of channels of
-  // the image.
-  struct Params {
-    // Input means are subtracted from each channel.
-    // In case of an empty vector this is skipped.
-    std::vector<float> input_means;
-    // Scale is used to divide the input.
-    // A scale of 0 means divison is skipped.
-    float scale;
-    double cropping_fraction;
-  };
-
-  // Default preprocessing for inception stage based on |output_type|
-  static Params DefaultParamsForType(DataType output_type) {
-    const float kCroppingFraction = 0.875;
-    Params params = {};
-    params.cropping_fraction = kCroppingFraction;
-    if (output_type == DT_UINT8) {
-    } else if (output_type == DT_INT8) {
-      params.input_means = {128.0, 128.0, 128.0};
-    } else {
-      // Assume floating point preprocessing.
-      params.input_means = {127.5, 127.5, 127.5};
-      params.scale = 127.5;
-    }
-    return params;
-  }
-
-  // Creates a new preprocessing stage object with provided |image_width|
-  // |image_height| as the size of output image.
-  // |output_datatype| is the datatype of output of the stage.
-  InceptionPreprocessingStage(int image_width, int image_height,
-                              DataType output_datatype)
-      : output_datatype_(output_datatype),
-        image_width_(image_width),
-        image_height_(image_height) {
-    params_ = DefaultParamsForType(output_datatype);
-  }
-
-  // Creates a new preprocessing stage object with provided |image_width|
-  // |image_height| as the size of output image.
-  // |output_datatype| is the datatype of output of the stage.
-  InceptionPreprocessingStage(int image_width, int image_height,
-                              DataType output_datatype, Params params)
-      : output_datatype_(output_datatype),
-        image_width_(image_width),
-        image_height_(image_height),
-        params_(std::move(params)) {}
-
-  string name() const override { return "stage_inception_preprocess"; }
-  string output_name() const override {
-    return "stage_inception_preprocess_output";
-  }
-
-  void AddToGraph(const Scope& scope, const Input& input) override;
-
- private:
-  DataType output_datatype_;
-  int image_width_;
-  int image_height_;
-  bool is_quantized_;
-  Params params_;
-};
-
-}  // namespace metrics
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
deleted file mode 100644
index f888470..0000000
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <fstream>
-#include <string>
-
-#include <gtest/gtest.h>
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
-
-namespace {
-tensorflow::string* g_test_image_file = nullptr;
-}  // namespace
-
-namespace tensorflow {
-namespace metrics {
-
-namespace {
-
-using tensorflow::Status;
-using tensorflow::Tensor;
-
-Status GetContents(const string& filename, string* output) {
-  std::ifstream input(filename, std::ios::binary);
-  const int kBufferSize = 2048;
-  char buffer[kBufferSize];
-  while (true) {
-    input.read(buffer, kBufferSize);
-    output->append(buffer, input.gcount());
-    if (!input.good()) {
-      if (input.eof()) return Status::OK();
-      return Status(tensorflow::error::ABORTED, "Failed to read file.");
-    }
-  }
-}
-
-TEST(InceptionPreprocessingTest, TestImagePreprocessUInt8Quantized) {
-  ASSERT_TRUE(g_test_image_file != nullptr);
-  string image_contents;
-  string image_path = *g_test_image_file;
-  auto status = GetContents(image_path, &image_contents);
-  ASSERT_TRUE(status.ok()) << status.error_message();
-  const int width = 224;
-  const int height = 224;
-  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_UINT8);
-  InceptionPreprocessingStage preprocess_stage(width, height, DT_UINT8, params);
-  Scope scope = Scope::NewRootScope();
-  preprocess_stage.AddToGraph(scope, image_contents);
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-  std::vector<Tensor> outputs;
-  auto run_status =
-      session->Run({},                                   /*inputs*/
-                   {preprocess_stage.output_name()}, {}, /*target node names */
-                   &outputs);
-  TF_CHECK_OK(run_status);
-  EXPECT_EQ(1, outputs.size());
-  EXPECT_EQ(DT_UINT8, outputs[0].dtype());
-  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
-}
-
-TEST(InceptionPreprocessingTest, TestImagePreprocessInt8Quantized) {
-  ASSERT_TRUE(g_test_image_file != nullptr);
-  string image_contents;
-  string image_path = *g_test_image_file;
-  auto status = GetContents(image_path, &image_contents);
-  ASSERT_TRUE(status.ok()) << status.error_message();
-  const int width = 224;
-  const int height = 224;
-  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_INT8);
-  InceptionPreprocessingStage preprocess_stage(width, height, DT_INT8, params);
-  Scope scope = Scope::NewRootScope();
-  preprocess_stage.AddToGraph(scope, image_contents);
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-  std::vector<Tensor> outputs;
-  auto run_status =
-      session->Run({},                                   /*inputs*/
-                   {preprocess_stage.output_name()}, {}, /*target node names */
-                   &outputs);
-  TF_CHECK_OK(run_status);
-  EXPECT_EQ(1, outputs.size());
-  EXPECT_EQ(DT_INT8, outputs[0].dtype());
-  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
-}
-
-TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
-  ASSERT_TRUE(g_test_image_file != nullptr);
-  string image_contents;
-  string image_path = *g_test_image_file;
-  auto status = GetContents(image_path, &image_contents);
-  ASSERT_TRUE(status.ok()) << status.error_message();
-  const int width = 224;
-  const int height = 224;
-  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_FLOAT);
-  InceptionPreprocessingStage preprocess_stage(width, height, DT_FLOAT, params);
-  Scope scope = Scope::NewRootScope();
-  preprocess_stage.AddToGraph(scope, image_contents);
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-  std::vector<Tensor> outputs;
-  auto run_status =
-      session->Run({},                                   /*inputs*/
-                   {preprocess_stage.output_name()}, {}, /*target node names */
-                   &outputs);
-  TF_CHECK_OK(run_status);
-  EXPECT_EQ(1, outputs.size());
-  EXPECT_EQ(DT_FLOAT, outputs[0].dtype());
-  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
-}
-
-}  // namespace
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  g_test_image_file = new tensorflow::string();
-  const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("test_image", g_test_image_file,
-                       "Path to image file for test."),
-  };
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  CHECK(parse_result) << "Required test_model_file";
-  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/run_tflite_model_op.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_op.cc
deleted file mode 100644
index 5f413b8..0000000
--- a/tensorflow/lite/tools/accuracy/run_tflite_model_op.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/model.h"
-#include "tensorflow/lite/op_resolver.h"
-#include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
-namespace tensorflow {
-
-namespace {
-Status ValidateInputsMatch(const OpInputList& input_tensors,
-                           const tflite::Interpreter& interpreter) {
-  std::vector<int> tflite_tensor_indices = interpreter.inputs();
-  if (tflite_tensor_indices.size() != input_tensors.size()) {
-    return errors::InvalidArgument(
-        "size mismatch, interpreter size: ", tflite_tensor_indices.size(),
-        " actual: ", input_tensors.size());
-  }
-
-  for (int i = 0; i < input_tensors.size(); i++) {
-    const TfLiteTensor* tflite_tensor =
-        interpreter.tensor(tflite_tensor_indices[i]);
-    if (tflite_tensor == nullptr) {
-      return errors::InvalidArgument("Tensor is null at index: ", i);
-    }
-
-    const Tensor& tensor = input_tensors[i];
-    auto i_type = metrics::utils::GetTFDataType(tflite_tensor->type);
-    auto i_shape = metrics::utils::GetTFLiteTensorShape(*tflite_tensor);
-    if (i_type != tensor.dtype()) {
-      return errors::InvalidArgument("Data types mismatch for tensors: ", i,
-                                     " expected: ", i_type,
-                                     " got: ", tensor.dtype());
-    }
-
-    if (i_shape != tensor.shape()) {
-      return errors::InvalidArgument("Data shapes mismatch for tensors: ", i,
-                                     " expected: ", i_shape,
-                                     " got: ", tensor.shape());
-    }
-  }
-
-  return Status::OK();
-}
-
-}  // namespace
-
-class RunTFLiteModelOp : public OpKernel {
- public:
-  explicit RunTFLiteModelOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    string model_file_path;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("model_file_path", &model_file_path));
-    model_ = tflite::FlatBufferModel::BuildFromFile(model_file_path.data());
-    OP_REQUIRES(ctx, model_,
-                errors::InvalidArgument(
-                    "Model loading failed. Invalid model file path: ",
-                    model_file_path));
-    tflite::ops::builtin::BuiltinOpResolver resolver;
-
-    tflite::InterpreterBuilder(*model_, resolver)(&interpreter_);
-    OP_REQUIRES(ctx, interpreter_,
-                errors::Internal("Interpreter creation failed."));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    OpInputList input_tensors;
-    OP_REQUIRES_OK(context, context->input_list("model_input", &input_tensors));
-
-    OP_REQUIRES_OK(context, ValidateInputsMatch(input_tensors, *interpreter_));
-    OpOutputList output_tensors;
-    OP_REQUIRES_OK(context,
-                   context->output_list("model_output", &output_tensors));
-    auto tfl_outputs = interpreter_->outputs();
-    OP_REQUIRES(context, output_tensors.size() == tfl_outputs.size(),
-                errors::InvalidArgument(
-                    "Invalid output size, expected: ", tfl_outputs.size(),
-                    " got: ", output_tensors.size()));
-    for (int i = 0; i < output_tensors.size(); i++) {
-      DataType tfl_type = metrics::utils::GetTFDataType(
-          interpreter_->tensor(tfl_outputs[i])->type);
-      DataType otype = output_tensors.expected_output_dtype(i);
-      OP_REQUIRES(
-          context, tfl_type == otype,
-          errors::InvalidArgument("Invalid data type for output at index: ", i,
-                                  " expected: ", tfl_type, " got: ", otype));
-    }
-
-    auto allocation_status = interpreter_->AllocateTensors();
-    OP_REQUIRES(context, allocation_status == kTfLiteOk,
-                errors::Internal("Unable to allocate tensors."));
-    for (int i = 0; i < input_tensors.size(); i++) {
-      const int tfl_index = interpreter_->inputs()[i];
-      TfLiteTensor* tflite_tensor = interpreter_->tensor(tfl_index);
-      auto tensor_bytes = input_tensors[i].tensor_data();
-      OP_REQUIRES(context, tflite_tensor->bytes == tensor_bytes.size(),
-                  errors::InvalidArgument(
-                      "Size mismatch, expected: ", tflite_tensor->bytes,
-                      " got: ", tensor_bytes.size()));
-      std::memcpy(tflite_tensor->data.raw, tensor_bytes.data(),
-                  tensor_bytes.size());
-    }
-    auto invocation_status = interpreter_->Invoke();
-    OP_REQUIRES(context, invocation_status == kTfLiteOk,
-                errors::Internal("Interpreter invocation failed."));
-    for (int i = 0; i < output_tensors.size(); i++) {
-      auto tfl_tensor = interpreter_->tensor(tfl_outputs[i]);
-      TensorShape shape = metrics::utils::GetTFLiteTensorShape(*tfl_tensor);
-      Tensor* output = nullptr;
-      OP_REQUIRES_OK(context, output_tensors.allocate(i, shape, &output));
-      auto tensor_bytes = output->tensor_data();
-      OP_REQUIRES(context, tensor_bytes.size() == tfl_tensor->bytes,
-                  errors::Internal("Invalid size"));
-      std::memcpy(const_cast<char*>(tensor_bytes.data()), tfl_tensor->data.raw,
-                  tfl_tensor->bytes);
-    }
-  }
-
- private:
-  std::unique_ptr<tflite::FlatBufferModel> model_;
-  std::unique_ptr<tflite::Interpreter> interpreter_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("RunTFLiteModel").Device(DEVICE_CPU),
-                        RunTFLiteModelOp);
-
-REGISTER_OP("RunTFLiteModel")
-    .Input("model_input: input_type")
-    .Output("model_output: output_type")
-    .Attr("model_file_path: string")
-    .Attr("input_type : list(type)")
-    .Attr("output_type: list(type)")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // TODO(shashishekhar): Infer the correct shape based on output_type and
-      // maybe another attribute.
-      return shape_inference::UnknownShape(c);
-    });
-
-}  // namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/run_tflite_model_op_test.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_op_test.cc
deleted file mode 100644
index 8817598..0000000
--- a/tensorflow/lite/tools/accuracy/run_tflite_model_op_test.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace {
-tensorflow::string* g_test_model_file = nullptr;
-}
-
-namespace tensorflow {
-namespace {
-
-TEST(RunTfliteModelOpTest, ModelIsRun) {
-  ASSERT_TRUE(g_test_model_file != nullptr);
-  string test_model_file = *g_test_model_file;
-  ASSERT_FALSE(test_model_file.empty());
-
-  Scope scope = Scope::NewRootScope();
-  TF_CHECK_OK(scope.status());
-  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
-  //  x = a+b+c, y=b+c+d
-
-  std::vector<Input> graph_inputs = {
-      ops::Const(scope, 1.0f, {1, 8, 8, 3}),  // a
-      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
-      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
-      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
-  };
-
-  std::vector<NodeBuilder::NodeOut> input_data;
-  std::transform(graph_inputs.begin(), graph_inputs.end(),
-                 std::back_inserter(input_data), [&scope](Input model_input) {
-                   return ops::AsNodeOut(scope, model_input);
-                 });
-
-  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT,
-                                            DT_FLOAT};
-  ::tensorflow::Node* ret;
-  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
-                     .Input(input_data)
-                     .Attr("model_file_path", test_model_file)
-                     .Attr("input_type", model_input_type)
-                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
-
-  scope.UpdateBuilder(&builder);
-  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-
-  std::vector<Tensor> outputs;
-  TF_CHECK_OK(
-      session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
-  EXPECT_EQ(2, outputs.size());
-
-  for (const auto& tensor : outputs) {
-    EXPECT_TRUE(tensor.shape().IsSameSize({1, 8, 8, 3}));
-  }
-  auto output_x = outputs[0].flat<float>();
-  auto output_y = outputs[1].flat<float>();
-  EXPECT_EQ(1 * 8 * 8 * 3, output_x.size());
-  EXPECT_EQ(1 * 8 * 8 * 3, output_y.size());
-  for (int i = 0; i < output_x.size(); i++) {
-    EXPECT_NEAR(6.3f, output_x(i), 1e-6f);  // a+b+c
-    EXPECT_NEAR(9.6f, output_y(i), 1e-6f);  // b+c+d
-  }
-}
-
-TEST(RunTfliteModelOpTest, NumInputsMismatch) {
-  ASSERT_TRUE(g_test_model_file != nullptr);
-  string test_model_file = *g_test_model_file;
-  ASSERT_FALSE(test_model_file.empty());
-
-  Scope scope = Scope::NewRootScope();
-  TF_CHECK_OK(scope.status());
-  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
-  //  x = a+b+c, y=b+c+d
-  //  Remove a from input.
-
-  std::vector<Input> graph_inputs = {
-      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
-      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
-      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
-  };
-
-  std::vector<NodeBuilder::NodeOut> input_data;
-  std::transform(graph_inputs.begin(), graph_inputs.end(),
-                 std::back_inserter(input_data), [&scope](Input model_input) {
-                   return ops::AsNodeOut(scope, model_input);
-                 });
-
-  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT};
-
-  ::tensorflow::Node* ret;
-  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
-                     .Input(input_data)
-                     .Attr("model_file_path", test_model_file)
-                     .Attr("input_type", model_input_type)
-                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
-
-  scope.UpdateBuilder(&builder);
-  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-
-  std::vector<Tensor> outputs;
-  auto status =
-      (session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
-  EXPECT_FALSE(status.ok());
-}
-
-TEST(RunTfliteModelOpTest, InputSizesMismatch) {
-  ASSERT_TRUE(g_test_model_file != nullptr);
-  string test_model_file = *g_test_model_file;
-  ASSERT_FALSE(test_model_file.empty());
-
-  Scope scope = Scope::NewRootScope();
-  TF_CHECK_OK(scope.status());
-  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
-  //  x = a+b+c, y=b+c+d
-  // Set a to be invalid size.
-  std::vector<Input> graph_inputs = {
-      ops::Const(scope, 1.0f, {1, 8, 8, 4}),  // a invalid size,
-      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
-      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
-      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
-  };
-
-  std::vector<NodeBuilder::NodeOut> input_data;
-  std::transform(graph_inputs.begin(), graph_inputs.end(),
-                 std::back_inserter(input_data), [&scope](Input model_input) {
-                   return ops::AsNodeOut(scope, model_input);
-                 });
-
-  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT,
-                                            DT_FLOAT};
-  ::tensorflow::Node* ret;
-  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
-                     .Input(input_data)
-                     .Attr("model_file_path", test_model_file)
-                     .Attr("input_type", model_input_type)
-                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
-
-  scope.UpdateBuilder(&builder);
-  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
-  TF_CHECK_OK(scope.status());
-
-  GraphDef graph_def;
-  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_CHECK_OK(session->Create(graph_def));
-
-  std::vector<Tensor> outputs;
-  auto status =
-      (session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
-  EXPECT_FALSE(status.ok());
-}
-
-}  // namespace
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  g_test_model_file = new tensorflow::string();
-  const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("test_model_file", g_test_model_file,
-                       "Path to test tflite model file."),
-  };
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  CHECK(parse_result) << "Required test_model_file";
-  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc
deleted file mode 100644
index 6082290..0000000
--- a/tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
-
-#include <vector>
-
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-
-namespace tensorflow {
-namespace metrics {
-void RunTFLiteModelStage::AddToGraph(const Scope& scope, const Input& input) {
-  if (!scope.ok()) return;
-  Scope s = scope.WithOpName(name());
-
-  std::vector<NodeBuilder::NodeOut> _data = {ops::AsNodeOut(s, input)};
-  ::tensorflow::Node* ret;
-  auto builder = NodeBuilder(output_name(), "RunTFLiteModel")
-                     .Input(_data)
-                     .Attr("model_file_path", params_.model_file_path)
-                     .Attr("input_type", params_.input_type)
-                     .Attr("output_type", params_.output_type);
-
-  s.UpdateBuilder(&builder);
-  s.UpdateStatus(builder.Finalize(s.graph(), &ret));
-  if (!s.ok()) return;
-  s.UpdateStatus(s.DoShapeInference(ret));
-  this->stage_output_ = ::tensorflow::Output(ret, 0);
-}
-
-}  //  namespace metrics
-}  //  namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/run_tflite_model_stage.h b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.h
deleted file mode 100644
index 6103449..0000000
--- a/tensorflow/lite/tools/accuracy/run_tflite_model_stage.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
-
-#include <string>
-
-#include "tensorflow/lite/tools/accuracy/stage.h"
-
-namespace tensorflow {
-namespace metrics {
-// Stage that loads and runs a TFLite model.
-// Inputs: The input to TFLite model.
-// Outputs: The output of running the TFLite model.
-class RunTFLiteModelStage : public Stage {
- public:
-  // The parameters for the stage.
-  struct Params {
-    string model_file_path;
-    std::vector<TensorShape> output_shape;
-    std::vector<DataType> input_type;
-    std::vector<DataType> output_type;
-  };
-
-  explicit RunTFLiteModelStage(const Params& params) : params_(params) {}
-
-  string name() const override { return "stage_run_tfl_model"; }
-  // TODO(shashishekhar): This stage can have multiple inputs and
-  // outputs, perhaps change the definition of stage.
-  string output_name() const override { return "stage_run_tfl_model_output"; }
-
-  void AddToGraph(const Scope& scope, const Input& input) override;
-
- private:
-  Params params_;
-};
-
-}  //  namespace metrics
-}  //  namespace tensorflow
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
diff --git a/tensorflow/lite/tools/accuracy/stage.h b/tensorflow/lite/tools/accuracy/stage.h
deleted file mode 100644
index 0a9e3fb..0000000
--- a/tensorflow/lite/tools/accuracy/stage.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
-
-#include "tensorflow/cc/framework/scope.h"
-
-namespace tensorflow {
-namespace metrics {
-
-// A stage in an evaluation pipeline.
-// Each stage adds a subgraph to the pipeline. Stages can be chained
-// together.
-class Stage {
- public:
-  Stage() = default;
-  Stage(const Stage&) = delete;
-  Stage& operator=(const Stage&) = delete;
-
-  Stage(const Stage&&) = delete;
-  Stage& operator=(const Stage&&) = delete;
-
-  // Adds a subgraph to given scope that takes in `input` as a parameter.
-  virtual void AddToGraph(const Scope& scope, const Input& input) = 0;
-  virtual ~Stage() {}
-
-  // The name of the stage.
-  // Can be used by derived classes for naming the subscope for the stage
-  // graph.
-  virtual string name() const = 0;
-
-  // The name of the output for the stage.
-  virtual string output_name() const = 0;
-
-  const ::tensorflow::Output& Output() const { return stage_output_; }
-
- protected:
-  ::tensorflow::Output stage_output_;
-};
-}  //  namespace metrics
-}  //  namespace tensorflow
-
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
diff --git a/tensorflow/lite/tools/accuracy/utils.cc b/tensorflow/lite/tools/accuracy/utils.cc
deleted file mode 100644
index 953892b..0000000
--- a/tensorflow/lite/tools/accuracy/utils.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/tools/accuracy/utils.h"
-
-#include <sys/stat.h>
-
-#include <cstring>
-#include <fstream>
-#include <memory>
-#include <string>
-
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/model.h"
-#include "tensorflow/lite/op_resolver.h"
-
-namespace tensorflow {
-namespace metrics {
-
-namespace utils {
-
-DataType GetTFDataType(TfLiteType tflite_type) {
-  switch (tflite_type) {
-    case kTfLiteFloat32:
-      return DT_FLOAT;
-    case kTfLiteUInt8:
-      return DT_UINT8;
-    case kTfLiteInt8:
-      return DT_INT8;
-    case kTfLiteInt32:
-      return DT_INT32;
-    case kTfLiteInt64:
-      return DT_INT64;
-    default:
-      return DT_INVALID;
-  }
-}
-
-TensorShape GetTFLiteTensorShape(const TfLiteTensor& tflite_tensor) {
-  TensorShape shape;
-  for (int i = 0; i < tflite_tensor.dims->size; i++) {
-    shape.AddDim(tflite_tensor.dims->data[i]);
-  }
-  return shape;
-}
-
-Status ReadFileLines(const string& file_path,
-                     std::vector<string>* lines_output) {
-  if (!lines_output) {
-    return errors::InvalidArgument("Invalid output");
-  }
-  std::vector<string> lines;
-  std::ifstream stream(file_path, std::ios_base::in);
-  if (!stream) {
-    return errors::InvalidArgument("Unable to open file: ", file_path);
-  }
-  std::string line;
-  while (std::getline(stream, line)) {
-    lines_output->push_back(line);
-  }
-  return Status::OK();
-}
-
-Status GetTFliteModelInfo(const string& model_file_path,
-                          ModelInfo* model_info) {
-  if (model_file_path.empty()) {
-    return errors::InvalidArgument("Invalid model file.");
-  }
-  struct stat stat_buf;
-  if (stat(model_file_path.c_str(), &stat_buf) != 0) {
-    int error_num = errno;
-    return errors::InvalidArgument("Invalid model file: ", model_file_path,
-                                   std::strerror(error_num));
-  }
-
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-  model = tflite::FlatBufferModel::BuildFromFile(model_file_path.data());
-  tflite::ops::builtin::BuiltinOpResolver resolver;
-
-  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
-  if (!interpreter) {
-    return errors::InvalidArgument("Invalid model", model_file_path);
-  }
-  for (int i : interpreter->inputs()) {
-    TfLiteTensor* tensor = interpreter->tensor(i);
-    model_info->input_shapes.push_back(utils::GetTFLiteTensorShape(*tensor));
-    model_info->input_types.push_back(utils::GetTFDataType(tensor->type));
-  }
-  return Status::OK();
-}
-
-}  // namespace utils
-}  // namespace metrics
-}  // namespace tensorflow
diff --git a/tensorflow/lite/tools/accuracy/utils.h b/tensorflow/lite/tools/accuracy/utils.h
deleted file mode 100644
index 5b76393..0000000
--- a/tensorflow/lite/tools/accuracy/utils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
-
-#include <string>
-#include <vector>
-
-#include "tensorflow/lite/context.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-
-namespace tensorflow {
-namespace metrics {
-
-namespace utils {
-
-struct ModelInfo {
-  std::vector<TensorShape> input_shapes;
-  std::vector<DataType> input_types;
-};
-
-Status GetTFliteModelInfo(const string& model_file_path, ModelInfo* model_info);
-
-DataType GetTFDataType(TfLiteType tflite_type);
-
-TensorShape GetTFLiteTensorShape(const TfLiteTensor& tflite_tensor);
-
-Status ReadFileLines(const string& file_path,
-                     std::vector<string>* lines_output);
-}  // namespace utils
-}  // namespace metrics
-}  // namespace tensorflow
-#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
diff --git a/tensorflow/lite/tools/accuracy/utils_test.cc b/tensorflow/lite/tools/accuracy/utils_test.cc
deleted file mode 100644
index 401872f..0000000
--- a/tensorflow/lite/tools/accuracy/utils_test.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <string>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace {
-tensorflow::string* g_test_model_file = nullptr;
-}
-
-namespace tensorflow {
-namespace metrics {
-namespace utils {
-namespace {
-
-TEST(UtilsTest, GetTFLiteModelInfoReturnsCorrectly) {
-  ASSERT_TRUE(g_test_model_file != nullptr);
-  string test_model_file = *g_test_model_file;
-  ASSERT_FALSE(test_model_file.empty());
-  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
-  //  x = a+b+c, y=b+c+d
-  // Input and outputs have shape : {1,8,8,3}
-  ModelInfo model_info;
-  auto status = GetTFliteModelInfo(test_model_file, &model_info);
-  TF_CHECK_OK(status);
-  ASSERT_EQ(4, model_info.input_shapes.size());
-  ASSERT_EQ(4, model_info.input_types.size());
-
-  for (int i = 0; i < 4; i++) {
-    const TensorShape& shape = model_info.input_shapes[i];
-    DataType dataType = model_info.input_types[i];
-    EXPECT_TRUE(shape.IsSameSize({1, 8, 8, 3}));
-    EXPECT_EQ(DT_FLOAT, dataType);
-  }
-}
-
-TEST(UtilsTest, GetTFliteModelInfoIncorrectFile) {
-  ModelInfo model_info;
-  auto status = GetTFliteModelInfo("non_existent_file", &model_info);
-  EXPECT_FALSE(status.ok());
-}
-
-}  // namespace
-}  // namespace utils
-}  // namespace metrics
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  g_test_model_file = new tensorflow::string();
-  const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("test_model_file", g_test_model_file,
-                       "Path to test tflite model file."),
-  };
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  CHECK(parse_result) << "Required test_model_file";
-  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index 6a5460e..c692b94 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -6,8 +6,7 @@
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
 
 common_copts = ["-Wall"] + tflite_copts()
 
@@ -70,31 +69,9 @@
     ],
     deps = [
         ":benchmark_tflite_model_lib",
-        ":command_line_flags",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "command_line_flags",
-    srcs = ["command_line_flags.cc"],
-    hdrs = ["command_line_flags.h"],
-    copts = common_copts,
-)
-
-cc_test(
-    name = "command_line_flags_test",
-    srcs = ["command_line_flags_test.cc"],
-    copts = common_copts,
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
-    visibility = ["//visibility:private"],
-    deps = [
-        ":command_line_flags",
-        "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/tools:command_line_flags",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -110,18 +87,14 @@
     deps = [
         ":benchmark_model_lib",
         ":logging",
-        "@gemmlowp",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string_util",
-        "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/profiling:profile_summarizer",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/lite/delegates/gpu:gl_delegate",
-        ],
-        "//conditions:default": [],
-    }),
+        "//tensorflow/lite/profiling:profiler",
+        "//tensorflow/lite/tools/evaluation:utils",
+        "@gemmlowp",
+    ],
 )
 
 cc_library(
@@ -143,11 +116,11 @@
     copts = common_copts,
     deps = [
         ":benchmark_params",
-        ":command_line_flags",
         ":logging",
         "//tensorflow/core:stats_calculator_portable",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/profiling:time",
+        "//tensorflow/lite/tools:command_line_flags",
     ],
 )
 
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index e432d81..d5c89bd 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -45,6 +45,8 @@
 *   `use_gpu`: `bool` (default=false) \
     Whether to use the [GPU accelerator delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/gpu).
     This option is currently only available on Android devices.
+*   `enable_op_profiling`: `bool` (default=false) \
+    Whether to enable per-operator profiling measurement.
 
 ## To build/install/run
 
@@ -129,19 +131,18 @@
 Note: The affinity mask varies with the device.
 
 ## Profiling model operators
-The benchmark model binary also allows you to profile operators and give execution times of each operator. To do this,
-compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED**
-to compile benchmark with profiling support.
-For example, to compile with profiling support on Android, add this flag to the previous command:
+The benchmark model binary can also profile operators and report the
+execution time of each operator. To do this, pass the flag
+`--enable_op_profiling=true` to `benchmark_model` during invocation, e.g.,
 
 ```
-bazel build -c opt \
-  --config=android_arm \
-  --cxxopt='--std=c++11' \
-  --copt=-DTFLITE_PROFILING_ENABLED \
-  tensorflow/lite/tools/benchmark:benchmark_model
+adb shell taskset f0 /data/local/tmp/benchmark_model \
+  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --enable_op_profiling=true
 ```
-This compiles TFLite with profiling enabled, now you can run the benchmark binary like before. The binary will produce detailed statistics for each operation similar to those shown below:
+
+When enabled, the `benchmark_model` binary will produce detailed statistics for
+each operation similar to those shown below:
 
 ```
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.h b/tensorflow/lite/tools/benchmark/benchmark_model.h
index 7831221..ac97ad5 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.h
@@ -23,9 +23,9 @@
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/lite/tools/benchmark/benchmark_params.h"
-#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
 #include "tensorflow/core/util/stats_calculator.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_params.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
 
 namespace tflite {
 namespace benchmark {
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
index 986425c..8fd625c 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_test.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -21,7 +21,7 @@
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
-#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
 
 namespace {
 const std::string* g_model_path = nullptr;
@@ -48,6 +48,7 @@
   params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
   params.AddParam("use_legacy_nnapi", BenchmarkParam::Create<bool>(false));
   params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
+  params.AddParam("enable_op_profiling", BenchmarkParam::Create<bool>(false));
   return params;
 }
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 19ca4ff..eddaedf 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -23,21 +23,19 @@
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/profiling/buffered_profiler.h"
+#include "tensorflow/lite/profiling/profile_summarizer.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/tools/benchmark/logging.h"
+#include "tensorflow/lite/tools/evaluation/utils.h"
 
 #ifdef GEMMLOWP_PROFILING
 #include "profiling/profiler.h"
 #endif
 
-#if defined(__ANDROID__)
-#include "tensorflow/lite/delegates/gpu/gl_delegate.h"
-#endif
-
 #ifdef TFLITE_CUSTOM_OPS_HEADER
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 #endif
@@ -46,35 +44,42 @@
 namespace benchmark {
 namespace {
 
-#if defined(__ANDROID__)
-Interpreter::TfLiteDelegatePtr CreateGPUDelegate(
-    tflite::FlatBufferModel* model) {
-  TfLiteGpuDelegateOptions options;
-  options.metadata = TfLiteGpuDelegateGetModelMetadata(model->GetModel());
-  options.compile_options.precision_loss_allowed = 1;
-  options.compile_options.preferred_gl_object_type =
-      TFLITE_GL_OBJECT_TYPE_FASTEST;
-  options.compile_options.dynamic_batch_enabled = 0;
-  return Interpreter::TfLiteDelegatePtr(TfLiteGpuDelegateCreate(&options),
-                                        &TfLiteGpuDelegateDelete);
-}
+// Backward compat with previous approach to enabling op profiling.
+#if defined(TFLITE_PROFILING_ENABLED)
+constexpr bool kOpProfilingEnabledDefault = true;
+#else
+constexpr bool kOpProfilingEnabledDefault = false;
+#endif
 
-Interpreter::TfLiteDelegatePtr CreateNNAPIDelegate() {
-  return Interpreter::TfLiteDelegatePtr(
-      NnApiDelegate(),
-      // NnApiDelegate() returns a singleton, so provide a no-op deleter.
-      [](TfLiteDelegate*) {});
-}
+// Dumps profiling events if profiling is enabled.
+class ProfilingListener : public BenchmarkListener {
+ public:
+  explicit ProfilingListener(Interpreter* interpreter)
+      : interpreter_(interpreter), has_profiles_(false) {
+    TFLITE_BENCHMARK_CHECK(interpreter);
+    interpreter_->SetProfiler(&profiler_);
+  }
 
-#endif  // defined(__ANDROID__)
+  void OnSingleRunStart(RunType run_type) override;
 
-}  // namespace
+  void OnSingleRunEnd() override;
 
-void ProfilingListener::SetInterpreter(tflite::Interpreter* interpreter) {
-  TFLITE_BENCHMARK_CHECK(interpreter);
-  interpreter_ = interpreter;
-  interpreter_->SetProfiler(&profiler_);
-}
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+
+ private:
+  Interpreter* interpreter_;
+  profiling::BufferedProfiler profiler_;
+  profiling::ProfileSummarizer summarizer_;
+  bool has_profiles_;
+};
+
+// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
+class GemmlowpProfilingListener : public BenchmarkListener {
+ public:
+  void OnBenchmarkStart(const BenchmarkParams& params) override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+};
 
 void ProfilingListener::OnSingleRunStart(RunType run_type) {
   if (run_type == REGULAR) {
@@ -111,8 +116,6 @@
 #endif
 }
 
-namespace {
-
 std::vector<std::string> Split(const std::string& str, const char delim) {
   std::istringstream input(str);
   std::vector<std::string> results;
@@ -230,6 +233,9 @@
                           BenchmarkParam::Create<bool>(false));
   default_params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
   default_params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false));
+  default_params.AddParam(
+      "enable_op_profiling",
+      BenchmarkParam::Create<bool>(kOpProfilingEnabledDefault));
   return default_params;
 }
 
@@ -238,8 +244,6 @@
 
 BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
     : BenchmarkModel(std::move(params)) {
-  AddListener(&profiling_listener_);
-  AddListener(&gemmlowp_profiling_listener_);
 }
 
 void BenchmarkTfLiteModel::CleanUp() {
@@ -265,7 +269,8 @@
       CreateFlag<bool>("use_nnapi", &params_, "use nnapi delegate api"),
       CreateFlag<bool>("use_legacy_nnapi", &params_, "use legacy nnapi api"),
       CreateFlag<bool>("use_gpu", &params_, "use gpu"),
-      CreateFlag<bool>("allow_fp16", &params_, "allow fp16")};
+      CreateFlag<bool>("allow_fp16", &params_, "allow fp16"),
+      CreateFlag<bool>("enable_op_profiling", &params_, "enable op profiling")};
 
   flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
   return flags;
@@ -284,6 +289,8 @@
   TFLITE_LOG(INFO) << "Use gpu : [" << params_.Get<bool>("use_gpu") << "]";
   TFLITE_LOG(INFO) << "Allow fp16 : [" << params_.Get<bool>("allow_fp16")
                    << "]";
+  TFLITE_LOG(INFO) << "Enable op profiling: ["
+                   << params_.Get<bool>("enable_op_profiling") << "]";
 }
 
 bool BenchmarkTfLiteModel::ValidateParams() {
@@ -317,8 +324,7 @@
     TfLiteTensor* t = interpreter->tensor(i);
     std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
     int num_elements = 1;
-    // TODO(haoliang): Ignore the 0-th dimension (number of batches).
-    for (int i = 1; i < sizes.size(); ++i) {
+    for (int i = 0; i < sizes.size(); ++i) {
       num_elements *= sizes[i];
     }
     InputTensorData t_data;
@@ -336,6 +342,12 @@
       FillRandomValue<int32_t>(t_data.data.i32, num_elements, []() {
         return static_cast<int32_t>(rand()) % 100;
       });
+    } else if (t->type == kTfLiteInt16) {
+      t_data.bytes = sizeof(int16_t) * num_elements;
+      t_data.data.raw = new char[t_data.bytes];
+      FillRandomValue<int16_t>(t_data.data.i16, num_elements, []() {
+        return static_cast<int16_t>(rand()) % 100;
+      });
     } else if (t->type == kTfLiteUInt8) {
       t_data.bytes = sizeof(uint8_t) * num_elements;
       t_data.data.raw = new char[t_data.bytes];
@@ -370,6 +382,9 @@
     } else if (t->type == kTfLiteInt32) {
       std::memcpy(interpreter->typed_tensor<int32_t>(i),
                   inputs_data_[j].data.i32, inputs_data_[j].bytes);
+    } else if (t->type == kTfLiteInt16) {
+      std::memcpy(interpreter->typed_tensor<int16_t>(i),
+                  inputs_data_[j].data.i16, inputs_data_[j].bytes);
     } else if (t->type == kTfLiteUInt8) {
       std::memcpy(interpreter->typed_tensor<uint8_t>(i),
                   inputs_data_[j].data.uint8, inputs_data_[j].bytes);
@@ -412,7 +427,6 @@
   if (!interpreter) {
     TFLITE_LOG(FATAL) << "Failed to construct interpreter";
   }
-  profiling_listener_.SetInterpreter(interpreter.get());
 
   interpreter->UseNNAPI(params_.Get<bool>("use_legacy_nnapi"));
 
@@ -463,24 +477,37 @@
   if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
   }
+
+  // Install profilers if necessary.
+  if (params_.Get<bool>("enable_op_profiling")) {
+    profiling_listener_.reset(new ProfilingListener(interpreter.get()));
+    AddListener(profiling_listener_.get());
+  }
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp_profiling_listener_.reset(new GemmlowpProfilingListener());
+  AddListener(gemmlowp_profiling_listener_.get());
+#endif
 }
 
 BenchmarkTfLiteModel::TfLiteDelegatePtrMap BenchmarkTfLiteModel::GetDelegates()
     const {
   TfLiteDelegatePtrMap delegates;
   if (params_.Get<bool>("use_gpu")) {
-#if defined(__ANDROID__)
-    delegates.emplace("GPU", CreateGPUDelegate(model.get()));
-#else
-    TFLITE_LOG(WARN) << "GPU acceleration is unsupported on this platform.";
-#endif
+    Interpreter::TfLiteDelegatePtr delegate =
+        evaluation::CreateGPUDelegate(model.get());
+    if (!delegate) {
+      TFLITE_LOG(WARN) << "GPU acceleration is unsupported on this platform.";
+    } else {
+      delegates.emplace("GPU", std::move(delegate));
+    }
   }
   if (params_.Get<bool>("use_nnapi")) {
-#if defined(__ANDROID__)
-    delegates.emplace("NNAPI", CreateNNAPIDelegate());
-#else
-    TFLITE_LOG(WARN) << "NNAPI acceleration is unsupported on this platform.";
-#endif
+    Interpreter::TfLiteDelegatePtr delegate = evaluation::CreateNNAPIDelegate();
+    if (!delegate) {
+      TFLITE_LOG(WARN) << "NNAPI acceleration is unsupported on this platform.";
+    } else {
+      delegates.emplace("NNAPI", std::move(delegate));
+    }
   }
   return delegates;
 }
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 99b9ce3..dd0bec1 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -22,42 +22,12 @@
 #include <vector>
 
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/profiling/profile_summarizer.h"
+#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_model.h"
 
 namespace tflite {
 namespace benchmark {
 
-// Dumps profiling events if profiling is enabled.
-class ProfilingListener : public BenchmarkListener {
- public:
-  explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
-
-  void SetInterpreter(Interpreter* interpreter);
-
-  void OnSingleRunStart(RunType run_type) override;
-
-  void OnSingleRunEnd() override;
-
-  void OnBenchmarkEnd(const BenchmarkResults& results) override;
-
- private:
-  Interpreter* interpreter_;
-  profiling::Profiler profiler_;
-  profiling::ProfileSummarizer summarizer_;
-  bool has_profiles_;
-};
-
-// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
-class GemmlowpProfilingListener : public BenchmarkListener {
- public:
-  virtual ~GemmlowpProfilingListener() {}
-
-  void OnBenchmarkStart(const BenchmarkParams& params) override;
-
-  void OnBenchmarkEnd(const BenchmarkResults& results) override;
-};
-
 // Benchmarks a TFLite model by running tflite interpreter.
 class BenchmarkTfLiteModel : public BenchmarkModel {
  public:
@@ -99,8 +69,8 @@
   };
   std::vector<InputLayerInfo> inputs;
   std::vector<InputTensorData> inputs_data_;
-  ProfilingListener profiling_listener_;
-  GemmlowpProfilingListener gemmlowp_profiling_listener_;
+  std::unique_ptr<BenchmarkListener> profiling_listener_;
+  std::unique_ptr<BenchmarkListener> gemmlowp_profiling_listener_;
   TfLiteDelegatePtrMap delegates_;
 };
 
diff --git a/tensorflow/lite/tools/benchmark/command_line_flags.cc b/tensorflow/lite/tools/command_line_flags.cc
similarity index 98%
rename from tensorflow/lite/tools/benchmark/command_line_flags.cc
rename to tensorflow/lite/tools/command_line_flags.cc
index 2fad780..04095d3 100644
--- a/tensorflow/lite/tools/benchmark/command_line_flags.cc
+++ b/tensorflow/lite/tools/command_line_flags.cc
@@ -10,7 +10,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
 
 #include <cstring>
 #include <sstream>
diff --git a/tensorflow/lite/tools/benchmark/command_line_flags.h b/tensorflow/lite/tools/command_line_flags.h
similarity index 100%
rename from tensorflow/lite/tools/benchmark/command_line_flags.h
rename to tensorflow/lite/tools/command_line_flags.h
diff --git a/tensorflow/lite/tools/benchmark/command_line_flags_test.cc b/tensorflow/lite/tools/command_line_flags_test.cc
similarity index 98%
rename from tensorflow/lite/tools/benchmark/command_line_flags_test.cc
rename to tensorflow/lite/tools/command_line_flags_test.cc
index afdf279..4c5713d 100644
--- a/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
+++ b/tensorflow/lite/tools/command_line_flags_test.cc
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/command_line_flags.h"
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/testing/util.h"
diff --git a/tensorflow/lite/tools/evaluation/BUILD b/tensorflow/lite/tools/evaluation/BUILD
index 7f9b57a..0d2f793 100644
--- a/tensorflow/lite/tools/evaluation/BUILD
+++ b/tensorflow/lite/tools/evaluation/BUILD
@@ -39,18 +39,29 @@
     hdrs = ["utils.h"],
     copts = tflite_copts(),
     deps = [
-        "//tensorflow/core:tflite_portable_logging",
-    ],
+        "//tensorflow/lite:context",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/lite/delegates/gpu:gl_delegate",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 cc_test(
     name = "utils_test",
     srcs = ["utils_test.cc"],
-    data = ["testdata/labels.txt"],
+    data = [
+        "testdata/empty.txt",
+        "testdata/labels.txt",
+    ],
     linkopts = tflite_linkopts(),
     linkstatic = 1,
     deps = [
         ":utils",
+        "//tensorflow/lite:context",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/lite/tools/evaluation/proto/BUILD b/tensorflow/lite/tools/evaluation/proto/BUILD
index d0fc459..fd1f020 100644
--- a/tensorflow/lite/tools/evaluation/proto/BUILD
+++ b/tensorflow/lite/tools/evaluation/proto/BUILD
@@ -30,6 +30,11 @@
     deps = ["evaluation_stages_proto"],
 )
 
+java_proto_library(
+    name = "evaluation_stages_java_proto",
+    deps = ["evaluation_stages_proto"],
+)
+
 proto_library(
     name = "evaluation_config_proto",
     srcs = [
@@ -43,3 +48,8 @@
     name = "evaluation_config_cc_proto",
     deps = ["evaluation_config_proto"],
 )
+
+java_proto_library(
+    name = "evaluation_config_java_proto",
+    deps = ["evaluation_config_proto"],
+)
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
index b69ad6c..f95892c 100644
--- a/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
@@ -19,6 +19,10 @@
 
 import "tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto";
 
+option cc_enable_arenas = true;
+option java_multiple_files = true;
+option java_package = "tflite.evaluation";
+
 // Contains parameters that define how an EvaluationStage will be executed.
 // This would typically be validated only once during initialization, so should
 // not contain any variables that change with each run.
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
index 2e3414a..6c01787 100644
--- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
@@ -17,6 +17,10 @@
 
 package tflite.evaluation;
 
+option cc_enable_arenas = true;
+option java_multiple_files = true;
+option java_package = "tflite.evaluation";
+
 // Defines the functionality executed by an EvaluationStage.
 //
 // Next ID: 5
@@ -88,6 +92,7 @@
   enum Delegate {
     NONE = 0;
     NNAPI = 1;
+    GPU = 2;
   }
   optional Delegate delegate = 2;
   // Number of threads available to the TFLite Interpreter.
diff --git a/tensorflow/lite/tools/evaluation/stages/BUILD b/tensorflow/lite/tools/evaluation/stages/BUILD
index be8116e..6a64255 100644
--- a/tensorflow/lite/tools/evaluation/stages/BUILD
+++ b/tensorflow/lite/tools/evaluation/stages/BUILD
@@ -108,6 +108,7 @@
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/profiling:time",
         "//tensorflow/lite/tools/evaluation:evaluation_stage",
+        "//tensorflow/lite/tools/evaluation:utils",
         "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto",
         "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto",
     ],
diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc
index c447c33..bc0eab0 100644
--- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc
+++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc
@@ -20,6 +20,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/profiling/time.h"
 #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+#include "tensorflow/lite/tools/evaluation/utils.h"
 
 namespace tflite {
 namespace evaluation {
@@ -57,14 +58,36 @@
   model_ = FlatBufferModel::BuildFromFile(params.model_file_path().c_str());
   resolver_.reset(new ops::builtin::BuiltinOpResolver);
   InterpreterBuilder(*model_, *resolver_)(&interpreter_);
-  if (params.delegate() == TfliteInferenceParams::NNAPI) {
-    interpreter_->UseNNAPI(true);
-  }
   if (!interpreter_) {
     LOG(ERROR) << "Could not build interpreter";
     return kTfLiteError;
   }
   interpreter_->SetNumThreads(params.num_threads());
+
+  // TODO(b/122482115): Add support for multiple delegates in
+  // TfLiteInferenceParams.
+  if (params.delegate() == TfliteInferenceParams::NNAPI) {
+    Interpreter::TfLiteDelegatePtr delegate = CreateNNAPIDelegate();
+    if (delegate) {
+      delegates_.push_back(std::move(delegate));
+    } else {
+      LOG(WARNING) << "NNAPI not supported";
+    }
+  } else if (params.delegate() == TfliteInferenceParams::GPU) {
+    Interpreter::TfLiteDelegatePtr delegate = CreateGPUDelegate(model_.get());
+    if (delegate) {
+      delegates_.push_back(std::move(delegate));
+    } else {
+      LOG(WARNING) << "GPU not supported";
+    }
+  }
+  for (int i = 0; i < delegates_.size(); ++i) {
+    if (interpreter_->ModifyGraphWithDelegate(delegates_[i].get()) !=
+        kTfLiteOk) {
+      LOG(FATAL) << "Failed to apply delegate " << i;  // streamed, not printf
+    }
+  }
+
   interpreter_->AllocateTensors();
   model_info_ = GetTfliteModelInfo(*interpreter_);
 
diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
index e2278bb..ecb9c7f 100644
--- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
+++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
@@ -67,6 +67,7 @@
   std::unique_ptr<FlatBufferModel> model_;
   std::unique_ptr<ops::builtin::BuiltinOpResolver> resolver_;
   std::unique_ptr<Interpreter> interpreter_;
+  std::vector<Interpreter::TfLiteDelegatePtr> delegates_;
 
   TfLiteModelInfo model_info_;
   std::vector<void*>* inputs_ = nullptr;
diff --git a/tensorflow/tools/docker/__init__.py b/tensorflow/lite/tools/evaluation/testdata/empty.txt
similarity index 100%
rename from tensorflow/tools/docker/__init__.py
rename to tensorflow/lite/tools/evaluation/testdata/empty.txt
diff --git a/tensorflow/lite/tools/evaluation/utils.cc b/tensorflow/lite/tools/evaluation/utils.cc
index 2c82e93..1154953 100644
--- a/tensorflow/lite/tools/evaluation/utils.cc
+++ b/tensorflow/lite/tools/evaluation/utils.cc
@@ -15,26 +15,38 @@
 
 #include "tensorflow/lite/tools/evaluation/utils.h"
 
+#include <dirent.h>
 #include <sys/stat.h>
 
+#include <algorithm>
 #include <fstream>
 #include <memory>
 #include <string>
 
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+
+#if defined(__ANDROID__)
+#include "tensorflow/lite/delegates/gpu/gl_delegate.h"
+#endif
 
 namespace tflite {
 namespace evaluation {
 
+std::string StripTrailingSlashes(const std::string& path) {
+  // Index of the last character that is not '/', or npos when the path is
+  // empty or made up solely of slashes (in which case "" is returned).
+  const std::string::size_type last_kept = path.find_last_not_of('/');
+  if (last_kept == std::string::npos) return std::string();
+  return path.substr(0, last_kept + 1);
+}
+
 bool ReadFileLines(const std::string& file_path,
                    std::vector<std::string>* lines_output) {
   if (!lines_output) {
-    LOG(ERROR) << "lines_output is null";
     return false;
   }
   std::ifstream stream(file_path.c_str());
   if (!stream) {
-    LOG(ERROR) << "Unable to open file: " << file_path;
     return false;
   }
   std::string line;
@@ -44,5 +56,55 @@
   return true;
 }
 
+// Fills `result` with the sorted paths of every entry directly inside
+// `directory` (excluding "." and ".."). Returns kTfLiteError when `result`
+// is null or the directory cannot be opened.
+TfLiteStatus GetSortedFileNames(const std::string& directory,
+                                std::vector<std::string>* result) {
+  if (result == nullptr) return kTfLiteError;
+  result->clear();
+  const std::string dir_path = StripTrailingSlashes(directory);
+  DIR* dir = opendir(dir_path.c_str());
+  if (dir == nullptr) return kTfLiteError;
+  struct dirent* ent;
+  while ((ent = readdir(dir)) != nullptr) {
+    const std::string filename(ent->d_name);
+    // Skip only the self/parent pseudo-entries; a length check would also
+    // drop real files whose names are one or two characters long.
+    if (filename == "." || filename == "..") continue;
+    result->emplace_back(dir_path + "/" + filename);
+  }
+  closedir(dir);
+  std::sort(result->begin(), result->end());
+  return kTfLiteOk;
+}
+
+Interpreter::TfLiteDelegatePtr CreateNNAPIDelegate() {
+#if defined(__ANDROID__)
+  return Interpreter::TfLiteDelegatePtr(
+      NnApiDelegate(),
+      // NnApiDelegate() returns a singleton, so provide a no-op deleter.
+      [](TfLiteDelegate*) {});
+#else
+  return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {});
+#endif  // defined(__ANDROID__)
+}
+
+Interpreter::TfLiteDelegatePtr CreateGPUDelegate(
+    tflite::FlatBufferModel* model) {
+#if defined(__ANDROID__)
+  TfLiteGpuDelegateOptions options;
+  options.metadata = TfLiteGpuDelegateGetModelMetadata(model->GetModel());
+  options.compile_options.precision_loss_allowed = 1;
+  options.compile_options.preferred_gl_object_type =
+      TFLITE_GL_OBJECT_TYPE_FASTEST;
+  options.compile_options.dynamic_batch_enabled = 0;
+  return Interpreter::TfLiteDelegatePtr(TfLiteGpuDelegateCreate(&options),
+                                        &TfLiteGpuDelegateDelete);
+#else
+  return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {});
+#endif  // defined(__ANDROID__)
+}
+
 }  // namespace evaluation
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/utils.h b/tensorflow/lite/tools/evaluation/utils.h
index ecfae65..1e2dbe0 100644
--- a/tensorflow/lite/tools/evaluation/utils.h
+++ b/tensorflow/lite/tools/evaluation/utils.h
@@ -19,10 +19,23 @@
 #include <string>
 #include <vector>
 
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+
 namespace tflite {
 namespace evaluation {
+std::string StripTrailingSlashes(const std::string& path);
+
 bool ReadFileLines(const std::string& file_path,
                    std::vector<std::string>* lines_output);
+
+TfLiteStatus GetSortedFileNames(const std::string& directory,
+                                std::vector<std::string>* result);
+
+Interpreter::TfLiteDelegatePtr CreateNNAPIDelegate();
+
+Interpreter::TfLiteDelegatePtr CreateGPUDelegate(FlatBufferModel* model);
+
 }  // namespace evaluation
 }  // namespace tflite
 
diff --git a/tensorflow/lite/tools/evaluation/utils_test.cc b/tensorflow/lite/tools/evaluation/utils_test.cc
index 6406db7..498de13 100644
--- a/tensorflow/lite/tools/evaluation/utils_test.cc
+++ b/tensorflow/lite/tools/evaluation/utils_test.cc
@@ -18,16 +18,32 @@
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/context.h"
 
 namespace tflite {
 namespace evaluation {
 namespace {
 
-constexpr char kFilePath[] =
+constexpr char kLabelsPath[] =
     "tensorflow/lite/tools/evaluation/testdata/labels.txt";
+constexpr char kDirPath[] =
+    "tensorflow/lite/tools/evaluation/testdata";
+constexpr char kEmptyFilePath[] =
+    "tensorflow/lite/tools/evaluation/testdata/empty.txt";
+
+TEST(UtilsTest, StripTrailingSlashesTest) {
+  std::string path = "/usr/local/folder/";
+  EXPECT_EQ(StripTrailingSlashes(path), "/usr/local/folder");
+
+  path = "/usr/local/folder";
+  EXPECT_EQ(StripTrailingSlashes(path), path);
+
+  path = "folder";
+  EXPECT_EQ(StripTrailingSlashes(path), path);
+}
 
 TEST(UtilsTest, ReadFileErrors) {
-  std::string correct_path(kFilePath);
+  std::string correct_path(kLabelsPath);
   std::string wrong_path("xyz.txt");
   std::vector<std::string> lines;
   EXPECT_FALSE(ReadFileLines(correct_path, nullptr));
@@ -35,7 +51,7 @@
 }
 
 TEST(UtilsTest, ReadFileCorrectly) {
-  std::string file_path(kFilePath);
+  std::string file_path(kLabelsPath);
   std::vector<std::string> lines;
   EXPECT_TRUE(ReadFileLines(file_path, &lines));
 
@@ -44,6 +60,17 @@
   EXPECT_EQ(lines[1], "label2");
 }
 
+TEST(UtilsTest, SortedFilenamesTest) {
+  std::vector<std::string> files;
+  EXPECT_EQ(GetSortedFileNames(kDirPath, &files), kTfLiteOk);
+
+  EXPECT_EQ(files.size(), 2);
+  EXPECT_EQ(files[0], kEmptyFilePath);
+  EXPECT_EQ(files[1], kLabelsPath);
+
+  EXPECT_EQ(GetSortedFileNames("wrong_path", &files), kTfLiteError);
+}
+
 }  // namespace
 }  // namespace evaluation
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index af18fab..4c80880 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -86,14 +86,28 @@
 	tensorflow/lite/profiling/profile_summarizer.cc \
 	tensorflow/core/util/stats_calculator.cc
 
+CMD_LINE_TOOLS_SRCS := \
+  tensorflow/lite/tools/command_line_flags.cc
+
 CORE_CC_ALL_SRCS := \
 $(wildcard tensorflow/lite/*.cc) \
 $(wildcard tensorflow/lite/*.c) \
 $(wildcard tensorflow/lite/c/*.c) \
+$(wildcard tensorflow/lite/core/*.cc) \
+$(wildcard tensorflow/lite/core/api/*.cc) \
 $(wildcard tensorflow/lite/experimental/c/*.c) \
 $(wildcard tensorflow/lite/experimental/c/*.cc) \
-$(wildcard tensorflow/lite/core/*.cc) \
-$(wildcard tensorflow/lite/core/api/*.cc)
+$(wildcard tensorflow/lite/experimental/ruy/allocator.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/block_map.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/blocking_counter.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/context.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/detect_dotprod.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/kernel.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/pack.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/pmu.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/thread_pool.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/trace.cc) \
+$(wildcard tensorflow/lite/experimental/ruy/tune.cc)
 ifneq ($(BUILD_TYPE),micro)
 CORE_CC_ALL_SRCS += \
 $(wildcard tensorflow/lite/kernels/*.cc) \
@@ -135,9 +149,11 @@
 	BUILD_WITH_NNAPI=false
 endif
 ifeq ($(BUILD_WITH_NNAPI),true)
-	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc
+	CORE_CC_ALL_SRCS += tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+	CORE_CC_ALL_SRCS += tensorflow/lite/nnapi/nnapi_implementation.cc
 else
-	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate.cc
+	CORE_CC_ALL_SRCS += tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc
+	CORE_CC_ALL_SRCS += tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
 endif
 
 ifeq ($(TARGET),ios)
@@ -154,9 +170,13 @@
 
 # Benchmark sources
 BENCHMARK_SRCS_DIR := tensorflow/lite/tools/benchmark
+EVALUATION_UTILS_SRCS := \
+  tensorflow/lite/tools/evaluation/utils.cc
 BENCHMARK_ALL_SRCS := $(TFLITE_CC_SRCS) \
 	$(wildcard $(BENCHMARK_SRCS_DIR)/*.cc) \
-	$(PROFILE_SUMMARIZER_SRCS)
+	$(PROFILE_SUMMARIZER_SRCS) \
+  $(CMD_LINE_TOOLS_SRCS) \
+	$(EVALUATION_UTILS_SRCS)
 
 BENCHMARK_SRCS := $(filter-out \
 	$(wildcard $(BENCHMARK_SRCS_DIR)/*_test.cc), \
@@ -171,9 +191,10 @@
 ALL_SRCS := \
 	$(MINIMAL_SRCS) \
 	$(PROFILER_SRCS) \
-	$(PROFILER_SUMMARY_SRCS) \
+	$(PROFILE_SUMMARIZER_SRCS) \
 	$(TF_LITE_CC_SRCS) \
-	$(BENCHMARK_SRCS)
+	$(BENCHMARK_SRCS) \
+  $(CMD_LINE_TOOLS_SRCS)
 
 # Where compiled objects are stored.
 GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
@@ -223,12 +244,16 @@
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
 
+lib: $(LIB_PATH)
+
 $(MINIMAL_BINARY): $(MINIMAL_OBJS) $(LIB_PATH)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
 	-o $(MINIMAL_BINARY) $(MINIMAL_OBJS) \
 	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
+minimal: $(MINIMAL_BINARY)
+
 $(BENCHMARK_LIB) : $(LIB_PATH) $(BENCHMARK_OBJS)
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(BENCHMARK_LIB) $(LIB_OBJS) $(BENCHMARK_OBJS)
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
index 8b617ef..3678f55 100755
--- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -32,7 +32,7 @@
 while getopts "a:p" opt_name; do
   case "$opt_name" in
     a) BUILD_ARCHS="${OPTARG}";;
-    p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
+    p) profiling_args='-DGEMMLOWP_PROFILING';;
     *) usage;;
   esac
 done
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index ad9c992..22a473b 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -31,6 +31,7 @@
     srcs = ["model_utils.cc"],
     hdrs = ["model_utils.h"],
     deps = [
+        ":operator_property",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/kernels/internal:types",
@@ -179,6 +180,7 @@
         "//tensorflow/lite/tools/optimize:testdata/argmax.bin",
         "//tensorflow/lite/tools/optimize:testdata/concat.bin",
         "//tensorflow/lite/tools/optimize:testdata/fc.bin",
+        "//tensorflow/lite/tools/optimize:testdata/mixed.bin",
         "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
diff --git a/tensorflow/lite/tools/optimize/model_utils.cc b/tensorflow/lite/tools/optimize/model_utils.cc
index 0243671..e0755e4 100644
--- a/tensorflow/lite/tools/optimize/model_utils.cc
+++ b/tensorflow/lite/tools/optimize/model_utils.cc
@@ -13,6 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/tools/optimize/model_utils.h"
+
 #include <memory>
 
 #include "absl/memory/memory.h"
@@ -20,6 +21,7 @@
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/operator_property.h"
 
 namespace tflite {
 namespace optimize {
@@ -121,6 +123,19 @@
          !tensor->quantization->max.empty();
 }
 
+void SetOperatorCodeVersion(ModelT* model) {
+  for (int i = 0; i < model->operator_codes.size(); ++i) {
+    OperatorCodeT* op_code = model->operator_codes[i].get();
+    const BuiltinOperator op_buildin_code = op_code->builtin_code;
+    operator_property::OperatorProperty property =
+        operator_property::GetOperatorProperty(op_buildin_code);
+    if (property.quantizable) {
+      // Only quantizable operations get their version updated; others keep it.
+      op_code->version = property.version;
+    }
+  }
+}
+
 }  // namespace utils
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/model_utils.h b/tensorflow/lite/tools/optimize/model_utils.h
index fea87e8..6583d6a 100644
--- a/tensorflow/lite/tools/optimize/model_utils.h
+++ b/tensorflow/lite/tools/optimize/model_utils.h
@@ -53,6 +53,10 @@
 
 bool HasMinMax(const TensorT* tensor);
 
+// Set version of OperatorCode. The version will only be applied for operations
+// that have been quantized.
+void SetOperatorCodeVersion(ModelT* model);
+
 }  // namespace utils
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc
index 01a10a5..13e8d16 100644
--- a/tensorflow/lite/tools/optimize/operator_property.cc
+++ b/tensorflow/lite/tools/optimize/operator_property.cc
@@ -17,134 +17,206 @@
 namespace tflite {
 namespace optimize {
 namespace operator_property {
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property) {
-  if (op == BuiltinOperator_ADD || op == BuiltinOperator_MUL) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0, 1};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
+  OperatorProperty property;
+  switch (op) {
+    case BuiltinOperator_ADD:
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_ARG_MAX:
+      property.input_indexes = {0};
+      // ArgMax has no quantizable output.
+      property.version = 2;
+      break;
+    case BuiltinOperator_AVERAGE_POOL_2D:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_BATCH_TO_SPACE_ND:
+    case BuiltinOperator_SPACE_TO_BATCH_ND:
+    case BuiltinOperator_SPACE_TO_DEPTH:
+      // We skip inputs 1 and 2 since they aren't real valued (they are shapes).
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_CONCATENATION:
+      property.arbitrary_inputs = true;
+      property.input_indexes = {};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_CONV_2D:
+      property.per_axis = true;
+      property.per_axis_index = 0;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 2;
+      break;
+    case BuiltinOperator_DEPTHWISE_CONV_2D:
+      property.per_axis = true;
+      property.per_axis_index = 3;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 3;
+      break;
+    case BuiltinOperator_EQUAL:
+    case BuiltinOperator_NOT_EQUAL:
+    case BuiltinOperator_GREATER:
+    case BuiltinOperator_GREATER_EQUAL:
+    case BuiltinOperator_LESS:
+    case BuiltinOperator_LESS_EQUAL:
+      property.input_indexes = {0, 1};
+      // Comparisons have no quantizable outputs.
+      property.version = 2;
+      break;
+    case BuiltinOperator_FULLY_CONNECTED:
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 4;
+      break;
+    case BuiltinOperator_GATHER:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_LOG_SOFTMAX:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      // LogSoftmax requires output with 16/256 as scale and 127 as zero point.
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {16.0 / 256.0, 127};
+      property.version = 2;
+      break;
+    case BuiltinOperator_LOGISTIC:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      // Logistic requires output with 1/256 as scale and -128 as zero point.
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 256.0, -128};
+      property.version = 2;
+      break;
+    case BuiltinOperator_L2_NORMALIZATION:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      // L2 Norm requires output with 1/128 as scale and 0 as zero point.
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 128.0, 0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_MAX_POOL_2D:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_MAXIMUM:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_MEAN:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_MINIMUM:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_MUL:
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_PAD:
+    case BuiltinOperator_PADV2:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_QUANTIZE:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 1;
+      break;
+    case BuiltinOperator_RESHAPE:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
+    case BuiltinOperator_RESIZE_BILINEAR:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_SHAPE:
+      property.input_indexes = {0};
+      // Shape has no quantizable output.
+      property.version = 1;
+      break;
+    case BuiltinOperator_SLICE:
+      // We skip inputs 1 and 2 since they aren't real valued (they are the
+      // index and size).
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    case BuiltinOperator_SQUEEZE:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
+    case BuiltinOperator_SOFTMAX:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      // Softmax requires output with 1/256 as scale and -128 as zero point.
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 256.0, -128};
+      property.version = 2;
+      break;
+    case BuiltinOperator_SUB:
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_TANH:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      // Tanh requires output with 1/128 as scale and 0 as zero point.
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 128.0, 0};
+      property.version = 2;
+      break;
+    case BuiltinOperator_TRANSPOSE:
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
+    default:
+      // No quantized implementation exists for this operation.
+      property.quantizable = false;
   }
-  if (op == BuiltinOperator_AVERAGE_POOL_2D ||
-      op == BuiltinOperator_MAX_POOL_2D || op == BuiltinOperator_SQUEEZE) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = true;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_CONCATENATION) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = true;
-    property->input_indexes = {};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = true;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_CONV_2D) {
-    property->per_axis = true;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0, 1};
-    property->output_indexes = {0};
-    property->biases = {2};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_DEPTHWISE_CONV_2D) {
-    property->per_axis = true;
-    property->per_axis_index = 3;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0, 1};
-    property->output_indexes = {0};
-    property->biases = {2};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_FULLY_CONNECTED) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0, 1};
-    property->output_indexes = {0};
-    property->biases = {2};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_MEAN || op == BuiltinOperator_PAD ||
-      op == BuiltinOperator_QUANTIZE || op == BuiltinOperator_RESHAPE) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_SOFTMAX) {
-    // Softmax requires output with 1/256 as scale and -128 as zero point.
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = true;
-    property->restricted_value_on_output = {1 / 256.0, -128};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_TANH) {
-    // Tanh requires output with 1/128 as scale and 0 as zero point.
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0};
-    property->output_indexes = {0};
-    property->biases = {};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = true;
-    property->restricted_value_on_output = {1 / 128.0, 0};
-    return kTfLiteOk;
-  }
-  if (op == BuiltinOperator_ARG_MAX) {
-    property->per_axis = false;
-    property->per_axis_index = 0;
-    property->arbitrary_inputs = false;
-    property->input_indexes = {0};
-    // ArgMax has no quantizable output, so there is nothing to do here.
-    property->output_indexes = {};
-    property->biases = {};
-    property->restrict_same_input_output_scale = false;
-    property->restriction_on_output = false;
-    property->restricted_value_on_output = {};
-    return kTfLiteOk;
-  }
-  return kTfLiteError;
+  return property;
 }
 
 }  // namespace operator_property
diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h
index dd076b7..7d01ab4 100644
--- a/tensorflow/lite/tools/optimize/operator_property.h
+++ b/tensorflow/lite/tools/optimize/operator_property.h
@@ -23,31 +23,35 @@
 namespace operator_property {
 
 struct OperatorProperty {
+  // Whether quantization is currently supported for this operation.
+  bool quantizable = true;
   // Per axis.
-  bool per_axis;
+  bool per_axis = false;
   // TODO(jianlijianli): remove dimension index and read it from tensor instead.
-  int per_axis_index;
+  int per_axis_index = 0;
 
   // Op has arbitrary number of inputs, such as concat.
-  bool arbitrary_inputs;
+  bool arbitrary_inputs = false;
   // Input and weight indexes. Unable to separate the two because of ops such as
   // ADD.
-  std::vector<int> input_indexes;
+  std::vector<int> input_indexes = {};
 
   // Output indexes
-  std::vector<int> output_indexes;
+  std::vector<int> output_indexes = {};
 
   // Bias indexes.
-  std::vector<int> biases;
+  std::vector<int> biases = {};
 
   // Constraints.
-  bool restrict_same_input_output_scale;
-  bool restriction_on_output;
-  std::pair<float, float> restricted_value_on_output;
+  bool restrict_same_input_output_scale = false;
+  bool restriction_on_output = false;
+  std::pair<float, float> restricted_value_on_output = {0.0, 0.0};
+
+  // Op version.
+  int version = 1;
 };
 
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property);
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op);
 
 }  // namespace operator_property
 }  // namespace optimize
diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc
index 027d121..058dc31 100644
--- a/tensorflow/lite/tools/optimize/quantize_model.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@@ -35,6 +35,7 @@
 namespace optimize {
 
 namespace {
+
 TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
                           const TensorT* weight_tensor, TensorT* bias_tensor,
                           bool is_per_channel, int channel_dim_index,
@@ -209,13 +210,23 @@
 // For Uint8 input and output, leading op is Quantize (uint8 to
 // int8, can be thought as "requant") and tailing op is also Quantize (int8 to
 // uint8, can be thought as "requant").
-void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
-                            const TensorType& output_type) {
+TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
+                                    const TensorType& output_type,
+                                    ErrorReporter* error_reporter) {
   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
 
     for (int i = 0; i < subgraph->inputs.size(); ++i) {
+      TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported input type %s for input tensor %d of type %s.",
+            EnumNameTensorType(input_type), subgraph->inputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t input_idx =
           SetInputType(model, subgraph, subgraph->inputs[i], input_type);
       if (input_idx < 0) {
@@ -224,6 +235,15 @@
       subgraph->inputs[i] = input_idx;
     }
     for (int i = 0; i < subgraph->outputs.size(); ++i) {
+      TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported output type %s for output tensor %d of type %s.",
+            EnumNameTensorType(output_type), subgraph->outputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t output_idx =
           SetOutputType(model, subgraph, subgraph->outputs[i], output_type);
       if (output_idx < 0) {
@@ -232,6 +252,7 @@
       subgraph->outputs[i] = output_idx;
     }
   }
+  return kTfLiteOk;
 }
 
 // Apply constraints to ops if they have any.
@@ -249,9 +270,11 @@
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       // Basically only Concat passes this check.
       if (!property.restrict_same_input_output_scale ||
           (property.input_indexes.size() == 1 &&
@@ -310,10 +333,215 @@
   return kTfLiteOk;
 }
 
+std::vector<int> GetInputIndexes(const OperatorT* op,
+                                 operator_property::OperatorProperty property) {
+  // Ops with a fixed input list use it; all other ops visit every input.
+  if (!property.arbitrary_inputs && property.quantizable) {
+    return property.input_indexes;
+  }
+  std::vector<int> all_inputs;
+  for (int i = 0; i < op->inputs.size(); ++i) {
+    all_inputs.push_back(i);
+  }
+  return all_inputs;
+}
+
+bool ShouldRestrictSameInputOutputScale(
+    operator_property::OperatorProperty property) {
+  const auto& p = property;  // Flagged 1-in/1-out ops without biases only.
+  return p.restrict_same_input_output_scale && p.biases.empty() &&
+         p.input_indexes.size() == 1 && p.output_indexes.size() == 1;
+}
+
+bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
+  // Linear scan over the subgraph's declared input tensor indexes.
+  bool found = false;
+  for (size_t i = 0; i < subgraph->inputs.size() && !found; ++i) {
+    found = (subgraph->inputs[i] == index);
+  }
+  return found;
+}
+
+// Quantize the op input. Will increment op_idx if ops are added.
+TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
+                             size_t* op_idx,
+                             operator_property::OperatorProperty property,
+                             int32_t input_idx, ErrorReporter* error_reporter) {
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[*op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  const int node = static_cast<int>(*op_idx);  // int copy for %d logging.
+  // Validate input_idx before it is used to index op->inputs below.
+  if (input_idx >= op->inputs.size()) {
+    error_reporter->Report(
+        "Required input index %d is larger than the input length %d of op "
+        "%s at index %d in subgraph %d",
+        input_idx, static_cast<int>(op->inputs.size()),
+        EnumNameBuiltinOperator(op_code), node, subgraph_idx);
+    return kTfLiteError;
+  }
+  const int32_t tensor_idx = op->inputs[input_idx];
+  TensorT* tensor = subgraph->tensors[tensor_idx].get();
+  const bool is_input_quantized = utils::IsQuantized(subgraph, tensor_idx);
+  if (property.quantizable && !is_input_quantized) {
+    // The operation is quantizable, but the input isn't yet quantized.
+    if (utils::HasBuffer(model, subgraph, tensor_idx)) {
+      if (utils::QuantizeWeight(model, tensor, property.per_axis,
+                                property.per_axis_index) == kTfLiteError) {
+        error_reporter->Report(
+            "Unable to quantize buffer or min/max value for input %d "
+            "in op %s in subgraph %d, node: %d",
+            input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, node);
+        return kTfLiteError;
+      }
+    } else if (utils::HasMinMax(tensor)) {
+      if (IsSubgraphInput(subgraph, tensor_idx)) {
+        utils::QuantizeActivation(tensor);
+      } else {
+        // If the tensor is not a model input, we need to add a Quantize
+        // operation since the preceding op may require a float output.
+        std::unique_ptr<TensorT> op_output;
+        utils::MakeTensor(tensor->name + "_int8", tensor->shape,
+                          TensorType_INT8, &op_output);
+        op_output->quantization = absl::make_unique<QuantizationParametersT>();
+        op_output->quantization->min.push_back(tensor->quantization->min[0]);
+        op_output->quantization->max.push_back(tensor->quantization->max[0]);
+        utils::QuantizeActivation(op_output.get());
+        const int32_t quant_op_output_idx = subgraph->tensors.size();
+        subgraph->tensors.push_back(std::move(op_output));
+        std::unique_ptr<OperatorT> quant_op;
+        utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
+                                    quant_op_output_idx);
+        subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                                   std::move(quant_op));
+        op->inputs[input_idx] = quant_op_output_idx;
+        *op_idx += 1;
+      }
+    } else {
+      error_reporter->Report(
+          "Unable to find buffer or min/max value for input activation %d "
+          "in %s in subgraph %d, node: %d",
+          input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, node);
+      return kTfLiteError;
+    }
+  } else if (!property.quantizable && is_input_quantized) {
+    // Input is quantized but the op is not: insert a Dequantize op before it.
+    std::unique_ptr<TensorT> op_output;
+    utils::MakeTensor(tensor->name + "_float", tensor->shape,
+                      TensorType_FLOAT32, &op_output);
+    const int32_t dequant_op_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(op_output));
+    std::unique_ptr<OperatorT> dequant_op;
+    utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
+                                  dequant_op_output_idx);
+    subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                               std::move(dequant_op));
+    op->inputs[input_idx] = dequant_op_output_idx;
+    *op_idx += 1;
+  }
+  return kTfLiteOk;
+}
+
+// Quantize the op output: copies the input params, applies the fixed
+// scale/zero point, or quantizes from recorded min/max, per `property`.
+TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
+                              int32_t op_idx,
+                              operator_property::OperatorProperty property,
+                              int32_t output_idx,
+                              ErrorReporter* error_reporter) {
+  // Nothing to do for the output of an op that is not quantizable.
+  if (!property.quantizable) {
+    return kTfLiteOk;
+  }
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  if (output_idx >= op->outputs.size()) {
+    error_reporter->Report(
+        "Required output index %d is larger than the output length %d of "
+        "op %s at index %d in subgraph %d",
+        output_idx, static_cast<int>(op->outputs.size()),
+        EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+    return kTfLiteError;
+  }
+
+  TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
+  if (ShouldRestrictSameInputOutputScale(property)) {
+    // Copy the input's quantization parameters to the output. For average
+    // pool, max pool, etc. min/max can differ but we want them to be the same.
+    if (property.input_indexes[0] >= op->inputs.size()) {
+      error_reporter->Report(
+          "Required input index %d is larger than the input length %d of "
+          "op %s at index %d in subgraph %d",
+          property.input_indexes[0], static_cast<int>(op->inputs.size()),
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+    const int input_index = op->inputs[property.input_indexes[0]];
+    TensorT* input_tensor = subgraph->tensors[input_index].get();
+    if (input_tensor->quantization == nullptr ||
+        input_tensor->quantization->scale.size() != 1 ||
+        input_tensor->quantization->zero_point.size() != 1 ||
+        input_tensor->quantization->min.size() != 1 ||
+        input_tensor->quantization->max.size() != 1) {
+      error_reporter->Report(
+          "Invalid quantization params for op %s at index %d "
+          "in subgraph %d",
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+
+    const float input_scale = input_tensor->quantization->scale[0];
+    const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
+
+    const float min = input_tensor->quantization->min[0];
+    const float max = input_tensor->quantization->max[0];
+    if (utils::HasMinMax(output_tensor)) {
+      if (output_tensor->quantization->min[0] != min ||
+          output_tensor->quantization->max[0] != max) {
+        printf(
+            "Note the output min/max is different from the input min/max "
+            "for op %s at index %d in subgraph %d. This is legal but "
+            "should happens rarely.",
+            EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      }
+    }
+
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(input_scale);
+    output_tensor->quantization->zero_point.push_back(input_zero_point);
+    output_tensor->quantization->min.push_back(min);
+    output_tensor->quantization->max.push_back(max);
+    output_tensor->type = TensorType_INT8;
+  } else if (property.restriction_on_output) {
+    const auto scale_and_zp = property.restricted_value_on_output;
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(scale_and_zp.first);
+    output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
+    output_tensor->type = TensorType_INT8;
+  } else {
+    // Process regular output that doesn't have any restrictions.
+    if (utils::HasMinMax(output_tensor)) {
+      utils::QuantizeActivation(output_tensor);
+    } else {
+      error_reporter->Report(
+          "Unable to find min/max value for output %d in %s in "
+          "subgraph %d, node: %d",
+          output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
 // Quantize inputs and weights.
 // Because of ops such as lstm, still need to do per op, instead of weights.
 TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
-                                        ModelT* model,
+                                        ModelT* model, bool allow_float,
                                         ErrorReporter* error_reporter) {
   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
@@ -322,154 +550,25 @@
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      if (operator_property::GetOperatorProperty(op_code, &property) ==
-          kTfLiteError) {
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+
+      if (!property.quantizable && !allow_float) {
         error_reporter->Report("Quantization not yet supported for op: %s",
                                EnumNameBuiltinOperator(op_code));
         return kTfLiteError;
       }
-      // Quantize weight and inputs.
-      std::vector<int> input_indexes;
-      if (property.arbitrary_inputs) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
-          input_indexes.push_back(i);
-        }
-      } else {
-        input_indexes = property.input_indexes;
+
+      // Quantize operator inputs/weights.
+      for (const int input_idx : GetInputIndexes(op, property)) {
+        TF_LITE_ENSURE_STATUS(QuantizeOpInput(
+            model, subgraph_idx, &op_idx, property, input_idx, error_reporter));
       }
-      for (const int input_idx : input_indexes) {
-        if (input_idx >= op->inputs.size()) {
-          error_reporter->Report(
-              "Required input index %d is larger than the input length of op "
-              "%s at index %d in subgraph %d",
-              input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-        // Quantize if it is not quantized already as the output of
-        // another op or input of another op.
-        if (!utils::IsQuantized(subgraph, op->inputs[input_idx])) {
-          if (utils::HasBuffer(model, subgraph, op->inputs[input_idx])) {
-            TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-            utils::QuantizeWeight(model, tensor, property.per_axis,
-                                  property.per_axis_index);
-            continue;
-          }
-          if (utils::HasMinMax(tensor)) {
-            utils::QuantizeActivation(tensor);
-            continue;
-          }
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for all inputs and weights here, in the case that min/max is
-          // missing.
-          error_reporter->Report(
-              "Unable to find buffer or min/max value for input activation %d "
-              "in %s in subgraph %d, node: %d",
-              input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
-      }
-      // Quantize output.
+
+      // Quantize operator outputs.
       for (const int output_idx : property.output_indexes) {
-        if (output_idx >= op->outputs.size()) {
-          error_reporter->Report(
-              "Requaired output index %d is larger than the output length of "
-              "op %s at index %d in subgraph %d",
-              output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        if (property.input_indexes.size() == 1 &&
-            property.output_indexes.size() == 1 && property.biases.empty() &&
-            property.restrict_same_input_output_scale) {
-          // Copy quantization parameter. For average pool, max pool, etc
-          // min/max can be different but we want them to be the same.
-          // Get scale and zero point of input.
-          if (property.input_indexes[0] >= op->inputs.size()) {
-            error_reporter->Report(
-                "Requaired input index %d is larger than the input length of "
-                "op  %s at index %d in subgraph %d",
-                property.input_indexes[0], op->inputs.size(),
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const int input_index = op->inputs[property.input_indexes[0]];
-          TensorT* input_tensor = subgraph->tensors[input_index].get();
-          if (input_tensor->quantization->scale.size() != 1 ||
-              input_tensor->quantization->min.size() != 1 ||
-              input_tensor->quantization->max.size() != 1) {
-            error_reporter->Report(
-                "Quantization dimension is not 1 for op %s at index %d in "
-                "subgraph %d",
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const float input_scale = input_tensor->quantization->scale[0];
-          const float input_zero_point =
-              input_tensor->quantization->zero_point[0];
-          const float min = input_tensor->quantization->min[0];
-          const float max = input_tensor->quantization->max[0];
-
-          // Log a warning when we have to override the min/max (scale and zero
-          // point) of output using input.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          if (utils::HasMinMax(output_tensor)) {
-            if (output_tensor->quantization->min[0] != min ||
-                output_tensor->quantization->max[0] != max) {
-              printf(
-                  "Note the output min/max is different from the input min/max "
-                  "for op %s at index %d in subgraph %d. This is legal but "
-                  "should happens rarely. ",
-                  EnumNameBuiltinOperator(op_code), static_cast<int>(op_idx),
-                  static_cast<int>(subgraph_idx));
-            }
-          }
-
-          // Apply to output.
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(input_scale);
-          output_tensor->quantization->zero_point.push_back(input_zero_point);
-          output_tensor->quantization->min.push_back(min);
-          output_tensor->quantization->max.push_back(max);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-        if (property.restriction_on_output) {
-          const std::pair<float, float> scale_and_zp =
-              property.restricted_value_on_output;
-          // Copy scale and zero point since they are fixed.
-          // Applies to softmax, tanh etc.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(scale_and_zp.first);
-          output_tensor->quantization->zero_point.push_back(
-              scale_and_zp.second);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-
-        // Process regular output that doesn't have any restrictions.
-        TensorT* output_tensor =
-            subgraph->tensors[op->outputs[output_idx]].get();
-        if (utils::HasMinMax(output_tensor)) {
-          utils::QuantizeActivation(output_tensor);
-        } else {
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for output here, in the case that min/max is missing.
-          error_reporter->Report(
-              "Unable to find min/max value for output activation %d in %s in "
-              "subgraph %d, node: %d",
-              output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
+        TF_LITE_ENSURE_STATUS(QuantizeOpOutput(
+            model, subgraph_idx, op_idx, property, output_idx, error_reporter));
       }
     }
   }
@@ -486,9 +585,11 @@
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       for (const int bias_idx : property.biases) {
         if (bias_idx >= op->inputs.size()) {
           error_reporter->Report(
@@ -532,13 +633,15 @@
 // Assumes that the operators in the model have been topologically sorted.
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, const TensorType& input_type,
-                           const TensorType& output_type,
+                           const TensorType& output_type, bool allow_float,
                            ErrorReporter* error_reporter) {
   TF_LITE_ENSURE_STATUS(
-      QuantizeWeightsInputOutput(builder, model, error_reporter));
+      QuantizeWeightsInputOutput(builder, model, allow_float, error_reporter));
   TF_LITE_ENSURE_STATUS(ApplyConstraints(builder, model, error_reporter));
   TF_LITE_ENSURE_STATUS(QuantizeBiases(builder, model, error_reporter));
-  SetInputAndOutputTypes(model, input_type, output_type);
+  utils::SetOperatorCodeVersion(model);
+  TF_LITE_ENSURE_STATUS(
+      SetInputAndOutputTypes(model, input_type, output_type, error_reporter));
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model);
@@ -548,9 +651,17 @@
 }
 
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, const TensorType& input_type,
+                           const TensorType& output_type,
+                           ErrorReporter* error_reporter) {
+  return QuantizeModel(builder, model, input_type, output_type,
+                       /*allow_float=*/false, error_reporter);
+}
+
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, ErrorReporter* error_reporter) {
   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
-                       error_reporter);
+                       /*allow_float=*/false, error_reporter);
 }
 
 }  // namespace optimize
diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h
index 58e5899..d651979 100644
--- a/tensorflow/lite/tools/optimize/quantize_model.h
+++ b/tensorflow/lite/tools/optimize/quantize_model.h
@@ -44,6 +44,15 @@
                            const TensorType& output_type,
                            ErrorReporter* error_reporter);
 
+// Same as above, but can allow float intermediate operations for ops that
+// do not yet support quantization.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, const TensorType& input_type,
+                           const TensorType& output_type, bool allow_float,
+                           ErrorReporter* error_reporter);
+
 }  // namespace optimize
 }  // namespace tflite
 
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
index f02e93f..f41bf07 100644
--- a/tensorflow/lite/tools/optimize/quantize_model_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -117,6 +117,10 @@
       EXPECT_EQ(quant_tensor->name, float_tensor->name()->str());
     }
   }
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 TEST_F(QuantizeConvModelTest, OperatorsAreUnchanged) {
@@ -129,7 +133,7 @@
     const auto float_model_op = readonly_model_->operator_codes()->Get(i);
     EXPECT_EQ(model_.operator_codes[i]->builtin_code,
               float_model_op->builtin_code());
-    EXPECT_EQ(model_.operator_codes[i]->version, float_model_op->version());
+    EXPECT_EQ(model_.operator_codes[i]->version, 2);
   }
 
   ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size());
@@ -359,6 +363,14 @@
   EXPECT_EQ(concat->inputs[0], 3);
   EXPECT_EQ(concat->inputs[1], 1);
   EXPECT_EQ(concat->outputs[0], 2);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 2);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code,
+            BuiltinOperator_CONCATENATION);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
+  EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_QUANTIZE);
+  EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
 class QuantizeConvModel1Test : public QuantizeModelTest {
@@ -455,6 +467,11 @@
       EXPECT_NEAR(dequantized_value, weights_float_buffer[element_idx], eps);
     }
   }
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeConvModel2Test : public QuantizeModelTest {
@@ -548,6 +565,11 @@
       EXPECT_EQ(zero_point, 0);
     }
   }
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeSoftmaxTest : public QuantizeModelTest {
@@ -603,6 +625,11 @@
   ASSERT_EQ(output_quant_params->zero_point.size(), 1);
   ASSERT_EQ(1.0f / 256.0f, output_quant_params->scale[0]);
   ASSERT_EQ(-128, output_quant_params->zero_point[0]);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_SOFTMAX);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeAvgPoolTest : public QuantizeModelTest {
@@ -658,6 +685,12 @@
   EXPECT_EQ(input_quant_params->min[0], output_quant_params->min[0]);
   EXPECT_EQ(input_quant_params->max[0], output_quant_params->max[0]);
   EXPECT_EQ(input_quant_params->scale[0], output_quant_params->scale[0]);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code,
+            BuiltinOperator_AVERAGE_POOL_2D);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeMultiInputAddWithReshapeTest : public QuantizeModelTest {
@@ -707,6 +740,13 @@
   ASSERT_EQ(float_output_quant_params->max()->size(), 1);
   ASSERT_EQ(output_quant_params->min.size(), 1);
   ASSERT_EQ(output_quant_params->max.size(), 1);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 2);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_ADD);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
+  EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_RESHAPE);
+  EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
 TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyAddQuantization) {
@@ -751,6 +791,13 @@
   ASSERT_EQ(float_output_quant_params->max()->size(), 1);
   ASSERT_EQ(output_quant_params->min.size(), 1);
   ASSERT_EQ(output_quant_params->max.size(), 1);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 2);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_ADD);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
+  EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_RESHAPE);
+  EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
 class QuantizeConstInputTest : public QuantizeModelTest {
@@ -788,6 +835,11 @@
   }
 
   EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_ADD);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeArgMaxTest : public QuantizeModelTest {
@@ -825,6 +877,11 @@
   // The output of ArgMax should still be the same type.
   ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
             subgraph->tensors[op->outputs[0]].get()->type);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 1);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code, BuiltinOperator_ARG_MAX);
+  EXPECT_EQ(model_.operator_codes[0]->version, 2);
 }
 
 class QuantizeFCTest : public QuantizeModelTest {
@@ -867,6 +924,50 @@
   ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
             TensorType_FLOAT32);
   EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  // check op and versioning.
+  EXPECT_EQ(model_.operator_codes.size(), 2);
+  EXPECT_EQ(model_.operator_codes[0]->builtin_code,
+            BuiltinOperator_FULLY_CONNECTED);
+  EXPECT_EQ(model_.operator_codes[0]->version, 4);
+  EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_RESHAPE);
+  EXPECT_EQ(model_.operator_codes[1]->version, 1);
+}
+
+class QuantizeCustomOpTest : public QuantizeModelTest {
+ protected:
+  QuantizeCustomOpTest() {
+    input_model_ = ReadModel(internal::kModelMixed);
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+};
+
+TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) {
+  auto status =
+      QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8,
+                    /*allow_float=*/true, &error_reporter_);
+  ASSERT_EQ(kTfLiteOk, status);
+  const auto& subgraph = model_.subgraphs[0];
+  auto float_graph = readonly_model_->subgraphs()->Get(0);
+  // The original model is reshape->custom->custom->squeeze.
+  ASSERT_EQ(float_graph->operators()->size(), 4);
+  // The resulting model should be:
+  // reshape->dequantize->custom->custom->quantize->squeeze.
+  ASSERT_EQ(subgraph->operators.size(), 6);
+  const std::vector<BuiltinOperator> op_codes = {
+      BuiltinOperator_RESHAPE,  BuiltinOperator_DEQUANTIZE,
+      BuiltinOperator_CUSTOM,   BuiltinOperator_CUSTOM,
+      BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE};
+  const std::vector<TensorType> op_input_types = {
+      TensorType_INT8,    TensorType_INT8,    TensorType_FLOAT32,
+      TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8};
+  for (int i = 0; i < subgraph->operators.size(); ++i) {
+    OperatorT* op = subgraph->operators[i].get();
+    ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code,
+              op_codes[i]);
+    ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]);
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
index 935dc01..dda8253 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -128,7 +128,6 @@
   } else if (builtin_op_code == BuiltinOperator_FULLY_CONNECTED ||
              builtin_op_code == BuiltinOperator_CONV_2D ||
              builtin_op_code == BuiltinOperator_SVDF ||
-             builtin_op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
              builtin_op_code == BuiltinOperator_RNN ||
              builtin_op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM ||
              builtin_op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN ||
@@ -262,7 +261,6 @@
   for (int i = 0; i < model->operator_codes.size(); ++i) {
     const BuiltinOperator& op_code = model->operator_codes[i]->builtin_code;
     if (op_code == BuiltinOperator_CONV_2D || op_code == BuiltinOperator_SVDF ||
-        op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
         op_code == BuiltinOperator_RNN ||
         op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN ||
         op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM ||
@@ -271,6 +269,7 @@
 
     } else if (op_code == BuiltinOperator_FULLY_CONNECTED ||
                op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM ||
+               op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
                op_code == BuiltinOperator_LSTM) {
       model->operator_codes[i]->version = 3;
     }
@@ -286,7 +285,8 @@
   const OperatorT* consumer_op = consumer_op_infos.front().op;
   const BuiltinOperator op_code =
       model->operator_codes[consumer_op->opcode_index]->builtin_code;
-  return op_code == BuiltinOperator_GATHER;
+  return op_code == BuiltinOperator_GATHER ||
+         op_code == BuiltinOperator_EMBEDDING_LOOKUP;
 }
 
 // Copies quantization parameters from input to output and returns consumers of
diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc
index 9c5a479..5f38d9a 100644
--- a/tensorflow/lite/tools/optimize/test_util.cc
+++ b/tensorflow/lite/tools/optimize/test_util.cc
@@ -45,6 +45,8 @@
 
 const char* kModelWithFCOp = "fc.bin";
 
+const char* kModelMixed = "mixed.bin";
+
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
   vsnprintf(buf, sizeof(buf), format, args);
diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h
index 4fcc728..1e7e14c 100644
--- a/tensorflow/lite/tools/optimize/test_util.h
+++ b/tensorflow/lite/tools/optimize/test_util.h
@@ -69,6 +69,10 @@
 // Test model with a argmax op.
 extern const char* kModelWithFCOp;
 
+// Test model with mixed quantizable and un-quantizable ops.
+// reshape->custom->custom->squeeze.
+extern const char* kModelMixed;
+
 // An error reporter that fails on testing.
 class FailOnErrorReporter : public ErrorReporter {
  public:
diff --git a/tensorflow/lite/tools/optimize/testdata/mixed.bin b/tensorflow/lite/tools/optimize/testdata/mixed.bin
new file mode 100644
index 0000000..b2eeba0
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/testdata/mixed.bin
Binary files differ
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index b6b9d4c..b19a3c0 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -5,6 +5,7 @@
 tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
 tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/__init__.py
 tensorflow/contrib/mpi/BUILD
+tensorflow/stream_executor/build_defs.bzl
 tensorflow/tools/ci_build/remote/BUILD
 tensorflow/tools/pip_package/README
 tensorflow/tools/pip_package/MANIFEST.in
@@ -62,11 +63,15 @@
 tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
+tensorflow/third_party/toolchains/preconfig/centos6/py/BUILD
 tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD
 tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD
+tensorflow/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl
+tensorflow/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/centos6/gcc7/BUILD
 tensorflow/third_party/toolchains/preconfig/centos6/py3/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
@@ -253,7 +258,6 @@
 tensorflow/third_party/libxsmm.BUILD
 tensorflow/third_party/zlib.BUILD
 tensorflow/third_party/eigen.BUILD
-tensorflow/stream_executor/build_defs.bzl
 tensorflow/api_template_v1.__init__.py
 tensorflow/compat_template_v1.__init__.py
 tensorflow/compat_template.__init__.py
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a304425..0d478ea 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -196,7 +196,7 @@
         "//tensorflow/python/ops/signal",
         "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
-        "//tensorflow/python/tools:component_api_helper",
+        "//tensorflow/python/tools:module_util",
         "//tensorflow/python/tools/api/generator:create_python_api",
         "//tensorflow/python/tpu:tpu_noestimator",
         "//third_party/py/numpy",
@@ -443,6 +443,7 @@
         ":numpy_lib",
         ":safe_ptr",
         "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_internal",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -699,11 +700,13 @@
         ":framework_for_generated_wrappers",
         ":function",
         ":graph_util",
+        ":indexed_slices_tensor_spec",
         ":lib",
         ":platform",
         ":pywrap_tensorflow",
         ":random_seed",
         ":sparse_tensor",
+        ":sparse_tensor_spec",
         ":tensor_spec",
         ":tensor_util",
         ":util",
@@ -1192,6 +1195,30 @@
 )
 
 py_library(
+    name = "sparse_tensor_spec",
+    srcs = ["framework/sparse_tensor_spec.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":sparse_tensor",
+        ":tensor_shape",
+        ":tensor_spec",
+    ],
+)
+
+py_library(
+    name = "indexed_slices_tensor_spec",
+    srcs = ["framework/indexed_slices_tensor_spec.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_shape",
+    ],
+)
+
+py_library(
     name = "tensor_util",
     srcs = ["framework/tensor_util.py"],
     srcs_version = "PY2AND3",
@@ -1393,9 +1420,7 @@
     ],
     main = "platform/benchmark_test.py",
     tags = [
-        "manual",
         "no_pip",
-        "notap",
     ],
 )
 
@@ -2837,6 +2862,7 @@
         ":platform",
         ":string_ops",
         ":util",
+        "//tensorflow/python/compat",
     ],
 )
 
@@ -2896,7 +2922,7 @@
 
 py_test(
     name = "loss_scale_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/experimental/loss_scale_test.py"],
     deps = [
         ":loss_scale",
@@ -2908,6 +2934,12 @@
 )
 
 py_library(
+    name = "mixed_precision_global_state",
+    srcs = ["training/experimental/mixed_precision_global_state.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
     name = "mixed_precision",
     srcs = ["training/experimental/mixed_precision.py"],
     srcs_version = "PY2AND3",
@@ -2915,6 +2947,7 @@
         ":config",
         ":loss_scale",
         ":loss_scale_optimizer",
+        ":mixed_precision_global_state",
         "//tensorflow/python:util",
     ],
 )
@@ -3425,6 +3458,7 @@
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":platform",
+        "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
     ],
 )
 
@@ -3994,7 +4028,7 @@
 )
 
 py_library(
-    name = "training",
+    name = "training_lib",
     srcs = glob(
         ["training/**/*.py"],
         exclude = [
@@ -4064,11 +4098,19 @@
         "//tensorflow/python/eager:context",
         "//tensorflow/python/keras/optimizer_v2:learning_rate_schedule",
         "//tensorflow/python/ops/losses",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "training",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":training_lib",
         "//tensorflow/python/training/tracking:base",
         "//tensorflow/python/training/tracking:python_state",
         "//tensorflow/python/training/tracking:util",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
 )
 
@@ -4713,6 +4755,7 @@
         ":pywrap_tensorflow",
         ":session_ops",
         ":util",
+        "//tensorflow/python:mixed_precision_global_state",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 5fb9bcb..a924391 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -36,9 +36,6 @@
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core.converter import ConversionOptions
 from tensorflow.python.autograph.core.converter import Feature
-from tensorflow.python.autograph.core.errors import GraphConstructionError
-from tensorflow.python.autograph.core.errors import improved_errors
-from tensorflow.python.autograph.core.errors import TfRuntimeError
 from tensorflow.python.autograph.impl.api import convert
 from tensorflow.python.autograph.impl.api import converted_call
 from tensorflow.python.autograph.impl.api import do_not_convert
@@ -49,7 +46,7 @@
 from tensorflow.python.autograph.lang.directives import set_loop_options
 from tensorflow.python.autograph.lang.special_functions import stack
 from tensorflow.python.autograph.pyct.errors import AutoGraphError
-from tensorflow.python.autograph.lang.special_functions import tensor_list
+from tensorflow.python.autograph.pyct.errors import StagingError
 from tensorflow.python.autograph.utils import ag_logging
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -66,10 +63,6 @@
     'to_graph',
     # Overloaded operators
     'operators',
-    # Errors
-    'improved_errors',
-    'GraphConstructionError',
-    'TfRuntimeError',
     # Python language "extensions"
     'set_element_type',
     'set_loop_options',
@@ -77,6 +70,7 @@
     'tensor_list',
     # Exceptions
     'AutoGraphError',
+    'StagingError',
     # Utilities: to be removed
     'utils',
 ]
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index d534918..e2dc4b0 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -25,7 +25,6 @@
         "continue_statements.py",
         "control_flow.py",
         "directives.py",
-        "error_handlers.py",
         "function_scopes.py",
         "list_comprehensions.py",
         "lists.py",
@@ -206,18 +205,6 @@
 )
 
 py_test(
-    name = "error_handlers_test",
-    srcs = ["error_handlers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":converters",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python/autograph/core:test_lib",
-        "//tensorflow/python/autograph/pyct",
-    ],
-)
-
-py_test(
     name = "slices_test",
     srcs = ["slices_test.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index dc1e8c8..028017d 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -46,9 +46,11 @@
       continue statement.
     create_guard: bool, whether a guard should be created because a continue
       statement has just been encountered.
+    is_loop_type: bool, whether this block is the body of a loop.
   """
 
   def __init__(self):
+    self.is_loop_type = False
     self.reset_guard_state()
 
   def reset_guard_state(self):
@@ -61,7 +63,13 @@
 
   def visit_Continue(self, node):
     self.state[_Continue].used = True
-    self.state[_Block].reset_guard_state()
+    for block in reversed(self.state[_Block].stack):
+      block.reset_guard_state()
+      # See ContinueCanonicalizationTest.test_multiple_continues for an example
+      # of why it's necessary to reset the state of all enclosing affected
+      # blocks, not just that of the current block.
+      if block.is_loop_type:
+        break
     template = """
       var_name = True
     """
@@ -112,6 +120,7 @@
   def _visit_loop_body(self, node, nodes):
     self.state[_Continue].enter()
     self.state[_Block].enter()
+    self.state[_Block].is_loop_type = True
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
     self.state[_Continue].control_var_name = continue_var
diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py
index 0dbc8e7..8891d81 100644
--- a/tensorflow/python/autograph/converters/continue_statements_test.py
+++ b/tensorflow/python/autograph/converters/continue_statements_test.py
@@ -54,10 +54,10 @@
       v = []
       while x > 0:
         x -= 1
-        if x > 2:
-          continue
         if x > 1:
           continue
+        if x > 2:
+          continue
         v.append(x)
       return v
 
@@ -66,6 +66,26 @@
     self.assertTransformedEquivalent(test_fn, 3)
     self.assertTransformedEquivalent(test_fn, 4)
 
+  def test_multiple_continues_in_nested_scope(self):
+
+    def test_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x > 100:
+          continue
+        try:
+          raise ValueError('intentional')
+        except ValueError:
+          continue
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, [])
+    self.assertTransformedEquivalent(test_fn, [1])
+    self.assertTransformedEquivalent(test_fn, [2])
+    self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+
   def test_for_loop(self):
 
     def test_fn(a):
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index a67f699..21c3d18 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -27,22 +27,6 @@
 from tensorflow.python.autograph.pyct.static_analysis import annos
 
 
-class SymbolNamer(object):
-  """Describes the interface for ControlFlowTransformer's namer."""
-
-  def new_symbol(self, name_root, reserved_locals):
-    """Generate a new unique symbol.
-
-    Args:
-      name_root: String, used as stem in the new name.
-      reserved_locals: Set(string), additional local symbols that are reserved
-          and which should not be used.
-    Returns:
-      String.
-    """
-    raise NotImplementedError()
-
-
 class ControlFlowTransformer(converter.Base):
   """Transforms control flow structures like loops an conditionals."""
 
@@ -499,7 +483,7 @@
             ssf_map)
     else:
       # Loop with no loop-carried state and no early stopping
-      assert not has_extra_test, ('Early stoppiong (e.g. break and/or return) '
+      assert not has_extra_test, ('Early stopping (e.g. break and/or return) '
                                   'should create state variables.')
       loop_nodes = self._for_loop_without_state(node, body_name, node_body)
 
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 9ad229c..4a58d52 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -135,7 +135,8 @@
         test_fn, control_flow, {'TestClass': TestClass}) as result:
       # The tested function would require `tc` to become part of the while loop
       # state, but TensorFlow doesn't support classes at the moment.
-      with self.assertRaisesRegexp(ValueError, 'must.*initialize.*Tensor.*tc'):
+      with self.assertRaisesRegexp(
+          ValueError, 'must be defined before the loop:.*tc.*'):
         result.test_fn(constant_op.constant(5))
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/autograph/converters/directives.py b/tensorflow/python/autograph/converters/directives.py
index 7402ec0..53a4791 100644
--- a/tensorflow/python/autograph/converters/directives.py
+++ b/tensorflow/python/autograph/converters/directives.py
@@ -113,8 +113,9 @@
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
     parent_val = anno.getanno(node.value, STATIC_VALUE, default=None)
-    if parent_val is not None and hasattr(parent_val, node.attr):
-      anno.setanno(node, STATIC_VALUE, getattr(parent_val, node.attr))
+    if parent_val is not None and tf_inspect.ismodule(parent_val):
+      if hasattr(parent_val, node.attr):
+        anno.setanno(node, STATIC_VALUE, getattr(parent_val, node.attr))
     return node
 
   def visit_Expr(self, node):
diff --git a/tensorflow/python/autograph/converters/directives_test.py b/tensorflow/python/autograph/converters/directives_test.py
index 70252a5..cfe1827 100644
--- a/tensorflow/python/autograph/converters/directives_test.py
+++ b/tensorflow/python/autograph/converters/directives_test.py
@@ -99,6 +99,23 @@
     with self.assertRaisesRegexp(ValueError, 'Unexpected keyword.*'):
       directives_converter._map_args(node, invalid_directive)
 
+  def test_value_verification_does_not_trigger_properties(self):
+
+    class TestClass(object):
+
+      @property
+      def b(self):
+        raise ValueError('This should never be evaluated')
+
+    tc = TestClass()
+
+    def test_fn():
+      return tc.b + 1
+
+    node, ctx = self.prepare(test_fn, {'tc': tc})
+    node = directives_converter.transform(node, ctx)
+    self.assertIsNotNone(node)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/error_handlers.py b/tensorflow/python/autograph/converters/error_handlers.py
deleted file mode 100644
index de46c0c..0000000
--- a/tensorflow/python/autograph/converters/error_handlers.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Wraps function bodies with a try/except to rewrite error tracebacks.
-
-Only adds try/except wrappers to functions that have the anno.Basic.ORIGIN
-annotation because these are the functions originally written by the user.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.autograph.pyct import templates
-
-
-class ErrorRewritingTransformer(converter.Base):
-  """Possibly wraps the body of a function in a try/except.
-
-  Only wraps functions that were originally defined by the user, detected by
-  checking for the anno.Basic.ORIGIN annotation.
-  """
-
-  def visit_FunctionDef(self, node):
-    node = self.generic_visit(node)
-
-    if (anno.hasanno(node, anno.Basic.ORIGIN) and
-        len(self.enclosing_entities) <= 1):
-      template = """
-        try:
-          body
-        except:
-          ag__.rewrite_graph_construction_error(ag_source_map__)
-      """
-      node.body = templates.replace(template, body=node.body)
-    return node
-
-
-def transform(node, ctx):
-  return ErrorRewritingTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/error_handlers_test.py b/tensorflow/python/autograph/converters/error_handlers_test.py
deleted file mode 100644
index 1f6c5a6..0000000
--- a/tensorflow/python/autograph/converters/error_handlers_test.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for error_handlers module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.python.autograph.converters import error_handlers
-from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.core import errors
-from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.platform import test
-
-
-class ErrorHandlersTest(converter_testing.TestCase):
-
-  def test_basic(self):
-
-    def test_fn():
-      raise ValueError()
-
-    with self.converted(test_fn, error_handlers, {}) as result:
-      with self.assertRaises(errors.GraphConstructionError):
-        # Here we just assert that the handler works.
-        result.test_fn()
-
-  def test_no_origin_annotation(self):
-
-    def test_fn(x):
-      return x + 1
-
-    node, ctx = self.prepare(test_fn, {})
-    anno.delanno(node, anno.Basic.ORIGIN)
-    node = error_handlers.transform(node, ctx)
-    self.assertIsInstance(node.body[0], gast.Return)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py
index 3173e67..a53206c 100644
--- a/tensorflow/python/autograph/converters/return_statements.py
+++ b/tensorflow/python/autograph/converters/return_statements.py
@@ -375,8 +375,10 @@
       if self.default_to_null_return:
         template = """
           do_return_var_name = False
-          retval_var_name = None
+          retval_var_name = ag__.UndefinedReturnValue()
           body
+          if ag__.is_undefined_return(retval_var_name):
+            retval_var_name = None
           return retval_var_name
         """
       else:
diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py
index b2d3d1b..8697311 100644
--- a/tensorflow/python/autograph/converters/return_statements_test.py
+++ b/tensorflow/python/autograph/converters/return_statements_test.py
@@ -49,7 +49,7 @@
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_else(self):
+  def test_conditional_missing_else(self):
 
     def test_fn(x):
       if x > 0:
@@ -58,7 +58,7 @@
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_else_then_default(self):
+  def test_conditional_missing_else_then_default(self):
 
     def test_fn(x):
       if x > 0:
@@ -68,7 +68,7 @@
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_else_only_then_default(self):
+  def test_conditional_else_only_then_default(self):
 
     def test_fn(x):
       if x < 0:
@@ -216,6 +216,25 @@
     self.assertTransformedEquivalent(test_fn, 3)
     self.assertTransformedEquivalent(test_fn, 4)
 
+  def test_multiple_returns_in_nested_scope(self):
+
+    def test_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x > 100:
+          return v
+        try:
+          raise ValueError('intentional')
+        except ValueError:
+          return v
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, [])
+    self.assertTransformedEquivalent(test_fn, [1])
+    self.assertTransformedEquivalent(test_fn, [2])
+    self.assertTransformedEquivalent(test_fn, [1, 2, 3])
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index fae327e..f2b6999 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -19,7 +19,6 @@
     srcs = [
         "config.py",
         "converter.py",
-        "errors.py",
         "function_wrapping.py",
         "naming.py",
         "unsupported_features_checker.py",
@@ -65,21 +64,6 @@
 )
 
 py_test(
-    name = "errors_test",
-    srcs = ["errors_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":core",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-    ],
-)
-
-py_test(
     name = "function_wrapping_test",
     srcs = ["function_wrapping_test.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 6142699..061c4cf 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -96,8 +96,6 @@
     ASSERT_STATEMENTS: Convert Tensor-dependent assert statements to tf.Assert.
     BUILTIN_FUNCTIONS: Convert builtin functions applied to Tensors to
       their TF counterparts.
-    ERROR_REWRITING: Rewrite errors that occur in the generated code to
-      indicate the source code to which the failing code corresponds.
     LISTS: Convert list idioms, like initializers, slices, append, etc.
     LOGICAL_EXPRESSIONS: Convert data-dependent logical expressions applied to
       Tensors to their TF counterparts.
@@ -110,7 +108,6 @@
   AUTO_CONTROL_DEPS = 'AUTO_CONTROL_DEPS'
   ASSERT_STATEMENTS = 'ASSERT_STATEMENTS'
   BUILTIN_FUNCTIONS = 'BUILTIN_FUNCTIONS'
-  ERROR_REWRITING = 'ERROR_REWRITING'
   LISTS = 'LISTS'
   LOGICAL_EXPRESSIONS = 'LOGICAL_EXPRESSIONS'
   NAME_SCOPES = 'NAME_SCOPES'
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index b7831ed..849df23 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -27,7 +27,6 @@
 from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.lang import special_functions
@@ -78,8 +77,6 @@
       fake_ag.ConversionOptions = converter.ConversionOptions
       fake_ag.Feature = converter.Feature
       fake_ag.utils = utils
-      fake_ag.rewrite_graph_construction_error = (
-          errors.rewrite_graph_construction_error)
       fake_ag.function_scope = function_wrapping.function_scope
       result.ag__ = fake_ag
       result.ag_source_map__ = source_map
diff --git a/tensorflow/python/autograph/core/errors.py b/tensorflow/python/autograph/core/errors.py
deleted file mode 100644
index e94b6d1..0000000
--- a/tensorflow/python/autograph/core/errors.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Error rewriting logic.
-
-Contains the functions responsible for rewriting tracebacks of errors raised
-in AutoGraph (AG) code to refer to user written code, so that errors only refer
-to the original user code.
-
-When 'user code' is used in comments it refers to the original source code that
-the user wrote and is converting using AutoGraph.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import contextlib
-import logging
-import sys
-import traceback
-
-from tensorflow.python.autograph.pyct import origin_info
-from tensorflow.python.framework import errors_impl
-
-# TODO(mdan): Add a superclass common to all errors.
-
-
-class GraphConstructionError(Exception):
-  """Error for graph construction errors from AutoGraph generated code."""
-
-  def __init__(self, original_error, custom_traceback):
-    self.original_error = original_error
-    self.custom_traceback = custom_traceback
-    super(GraphConstructionError, self).__init__()
-
-  def __str__(self):
-    traceback_str = ''.join(traceback.format_list(self.custom_traceback))
-    return ('Traceback (most recent call last):\n' + traceback_str + '\n' + str(
-        self.original_error) + '\n')
-
-
-class TfRuntimeError(Exception):
-  """Error wrapper for runtime errors raised by AutoGraph generated code."""
-
-  def __init__(self, op_name, op_message, custom_traceback):
-    self.op_name = op_name
-    self.op_message = op_message
-    self.custom_traceback = custom_traceback
-    super(TfRuntimeError, self).__init__()
-
-  def __str__(self):
-    message = '%s\n\nCaused by op %r, defined at:\n' % (self.op_message,
-                                                        self.op_name)
-    return message + ''.join(traceback.format_list(self.custom_traceback))
-
-
-def _rewrite_tb(source_map, tb):
-  """Rewrites code references in a traceback.
-
-  Args:
-    source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo], mapping
-        locations to their origin
-    tb: List[Tuple[Text, Text, Text, Text]], consistent with
-        traceback.extract_tb.
-  Returns:
-    List[Tuple[Text, Text, Text, Text]], the rewritten traceback
-  """
-  new_tb = []
-  for frame in tb:
-    filename, lineno, _, _ = frame
-    loc = origin_info.LineLocation(filename, lineno)
-    origin = source_map.get(loc)
-    if origin is not None:
-      new_tb.append(origin.as_frame())
-    else:
-      new_tb.append(frame)
-  return new_tb
-
-
-# TODO(mdan): rename to raise_*
-def rewrite_graph_construction_error(source_map):
-  """Rewrites errors raised by non-AG APIs inside AG generated code.
-
-  This is called from the except handler inside an AutoGraph generated function
-  (that is, during exception handling). Only rewrites the frames corresponding
-  to the function that this is called from, so each function is responsible
-  to call this to have its own frames rewritten.
-
-  This function always raises an error.
-
-  Args:
-    source_map: Dict[origin_info.Location, origin_info.OriginInfo], the source
-        map belonging to the calling function
-
-  Raises:
-    GraphConstructionError: The rewritten underlying error.
-    Exception: The underlying error, if it could not be rewritten.
-  """
-  error_info = sys.exc_info()
-  _, original_error, e_traceback = error_info
-  assert original_error is not None
-  try:
-    current_traceback = _cut_traceback_loops(source_map,
-                                             traceback.extract_tb(e_traceback))
-    if isinstance(original_error, GraphConstructionError):
-      # TODO(mdan): This is incomplete.
-      # The error might have bubbled through a non-converted function.
-      previous_traceback = original_error.custom_traceback
-      cleaned_traceback = [current_traceback[0]] + previous_traceback
-    else:
-      cleaned_traceback = current_traceback
-
-    cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback)
-
-    if isinstance(original_error, GraphConstructionError):
-      original_error.custom_traceback = cleaned_traceback
-      new_error = original_error
-    else:
-      new_error = GraphConstructionError(original_error, cleaned_traceback)
-  except Exception:
-    logging.exception('Error while rewriting AutoGraph error:')
-    # TODO(mdan): Should reraise here, removing the top frame as well.
-    raise original_error
-  else:
-    raise new_error
-  finally:
-    # Addresses warning https://docs.python.org/2/library/sys.html#sys.exc_info.
-    del e_traceback
-
-
-def _cut_traceback_loops(source_map, original_traceback):
-  """Check for cases where we leave a user method and re-enter it.
-
-  This is done by looking at the function names when the filenames are from any
-  files the user code is in.  If we find a case where we return to a user method
-  after leaving it then we cut out the frames in between because we assume this
-  means these in between frames are from internal AutoGraph code that shouldn't
-  be included.
-
-  An example of this is:
-
-   File "file1.py", line 57, in my_func
-     ...
-   File "control_flow_ops.py", line 231, in cond
-     ...
-   File "control_flow_ops.py", line 1039, in inner_cond
-     ...
-   File "file1.py", line 68, in my_func
-     ...
-
-  Where we would remove the control_flow_ops.py frames because we re-enter
-  my_func in file1.py.
-
-  The source map keys are (file_path, line_number) so get the set of all user
-  file_paths.
-
-  Args:
-    source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo], mapping
-      locations to their origin
-    original_traceback: List[Tuple[Text, Text, Text, Text]], consistent with
-      traceback.extract_tb.
-
-  Returns:
-    List[Tuple[Text, Text, Text, Text]], the traceback with any loops removed.
-  """
-  all_user_files = set(loc.filename for loc in source_map)
-  cleaned_traceback = []
-  last_user_frame_index = None
-  last_user_user_file_path = None
-  # TODO(mdan): Simplify this logic.
-  for fi, frame in enumerate(original_traceback):
-    frame_file_path, lineno, _, _ = frame
-    src_map_key = origin_info.LineLocation(frame_file_path, lineno)
-    if frame_file_path in all_user_files:
-      if src_map_key in source_map:
-        if (last_user_frame_index is not None and
-            last_user_user_file_path == frame_file_path):
-          cleaned_traceback = cleaned_traceback[:last_user_frame_index]
-      last_user_frame_index = fi
-      last_user_user_file_path = frame_file_path
-    cleaned_traceback.append(frame)
-  return cleaned_traceback
-
-
-# TODO(mdan): This should be consistent with rewrite_graph_construction_error
-# Both should either raise or return.
-def rewrite_tf_runtime_error(error, source_map):
-  """Rewrites TensorFlow runtime errors raised by ops created in AG code.
-
-  Args:
-    error: tf.errors.OpError
-    source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo]
-
-  Returns:
-    TfRuntimeError, the rewritten underlying error.
-  """
-  try:
-    cleaned_traceback = _cut_traceback_loops(source_map, error.op.traceback)
-    cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback)
-
-    op_name = error.op.name
-    op_message = error.message
-    rewritten_error = TfRuntimeError(op_name, op_message, cleaned_traceback)
-    return rewritten_error
-  except Exception:  # pylint: disable=broad-except
-    logging.exception('Error while rewriting AutoGraph error:')
-    return error
-
-
-# TODO(znado): Add arg to enable different levels of error rewriting.
-@contextlib.contextmanager
-def improved_errors(converted_function):
-  """Context manager that rewrites runtime errors.
-
-  This context manager will rewrite runtime errors so that their traceback
-  is relative to the original code before conversion.
-
-  Use with the output of to_graph, and wrap the execution of respective ops.
-  Example:
-
-    converted_my_func = ag.to_graph(my_func)
-    ops = converted_my_func(...)
-
-    with ag.improved_errors(converted_my_func):
-      sess.run(ops)
-
-  Args:
-    converted_function: Callable[..., Any], the output of a to_graph call
-
-  Yields:
-    None
-
-  Raises:
-    TfRuntimeError: if any OpError originates in the converted code, it will
-        be wrapped into a TfRuntimeError
-    ValueError: If converted_function is not generated by AutoGraph
-  """
-  if (getattr(converted_function, 'ag_source_map', None) is None or
-      not isinstance(converted_function.ag_source_map, dict)):
-    raise ValueError(
-        'converted_function must be the result of an autograph.to_graph call')
-  try:
-    yield
-  except errors_impl.OpError as e:
-    raise rewrite_tf_runtime_error(e, converted_function.ag_source_map)
diff --git a/tensorflow/python/autograph/core/errors_test.py b/tensorflow/python/autograph/core/errors_test.py
deleted file mode 100644
index 845a28a..0000000
--- a/tensorflow/python/autograph/core/errors_test.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for errors module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.autograph.core import errors
-from tensorflow.python.autograph.pyct import origin_info
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors as tf_errors
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import tf_inspect
-
-
-def zero_div():
-  x = array_ops.constant(10, dtype=dtypes.int32)
-  return x // 0
-
-
-def zero_div_caller():
-  return zero_div()
-
-
-class RuntimeErrorsTest(test.TestCase):
-
-  def fake_origin(self, function, line_offset):
-    _, lineno = tf_inspect.getsourcelines(function)
-    filename = tf_inspect.getsourcefile(function)
-    lineno += line_offset
-    loc = origin_info.LineLocation(filename, lineno)
-    origin = origin_info.OriginInfo(loc, 'test_function_name', 'test_code',
-                                    'test_comment')
-    return loc, origin
-
-  @test_util.run_deprecated_v1
-  def test_improved_errors_basic(self):
-    loc, origin = self.fake_origin(zero_div, 2)
-    zero_div_caller.ag_source_map = {loc: origin}
-
-    ops = zero_div_caller()
-    with self.assertRaises(errors.TfRuntimeError) as cm:
-      with errors.improved_errors(zero_div_caller):
-        with self.cached_session() as sess:
-          self.evaluate(ops)
-
-    for frame in cm.exception.custom_traceback:
-      _, _, function_name, _ = frame
-      self.assertNotEqual('zero_div', function_name)
-    self.assertIn(origin.as_frame(), set(cm.exception.custom_traceback))
-
-  @test_util.run_deprecated_v1
-  def test_improved_errors_no_matching_lineno(self):
-    loc, origin = self.fake_origin(zero_div, -1)
-    zero_div_caller.ag_source_map = {loc: origin}
-
-    ops = zero_div_caller()
-    with self.assertRaises(errors.TfRuntimeError) as cm:
-      with errors.improved_errors(zero_div_caller):
-        with self.cached_session() as sess:
-          self.evaluate(ops)
-
-    all_function_names = set()
-    for frame in cm.exception.custom_traceback:
-      _, _, function_name, _ = frame
-      all_function_names.add(function_name)
-      self.assertNotEqual('test_function_name', function_name)
-    self.assertIn('zero_div', all_function_names)
-
-  @test_util.run_deprecated_v1
-  def test_improved_errors_failures(self):
-    loc, _ = self.fake_origin(zero_div, 2)
-    zero_div_caller.ag_source_map = {loc: 'bogus object'}
-
-    ops = zero_div_caller()
-    with self.assertRaises(tf_errors.InvalidArgumentError):
-      with errors.improved_errors(zero_div_caller):
-        with self.cached_session() as sess:
-          self.evaluate(ops)
-
-  def test_improved_errors_validation(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        'converted_function must be the result of an autograph.to_graph call'):
-      errors.improved_errors(zero_div).__enter__()
-    with self.assertRaisesRegexp(
-        ValueError,
-        'converted_function must be the result of an autograph.to_graph call'):
-      zero_div_caller.ag_source_map = 'not a dict'
-      errors.improved_errors(zero_div_caller).__enter__()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index 66f7915..fe205cd 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -31,9 +31,7 @@
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
-        "//third_party/py/numpy",
         "@gast_archive//:gast",
-        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index d2b1b7d..df75aa0 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -24,16 +24,12 @@
 import inspect
 import os
 import pdb
+import re
 import sys
 import textwrap
 
 from enum import Enum
 
-# pylint:disable=g-bad-import-order
-import six
-# pylint:enable=g-bad-import-order
-
-
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import conversion
 from tensorflow.python.autograph.operators import py_builtins
@@ -192,7 +188,6 @@
     return True
   # Note: inspect is required here, to avoid unpacking tf.function decorators.
   if inspect.ismethod(f):
-    f = six.get_unbound_function(f)
     # The the unbound method if of this type. Example:
     #
     # class ClassType:
@@ -201,7 +196,7 @@
     #     ...
     # o = ClassType()
     # o.method()
-    if isinstance(f, type_entity):
+    if isinstance(f.__func__, type_entity):
       return True
   return False
 
@@ -262,10 +257,18 @@
 
   # Other built-in modules are permanently whitelisted.
   # TODO(mdan): Figure out how to do this consistently for all stdlib modules.
-  if any(f in m.__dict__.values() for m in (collections, pdb, copy, inspect)):
+  if any(
+      f in m.__dict__.values() for m in (collections, pdb, copy, inspect, re)):
     logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f)
     return _call_unconverted(f, args, kwargs)
 
+  # Custom ops and kernels are also permanently whitelisted.
+  # See tensorflow.framework.load_library.
+  if (hasattr(f, '__module__')
+      and hasattr(f.__module__, '_IS_TENSORFLOW_PLUGIN')):
+    logging.log(2, 'Permanently whitelisted: %s: TensorFlow plugin', f)
+    return _call_unconverted(f, args, kwargs)
+
   if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
     return _call_unconverted(f, args, kwargs)
 
@@ -319,11 +322,18 @@
       target_entity = f
       raise NotImplementedError('unknown callable type "%s"' % type(f))
 
-    if (not tf_inspect.isclass(target_entity) and
-        not hasattr(target_entity, '__code__')):
-      logging.log(
-          2, 'Permanently whitelisted: %s: native binding', target_entity)
-      return _call_unconverted(f, args, kwargs)
+    if not tf_inspect.isclass(target_entity):
+      if not hasattr(target_entity, '__code__'):
+        logging.log(
+            2, 'Permanently whitelisted: %s: native binding', target_entity)
+        return _call_unconverted(f, args, kwargs)
+      elif (hasattr(target_entity.__code__, 'co_filename') and
+            target_entity.__code__.co_filename == '<string>'):
+        # TODO(mdan): __globals__['txt'] might work in Py3.
+        logging.log(
+            2, 'Permanently whitelisted: %s: dynamic code (exec?)',
+            target_entity)
+        return _call_unconverted(f, args, kwargs)
 
     converted_f = to_graph(
         target_entity,
@@ -362,10 +372,17 @@
 
     return _call_unconverted(f, args, kwargs)
 
-  if kwargs is not None:
-    result = converted_f(*effective_args, **kwargs)
-  else:
-    result = converted_f(*effective_args)
+  try:
+    if kwargs is not None:
+      result = converted_f(*effective_args, **kwargs)
+    else:
+      result = converted_f(*effective_args)
+  except errors.StagingError as e:
+    target_origin = errors.extract_origin_info(converted_f)
+    raise errors.StagingError((target_origin,) + e.user_trace, e.original_error)
+  except errors.AutoGraphError as e:
+    target_origin = errors.extract_origin_info(converted_f)
+    raise errors.StagingError((target_origin,), e)
 
   return result
 
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index de61023..d65a4ed 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -21,7 +21,10 @@
 import collections
 import functools
 import gc
+import imp
 import os
+import re
+import textwrap
 import types
 
 import numpy as np
@@ -202,7 +205,8 @@
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
           x //= api.converted_call(self.called_member, None,
-                                   converter.ConversionOptions(), (a,), {})
+                                   converter.ConversionOptions(recursive=True),
+                                   (a,), {})
         return x
 
     tc = TestClass()
@@ -212,9 +216,16 @@
     self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
-    x = api.converted_call(range, None, converter.ConversionOptions(), (3,), {})
+    x = api.converted_call(range, None,
+                           converter.ConversionOptions(recursive=True), (3,),
+                           {})
     self.assertEqual((0, 1, 2), tuple(x))
 
+    x = api.converted_call('compile', re,
+                           converter.ConversionOptions(recursive=True),
+                           ('mnas_v4_a.*\\/.*(weights|kernel):0$',), {})
+    self.assertIsNotNone(x.match('mnas_v4_a/weights:0'))
+
   def test_converted_call_function(self):
 
     def test_fn(x):
@@ -222,7 +233,8 @@
         return -x
       return x
 
-    x = api.converted_call(test_fn, None, converter.ConversionOptions(),
+    x = api.converted_call(test_fn, None,
+                           converter.ConversionOptions(recursive=True),
                            (constant_op.constant(-1),), {})
     self.assertEqual(1, self.evaluate(x))
 
@@ -236,13 +248,15 @@
 
     x = api.converted_call(
         functools.partial(test_fn, constant_op.constant(-1), z=-3), None,
-        converter.ConversionOptions(), (constant_op.constant(-2),), {})
+        converter.ConversionOptions(recursive=True),
+        (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
     x = api.converted_call(
         functools.partial(
             functools.partial(test_fn, constant_op.constant(-1)), z=-3), None,
-        converter.ConversionOptions(), (constant_op.constant(-2),), {})
+        converter.ConversionOptions(recursive=True),
+        (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
@@ -266,8 +280,8 @@
         return self.x
 
     tc = TestClass(constant_op.constant(-1))
-    x = api.converted_call(tc.test_method, None, converter.ConversionOptions(),
-                           (), {})
+    x = api.converted_call(tc.test_method, None,
+                           converter.ConversionOptions(recursive=True), (), {})
     self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_synthetic_method(self):
@@ -285,8 +299,8 @@
     tc = TestClass(constant_op.constant(-1))
     test_method = types.MethodType(test_function, tc)
 
-    x = api.converted_call(test_method, None, converter.ConversionOptions(),
-                           (), {})
+    x = api.converted_call(test_method, None,
+                           converter.ConversionOptions(recursive=True), (), {})
     self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_wrapper(self):
@@ -299,8 +313,9 @@
     tc = TestClass()
 
     # `method.__get__()` returns a so-called method-wrapper.
-    wrapper = api.converted_call(
-        '__get__', tc.foo, converter.ConversionOptions(), (tc,), {})
+    wrapper = api.converted_call('__get__', tc.foo,
+                                 converter.ConversionOptions(recursive=True),
+                                 (tc,), {})
     self.assertEqual(wrapper, tc.foo)
 
   def test_converted_call_method_as_object_attribute(self):
@@ -324,7 +339,7 @@
     tc = TestClass(obj.method)
 
     x = api.converted_call('another_obj_method', tc,
-                           converter.ConversionOptions(), (), {})
+                           converter.ConversionOptions(recursive=True), (), {})
     self.assertEqual(self.evaluate(x), 2)
 
   def test_converted_call_method_converts_recursively(self):
@@ -361,7 +376,8 @@
 
     tc = TestClass(constant_op.constant(-1))
     x = api.converted_call(TestClass.test_method, None,
-                           converter.ConversionOptions(), (tc,), {})
+                           converter.ConversionOptions(recursive=True), (tc,),
+                           {})
     self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
@@ -377,7 +393,8 @@
         return self.x
 
     tc = TestClass(constant_op.constant(-1))
-    x = api.converted_call(tc, None, converter.ConversionOptions(), (), {})
+    x = api.converted_call(tc, None,
+                           converter.ConversionOptions(recursive=True), (), {})
     self.assertEqual(1, self.evaluate(x))
 
   @test_util.run_deprecated_v1
@@ -393,7 +410,8 @@
           return -self.x
         return self.x
 
-    tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
+    tc = api.converted_call(TestClass, None,
+                            converter.ConversionOptions(recursive=True),
                             (constant_op.constant(-1),), {})
     # tc is still a TestClass - constructors are whitelisted.
     # TODO(b/124016764): Support this use case.
@@ -407,13 +425,14 @@
     def f(x):
       return x == 0
 
-    x = api.converted_call(f, None, converter.ConversionOptions(),
+    x = api.converted_call(f, None, converter.ConversionOptions(recursive=True),
                            (constant_op.constant(0),), {})
     self.assertTrue(self.evaluate(x))
 
     converted_f = api.to_graph(
         f, experimental_optional_features=converter.Feature.ALL)
-    x = api.converted_call(converted_f, None, converter.ConversionOptions(),
+    x = api.converted_call(converted_f, None,
+                           converter.ConversionOptions(recursive=True),
                            (constant_op.constant(0),), {})
     self.assertTrue(self.evaluate(x))
 
@@ -429,7 +448,7 @@
     def f(g, x):
       return g(x)
 
-    x = api.converted_call(f, None, converter.ConversionOptions(),
+    x = api.converted_call(f, None, converter.ConversionOptions(recursive=True),
                            (g, constant_op.constant(1)), {})
     self.assertEqual(self.evaluate(x), 1)
 
@@ -463,7 +482,7 @@
 
   def test_converted_call_whitelisted_method(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     model = sequential.Sequential([
         core.Dense(2)
@@ -477,7 +496,7 @@
 
   def test_converted_call_whitelisted_method_via_owner(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     model = sequential.Sequential([
         core.Dense(2)
@@ -491,7 +510,7 @@
 
   def test_converted_call_numpy(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     x = api.converted_call(np.arange, None, opts, (5,), {})
 
@@ -507,9 +526,23 @@
 
     self.assertAllEqual(self.evaluate(x), 2)
 
+  def test_converted_call_exec_generated_code(self):
+
+    temp_mod = imp.new_module('test_module')
+    dynamic_code = '''
+      def foo(x):
+        return x + 1
+    '''
+    exec(textwrap.dedent(dynamic_code), temp_mod.__dict__)  # pylint:disable=exec-used
+    opts = converter.ConversionOptions(optional_features=None)
+
+    x = api.converted_call(temp_mod.foo, None, opts, (1,), {})
+
+    self.assertAllEqual(x, 2)
+
   def test_converted_call_namedtuple(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     x = api.converted_call(collections.namedtuple, None, opts,
                            ('TestNamedtuple', ('a', 'b')), {})
@@ -518,7 +551,7 @@
 
   def test_converted_call_namedtuple_via_collections(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     x = api.converted_call('namedtuple', collections, opts, ('TestNamedtuple',
                                                              ('a', 'b')), {})
@@ -527,7 +560,7 @@
 
   def test_converted_call_lambda(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     l = lambda x: x == 0
 
@@ -538,7 +571,7 @@
 
   def test_converted_call_defun_object_method(self):
 
-    opts = converter.ConversionOptions()
+    opts = converter.ConversionOptions(recursive=True)
 
     # pylint:disable=method-hidden
     class TestClass(object):
@@ -574,7 +607,7 @@
       def f(y):
         return res.x + y
 
-      opts = converter.ConversionOptions()
+      opts = converter.ConversionOptions(recursive=True)
       api.converted_call(f, None, opts, (1,), {})
 
     self.assertNoMemoryLeaks(test_fn)
@@ -591,7 +624,7 @@
 
         return inner_f
 
-      opts = converter.ConversionOptions()
+      opts = converter.ConversionOptions(recursive=True)
       api.converted_call(f, None, opts, (1,), {})()
 
     self.assertNoMemoryLeaks(test_fn)
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index d67f0ad..6979a1a 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -38,7 +38,6 @@
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.converters import directives
-from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.converters import function_scopes
 from tensorflow.python.autograph.converters import lists
 from tensorflow.python.autograph.converters import logical_expressions
@@ -47,7 +46,6 @@
 from tensorflow.python.autograph.converters import slices
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.core import unsupported_features_checker
@@ -337,16 +335,10 @@
   if hasattr(m, '__name__'):
     # Builtins typically have unnamed modules.
     for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
-      if m.__name__.startswith(prefix):
+      if m.__name__.startswith(prefix + '.') or m.__name__ == prefix:
         logging.log(2, 'Whitelisted: %s: name starts with "%s"', o, prefix)
         return True
 
-    # Temporary -- whitelist tensorboard modules.
-    # TODO(b/122731813): Remove.
-    if m.__name__ == 'tensorboard' or '.tensorboard' in m.__name__:
-      logging.log(2, 'Whitelisted: %s: name contains "tensorboard"', o)
-      return True
-
   if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'):
     logging.log(2, 'Whitelisted: %s: already converted', o)
     return True
@@ -585,8 +577,6 @@
     ag_internal.Feature = converter.Feature
     ag_internal.utils = utils
     ag_internal.function_scope = function_wrapping.function_scope
-    ag_internal.rewrite_graph_construction_error = (
-        ag_errors.rewrite_graph_construction_error)
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
@@ -693,6 +683,4 @@
   # TODO(mdan): If function scopes ever does more, the toggle will need moving.
   if context.program.options.uses(converter.Feature.NAME_SCOPES):
     node = converter.apply_(node, context, function_scopes)
-  if context.program.options.uses(converter.Feature.ERROR_REWRITING):
-    node = converter.apply_(node, context, error_handlers)
   return node
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index bdbdb87..02efb78 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import imp
 import gast
 
 from tensorflow.python.autograph import utils
@@ -46,6 +47,16 @@
     self.assertTrue(conversion.is_whitelisted_for_graph(utils))
     self.assertTrue(conversion.is_whitelisted_for_graph(constant_op.constant))
 
+  def test_is_whitelisted_for_graph_tensorflow_like(self):
+
+    tf_like = imp.new_module('tensorflow_foo')
+    def test_fn():
+      pass
+    tf_like.test_fn = test_fn
+    test_fn.__module__ = tf_like
+
+    self.assertFalse(conversion.is_whitelisted_for_graph(tf_like.test_fn))
+
   def test_convert_entity_to_ast_unsupported_types(self):
     with self.assertRaises(NotImplementedError):
       program_ctx = self._simple_program_ctx()
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 5b3f45d..bbc684e 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -72,4 +72,6 @@
 from tensorflow.python.autograph.operators.slices import GetItemOpts
 from tensorflow.python.autograph.operators.slices import set_item
 from tensorflow.python.autograph.operators.special_values import is_undefined
+from tensorflow.python.autograph.operators.special_values import is_undefined_return
 from tensorflow.python.autograph.operators.special_values import Undefined
+from tensorflow.python.autograph.operators.special_values import UndefinedReturnValue
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index eba4414..5575b4c 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -22,13 +22,16 @@
 from tensorflow.python.autograph.operators import special_values
 from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.utils import ag_logging
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.data.experimental.ops import take_while_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import tensor_array_ops
 
 
 LIMIT_PYTHON_ITERATIONS = True
@@ -38,6 +41,24 @@
 INEFFICIENT_UNROLL_MIN_OPS = 1
 
 
+def _disallow_undefs_into_loop(*values):
+  """Validates loop state: no Undefined or UndefinedReturnValue entries."""
+  missing = tuple(v for v in values if special_values.is_undefined(v))
+  if missing:
+    missing_names = tuple(v.symbol_name for v in missing)
+    raise ValueError(
+        'TensorFlow requires that the following symbols must be defined'
+        ' before the loop: {}'.format(missing_names))
+
+  # Assumption: the loop will only capture the variable which tracks the
+  # return value if the loop contained a return statement.
+  # TODO(mdan): This should be checked at the place where return occurs.
+  saw_return = any(special_values.is_undefined_return(v) for v in values)
+  if saw_return:
+    raise ValueError(
+        'Return statements are not supported within a TensorFlow loop.')
+
+
 def for_stmt(iter_, extra_test, body, init_state):
   """Functional form of a for statement.
 
@@ -71,29 +92,23 @@
   Returns:
     Tuple containing the final state.
   """
-  def _check_undefined_symbols(init_state):
-    # Check for undefined symbols and report an error. This prevents the error
-    # from propagating into the TF runtime. We have more information here and
-    # can provide a clearer error message.
-    undefined = tuple(filter(special_values.is_undefined, init_state))
-    if undefined:
-      raise ValueError(
-          'TensorFlow requires that the following symbols must be defined'
-          ' before the loop: {}'.format(
-              tuple(s.symbol_name for s in undefined)))
-
   if tensor_util.is_tensor(iter_):
     return _known_len_tf_for_stmt(iter_, extra_test, body, init_state)
 
-  elif isinstance(iter_, dataset_ops.DatasetV2):
-    _check_undefined_symbols(init_state)
-    return _dataset_for_stmt(iter_, extra_test, body, init_state)
-  elif hasattr(iter_, '_autograph_for_loop'):
-    _check_undefined_symbols(init_state)
-    # This is an experimental mechanism and is subject to change.
-    return iter_._autograph_for_loop(extra_test, body, init_state)  # pylint: disable=protected-access
-  else:
-    return _py_for_stmt(iter_, extra_test, body, init_state)
+  if isinstance(iter_, dataset_ops.DatasetV2):
+    return _tf_dataset_for_stmt(iter_, extra_test, body, init_state)
+
+  if isinstance(iter_, iterator_ops.IteratorV2):
+    return _tf_iterator_for_stmt(iter_, extra_test, body, init_state)
+
+  # Note: This experimental interface is subject to change.
+  custom_handler = getattr(iter_, '_autograph_for_loop', None)
+  if custom_handler is not None:
+    # TODO(mdan): TensorFlow-specific verification - handlers should perform it.
+    _disallow_undefs_into_loop(*init_state)
+    return custom_handler(extra_test, body, init_state)
+
+  return _py_for_stmt(iter_, extra_test, body, init_state)
 
 
 def _py_for_stmt(iter_, extra_test, body, init_state):
@@ -107,11 +122,18 @@
 
 
 def _known_len_tf_for_stmt(iter_, extra_test, body, init_state):
-  """Overload of for_stmt that iterates over objects that admit a length."""
+  """Overload of for_stmt that iterates over TF entities that admit a length."""
+  _disallow_undefs_into_loop(*init_state)
+
   n = py_builtins.len_(iter_)
+  # TODO(b/117628877): Revisit performance once XLA has the necessary support.
+  # Note: using a TensorArray creates an extra copy, but can calculate
+  # gradients more efficiently than StridedSlice.
+  ta = tensor_array_ops.TensorArray(iter_.dtype, size=n)
+  iter_ = ta.unstack(iter_)
 
   def while_body(iterate_index, *state):
-    iterate = iter_[iterate_index]
+    iterate = iter_.read(iterate_index)
     new_state = body(iterate, *state)
 
     state = (iterate_index + 1,)
@@ -122,7 +144,10 @@
 
   def while_cond(iterate_index, *state):
     if extra_test is not None:
-      return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+      return control_flow_ops.cond(
+          iterate_index < n,
+          lambda: extra_test(*state),
+          lambda: False)
     return iterate_index < n
 
   results = _tf_while_stmt(
@@ -143,13 +168,95 @@
   return results
 
 
-def _dataset_for_stmt(ds, extra_test, body, init_state):
+def _tf_iterator_for_stmt(itr, extra_test, body, init_state):
+  """Overload of for_stmt that iterates over TF Iterators. See for_stmt."""
+  _disallow_undefs_into_loop(*init_state)
+
+  def while_body_actual(opt_iterate, *state):
+    new_state = body(opt_iterate.get_value(), *state)
+    # TODO(mdan): Fix this inconsistency in the converter.
+    if new_state is None:
+      new_state = ()
+    return new_state
+
+  def while_body(has_next, state):
+    """Main loop body."""
+    opt_iterate = iterator_ops.get_next_as_optional(itr)
+    has_next = opt_iterate.has_value()
+
+    if not init_state:
+      # cond_v2 requires at least one state tensor in V1.
+      dummy_state = (constant_op.constant(()),)
+    else:
+      dummy_state = ()
+
+    # TODO(mdan): If tf.while_loop supported Optional, this could be avoided.
+    new_state = control_flow_ops.cond(
+        has_next,
+        lambda: dummy_state + while_body_actual(opt_iterate, *state),
+        lambda: dummy_state + state)
+
+    if dummy_state:
+      new_state = new_state[1:]
+
+    return has_next, new_state
+
+  def while_cond(has_next, state):
+    if extra_test is not None:
+      return control_flow_ops.cond(
+          has_next,
+          lambda: extra_test(*state),
+          lambda: False)
+    return has_next
+
+  _, final_state = _tf_while_stmt(
+      while_cond,
+      while_body,
+      init_state=(True, init_state),
+      opts=None)
+  return final_state
+
+
+def _tf_dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
+  _disallow_undefs_into_loop(*init_state)
 
   if extra_test is not None:
-    raise NotImplementedError(
-        'break and return statements are not yet supported in '
-        'for/Dataset loops.')
+    assert init_state, 'Lowering should always add state.'
+    return _dataset_for_stmt_with_extra_test(ds, extra_test, body, init_state)
+
+  return _dataset_for_stmt_no_extra_test(ds, body, init_state)
+
+
+def _dataset_for_stmt_with_extra_test(ds, extra_test, body, init_state):
+  """Overload of _tf_dataset_for_stmt with early stopping. See for_stmt."""
+
+  def scan_body(state, iterate):
+    extra_cond = extra_test(*state)
+    new_state = control_flow_ops.cond(
+        extra_cond, lambda: body(iterate, *state), lambda: state)
+    aug_state = new_state, extra_cond
+    # Note: new_state is the actual state of scan; aug_state is its output
+    # (hence the redundancy).
+    return new_state, aug_state
+
+  def take_while_predicate(new_state, extra_cond):
+    del new_state
+    return extra_cond
+
+  def reduce_body(old_state, aug_state):
+    del old_state
+    new_state, extra_cond = aug_state
+    del extra_cond
+    return new_state
+
+  ds = ds.apply(scan_ops.scan(init_state, scan_body))
+  ds = ds.apply(take_while_ops.take_while(take_while_predicate))
+  return ds.reduce(init_state, reduce_body)
+
+
+def _dataset_for_stmt_no_extra_test(ds, body, init_state):
+  """Overload of _dataset_for_stmt without early stopping. See for_stmt."""
 
   def reduce_body(state, iterate):
     new_state = body(iterate, *state)
@@ -158,7 +265,7 @@
   if init_state:
     return ds.reduce(init_state, reduce_body)
 
-  # Workaround for Datset.reduce not allowing empty state tensors - create
+  # Workaround for Dataset.reduce not allowing empty state tensors - create
   # a dummy state variable that remains unused.
   def reduce_body_with_dummy_state(state, iterate):
     reduce_body((), iterate)
@@ -209,16 +316,11 @@
 
 def _tf_while_stmt(test, body, init_state, opts):
   """Overload of while_stmt that stages a TF while_stmt."""
+  _disallow_undefs_into_loop(*init_state)
+
   if opts is None:
     opts = {}
 
-  undefined = tuple(filter(special_values.is_undefined, init_state))
-  if undefined:
-    raise ValueError(
-        'TensorFlow requires that the following symbols must be initialized '
-        'to a Tensor, Variable or TensorArray before the loop: {}'.format(
-            tuple(s.symbol_name for s in undefined)))
-
   # Non-v2 while_loop unpacks the results when there is only one return value.
   # This enforces consistency across versions.
   opts['return_same_structure'] = True
@@ -342,8 +444,8 @@
 
 def tf_if_stmt(cond, body, orelse, get_state, set_state):
   """Overload of if_stmt that stages a TF cond."""
-  body = _disallow_undefs(body, branch_name='if')
-  orelse = _disallow_undefs(orelse, branch_name='else')
+  body = _wrap_disallow_undefs_from_cond(body, branch_name='if')
+  orelse = _wrap_disallow_undefs_from_cond(orelse, branch_name='else')
   body = _isolate_state(body, get_state, set_state)
   orelse = _isolate_state(orelse, get_state, set_state)
 
@@ -391,8 +493,8 @@
   return wrapper
 
 
-def _disallow_undefs(func, branch_name):
-  """Wraps function to raise useful error when it returns undefined symbols."""
+def _wrap_disallow_undefs_from_cond(func, branch_name):
+  """Wraps conditional branch to disallow returning undefined symbols."""
 
   def wrapper():
     """Calls function and raises an error if undefined symbols are returned."""
@@ -410,6 +512,13 @@
           ' statement.'.format(branch_name,
                                tuple(s.symbol_name for s in undefined)))
 
+    for result in results_tuple:
+      if special_values.is_undefined_return(result):
+        raise ValueError(
+            'A value must also be returned from the {} branch. If a value is '
+            'returned from one branch of a conditional a value must be '
+            'returned from all branches.'.format(branch_name))
+
     return results
 
   return wrapper
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 4aaf0ac..72cf214 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -32,6 +32,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -39,44 +40,104 @@
 
 class ForLoopTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def test_tensor(self):
-    s = control_flow.for_stmt(
-        constant_op.constant([1, 2, 3, 4]),
-        extra_test=lambda s: True,
-        body=lambda i, s: (s + i,),
-        init_state=(0,))
-    with self.cached_session():
-      self.assertEqual((10,), self.evaluate(s))
+    with ops.Graph().as_default():
+      s = control_flow.for_stmt(
+          constant_op.constant([1, 2, 3, 4]),
+          extra_test=lambda s: True,
+          body=lambda i, s: (s * 10 + i,),
+          init_state=(0,))
+      self.assertEqual(self.evaluate(s), (1234,))
 
   def test_python(self):
     s = control_flow.for_stmt(
         range(5),
         extra_test=lambda s: True,
-        body=lambda i, s: (s + i,),
+        body=lambda i, s: (s * 10 + i,),
         init_state=(0,))
-    self.assertEqual((10,), s)
+    self.assertEqual(s, (1234,))
 
-  def test_dataset(self):
+  def test_tf_dataset(self):
+    with ops.Graph().as_default():
+      s = control_flow.for_stmt(
+          dataset_ops.Dataset.range(5),
+          extra_test=None,
+          body=lambda i, s: (s * 10 + i,),
+          init_state=(constant_op.constant(0, dtype=dtypes.int64),))
+      self.assertEqual(self.evaluate(s), (1234,))
+
+  def test_dataset_with_extra_test(self):
     s = control_flow.for_stmt(
         dataset_ops.Dataset.range(5),
-        extra_test=None,
+        extra_test=lambda s: s < 3,
         body=lambda i, s: (s + i,),
         init_state=(constant_op.constant(0, dtype=dtypes.int64),))
-    self.assertEqual(self.evaluate(s), (10,))
+    self.assertEqual(self.evaluate(s), (3,))
 
-  @test_util.run_v2_only
-  def test_dataset_no_state(self):
-    v = variables.Variable(0, dtype=dtypes.int64)
-    def stateless_with_side_effects(i):
-      v.assign(v.read_value() + i)
+  def test_dataset_with_extra_test_no_extra_iterations(self):
+
+    def guarded_body(i, s):
+      with ops.control_dependencies((control_flow_ops.Assert(i < 3, (i,)),)):
+        return s + i,
+
     s = control_flow.for_stmt(
         dataset_ops.Dataset.range(5),
-        extra_test=None,
-        body=stateless_with_side_effects,
-        init_state=())
-    self.evaluate(s)
-    self.assertEqual(self.evaluate(v.read_value()), 10)
+        extra_test=lambda s: s < 3,
+        body=guarded_body,
+        init_state=(constant_op.constant(0, dtype=dtypes.int64),))
+    self.assertEqual(self.evaluate(s), (3,))
+
+  @test_util.run_v2_only
+  def test_tf_dataset_no_state(self):
+    v = variables.Variable(0, dtype=dtypes.int64)
+    self.evaluate(v.initializer)
+
+    def stateless_with_side_effects(i):
+      v.assign(v.read_value() * 10 + i)
+
+    # function is important here, because ops test for its presence.
+    @def_function.function(autograph=False)
+    def test_fn():
+      control_flow.for_stmt(
+          dataset_ops.Dataset.range(5),
+          extra_test=None,
+          body=stateless_with_side_effects,
+          init_state=())
+
+    test_fn()
+    self.assertEqual(self.evaluate(v.read_value()), 1234)
+
+  def test_tf_iterator(self):
+    # graph-mode iterators are only supported inside tf.function.
+    @def_function.function(autograph=False)
+    def test_fn():
+      itr = iter(dataset_ops.Dataset.range(5))
+      return control_flow.for_stmt(
+          itr,
+          extra_test=None,
+          body=lambda i, s: (s * 10 + i,),
+          init_state=(constant_op.constant(0, dtype=dtypes.int64),))
+    s, = test_fn()
+    self.assertAllEqual(s, 1234)
+
+  @test_util.run_v2_only
+  def test_tf_iterator_no_state(self):
+    v = variables.Variable(0, dtype=dtypes.int64)
+
+    def stateless_with_side_effects(i):
+      v.assign(v.read_value() * 10 + i)
+
+    # graph-mode iterators are only supported inside tf.function.
+    @def_function.function(autograph=False)
+    def test_fn():
+      control_flow.for_stmt(
+          iter(dataset_ops.Dataset.range(5)),
+          extra_test=None,
+          body=stateless_with_side_effects,
+          init_state=())
+
+    test_fn()
+    self.assertEqual(self.evaluate(v.read_value()), 1234)
 
 
 class WhileLoopTest(test.TestCase):
@@ -99,7 +160,7 @@
       v.assign(v.read_value() + 1)
       return v.read_value()
 
-    # function is important here, for its automatic control deps.
+    # function is important here, because ops test for its presence.
     @def_function.function(autograph=False)
     def test_fn():
       return control_flow.while_stmt(
@@ -134,7 +195,7 @@
 
   def test_python_infinite_loop(self):
     if __debug__:
-      with test.mock.patch.object(control_flow, 'PYTHON_MAX_ITERATIONS', 1000):
+      with test.mock.patch.object(control_flow, 'PYTHON_MAX_ITERATIONS', 100):
         with self.assertRaisesRegexp(errors.ExecutionError, 'iteration limit'):
           control_flow.while_stmt(
               test=lambda _: True,
@@ -143,17 +204,20 @@
 
   def test_python_long_loop_unroll_warning(self):
     if __debug__:
-      with ops.Graph().as_default():
-        out_capturer = six.StringIO()
-        with test.mock.patch.object(sys, 'stdout', out_capturer):
-          ag_logging.echo_log_to_stdout = True
-          sys.stdout = out_capturer
-          control_flow.while_stmt(
-              test=lambda i, _: i < 10000,
-              body=lambda i, _: (i + 1, gen_math_ops.add(i, 1),),
-              init_state=(0, None))
-        self.assertTrue(re.match(
-            r'.*ops.*loop.*large.*iterations.*Add.*', out_capturer.getvalue()))
+      with test.mock.patch.object(
+          control_flow, 'INEFFICIENT_UNROLL_MIN_ITERATIONS', 10):
+        with ops.Graph().as_default():
+          out_capturer = six.StringIO()
+          with test.mock.patch.object(sys, 'stdout', out_capturer):
+            ag_logging.echo_log_to_stdout = True
+            sys.stdout = out_capturer
+            control_flow.while_stmt(
+                test=lambda i, _: i < 100,
+                body=lambda i, _: (i + 1, gen_math_ops.add(i, 1),),
+                init_state=(0, None))
+          self.assertTrue(re.match(
+              r'.*ops.*loop.*large.*iterations.*Add.*',
+              out_capturer.getvalue()))
 
 
 class IfStmtTest(test.TestCase):
diff --git a/tensorflow/python/autograph/operators/slices.py b/tensorflow/python/autograph/operators/slices.py
index 2b7f5ad..af4074c 100644
--- a/tensorflow/python/autograph/operators/slices.py
+++ b/tensorflow/python/autograph/operators/slices.py
@@ -22,6 +22,7 @@
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
@@ -119,9 +120,7 @@
     if target.dtype == dtypes.variant:
       return _tf_tensor_list_set_item(target, i, x)
     else:
-      raise ValueError(
-          'tensor lists are expected to be Tensors with dtype=tf.variant,'
-          ' instead found %s' % target)
+      return _tf_tensor_set_item(target, i, x)
   else:
     return _py_set_item(target, i, x)
 
@@ -136,6 +135,11 @@
   return list_ops.tensor_list_set_item(target, i, x)
 
 
+def _tf_tensor_set_item(target, i, x):
+  """Overload of set_item that stages a Tensor scatter update."""
+  return gen_array_ops.tensor_scatter_update(target, ((i,),), (x,))
+
+
 def _py_set_item(target, i, x):
   """Overload of set_item that executes a Python list modification."""
   target[i] = x
diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/special_values.py
index 13d846f..a41f516 100644
--- a/tensorflow/python/autograph/operators/special_values.py
+++ b/tensorflow/python/autograph/operators/special_values.py
@@ -64,3 +64,13 @@
     Boolean, whether the input value is undefined.
   """
   return isinstance(value, Undefined)
+
+
+class UndefinedReturnValue(object):
+  """Represents a default return value from a function (None in Python)."""
+  pass
+
+
+def is_undefined_return(value):
+  """Checks whether `value` is the default return value."""
+  return isinstance(value, UndefinedReturnValue)
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index 9f4b9f3..ec463a3 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -48,6 +48,7 @@
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -58,6 +59,7 @@
 py_test(
     name = "ast_util_test",
     srcs = ["ast_util_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -69,6 +71,7 @@
 py_test(
     name = "cfg_test",
     srcs = ["cfg_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -80,6 +83,7 @@
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -91,6 +95,7 @@
 py_test(
     name = "inspect_utils_test",
     srcs = ["inspect_utils_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -100,9 +105,19 @@
     ],
 )
 
+sh_test(
+    name = "inspect_utils_test_par",
+    srcs = ["inspect_utils_test.sh"],
+    data = [
+        ":inspect_utils_test.par",
+    ],
+    tags = ["no_oss"],
+)
+
 py_test(
     name = "origin_info_test",
     srcs = ["origin_info_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -113,6 +128,7 @@
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -123,6 +139,7 @@
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -133,6 +150,7 @@
 py_test(
     name = "qual_names_test",
     srcs = ["qual_names_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -143,6 +161,7 @@
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -155,6 +174,7 @@
 py_test(
     name = "transformer_test",
     srcs = ["transformer_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
diff --git a/tensorflow/python/autograph/pyct/errors.py b/tensorflow/python/autograph/pyct/errors.py
index 3c169ad..48e9fbb 100644
--- a/tensorflow/python/autograph/pyct/errors.py
+++ b/tensorflow/python/autograph/pyct/errors.py
@@ -18,6 +18,10 @@
 from __future__ import division
 from __future__ import print_function
 
+import sys
+import traceback
+
+from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.utils import ag_logging
 
 
@@ -50,11 +54,66 @@
         type(self.original_exc).__name__, self.message, self.original_exc)
 
 
+# TODO(znado): merge with ExecutionError.
+class StagingError(AutoGraphError):
+  """Raised when AutoGraph has an error while executing a converted function."""
+
+  def __init__(self, user_trace, original_error):
+    """Constructs a StagingError.
+
+    Args:
+      user_trace: Tuple[OriginInfo], the converted call traceback frames.
+      original_error: Exception, the original error thrown.
+    """
+    super(StagingError, self).__init__()
+    self.user_trace = user_trace
+    self.original_error = original_error
+
+  def __str__(self):
+    indent_str = '    '
+    new_stacktrace_lines = []
+    for origin in self.user_trace:
+      if not origin:
+        continue
+      frame_str = indent_str + '{}:{} ({})\n{}    {}'.format(
+          origin.loc.filename, origin.loc.lineno, origin.function_name,
+          indent_str, origin.source_code_line.strip())
+      new_stacktrace_lines.append(frame_str)
+    new_stacktrace_str = '\n'.join(new_stacktrace_lines)
+    original_type = self.original_error.__class__.__name__
+    original_message = str(self.original_error)
+    new_message = original_type + ': ' + original_message
+    return ('\nAn error occurred while executing AutoGraph transformed code. '
+            'For details, set the verbosity to 10 (on Linux, '
+            '`export AUTOGRAPH_VERBOSITY=10`). Corresponding code:\n' +
+            new_stacktrace_str + '\n\n' + indent_str + new_message + '\n\n')
+
+
 def report_internal_error(entity, exception):
   ag_logging.log(1, 'Error transforming %s', entity, exc_info=True)
   # TODO(znado): Add external bug reporting instructions.
   raise AutoGraphError(
-      'Unexpected error transforming %s. If you believe this is due to a bug,'
-      ' please set the verbosity to 10 (on Linux, `export '
+      'Unexpected error transforming %s. If you believe this is due to a bug, '
+      'please set the verbosity to 10 (on Linux, `export '
       'AUTOGRAPH_VERBOSITY=10`) and attach the full output when filing the bug '
       'report. Caused by: %s' % (entity, exception))
+
+
+def extract_origin_info(converted_f):
+  """Attempts to use converted_f's source map to get error origin info."""
+  source_map = converted_f.ag_source_map
+  original_traceback = traceback.extract_tb(sys.exc_info()[2])
+  # Can go through all frames and check which ones have origin info in order to
+  # filter for only the locations relevant to converted_f.
+  #
+  # Return the first occurrence of the reversed traceback in the source map in
+  # order to return the innermost frame for this function. We want to do this
+  # because when we have a tf.cond we will have multiple matches and we want to
+  # return the last one in this function, because any after that will be in
+  # the next function/frame in the stacktrace.
+  for frame in reversed(original_traceback):
+    converted_loc = origin_info.LineLocation(
+        filename=frame[0], lineno=frame[1])
+    if converted_loc in source_map:
+      return source_map[converted_loc]
+  return None
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 855a690..beb9dd0 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -21,13 +21,20 @@
 from __future__ import division
 from __future__ import print_function
 
+import inspect
 import itertools
+import linecache
+import sys
+import threading
 import types
 
 import six
 
 from tensorflow.python.util import tf_inspect
 
+# This lock seems to help avoid linecache concurrency errors.
+_linecache_lock = threading.Lock()
+
 
 # These functions test negative for isinstance(*, types.BuiltinFunctionType)
 # and inspect.isbuiltin, and are generally not visible in globals().
@@ -83,6 +90,41 @@
   return False
 
 
+def _fix_linecache_record(obj):
+  """Fixes potential corruption of linecache in the presence of functools.wraps.
+
+  functools.wraps modifies the target object's __module__ field, which seems
+  to confuse linecache in special instances, for example when the source is
+  loaded from a .par file (see https://google.github.io/subpar/subpar.html).
+
+  This function triggers a call to linecache.updatecache for each loaded module
+  whose __file__ matches the object's source file. NOTE(review): the guard
+  compares obj.__module__ with the module object itself; confirm the intent.
+
+  Args:
+    obj: Any
+  """
+  if hasattr(obj, '__module__'):
+    obj_file = inspect.getfile(obj)
+    obj_module = obj.__module__
+
+    # A snapshot of the loaded modules helps avoid "dict changed size during
+    # iteration" errors.
+    loaded_modules = tuple(sys.modules.values())
+    for m in loaded_modules:
+      if hasattr(m, '__file__') and m.__file__ == obj_file:
+        if obj_module is not m:
+          linecache.updatecache(obj_file, m.__dict__)
+
+
+def getimmediatesource(obj):
+  """A variant of inspect.getsource that ignores the __wrapped__ property."""
+  with _linecache_lock:
+    _fix_linecache_record(obj)
+    lines, lnum = inspect.findsource(obj)
+    return ''.join(inspect.getblock(lines[lnum:]))
+
+
 def getnamespace(f):
   """Returns the complete namespace of a function.
 
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index 7c198e6..ff91e53 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for unspect_utils module."""
+"""Tests for inspect_utils module."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +21,7 @@
 import collections
 import functools
 import imp
+import textwrap
 import types
 import weakref
 
@@ -28,6 +29,7 @@
 
 from tensorflow.python import lib
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.pyct.testing import decorators
 from tensorflow.python.autograph.pyct.testing import future_import_module
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
@@ -133,6 +135,125 @@
 
     self.assertTrue(inspect_utils.isnamedtuple(NamedTupleSubclass))
 
+  def assertSourceIdentical(self, actual, expected):
+    self.assertEqual(
+        textwrap.dedent(actual).strip(),
+        textwrap.dedent(expected).strip()
+    )
+
+  def test_getimmediatesource_basic(self):
+
+    def test_decorator(f):
+
+      def f_wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+
+      return f_wrapper
+
+    expected = """
+      def f_wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+    """
+
+    @test_decorator
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
+  def test_getimmediatesource_noop_decorator(self):
+
+    def test_decorator(f):
+      return f
+
+    expected = '''
+      @test_decorator
+      def test_fn(a):
+        """Test docstring."""
+        return [a]
+    '''
+
+    @test_decorator
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
+  def test_getimmediatesource_functools_wrapper(self):
+
+    def wrapper_decorator(f):
+
+      @functools.wraps(f)
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+
+      return wrapper
+
+    expected = textwrap.dedent("""
+      @functools.wraps(f)
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+    """)
+
+    @wrapper_decorator
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
+  def test_getimmediatesource_functools_wrapper_different_module(self):
+
+    expected = textwrap.dedent("""
+      @functools.wraps(f)
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+    """)
+
+    @decorators.wrapping_decorator
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
+  def test_getimmediatesource_normal_decorator_different_module(self):
+
+    expected = textwrap.dedent("""
+      def standalone_wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+    """)
+
+    @decorators.standalone_decorator
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
+  def test_getimmediatesource_normal_functional_decorator_different_module(
+      self):
+
+    expected = textwrap.dedent("""
+      def functional_wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+    """)
+
+    @decorators.functional_decorator()
+    def test_fn(a):
+      """Test docstring."""
+      return [a]
+
+    self.assertSourceIdentical(
+        inspect_utils.getimmediatesource(test_fn), expected)
+
   def test_getnamespace_globals(self):
     ns = inspect_utils.getnamespace(factory)
     self.assertEqual(ns['free_function'], free_function)
diff --git a/tensorflow/tools/docker/run_jupyter.sh b/tensorflow/python/autograph/pyct/inspect_utils_test.sh
similarity index 75%
rename from tensorflow/tools/docker/run_jupyter.sh
rename to tensorflow/python/autograph/pyct/inspect_utils_test.sh
index 2771aea..02dfae2 100755
--- a/tensorflow/tools/docker/run_jupyter.sh
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.sh
@@ -1,5 +1,4 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# Test that runs inspect_utils_test as a .par file.
 
 
-jupyter notebook "$@"
+SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
+${SCRIPT_DIR}/inspect_utils_test.par
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 70c8d69..ea6a8e0 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -22,16 +22,13 @@
 from __future__ import print_function
 
 import textwrap
-import threading
 
 import gast
 
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.util import tf_inspect
 
 
-_parse_lock = threading.Lock()  # Prevents linecache concurrency errors.
-
-
 STANDARD_PREAMBLE = textwrap.dedent("""
     from __future__ import division
     from __future__ import print_function
@@ -53,8 +50,7 @@
     generate the AST (including any prefixes that this function may have added).
   """
   try:
-    with _parse_lock:
-      source = tf_inspect.getsource_no_unwrap(entity)
+    source = inspect_utils.getimmediatesource(entity)
   except (IOError, OSError) as e:
     raise ValueError(
         'Unable to locate the source code of {}. Note that functions defined'
diff --git a/tensorflow/python/autograph/pyct/testing/BUILD b/tensorflow/python/autograph/pyct/testing/BUILD
index 231c35d..6351ca0 100644
--- a/tensorflow/python/autograph/pyct/testing/BUILD
+++ b/tensorflow/python/autograph/pyct/testing/BUILD
@@ -17,6 +17,7 @@
 py_library(
     name = "test_modules",
     srcs = [
+        "decorators.py",
         "future_import_module.py",
     ],
     srcs_version = "PY2AND3",
@@ -41,6 +42,7 @@
     name = "codegen_test",
     size = "large",
     srcs = ["codegen_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "manual",
diff --git a/tensorflow/python/autograph/pyct/testing/decorators.py b/tensorflow/python/autograph/pyct/testing/decorators.py
new file mode 100644
index 0000000..428007bf5
--- /dev/null
+++ b/tensorflow/python/autograph/pyct/testing/decorators.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module with test decorators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+
+def wrapping_decorator(f):
+
+  @functools.wraps(f)
+  def wrapper(*args, **kwargs):
+    return f(*args, **kwargs)
+
+  return wrapper
+
+
+def standalone_decorator(f):
+
+  def standalone_wrapper(*args, **kwargs):
+    return f(*args, **kwargs)
+
+  return standalone_wrapper
+
+
+def functional_decorator():
+
+  def decorator(f):
+
+    def functional_wrapper(*args, **kwargs):
+      return f(*args, **kwargs)
+
+    return functional_wrapper
+
+  return decorator
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index 6ab36b1..910cd84 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -93,6 +93,7 @@
   Attributes:
     type: Any, the type of objects that this stack holds
     level: int, the current stack depth
+    stack: List[Any], the actual stack
     value: Any, the instance of the object at the top of the stack
   """
 
@@ -111,6 +112,10 @@
     return self._stack.pop()
 
   @property
+  def stack(self):
+    return self._stack
+
+  @property
   def level(self):
     return len(self._stack)
 
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 76353b5..d83539b 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -27,6 +27,7 @@
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device
@@ -36,6 +37,7 @@
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import session_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.experimental import mixed_precision_global_state
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -668,6 +670,19 @@
     if not isinstance(config, config_pb2.ConfigProto):
       raise TypeError(
           'config must be a tf.ConfigProto, but got %s' % type(config))
+
+    if (mixed_precision_global_state.mixed_precision_is_enabled and
+        config.graph_options.rewrite_options.auto_mixed_precision !=
+        rewriter_config_pb2.RewriterConfig.OFF):
+      new_config = config_pb2.ConfigProto()
+      new_config.CopyFrom(config)
+      new_config.graph_options.rewrite_options.auto_mixed_precision = (
+          rewriter_config_pb2.RewriterConfig.ON)
+      config = new_config
+    elif (config.graph_options.rewrite_options.auto_mixed_precision !=
+          rewriter_config_pb2.RewriterConfig.ON):
+      mixed_precision_global_state.non_mixed_precision_session_created = True
+
     self._config = config
     self._add_shapes = config.graph_options.infer_shapes
 
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 294078c..8d3875d 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -27,7 +27,7 @@
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 4, 17)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 5, 10)
 
 
 @tf_export("compat.forward_compatible")
diff --git a/tensorflow/python/compiler/tensorrt/test/base_test.py b/tensorflow/python/compiler/tensorrt/test/base_test.py
index 6aa32f7..f42bbf2 100644
--- a/tensorflow/python/compiler/tensorrt/test/base_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/base_test.py
@@ -144,7 +144,7 @@
     ).GetConversionParams(run_params)._replace(
         # Disable layout optimizer, since it'll add Transpose(Const, Const) to
         # the graph and breaks the conversion check.
-        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+        rewriter_config_template=trt_test.OptimizerDisabledRewriterConfig())
 
 
 class SimpleMultiEnginesTest2(trt_test.TfTrtIntegrationTestBase):
diff --git a/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
index cd72b3f..f7dc210 100644
--- a/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
@@ -77,7 +77,7 @@
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["TRTEngineOp_0", "TRTEngineOp_1"]
+    return ["TRTEngineOp_0", "TRTEngineOp_1", "TRTEngineOp_2"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
index 2b7bbbc..69d4ab0 100644
--- a/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
@@ -124,7 +124,7 @@
         maximum_cached_engines=1,
         # Disable layout optimizer, since it will convert BiasAdd with NHWC
         # format to NCHW format under four dimentional input.
-        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+        rewriter_config_template=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
index cb358d4..a906071 100644
--- a/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
@@ -85,7 +85,7 @@
         maximum_cached_engines=10,
         # Disable layout optimizer, since it will convert BiasAdd with NHWC
         # format to NCHW format under four dimentional input.
-        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+        rewriter_config_template=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     return ["TRTEngineOp_0"]
diff --git a/tensorflow/python/compiler/tensorrt/test/int32_test.py b/tensorflow/python/compiler/tensorrt/test/int32_test.py
index 6d44469..41a5a27 100644
--- a/tensorflow/python/compiler/tensorrt/test/int32_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/int32_test.py
@@ -65,7 +65,7 @@
         maximum_cached_engines=1,
         # Disable layout optimizer, since it will convert BiasAdd with NHWC
         # format to NCHW format under four dimentional input.
-        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+        rewriter_config_template=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
index 345f8bb..f499d83 100644
--- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
@@ -56,12 +56,6 @@
     "use_calibration"
 ])
 
-ConversionParams = namedtuple("ConversionParams", [
-    "max_batch_size", "max_workspace_size_bytes", "precision_mode",
-    "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
-    "cached_engine_batches", "rewriter_config", "use_calibration"
-])
-
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
 
 
@@ -163,7 +157,7 @@
     raise NotImplementedError()
 
   def GetConversionParams(self, run_params):
-    """Return a ConversionParams for test."""
+    """Return a TrtConversionParams for test."""
     batch_list = []
     for dims_list in self._GetParamsCached().input_dims:
       assert dims_list
@@ -171,19 +165,22 @@
       input_batches = [dims[0] for dims in dims_list]
       assert max(input_batches) == min(input_batches)
       batch_list.append(input_batches[0])
-    return ConversionParams(
+    conversion_params = trt_convert.TrtConversionParams(
         # We use the minimum of all the batch sizes, so when multiple different
         # input shapes are provided it'll always create new engines in the
         # cache, and we can therefore test the cache behavior.
-        max_batch_size=min(batch_list),
+        rewriter_config_template=None,
         max_workspace_size_bytes=1 << 25,
         precision_mode=run_params.precision_mode,
         minimum_segment_size=2,
         is_dynamic_op=run_params.dynamic_engine,
         maximum_cached_engines=1,
-        cached_engine_batches=None,
-        rewriter_config=None,
-        use_calibration=run_params.use_calibration)
+        use_calibration=run_params.use_calibration,
+        use_function_backup=False,
+        max_batch_size=min(batch_list),
+        cached_engine_batches=None)
+    return conversion_params._replace(
+        use_function_backup=IsQuantizationWithCalibration(conversion_params))
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
@@ -218,24 +215,13 @@
     """Get config proto based on specific settings."""
     conversion_params = self.GetConversionParams(run_params)
     if graph_state == GraphState.INFERENCE and run_params.use_optimizer:
-      rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
-          conversion_params.rewriter_config,
-          conversion_params.max_batch_size,
-          conversion_params.max_workspace_size_bytes,
-          conversion_params.precision_mode,
-          conversion_params.minimum_segment_size,
-          conversion_params.is_dynamic_op,
-          conversion_params.maximum_cached_engines,
-          conversion_params.cached_engine_batches,
-          conversion_params.use_calibration,
-          use_function_backup=IsQuantizationWithCalibration(conversion_params))
-
+      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(conversion_params)
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
-      if conversion_params.rewriter_config is not None:
+      if conversion_params.rewriter_config_template is not None:
         graph_options.rewrite_options.CopyFrom(
-            conversion_params.rewriter_config)
+            conversion_params.rewriter_config_template)
 
     config = config_pb2.ConfigProto(
         gpu_options=self._GetGPUOptions(), graph_options=graph_options)
@@ -310,7 +296,7 @@
         maximum_cached_engines=conversion_params.maximum_cached_engines,
         cached_engine_batches=conversion_params.cached_engine_batches,
         use_calibration=conversion_params.use_calibration,
-        use_function_backup=IsQuantizationWithCalibration(conversion_params))
+        use_function_backup=conversion_params.use_function_backup)
     return converter
 
   def _GetCalibratedInferGraph(self, run_params, gdef, inputs_data):
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py
index 326a7d0..c9348b7 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert.py
@@ -18,7 +18,12 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import os
+import tempfile
+
 import six as _six
+
 from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version
 from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_loaded_tensorrt_version
@@ -27,9 +32,11 @@
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import convert_to_constants
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -43,6 +50,8 @@
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import saver
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.util.lazy_loader import LazyLoader
 
 # Import TRT library. This is fine since we don't import TF-TRT in
 # tensorflow/python/compiler/__init__.py, and `import tensorflow` won't trigger
@@ -53,6 +62,13 @@
 # TRT-converted graph without calling any of the methods in this module.
 trt_ops.load_trt_ops()
 
+# Lazily load the op, since it's not available in cpu-only builds. Importing
+# this at top will cause tests that import TF-TRT to fail when they're built
+# and run without CUDA/GPU.
+gen_trt_ops = LazyLoader(
+    "gen_trt_ops", globals(),
+    "tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops")
+
 
 def _to_bytes(s):
   """Encode s if it is a sequence of chars."""
@@ -80,7 +96,7 @@
   class MyGraphConverter(GraphConverter):
     ...
 
-    def get_rewriter_config(self, rewriter_config_template=None):
+    def get_rewriter_config(self):
       my_rewriter_config = ...
       return my_rewriter_config
   ```
@@ -129,7 +145,7 @@
         If set to None, the graph will be read from the SavedModel loaded from
         input_saved_model_dir.
       nodes_blacklist: list of node names to prevent the converter from
-        touching. Only used when input_graph_def is not None.
+        touching.
       session_config: the ConfigProto used to create a Session. It's also used
         as a template to create a RewriterConfig for conversion. If not
         specified, a default ConfigProto will be used.
@@ -137,21 +153,15 @@
     Raises:
       ValueError: if the combination of the parameters is invalid.
     """
-    if context.executing_eagerly():
-      if input_graph_def or not input_saved_model_dir:
-        raise ValueError(
-            "TF 2.0 only supports conversion of SavedModel, please specify "
-            "input_saved_model_dir as input.")
-    else:
-      if input_graph_def and input_saved_model_dir:
-        raise ValueError(
-            "Can only specify one of input_graph_def and input_saved_model_dir")
-      if not input_graph_def and not input_saved_model_dir:
-        raise ValueError("Must specify one of input_graph_def and "
-                         "input_saved_model_dir")
+    if input_graph_def and input_saved_model_dir:
+      raise ValueError(
+          "Can only specify one of input_graph_def and input_saved_model_dir")
+    if not input_graph_def and not input_saved_model_dir:
+      raise ValueError("Must specify one of input_graph_def and "
+                       "input_saved_model_dir")
 
-      self._input_graph_def = input_graph_def
-      self._nodes_blacklist = nodes_blacklist
+    self._input_graph_def = input_graph_def
+    self._nodes_blacklist = nodes_blacklist
 
     self._input_saved_model_dir = input_saved_model_dir
     self._converted = False
@@ -169,14 +179,9 @@
     self._calibration_sess = None
     self._calibration_data_collected = False
 
-  def get_rewriter_config(self, rewriter_config_template=None):
+  def get_rewriter_config(self):
     """Returns a RewriterConfig proto for TRT transformation.
 
-    Args:
-      rewriter_config_template: a template RewriterConfig proto used to create a
-        RewriterConfig for the conversion. The implementation should not modify
-        the template. If None, it will use a default one.
-
     Returns:
       A RewriterConfig proto which will be used to run the conversion using
       Grappler.
@@ -188,11 +193,7 @@
     # Create custom ConfigProto for Grappler.
     grappler_session_config = config_pb2.ConfigProto()
     grappler_session_config.CopyFrom(self._session_config)
-    rewriter_config = None
-    if (grappler_session_config.HasField("graph_options") and
-        grappler_session_config.graph_options.HasField("rewrite_options")):
-      rewriter_config = grappler_session_config.graph_options.rewrite_options
-    custom_rewriter_config = self.get_rewriter_config(rewriter_config)
+    custom_rewriter_config = self.get_rewriter_config()
     grappler_session_config.graph_options.rewrite_options.CopyFrom(
         custom_rewriter_config)
 
@@ -285,33 +286,6 @@
 
     self._run_conversion()
 
-  # TODO(laigd): provide a utility function to optimize a ConcreteFunction and
-  # use it here (b/124792963).
-  def _convert_saved_model_v2(self):
-    """Convert the input SavedModel in 2.0 format."""
-    assert context.executing_eagerly()
-
-    self._saved_model = load.load(self._input_saved_model_dir,
-                                  self._input_saved_model_tags)
-    func = self._saved_model.signatures[self._input_saved_model_signature_key]
-    frozen_func = convert_to_constants.convert_variables_to_constants_v2(func)
-    self._grappler_meta_graph_def = saver.export_meta_graph(
-        graph_def=frozen_func.graph.as_graph_def(), graph=frozen_func.graph)
-
-    # Add a collection 'train_op' so that Grappler knows the outputs.
-    fetch_collection = meta_graph_pb2.CollectionDef()
-    for array in frozen_func.inputs + frozen_func.outputs:
-      fetch_collection.node_list.value.append(array.name)
-    self._grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
-        fetch_collection)
-
-    # Run TRT optimizer in Grappler to convert the graph.
-    self._run_conversion()
-    self._converted_func = wrap_function.function_from_graph_def(
-        self._converted_graph_def,
-        [tensor.name for tensor in frozen_func.inputs],
-        [tensor.name for tensor in frozen_func.outputs])
-
   def convert(self):
     """Run the conversion.
 
@@ -320,16 +294,11 @@
       2.0+.
     """
     assert not self._converted
-
-    if context.executing_eagerly():
-      self._convert_saved_model_v2()
-      return self._converted_func
+    if self._input_graph_def:
+      self._convert_graph_def()
     else:
-      if self._input_graph_def:
-        self._convert_graph_def()
-      else:
-        self._convert_saved_model()
-      return self._converted_graph_def
+      self._convert_saved_model()
+    return self._converted_graph_def
 
   def calibrate(self,
                 fetch_names,
@@ -408,80 +377,71 @@
       SavedModel.
     """
     assert self._converted
+    if self._input_graph_def:
+      raise ValueError(
+          "Not able to save to a SavedModel since input is a GraphDef")
 
-    if context.executing_eagerly():
-      # Rewrite the signature map using the optimized ConcreteFunction.
-      signatures = {
-          key: value for key, value in self._saved_model.signatures.items()
-      }
-      signatures[self._input_saved_model_signature_key] = self._converted_func
-      save.save(self._saved_model, output_saved_model_dir, signatures)
-    else:
-      if self._input_graph_def:
-        raise ValueError(
-            "Not able to save to a SavedModel since input is a GraphDef")
-
-      def _restore_collections(dest_graph, src_meta_graph_def, collections):
-        """Restores collections that we need to keep."""
-        scope = ""
-        for key in collections:
-          collection_def = src_meta_graph_def.collection_def[key]
-          kind = collection_def.WhichOneof("kind")
-          if kind is None:
-            tf_logging.error(
-                "Cannot identify data type for collection %s. Skipping.", key)
-            continue
-          from_proto = ops.get_from_proto_function(key)
-          if from_proto and kind == "bytes_list":
-            proto_type = ops.get_collection_proto_type(key)
-            # It is assumed that there are no Variables Keys in collections
-            for value in collection_def.bytes_list.value:
-              proto = proto_type()
-              proto.ParseFromString(value)
+    def _restore_collections(dest_graph, src_meta_graph_def, collection_keys):
+      """Restores collections that we need to keep."""
+      scope = ""
+      for key in collection_keys:
+        collection_def = src_meta_graph_def.collection_def[key]
+        kind = collection_def.WhichOneof("kind")
+        if kind is None:
+          tf_logging.error(
+              "Cannot identify data type for collection %s. Skipping.", key)
+          continue
+        from_proto = ops.get_from_proto_function(key)
+        if from_proto and kind == "bytes_list":
+          proto_type = ops.get_collection_proto_type(key)
+          # It is assumed that there are no Variables Keys in collections
+          for value in collection_def.bytes_list.value:
+            proto = proto_type()
+            proto.ParseFromString(value)
+            try:
+              new_value = from_proto(proto, import_scope=scope)
+            except:
+              continue
+            dest_graph.add_to_collection(key, new_value)
+        else:
+          field = getattr(collection_def, kind)
+          if kind == "node_list":
+            for value in field.value:
+              name = ops.prepend_name_scope(value, scope)
+              # Since the graph has been optimized, the node may no longer
+              # exist.
               try:
-                new_value = from_proto(proto, import_scope=scope)
-              except:
+                col_op = dest_graph.as_graph_element(name)
+              except (TypeError, ValueError, KeyError) as e:
                 continue
-              dest_graph.add_to_collection(key, new_value)
+              dest_graph.add_to_collection(key, col_op)
+          elif kind == "int64_list":
+            # NOTE(opensource): This force conversion is to work around the
+            # fact that Python2 distinguishes between int and long, while
+            # Python3 has only int.
+            for value in field.value:
+              dest_graph.add_to_collection(key, int(value))
           else:
-            field = getattr(collection_def, kind)
-            if kind == "node_list":
-              for value in field.value:
-                name = ops.prepend_name_scope(value, scope)
-                # Since the graph has been optimized, the node may no longer
-                # exists
-                try:
-                  col_op = dest_graph.as_graph_element(name)
-                except (TypeError, ValueError, KeyError) as e:
-                  continue
-                dest_graph.add_to_collection(key, col_op)
-            elif kind == "int64_list":
-              # NOTE(opensource): This force conversion is to work around the
-              # fact that Python2 distinguishes between int and long, while
-              # Python3 has only int.
-              for value in field.value:
-                dest_graph.add_to_collection(key, int(value))
-            else:
-              for value in field.value:
-                dest_graph.add_to_collection(
-                    key, ops.prepend_name_scope(value, scope))
+            for value in field.value:
+              dest_graph.add_to_collection(key,
+                                           ops.prepend_name_scope(value, scope))
 
-      # Write the transformed graphdef as SavedModel.
-      saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
-      with ops.Graph().as_default():
-        importer.import_graph_def(self._converted_graph_def, name="")
-        _restore_collections(
-            ops.get_default_graph(), self._grappler_meta_graph_def,
-            self._collections_to_keep(
-                self._grappler_meta_graph_def.collection_def))
-        # We don't use any specific converter here.
-        with session.Session(config=self._session_config) as sess:
-          saved_model_builder.add_meta_graph_and_variables(
-              sess,
-              self._input_saved_model_tags,
-              signature_def_map=self._grappler_meta_graph_def.signature_def)
-      # Ignore other meta graphs from the input SavedModel.
-      saved_model_builder.save()
+    # Write the transformed graphdef as SavedModel.
+    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
+    with ops.Graph().as_default():
+      importer.import_graph_def(self._converted_graph_def, name="")
+      _restore_collections(
+          ops.get_default_graph(), self._grappler_meta_graph_def,
+          self._collections_to_keep(
+              self._grappler_meta_graph_def.collection_def))
+      # We don't use any specific converter here.
+      with session.Session(config=self._session_config) as sess:
+        saved_model_builder.add_meta_graph_and_variables(
+            sess,
+            self._input_saved_model_tags,
+            signature_def_map=self._grappler_meta_graph_def.signature_def)
+    # Ignore other meta graphs from the input SavedModel.
+    saved_model_builder.save()
 
 
 class TrtPrecisionMode(object):
@@ -498,101 +458,212 @@
 # so it can produce reasonable performance results with the default.
 DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30
 
+# TrtConversionParams encapsulates the parameters that are used for TF-TRT
+# conversion.
+TrtConversionParams = collections.namedtuple(
+    "TrtConversionParams",
+    [
+
+        # A template RewriterConfig proto used to create a TRT-enabled
+        # RewriterConfig. If None, it will use a default one.
+        "rewriter_config_template",
+
+        # The maximum GPU temporary memory which the TRT engine can use at
+        # execution time. This corresponds to the 'workspaceSize' parameter of
+        # nvinfer1::IBuilder::setMaxWorkspaceSize().
+        "max_workspace_size_bytes",
+
+        # One of TrtPrecisionMode.supported_precision_modes().
+        "precision_mode",
+
+        # The minimum number of nodes required for a subgraph to be replaced by
+        # TRTEngineOp.
+        "minimum_segment_size",
+
+        # Whether to generate dynamic TRT ops which will build the TRT network
+        # and engine at run time.
+        #
+        # TODO(laigd): In TF 2.0, this option should only affect INT8 mode.
+        "is_dynamic_op",
+
+        # Max number of cached TRT engines in dynamic TRT ops. If the number of
+        # cached engines is already at max but none of them can serve the input,
+        # the TRTEngineOp will fall back to run the TF function based on which
+        # the TRTEngineOp is created.
+        "maximum_cached_engines",
+
+        # This argument is ignored if precision_mode is not INT8. If set to
+        # True, a calibration graph will be created to calibrate the missing
+        # ranges. The calibration graph must be converted to an inference graph
+        # by running calibration with calibrate(). If set to False, quantization
+        # nodes will be expected for every tensor in the graph (excluding those
+        # which will be fused). If a range is missing, an error will occur.
+        # Please note that accuracy may be negatively affected if there is a
+        # mismatch between which tensors TRT quantizes and which tensors were
+        # trained with fake quantization.
+        "use_calibration",
+
+        # If set to True, it will create a FunctionDef for each subgraph that is
+        # converted to TRT op, and if TRT ops fail to execute at runtime, it'll
+        # invoke that function as a fallback.
+        "use_function_backup",
+
+        # Max size for the input batch.
+        # This option is deprecated in TF 2.0.
+        "max_batch_size",
+
+        # A list of batch sizes used to create cached engines, only used when
+        # is_dynamic_op is True. The length of the list should be <=
+        # maximum_cached_engines, and the dynamic TRT op will use this list to
+        # determine the batch sizes of the cached engines, instead of making the
+        # decision on the fly. This is useful when we know the most common batch
+        # size(s) the application is going to generate.
+        # This option is deprecated in TF 2.0.
+        "cached_engine_batches",
+    ])
+
+DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams(
+    rewriter_config_template=None,
+    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    use_calibration=True,
+    use_function_backup=True,
+    max_batch_size=1,
+    cached_engine_batches=None)
+
+_TRT_CALIBRATION_RESOURCE_CONTAINER_NAME = "TF-TRT-Calibration"
+_TRT_ENGINE_CACHE_CONTAINER_NAME = "TF-TRT-Engine-Cache"
+_TRT_ENGINE_OP_NAME = "TRTEngineOp"
+
+
+def _check_conversion_params(conversion_params):
+  """Validate the provided TrtConversionParams.
+
+  Args:
+    conversion_params: a TrtConversionParams instance.
+
+  Raises:
+    TypeError: if cached_engine_batches is set but is not a list.
+    ValueError: if precision_mode or cached_engine_batches is invalid.
+  """
+  supported_precision_modes = TrtPrecisionMode.supported_precision_modes()
+  if conversion_params.precision_mode not in supported_precision_modes:
+    raise ValueError(
+        ("precision mode '{}' is not supported. "
+         "It should be one of {}").format(conversion_params.precision_mode,
+                                          supported_precision_modes))
+  batches = conversion_params.cached_engine_batches
+  if batches:  # None and an empty list both skip the checks below.
+    if not isinstance(batches, list):
+      raise TypeError("cached_engine_batches should be a list.")
+    if len(batches) > conversion_params.maximum_cached_engines:
+      raise ValueError("cached_engine_batches should not contain more than "
+                       "maximum_cached_engines items.")
+
+
+def _check_trt_version_compatibility():
+  """Compare the linked and the loaded TensorRT versions and log the result.
+
+  Raises:
+    RuntimeError: if the first loaded component is below the linked one.
+  """
+  linked = get_linked_tensorrt_version()
+  loaded = get_loaded_tensorrt_version()
+  tf_logging.info("Linked TensorRT version: %s" % str(linked))
+  tf_logging.info("Loaded TensorRT version: %s" % str(loaded))
+  # Dotted renderings shared by the messages below.
+  linked_str = ".".join(str(part) for part in linked)
+  loaded_str = ".".join(str(part) for part in loaded)
+
+  # A lower leading version component is logged as an error and is fatal.
+  if loaded[0] < linked[0]:
+    tf_logging.error(
+        "TensorRT version mismatch. Tensorflow was compiled against "
+        "TensorRT %s but library loaded from environment is TensorRT %s"
+        ". Please make sure that correct version of TensorRT "
+        "is available in the system and added to ldconfig or LD_LIBRARY_PATH" %
+        (linked_str, loaded_str))
+    raise RuntimeError("Incompatible TensorRT library version")
+
+  # Components are compared pairwise; any other difference is only a warning.
+  if any(got != want for got, want in zip(loaded, linked)):
+    tf_logging.warn("TensorRT mismatch. Compiled against version "
+                    "%s, but loaded %s. Things may not work" %
+                    (linked_str, loaded_str))
+  else:
+    tf_logging.info("Running against TensorRT version %s" % loaded_str)
+
+
+def get_tensorrt_rewriter_config(
+    conversion_params=DEFAULT_TRT_CONVERSION_PARAMS, is_v2=False):
+  """Builds the RewriterConfig proto that makes Grappler run TF-TRT.
+
+  Args:
+    conversion_params: a TrtConversionParams instance.
+    is_v2: whether the RewriterConfig is meant for TF 2.0.
+
+  Returns:
+    A RewriterConfig proto with a "TensorRTOptimizer" custom optimizer added.
+
+  Raises:
+    TypeError: if any of the parameters are of unexpected type.
+    ValueError: if any of the parameters are of unexpected value.
+  """
+  # Validate everything up front, before any proto is built.
+  template = conversion_params.rewriter_config_template
+  if not (template is None or
+          isinstance(template, rewriter_config_pb2.RewriterConfig)):
+    raise TypeError(
+        "rewriter_config_template should be a RewriterConfig proto.")
+  _check_conversion_params(conversion_params)
+
+  config = rewriter_config_pb2.RewriterConfig()
+  if template is not None:
+    # Start from the caller's template; the TRT optimizer is appended below.
+    config.CopyFrom(template)
+  else:
+    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
+    # need to run constant folding again.
+    config.optimizers.extend(["constfold", "layout", "constfold"])
+    config.meta_optimizer_iterations = rewriter_config_pb2.RewriterConfig.ONE
+
+  # Register the TRT custom optimizer and fill in its parameters.
+  trt_optimizer = config.custom_optimizers.add()
+  trt_optimizer.name = "TensorRTOptimizer"
+  params = trt_optimizer.parameter_map
+
+  # Options that are set regardless of is_v2.
+  params["minimum_segment_size"].i = conversion_params.minimum_segment_size
+  params["max_workspace_size_bytes"].i = (
+      conversion_params.max_workspace_size_bytes)
+  params["precision_mode"].s = _to_bytes(conversion_params.precision_mode)
+  params["maximum_cached_engines"].i = (
+      conversion_params.maximum_cached_engines)
+  params["use_calibration"].b = conversion_params.use_calibration
+  params["use_function_backup"].b = conversion_params.use_function_backup
+
+  if is_v2:
+    # Static mode (a.k.a pre-generating TRT engines and make them node
+    # attributes) is deprecated in TF 2.0.
+    params["is_dynamic_op"].b = True
+    return config
+
+  # Non-v2 path: batch-size options and the caller's is_dynamic_op are used.
+  params["max_batch_size"].i = conversion_params.max_batch_size
+  params["is_dynamic_op"].b = conversion_params.is_dynamic_op
+  engine_batches = conversion_params.cached_engine_batches
+  if engine_batches:
+    params["cached_engine_batches"].list.i.extend(engine_batches)
+  return config
+
 
 class TrtGraphConverter(GraphConverter):
   """A GraphConverter for TRT transformation."""
 
-  _TRT_CALIBRATION_RESOURCE_CONTAINER_NAME = "TF-TRT-Calibration"
-
-  @classmethod
-  def get_tensorrt_rewriter_config(
-      cls,
-      rewriter_config_template=None,
-      max_batch_size=1,
-      max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
-      precision_mode=TrtPrecisionMode.FP32,
-      minimum_segment_size=3,
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batches=None,
-      use_calibration=True,
-      use_function_backup=True):
-    """Returns a RewriterConfig proto for TRT transformation.
-
-    Args:
-      rewriter_config_template: a template RewriterConfig proto used to create a
-        TRT-enabled RewriterConfig. If None, it will use a default one.
-      max_batch_size: max size for the input batch
-      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-        engine can use at execution time. This corresponds to the
-        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-      minimum_segment_size: the minimum number of nodes required for a subgraph
-        to be replaced by TRTEngineOp.
-      is_dynamic_op: whether to generate dynamic TRT ops which will build the
-        TRT network and engine at run time.
-      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
-        ops. If the number of cached engines is already at max but none of them
-        can serve the input, the TRTEngineOp will fall back to run the TF
-        function based on which the TRTEngineOp is created.
-      cached_engine_batches: a list of batch sizes used to create cached
-        engines, only used when is_dynamic_op is True. The length of the list
-        should be <= maximum_cached_engines, and the dynamic TRT op will use
-        this list to determine the batch sizes of the cached engines, instead of
-        making the decision on the fly. This is useful when we know the most
-        common batch size(s) the application is going to generate.
-      use_calibration: this argument is ignored if precision_mode is not INT8.
-        If set to True, a calibration graph will be created to calibrate the
-        missing ranges. The calibration graph must be converted to an inference
-        graph by running calibration with calibrate(). If set to False,
-        quantization nodes will be expected for every tensor in the graph
-        (exlcuding those which will be fused). If a range is missing, an error
-        will occur. Please note that accuracy may be negatively affected if
-        there is a mismatch between which tensors TRT quantizes and which
-        tensors were trained with fake quantization.
-      use_function_backup: if set to True, it will create a FunctionDef for each
-        subgraph that is converted to TRT op, and if TRT ops fail to execute at
-        runtime, it'll invoke that function as a fallback.
-
-    Returns:
-      A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
-
-    Raises:
-      TypeError: if any of the parameters are of unexpected type.
-      ValueError: if any of the parameters are of unexpected value.
-    """
-    if rewriter_config_template is not None and not isinstance(
-        rewriter_config_template, rewriter_config_pb2.RewriterConfig):
-      raise TypeError(
-          "rewriter_config_template should be a RewriterConfig proto.")
-
-    rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
-    if rewriter_config_template is None:
-      # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
-      # need to run constant folding again.
-      rewriter_config_with_trt.optimizers.extend(
-          ["constfold", "layout", "constfold"])
-      rewriter_config_with_trt.meta_optimizer_iterations = (
-          rewriter_config_pb2.RewriterConfig.ONE)
-    else:
-      rewriter_config_with_trt.CopyFrom(rewriter_config_template)
-
-    optimizer = rewriter_config_with_trt.custom_optimizers.add()
-    optimizer.name = "TensorRTOptimizer"
-    optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
-    optimizer.parameter_map["max_batch_size"].i = max_batch_size
-    optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
-    optimizer.parameter_map[
-        "max_workspace_size_bytes"].i = max_workspace_size_bytes
-    optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
-    optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
-    if cached_engine_batches:
-      optimizer.parameter_map["cached_engine_batches"].list.i.extend(
-          cached_engine_batches)
-    optimizer.parameter_map["use_calibration"].b = use_calibration
-    optimizer.parameter_map["use_function_backup"].b = use_function_backup
-    return rewriter_config_with_trt
-
+  # TODO(laigd): use TrtConversionParams here.
   def __init__(self,
                input_saved_model_dir=None,
                input_saved_model_tags=None,
@@ -621,7 +692,7 @@
         If set to None, the graph will be read from the SavedModel loaded from
         input_saved_model_dir.
       nodes_blacklist: list of node names to prevent the converter from
-        touching. Only used when input_graph_def is not None.
+        touching.
       session_config: the ConfigProto used to create a Session. It's also used
         as a template to create a TRT-enabled ConfigProto for conversion. If not
         specified, a default ConfigProto will be used.
@@ -659,7 +730,6 @@
 
     Raises:
       ValueError: if the combination of the parameters is invalid.
-      RuntimeError: if the TensorRT library version is incompatible.
     """
     super(TrtGraphConverter, self).__init__(
         input_saved_model_dir=input_saved_model_dir,
@@ -668,54 +738,10 @@
         input_graph_def=input_graph_def,
         nodes_blacklist=nodes_blacklist,
         session_config=session_config)
-
-    # TODO(laigd): move all the validations below to
-    # get_tensorrt_rewriter_config().
-    # Check compatibility of TensorRT version.
-    compiled_version = get_linked_tensorrt_version()
-    loaded_version = get_loaded_tensorrt_version()
-    tf_logging.info("Linked TensorRT version: %s" % str(compiled_version))
-    tf_logging.info("Loaded TensorRT version: %s" % str(loaded_version))
-    version_mismatch = False
-    if loaded_version[0] < compiled_version[0]:
-      tf_logging.error(
-          "TensorRT version mismatch. Tensorflow was compiled against " +
-          "TensorRT %s but library loaded from environment is TensorRT %s" %
-          (".".join([str(x) for x in compiled_version]),
-           ".".join([str(x) for x in loaded_version])) +
-          ". Please make sure that correct version of TensorRT " +
-          "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
-      raise RuntimeError("Incompatible TensorRT library version")
-    for i in zip(loaded_version, compiled_version):
-      if i[0] != i[1]:
-        tf_logging.warn("TensorRT mismatch. Compiled against version " +
-                        "%s, but loaded %s. Things may not work" %
-                        (".".join([str(x) for x in compiled_version]),
-                         ".".join([str(x) for x in loaded_version])))
-        version_mismatch = True
-        break
-    if not version_mismatch:
-      tf_logging.info("Running against TensorRT version %s" %
-                      ".".join([str(x) for x in loaded_version]))
-
-    # Check input arguments.
-    supported_precision_modes = TrtPrecisionMode.supported_precision_modes()
-    if precision_mode not in supported_precision_modes:
-      raise ValueError(
-          ("precision mode '{}' is not supported."
-           "It should be one of {}").format(precision_mode,
-                                            supported_precision_modes))
-
-    if cached_engine_batches:
-      if not isinstance(cached_engine_batches, list):
-        raise TypeError("cached_engine_batches should be a list.")
-      if len(cached_engine_batches) > maximum_cached_engines:
-        raise ValueError("cached_engine_batches should not contain more than "
-                         "maximum_cached_engines items.")
+    _check_trt_version_compatibility()
 
     self._need_calibration = (
         precision_mode == TrtPrecisionMode.INT8 and use_calibration)
-    self._use_function_backup = use_function_backup
 
     # TODO(laigd): consider provide a mechanism to remove the fallback path
     # after calibration is done.
@@ -724,44 +750,36 @@
           "Calibration requires enabling fallback to TF function execution.")
 
     # TODO(laigd):
-    # - Get rid of is_dynamic_op option, it should always be True, and it should
-    #   accept N shapes as input.
     # - Verify in int8 mode that maximum_cached_engines and
     #   cached_engine_batches are set appropriately.
     # - If it fails to build the int8 engine it should return error.
-    self._max_batch_size = max_batch_size
-    self._max_workspace_size_bytes = max_workspace_size_bytes
-    self._precision_mode = precision_mode
-    self._minimum_segment_size = minimum_segment_size
-    self._is_dynamic_op = is_dynamic_op
-    self._maximum_cached_engines = maximum_cached_engines
-    self._cached_engine_batches = cached_engine_batches
+    rewriter_config_template = None
+    if (session_config and session_config.HasField("graph_options") and
+        session_config.graph_options.HasField("rewrite_options")):
+      rewriter_config_template = session_config.graph_options.rewrite_options
 
-  def get_rewriter_config(self, rewriter_config_template=None):
-    return TrtGraphConverter.get_tensorrt_rewriter_config(
-        rewriter_config_template,
-        max_batch_size=self._max_batch_size,
-        max_workspace_size_bytes=self._max_workspace_size_bytes,
-        precision_mode=self._precision_mode,
-        minimum_segment_size=self._minimum_segment_size,
-        is_dynamic_op=self._is_dynamic_op,
-        maximum_cached_engines=self._maximum_cached_engines,
-        cached_engine_batches=self._cached_engine_batches,
-        use_calibration=self._need_calibration,
-        use_function_backup=self._use_function_backup)
+    self._conversion_params = TrtConversionParams(
+        rewriter_config_template=rewriter_config_template,
+        max_workspace_size_bytes=max_workspace_size_bytes,
+        precision_mode=precision_mode,
+        minimum_segment_size=minimum_segment_size,
+        is_dynamic_op=is_dynamic_op,
+        maximum_cached_engines=maximum_cached_engines,
+        use_calibration=use_calibration,
+        use_function_backup=use_function_backup,
+        max_batch_size=max_batch_size,
+        cached_engine_batches=cached_engine_batches)
+    _check_conversion_params(self._conversion_params)
+
+  def get_rewriter_config(self):
+    return get_tensorrt_rewriter_config(
+        conversion_params=self._conversion_params)
 
   def finalize_calibration(self):
     assert self._need_calibration
     assert self._converted
     assert not self._calibration_data_collected
 
-    # Lazily load the op, since it's not available in cpu-only builds. Importing
-    # this at top will cause tests that imports TF-TRT fail when they're built
-    # and run without CUDA/GPU.
-    # pylint: disable=g-import-not-at-top,line-too-long
-    from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import get_serialized_resource_op
-    # pylint: enable=g-import-not-at-top,line-too-long
-
     # TODO(laigd): a better way would be to use self._calibration_sess to list
     # all the devices, add one get_serialized_resource_op for each device, and
     # fetch each such op for every resource until its found. This can work
@@ -775,15 +793,15 @@
       resource_name_input = array_ops.placeholder(dtypes.string)
 
       for node in self._converted_graph_def.node:
-        if node.op == "TRTEngineOp":
+        if node.op == _TRT_ENGINE_OP_NAME:
           # Adds the get_serialized_resource_op for the device if not done
           # before. We only add one such op for each device.
           # TODO(laigd): What if the device is empty?????
           if node.device not in device_to_get_resource_op_map:
             with self._calibration_graph.device(node.device):
               serialized_resources_output = (
-                  get_serialized_resource_op(container_input,
-                                             resource_name_input))
+                  gen_trt_ops.get_serialized_resource_op(
+                      container_input, resource_name_input))
             device_to_get_resource_op_map[node.device] = (
                 serialized_resources_output)
 
@@ -791,11 +809,8 @@
           calibration_result = self._calibration_sess.run(
               device_to_get_resource_op_map[node.device],
               feed_dict={
-                  container_input:
-                      TrtGraphConverter
-                      ._TRT_CALIBRATION_RESOURCE_CONTAINER_NAME,
-                  resource_name_input:
-                      node.name
+                  container_input: _TRT_CALIBRATION_RESOURCE_CONTAINER_NAME,
+                  resource_name_input: node.name
               })
           node.attr["calibration_data"].s = calibration_result
 
@@ -806,9 +821,237 @@
     """Save the converted graph as a SavedModel."""
     if self._need_calibration:
       assert self._calibration_data_collected
+
     super(TrtGraphConverter, self).save(output_saved_model_dir)
 
 
+class TRTEngineResource(tracking.TrackableResource):
+  """Class to track the serialized engines resource."""
+
+  def __init__(self, resource_name, filename, maximum_cached_engines):
+    super(TRTEngineResource, self).__init__()
+    self._resource_name = resource_name
+    # Track the serialized engine file in the SavedModel.
+    self._filename = self._track_trackable(
+        tracking.TrackableAsset(filename), "_serialized_trt_engine_filename")
+    self._maximum_cached_engines = maximum_cached_engines
+
+  def _create_resource(self):
+    return gen_trt_ops.create_trt_engine_cache(
+        container=_TRT_ENGINE_CACHE_CONTAINER_NAME,
+        resource_name=self._resource_name,
+        max_cached_engines_count=self._maximum_cached_engines)
+
+  def _initialize(self):
+    gen_trt_ops.populate_trt_engine_cache(self.resource_handle, self._filename)
+
+
+class TrtGraphConverterV2(object):
+  """An offline converter for TF-TRT transformation for TF 2.0 SavedModels.
+
+  To run the conversion without quantization calibration (e.g. for FP32/FP16
+  precision modes):
+
+  ```python
+  params = DEFAULT_TRT_CONVERSION_PARAMS._replace(
+      precision_mode='FP16')
+  converter = TrtGraphConverterV2(
+      input_saved_model_dir="my_dir", conversion_params=params)
+  converter.convert()
+  converter.save(output_saved_model_dir)
+  ```
+
+  As a result, a TF-TRT converted SavedModel will be generated and saved to
+  `output_saved_model_dir`. The SavedModel will have TRT compatible subgraph
+  replaced by TRTEngineOps, but no TRT engines will be pre-built until execution
+  time. We can also build the TRT engines offline by running the converted
+  function with some input data:
+
+  ```python
+  params = DEFAULT_TRT_CONVERSION_PARAMS._replace(
+      precision_mode='FP16',
+      # Set this to a large enough number so it can cache all the TRT engines.
+      maximum_cached_engines=16)
+  converter = TrtGraphConverterV2(
+      input_saved_model_dir="my_dir", conversion_params=params)
+  converted_func = converter.convert()
+  for data in my_input_data:
+    converted_func(data)
+  converter.save(output_saved_model_dir)
+  ```
+
+  In this way, for each unique shape of the inputs to the TRTEngineOp, if it
+  cannot be handled by any previously generated TRT engine, a new engine will be
+  generated and serialized to the output SavedModel in `output_saved_model_dir`.
+  This is good for applications that cannot afford building TRT engines at
+  runtime but have access to input data that is similar to the one used in
+  production (for example, that will result in the same input shapes to the
+  TRTEngineOps). Also, the generated TRT engines are platform dependent, so we
+  need to run `converted_func` in an environment that is similar to production
+  (at least with same type of GPU).
+
+  TODO(laigd/hinsu): running conversion with calibration in INT8 mode should
+  follow exactly the same steps.
+  """
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_saved_model_signature_key=None,
+               conversion_params=DEFAULT_TRT_CONVERSION_PARAMS):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transform.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_saved_model_signature_key: the key of the signature to optimize the
+        graph for.
+      conversion_params: a TrtConversionParams instance.
+    """
+    assert context.executing_eagerly()
+    _check_trt_version_compatibility()
+
+    self._input_saved_model_dir = input_saved_model_dir
+    self._input_saved_model_tags = (
+        input_saved_model_tags or [tag_constants.SERVING])
+    self._input_saved_model_signature_key = (
+        input_saved_model_signature_key or
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
+
+    self._need_calibration = (
+        conversion_params.precision_mode == TrtPrecisionMode.INT8 and
+        conversion_params.use_calibration)
+    self._conversion_params = conversion_params
+    _check_conversion_params(self._conversion_params)
+    self._converted = False
+
+  def _run_conversion(self, meta_graph_def):
+    """Run Grappler's OptimizeGraph() tool to convert the graph.
+
+    Args:
+      meta_graph_def: the MetaGraphDef instance to run the optimizations on.
+
+    Returns:
+      The optimized GraphDef.
+    """
+    rewriter_config = get_tensorrt_rewriter_config(
+        conversion_params=self._conversion_params, is_v2=True)
+    grappler_session_config = config_pb2.ConfigProto()
+    grappler_session_config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config)
+    return tf_optimizer.OptimizeGraph(
+        grappler_session_config, meta_graph_def, graph_id=b"tf_graph")
+
+  # TODO(laigd): provide a utility function to optimize a ConcreteFunction and
+  # use it here (b/124792963).
+  def convert(self):
+    """Convert the input SavedModel in 2.0 format.
+
+    Returns:
+      The TF-TRT converted Function.
+    """
+    assert not self._converted
+    self._saved_model = load.load(self._input_saved_model_dir,
+                                  self._input_saved_model_tags)
+    func = self._saved_model.signatures[self._input_saved_model_signature_key]
+    frozen_func = convert_to_constants.convert_variables_to_constants_v2(func)
+    grappler_meta_graph_def = saver.export_meta_graph(
+        graph_def=frozen_func.graph.as_graph_def(), graph=frozen_func.graph)
+
+    # Add a collection 'train_op' so that Grappler knows the outputs.
+    fetch_collection = meta_graph_pb2.CollectionDef()
+    for array in frozen_func.inputs + frozen_func.outputs:
+      fetch_collection.node_list.value.append(array.name)
+    grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
+        fetch_collection)
+
+    # Run TRT optimizer in Grappler to convert the graph.
+    self._converted_graph_def = self._run_conversion(grappler_meta_graph_def)
+    self._converted_func = wrap_function.function_from_graph_def(
+        self._converted_graph_def,
+        [tensor.name for tensor in frozen_func.inputs],
+        [tensor.name for tensor in frozen_func.outputs])
+
+    self._converted = True
+
+    # Wrap the converted ConcreteFunction in a Function so it can accept numpy
+    # arrays as input.
+    @def_function.function
+    def wrapper_func(*args, **kwargs):
+      return self._converted_func(*args, **kwargs)
+
+    return wrapper_func
+
+  def save(self, output_saved_model_dir):
+    """Save the converted SavedModel.
+
+    Args:
+      output_saved_model_dir: directory to save the converted SavedModel to.
+    """
+    assert self._converted
+
+    @def_function.function
+    def _dump_trt_cache(resource_name, filename):
+      gen_trt_ops.dump_trt_engine_cache(
+          container=_TRT_ENGINE_CACHE_CONTAINER_NAME,
+          resource_name=resource_name,
+          filename=filename,
+          delete_cache_after_dump=True)
+
+    # Serialize the TRT engines in the cache if any, and create trackable
+    # resource to track them.
+    engine_asset_dir = tempfile.mkdtemp()
+    resource_map = {}
+
+    def _serialize_and_track_engine(canonical_engine_name):
+      """Serialize TRT engines in the cache and track them."""
+      # Don't dump the same cache twice.
+      if canonical_engine_name in resource_map:
+        return
+
+      filename = os.path.join(engine_asset_dir,
+                              "trt-serialized-engine." + canonical_engine_name)
+      try:
+        _dump_trt_cache(canonical_engine_name, filename)
+      except errors.NotFoundError:
+        # If the user hasn't run the function to populate the engine, it's fine,
+        # and we don't need to track any serialized TRT engines.
+        return
+
+      resource_map[canonical_engine_name] = TRTEngineResource(
+          canonical_engine_name, filename,
+          self._conversion_params.maximum_cached_engines)
+
+    # Remove all scope prefixes in the node name. In TF 2.0, the same concrete
+    # function can be initialized multiple times with different prefixes, and
+    # this will result in the same TRTEngineOp being initialized multiple times
+    # with different cache and duplicate TRT engines.
+    # TODO(laigd): this may be caused by the fact that TRTEngineOp is not
+    # stateful, need to investigate.
+    # TODO(laigd): we rely on the fact that all functions are fully inlined
+    # before TF-TRT optimizer is called, as otherwise it may generate the same
+    # name when optimizing a different function graph. Fix this.
+    canonical_engine_name = lambda node: node.name.split("/")[-1]
+    for node in self._converted_graph_def.node:
+      if node.op == _TRT_ENGINE_OP_NAME:
+        _serialize_and_track_engine(canonical_engine_name(node))
+    for func in self._converted_graph_def.library.function:
+      for node in func.node_def:
+        if node.op == _TRT_ENGINE_OP_NAME:
+          _serialize_and_track_engine(canonical_engine_name(node))
+
+    self._saved_model.trt_engine_resources = resource_map
+
+    # Rewrite the signature map using the optimized ConcreteFunction.
+    signatures = {
+        key: value for key, value in self._saved_model.signatures.items()
+    }
+    signatures[self._input_saved_model_signature_key] = self._converted_func
+    save.save(self._saved_model, output_saved_model_dir, signatures)
+
+
+# TODO(laigd): use TrtConversionParams here.
 def create_inference_graph(
     input_graph_def,
     outputs,
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
index 3d7ebfa..f69e7dc 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -19,13 +19,15 @@
 from __future__ import print_function
 
 import os
+import tempfile
+
+import numpy as np
 
 from tensorflow.compiler.tf2tensorrt.wrap_py_utils import is_tensorrt_enabled
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.compiler.tensorrt import trt_convert
-from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -35,7 +37,6 @@
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import builder
@@ -44,9 +45,9 @@
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
-from tensorflow.python.tools import saved_model_utils
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
+from tensorflow.python.tools import saved_model_utils
 from tensorflow.python.training.tracking import tracking
 
 _SAVED_MODEL_SIGNATURE_KEY = "mypredict"
@@ -63,8 +64,7 @@
     """Test case for TrtGraphConverter.get_tensorrt_rewriter_config()."""
     if not is_tensorrt_enabled():
       return
-    rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
-        rewriter_config_template=None,
+    conversion_params = trt_convert.DEFAULT_TRT_CONVERSION_PARAMS._replace(
         max_batch_size=128,
         max_workspace_size_bytes=1234,
         precision_mode="INT8",
@@ -72,6 +72,8 @@
         is_dynamic_op=True,
         maximum_cached_engines=2,
         cached_engine_batches=[1, 128])
+    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
+        conversion_params=conversion_params)
     self.assertEqual(["constfold", "layout", "constfold"],
                      rewriter_cfg.optimizers)
     self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
@@ -106,7 +108,8 @@
         gpu_options=config_pb2.GPUOptions(allow_growth=True))
     return config
 
-  def _GetGraph(self):
+  @classmethod
+  def _GetGraph(cls, inp, var):
     """Get the graph for testing."""
     # The graph computes (input+1)^2, it looks like:
     #
@@ -119,24 +122,42 @@
     #                    +
     #                    |
     #                 output (Identity)
+    add = inp + var
+    mul = inp * add
+    add = mul + add
+    out = array_ops.identity(add, name="output")
+    return out
+
+  def _GetModelForV2(self):
+
+    class SimpleModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.v = None
+
+      @def_function.function(input_signature=[
+          tensor_spec.TensorSpec(shape=[None, 1, 1], dtype=dtypes.float32)
+      ])
+      def run(self, inp):
+        if self.v is None:
+          self.v = variables.Variable([[[1.0]]], dtype=dtypes.float32)
+        return TrtConvertTest._GetGraph(inp, self.v)
+
+    return SimpleModel()
+
+  def _GetGraphForV1(self):
     g = ops.Graph()
     with g.as_default():
       with g.device("/GPU:0"):
         inp = array_ops.placeholder(
             dtype=dtypes.float32, shape=[None, 1, 1], name="input")
-        var = variables.VariableV1([[[1.0]]],
-                                   dtype=dtypes.float32,
-                                   name="v1",
-                                   use_resource=False)
-        add = inp + var.value()
-        mul = inp * add
-        add = mul + add
-        out = array_ops.identity(add, name="output")
-    return g, var, inp, out
+        var = variables.Variable([[[1.0]]], dtype=dtypes.float32, name="v1")
+        out = TrtConvertTest._GetGraph(inp, var)
+        return g, var, inp, out
 
   def _GetGraphDef(self):
     """Get the graph def for testing."""
-    g, var, _, _ = self._GetGraph()
+    g, var, _, _ = self._GetGraphForV1()
     with self.session(graph=g, config=self._GetConfigProto()) as sess:
       sess.run(var.initializer)
       graph_def = graph_util.convert_variables_to_constants(
@@ -145,7 +166,7 @@
     self.assertEqual(
         {
             "v1": "Const",
-            "v1/read": "Identity",
+            "add/ReadVariableOp": "Identity",
             "input": "Placeholder",
             "add": "Add",
             "mul": "Mul",
@@ -156,7 +177,7 @@
 
   def _WriteInputSavedModel(self, input_saved_model_dir):
     """Write the saved model as an input for testing."""
-    g, var, inp, out = self._GetGraph()
+    g, var, inp, out = self._GetGraphForV1()
     signature_def = signature_def_utils.build_signature_def(
         inputs={"myinput": utils.build_tensor_info(inp)},
         outputs={"myoutput": utils.build_tensor_info(out)},
@@ -183,7 +204,7 @@
         input_saved_model_dir=input_saved_model_dir,
         input_saved_model_signature_key=_SAVED_MODEL_SIGNATURE_KEY,
         input_graph_def=None if input_saved_model_dir else self._GetGraphDef(),
-        nodes_blacklist=["output"],
+        nodes_blacklist=None if input_saved_model_dir else ["output"],
         session_config=self._GetConfigProto(),
         max_batch_size=max_batch_size,
         max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
@@ -193,28 +214,23 @@
         is_dynamic_op=is_dynamic_op,
         maximum_cached_engines=maximum_cached_engines,
         use_function_backup=use_function_backup)
-    conversion_result = converter.convert()
+    output_graph_def = converter.convert()
 
-    if context.executing_eagerly():
-      output_graph_def = conversion_result.graph.as_graph_def()
-    else:
-      output_graph_def = conversion_result
+    if need_calibration:
 
-      if need_calibration:
+      class CalibrationData(object):
 
-        class CalibrationData(object):
+        def __init__(self):
+          self._data = 0
 
-          def __init__(self):
-            self._data = 0
+        def next(self):
+          self._data += 1
+          return {"input:0": [[[self._data]]]}
 
-          def next(self):
-            self._data += 1
-            return {"input:0": [[[self._data]]]}
-
-        output_graph_def = converter.calibrate(
-            fetch_names=["output:0"],
-            num_runs=10,
-            feed_dict_fn=CalibrationData().next)
+      output_graph_def = converter.calibrate(
+          fetch_names=["output:0"],
+          num_runs=10,
+          feed_dict_fn=CalibrationData().next)
 
     if output_saved_model_dir is not None:
       converter.save(output_saved_model_dir=output_saved_model_dir)
@@ -235,31 +251,19 @@
     graph_defs_to_verify = [output_graph_def]
 
     if output_saved_model_dir:
-      if context.executing_eagerly():
-        root = load.load(output_saved_model_dir)
-        saved_model_graph_def = root.signatures[
-            _SAVED_MODEL_SIGNATURE_KEY].graph.as_graph_def()
-      else:
-        saved_model_graph_def = saved_model_utils.get_meta_graph_def(
-            output_saved_model_dir, tag_constants.SERVING).graph_def
-      self.assertTrue(isinstance(saved_model_graph_def, graph_pb2.GraphDef))
+      saved_model_graph_def = saved_model_utils.get_meta_graph_def(
+          output_saved_model_dir, tag_constants.SERVING).graph_def
+      self.assertIsInstance(saved_model_graph_def, graph_pb2.GraphDef)
       graph_defs_to_verify.append(saved_model_graph_def)
 
     for graph_def in graph_defs_to_verify:
       node_name_to_op = {node.name: node.op for node in graph_def.node}
-      if context.executing_eagerly():
-        # In V2 the actual graph could be inside a function.
-        for func in graph_def.library.function:
-          node_name_to_op.update({node.name: node.op for node in func.node_def})
-        self.assertIn("TRTEngineOp_0", node_name_to_op)
-        self.assertEqual("TRTEngineOp", node_name_to_op["TRTEngineOp_0"])
-      else:
-        self.assertEqual(
-            {
-                "input": "Placeholder",
-                "TRTEngineOp_0": "TRTEngineOp",
-                "output": "Identity"
-            }, node_name_to_op)
+      self.assertEqual(
+          {
+              "input": "Placeholder",
+              "TRTEngineOp_0": "TRTEngineOp",
+              "output": "Identity"
+          }, node_name_to_op)
 
       if need_calibration:
         trt_engine_nodes = [
@@ -306,39 +310,81 @@
     if not is_tensorrt_enabled():
       return
 
-    # TODO(laigd): we need to use ops like conv2d so Grappler can infer the
-    # shapes (at least rank) of the tensors, so we're able to build an TRT
-    # engine in dynamic mode. Currently shape information is not propagate from
-    # ConcreteFunction to GraphDef, need to investigate and fix it.
-    class SimpleModel(tracking.AutoTrackable):
+    np_input = np.random.random_sample([4, 1, 1]).astype(np.float32)
 
-      def __init__(self):
-        self.v = None
-
-      @def_function.function(input_signature=[
-          tensor_spec.TensorSpec(shape=[None, 24, 24, 2], dtype=dtypes.float32)
-      ])
-      def run(self, inp):
-        if self.v is None:
-          self.v = variables.Variable([[[[1., 0.5, 4., 6., 0.5, 1.],
-                                         [1., 0.5, 1., 1., 0.5, 1.]]]])
-        conv = gen_nn_ops.conv2d(
-            input=inp, filter=self.v, strides=[1, 2, 2, 1], padding="SAME")
-        identity = array_ops.identity(conv)
-        return identity
-
-    tmp_dir = self.get_temp_dir()
-    input_saved_model_dir = os.path.join(tmp_dir, "in_dir1_v2")
-    root = SimpleModel()
+    # Create a model and save it.
+    input_saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    root = self._GetModelForV2()
+    expected_output = root.run(np_input)
     save.save(root, input_saved_model_dir,
               {_SAVED_MODEL_SIGNATURE_KEY: root.run})
 
-    # Convert the SavedModel and verify the result.
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir1_v2")
-    self._TestTrtGraphConverter(
+    # Run TRT conversion.
+    converter = trt_convert.TrtGraphConverterV2(
         input_saved_model_dir=input_saved_model_dir,
-        output_saved_model_dir=output_saved_model_dir,
-        is_dynamic_op=True)
+        input_saved_model_signature_key=_SAVED_MODEL_SIGNATURE_KEY,
+        conversion_params=trt_convert.DEFAULT_TRT_CONVERSION_PARAMS._replace(
+            precision_mode=trt_convert.TrtPrecisionMode.FP32,
+            is_dynamic_op=True,
+            maximum_cached_engines=2,
+            use_function_backup=False))
+    converted_func = converter.convert()
+
+    def _check_trt_ops(graph_def):
+      trt_op_names = [
+          node.name for node in graph_def.node if node.op == "TRTEngineOp"
+      ]
+      for func in graph_def.library.function:
+        for node in func.node_def:
+          if node.op == "TRTEngineOp":
+            trt_op_names.append(node.name)
+      self.assertEqual(1, len(trt_op_names))
+      self.assertIn("TRTEngineOp_0", trt_op_names[0])
+
+    # Verify the converted GraphDef and ConcreteFunction.
+    self.assertIsInstance(converted_func, def_function.Function)
+    converted_concrete_func = converted_func.get_concrete_function(
+        tensor_spec.TensorSpec(shape=[None, 1, 1], dtype=dtypes.float32))
+    _check_trt_ops(converted_concrete_func.graph.as_graph_def())
+
+    # Save the converted model without any TRT engine cache.
+    output_saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    converter.save(output_saved_model_dir)
+    unexpected_asset_file = os.path.join(
+        output_saved_model_dir, "assets/trt-serialized-engine.TRTEngineOp_0")
+    self.assertFalse(os.path.exists(unexpected_asset_file))
+
+    # Run the converted function to populate the engine cache.
+    output_with_trt = converted_func(np_input)
+    self.assertEqual(1, len(output_with_trt))
+    self.assertAllClose(
+        expected_output, output_with_trt[0], atol=1e-6, rtol=1e-6)
+
+    # Save the converted model again with serialized engine cache.
+    output_saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    converter.save(output_saved_model_dir)
+    expected_asset_file = os.path.join(
+        output_saved_model_dir, "assets/trt-serialized-engine.TRTEngineOp_0")
+    self.assertTrue(os.path.exists(expected_asset_file))
+    self.assertTrue(os.path.getsize(expected_asset_file))
+
+    # Load and verify the converted model.
+    #
+    # TODO(laigd): the name of the new input_signature of the
+    # `root_with_trt.run` function is an empty string (originally was None),
+    # investigate why.
+    root_with_trt = load.load(output_saved_model_dir)
+    # TODO(laigd): `root_with_trt.run` is still using the original graph without
+    # trt. Consider changing that.
+    # _check_trt_ops(
+    #     root_with_trt.run.get_concrete_function().graph.as_graph_def())
+    converted_signature = root_with_trt.signatures[_SAVED_MODEL_SIGNATURE_KEY]
+    _check_trt_ops(converted_signature.graph.as_graph_def())
+    output_with_trt = converted_signature(ops.convert_to_tensor(np_input))
+    # The converted signature returns a dict (V1 SavedModel signature
+    # compatibility). Take its single value; keys()[0] fails on Python 3.
+    output_with_trt = list(output_with_trt.values())[0]
+    self.assertAllClose(expected_output, output_with_trt, atol=1e-6, rtol=1e-6)
 
   def _TestRun(self,
                sess,
@@ -363,7 +409,7 @@
     node_name_to_op = {node.name: node.op for node in output_graph_def.node}
     self.assertEqual(
         {
-            "v1/read": "Const",
+            "add/ReadVariableOp": "Const",
             "input": "Placeholder",
             "add": "Add",
             "mul": "Mul",
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index 8cad912..5b97873 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -48,6 +48,25 @@
             name="sparse_num_elements_%d_batch_size_%d" % (non_zeros_per_row,
                                                            batch_size))
 
+  def benchmark_batch_dense(self):
+    for element_exp in [10, 12, 14, 16, 18, 20, 22]:
+      for batch_exp in [3, 6, 9]:
+        for parallel_copy in [True, False]:
+          element_size = 1 << element_exp
+          batch_size = 1 << batch_exp
+          dataset = dataset_ops.Dataset.from_tensors(
+              np.random.rand(element_size)).repeat().batch(batch_size)
+          options = dataset_ops.Options()
+          options.experimental_optimization.parallel_batch = parallel_copy
+          dataset = dataset.with_options(options)
+          tag = "_parallel" if parallel_copy else ""
+          self.run_and_report_benchmark(
+              dataset,
+              num_elements=(1 << (22 - batch_exp - element_exp // 2)),
+              iters=1,
+              name="batch_element_size_%d_batch_size_%d%s" %
+              (element_size, batch_size, tag))
+
 
 if __name__ == "__main__":
   benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/benchmark_base.py b/tensorflow/python/data/benchmarks/benchmark_base.py
index 11aaeba..85c8945 100644
--- a/tensorflow/python/data/benchmarks/benchmark_base.py
+++ b/tensorflow/python/data/benchmarks/benchmark_base.py
@@ -30,7 +30,7 @@
 class DatasetBenchmarkBase(test.Benchmark):
   """Base class for dataset benchmarks."""
 
-  def run_benchmark(self, dataset, num_elements, iters=1):
+  def run_benchmark(self, dataset, num_elements, iters=1, warmup=True):
     """Benchmarks the dataset.
 
     Runs the dataset `iters` times. In each iteration, the benchmark measures
@@ -41,6 +41,7 @@
       num_elements: Number of dataset elements to iterate through each benchmark
         iteration.
       iters: Number of times to repeat the timing.
+      warmup: If true, warms up the session caches by running an untimed run.
 
     Returns:
       A float, representing the per-element wall time of the dataset in seconds.
@@ -62,9 +63,10 @@
     deltas = []
     for _ in range(iters):
       with session.Session() as sess:
-        # Run once to warm up the session caches.
-        sess.run(iterator.initializer)
-        sess.run(next_element)
+        if warmup:
+          # Run once to warm up the session caches.
+          sess.run(iterator.initializer)
+          sess.run(next_element)
 
         sess.run(iterator.initializer)
         start = time.time()
@@ -78,9 +80,10 @@
                                num_elements,
                                name,
                                iters=5,
-                               extras=None):
+                               extras=None,
+                               warmup=True):
     # Measure the per-element wall time.
-    wall_time = self.run_benchmark(dataset, num_elements, iters)
+    wall_time = self.run_benchmark(dataset, num_elements, iters, warmup)
 
     if extras is None:
       extras = {}
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 519e8d5..a3b1140 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -26,11 +26,13 @@
 @@CheckpointInputPipelineHook
 @@CsvDataset
 @@DatasetStructure
+@@DistributeOptions
 @@MapVectorizationOptions
 @@NestedStructure
 @@OptimizationOptions
 @@Optional
 @@OptionalStructure
+@@RaggedTensorStructure
 @@RandomDataset
 @@Reducer
 @@SparseTensorStructure
@@ -85,7 +87,6 @@
 from __future__ import print_function
 
 # pylint: disable=unused-import
-
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch_with_legacy_function
@@ -94,6 +95,7 @@
 from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
 from tensorflow.python.data.experimental.ops.cardinality import UNKNOWN as UNKNOWN_CARDINALITY
 from tensorflow.python.data.experimental.ops.counter import Counter
+from tensorflow.python.data.experimental.ops.distribute_options import DistributeOptions
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
 from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
@@ -136,6 +138,7 @@
 from tensorflow.python.data.ops.optional_ops import Optional
 from tensorflow.python.data.ops.optional_ops import OptionalStructure
 from tensorflow.python.data.util.structure import NestedStructure
+from tensorflow.python.data.util.structure import RaggedTensorStructure
 from tensorflow.python.data.util.structure import SparseTensorStructure
 from tensorflow.python.data.util.structure import Structure
 from tensorflow.python.data.util.structure import TensorArrayStructure
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index d6b7d21..9d0a263 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -15,7 +15,6 @@
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
-        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -185,6 +184,25 @@
 )
 
 py_test(
+    name = "snapshot_dataset_benchmark",
+    srcs = ["snapshot_dataset_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/benchmarks:benchmark_base",
+        "//tensorflow/python/data/experimental/ops:snapshot",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
     name = "unbatch_benchmark",
     srcs = ["unbatch_benchmark.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
index 4d9e625..e0e6f88 100644
--- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -22,7 +22,6 @@
 import numpy as np
 
 from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import math_ops
@@ -78,13 +77,12 @@
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
                                                 np.random.rand(4 * k,
                                                                1))).repeat()
-    dataset = dataset.apply(
-        batching.map_and_batch(
-            math_ops.matmul,
-            num_parallel_calls=optimization.AUTOTUNE,
-            batch_size=batch_size))
+    dataset = dataset.map(
+        math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
+    dataset = dataset.batch(batch_size=batch_size)
     options = dataset_ops.Options()
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_and_batch_fusion = True
     options.experimental_optimization.autotune = autotune
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
diff --git a/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py
new file mode 100644
index 0000000..79b93c8
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py
@@ -0,0 +1,98 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.snapshot()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
+from tensorflow.python.data.experimental.ops import snapshot
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SnapshotDatasetBenchmark(benchmark_base.DatasetBenchmarkBase):
+  """Benchmarks for `tf.data.experimental.snapshot()`."""
+
+  def _makeSnapshotDirectory(self):
+    tmp_dir = test.get_temp_dir()
+    tmp_dir = os.path.join(tmp_dir, "snapshot")
+    if os.path.exists(tmp_dir):
+      shutil.rmtree(tmp_dir)
+    os.mkdir(tmp_dir)
+    return tmp_dir
+
+  def _createSimpleDataset(self, num_elems, tmp_dir=None):
+    if not tmp_dir:
+      tmp_dir = self._makeSnapshotDirectory()
+
+    dataset = dataset_ops.Dataset.from_tensor_slices([1.0])
+    dataset = dataset.map(
+        lambda x: gen_array_ops.broadcast_to(x, [50, 50, 3]))
+    dataset = dataset.repeat(num_elems)
+    dataset = dataset.apply(snapshot.snapshot(tmp_dir))
+
+    return dataset
+
+  def _consumeDataset(self, dataset, num_elems):
+    dataset = dataset.skip(num_elems)
+    next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    with session.Session() as sess:
+      try:
+        sess.run(next_element)
+      except errors.OutOfRangeError:
+        pass
+
+  def benchmarkWriteSnapshotSimple(self):
+    num_elems = 500000
+    dataset = self._createSimpleDataset(num_elems)
+
+    # We only run one iteration here because running multiple iterations will
+    # cause the later iterations to simply read from the already written
+    # snapshot rather than write a new one.
+    self.run_and_report_benchmark(dataset, num_elems, "write_simple",
+                                  warmup=False, iters=1)
+
+  def benchmarkPassthroughSnapshotSimple(self):
+    num_elems = 100000
+    tmp_dir = self._makeSnapshotDirectory()
+    dataset = self._createSimpleDataset(num_elems, tmp_dir)
+
+    # Consume only 1 element, thus making sure we don't finalize.
+    self._consumeDataset(dataset, 1)
+
+    self.run_and_report_benchmark(dataset, num_elems, "passthrough_simple")
+
+  def benchmarkReadSnapshotSimple(self):
+    num_elems = 100000
+    tmp_dir = self._makeSnapshotDirectory()
+    dataset = self._createSimpleDataset(num_elems, tmp_dir)
+
+    # consume all the elements to let snapshot write things to disk
+    self._consumeDataset(dataset, num_elems)
+
+    self.run_and_report_benchmark(dataset, num_elems, "read_simple")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 205337c..d288e25 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -145,23 +145,6 @@
 )
 
 py_test(
-    name = "enumerate_dataset_test",
-    size = "small",
-    srcs = ["enumerate_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/experimental/ops:enumerate_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_test(
     name = "get_single_element_test",
     size = "small",
     srcs = ["get_single_element_test.py"],
@@ -241,21 +224,6 @@
 )
 
 py_test(
-    name = "indexed_dataset_ops_test",
-    srcs = ["indexed_dataset_ops_test.py"],
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python/data/experimental/ops:indexed_dataset_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
     name = "make_batched_features_dataset_test",
     size = "medium",
     srcs = ["make_batched_features_dataset_test.py"],
@@ -404,11 +372,7 @@
     size = "medium",
     srcs = ["parallel_interleave_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "notap",
-    ],
+    tags = ["no_pip"],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -495,6 +459,8 @@
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:distribute",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
@@ -635,6 +601,24 @@
 )
 
 py_test(
+    name = "snapshot_test",
+    srcs = ["snapshot_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":reader_dataset_ops_test_base",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/experimental/ops:snapshot",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
     name = "stats_dataset_ops_test",
     size = "large",
     srcs = ["stats_dataset_ops_test.py"],
@@ -731,6 +715,7 @@
         "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/ops/ragged",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
diff --git a/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
index 64db471..5d964c1 100644
--- a/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
@@ -248,6 +248,18 @@
     ]
     self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
 
+  def testShardOutOfRange(self):
+    dataset = dataset_ops.Dataset.range(5)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = distribute._AutoShardDataset(dataset, 10, 0)
+      self.evaluate(self.getNext(dataset)())
+
+  def testShardOutOfRangeEmptyDataset(self):
+    dataset = dataset_ops.Dataset.range(0)
+    with self.assertRaises(errors.OutOfRangeError):
+      dataset = distribute._AutoShardDataset(dataset, 10, 0)
+      self.evaluate(self.getNext(dataset)())
+
   def testNoReaderPipelines(self):
     dataset = dataset_ops.Dataset.range(1024)
     dataset = distribute._AutoShardDataset(dataset, 2, 0)
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
deleted file mode 100644
index 79b8c49..0000000
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for experimental indexed dataset ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-
-from tensorflow.python.data.experimental.ops import indexed_dataset_ops
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-from tensorflow.python.platform import test
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class IndexedDatasetOpsTest(test_base.DatasetTestBase):
-
-  def testLowLevelIndexedDatasetOps(self):
-    identity = ged_ops.experimental_identity_indexed_dataset(
-        ops.convert_to_tensor(16, dtype=dtypes.uint64))
-    handle = ged_ops.experimental_materialized_index_dataset_handle(
-        container="",
-        shared_name="",
-        output_types=[dtypes.uint64],
-        output_shapes=[[]])
-    materialize = ged_ops.experimental_indexed_dataset_materialize(
-        identity, handle)
-    get_op = ged_ops.experimental_indexed_dataset_get(
-        handle, 3, output_types=[dtypes.uint64], output_shapes=[[]])
-
-    self.evaluate(materialize)
-    self.assertEqual([3], self.evaluate(get_op))
-
-  # TODO(b/117581999): Eager mode not supported.
-  @test_util.run_deprecated_v1
-  def testSkipEagerIdentityIndexedDataset(self):
-    ds = indexed_dataset_ops.IdentityIndexedDataset(16)
-    materialized = ds.materialize()
-    self.evaluate(materialized.initializer)
-    for i in range(16):
-      output = self.evaluate(materialized.get(i))
-      self.assertEqual([i], output)
-    with self.assertRaises(errors.InvalidArgumentError):
-      self.evaluate(materialized.get(16))
-
-  @unittest.skip("Requisite functionality currently unimplemented.")
-  def testIdentityIndexedDatasetIterator(self):
-    ds = indexed_dataset_ops.IdentityIndexedDataset(16)
-    n = self.getNext(ds)
-
-    for i in range(16):
-      output = self.evaluate(n())
-      self.assertEqual(i, output)
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(n())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 9109e6d..a0253ad 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -44,17 +44,12 @@
 class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
-      ("Default", None, None, False),
-      ("SequentialCalls", 1, None, False),
-      ("ParallelCalls", 2, None, False),
-      ("ParallelBatches", None, 10, False),
-      ("DefaultNUMA", None, None, True),
-      ("SequentialCallsNUMA", 1, None, True),
-      ("ParallelCallsNUMA", 2, None, True),
-      ("ParallelBatchesNUMA", None, 10, True),
+      ("Default", None, None),
+      ("SequentialCalls", 1, None),
+      ("ParallelCalls", 2, None),
+      ("ParallelBatches", None, 10),
   )
-  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches,
-                      numa_aware):
+  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
@@ -65,7 +60,7 @@
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    def dataset_fn(batch_size, count, numa_aware=numa_aware):
+    def dataset_fn(batch_size, count):
       dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
           count).apply(
               batching.map_and_batch(
@@ -73,10 +68,6 @@
                   batch_size=batch_size,
                   num_parallel_calls=num_parallel_calls,
                   num_parallel_batches=num_parallel_batches))
-      if numa_aware:
-        options = dataset_ops.Options()
-        options.experimental_numa_aware = True
-        dataset = dataset.with_options(options)
       return dataset
 
     # Batch of a finite input, where the batch_size divides the
@@ -126,12 +117,10 @@
       self.assertDatasetProduces(dataset_fn(0, 14), expected_output=[])
 
   @parameterized.named_parameters(
-      ("Even", False, False),
-      ("Uneven", True, False),
-      ("EvenNUMA", False, True),
-      ("UnevenNUMA", True, True),
+      ("Even", False),
+      ("Uneven", True),
   )
-  def testMapAndBatchPartialBatch(self, drop_remainder, numa_aware):
+  def testMapAndBatchPartialBatch(self, drop_remainder):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
             batching.map_and_batch(
@@ -139,11 +128,6 @@
                 batch_size=4,
                 drop_remainder=drop_remainder)))
 
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-
     if drop_remainder:
       self.assertEqual(
           [4, 1], dataset_ops.get_legacy_output_shapes(dataset).as_list())
@@ -155,36 +139,20 @@
       expected_output.append([[64], [81]])
     self.assertDatasetProduces(dataset, expected_output=expected_output)
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchYieldsPartialBatch(self, numa_aware):
+  def testMapAndBatchYieldsPartialBatch(self):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
             batching.map_and_batch(lambda x: array_ops.reshape(x * x, [1]), 4)))
 
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
     self.assertEqual(
         [None, 1], dataset_ops.get_legacy_output_shapes(dataset).as_list())
     expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]],
                        [[64], [81]]]
     self.assertDatasetProduces(dataset, expected_output=expected_output)
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchParallelGetNext(self, numa_aware):
+  def testMapAndBatchParallelGetNext(self):
     dataset = dataset_ops.Dataset.range(50000).apply(
         batching.map_and_batch(lambda x: x, batch_size=100))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
 
     if context.executing_eagerly():
       iterator = iter(dataset)
@@ -207,20 +175,11 @@
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate([element() for element in elements])
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchParallelGetNextDropRemainder(self, numa_aware):
+  def testMapAndBatchParallelGetNextDropRemainder(self):
     dataset = dataset_ops.Dataset.range(49999).apply(
         batching.map_and_batch(
             lambda x: x, batch_size=100, drop_remainder=True))
 
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-
     if context.executing_eagerly():
       iterator = iter(dataset)
       get_next = iterator._next_internal  # pylint: disable=protected-access
@@ -242,11 +201,7 @@
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate([element() for element in elements])
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchSparse(self, numa_aware):
+  def testMapAndBatchSparse(self):
 
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -254,10 +209,6 @@
 
     dataset = dataset_ops.Dataset.range(10).apply(
         batching.map_and_batch(_sparse, 5))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
 
     self.assertDatasetProduces(
         dataset,
@@ -268,11 +219,7 @@
                 dense_shape=[5, 1]) for i in range(2)
         ])
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchFails(self, numa_aware):
+  def testMapAndBatchFails(self):
     """Test a dataset that maps a TF function across its input elements."""
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
@@ -280,18 +227,10 @@
           array_ops.check_numerics(
               constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
       dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
-      if numa_aware:
-        options = dataset_ops.Options()
-        options.experimental_numa_aware = True
-        dataset = dataset.with_options(options)
       get_next = self.getNext(dataset)
       self.evaluate(get_next())
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchShapeMismatch(self, numa_aware):
+  def testMapAndBatchShapeMismatch(self):
     """Test a dataset that maps a TF function across its input elements."""
 
     def generator():
@@ -304,20 +243,12 @@
         generator, output_types=dtypes.int32)
     batch_size = 4
     dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
         expected_error=(errors.InvalidArgumentError,
                         "number of elements does not match"))
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchImplicitDispose(self, numa_aware):
+  def testMapAndBatchImplicitDispose(self):
     # Tests whether a map and batch dataset will be cleaned up correctly when
     # the pipeline does not run it until exhaustion.
     # The pipeline is TensorSliceDataset -> RepeatDataset(1000) ->
@@ -332,29 +263,19 @@
     dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
         1000).apply(batching.map_and_batch(_map_fn, batch_size=100))
     dataset = dataset.prefetch(5)
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
     for _ in range(3):
       self.evaluate(get_next())
 
   @parameterized.named_parameters(
-      ("1", 0, False),
-      ("2", 5, False),
-      ("3", 10, False),
-      ("4", 90, False),
-      ("5", 95, False),
-      ("6", 99, False),
-      ("1NUMA", 0, True),
-      ("2NUMA", 5, True),
-      ("3NUMA", 10, True),
-      ("4NUMA", 90, True),
-      ("5NUMA", 95, True),
-      ("6NUMA", 99, True),
+      ("1", 0),
+      ("2", 5),
+      ("3", 10),
+      ("4", 90),
+      ("5", 95),
+      ("6", 99),
   )
-  def testMapAndBatchMapError(self, threshold, numa_aware):
+  def testMapAndBatchMapError(self, threshold):
 
     def raising_py_fn(i):
       if i >= threshold:
@@ -366,52 +287,31 @@
         batching.map_and_batch(
             lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
             batch_size=10))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
 
     get_next = self.getNext(dataset)
     for i in range(threshold // 10):
       self.assertAllEqual([i * 10 + j for j in range(10)],
                           self.evaluate(get_next()))
-    if numa_aware:
-      if threshold % 10 != 0:
-        self.assertAllEqual(
-            [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            self.evaluate(get_next()))
-    else:
-      for i in range(threshold // 10, 10):
-        with self.assertRaises(errors.InvalidArgumentError):
-          self.evaluate(get_next())
+    for i in range(threshold // 10, 10):
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(get_next())
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
   @parameterized.named_parameters(
-      ("1", False, dtypes.bool, False),
-      ("2", -42, dtypes.int8, False),
-      ("3", -42, dtypes.int16, False),
-      ("4", -42, dtypes.int32, False),
-      ("5", -42, dtypes.int64, False),
-      ("6", 42, dtypes.uint8, False),
-      ("7", 42, dtypes.uint16, False),
-      ("8", 42.0, dtypes.float16, False),
-      ("9", 42.0, dtypes.float32, False),
-      ("10", 42.0, dtypes.float64, False),
-      ("11", b"hello", dtypes.string, False),
-      ("1NUMA", False, dtypes.bool, True),
-      ("2NUMA", -42, dtypes.int8, True),
-      ("3NUMA", -42, dtypes.int16, True),
-      ("4NUMA", -42, dtypes.int32, True),
-      ("5NUMA", -42, dtypes.int64, True),
-      ("6NUMA", 42, dtypes.uint8, True),
-      ("7NUMA", 42, dtypes.uint16, True),
-      ("8NUMA", 42.0, dtypes.float16, True),
-      ("9NUMA", 42.0, dtypes.float32, True),
-      ("10NUMA", 42.0, dtypes.float64, True),
-      ("11NUMA", b"hello", dtypes.string, True),
+      ("1", False, dtypes.bool),
+      ("2", -42, dtypes.int8),
+      ("3", -42, dtypes.int16),
+      ("4", -42, dtypes.int32),
+      ("5", -42, dtypes.int64),
+      ("6", 42, dtypes.uint8),
+      ("7", 42, dtypes.uint16),
+      ("8", 42.0, dtypes.float16),
+      ("9", 42.0, dtypes.float32),
+      ("10", 42.0, dtypes.float64),
+      ("11", b"hello", dtypes.string),
   )
-  def testMapAndBatchTypes(self, element, dtype, numa_aware):
+  def testMapAndBatchTypes(self, element, dtype):
 
     def gen():
       yield element
@@ -419,11 +319,6 @@
     dataset = dataset_ops.Dataset.from_generator(gen, dtype).repeat(100).apply(
         batching.map_and_batch(lambda x: x, batch_size=10))
 
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-
     get_next = self.getNext(dataset)
     for _ in range(10):
       self.assertAllEqual([element for _ in range(10)],
@@ -456,11 +351,7 @@
     get_next = self.getNext(dataset, requires_initialization=True)
     self.assertAllEqual([42] * 10, self.evaluate(get_next()))
 
-  @parameterized.named_parameters(
-      ("Normal", False),
-      ("NUMA", True),
-  )
-  def testMapAndBatchControlFlow(self, numa_aware):
+  def testMapAndBatchControlFlow(self):
 
     def map_fn(x):
       previous_control_flow_v2_value = control_flow_util.ENABLE_CONTROL_FLOW_V2
@@ -471,10 +362,6 @@
 
     dataset = dataset_ops.Dataset.range(100).apply(
         batching.map_and_batch(map_fn, batch_size=10))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
     for i in range(10):
       if i < 5:
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index 318dfd9..fbeb443 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -55,9 +55,11 @@
     srcs = ["filter_with_random_uniform_fusion_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "manual",
         "no_oss",
         "no_pip",
         "no_windows",
+        "notap",  # TODO(b/131229793)
     ],
     deps = [
         "//tensorflow/python:client_testlib",
@@ -122,25 +124,6 @@
 )
 
 py_test(
-    name = "make_numa_aware_test",
-    srcs = ["make_numa_aware_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:optimization",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_test(
     name = "map_and_batch_fusion_test",
     srcs = ["map_and_batch_fusion_test.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
deleted file mode 100644
index d79ae43..0000000
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the `MakeNumaAware` optimization."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import test
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class MakeNumaAwareTest(test_base.DatasetTestBase):
-
-  def testMakeNumaAware(self):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(["NumaMapAndBatch"])).apply(
-            batching.map_and_batch(lambda x: x * x, 10))
-    options = dataset_ops.Options()
-    options.experimental_numa_aware = True
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    self.assertDatasetProduces(
-        dataset, expected_output=[[x * x for x in range(10)]])
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 9d53531..1fe5655 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -171,6 +171,7 @@
   def _testSingleThreaded(self, sloppy=False, prefetch_input_elements=0):
     # cycle_length=1,block_length=1 acts like `Dataset.interleave()` and
     # `Dataset.flat_map()` and is single-threaded. No synchronization required.
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     next_element = self.getNext(
         self.dataset_fn(
@@ -202,6 +203,7 @@
 
   def testSingleThreadedRagged(self):
     # Tests a sequence with wildly different elements per iterator.
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     next_element = self.getNext(
         self.dataset_fn(
@@ -229,6 +231,7 @@
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -271,6 +274,7 @@
     Args:
       sloppy: Whether to be sloppy or not.
     """
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -311,6 +315,7 @@
   def _testTwoThreadsNoContentionBlockLength(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -354,6 +359,7 @@
     Args:
       sloppy: Whether to be sloppy or not.
     """
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -434,6 +440,7 @@
   def _testPartiallyEmptyOutputs(self, sloppy=False, prefetch_input_elements=1):
     race_indices = {2, 8, 14}  # Sequence points when sloppy mode has race conds
     # Mixture of non-empty and empty interleaved datasets.
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -471,6 +478,7 @@
   def testDelayedOutputSloppy(self):
     # Explicitly control the sequence of events to ensure we correctly avoid
     # head-of-line blocking.
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     next_element = self.getNext(
         self.dataset_fn(
@@ -493,6 +501,7 @@
       self.evaluate(next_element())
 
   def testBlockLengthWithContentionSloppy(self):
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     done_first_event = False
     next_element = self.getNext(
@@ -526,6 +535,7 @@
 
   def _testEarlyExit(self, sloppy=False):
     # Exiting without consuming all input should not block
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     next_element = self.getNext(
         self.dataset_fn(
@@ -601,6 +611,7 @@
       self.evaluate(get_next())
 
   def testErrorsInOutputFn(self):
+    self.skipTest("b/131722904")
     self._clear_coordination_events()
     next_element = self.getNext(
         self.dataset_fn(
diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
index 38d3abe..20ea60b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
@@ -20,13 +20,18 @@
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import distribute
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -42,7 +47,7 @@
   def testBasic(self, drop_remainder):
     dataset = dataset_ops.Dataset.range(1024).batch(
         32, drop_remainder=drop_remainder)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[32 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -56,14 +61,14 @@
   def testScalarInputError(self, _):
     dataset = dataset_ops.Dataset.range(1024)
     with self.assertRaisesRegexp(ValueError, "at least one dimension"):
-      batching._RebatchDataset(dataset, num_workers=4)
+      distribute._RebatchDataset(dataset, num_workers=4)
 
   def testNotDivisibleError(self, drop_remainder):
     dataset = dataset_ops.Dataset.range(1024).batch(
         32, drop_remainder=drop_remainder)
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "not divisible by"):
-      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=5)
+      rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=5)
       next_element = self.getNext(rebatched_dataset)
       self.evaluate(next_element())
 
@@ -71,7 +76,7 @@
     dataset = (
         dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(
             32, drop_remainder=drop_remainder))
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     expected_output = [([k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
                         [k for k in range(i, i + 8)])
                        for i in range(0, 1024, 8)]
@@ -81,7 +86,7 @@
     dataset = dataset_ops.Dataset.range(1024).map(
         lambda x: {"a": x, "b": {"c": x}}).batch(
             32, drop_remainder=drop_remainder)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     expected_output = [{"a": [k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
                         "b": {"c": [k for k in range(i, i + 8)]}}
                        for i in range(0, 1024, 8)]
@@ -90,7 +95,7 @@
   def testFinalPartialBatchOriginal(self, drop_remainder):
     dataset = dataset_ops.Dataset.range(1032).batch(
         32, drop_remainder=drop_remainder)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[32 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -104,7 +109,7 @@
   def testFinalPartialBatchAfterRebatch(self, drop_remainder):
     dataset = dataset_ops.Dataset.range(34).batch(
         32, drop_remainder=drop_remainder)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[32 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -130,7 +135,7 @@
                        for i in range(0, 128, 32)]
     self.assertDatasetProduces(dataset, expected_output)
 
-    rebatched_dataset = batching._RebatchDataset(dataset, 4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, 4)
     self.assertEqual(
         [[2, 4]] if drop_remainder else [[None, None]],
         [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
@@ -144,7 +149,7 @@
     dataset = dataset_ops.Dataset.range(1024).apply(
         batching.map_and_batch(
             math_ops.square, 32, drop_remainder=drop_remainder))
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[32 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -155,10 +160,26 @@
                        for i in range(0, 1024, 8)]
     self.assertDatasetProduces(rebatched_dataset, expected_output)
 
+  def testMapAndBatchWithCapturedInput(self, drop_remainder):
+    captured_t = variables.Variable(42)
+    dataset = dataset_ops.Dataset.range(1024).apply(
+        batching.map_and_batch(
+            lambda x: captured_t, 32, drop_remainder=drop_remainder))
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual([[32 if drop_remainder else None]],
+                     [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual([[8 if drop_remainder else None]],
+                     [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = [[42 for _ in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 1024, 8)]
+    self.evaluate(variables.global_variables_initializer())
+    self.assertDatasetProduces(
+        rebatched_dataset, expected_output, requires_initialization=True)
+
   def testPaddedBatch(self, drop_remainder):
     dataset = dataset_ops.Dataset.range(128).batch(4).padded_batch(
         8, padded_shapes=[5], drop_remainder=drop_remainder)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8, 5]] if drop_remainder else [[None, 5]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -184,7 +205,7 @@
     dataset2 = dataset_ops.Dataset.range(32).batch(
         8, drop_remainder=drop_remainder)
     dataset = dataset1.concatenate(dataset2)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -201,7 +222,7 @@
     dataset2 = dataset_ops.Dataset.range(32).batch(
         8, drop_remainder=drop_remainder)
     dataset = dataset1.concatenate(dataset2)
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[None]], [ts.as_list() for ts in _flat_shapes(dataset)])
     self.assertEqual(
@@ -217,7 +238,7 @@
     dataset2 = dataset_ops.Dataset.range(32).batch(
         8, drop_remainder=drop_remainder)
     dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8], [8]] if drop_remainder else [[None], [None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -233,7 +254,7 @@
     dataset2 = dataset_ops.Dataset.range(32).batch(
         8, drop_remainder=drop_remainder)
     dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[16], [8]] if drop_remainder else [[None], [None]],
         [ts.as_list() for ts in _flat_shapes(dataset)])
@@ -249,7 +270,7 @@
         32, drop_remainder=drop_remainder).apply(
             scan_ops.scan([0], lambda _, a: ([0], a)))
     with self.assertRaises(errors.InvalidArgumentError):
-      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+      rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
       next_element = self.getNext(rebatched_dataset)
       self.evaluate(next_element())
 
@@ -264,7 +285,7 @@
     expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
     self.assertDatasetProduces(dataset, expected_output)
 
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
@@ -286,7 +307,7 @@
     expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
     self.assertDatasetProduces(dataset, expected_output)
 
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
@@ -309,7 +330,7 @@
     expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
     self.assertDatasetProduces(dataset, expected_output)
 
-    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
         [[8 if drop_remainder else None]],
         [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
@@ -320,5 +341,25 @@
                        for _ in range(2)]
     self.assertDatasetProduces(rebatched_dataset, expected_output)
 
+  def testGroupByWindowBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.from_tensor_slices(  # 40 rows of [i, i, i]
+        [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)])
+    reduce_fn = lambda bucket_id, ds: ds.batch(
+        batch_size=10, drop_remainder=drop_remainder)
+    dataset = dataset.apply(  # key is first value mod 4; each window is batched
+        grouping.group_by_window(
+            key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10))
+    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=2)
+
+    self.assertEqual([[5, 3] if drop_remainder else [None, 3]],
+                     [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # pylint: disable=g-complex-comprehension
+    expected_output = [[[j + i * 4 + k * 20] * 3  # 8 batches of 5 rows, 3 wide
+                        for i in range(5)]
+                       for j in range(4)
+                       for k in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 1888390..f0fc2d5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -373,26 +373,6 @@
 )
 
 py_test(
-    name = "numa_map_and_batch_dataset_serialization_test",
-    size = "medium",
-    srcs = ["numa_map_and_batch_dataset_serialization_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # b/118497483
-        "no_pip",
-        "no_windows",
-        "notap",
-    ],
-    deps = [
-        ":dataset_serialization_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_test(
     name = "map_dataset_serialization_test",
     size = "medium",
     srcs = ["map_dataset_serialization_test.py"],
@@ -466,7 +446,7 @@
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:distribute",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py
deleted file mode 100644
index 04aab32..0000000
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the MapAndBatchDataset serialization."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class MapAndBatchDatasetSerializationTest(
-    dataset_serialization_test_base.DatasetSerializationTestBase):
-
-  def testNumParallelBatches(self):
-    range_size = 11
-    num_repeats = 2
-    batch_size = 5
-    total_outputs = range_size * num_repeats
-    num_outputs_drop_remainder = total_outputs // batch_size
-    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
-    num_parallel_batches = 2
-
-    def build_ds(range_start, drop_remainder=False):
-
-      def _map_fn(x):
-        return math_ops.square(x)
-
-      ds = dataset_ops.Dataset.range(
-          range_start, range_start + range_size).repeat(num_repeats).apply(
-              batching.map_and_batch(
-                  map_func=_map_fn,
-                  batch_size=batch_size,
-                  num_parallel_batches=num_parallel_batches,
-                  drop_remainder=drop_remainder))
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      return ds.with_options(options)
-
-    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
-                        num_outputs_keep_remainder)
-    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
-                        num_outputs_drop_remainder)
-
-  def testNumParallelCalls(self):
-    range_size = 11
-    num_repeats = 2
-    batch_size = 5
-    total_outputs = range_size * num_repeats
-    num_outputs_drop_remainder = total_outputs // batch_size
-    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
-    num_parallel_calls = 7
-
-    def build_ds(range_start, drop_remainder=False):
-
-      def _map_fn(x):
-        return math_ops.square(x)
-
-      ds = dataset_ops.Dataset.range(
-          range_start, range_start + range_size).repeat(num_repeats).apply(
-              batching.map_and_batch(
-                  map_func=_map_fn,
-                  batch_size=batch_size,
-                  num_parallel_calls=num_parallel_calls,
-                  drop_remainder=drop_remainder))
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      return ds.with_options(options)
-
-    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
-                        num_outputs_keep_remainder)
-    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
-                        num_outputs_drop_remainder)
-
-
-if __name__ == "__main__":
-  test.main()
-
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
index b30db58..a053d08 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
@@ -18,7 +18,7 @@
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import distribute
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
@@ -29,7 +29,7 @@
   def testCore(self):
 
     def build_dataset(num_elements, batch_size):
-      return batching._RebatchDataset(
+      return distribute._RebatchDataset(
           dataset_ops.Dataset.range(num_elements).batch(
               4 * batch_size, drop_remainder=True),
           num_workers=4)
diff --git a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py
new file mode 100644
index 0000000..50090f2
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py
@@ -0,0 +1,183 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `SnapshotDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import snapshot
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase):
+
+  def setUp(self):
+    super(SnapshotDatasetTest, self).setUp()
+    self.removeTFRecords()  # delete whatever test_filenames lists by now
+
+  def removeTFRecords(self):
+    for filename in self.test_filenames:
+      os.remove(filename)
+    self.test_filenames = []  # rebinds; lists captured earlier are unchanged
+
+  def setUpTFRecord(self):
+    self._num_files = 10
+    self._num_records = 10
+    self.test_filenames = self._createFiles()  # presumably a base-class helper
+
+  def makeSnapshotDirectory(self):
+    tmpdir = self.get_temp_dir()
+    tmpdir = os.path.join(tmpdir, "snapshot")
+    os.mkdir(tmpdir)  # raises if the directory already exists
+    return tmpdir
+
+  def assertSnapshotDirectoryContains(
+      self, directory, num_fingerprints, num_runs_per_fp, num_snapshot_files):
+    dirlist = os.listdir(directory)
+    self.assertEqual(len(dirlist), num_fingerprints)
+
+    for i in range(num_fingerprints):  # each: run dirs + one metadata entry
+      fingerprint_dir = os.path.join(directory, dirlist[i])
+      fingerprint_dir_list = sorted(os.listdir(fingerprint_dir))
+      self.assertEqual(len(fingerprint_dir_list), num_runs_per_fp + 1)
+      self.assertEqual(fingerprint_dir_list[num_runs_per_fp],  # sorted last
+                       "snapshot.metadata")
+
+      for j in range(num_runs_per_fp):
+        run_dir = os.path.join(fingerprint_dir, fingerprint_dir_list[j])
+        run_dirlist = sorted(os.listdir(run_dir))
+        self.assertEqual(len(run_dirlist), num_snapshot_files)
+
+        file_counter = 0  # names count up from 00000000.snapshot
+        for filename in run_dirlist:
+          self.assertEqual(filename, "%08d.snapshot" % file_counter)
+          file_counter += 1
+
+  def testWriteDifferentPipelinesInOneDirectory(self):
+    tmpdir = self.makeSnapshotDirectory()
+
+    dataset = dataset_ops.Dataset.range(1000)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, list(range(1000)))
+
+    dataset = dataset_ops.Dataset.range(1001)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, list(range(1001)))
+
+    self.assertSnapshotDirectoryContains(tmpdir, 2, 1, 1)  # 2 fingerprints
+
+  def testWriteSnapshotMultipleSimultaneous(self):
+    tmpdir = self.makeSnapshotDirectory()
+
+    dataset1 = dataset_ops.Dataset.range(1000)
+    dataset1 = dataset1.apply(snapshot.snapshot(tmpdir))
+    next1 = self.getNext(dataset1)
+
+    dataset2 = dataset_ops.Dataset.range(1000)
+    dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
+    next2 = self.getNext(dataset2)
+
+    for _ in range(1000):
+      self.evaluate(next1())
+      self.evaluate(next2())
+
+    # Check that only one copy of the metadata has been written; the pipeline
+    # that lost the race is expected to have run in passthrough mode.
+    self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)
+
+  def testWriteSnapshotSimpleSuccessful(self):
+    tmpdir = self.makeSnapshotDirectory()
+
+    dataset = dataset_ops.Dataset.range(1000)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, list(range(1000)))
+
+    self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)
+
+  def testWriteSnapshotMultiFileSuccessful(self):
+    tmpdir = self.makeSnapshotDirectory()
+
+    dataset = dataset_ops.Dataset.range(20000)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, list(range(20000)))
+
+    self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 2)  # 2 snapshot files
+
+  def testReadSnapshotBackAfterWrite(self):
+    self.setUpTFRecord()
+    filenames = self.test_filenames
+
+    expected = [
+        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for f in range(0, 10)
+        for r in range(0, 10)
+    ]
+
+    tmpdir = self.makeSnapshotDirectory()
+    dataset = core_readers._TFRecordDataset(filenames)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, expected)
+
+    # Remove the original files so the data can only come from the snapshot.
+    self.removeTFRecords()
+
+    dataset2 = core_readers._TFRecordDataset(filenames)
+    dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset2, expected)
+
+  def testAdditionalOperationsAfterReadBack(self):
+    self.setUpTFRecord()
+    filenames = self.test_filenames
+
+    expected = [
+        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for f in range(0, 10)
+        for r in range(0, 10)
+    ]
+
+    tmpdir = self.makeSnapshotDirectory()
+    dataset = core_readers._TFRecordDataset(filenames)
+    dataset = dataset.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset, expected)
+
+    # Remove the original files so the data can only come from the snapshot.
+    self.removeTFRecords()
+
+    dataset2 = core_readers._TFRecordDataset(filenames)
+    dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
+    self.assertDatasetProduces(dataset2, expected)
+
+    expected_after = [  # the records above minus their first two bytes
+        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for f in range(0, 10)
+        for r in range(0, 10)
+    ]
+
+    dataset3 = core_readers._TFRecordDataset(filenames)
+    dataset3 = dataset3.apply(snapshot.snapshot(tmpdir))
+    dataset3 = dataset3.map(lambda x: string_ops.substr_v2(x, 2, 1000))
+    self.assertDatasetProduces(dataset3, expected_after)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index dfcc14e..22a9b9c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -32,6 +32,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -82,18 +83,37 @@
     ]
     self.assertDatasetProduces(data, expected_output=expected_output)
 
-  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+  def testUnbatchDatasetWithDenseSparseAndRaggedTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
         values=list(range(10)),
         dense_shape=[10, 10])
-    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
+    rt = ragged_factory_ops.constant_value([[[0]], [[1]], [[2]], [[3]], [[4]],
+                                            [[5]], [[6]], [[7]], [[8]], [[9]]])
+    data = dataset_ops.Dataset.from_tensors((list(range(10)), st, rt))
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    expected_output = [(i, sparse_tensor.SparseTensorValue([[i]], [i], [10]))
+    expected_output = [(i, sparse_tensor.SparseTensorValue([[i]], [i], [10]),
+                        ragged_factory_ops.constant_value([[i]]))
                        for i in range(10)]
-    self.assertDatasetProduces(data, expected_output=expected_output)
+    self.assertDatasetProduces(
+        data, expected_output=expected_output)
+
+  def testUnbatchDatasetWithRaggedTensor(self):
+    rt = ragged_factory_ops.constant_value([[[0]], [[1]], [[2]], [[3]], [[4]],
+                                            [[5]], [[6]], [[7]], [[8]], [[9]]])
+    data = dataset_ops.Dataset.from_tensors(rt)  # one element holding all 10 rows
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.batch(2)
+    data = data.apply(batching.unbatch())  # undoes only the outer batch(2)
+    expected_output = [  # two elements of five rows each
+        ragged_factory_ops.constant_value([[[0]], [[1]], [[2]], [[3]], [[4]]]),
+        ragged_factory_ops.constant_value([[[5]], [[6]], [[7]], [[8]], [[9]]]),
+    ]
+    self.assertDatasetProduces(
+        data, expected_output=expected_output)
 
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index faf4c2d..c5b4508 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -5,12 +5,30 @@
 exports_files(["LICENSE"])
 
 py_library(
+    name = "batching",
+    srcs = ["batching.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
+py_library(
     name = "cardinality",
     srcs = ["cardinality.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -22,120 +40,8 @@
         ":scan_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_library(
-    name = "get_single_element",
-    srcs = ["get_single_element.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
-py_library(
-    name = "iterator_ops",
-    srcs = [
-        "iterator_ops.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:basic_session_run_hooks",
-        "//tensorflow/python:checkpoint_management",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:saver",
-        "//tensorflow/python:session_run_hook",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:optional_ops",
-    ],
-)
-
-py_library(
-    name = "random_ops",
-    srcs = [
-        "random_ops.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:structure",
-    ],
-)
-
-py_library(
-    name = "readers",
-    srcs = [
-        "readers.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":batching",
-        ":interleave_ops",
-        ":optimization",
-        ":parsing_ops",
-        ":shuffle_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:convert",
-        "//tensorflow/python/data/util:nest",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "shuffle_ops",
-    srcs = [
-        "shuffle_ops.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_library(
-    name = "batching",
-    srcs = ["batching.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":get_single_element",
-        ":grouping",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:convert",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-        "//tensorflow/python/data/util:structure",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -146,13 +52,26 @@
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/python:errors",
         "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:structure",
     ],
 )
 
 py_library(
+    name = "distribute_options",
+    srcs = ["distribute_options.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
+    ],
+)
+
+py_library(
     name = "enumerate_ops",
     srcs = ["enumerate_ops.py"],
     srcs_version = "PY2AND3",
@@ -175,6 +94,18 @@
 )
 
 py_library(
+    name = "get_single_element",
+    srcs = ["get_single_element.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
     name = "grouping",
     srcs = ["grouping.py"],
     srcs_version = "PY2AND3",
@@ -212,6 +143,24 @@
 )
 
 py_library(
+    name = "iterator_ops",
+    srcs = [
+        "iterator_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:basic_session_run_hooks",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session_run_hook",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:optional_ops",
+    ],
+)
+
+py_library(
     name = "map_defun",
     srcs = ["map_defun.py"],
     srcs_version = "PY2AND3",
@@ -275,6 +224,69 @@
 )
 
 py_library(
+    name = "prefetching_ops",
+    srcs = ["prefetching_ops.py"],
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
+py_library(
+    name = "random_ops",
+    srcs = [
+        "random_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
+py_library(
+    name = "readers",
+    srcs = [
+        "readers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":interleave_ops",
+        ":optimization",
+        ":parsing_ops",
+        ":shuffle_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
     name = "resampling",
     srcs = ["resampling.py"],
     srcs_version = "PY2AND3",
@@ -311,6 +323,17 @@
 )
 
 py_library(
+    name = "shuffle_ops",
+    srcs = [
+        "shuffle_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
     name = "sleep",
     srcs = ["sleep.py"],
     srcs_version = "PY2AND3",
@@ -321,6 +344,19 @@
 )
 
 py_library(
+    name = "snapshot",
+    srcs = [
+        "snapshot.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
+py_library(
     name = "stats_aggregator",
     srcs = ["stats_aggregator.py"],
     srcs_version = "PY2AND3",
@@ -420,33 +456,6 @@
 )
 
 py_library(
-    name = "indexed_dataset_ops",
-    srcs = ["indexed_dataset_ops.py"],
-    deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
-py_library(
-    name = "prefetching_ops",
-    srcs = ["prefetching_ops.py"],
-    deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_spec",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:function",
-    ],
-)
-
-py_library(
     name = "dataset_ops",
     deps = [
         ":batching",
@@ -457,7 +466,6 @@
         ":error_ops",
         ":get_single_element",
         ":grouping",
-        ":indexed_dataset_ops",
         ":interleave_ops",
         ":map_defun",
         ":matching_files",
@@ -468,6 +476,7 @@
         ":scan_ops",
         ":shuffle_ops",
         ":sleep",
+        ":snapshot",
         ":stats_ops",
         ":take_while_ops",
         ":threadpool",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index 5ad917e..202a80a 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -17,141 +17,19 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.data.experimental.ops import get_single_element
-from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-def batch_window(dataset):
-  """Batches a window of tensors.
-
-  Args:
-    dataset: the input dataset.
-
-  Returns:
-    A `Tensor` representing the batch of the entire input dataset.
-  """
-  dataset_output_classes = dataset_ops.get_legacy_output_classes(dataset)
-  if isinstance(dataset_output_classes, tuple):
-    raise TypeError("Input dataset expected to have a single component")
-  if dataset_output_classes is ops.Tensor:
-    return _batch_dense_window(dataset)
-  elif dataset_output_classes is sparse_tensor.SparseTensor:
-    return _batch_sparse_window(dataset)
-  else:
-    raise TypeError("Unsupported dataset type: %s" % dataset_output_classes)
-
-
-def _batch_dense_window(dataset):
-  """Batches a window of dense tensors."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def shape_init_fn(_):
-    return array_ops.shape(first_element)
-
-  def shape_reduce_fn(state, value):
-    check_ops.assert_equal(state, array_ops.shape(value))
-    return state
-
-  def finalize_fn(state):
-    return state
-
-  dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
-  if dataset_output_shapes.is_fully_defined():
-    shape = dataset_output_shapes
-  else:
-    first_element = get_single_element.get_single_element(dataset.take(1))
-    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
-                                     finalize_fn)
-    shape = get_single_element.get_single_element(
-        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
-
-  def batch_init_fn(_):
-    batch_shape = array_ops.concat([[0], shape], 0)
-    return gen_array_ops.empty(
-        batch_shape, dtype=dataset_ops.get_legacy_output_types(dataset))
-
-  def batch_reduce_fn(state, value):
-    return array_ops.concat([state, [value]], 0)
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
-def _batch_sparse_window(dataset):
-  """Batches a window of sparse tensors."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def shape_init_fn(_):
-    return first_element.dense_shape
-
-  def shape_reduce_fn(state, value):
-    check_ops.assert_equal(state, value.dense_shape)
-    return state
-
-  def finalize_fn(state):
-    return state
-
-  dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
-  if dataset_output_shapes.is_fully_defined():
-    shape = dataset_output_shapes
-  else:
-    first_element = get_single_element.get_single_element(dataset.take(1))
-    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
-                                     finalize_fn)
-    shape = get_single_element.get_single_element(
-        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
-
-  def batch_init_fn(_):
-    indices_shape = array_ops.concat([[0], [array_ops.size(shape) + 1]], 0)
-    return sparse_tensor.SparseTensor(
-        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant(
-            [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset)),
-        dense_shape=array_ops.concat(
-            [np.array([0], dtype=np.int64),
-             math_ops.cast(shape, dtypes.int64)], 0))
-
-  def batch_reduce_fn(state, value):
-    return sparse_ops.sparse_concat(0, [state, value])
-
-  def reshape_fn(value):
-    return sparse_ops.sparse_reshape(
-        value,
-        array_ops.concat([np.array([1], dtype=np.int64), value.dense_shape], 0))
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.map(reshape_fn).apply(
-          grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
 @tf_export("data.experimental.dense_to_sparse_batch")
 def dense_to_sparse_batch(batch_size, row_shape):
   """A transformation that batches ragged elements into `tf.SparseTensor`s.
@@ -183,14 +61,13 @@
   ```
 
   Args:
-    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the
-      number of consecutive elements of this dataset to combine in a
-      single batch.
-    row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like
-      object representing the equivalent dense shape of a row in the
-      resulting `tf.SparseTensor`. Each element of this dataset must
-      have the same rank as `row_shape`, and must have size less
-      than or equal to `row_shape` in each dimension.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like object
+      representing the equivalent dense shape of a row in the resulting
+      `tf.SparseTensor`. Each element of this dataset must have the same rank as
+      `row_shape`, and must have size less than or equal to `row_shape` in each
+      dimension.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -203,412 +80,6 @@
   return _apply_fn
 
 
-def padded_batch_window(dataset, padded_shape, padding_value=None):
-  """Batches a window of tensors with padding.
-
-  Args:
-    dataset: the input dataset.
-    padded_shape: (Optional.) `tf.TensorShape` or `tf.int64` vector tensor-like
-      object representing the shape to which the input elements should be padded
-      prior to batching. Any unknown dimensions (e.g. `tf.Dimension(None)` in a
-      `tf.TensorShape` or `-1` in a tensor-like object) will be padded to the
-      maximum size of that dimension in each batch.
-    padding_value: (Optional.) A scalar-shaped `tf.Tensor`, representing the
-      padding value to use. Defaults are `0` for numeric types and the empty
-      string for string types. If `dataset` contains `tf.SparseTensor`, this
-      value is ignored.
-
-  Returns:
-    A `Tensor` representing the batch of the entire input dataset.
-
-  Raises:
-    ValueError: if invalid arguments are provided.
-  """
-  dataset_output_classes = dataset_ops.get_legacy_output_classes(dataset)
-  if not issubclass(dataset_output_classes,
-                    (ops.Tensor, sparse_tensor.SparseTensor)):
-    raise TypeError("Input dataset expected to have a single tensor component")
-  if issubclass(dataset_output_classes, (ops.Tensor)):
-    return _padded_batch_dense_window(dataset, padded_shape, padding_value)
-  elif issubclass(dataset_output_classes, (sparse_tensor.SparseTensor)):
-    if padding_value is not None:
-      raise ValueError("Padding value not allowed for sparse tensors")
-    return _padded_batch_sparse_window(dataset, padded_shape)
-  else:
-    raise TypeError("Unsupported dataset type: %s" % dataset_output_classes)
-
-
-def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
-  """Batches a window of dense tensors with padding."""
-
-  padded_shape = math_ops.cast(
-      convert.partial_shape_to_tensor(padded_shape), dtypes.int32)
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def max_init_fn(_):
-    return padded_shape
-
-  def max_reduce_fn(state, value):
-    """Computes the maximum shape to pad to."""
-    condition = math_ops.reduce_all(
-        math_ops.logical_or(
-            math_ops.less_equal(array_ops.shape(value), padded_shape),
-            math_ops.equal(padded_shape, -1)))
-    assert_op = control_flow_ops.Assert(condition, [
-        "Actual shape greater than padded shape: ",
-        array_ops.shape(value), padded_shape
-    ])
-    with ops.control_dependencies([assert_op]):
-      return math_ops.maximum(state, array_ops.shape(value))
-
-  def finalize_fn(state):
-    return state
-
-  # Compute the padded shape.
-  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
-  padded_shape = get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
-
-  dataset_output_types = dataset_ops.get_legacy_output_types(dataset)
-  if padding_value is None:
-    if dataset_output_types == dtypes.string:
-      padding_value = ""
-    elif dataset_output_types == dtypes.bool:
-      padding_value = False
-    elif dataset_output_types == dtypes.variant:
-      raise TypeError("Unable to create padding for field of type 'variant'")
-    else:
-      padding_value = 0
-
-  def batch_init_fn(_):
-    batch_shape = array_ops.concat(
-        [np.array([0], dtype=np.int32), padded_shape], 0)
-    return gen_array_ops.empty(batch_shape, dtype=dataset_output_types)
-
-  def batch_reduce_fn(state, value):
-    return array_ops.concat([state, [value]], 0)
-
-  def pad_fn(value):
-    shape = array_ops.shape(value)
-    left = array_ops.zeros_like(shape)
-    right = padded_shape - shape
-    return array_ops.pad(
-        value, array_ops.stack([left, right], 1), constant_values=padding_value)
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.map(pad_fn).apply(
-          grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
-def _padded_batch_sparse_window(dataset, padded_shape):
-  """Batches a window of sparse tensors with padding."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def max_init_fn(_):
-    return convert.partial_shape_to_tensor(padded_shape)
-
-  def max_reduce_fn(state, value):
-    """Computes the maximum shape to pad to."""
-    condition = math_ops.reduce_all(
-        math_ops.logical_or(
-            math_ops.less_equal(value.dense_shape, padded_shape),
-            math_ops.equal(padded_shape, -1)))
-    assert_op = control_flow_ops.Assert(condition, [
-        "Actual shape greater than padded shape: ", value.dense_shape,
-        padded_shape
-    ])
-    with ops.control_dependencies([assert_op]):
-      return math_ops.maximum(state, value.dense_shape)
-
-  def finalize_fn(state):
-    return state
-
-  # Compute the padded shape.
-  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
-  padded_shape = get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
-
-  def batch_init_fn(_):
-    indices_shape = array_ops.concat([[0], [array_ops.size(padded_shape) + 1]],
-                                     0)
-    return sparse_tensor.SparseTensor(
-        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant(
-            [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset)),
-        dense_shape=array_ops.concat(
-            [np.array([0], dtype=np.int64), padded_shape], 0))
-
-  def batch_reduce_fn(state, value):
-    padded_value = sparse_tensor.SparseTensor(
-        indices=value.indices, values=value.values, dense_shape=padded_shape)
-    reshaped_value = sparse_ops.sparse_reshape(
-        padded_value,
-        array_ops.concat(
-            [np.array([1], dtype=np.int64), padded_value.dense_shape], 0))
-    return sparse_ops.sparse_concat(0, [state, reshaped_value])
-
-  reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, reducer)))
-
-
-class _UnbatchDataset(dataset_ops.UnaryDataset):
-  """A dataset that splits the elements of its input into multiple elements."""
-
-  def __init__(self, input_dataset):
-    """See `unbatch()` for more details."""
-    input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
-    flat_shapes = nest.flatten(input_shapes)
-    if any(s.ndims == 0 for s in flat_shapes):
-      raise ValueError("Cannot unbatch an input with scalar components.")
-    known_batch_dim = tensor_shape.Dimension(None)
-    for s in flat_shapes:
-      try:
-        known_batch_dim = known_batch_dim.merge_with(s[0])
-      except ValueError:
-        raise ValueError("Cannot unbatch an input whose components have "
-                         "different batch sizes.")
-    self._input_dataset = input_dataset
-
-    self._structure = structure.convert_legacy_structure(
-        dataset_ops.get_legacy_output_types(input_dataset),
-        nest.map_structure(lambda s: s[1:], input_shapes),
-        dataset_ops.get_legacy_output_classes(input_dataset))
-
-    variant_tensor = ged_ops.experimental_unbatch_dataset(
-        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-    super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
-
-  @property
-  def _element_structure(self):
-    return self._structure
-
-
-@tf_export("data.experimental.unbatch")
-def unbatch():
-  """Splits elements of a dataset into multiple elements on the batch dimension.
-
-  For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
-  where `B` may vary for each input element, then for each element in the
-  dataset, the unbatched dataset will contain `B` consecutive elements
-  of shape `[a0, a1, ...]`.
-
-  ```python
-  # NOTE: The following example uses `{ ... }` to represent the contents
-  # of a dataset.
-  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
-
-  a.apply(tf.data.experimental.unbatch()) == {
-      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
-  ```
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-  """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
-    # are normalized to the rank-1 dense representation, so that the
-    # sparse-oblivious unbatching logic will slice them
-    # appropriately. This leads to a somewhat inefficient re-encoding step
-    # for all SparseTensor components.
-    # TODO(mrry): Consider optimizing this in future if it turns out to be
-    # a bottleneck.
-    def normalize(arg, *rest):
-      # pylint: disable=protected-access
-      if rest:
-        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
-      else:
-        return dataset._element_structure._to_batched_tensor_list(arg)
-
-    normalized_dataset = dataset.map(normalize)
-
-    # NOTE(mrry): Our `map()` has lost information about the sparseness
-    # of any SparseTensor components, so re-apply the structure of the
-    # original dataset.
-    restructured_dataset = _RestructuredDataset(
-        normalized_dataset,
-        dataset_ops.get_legacy_output_types(dataset),
-        dataset_ops.get_legacy_output_shapes(dataset),
-        dataset_ops.get_legacy_output_classes(dataset),
-        allow_unsafe_cast=True)
-    return _UnbatchDataset(restructured_dataset)
-
-  return _apply_fn
-
-
-class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that batches ragged dense elements into `tf.SparseTensor`s."""
-
-  def __init__(self, input_dataset, batch_size, row_shape):
-    """See `Dataset.dense_to_sparse_batch()` for more details."""
-    if not isinstance(
-        dataset_ops.get_legacy_output_types(input_dataset), dtypes.DType):
-      raise TypeError("DenseToSparseDataset requires an input whose elements "
-                      "have a single component, whereas the input has %r." %
-                      dataset_ops.get_legacy_output_types(input_dataset))
-    self._input_dataset = input_dataset
-    self._batch_size = batch_size
-    self._row_shape = row_shape
-    self._structure = structure.SparseTensorStructure(
-        dataset_ops.get_legacy_output_types(input_dataset),
-        tensor_shape.vector(None).concatenate(self._row_shape))
-
-    variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
-        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        self._batch_size,
-        row_shape=convert.partial_shape_to_tensor(self._row_shape),
-        **dataset_ops.flat_structure(self))
-    super(_DenseToSparseBatchDataset, self).__init__(input_dataset,
-                                                     variant_tensor)
-
-  @property
-  def _element_structure(self):
-    return self._structure
-
-
-class _RestructuredDataset(dataset_ops.UnaryDataset):
-  """An internal helper for changing the structure and shape of a dataset."""
-
-  def __init__(self,
-               dataset,
-               output_types,
-               output_shapes=None,
-               output_classes=None,
-               allow_unsafe_cast=False):
-    """Creates a new dataset with the given output types and shapes.
-
-    The given `dataset` must have a structure that is convertible:
-    * `dataset.output_types` must be the same as `output_types` module nesting.
-    * Each shape in `dataset.output_shapes` must be compatible with each shape
-      in `output_shapes` (if given).
-
-    Note: This helper permits "unsafe casts" for shapes, equivalent to using
-    `tf.Tensor.set_shape()` where domain-specific knowledge is available.
-
-    Args:
-      dataset: A `Dataset` object.
-      output_types: A nested structure of `tf.DType` objects.
-      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
-        If omitted, the shapes will be inherited from `dataset`.
-      output_classes: (Optional.) A nested structure of class types.
-        If omitted, the class types will be inherited from `dataset`.
-      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
-        reported output types and shapes of the restructured dataset, e.g. to
-        switch a sparse tensor represented as `tf.variant` to its user-visible
-        type and shape.
-
-    Raises:
-      ValueError: If either `output_types` or `output_shapes` is not compatible
-        with the structure of `dataset`.
-    """
-    self._input_dataset = dataset
-
-    input_types = dataset_ops.get_legacy_output_types(dataset)
-    if not allow_unsafe_cast:
-      # Validate that the types are compatible.
-      output_types = nest.map_structure(dtypes.as_dtype, output_types)
-      flat_original_types = nest.flatten(input_types)
-      flat_new_types = nest.flatten(output_types)
-      if flat_original_types != flat_new_types:
-        raise ValueError(
-            "Dataset with output types %r cannot be restructured to have "
-            "output types %r" %
-            (dataset_ops.get_legacy_output_types(dataset), output_types))
-
-    input_shapes = dataset_ops.get_legacy_output_shapes(dataset)
-    if output_shapes is None:
-      # Inherit shapes from the original `dataset`.
-      output_shapes = nest.pack_sequence_as(
-          output_types, nest.flatten(input_shapes))
-    else:
-      if not allow_unsafe_cast:
-        # Validate that the shapes are compatible.
-        nest.assert_same_structure(output_types, output_shapes)
-        flat_original_shapes = nest.flatten(input_shapes)
-        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
-
-        for original_shape, new_shape in zip(flat_original_shapes,
-                                             flat_new_shapes):
-          if not original_shape.is_compatible_with(new_shape):
-            raise ValueError(
-                "Dataset with output shapes %r cannot be restructured to have "
-                "incompatible output shapes %r" % (input_shapes,
-                                                   output_shapes))
-      output_shapes = nest.map_structure_up_to(
-          output_types, tensor_shape.as_shape, output_shapes)
-
-    input_classes = dataset_ops.get_legacy_output_classes(dataset)
-    if output_classes is None:
-      # Inherit class types from the original `dataset`.
-      output_classes = nest.pack_sequence_as(
-          output_types, nest.flatten(input_classes))
-
-    self._structure = structure.convert_legacy_structure(
-        output_types, output_shapes, output_classes)
-    variant_tensor = self._input_dataset._variant_tensor  # pylint: disable=protected-access
-    super(_RestructuredDataset, self).__init__(dataset, variant_tensor)
-
-  @property
-  def _element_structure(self):
-    return self._structure
-
-
-class _MapAndBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that maps a function over a batch of elements."""
-
-  def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
-               drop_remainder, use_legacy_function=False):
-    """See `Dataset.map()` for details."""
-    self._input_dataset = input_dataset
-
-    self._map_func = dataset_ops.StructuredFunctionWrapper(
-        map_func,
-        "tf.data.experimental.map_and_batch()",
-        dataset=input_dataset,
-        use_legacy_function=use_legacy_function)
-    self._batch_size_t = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
-    self._num_parallel_calls_t = ops.convert_to_tensor(
-        num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
-    self._drop_remainder_t = ops.convert_to_tensor(
-        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
-
-    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder_t)
-    if constant_drop_remainder:
-      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
-      # or `False` (explicitly retaining the remainder).
-      self._structure = self._map_func.output_structure._batch(  # pylint: disable=protected-access
-          tensor_util.constant_value(self._batch_size_t))
-    else:
-      self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
-    variant_tensor = ged_ops.experimental_map_and_batch_dataset(
-        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        self._map_func.function.captured_inputs,
-        f=self._map_func.function,
-        batch_size=self._batch_size_t,
-        num_parallel_calls=self._num_parallel_calls_t,
-        drop_remainder=self._drop_remainder_t,
-        preserve_cardinality=True,
-        **dataset_ops.flat_structure(self))
-    super(_MapAndBatchDataset, self).__init__(input_dataset, variant_tensor)
-
-  def _functions(self):
-    return [self._map_func]
-
-  @property
-  def _element_structure(self):
-    return self._structure
-
-
 @deprecation.deprecated(None, "Use `tf.data.experimental.map_and_batch()")
 @tf_export(v1=["data.experimental.map_and_batch_with_legacy_function"])
 def map_and_batch_with_legacy_function(map_func,
@@ -728,38 +199,250 @@
   return _apply_fn
 
 
-class _RebatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that divides the batch size by `num_workers`."""
+@tf_export("data.experimental.unbatch")
+def unbatch():
+  """Splits elements of a dataset into multiple elements on the batch dimension.
 
-  def __init__(self, input_dataset, num_workers):
+  For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
+  where `B` may vary for each input element, then for each element in the
+  dataset, the unbatched dataset will contain `B` consecutive elements
+  of shape `[a0, a1, ...]`.
+
+  ```python
+  # NOTE: The following example uses `{ ... }` to represent the contents
+  # of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.data.experimental.unbatch()) == {
+      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
+  ```
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+
+    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
+    # are normalized to the rank-1 dense representation, so that the
+    # sparse-oblivious unbatching logic will slice them
+    # appropriately. This leads to a somewhat inefficient re-encoding step
+    # for all SparseTensor components.
+    # TODO(mrry): Consider optimizing this in future if it turns out to be
+    # a bottleneck.
+    def normalize(arg, *rest):
+      # pylint: disable=protected-access
+      if rest:
+        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
+      else:
+        return dataset._element_structure._to_batched_tensor_list(arg)
+
+    normalized_dataset = dataset.map(normalize)
+
+    # NOTE(mrry): Our `map()` has lost information about the sparseness
+    # of any SparseTensor components, so re-apply the structure of the
+    # original dataset.
+    restructured_dataset = _RestructuredDataset(
+        normalized_dataset,
+        dataset_ops.get_legacy_output_types(dataset),
+        dataset_ops.get_legacy_output_shapes(dataset),
+        dataset_ops.get_legacy_output_classes(dataset),
+        allow_unsafe_cast=True)
+    return _UnbatchDataset(restructured_dataset)
+
+  return _apply_fn
+
+
+class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that batches ragged dense elements into `tf.SparseTensor`s."""
+
+  def __init__(self, input_dataset, batch_size, row_shape):
+    """See `Dataset.dense_to_sparse_batch()` for more details."""
+    if not isinstance(
+        dataset_ops.get_legacy_output_types(input_dataset), dtypes.DType):
+      raise TypeError("DenseToSparseDataset requires an input whose elements "
+                      "have a single component, whereas the input has %r." %
+                      dataset_ops.get_legacy_output_types(input_dataset))
+    self._input_dataset = input_dataset
+    self._batch_size = batch_size
+    self._row_shape = row_shape
+    self._structure = structure.SparseTensorStructure(
+        dataset_ops.get_legacy_output_types(input_dataset),
+        tensor_shape.vector(None).concatenate(self._row_shape))
+
+    variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._batch_size,
+        row_shape=convert.partial_shape_to_tensor(self._row_shape),
+        **dataset_ops.flat_structure(self))
+    super(_DenseToSparseBatchDataset, self).__init__(input_dataset,
+                                                     variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
+class _MapAndBatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that maps a function over a batch of elements."""
+
+  def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
+               drop_remainder, use_legacy_function=False):
+    """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
 
-    def recalculate_output_shapes(output_shapes):
-      """Recalculates the output_shapes after dividing it by num_workers."""
-      if len(output_shapes) < 1:
-        raise ValueError("Input shape should have at least one dimension.")
-      if (tensor_shape.dimension_value(output_shapes[0]) and
-          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
-        raise errors.InvalidArgumentError(
-            None, None,
-            "First dim of input shape: %d is not divisible by num_workers: %d" %
-            (output_shapes[0], num_workers))
-      output_dims = [d for d in output_shapes.dims]
-      output_dims[0] = output_dims[0] // num_workers
-      return tensor_shape.TensorShape(output_dims)
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func,
+        "tf.data.experimental.map_and_batch()",
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
+    self._batch_size_t = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    self._num_parallel_calls_t = ops.convert_to_tensor(
+        num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
+    self._drop_remainder_t = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-    input_types = dataset_ops.get_legacy_output_types(self._input_dataset)
-    input_shapes = dataset_ops.get_legacy_output_shapes(self._input_dataset)
-    input_classes = dataset_ops.get_legacy_output_classes(self._input_dataset)
-    output_shapes = nest.map_structure(recalculate_output_shapes, input_shapes)
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder_t)
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = self._map_func.output_structure._batch(  # pylint: disable=protected-access
+          tensor_util.constant_value(self._batch_size_t))
+    else:
+      self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_map_and_batch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
+        batch_size=self._batch_size_t,
+        num_parallel_calls=self._num_parallel_calls_t,
+        drop_remainder=self._drop_remainder_t,
+        preserve_cardinality=True,
+        **dataset_ops.flat_structure(self))
+    super(_MapAndBatchDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
+class _RestructuredDataset(dataset_ops.UnaryDataset):
+  """An internal helper for changing the structure and shape of a dataset."""
+
+  def __init__(self,
+               dataset,
+               output_types,
+               output_shapes=None,
+               output_classes=None,
+               allow_unsafe_cast=False):
+    """Creates a new dataset with the given output types and shapes.
+
+    The given `dataset` must have a structure that is convertible:
+    * `dataset.output_types` must be the same as `output_types` modulo nesting.
+    * Each shape in `dataset.output_shapes` must be compatible with each shape
+      in `output_shapes` (if given).
+
+    Note: This helper permits "unsafe casts" for shapes, equivalent to using
+    `tf.Tensor.set_shape()` where domain-specific knowledge is available.
+
+    Args:
+      dataset: A `Dataset` object.
+      output_types: A nested structure of `tf.DType` objects.
+      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
+        If omitted, the shapes will be inherited from `dataset`.
+      output_classes: (Optional.) A nested structure of class types. If omitted,
+        the class types will be inherited from `dataset`.
+      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
+        reported output types and shapes of the restructured dataset, e.g. to
+        switch a sparse tensor represented as `tf.variant` to its user-visible
+        type and shape.
+
+    Raises:
+      ValueError: If either `output_types` or `output_shapes` is not compatible
+        with the structure of `dataset`.
+    """
+    self._input_dataset = dataset
+
+    input_types = dataset_ops.get_legacy_output_types(dataset)
+    if not allow_unsafe_cast:
+      # Validate that the types are compatible.
+      output_types = nest.map_structure(dtypes.as_dtype, output_types)
+      flat_original_types = nest.flatten(input_types)
+      flat_new_types = nest.flatten(output_types)
+      if flat_original_types != flat_new_types:
+        raise ValueError(
+            "Dataset with output types %r cannot be restructured to have "
+            "output types %r" %
+            (dataset_ops.get_legacy_output_types(dataset), output_types))
+
+    input_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+    if output_shapes is None:
+      # Inherit shapes from the original `dataset`.
+      output_shapes = nest.pack_sequence_as(
+          output_types, nest.flatten(input_shapes))
+    else:
+      if not allow_unsafe_cast:
+        # Validate that the shapes are compatible.
+        nest.assert_same_structure(output_types, output_shapes)
+        flat_original_shapes = nest.flatten(input_shapes)
+        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
+
+        for original_shape, new_shape in zip(flat_original_shapes,
+                                             flat_new_shapes):
+          if not original_shape.is_compatible_with(new_shape):
+            raise ValueError(
+                "Dataset with output shapes %r cannot be restructured to have "
+                "incompatible output shapes %r" % (input_shapes,
+                                                   output_shapes))
+      output_shapes = nest.map_structure_up_to(
+          output_types, tensor_shape.as_shape, output_shapes)
+
+    input_classes = dataset_ops.get_legacy_output_classes(dataset)
+    if output_classes is None:
+      # Inherit class types from the original `dataset`.
+      output_classes = nest.pack_sequence_as(
+          output_types, nest.flatten(input_classes))
 
     self._structure = structure.convert_legacy_structure(
-        input_types, output_shapes, input_classes)
-    variant_tensor = ged_ops.experimental_rebatch_dataset(
+        output_types, output_shapes, output_classes)
+    variant_tensor = self._input_dataset._variant_tensor  # pylint: disable=protected-access
+    super(_RestructuredDataset, self).__init__(dataset, variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
+class _UnbatchDataset(dataset_ops.UnaryDataset):
+  """A dataset that splits the elements of its input into multiple elements."""
+
+  def __init__(self, input_dataset):
+    """See `unbatch()` for more details."""
+    input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
+    flat_shapes = nest.flatten(input_shapes)
+    if any(s.ndims == 0 for s in flat_shapes):
+      raise ValueError("Cannot unbatch an input with scalar components.")
+    known_batch_dim = tensor_shape.Dimension(None)
+    for s in flat_shapes:
+      try:
+        known_batch_dim = known_batch_dim.merge_with(s[0])
+      except ValueError:
+        raise ValueError("Cannot unbatch an input whose components have "
+                         "different batch sizes.")
+    self._input_dataset = input_dataset
+
+    self._structure = dataset_ops.get_structure(input_dataset)._unbatch()  # pylint: disable=protected-access
+
+    variant_tensor = ged_ops.experimental_unbatch_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        num_workers=num_workers,
         **dataset_ops.flat_structure(self))
-    super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
+    super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/distribute.py b/tensorflow/python/data/experimental/ops/distribute.py
index 909e5fa..d5e85a1 100644
--- a/tensorflow/python/data/experimental/ops/distribute.py
+++ b/tensorflow/python/data/experimental/ops/distribute.py
@@ -18,6 +18,10 @@
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
@@ -61,4 +65,42 @@
       _AutoShardDataset(input_dataset, num_workers, index))
 
 
+class _RebatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that divides the batch size by `num_workers`."""
+
+  def __init__(self, input_dataset, num_workers):
+    self._input_dataset = input_dataset
+
+    def recalculate_output_shapes(output_shapes):
+      """Recalculates the output_shapes after dividing it by num_workers."""
+      if len(output_shapes) < 1:
+        raise ValueError("Input shape should have at least one dimension.")
+      if (tensor_shape.dimension_value(output_shapes[0]) and
+          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
+        raise errors.InvalidArgumentError(
+            None, None,
+            "First dim of input shape: %d is not divisible by num_workers: %d" %
+            (output_shapes[0], num_workers))
+      output_dims = [d for d in output_shapes.dims]
+      output_dims[0] = output_dims[0] // num_workers
+      return tensor_shape.TensorShape(output_dims)
+
+    input_types = dataset_ops.get_legacy_output_types(self._input_dataset)
+    input_shapes = dataset_ops.get_legacy_output_shapes(self._input_dataset)
+    input_classes = dataset_ops.get_legacy_output_classes(self._input_dataset)
+    output_shapes = nest.map_structure(recalculate_output_shapes, input_shapes)
+
+    self._structure = structure.convert_legacy_structure(
+        input_types, output_shapes, input_classes)
+    variant_tensor = ged_ops.experimental_rebatch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_workers=num_workers,
+        **dataset_ops.flat_structure(self))
+    super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
 _AutoShardDatasetV1.__doc__ = _AutoShardDataset.__doc__
diff --git a/tensorflow/python/data/experimental/ops/distribute_options.py b/tensorflow/python/data/experimental/ops/distribute_options.py
new file mode 100644
index 0000000..3c5b4a6
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/distribute_options.py
@@ -0,0 +1,55 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling distribution in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.DistributeOptions")
+class DistributeOptions(options.OptionsBase):
+  """Represents options for distributed data processing.
+
+  You can set the distribution options of a dataset through the
+  `experimental_distribute` property of `tf.data.Options`; the property is
+  an instance of `tf.data.experimental.DistributeOptions`.
+
+  ```python
+  options = tf.data.Options()
+  options.experimental_distribute.auto_shard = False
+  dataset = dataset.with_options(options)
+  ```
+  """
+
+  auto_shard = options.create_option(
+      name="auto_shard",
+      ty=bool,
+      docstring=
+      "Whether the dataset should be automatically sharded when processed "
+      "in a distributed fashion. This is applicable when using Keras with "
+      "multi-worker/TPU distribution strategy, and by "
+      "using strategy.experimental_distribute_dataset(). In other cases, this "
+      "option does nothing. If None, defaults to True.",
+      default_factory=lambda: True)
+
+  num_devices = options.create_option(
+      name="num_devices",
+      ty=int,
+      docstring=
+      "The number of devices attached to this input pipeline. This will be "
+      "automatically set by MultiDeviceIterator.")
diff --git a/tensorflow/python/data/experimental/ops/enumerate_ops.py b/tensorflow/python/data/experimental/ops/enumerate_ops.py
index 04d875c..67cb0f1 100644
--- a/tensorflow/python/data/experimental/ops/enumerate_ops.py
+++ b/tensorflow/python/data/experimental/ops/enumerate_ops.py
@@ -17,13 +17,11 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
+@deprecation.deprecated(None, "Use `tf.data.Dataset.enumerate()`")
 @tf_export("data.experimental.enumerate_dataset")
 def enumerate_dataset(start=0):
   """A transformation that enumerates the elements of a dataset.
@@ -39,8 +37,10 @@
 
   # The nested structure of the `datasets` argument determines the
   # structure of elements in the resulting dataset.
-  a.apply(tf.data.experimental.enumerate(start=5)) == { (5, 1), (6, 2), (7, 3) }
-  b.apply(tf.data.experimental.enumerate()) == { (0, (7, 8)), (1, (9, 10)) }
+  a.apply(tf.data.experimental.enumerate_dataset(start=5))
+  => { (5, 1), (6, 2), (7, 3) }
+  b.apply(tf.data.experimental.enumerate_dataset())
+  => { (0, (7, 8)), (1, (9, 10)) }
   ```
 
   Args:
@@ -53,8 +53,6 @@
   """
 
   def _apply_fn(dataset):
-    max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max
-    return dataset_ops.Dataset.zip((dataset_ops.Dataset.range(start, max_value),
-                                    dataset))
+    return dataset.enumerate(start)
 
   return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
index eab29c7..8de41b3 100644
--- a/tensorflow/python/data/experimental/ops/error_ops.py
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -33,8 +33,9 @@
   ```python
   dataset = tf.data.Dataset.from_tensor_slices([1., 2., 0., 4.])
 
-  # Computing `tf.check_numerics(1. / 0.)` will raise an InvalidArgumentError.
-  dataset = dataset.map(lambda x: tf.check_numerics(1. / x, "error"))
+  # Computing `tf.debugging.check_numerics(1. / 0.)` will raise an
+  # InvalidArgumentError.
+  dataset = dataset.map(lambda x: tf.debugging.check_numerics(1. / x, "error"))
 
   # Using `ignore_errors()` will drop the element that causes an error.
   dataset =
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index 46c215d..3ca6492 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -27,13 +27,14 @@
   """Returns the single element in `dataset` as a nested structure of tensors.
 
   This function enables you to use a `tf.data.Dataset` in a stateless
-  "tensor-in tensor-out" expression, without creating a `tf.data.Iterator`.
+  "tensor-in tensor-out" expression, without creating a
+  `tf.compat.v1.data.Iterator`.
   This can be useful when your preprocessing transformations are expressed
   as a `Dataset`, and you want to use the transformation at serving time.
   For example:
 
   ```python
-  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
+  input_batch = tf.compat.v1.placeholder(tf.string, shape=[BATCH_SIZE])
 
   def preprocessing_fn(input_str):
     # ...
diff --git a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
deleted file mode 100644
index fdf3692..0000000
--- a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for indexed datasets."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.data.util import structure
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-
-
-class MaterializedIndexedDataset(object):
-  """MaterializedIndexedDataset is highly experimental!
-  """
-
-  def __init__(self, materialized_resource, materializer, output_classes,
-               output_types, output_shapes):
-    self._materialized_resource = materialized_resource
-    self._materializer = materializer
-    self._output_classes = output_classes
-    self._output_types = output_types
-    self._output_shapes = output_shapes
-
-  @property
-  def initializer(self):
-    if self._materializer is not None:
-      return self._materializer
-    raise ValueError("MaterializedDataset does not have a materializer")
-
-  def get(self, index):
-    """Get retrieves a value (or set of values) from the IndexedDataset.
-
-    Args:
-      index: A uint64 scalar or vector tensor with the indices to retrieve.
-
-    Returns:
-      A tensor containing the values corresponding to `index`.
-    """
-    # TODO(saeta): nest.pack_sequence_as(...)
-    return ged_ops.experimental_indexed_dataset_get(
-        self._materialized_resource,
-        index,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self._output_types, self._output_classes)),
-        output_shapes=nest.flatten(
-            sparse.as_dense_types(self._output_shapes, self._output_classes)))
-
-
-# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
-class IndexedDataset(dataset_ops.Dataset):
-  """IndexedDataset is highly experimental!
-  """
-
-  def __init__(self):
-    pass
-
-  def materialize(self, shared_name=None, container=None):
-    """Materialize creates a MaterializedIndexedDataset.
-
-    IndexedDatasets can be combined through operations such as TBD. Therefore,
-    they are only materialized when absolutely required.
-
-    Args:
-      shared_name: a string for the shared name to use for the resource.
-      container: a string for the container to store the resource.
-
-    Returns:
-      A MaterializedIndexedDataset.
-    """
-    if container is None:
-      container = ""
-    if shared_name is None:
-      shared_name = ""
-    materialized_resource = (
-        ged_ops.experimental_materialized_index_dataset_handle(
-            container=container,
-            shared_name=shared_name,
-            **dataset_ops.flat_structure(self)))
-
-    with ops.colocate_with(materialized_resource):
-      materializer = ged_ops.experimental_indexed_dataset_materialize(
-          self._as_variant_tensor(), materialized_resource)
-    return MaterializedIndexedDataset(materialized_resource, materializer,
-                                      self.output_classes, self.output_types,
-                                      self.output_shapes)
-
-  @abc.abstractmethod
-  def _as_variant_tensor(self):
-    """Creates a `tf.variant` `tf.Tensor` representing this IndexedDataset.
-
-    Returns:
-      A scalar `tf.Tensor` of `tf.variant` type, which represents this
-      IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset._as_variant_tensor")
-
-
-# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
-class IdentityIndexedDataset(IndexedDataset):
-  """IdentityIndexedDataset is a trivial indexed dataset used for testing.
-  """
-
-  def __init__(self, size):
-    super(IdentityIndexedDataset, self).__init__()
-    # TODO(saeta): Verify _size is a scalar!
-    self._size = ops.convert_to_tensor(size, dtype=dtypes.uint64, name="size")
-
-  @property
-  def _element_structure(self):
-    return structure.TensorStructure(dtypes.uint64, [])
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_identity_indexed_dataset(self._size)
-
-  def _inputs(self):
-    return []
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index 0480ac4..3ee90ce 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -153,7 +153,7 @@
       `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A dataset that interleaves elements from `datasets` at random, according to
diff --git a/tensorflow/python/data/experimental/ops/iterator_ops.py b/tensorflow/python/data/experimental/ops/iterator_ops.py
index 20172f2..2735281 100644
--- a/tensorflow/python/data/experimental/ops/iterator_ops.py
+++ b/tensorflow/python/data/experimental/ops/iterator_ops.py
@@ -44,8 +44,8 @@
     saveable_obj = tf.data.experimental.make_saveable_from_iterator(iterator)
     # Add the SaveableObject to the SAVEABLE_OBJECTS collection so
     # it can be automatically saved using Saver.
-    tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
-    saver = tf.train.Saver()
+    tf.compat.v1.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
+    saver = tf.compat.v1.train.Saver()
 
     while continue_training:
       ... Perform training ...
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index 11123c2..20f72cd 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -153,6 +153,12 @@
       docstring=
       "Whether to eliminate no-op transformations. If None, defaults to True.")
 
+  parallel_batch = options.create_option(
+      name="parallel_batch",
+      ty=bool,
+      docstring="Whether to parallelize copying of batch elements. If None, "
+      "defaults to False.")
+
   shuffle_and_repeat_fusion = options.create_option(
       name="shuffle_and_repeat_fusion",
       ty=bool,
@@ -171,6 +177,7 @@
         "map_parallelization",
         "map_fusion",
         "noop_elimination",
+        "parallel_batch",
         "shuffle_and_repeat_fusion",
     ]
     for optimization in all_optimizations:
@@ -178,8 +185,8 @@
         result.add(optimization)
 
     if self.apply_default_optimizations is not False:
-      # The following optimizations are turned on by default, unless the
-      # user explicitly disables them.
+      # The following optimizations are turned on by default, unless the user
+      # explicitly disables them.
       optimizations_to_disable = [
           "map_and_batch_fusion",
           "noop_elimination",
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index f6cf2ce..381cdc4 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -108,7 +108,7 @@
   and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
   `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
   and `SparseFeature` is mapped to a `SparseTensor`, and each
-  `FixedLenFeature` is mapped to a `Tensor`. See `tf.parse_example` for more
+  `FixedLenFeature` is mapped to a `Tensor`. See `tf.io.parse_example` for more
   details about feature dictionaries.
 
   Args:
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index a631fa6..d1d3845 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -235,7 +235,7 @@
 
   Args:
     file_pattern: List of files or patterns of TFRecord file paths.
-      See `tf.gfile.Glob` for pattern rules.
+      See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     parser_fn: (Optional.) A function accepting string input to parse
@@ -340,7 +340,7 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing CSV
-      records. See `tf.gfile.Glob` for pattern rules.
+      records. See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     column_names: An optional list of strings that corresponds to the CSV
@@ -583,7 +583,7 @@
     We can construct a CsvDataset from it as follows:
 
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
 
      dataset = tf.data.experimental.CsvDataset(
         "my_file*.csv",
@@ -768,11 +768,11 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing
-      `Example` records. See `tf.gfile.Glob` for pattern rules.
+      `Example` records. See `tf.io.gfile.glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
-      `VarLenFeature` values. See `tf.parse_example`.
+      `VarLenFeature` values. See `tf.io.parse_example`.
     reader: A function or class that can be
       called with a `filenames` tensor and (optional) `reader_args` and returns
       a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
@@ -808,7 +808,7 @@
     Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.
 
   Raises:
-    TypeError: If `reader` is a `tf.ReaderBase` subclass.
+    TypeError: If `reader` is a `tf.compat.v1.ReaderBase` subclass.
     ValueError: If `label_key` is not one of the `features` keys.
   """
   # Create dataset of all matching filenames
@@ -934,7 +934,7 @@
     For example:
 
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
 
     dataset = tf.data.experimental.SqlDataset("sqlite", "/foo/bar.sqlite3",
                                               "SELECT name, age FROM people",
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index 98f682e..e68531d 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -79,7 +79,7 @@
       indefinitely.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/snapshot.py b/tensorflow/python/data/experimental/ops/snapshot.py
new file mode 100644
index 0000000..723eb7e
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/snapshot.py
@@ -0,0 +1,60 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Dataset snapshot and related functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+
+
+class _SnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A Dataset that captures a snapshot or reads from a snapshot."""
+
+  def __init__(self, input_dataset, path):
+    self._input_dataset = input_dataset
+    self._path = ops.convert_to_tensor(path, dtype=dtypes.string, name="path")
+
+    variant_tensor = ged_ops.snapshot_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        path=self._path,
+        **dataset_ops.flat_structure(self))
+    super(_SnapshotDataset, self).__init__(input_dataset, variant_tensor)
+
+
+def snapshot(path):
+  """Writes to/reads from a snapshot of a dataset.
+
+  This function attempts to determine whether a valid snapshot exists at the
+  `path`, and reads from the snapshot if so. If not, it will run the
+  preprocessing pipeline as usual, and write out a snapshot of the data
+  processed for future use.
+
+  Args:
+    path: A directory where we want to save our snapshots and/or read from a
+      previously saved snapshot.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _SnapshotDataset(dataset, path)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index 73e4e4f..2fc87b1 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -115,7 +115,7 @@
   aggregator = tf.data.experimental.StatsAggregator()
   # ...
   stats_summary = aggregator.get_summary()
-  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+  tf.compat.v1.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
   ```
 
   Note: This interface is experimental and expected to change. In particular,
@@ -130,7 +130,8 @@
   def get_summary(self):
     """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
 
-    The returned tensor will contain a serialized `tf.summary.Summary` protocol
+    The returned tensor will contain a serialized `tf.compat.v1.summary.Summary`
+    protocol
     buffer, which can be used with the standard TensorBoard logging facilities.
 
     Returns:
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index dff4286..d1912f4 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -30,7 +30,7 @@
   """Set the given `stats_aggregator` for aggregating the input dataset stats.
 
   Args:
-    stats_aggregator: A `tf.contrib.data.StatsAggregator` object.
+    stats_aggregator: A `tf.data.experimental.StatsAggregator` object.
     prefix: (Optional) String, all statistics recorded for the input `dataset`
       will have given `prefix` prepend with the name.
     counter_prefix: (Optional) String, all statistics recorded as `counters`
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 2a4226b..088e769 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -24,6 +24,7 @@
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -104,6 +105,21 @@
 )
 
 tf_py_test(
+    name = "enumerate_test",
+    size = "small",
+    srcs = ["enumerate_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
     name = "filter_test",
     size = "small",
     srcs = ["filter_test.py"],
@@ -179,6 +195,7 @@
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/ops/ragged",
     ],
     grpc_enabled = True,
 )
@@ -263,6 +280,7 @@
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -420,6 +438,7 @@
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -549,7 +568,7 @@
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "reduce_test",
     size = "small",
     srcs = ["reduce_test.py"],
@@ -691,6 +710,8 @@
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/ops/ragged",
+        "//tensorflow/python/ops/ragged:ragged_test_util",
     ],
 )
 
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
index 30fdd4b..1d8e25b 100644
--- a/tensorflow/python/data/kernel_tests/batch_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -30,6 +30,10 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import test
 
 
@@ -169,6 +173,40 @@
             r'Cannot batch tensors with different shapes in component 0. First '
             r'element had shape \[3\] and element 2 had shape \[4\].'))
 
+  # Ragged Tensors.
+  def testBatchRagged(self):
+    """Batching same-shaped ragged elements produces ragged batches."""
+
+    def _as_ragged(value):
+      return ragged_tensor.RaggedTensor.from_tensor(value * [[1]])
+
+    batched = dataset_ops.Dataset.range(10).map(_as_ragged).batch(5)
+    expected_batches = [
+        ragged_factory_ops.constant([[[i]] for i in range(lo, lo + 5)])
+        for lo in (0, 5)]
+    self.assertDatasetProduces(batched, expected_output=expected_batches)
+
+  def testBatchRaggedWithDifferentShapes(self):
+    """Elements of different lengths are stacked into ragged batches."""
+    batched = dataset_ops.Dataset.range(10).map(ragged_math_ops.range).batch(5)
+    expected_batches = [
+        ragged_concat_ops.stack([ragged_math_ops.range(i) for i in rows])
+        for rows in (range(5), range(5, 10))
+    ]
+    self.assertDatasetProduces(batched, expected_output=expected_batches)
+
+  def testNestedBatchRagged(self):
+    """Batching twice nests the ragged elements one level deeper."""
+
+    def _as_ragged(value):
+      return ragged_tensor.RaggedTensor.from_tensor(value * [[1]])
+
+    batches = dataset_ops.Dataset.range(10).map(_as_ragged).batch(5)
+    nested = batches.batch(2)
+    rows = [[[[i]] for i in range(lo, lo + 5)] for lo in (0, 5)]
+    self.assertDatasetProduces(
+        nested, expected_output=[ragged_factory_ops.constant(rows)])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index e022449..7edcf07 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -18,6 +18,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from absl.testing import parameterized
 import numpy as np
 
@@ -29,12 +31,14 @@
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 
@@ -66,25 +70,27 @@
   def testAsFunctionWithMap(self):
     if not context.executing_eagerly():
       self.skipTest("Only works executing eagerly")
-    original_dataset = dataset_ops.Dataset.range(5).map(lambda x: x * 2)
-    fn = original_dataset._trace_variant_creation()
-    variant = fn()
+    with ops.device("CPU"):
+      original_dataset = dataset_ops.Dataset.range(5).map(lambda x: x * 2)
+      fn = original_dataset._trace_variant_creation()
+      variant = fn()
 
-    revived_dataset = _RevivedDataset(
-        variant, original_dataset._element_structure)
-    self.assertDatasetProduces(revived_dataset, range(0, 10, 2))
+      revived_dataset = _RevivedDataset(
+          variant, original_dataset._element_structure)
+      self.assertDatasetProduces(revived_dataset, range(0, 10, 2))
 
   def testAsFunctionWithMapInFlatMap(self):
     if not context.executing_eagerly():
       self.skipTest("Only works executing eagerly")
-    original_dataset = dataset_ops.Dataset.range(5).flat_map(
-        lambda x: dataset_ops.Dataset.range(5).map(lambda x: x * 2))
-    fn = original_dataset._trace_variant_creation()
-    variant = fn()
+    with ops.device("CPU"):
+      original_dataset = dataset_ops.Dataset.range(5).flat_map(
+          lambda x: dataset_ops.Dataset.range(5).map(lambda x: x * 2))
+      fn = original_dataset._trace_variant_creation()
+      variant = fn()
 
-    revived_dataset = _RevivedDataset(
-        variant, original_dataset._element_structure)
-    self.assertDatasetProduces(revived_dataset, list(original_dataset))
+      revived_dataset = _RevivedDataset(
+          variant, original_dataset._element_structure)
+      self.assertDatasetProduces(revived_dataset, list(original_dataset))
 
   @staticmethod
   def make_apply_fn(dataset):
@@ -203,6 +209,12 @@
     dataset_fn = self.make_interleave_fn(*interleave_fn_args)
     self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
 
+  def testNoWarnings(self):
+    with test.mock.patch.object(warnings, "warn") as warn_mock:
+      apply_fn = self.make_interleave_fn(dataset_ops.Dataset.range(10))
+      apply_fn(dataset_ops.Dataset.range(10))
+      self.assertEmpty(warn_mock.call_args_list)  # No warning was issued.
+
   @parameterized.named_parameters(
       ("Concatenate", lambda x, y: x.concatenate(y),
        lambda: dataset_ops.Dataset.range(0),
@@ -346,6 +358,51 @@
         self.assertEqual(val, foo.numpy())
         val += 1
 
+  def testDatasetAsFunctionArgument(self):
+    """A dataset passed to a traced function is an argument, not a capture."""
+    @def_function.function
+    def _uses_dataset(d):
+      accumulator = array_ops.zeros([], dtype=dtypes.int64)
+      for value in d:
+        accumulator += value
+      return accumulator
+
+    with ops.device("CPU"):
+      first_dataset = dataset_ops.Dataset.range(10)
+      self.assertEqual(45, self.evaluate(_uses_dataset(first_dataset)))
+      second_dataset = dataset_ops.Dataset.range(11)
+      self.assertEqual(55, self.evaluate(_uses_dataset(second_dataset)))
+      first_concrete = _uses_dataset.get_concrete_function(first_dataset)
+      # The dataset should not be a captured input.
+      self.assertEmpty(first_concrete.graph.captures)
+      # The two datasets have the same structure and so should re-use a trace.
+      self.assertIs(first_concrete,
+                    _uses_dataset.get_concrete_function(second_dataset))
+      # With a different structure we should use a different trace.
+      self.assertIsNot(
+          first_concrete,
+          _uses_dataset.get_concrete_function(
+              dataset_ops.Dataset.zip((first_dataset, second_dataset))))
+
+  def testLimitedRetracingWithCompositeTensors(self):
+    trace_count = [0]
+    # `trace_count` is bumped every time the Python body of `f` is executed.
+    @def_function.function
+    def f(ds):
+      trace_count[0] += 1
+      counter = np.int64(0)
+      for elem in ds:
+        counter += elem
+      return counter
+
+    dataset = dataset_ops.Dataset.range(5)
+    dataset2 = dataset_ops.Dataset.range(10)
+    # Calling `f` with either dataset must not run its Python body again.
+    for _ in range(10):
+      self.assertEqual(self.evaluate(f(dataset)), 10)
+      self.assertEqual(self.evaluate(f(dataset2)), 45)
+      self.assertEqual(trace_count[0], 1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/kernel_tests/enumerate_test.py
similarity index 86%
rename from tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
rename to tensorflow/python/data/kernel_tests/enumerate_test.py
index 8842f55..0666449 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/enumerate_test.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for `tf.data.experimental.enumerate_dataset()`."""
+"""Tests for `tf.data.Dataset.enumerate()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import enumerate_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -28,14 +27,14 @@
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class EnumerateDatasetTest(test_base.DatasetTestBase):
+class EnumerateTest(test_base.DatasetTestBase):
 
-  def testEnumerateDataset(self):
+  def testEnumerate(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
 
-    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
-        enumerate_ops.enumerate_dataset(start))
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).enumerate(
+        start)
 
     self.assertEqual(dtypes.int64,
                      dataset_ops.get_legacy_output_types(dataset)[0])
diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
index 6872f51..db04707 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -31,6 +31,8 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -145,6 +147,21 @@
 
     self.assertDatasetProduces(dataset, expected_output=expected_output)
 
+  def testRagged(self):
+    """Checks that `flat_map` accepts ragged elements produced by `map`."""
+
+    def _map_fn(i):
+      return ragged_tensor.RaggedTensor.from_tensor(i * [[1], [-1]])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          ragged_conversion_ops.to_tensor(x))
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+    # Each input `i` is flattened into the two rows `[i]` and `[-i]`.
+    expected_output = [[sign * i] for i in range(10) for sign in (1, -1)]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
index ef46f8e..69b3d9f 100644
--- a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -26,6 +26,8 @@
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import test
 
 
@@ -176,5 +178,79 @@
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  def testFromTensorSlicesRagged(self):
+    """Slicing a tuple of ragged values yields tuples of their rows."""
+    as_ragged = ragged_factory_ops.constant_value
+    components = (
+        as_ragged([[[0]], [[1]], [[2]]]),
+        as_ragged([[[3]], [[4]], [[5]]]),
+    )
+
+    sliced = dataset_ops.Dataset.from_tensor_slices(components)
+    expected_rows = [
+        (as_ragged([[i]]), as_ragged([[i + 3]])) for i in range(3)
+    ]
+    self.assertDatasetProduces(sliced, expected_output=expected_rows)
+
+  def testFromTensorSlicesMixedRagged(self):
+    """Slices a tuple that mixes dense, sparse and ragged components."""
+
+    dense_components = (
+        np.tile(np.array([[1], [2], [3]]), 20),
+        np.tile(np.array([[12], [13], [14]]), 22),
+        np.array([37.0, 38.0, 39.0]),
+    )
+    # The dense components are followed by two sparse ones and a ragged one.
+    components = dense_components + (
+        sparse_tensor.SparseTensorValue(
+            indices=np.array([[0, 0], [1, 0], [2, 0]]),
+            values=np.array([0, 0, 0]),
+            dense_shape=np.array([3, 1])),
+        sparse_tensor.SparseTensorValue(
+            indices=np.array([[0, 0], [1, 1], [2, 2]]),
+            values=np.array([1, 2, 3]),
+            dense_shape=np.array([3, 3])),
+        ragged_factory_ops.constant_value([[[0]], [[1]], [[2]]]),
+    )
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+
+    def _expected_non_dense(row):
+      """Returns the expected sparse, sparse and ragged slices for `row`."""
+      first_sparse = sparse_tensor.SparseTensorValue(
+          indices=np.array([[0]]),
+          values=np.array([0]),
+          dense_shape=np.array([1]))
+      second_sparse = sparse_tensor.SparseTensorValue(
+          indices=np.array([[row]]),
+          values=np.array([row + 1]),
+          dense_shape=np.array([3]))
+      ragged = ragged_factory_ops.constant_value([[row]])
+      return (first_sparse, second_sparse, ragged)
+
+    def _assert_component_equal(expected_component, actual_component):
+      """Compares one component with the assertion matching its kind."""
+      if sparse_tensor.is_sparse(expected_component):
+        self.assertSparseValuesEqual(expected_component, actual_component)
+      elif ragged_tensor.is_ragged(expected_component):
+        self.assertRaggedEqual(expected_component, actual_component)
+      else:
+        self.assertAllEqual(expected_component, actual_component)
+
+    for row in range(3):
+      actual = self.evaluate(get_next())
+      # The dense components are sliced with NumPy indexing; the expected
+      # sparse and ragged slices are spelled out explicitly.
+      expected_dense = tuple(dense[row] for dense in dense_components)
+      expected_row = expected_dense + _expected_non_dense(row)
+      for expected_component, actual_component in zip(expected_row, actual):
+        _assert_component_equal(expected_component, actual_component)
+
+    # Once the three slices are consumed the dataset is exhausted.
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
index ce8ba6d..3c61ec3 100644
--- a/tensorflow/python/data/kernel_tests/from_tensors_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -34,6 +34,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.platform import test
 
 
@@ -101,6 +102,32 @@
 
     self.assertDatasetProduces(dataset, expected_output=[components])
 
+  def testFromTensorsRagged(self):
+    """A tuple of ragged values round-trips as a single dataset element."""
+    element = (
+        ragged_factory_ops.constant_value([[[0]], [[1]], [[2]]]),
+        ragged_factory_ops.constant_value([[[3]], [[4]], [[5]]]),
+    )
+
+    self.assertDatasetProduces(
+        dataset_ops.Dataset.from_tensors(element), expected_output=[element])
+
+  def testFromTensorsMixedRagged(self):
+    """Dense, sparse and ragged values can share one dataset element."""
+    element = (
+        np.array(1), np.array([1, 2, 3]), np.array(37.0),
+        sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+        sparse_tensor.SparseTensorValue(
+            indices=np.array([[0, 0], [1, 1]]),
+            values=np.array([-1, 1]),
+            dense_shape=np.array([2, 2])),
+        ragged_factory_ops.constant_value([[[0]], [[1]], [[2]]]))
+    dataset = dataset_ops.Dataset.from_tensors(element)
+    self.assertDatasetProduces(dataset, expected_output=[element])
+
   # pylint: disable=g-long-lambda,unnecessary-lambda
   def testNestedStructure(self):
     components = (np.array([1, 2, 3], dtype=np.int64),
diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
index a5fdef2..3e38474 100644
--- a/tensorflow/python/data/kernel_tests/iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -732,7 +732,7 @@
     with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
       target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
       iterator_3_handle_uint8 = parsing_ops.decode_raw(
-          bytes=iterator_3_handle, out_type=dtypes.uint8)
+          input_bytes=iterator_3_handle, out_type=dtypes.uint8)
       remote_op = functional_ops.remote_call(
           args=[iterator_3_handle_uint8],
           Tout=[dtypes.int32],
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index fefebeb..a284171 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -50,6 +50,9 @@
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import test
 
 
@@ -759,6 +762,34 @@
         dataset,
         expected_output=[list(range(i)) for i in range(10)])
 
+  def testRagged(self):
+    """`map` may return a ragged tensor for every element."""
+
+    def _as_ragged(value):
+      return ragged_tensor.RaggedTensor.from_tensor(value * [[1]])
+
+    mapped = dataset_ops.Dataset.range(5).map(_as_ragged)
+    expected = [ragged_factory_ops.constant([[i]]) for i in range(5)]
+    self.assertDatasetProduces(mapped, expected_output=expected)
+
+  def testRaggedChain(self):
+    """A second `map` receives the ragged output of the first one."""
+
+    def _as_ragged(value):
+      return ragged_tensor.RaggedTensor.from_tensor(value * [[1]])
+
+    def _doubled(ragged):
+      # The element must still be ragged when it reaches the second function.
+      self.assertTrue(ragged_tensor.is_ragged(ragged))
+      return ragged_concat_ops.concat([ragged, ragged], 0)
+
+    chained = dataset_ops.Dataset.range(10).map(_as_ragged).map(_doubled)
+
+    expected = [
+        self.evaluate(_doubled(ragged_factory_ops.constant([[i]])))
+        for i in range(10)]
+    self.assertDatasetProduces(chained, expected_output=expected)
+
   @test_util.run_v1_only("b/123904513")
   def testParallelMapOutOfRangeError(self):
     def raising_py_func(i):
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index c379afc..08564b2 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -335,6 +335,46 @@
         self.evaluate(elem_on_2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class PrefetchWithSlackTest(test_base.DatasetTestBase):
+  """Tests for datasets configured with `options.experimental_slack`."""
+
+  @test_util.run_v1_only("b/121264236")
+  def testPrefetchWithSlack(self):
+    slack_options = dataset_ops.Options()
+    slack_options.experimental_slack = True
+    source = dataset_ops.Dataset.range(10).with_options(slack_options)
+    iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        source, ["/cpu:1", "/cpu:2"])
+    # Inspect the options of the dataset held by the iterator.
+    wrapped = iterator._dataset  # pylint: disable=protected-access
+    self.assertIn("slack", wrapped.options()._static_optimizations())
+    self.assertIn("slack:slack_period:2",
+                  wrapped.options()._static_optimization_configs())
+
+    three_cpus = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=three_cpus):
+      self.evaluate(iterator.initializer)
+      for first in range(0, 10, 2):
+        elem_on_1, elem_on_2 = iterator.get_next()
+        self.assertEqual(first, self.evaluate(elem_on_1))
+        self.assertEqual(first + 1, self.evaluate(elem_on_2))
+      with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = iterator.get_next()
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
+
+  def testPrefetchWithSlackWithoutIterator(self):
+    slack_options = dataset_ops.Options()
+    slack_options.experimental_slack = True
+    dataset = dataset_ops.Dataset.range(10).with_options(slack_options)
+    self.assertIn("slack", dataset.options()._static_optimizations())
+    self.assertIn("slack:slack_period:1",
+                  dataset.options()._static_optimization_configs())
+
+    self.assertDatasetProduces(dataset, range(10))
+
+
 if __name__ == "__main__":
   ops.enable_eager_execution(
       config=config_pb2.ConfigProto(device_count={"CPU": 3, "GPU": 1}))
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
index 846d9a6..a297036 100644
--- a/tensorflow/python/data/kernel_tests/reduce_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -25,6 +25,7 @@
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -190,6 +191,21 @@
     self.assertEqual(self.evaluate(fn()), b"hello")
     self.assertEqual(self.evaluate(counter_var), 4)
 
+  def testStateOnGPU(self):
+    """Reduces with a reduce function whose addition is placed on the GPU."""
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPUs available.")
+
+    def _add_on_gpu(total, element):
+      with ops.device("/gpu:0"):
+        return total + element
+
+    initial_state = constant_op.constant(0, dtype=dtypes.int64)
+    for upper in range(10):
+      total = dataset_ops.Dataset.range(1, upper + 1).reduce(
+          initial_state, _add_on_gpu)
+      self.assertEqual(((upper + 1) * upper) // 2, self.evaluate(total))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index d18d247..07ea217 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -28,10 +28,12 @@
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import test
 
 
-class DatasetTestBase(test.TestCase):
+class DatasetTestBase(ragged_test_util.RaggedTensorTestCase, test.TestCase):
   """Base class for dataset tests."""
 
   @classmethod
@@ -100,6 +102,8 @@
           nest.flatten(result_values[i]), nest.flatten(expected_values[i])):
         if sparse_tensor.is_sparse(result_value):
           self.assertSparseValuesEqual(result_value, expected_value)
+        elif ragged_tensor.is_ragged(result_value):
+          self.assertRaggedEqual(result_value, expected_value)
         else:
           self.assertAllEqual(
               result_value,
@@ -198,6 +202,8 @@
       for i in range(len(op1)):
         if sparse_tensor.is_sparse(op1[i]):
           self.assertSparseValuesEqual(op1[i], op2[i])
+        elif ragged_tensor.is_ragged(op1[i]):
+          self.assertRaggedEqual(op1[i], op2[i])
         elif flattened_types[i] == dtypes.string:
           self.assertAllEqual(op1[i], op2[i])
         else:
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index d018ba2..4ce3543 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -26,6 +26,7 @@
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:distribute_options",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/experimental/ops:threading_options",
@@ -35,6 +36,7 @@
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/data/util:traverse",
+        "//tensorflow/python/ops/ragged",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 126665f..00a972a 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -30,6 +30,7 @@
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.ops import distribute_options
 from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.experimental.ops import threading_options
@@ -42,6 +43,7 @@
 from tensorflow.python.data.util import traverse
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -79,7 +81,7 @@
 
 @tf_export("data.Dataset", v1=[])
 @six.add_metaclass(abc.ABCMeta)
-class DatasetV2(tracking_base.Trackable):
+class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor):
   """Represents a potentially large set of elements.
 
   A `Dataset` can be used to represent an input pipeline as a
@@ -141,7 +143,7 @@
       raise NotImplementedError(
           "Can only export Datasets which were created executing eagerly. "
           "Please file a feature request if this is important to you.")
-    with context.eager_mode():
+    with context.eager_mode(), ops.device("CPU"):
       graph_def = graph_pb2.GraphDef().FromString(
           self._as_serialized_graph().numpy())  # pylint: disable=protected-access
     output_node_name = None
@@ -236,7 +238,10 @@
       if t_options.private_threadpool_size is not None:
         dataset = _PrivateThreadPoolDataset(dataset,
                                             t_options.private_threadpool_size)
-    static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    static_optimizations = options._static_optimizations()
+    static_optimization_configs = options._static_optimization_configs()
+    # pylint: enable=protected-access
     if static_optimizations:
       if self._has_captured_ref():
         warnings.warn(
@@ -247,7 +252,7 @@
             ", ".join(static_optimizations))
       else:
         dataset = _OptimizeDataset(dataset, static_optimizations,
-                                   options._static_optimization_configs())  # pylint: disable=protected-access
+                                   static_optimization_configs)
 
     autotune = True
     cpu_budget = 0  # Indicates that all CPU cores should be used.
@@ -299,6 +304,24 @@
     return ("<%s shapes: %s, types: %s>" % (type(self).__name__, output_shapes,
                                             output_types))
 
+  def _to_components(self):
+    return [self._variant_tensor]  # The variant tensor is the sole component.
+
+  def _component_metadata(self):
+    return self._element_structure  # Element structure serves as metadata.
+
+  @classmethod
+  def _from_components(cls, components, metadata):
+    return _VariantDataset(components[0], metadata)  # `cls` is not consulted.
+
+  def _shape_invariant_to_components(self, shape=None):
+    del shape  # Unused: the result does not depend on the requested shape.
+    return tensor_shape.TensorShape([])  # dataset component is always a scalar.
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._variant_tensor, "graph")
+
   @staticmethod
   def from_tensors(tensors):
     """Creates a `Dataset` with a single element, comprising the given tensors.
@@ -389,7 +412,7 @@
 
     ```python
     import itertools
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
 
     def gen():
       for i in itertools.count(1):
@@ -405,7 +428,7 @@
     ```
 
     NOTE: The current implementation of `Dataset.from_generator()` uses
-    `tf.py_func` and inherits the same constraints. In particular, it
+    `tf.compat.v1.py_func` and inherits the same constraints. In particular, it
     requires the `Dataset`- and `Iterator`-related operations to be placed
     on a device in the same process as the Python program that called
     `Dataset.from_generator()`. The body of `generator` will not be
@@ -423,14 +446,13 @@
 
     Args:
       generator: A callable object that returns an object that supports the
-        `iter()` protocol. If `args` is not specified, `generator` must take
-        no arguments; otherwise it must take as many arguments as there are
-        values in `args`.
+        `iter()` protocol. If `args` is not specified, `generator` must take no
+        arguments; otherwise it must take as many arguments as there are values
+        in `args`.
       output_types: A nested structure of `tf.DType` objects corresponding to
         each component of an element yielded by `generator`.
-      output_shapes: (Optional.) A nested structure of `tf.TensorShape`
-        objects corresponding to each component of an element yielded by
-        `generator`.
+      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
+        corresponding to each component of an element yielded by `generator`.
       args: (Optional.) A tuple of `tf.Tensor` objects that will be evaluated
         and passed to `generator` as NumPy-array arguments.
 
@@ -478,8 +500,8 @@
       `iterator_id_t`, and raise `StopIteration` to terminate the iteration.
 
       Args:
-        iterator_id_t: A `tf.int64` tensor whose value uniquely identifies
-          the iterator in `generator_state` from which to generate an element.
+        iterator_id_t: A `tf.int64` tensor whose value uniquely identifies the
+          iterator in `generator_state` from which to generate an element.
 
       Returns:
         A nested structure of tensors representing an element from the iterator.
@@ -621,27 +643,25 @@
     For example:
 
     ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6 }
-    c = { (7, 8), (9, 10), (11, 12) }
-    d = { 13, 14 }
+    a = Dataset.range(1, 4)  # ==> [ 1, 2, 3 ]
+    b = Dataset.range(4, 7)  # ==> [ 4, 5, 6 ]
+    c = Dataset.range(7, 13).batch(2)  # ==> [ [7, 8], [9, 10], [11, 12] ]
+    d = Dataset.range(13, 15)  # ==> [ 13, 14 ]
 
     # The nested structure of the `datasets` argument determines the
     # structure of elements in the resulting dataset.
-    Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
-    Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }
+    Dataset.zip((a, b))  # ==> [ (1, 4), (2, 5), (3, 6) ]
+    Dataset.zip((b, a))  # ==> [ (4, 1), (5, 2), (6, 3) ]
 
     # The `datasets` argument may contain an arbitrary number of
     # datasets.
-    Dataset.zip((a, b, c)) == { (1, 4, (7, 8)),
-                                (2, 5, (9, 10)),
-                                (3, 6, (11, 12)) }
+    Dataset.zip((a, b, c))  # ==> [ (1, 4, [7, 8]),
+                            #       (2, 5, [9, 10]),
+                            #       (3, 6, [11, 12]) ]
 
     # The number of elements in the resulting dataset is the same as
     # the size of the smallest dataset in `datasets`.
-    Dataset.zip((a, d)) == { (1, 13), (2, 14) }
+    Dataset.zip((a, d))  # ==> [ (1, 13), (2, 14) ]
     ```
 
     Args:
@@ -656,18 +676,16 @@
     """Creates a `Dataset` by concatenating given dataset with this dataset.
 
     ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6, 7 }
+    a = Dataset.range(1, 4)  # ==> [ 1, 2, 3 ]
+    b = Dataset.range(4, 8)  # ==> [ 4, 5, 6, 7 ]
 
     # Input dataset and dataset to be concatenated should have same
     # nested structures and output types.
-    # c = { (8, 9), (10, 11), (12, 13) }
-    # d = { 14.0, 15.0, 16.0 }
+    # c = Dataset.range(8, 14).batch(2)  # ==> [ [8, 9], [10, 11], [12, 13] ]
+    # d = Dataset.from_tensor_slices([14.0, 15.0, 16.0])
     # a.concatenate(c) and a.concatenate(d) would result in error.
 
-    a.concatenate(b) == { 1, 2, 3, 4, 5, 6, 7 }
+    a.concatenate(b)  # ==> [ 1, 2, 3, 4, 5, 6, 7 ]
     ```
 
     Args:
@@ -685,8 +703,8 @@
     a batch and this operation will prefetch `buffer_size` batches.
 
     Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        maximum number of elements that will be buffered when prefetching.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the maximum
+        number of elements that will be buffered when prefetching.
 
     Returns:
       Dataset: A `Dataset`.
@@ -719,7 +737,7 @@
         Defaults to `True`.
       seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
         seed that will be used to create the distribution. See
-        `tf.set_random_seed` for behavior.
+        `tf.compat.v1.set_random_seed` for behavior.
 
     Returns:
      Dataset: A `Dataset` of strings corresponding to file names.
@@ -761,16 +779,40 @@
 
     Args:
       count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        number of times the dataset should be repeated. The default behavior
-        (if `count` is `None` or `-1`) is for the dataset be repeated
-        indefinitely.
+        number of times the dataset should be repeated. The default behavior (if
+        `count` is `None` or `-1`) is to repeat the dataset indefinitely.
 
     Returns:
       Dataset: A `Dataset`.
     """
     return RepeatDataset(self, count)
 
-  def _enumerate(self, start=0):
+  def enumerate(self, start=0):
+    """Enumerates the elements of this dataset.
+
+    It is similar to python's `enumerate`.
+
+    For example:
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { 1, 2, 3 }
+    b = { (7, 8), (9, 10) }
+
+    # The nested structure of the input dataset determines the structure of
+    # elements in the resulting dataset.
+    a.enumerate(start=5) == { (5, 1), (6, 2), (7, 3) }
+    b.enumerate() == { (0, (7, 8)), (1, (9, 10)) }
+    ```
+
+    Args:
+      start: A `tf.int64` scalar `tf.Tensor`, representing the start value for
+        enumeration.
+
+    Returns:
+      Dataset: A `Dataset`.
+    """
 
     max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max
     return Dataset.zip((Dataset.range(start, max_value), self))
@@ -790,12 +832,11 @@
     maintaining the 1,000 element buffer.
 
     Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        number of elements from this dataset from which the new
-        dataset will sample.
-      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        random seed that will be used to create the distribution. See
-        `tf.set_random_seed` for behavior.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        elements from this dataset from which the new dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
+        seed that will be used to create the distribution. See
+        `tf.compat.v1.set_random_seed` for behavior.
       reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
         that the dataset should be pseudorandomly reshuffled each time it is
         iterated over. (Defaults to `True`.)
@@ -836,11 +877,10 @@
     """Creates a `Dataset` that skips `count` elements from this dataset.
 
     Args:
-      count: A `tf.int64` scalar `tf.Tensor`, representing the number
-        of elements of this dataset that should be skipped to form the
-        new dataset.  If `count` is greater than the size of this
-        dataset, the new dataset will contain no elements.  If `count`
-        is -1, skips the entire dataset.
+      count: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        elements of this dataset that should be skipped to form the new dataset.
+        If `count` is greater than the size of this dataset, the new dataset
+        will contain no elements.  If `count` is -1, skips the entire dataset.
 
     Returns:
       Dataset: A `Dataset`.
@@ -947,9 +987,11 @@
     determines the resulting shape for each dimension of each component in an
     output element:
 
-    * If the dimension is a constant (e.g. `tf.Dimension(37)`), the component
+    * If the dimension is a constant (e.g. `tf.compat.v1.Dimension(37)`), the
+      component
       will be padded out to that length in that dimension.
-    * If the dimension is unknown (e.g. `tf.Dimension(None)`), the component
+    * If the dimension is unknown (e.g. `tf.compat.v1.Dimension(None)`), the
+      component
       will be padded out to the maximum length of all elements in that
       dimension.
 
@@ -959,17 +1001,16 @@
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
         consecutive elements of this dataset to combine in a single batch.
-      padded_shapes: A nested structure of `tf.TensorShape` or
-        `tf.int64` vector tensor-like objects representing the shape
-        to which the respective component of each input element should
-        be padded prior to batching. Any unknown dimensions
-        (e.g. `tf.Dimension(None)` in a `tf.TensorShape` or `-1` in a
-        tensor-like object) will be padded to the maximum size of that
-        dimension in each batch.
+      padded_shapes: A nested structure of `tf.TensorShape` or `tf.int64` vector
+        tensor-like objects representing the shape to which the respective
+        component of each input element should be padded prior to batching. Any
+        unknown dimensions (e.g. `tf.compat.v1.Dimension(None)` in a
+        `tf.TensorShape` or `-1` in a tensor-like object) will be padded to the
+        maximum size of that dimension in each batch.
       padding_values: (Optional.) A nested structure of scalar-shaped
-        `tf.Tensor`, representing the padding values to use for the
-        respective components.  Defaults are `0` for numeric types and
-        the empty string for string types.
+        `tf.Tensor`, representing the padding values to use for the respective
+        components.  Defaults are `0` for numeric types and the empty string for
+        string types.
       drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
         whether the last batch should be dropped in the case it has fewer than
         `batch_size` elements; the default behavior is not to drop the smaller
@@ -991,17 +1032,17 @@
     For example:
 
     ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3, 4, 5 }
+    a = Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
 
-    a.map(lambda x: x + 1) = { 2, 3, 4, 5, 6 }
+    a.map(lambda x: x + 1)  # ==> [ 2, 3, 4, 5, 6 ]
     ```
 
     The input signature of `map_func` is determined by the structure of each
     element in this dataset. For example:
 
     ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
     # Each element is a `tf.Tensor` object.
     a = { 1, 2, 3, 4, 5 }
     # `map_func` takes a single argument of type `tf.Tensor` with the same
@@ -1060,10 +1101,21 @@
     In addition to `tf.Tensor` objects, `map_func` can accept as arguments and
     return `tf.SparseTensor` objects.
 
+    Note that irrespective of the context in which `map_func` is defined (eager
+    vs. graph), tf.data traces the function and executes it as a graph. To use
+    Python code inside of the function you have two options:
+
+    1) Rely on AutoGraph to convert Python code into an equivalent graph
+    computation. The downside of this approach is that AutoGraph can convert
+    some but not all Python code.
+
+    2) Use `tf.py_function`, which allows you to write arbitrary Python code but
+    will generally result in worse performance than 1).
+
     Args:
-      map_func: A function mapping a nested structure of tensors (having
-        shapes and types defined by `self.output_shapes` and
-       `self.output_types`) to another nested structure of tensors.
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to
+        another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
         representing the number elements to process asynchronously in parallel.
         If not specified, elements will be processed sequentially. If the value
@@ -1087,12 +1139,10 @@
     dataset of their elements:
 
     ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset. '[...]' represents a tensor.
-    a = {[1,2,3,4,5], [6,7,8,9], [10]}
+    a = Dataset.from_tensor_slices([ [1, 2, 3], [4, 5, 6], [7, 8, 9] ])
 
-    a.flat_map(lambda x: Dataset.from_tensor_slices(x)) ==
-      {[1,2,3,4,5,6,7,8,9,10]}
+    a.flat_map(lambda x: Dataset.from_tensor_slices(x + 1)) # ==>
+    #  [ 2, 3, 4, 5, 6, 7, 8, 9, 10 ]
     ```
 
     `tf.data.Dataset.interleave()` is a generalization of `flat_map`, since
@@ -1143,24 +1193,20 @@
     For example:
 
     ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3, 4, 5 }
+    a = Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
 
     # NOTE: New lines indicate "block" boundaries.
     a.interleave(lambda x: Dataset.from_tensors(x).repeat(6),
-                 cycle_length=2, block_length=4) == {
-        1, 1, 1, 1,
-        2, 2, 2, 2,
-        1, 1,
-        2, 2,
-        3, 3, 3, 3,
-        4, 4, 4, 4,
-        3, 3,
-        4, 4,
-        5, 5, 5, 5,
-        5, 5,
-    }
+                cycle_length=2, block_length=4)  # ==> [1, 1, 1, 1,
+                                                 #      2, 2, 2, 2,
+                                                 #      1, 1,
+                                                 #      2, 2,
+                                                 #      3, 3, 3, 3,
+                                                 #      4, 4, 4, 4,
+                                                 #      3, 3,
+                                                 #      4, 4,
+                                                 #      5, 5, 5, 5,
+                                                 #      5, 5]
     ```
 
     NOTE: The order of elements yielded by this transformation is
@@ -1176,8 +1222,8 @@
         processed concurrently.
       block_length: The number of consecutive elements to produce from each
         input element before cycling to another input element.
-      num_parallel_calls: (Optional.) If specified, the implementation creates
-        a threadpool, which is used to fetch inputs from cycle elements
+      num_parallel_calls: (Optional.) If specified, the implementation creates a
+        threadpool, which is used to fetch inputs from cycle elements
         asynchronously and in parallel. The default behavior is to fetch inputs
         from cycle elements synchronously with no parallelism. If the value
         `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
@@ -1198,13 +1244,13 @@
     ```python
     d = tf.data.Dataset.from_tensor_slices([1, 2, 3])
 
-    d = d.filter(lambda x: x < 3) # [1, 2]
+    d = d.filter(lambda x: x < 3)  # ==> [1, 2]
 
     # `tf.math.equal(x, y)` is required for equality comparison
     def filter_fn(x):
       return tf.math.equal(x, 1)
 
-    d = d.filter(filter_fn) # [1]
+    d = d.filter(filter_fn)  # ==> [1]
     ```
 
     Args:
@@ -1250,16 +1296,17 @@
     return dataset
 
   def window(self, size, shift=None, stride=1, drop_remainder=False):
-    """Combines input elements into a dataset of windows.
+    """Combines (nests of) input elements into a dataset of (nests of) windows.
 
-    Each window is a dataset itself and contains `size` elements (or
-    possibly fewer if there are not enough input elements to fill the window
-    and `drop_remainder` evaluates to false).
+    A "window" is a finite dataset of flat elements of size `size` (or possibly
+    fewer if there are not enough input elements to fill the window and
+    `drop_remainder` evaluates to false).
 
-    The `stride` argument determines the stride of the input elements,
-    and the `shift` argument determines the shift of the window.
+    The `stride` argument determines the stride of the input elements, and the
+    `shift` argument determines the shift of the window.
 
-    For example:
+    For example, letting {...} represent a Dataset:
+
     - `tf.data.Dataset.range(7).window(2)` produces
       `{{0, 1}, {2, 3}, {4, 5}, {6}}`
     - `tf.data.Dataset.range(7).window(3, 2, 1, True)` produces
@@ -1267,6 +1314,16 @@
     - `tf.data.Dataset.range(7).window(3, 1, 2, True)` produces
       `{{0, 2, 4}, {1, 3, 5}, {2, 4, 6}}`
 
+    Note that when the `window` transformation is applied to a dataset of
+    nested elements, it produces a dataset of nested windows.
+
+    For example:
+
+    - `tf.data.Dataset.from_tensor_slices((range(4), range(4))).window(2)`
+      produces `{({0, 1}, {0, 1}), ({2, 3}, {2, 3})}`
+    - `tf.data.Dataset.from_tensor_slices({"a": range(4)}).window(2)`
+      produces `{{"a": {0, 1}}, {"a": {2, 3}}}`
+
     Args:
       size: A `tf.int64` scalar `tf.Tensor`, representing the number of elements
         of the input dataset to combine into a window.
@@ -1280,9 +1337,9 @@
         `window_size`.
 
     Returns:
-      Dataset: A `Dataset` of windows, each of which is a nested `Dataset` with
-        the same structure as this dataset, but a finite subsequence of its
-        elements.
+      Dataset: A `Dataset` of (nests of) windows -- finite datasets of flat
+        elements created from the (nests of) input elements.
+
     """
     if shift is None:
       shift = size
@@ -1306,8 +1363,8 @@
       initial_state: A nested structure of tensors, representing the initial
         state of the transformation.
       reduce_func: A function that maps `(old_state, input_element)` to
-        `new_state`. It must take two arguments and return a nested structure
-        of tensors. The structure of `new_state` must match the structure of
+        `new_state`. It must take two arguments and return a nested structure of
+        tensors. The structure of `new_state` must match the structure of
         `initial_state`.
 
     Returns:
@@ -1317,17 +1374,7 @@
     """
 
     with ops.name_scope("initial_state"):
-      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
-      # values to tensors.
-      initial_state = nest.pack_sequence_as(initial_state, [
-          sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
-              t, name="component_%d" % i)
-          for i, t in enumerate(nest.flatten(initial_state))
-      ])
-
-    # Compute initial values for the state classes, shapes and types based on
-    # the initial state.
+      initial_state = structure_lib.normalize_tensors(initial_state)
     state_structure = structure_lib.Structure.from_value(initial_state)
 
     # Iteratively rerun the reduce function until reaching a fixed point on
@@ -1546,8 +1593,8 @@
 
     Args:
       shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
+        shared under the given name across multiple sessions that share the same
+        devices (e.g. when using a remote server).
 
     Returns:
       An `Iterator` over the elements of this dataset.
@@ -1878,7 +1925,7 @@
 
 @tf_export(v1=["data.make_one_shot_iterator"])
 def make_one_shot_iterator(dataset):
-  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+  """Creates a `tf.compat.v1.data.Iterator` for enumerating the elements of a dataset.
 
   Note: The returned iterator will be initialized automatically.
   A "one-shot" iterator does not support re-initialization.
@@ -1887,7 +1934,7 @@
     dataset: A `tf.data.Dataset`.
 
   Returns:
-    A `tf.data.Iterator` over the elements of this dataset.
+    A `tf.compat.v1.data.Iterator` over the elements of this dataset.
   """
   try:
     # Call the defined `_make_one_shot_iterator()` if there is one, because some
@@ -1899,26 +1946,26 @@
 
 @tf_export(v1=["data.make_initializable_iterator"])
 def make_initializable_iterator(dataset, shared_name=None):
-  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+  """Creates a `tf.compat.v1.data.Iterator` for enumerating the elements of a dataset.
 
   Note: The returned iterator will be in an uninitialized state,
   and you must run the `iterator.initializer` operation before using it:
 
   ```python
   dataset = ...
-  iterator = tf.data.make_initializable_iterator(dataset)
+  iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
   # ...
   sess.run(iterator.initializer)
   ```
 
   Args:
     dataset: A `tf.data.Dataset`.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-      shared under the given name across multiple sessions that share the
-      same devices (e.g. when using a remote server).
+    shared_name: (Optional.) If non-empty, the returned iterator will be shared
+      under the given name across multiple sessions that share the same devices
+      (e.g. when using a remote server).
 
   Returns:
-    A `tf.data.Iterator` over the elements of `dataset`.
+    A `tf.compat.v1.data.Iterator` over the elements of `dataset`.
 
   Raises:
     RuntimeError: If eager execution is enabled.
@@ -1936,8 +1983,8 @@
   """Returns the `tf.data.experimental.Structure` of a `Dataset` or `Iterator`.
 
   Args:
-    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
-    `IteratorV2`.
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.compat.v1.data.Iterator`, or
+      `IteratorV2`.
 
   Returns:
     A `tf.data.experimental.Structure` representing the structure of the
@@ -1964,8 +2011,8 @@
   `tf.compat.v1.Dataset.output_shapes` property.
 
   Args:
-    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
-    `IteratorV2`.
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.compat.v1.data.Iterator`, or
+      `IteratorV2`.
 
   Returns:
     A nested structure of `tf.TensorShape` objects corresponding to each
@@ -1982,8 +2029,8 @@
   `tf.compat.v1.Dataset.output_types` property.
 
   Args:
-    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
-    `IteratorV2`.
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.compat.v1.data.Iterator`, or
+      `IteratorV2`.
 
   Returns:
     A nested structure of `tf.DType` objects corresponding to each component
@@ -2000,8 +2047,8 @@
   `tf.compat.v1.Dataset.output_classes` property.
 
   Args:
-    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
-    `IteratorV2`.
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.compat.v1.data.Iterator`, or
+      `IteratorV2`.
 
   Returns:
     A nested structure of Python `type` or `tf.data.experimental.Structure`
@@ -2027,11 +2074,13 @@
       "Whether the outputs need to be produced in deterministic order. If None,"
       " defaults to True.")
 
-  experimental_numa_aware = options_lib.create_option(
-      name="experimental_numa_aware",
-      ty=bool,
+  experimental_distribute = options_lib.create_option(
+      name="experimental_distribute",
+      ty=distribute_options.DistributeOptions,
       docstring=
-      "Whether to use NUMA-aware operations. If None, defaults to False.")
+      "The distribution options associated with the dataset. See "
+      "`tf.data.experimental.DistributeOptions` for more details.",
+      default_factory=distribute_options.DistributeOptions)
 
   experimental_optimization = options_lib.create_option(
       name="experimental_optimization",
@@ -2041,6 +2090,15 @@
       "`tf.data.experimental.OptimizationOptions` for more details.",
       default_factory=optimization_options.OptimizationOptions)
 
+  experimental_slack = options_lib.create_option(
+      name="experimental_slack",
+      ty=bool,
+      docstring="Whether to introduce 'slack' in the last `prefetch` of the "
+      "input pipeline, if it exists. This may reduce CPU contention with "
+      "accelerator host-side activity at the start of a step. The slack "
+      "frequency is determined by the number of devices attached to this "
+      "input pipeline. If None, defaults to False.")
+
   experimental_stats = options_lib.create_option(
       name="experimental_stats",
       ty=stats_options.StatsOptions,
@@ -2063,18 +2121,28 @@
     result = []
     result.extend(self.experimental_optimization._static_optimizations())  # pylint: disable=protected-access
 
-    if self.experimental_numa_aware:
-      result.append("make_numa_aware")
     if self.experimental_deterministic is False:
       result.append("make_sloppy")
     exp_stats_options = self.experimental_stats
     if exp_stats_options and exp_stats_options.latency_all_edges:
       result.append("latency_all_edges")
+    if self.experimental_slack:
+      result.append("slack")
     return result
 
   def _static_optimization_configs(self):
     """Produces the list of configurations for enabled static optimizations."""
-    return self.experimental_optimization._static_optimization_configs()  # pylint: disable=protected-access
+    result = []
+    if self.experimental_optimization:
+      result.extend(
+          self.experimental_optimization._static_optimization_configs())  # pylint: disable=protected-access
+
+    if self.experimental_slack:
+      num_devices = self.experimental_distribute.num_devices
+      if num_devices is None:
+        num_devices = 1
+      result.append("slack:slack_period:%d" % num_devices)
+    return result
 
   def merge(self, options):
     """Merges itself with the given `tf.data.Options`.
@@ -2150,12 +2218,7 @@
   def __init__(self, tensors):
     """See `Dataset.from_tensor_slices()` for details."""
     with ops.name_scope("tensors"):
-      tensors = nest.pack_sequence_as(tensors, [
-          sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
-              t, name="component_%d" % i)
-          for i, t in enumerate(nest.flatten(tensors))
-      ])
+      tensors = structure_lib.normalize_tensors(tensors)
 
     batched_structure = structure_lib.Structure.from_value(tensors)
     # pylint: disable=protected-access
@@ -2257,6 +2320,14 @@
   def __init__(self, element_structure):
     self._element_structure = element_structure
 
+  def __eq__(self, other):
+    # pylint: disable=protected-access
+    return (isinstance(other, DatasetStructure) and
+            self._element_structure == other._element_structure)
+
+  def __hash__(self):
+    return hash(self._element_structure)
+
   @property
   def _flat_shapes(self):
     return [tensor_shape.scalar()]
@@ -2421,7 +2492,7 @@
       # those tensors into a single tensor, because the customized
       # version of `nest.flatten()` does not recurse into lists. Since
       # it is more likely that the list arose from returning the
-      # result of an operation (such as `tf.py_func()`) that returns a
+      # result of an operation (such as `tf.compat.v1.py_func()`) that returns a
       # list of not-necessarily-stackable tensors, we treat the
       # returned value is a `tuple` instead. A user wishing to pack
       # the return value into a single tensor can use an explicit
@@ -2705,6 +2776,7 @@
   def __init__(self, *args):
     """See `Dataset.range()` for details."""
     self._parse_args(*args)
+    self._structure = structure_lib.TensorStructure(dtypes.int64, [])
     variant_tensor = gen_dataset_ops.range_dataset(
         start=self._start,
         stop=self._stop,
@@ -2734,7 +2806,7 @@
 
   @property
   def _element_structure(self):
-    return structure_lib.TensorStructure(dtypes.int64, [])
+    return self._structure
 
 
 class CacheDataset(UnaryUnchangedStructureDataset):
@@ -2764,12 +2836,11 @@
 
     Args:
       input_dataset: The input dataset.
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        number of elements from this dataset from which the new
-        dataset will sample.
-      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        random seed that will be used to create the distribution. See
-        `tf.set_random_seed` for behavior.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        elements from this dataset from which the new dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
+        seed that will be used to create the distribution. See
+        `tf.compat.v1.set_random_seed` for behavior.
       reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
         that the dataset should be pseudorandomly reshuffled each time it is
         iterated over. (Defaults to `True`.)
@@ -2876,7 +2947,7 @@
     return self._structure
 
 
-class _VariantTracker(tracking.TrackableResource):
+class _VariantTracker(tracking.CapturableResource):
   """Allows export of functions capturing a Dataset in SavedModels.
 
   When saving a SavedModel, `tf.saved_model.save` traverses the object
@@ -2896,7 +2967,7 @@
         variant-dtype Tensor. This function will be included in SavedModels and
         run to re-create the Dataset's variant Tensor on restore.
     """
-    super(_VariantTracker, self).__init__()
+    super(_VariantTracker, self).__init__(device="CPU")
     self._resource_handle = variant_tensor
     self._create_resource = resource_creator
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 751233d..dfd17b5 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -225,7 +225,8 @@
 
     This method allows you to define a "feedable" iterator where you can choose
     between concrete iterators by feeding a value in a `tf.Session.run` call.
-    In that case, `string_handle` would be a `tf.placeholder`, and you would
+    In that case, `string_handle` would be a `tf.compat.v1.placeholder`, and you
+    would
     feed it with the value of `tf.data.Iterator.string_handle` in each step.
 
     For example, if you had two iterators that marked the current position in
@@ -239,7 +240,7 @@
     test_iterator = tf.data.Dataset(...).make_one_shot_iterator()
     test_iterator_handle = sess.run(test_iterator.string_handle())
 
-    handle = tf.placeholder(tf.string, shape=[])
+    handle = tf.compat.v1.placeholder(tf.string, shape=[])
     iterator = tf.data.Iterator.from_string_handle(
         handle, train_iterator.output_types)
 
@@ -251,8 +252,8 @@
     ```
 
     Args:
-      string_handle: A scalar `tf.Tensor` of type `tf.string` that evaluates
-        to a handle produced by the `Iterator.string_handle()` method.
+      string_handle: A scalar `tf.Tensor` of type `tf.string` that evaluates to
+        a handle produced by the `Iterator.string_handle()` method.
       output_types: A nested structure of `tf.DType` objects corresponding to
         each component of an element of this dataset.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
@@ -387,10 +388,10 @@
 
     # Build a TensorFlow graph that does something with each element.
     loss = model_function(next_element)
-    optimizer = ...  # A `tf.train.Optimizer` object.
+    optimizer = ...  # A `tf.compat.v1.train.Optimizer` object.
     train_op = optimizer.minimize(loss)
 
-    with tf.Session() as sess:
+    with tf.compat.v1.Session() as sess:
       try:
         while True:
           sess.run(train_op)
@@ -719,7 +720,7 @@
   will have no value.
 
   Args:
-    iterator: A `tf.data.Iterator` object.
+    iterator: A `tf.compat.v1.data.Iterator` object.
 
   Returns:
     An `Optional` object representing the next value from the iterator (if it
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 7b8680c..0973e66 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -210,6 +210,9 @@
       than the max_buffer_size, we set the max_buffer_size to
       prefetch_buffer_size.
     """
+    options = dataset_ops.Options()
+    options.experimental_distribute.num_devices = len(devices)
+    dataset = dataset.with_options(options)
     self._dataset = dataset._apply_options()  # pylint: disable=protected-access
     self._devices = devices
     self._source_device = source_device
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index 0447c90..a2c0b92 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -37,7 +37,8 @@
   An `Optional` can represent the result of an operation that may fail as a
   value, rather than raising an exception and halting execution. For example,
   `tf.data.experimental.get_next_as_optional` returns an `Optional` that either
-  contains the next value from a `tf.data.Iterator` if one exists, or a "none"
+  contains the next value from a `tf.compat.v1.data.Iterator` if one exists, or
+  a "none"
   value that indicates the end of the sequence has been reached.
   """
 
@@ -154,6 +155,14 @@
   def __init__(self, value_structure):
     self._value_structure = value_structure
 
+  def __eq__(self, other):
+    # pylint: disable=protected-access
+    return (isinstance(other, OptionalStructure) and
+            self._value_structure == other._value_structure)
+
+  def __hash__(self):
+    return hash(self._value_structure)
+
   @property
   def _flat_shapes(self):
     return [tensor_shape.scalar()]
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 991d026..bdd6054 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -27,6 +27,7 @@
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/ops/ragged",
         "//third_party/py/numpy",
     ],
 )
@@ -77,6 +78,7 @@
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -96,6 +98,8 @@
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
         "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/ops/ragged",
+        "//tensorflow/python/ops/ragged:ragged_test_util",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 616aa9f..e53753e 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -27,6 +27,7 @@
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.platform import test
 
 
@@ -144,6 +145,17 @@
     self.assertEqual([st, st, st], nest.flatten(nest_of_values))
     self.assertEqual([st, st, st], nest.flatten(dict_of_values))
 
+  def testFlattenRaggedValue(self):
+    rt = ragged_factory_ops.constant_value([[[0]], [[1]]])
+    single_value = rt
+    list_of_values = [rt, rt, rt]
+    nest_of_values = ((rt), ((rt), (rt)))
+    dict_of_values = {"foo": rt, "bar": rt, "baz": rt}
+    self.assertEqual([rt], nest.flatten(single_value))
+    self.assertEqual([[rt, rt, rt]], nest.flatten(list_of_values))
+    self.assertEqual([rt, rt, rt], nest.flatten(nest_of_values))
+    self.assertEqual([rt, rt, rt], nest.flatten(dict_of_values))
+
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertFalse(nest.is_sequence([1, 3, [4, 5]]))
@@ -157,6 +169,8 @@
     self.assertTrue(nest.is_sequence({"foo": 1, "bar": 2}))
     self.assertFalse(
         nest.is_sequence(sparse_tensor.SparseTensorValue([[0]], [0], [1])))
+    self.assertFalse(
+        nest.is_sequence(ragged_factory_ops.constant_value([[[0]], [[1]]])))
 
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
index d5169f7..ef8047b 100644
--- a/tensorflow/python/data/util/random_seed.py
+++ b/tensorflow/python/data/util/random_seed.py
@@ -29,7 +29,8 @@
 def get_seed(seed):
   """Returns the local seeds an operation should use given an op-specific seed.
 
-  See `tf.get_seed` for more details. This wrapper adds support for the case
+  See `tf.compat.v1.get_seed` for more details. This wrapper adds support for
+  the case
   where `seed` may be a tensor.
 
   Args:
diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py
index 661a59c..87337dc 100644
--- a/tensorflow/python/data/util/structure.py
+++ b/tensorflow/python/data/util/structure.py
@@ -26,10 +26,13 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -51,9 +54,30 @@
   TODO(b/110122868): In the future, a single `Structure` will replace the
   `tf.data.Dataset.output_types`, `tf.data.Dataset.output_shapes`,
   and `tf.data.Dataset.output_classes`, and similar properties and arguments in
-  the `tf.data.Iterator` and `Optional` classes.
+  the `tf.compat.v1.data.Iterator` and `Optional` classes.
   """
 
+  @abc.abstractmethod
+  def __eq__(self, other):
+    """Returns whether this structure and the input structure are equal.
+
+    Args:
+      other: the structure to use for equality check
+
+    Returns:
+      `True` if this and the input structure are equal and `False` otherwise.
+    """
+    raise NotImplementedError("Structure.__eq__()")
+
+  @abc.abstractmethod
+  def __hash__(self):
+    """Returns the hash of this structure.
+
+    Returns:
+      The hash of this structure.
+    """
+    raise NotImplementedError("Structure.__hash__()")
+
   @abc.abstractproperty
   def _flat_shapes(self):
     """A list of shapes matching the shapes of `self._to_tensor_list()`.
@@ -80,7 +104,7 @@
 
     * `s` and `t` are instances of the same `Structure` subclass.
     * The nested structures (if any) of `s` and `t` are the same, according to
-      `tf.contrib.framework.nest.assert_same_structure`, and each nested
+      `tf.nest.assert_same_structure`, and each nested
       structure of `t` is a "subtype" of the corresponding nested structure of
       `s`.
     * Any `tf.DType` components of `t` are the same as the corresponding
@@ -207,6 +231,10 @@
       return SparseTensorStructure.from_value(value)
     elif isinstance(value, tensor_array_ops.TensorArray):
       return TensorArrayStructure.from_value(value)
+    elif isinstance(
+        value,
+        (ragged_tensor.RaggedTensor, ragged_tensor_value.RaggedTensorValue)):
+      return RaggedTensorStructure.from_value(value)
     elif isinstance(value, (tuple, dict)):
       return NestedStructure.from_value(value)
     else:
@@ -265,6 +293,10 @@
     for i, t in enumerate(flat_tensors):
       if sparse_tensor_lib.is_sparse(t):
         prepared.append(sparse_tensor_lib.SparseTensor.from_value(t))
+      elif ragged_tensor.is_ragged(t):
+        prepared.append(
+            ragged_tensor.convert_to_tensor_or_ragged_tensor(
+                t, name="component_%d" % i))
       elif isinstance(t, tensor_array_ops.TensorArray):
         prepared.append(t)
       else:
@@ -351,6 +383,22 @@
       self._flat_shapes_list.extend(s._flat_shapes)
       self._flat_types_list.extend(s._flat_types)
 
+  def __eq__(self, other):
+    if not isinstance(other, NestedStructure):
+      return False
+    try:
+      # pylint: disable=protected-access
+      nest.assert_same_structure(self._nested_structure,
+                                 other._nested_structure)
+    except (ValueError, TypeError):
+      return False
+
+    return nest.flatten(self._nested_structure) == nest.flatten(
+        other._nested_structure)
+
+  def __hash__(self):
+    return hash(tuple(nest.flatten(self._nested_structure)))
+
   @property
   def _flat_shapes(self):
     return self._flat_shapes_list
@@ -469,6 +517,14 @@
     self._dtype = dtypes.as_dtype(dtype)
     self._shape = tensor_shape.as_shape(shape)
 
+  def __eq__(self, other):
+    return (isinstance(other, TensorStructure) and tensor_spec.TensorSpec(
+        self._shape, self._dtype) == tensor_spec.TensorSpec(
+            other._shape, other._dtype))
+
+  def __hash__(self):
+    return hash(tensor_spec.TensorSpec(self._shape, self._dtype))
+
   @property
   def _flat_shapes(self):
     return [self._shape]
@@ -543,6 +599,14 @@
     self._dtype = dtypes.as_dtype(dtype)
     self._dense_shape = tensor_shape.as_shape(dense_shape)
 
+  def __eq__(self, other):
+    return (isinstance(other, SparseTensorStructure) and tensor_spec.TensorSpec(
+        self._dense_shape, self._dtype) == tensor_spec.TensorSpec(
+            other._dense_shape, other._dtype))
+
+  def __hash__(self):
+    return hash(tensor_spec.TensorSpec(self._dense_shape, self._dtype))
+
   @property
   def _flat_shapes(self):
     # NOTE(mrry): The default flat shape of a boxed `SparseTensor` is `(3,)`,
@@ -621,6 +685,17 @@
     self._dynamic_size = dynamic_size
     self._infer_shape = infer_shape
 
+  def __eq__(self, other):
+    return (isinstance(other, TensorArrayStructure) and tensor_spec.TensorSpec(
+        self._element_shape, self._dtype) == tensor_spec.TensorSpec(
+            other._element_shape, other._dtype) and
+            self._dynamic_size == other._dynamic_size and
+            self._infer_shape == other._infer_shape)
+
+  def __hash__(self):
+    return hash((tensor_spec.TensorSpec(self._element_shape, self._dtype),
+                 self._dynamic_size, self._infer_shape))
+
   @property
   def _flat_shapes(self):
     # A TensorArray is represented via its variant object, which is a scalar.
@@ -700,3 +775,107 @@
 
   def _unbatch(self):
     raise NotImplementedError("TensorArrayStructure._unbatch")
+
+
+@tf_export("data.experimental.RaggedTensorStructure")
+class RaggedTensorStructure(Structure):
+  """Represents structural information about a `tf.RaggedTensor`.
+
+  Two instances compare equal only if their dtype, shape and ragged rank all
+  match.
+  """
+
+  def __init__(self, dtype, shape, ragged_rank):
+    """Creates a `RaggedTensorStructure`.
+
+    Args:
+      dtype: Element type; converted with `dtypes.as_dtype`.
+      shape: Shape of the ragged tensor; converted with
+        `tensor_shape.as_shape`.
+      ragged_rank: Ragged rank of the tensor; stored as given.
+    """
+    self._dtype = dtypes.as_dtype(dtype)
+    self._shape = tensor_shape.as_shape(shape)
+    self._ragged_rank = ragged_rank
+
+  def __eq__(self, other):
+    # pylint: disable=protected-access
+    # `ragged_rank` is compared as well: `is_compatible_with` rejects
+    # structures whose ragged ranks differ, so they must not compare equal.
+    return (isinstance(other, RaggedTensorStructure) and tensor_spec.TensorSpec(
+        self._shape, self._dtype) == tensor_spec.TensorSpec(
+            other._shape, other._dtype) and
+            self._ragged_rank == other._ragged_rank)
+
+  def __hash__(self):
+    # Hash the same fields that `__eq__` compares.
+    return hash((tensor_spec.TensorSpec(self._shape, self._dtype),
+                 self._ragged_rank))
+
+  @property
+  def _flat_shapes(self):
+    # A list of shapes matching the shapes of `self._to_tensor_list()`.
+    # NOTE(mishragaurav): The default flat shape of a boxed `RaggedTensor` is
+    # `[]` (scalar), but a `RaggedTensorStructure` can also represent a batch of
+    # boxed `RaggedTensor` objects with shape `(?)` (and batches of batches,
+    # etc.), so the flat shape must be unknown.
+    return [tensor_shape.unknown_shape(None)]
+
+  @property
+  def _flat_types(self):
+    # One `tf.variant` component, matching `self._to_tensor_list()`.
+    return [dtypes.variant]
+
+  def is_compatible_with(self, other):
+    return (isinstance(other, RaggedTensorStructure) and
+            self._dtype.is_compatible_with(other._dtype) and
+            self._shape.is_compatible_with(other._shape) and
+            self._ragged_rank == other._ragged_rank)
+
+  def _to_tensor_list(self, value):
+    return [value._to_variant()]
+
+  def _to_batched_tensor_list(self, value):
+    return [value._to_variant(batched_input=True)]
+
+  def _from_tensor_list(self, flat_value):
+    # Validate the flat representation before decoding it.
+    if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant):
+      raise ValueError("RaggedTensorStructure corresponds to a single "
+                       "tf.variant scalar.")
+    return self._from_compatible_tensor_list(flat_value)
+
+  def _from_compatible_tensor_list(self, flat_value):
+    if self._ragged_rank <= 0:
+      raise ValueError(
+          "ragged_rank must be greater than zero. Found ragged_rank: %d" %
+          self._ragged_rank)
+    return ragged_tensor.RaggedTensor._from_variant(
+        flat_value[0], dtype=self._dtype, output_ragged_rank=self._ragged_rank)
+
+  @staticmethod
+  def from_value(value):
+    return RaggedTensorStructure(value.dtype, value.shape, value.ragged_rank)
+
+  def _to_legacy_output_types(self):
+    return self._dtype
+
+  def _to_legacy_output_shapes(self):
+    return self._shape
+
+  def _to_legacy_output_classes(self):
+    return self
+
+  def _batch(self, batch_size):
+    # Batching prepends a dimension and increments the ragged rank by one.
+    return RaggedTensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._shape),
+        self._ragged_rank + 1)
+
+  def _unbatch(self):
+    # Note: Any ragged_rank is allowed here because the dataset could be
+    # subsequently batched again. Errors are handled in
+    # RaggedTensorStructure._from_compatible_tensor_list()
+    return RaggedTensorStructure(self._dtype, self._shape[1:],
+                                 self._ragged_rank - 1)
diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py
index d292e9c..64fee62 100644
--- a/tensorflow/python/data/util/structure_test.py
+++ b/tensorflow/python/data/util/structure_test.py
@@ -33,38 +33,50 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import test
 
 
-class StructureTest(test_base.DatasetTestBase, parameterized.TestCase):
+# NOTE(mrry): Arguments of parameterized tests are lifted into lambdas to make
+# sure they are not executed before the (eager- or graph-mode) test environment
+# has been set up.
+#
+# TODO(jsimsa): Add tests for OptionalStructure and DatasetStructure.
+class StructureTest(test_base.DatasetTestBase, parameterized.TestCase,
+                    ragged_test_util.RaggedTensorTestCase):
 
-  # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
-  # will be executed before the (eager- or graph-mode) test environment has been
-  # set up.
   # pylint: disable=g-long-lambda,protected-access
-  @parameterized.parameters(
-      (lambda: constant_op.constant(37.0), structure.TensorStructure,
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0), structure.TensorStructure,
        [dtypes.float32], [[]]),
-      (lambda: tensor_array_ops.TensorArray(
+      ("TensorArray", lambda: tensor_array_ops.TensorArray(
           dtype=dtypes.float32, element_shape=(3,), size=0),
        structure.TensorArrayStructure, [dtypes.variant], [None, 3]),
-      (lambda: sparse_tensor.SparseTensor(
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
        structure.SparseTensorStructure, [dtypes.variant], [None]),
-      (lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
+      ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [4]]),
+       structure.RaggedTensorStructure, [dtypes.variant], [None]),
+      ("Nested_0",
+       lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
        structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
-      (lambda: {
+      ("Nested_1", lambda: {
           "a": constant_op.constant(37.0),
           "b": constant_op.constant([1, 2, 3])
       }, structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
-      (lambda: {
-          "a": constant_op.constant(37.0),
+      ("Nested_2", lambda: {
+          "a":
+              constant_op.constant(37.0),
           "b": (sparse_tensor.SparseTensor(
               indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
                 sparse_tensor.SparseTensor(
                     indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
       }, structure.NestedStructure,
-       [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None]))
+       [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None]),
+  )
   def testFlatStructure(self, value_fn, expected_structure, expected_types,
                         expected_shapes):
     value = value_fn()
@@ -76,28 +88,27 @@
       self.assertTrue(
           tensor_shape.as_shape(expected).is_compatible_with(actual))
 
-  @parameterized.parameters(
-      (lambda: constant_op.constant(37.0), lambda: [
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0), lambda: [
           constant_op.constant(38.0),
           array_ops.placeholder(dtypes.float32),
           variables.Variable(100.0), 42.0,
           np.array(42.0, dtype=np.float32)
-      ], lambda: [constant_op.constant([1.0, 2.0]), constant_op.constant(37)]),
-      (lambda: tensor_array_ops.TensorArray(
-          dtype=dtypes.float32, element_shape=(3,), size=0),
-       lambda: [
-           tensor_array_ops.TensorArray(
-               dtype=dtypes.float32, element_shape=(3,), size=0),
-           tensor_array_ops.TensorArray(
-               dtype=dtypes.float32, element_shape=(3,), size=10)
-       ],
-       lambda: [
-           tensor_array_ops.TensorArray(
-               dtype=dtypes.int32, element_shape=(3,), size=0),
-           tensor_array_ops.TensorArray(
-               dtype=dtypes.float32, element_shape=(), size=0)
-       ]),
-      (lambda: sparse_tensor.SparseTensor(
+      ], lambda: [constant_op.constant([1.0, 2.0]),
+                  constant_op.constant(37)]),
+      ("TensorArray", lambda: tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, element_shape=(3,), size=0), lambda: [
+              tensor_array_ops.TensorArray(
+                  dtype=dtypes.float32, element_shape=(3,), size=0),
+              tensor_array_ops.TensorArray(
+                  dtype=dtypes.float32, element_shape=(3,), size=10)
+          ], lambda: [
+              tensor_array_ops.TensorArray(
+                  dtype=dtypes.int32, element_shape=(3,), size=0),
+              tensor_array_ops.TensorArray(
+                  dtype=dtypes.float32, element_shape=(), size=0)
+          ]),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
        lambda: [
            sparse_tensor.SparseTensor(
@@ -115,7 +126,17 @@
            sparse_tensor.SparseTensor(
                indices=[[3, 4]], values=[-1.0], dense_shape=[4, 5])
        ]),
-      (lambda: {
+      ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [3]]),
+       lambda: [
+           ragged_factory_ops.constant([[1, 2], [3, 4], []]),
+           ragged_factory_ops.constant([[1], [2, 3, 4], [5]]),
+       ], lambda: [
+           ragged_factory_ops.constant(1),
+           ragged_factory_ops.constant([1, 2]),
+           ragged_factory_ops.constant([[1], [2]]),
+           ragged_factory_ops.constant([["a", "b"]]),
+       ]),
+      ("Nested", lambda: {
           "a": constant_op.constant(37.0),
           "b": constant_op.constant([1, 2, 3])
       }, lambda: [{
@@ -151,24 +172,110 @@
           s.is_compatible_with(
               structure.Structure.from_value(incompatible_value)))
 
-  @parameterized.parameters(
-      (lambda: constant_op.constant(37.0),),
-      (lambda: sparse_tensor.SparseTensor(
-          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),),
-      (lambda: tensor_array_ops.TensorArray(
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       lambda: constant_op.constant(42.0), lambda: constant_op.constant([5])),
+      ("TensorArray", lambda: tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, element_shape=(3,), size=0),
+       lambda: tensor_array_ops.TensorArray(
+           dtype=dtypes.float32, element_shape=(3,), size=0),
+       lambda: tensor_array_ops.TensorArray(
+           dtype=dtypes.int32, element_shape=(), size=0)),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[1, 2]], values=[42], dense_shape=[4, 5]), lambda:
+       sparse_tensor.SparseTensor(indices=[[3]], values=[-1], dense_shape=[5])),
+      ("Nested", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": constant_op.constant([1, 2, 3])
+      }, lambda: {
+          "a": constant_op.constant(42.0),
+          "b": constant_op.constant([4, 5, 6])
+      }, lambda: {
+          "a": constant_op.constant([1, 2, 3]),
+          "b": constant_op.constant(37.0)
+      }),
+  )
+  def testEquality(self, value1_fn, value2_fn, value3_fn):
+    s1 = structure.Structure.from_value(value1_fn())
+    s2 = structure.Structure.from_value(value2_fn())
+    s3 = structure.Structure.from_value(value3_fn())
+    self.assertEqual(s1, s1)
+    self.assertEqual(s1, s2)
+    self.assertNotEqual(s1, s3)
+    self.assertNotEqual(s2, s3)
+
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       lambda: constant_op.constant(42.0), lambda: constant_op.constant([5])),
+      ("TensorArray", lambda: tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, element_shape=(3,), size=0),
+       lambda: tensor_array_ops.TensorArray(
+           dtype=dtypes.float32, element_shape=(3,), size=0),
+       lambda: tensor_array_ops.TensorArray(
+           dtype=dtypes.int32, element_shape=(), size=0)),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[1, 2]], values=[42], dense_shape=[4, 5]), lambda:
+       sparse_tensor.SparseTensor(indices=[[3]], values=[-1], dense_shape=[5])),
+      ("Nested", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": constant_op.constant([1, 2, 3])
+      }, lambda: {
+          "a": constant_op.constant(42.0),
+          "b": constant_op.constant([4, 5, 6])
+      }, lambda: {
+          "a": constant_op.constant([1, 2, 3]),
+          "b": constant_op.constant(37.0)
+      }),
+  )
+  def testHash(self, value1_fn, value2_fn, value3_fn):
+    s1 = structure.Structure.from_value(value1_fn())
+    s2 = structure.Structure.from_value(value2_fn())
+    s3 = structure.Structure.from_value(value3_fn())
+    self.assertEqual(hash(s1), hash(s1))
+    self.assertEqual(hash(s1), hash(s2))
+    self.assertNotEqual(hash(s1), hash(s3))
+    self.assertNotEqual(hash(s2), hash(s3))
+
+  @parameterized.named_parameters(
+      (
+          "Tensor",
+          lambda: constant_op.constant(37.0),
+      ),
+      (
+          "SparseTensor",
+          lambda: sparse_tensor.SparseTensor(
+              indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+      ),
+      ("TensorArray", lambda: tensor_array_ops.TensorArray(
           dtype=dtypes.float32, element_shape=(), size=1).write(0, 7)),
-      (lambda: {"a": constant_op.constant(37.0),
-                "b": constant_op.constant([1, 2, 3])},),
-      (lambda: {"a": constant_op.constant(37.0),
-                "b": (sparse_tensor.SparseTensor(
-                    indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-                      sparse_tensor.SparseTensor(
-                          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
-               },),
-      )
+      ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [3]]),),
+      (
+          "Nested_0",
+          lambda: {
+              "a": constant_op.constant(37.0),
+              "b": constant_op.constant([1, 2, 3])
+          },
+      ),
+      (
+          "Nested_1",
+          lambda: {
+              "a":
+                  constant_op.constant(37.0),
+              "b": (sparse_tensor.SparseTensor(
+                  indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+                    sparse_tensor.SparseTensor(
+                        indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
+          },
+      ),
+  )
   def testRoundTripConversion(self, value_fn):
     value = value_fn()
     s = structure.Structure.from_value(value)
+
     def maybe_stack_ta(v):
       if isinstance(v, tensor_array_ops.TensorArray):
         return v.stack()
@@ -186,8 +293,13 @@
         self.assertAllEqual(b.indices, a.indices)
         self.assertAllEqual(b.values, a.values)
         self.assertAllEqual(b.dense_shape, a.dense_shape)
+      elif isinstance(
+          b,
+          (ragged_tensor.RaggedTensor, ragged_tensor_value.RaggedTensorValue)):
+        self.assertRaggedEqual(b, a)
       else:
         self.assertAllEqual(b, a)
+
   # pylint: enable=g-long-lambda
 
   def testIncompatibleStructure(self):
@@ -268,7 +380,7 @@
     # that:
     # 1. Using one structure to flatten a value with an incompatible structure
     #    fails.
-    # 2. Using one structure to restructre a flattened value with an
+    # 2. Using one structure to restructure a flattened value with an
     #    incompatible structure fails.
 
     value_0 = {
@@ -367,30 +479,40 @@
   @parameterized.named_parameters(
       ("Tensor", dtypes.float32, tensor_shape.scalar(), ops.Tensor,
        structure.TensorStructure(dtypes.float32, [])),
-      ("SparseTensor", dtypes.int32, tensor_shape.matrix(2, 2),
-       sparse_tensor.SparseTensor,
+      ("SparseTensor", dtypes.int32, tensor_shape.matrix(
+          2, 2), sparse_tensor.SparseTensor,
        structure.SparseTensorStructure(dtypes.int32, [2, 2])),
-      ("TensorArray0", dtypes.int32, tensor_shape.as_shape([None, True, 2, 2]),
-       tensor_array_ops.TensorArray,
+      ("TensorArray_0", dtypes.int32, tensor_shape.as_shape(
+          [None, True, 2, 2]), tensor_array_ops.TensorArray,
        structure.TensorArrayStructure(
            dtypes.int32, [2, 2], dynamic_size=None, infer_shape=True)),
-      ("TensorArray1", dtypes.int32, tensor_shape.as_shape([True, None, 2, 2]),
-       tensor_array_ops.TensorArray,
+      ("TensorArray_1", dtypes.int32, tensor_shape.as_shape(
+          [True, None, 2, 2]), tensor_array_ops.TensorArray,
        structure.TensorArrayStructure(
            dtypes.int32, [2, 2], dynamic_size=True, infer_shape=None)),
-      ("TensorArray2", dtypes.int32, tensor_shape.as_shape([True, False, 2, 2]),
-       tensor_array_ops.TensorArray,
+      ("TensorArray_2", dtypes.int32, tensor_shape.as_shape(
+          [True, False, 2, 2]), tensor_array_ops.TensorArray,
        structure.TensorArrayStructure(
            dtypes.int32, [2, 2], dynamic_size=True, infer_shape=False)),
-      ("Nest",
-       {"a": dtypes.float32, "b": (dtypes.int32, dtypes.string)},
-       {"a": tensor_shape.scalar(),
-        "b": (tensor_shape.matrix(2, 2), tensor_shape.scalar())},
-       {"a": ops.Tensor, "b": (sparse_tensor.SparseTensor, ops.Tensor)},
+      ("RaggedTensor", dtypes.int32, tensor_shape.matrix(2, 2),
+       structure.RaggedTensorStructure(dtypes.int32, [2, 2], 1),
+       structure.RaggedTensorStructure(dtypes.int32, [2, 2], 1)),
+      ("Nested", {
+          "a": dtypes.float32,
+          "b": (dtypes.int32, dtypes.string)
+      }, {
+          "a": tensor_shape.scalar(),
+          "b": (tensor_shape.matrix(2, 2), tensor_shape.scalar())
+      }, {
+          "a": ops.Tensor,
+          "b": (sparse_tensor.SparseTensor, ops.Tensor)
+      },
        structure.NestedStructure({
-           "a": structure.TensorStructure(dtypes.float32, []),
+           "a":
+               structure.TensorStructure(dtypes.float32, []),
            "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
-                 structure.TensorStructure(dtypes.string, []))})),
+                 structure.TensorStructure(dtypes.string, []))
+       })),
   )
   def testConvertLegacyStructure(self, output_types, output_shapes,
                                  output_classes, expected_structure):
@@ -441,7 +563,13 @@
       ("SparseTensorUnknown",
        structure.SparseTensorStructure(dtypes.float32, [4]), None,
        structure.SparseTensorStructure(dtypes.float32, [None, 4])),
-      ("Nest", structure.NestedStructure({
+      ("RaggedTensor",
+       structure.RaggedTensorStructure(dtypes.float32, [2, None], 1), 32,
+       structure.RaggedTensorStructure(dtypes.float32, [32, 2, None], 2)),
+      ("RaggedTensorUnknown",
+       structure.RaggedTensorStructure(dtypes.float32, [4, None], 1), None,
+       structure.RaggedTensorStructure(dtypes.float32, [None, 4, None], 2)),
+      ("Nested", structure.NestedStructure({
           "a": structure.TensorStructure(dtypes.float32, []),
           "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
                 structure.TensorStructure(dtypes.string, []))}), 128,
@@ -469,7 +597,13 @@
       ("SparseTensorUnknown",
        structure.SparseTensorStructure(dtypes.float32, [None, 4]),
        structure.SparseTensorStructure(dtypes.float32, [4])),
-      ("Nest", structure.NestedStructure({
+      ("RaggedTensor",
+       structure.RaggedTensorStructure(dtypes.float32, [32, 4, None], 2),
+       structure.RaggedTensorStructure(dtypes.float32, [4, None], 1)),
+      ("RaggedTensorUnknown",
+       structure.RaggedTensorStructure(dtypes.float32, [None, None, 4], 2),
+       structure.RaggedTensorStructure(dtypes.float32, [None, 4], 1)),
+      ("Nested", structure.NestedStructure({
           "a": structure.TensorStructure(dtypes.float32, [128]),
           "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
                 structure.TensorStructure(dtypes.string, [None]))}),
@@ -493,6 +627,9 @@
           indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2]),
        lambda: sparse_tensor.SparseTensor(
            indices=[[0]], values=[13], dense_shape=[2])),
+      ("RaggedTensor",
+       lambda: ragged_factory_ops.constant([[[1]], [[2]]]),
+       lambda: ragged_factory_ops.constant([[1]])),
       ("Nest", lambda: (
           constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
           sparse_tensor.SparseTensor(
@@ -523,10 +660,13 @@
         nest.flatten(expected_element_0), nest.flatten(actual_element_0)):
       if sparse_tensor.is_sparse(expected):
         self.assertSparseValuesEqual(expected, actual)
+      elif ragged_tensor.is_ragged(expected):
+        self.assertRaggedEqual(expected, actual)
       else:
         self.assertAllEqual(expected, actual)
 
   # pylint: enable=g-long-lambda
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index e2d5dec..9c4b57e 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -156,27 +156,11 @@
 )
 
 py_library(
-    name = "stepper",
-    srcs = ["lib/stepper.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":debug_data",
-        ":debug_graphs",
-        ":debug_utils",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:session_ops",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
     name = "framework",
     srcs = ["wrappers/framework.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":debug_utils",
-        ":stepper",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:errors",
@@ -286,21 +270,6 @@
 )
 
 py_library(
-    name = "stepper_cli",
-    srcs = ["cli/stepper_cli.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cli_shared",
-        ":command_parser",
-        ":debugger_cli_common",
-        ":stepper",
-        ":tensor_format",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
     name = "base_ui",
     srcs = ["cli/base_ui.py"],
     srcs_version = "PY2AND3",
@@ -389,7 +358,6 @@
         ":debugger_cli_common",
         ":framework",
         ":profile_analyzer_cli",
-        ":stepper_cli",
         ":tensor_format",
         ":ui_factory",
     ],
@@ -405,7 +373,6 @@
         ":framework",
         ":grpc_wrapper",
         ":local_cli_wrapper",
-        ":stepper",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:training",
     ],
@@ -665,24 +632,6 @@
     ],
 )
 
-cuda_py_test(
-    name = "stepper_test",
-    size = "small",
-    srcs = ["lib/stepper_test.py"],
-    additional_deps = [
-        ":stepper",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-    ],
-    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
-)
-
 py_test(
     name = "framework_test",
     size = "medium",
@@ -1026,33 +975,6 @@
 )
 
 cuda_py_test(
-    name = "stepper_cli_test",
-    size = "small",
-    srcs = ["cli/stepper_cli_test.py"],
-    additional_deps = [
-        ":stepper",
-        ":stepper_cli",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-    ],
-    tags = [
-        "manual",
-        "no_pip",
-        "no_windows",
-        "notap",
-    ],
-    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
-)
-
-cuda_py_test(
     name = "session_debug_grpc_test",
     size = "medium",
     srcs = ["lib/session_debug_grpc_test.py"],
@@ -1145,7 +1067,6 @@
         ":dumping_wrapper",
         ":framework",
         ":hooks",
-        ":stepper",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index 9a47cd1..224fb45 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -16,8 +16,6 @@
 
 The analyzer performs post hoc analysis of dumped intermediate tensors and
 graph structure information from debugged Session.run() calls.
-
-The other part of the debugger is the stepper (c.f. stepper_cli.py).
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index 6a36868..a61a5e8 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -367,13 +367,6 @@
   out.extend(
       debugger_cli_common.rich_text_lines_from_rich_line_list(more_lines))
 
-  out.extend(
-      _recommend_command(
-          "invoke_stepper",
-          "Use the node-stepper interface, which allows you to interactively "
-          "step through nodes involved in the graph run() call and "
-          "inspect/modify their values", create_link=True))
-
   out.append("")
 
   out.append_rich_line(RL("For more details, see ") +
@@ -384,8 +377,6 @@
   # Make main menu for the run-start intro.
   menu = debugger_cli_common.Menu()
   menu.append(debugger_cli_common.MenuItem("run", "run"))
-  menu.append(debugger_cli_common.MenuItem(
-      "invoke_stepper", "invoke_stepper"))
   menu.append(debugger_cli_common.MenuItem("exit", "exit"))
   out.annotations[debugger_cli_common.MAIN_MENU_KEY] = menu
 
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 535e8a2..a7ccd84 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -149,10 +149,6 @@
     self.assertEqual([(2, 12, "bold")], run_start_intro.font_attr_segs[15])
     self.assertEqual("run -f <filter_name>:", run_start_intro.lines[17][2:])
     self.assertEqual([(2, 22, "bold")], run_start_intro.font_attr_segs[17])
-    annot = run_start_intro.font_attr_segs[21][0]
-    self.assertEqual(2, annot[0])
-    self.assertEqual(16, annot[1])
-    self.assertEqual("invoke_stepper", annot[2][0].content)
 
     # Verify short description.
     description = cli_shared.get_run_short_description(12, self.const_a, None)
@@ -163,8 +159,6 @@
                   run_start_intro.annotations)
     menu = run_start_intro.annotations[debugger_cli_common.MAIN_MENU_KEY]
     self.assertEqual("run", menu.caption_to_item("run").content)
-    self.assertEqual("invoke_stepper",
-                     menu.caption_to_item("invoke_stepper").content)
     self.assertEqual("exit", menu.caption_to_item("exit").content)
 
   def testSparseTensorAsFeedShouldHandleNoNameAttribute(self):
diff --git a/tensorflow/python/debug/cli/stepper_cli.py b/tensorflow/python/debug/cli/stepper_cli.py
deleted file mode 100644
index fe1a012..0000000
--- a/tensorflow/python/debug/cli/stepper_cli.py
+++ /dev/null
@@ -1,632 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""CLI Backend for the Node Stepper Part of the Debugger."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-
-import numpy as np  # pylint: disable=unused-import
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.python.debug.cli import cli_shared
-from tensorflow.python.debug.cli import command_parser
-from tensorflow.python.debug.cli import debugger_cli_common
-from tensorflow.python.debug.cli import tensor_format
-from tensorflow.python.debug.lib import stepper
-
-RL = debugger_cli_common.RichLine
-
-
-class NodeStepperCLI(object):
-  """Command-line-interface backend of Node Stepper."""
-
-  # Possible states of an element in the transitive closure of the stepper's
-  # fetch(es).
-  # State where the element is already continued-to and a TensorHandle is
-  # available for the tensor.
-  STATE_CONT = "H"
-
-  # State where an intermediate dump of the tensor is available.
-  STATE_DUMPED_INTERMEDIATE = "I"
-
-  # State where the element is already overridden.
-  STATE_OVERRIDDEN = "O"
-
-  # State where the element is a placeholder (and hence cannot be continued to)
-  STATE_IS_PLACEHOLDER = "P"
-
-  # State where a variable's value has been updated during the lifetime of
-  # this NodeStepperCLI instance.
-  STATE_DIRTY_VARIABLE = "D"
-
-  STATE_UNFEEDABLE = "U"
-
-  NEXT_NODE_POINTER_STR = "-->"
-
-  _MESSAGE_TEMPLATES = {
-      "NOT_IN_CLOSURE":
-          "%s is not in the transitive closure of this stepper instance.",
-      "MULTIPLE_TENSORS":
-          "Node %s has more than one output tensor. "
-          "Please use full tensor name.",
-  }
-
-  _UPDATED_ATTRIBUTE = "bold"
-
-  _STATE_COLORS = {
-      STATE_CONT: cli_shared.COLOR_GREEN,
-      STATE_DIRTY_VARIABLE: cli_shared.COLOR_MAGENTA,
-      STATE_DUMPED_INTERMEDIATE: cli_shared.COLOR_BLUE,
-      STATE_OVERRIDDEN: cli_shared.COLOR_YELLOW,
-      STATE_IS_PLACEHOLDER: cli_shared.COLOR_CYAN,
-      STATE_UNFEEDABLE: cli_shared.COLOR_RED,
-  }
-
-  _FEED_COLORS = {
-      stepper.NodeStepper.FEED_TYPE_CLIENT: cli_shared.COLOR_WHITE,
-      stepper.NodeStepper.FEED_TYPE_HANDLE: cli_shared.COLOR_GREEN,
-      stepper.NodeStepper.FEED_TYPE_OVERRIDE: cli_shared.COLOR_YELLOW,
-      stepper.NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE: cli_shared.COLOR_BLUE,
-  }
-
-  def __init__(self, node_stepper):
-    self._node_stepper = node_stepper
-
-    # Command parsers for the stepper.
-    self.arg_parsers = {}
-
-    # Parser for "list_sorted_nodes".
-    ap = argparse.ArgumentParser(
-        description="List the state of the sorted transitive closure of the "
-        "stepper.",
-        usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "-l",
-        "--lower_bound",
-        dest="lower_bound",
-        type=int,
-        default=-1,
-        help="Lower-bound index (0-based)")
-    ap.add_argument(
-        "-u",
-        "--upper_bound",
-        dest="upper_bound",
-        type=int,
-        default=-1,
-        help="Upper-bound index (0-based)")
-    self.arg_parsers["list_sorted_nodes"] = ap
-
-    # Parser for "cont".
-    ap = argparse.ArgumentParser(
-        description="Continue to a tensor or op.", usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "target_name",
-        type=str,
-        help="Name of the Tensor or Op to continue to.")
-    ap.add_argument(
-        "-i",
-        "--invalidate_from_updated_variables",
-        dest="invalidate_from_updated_variables",
-        action="store_true",
-        help="Whether to invalidate the cached "
-             "tensor handles and intermediate tensor handles affected "
-             "by Variable updates in this continue call.")
-    ap.add_argument(
-        "-r",
-        "--restore_variable_values",
-        dest="restore_variable_values",
-        action="store_true",
-        help="Restore all variables in the transitive closure of the cont "
-             "target to their initial values (i.e., values when this stepper "
-             "instance was created.")
-    self.arg_parsers["cont"] = ap
-
-    # Parser for "step".
-    ap = argparse.ArgumentParser(
-        description="Step to the next tensor or op in the sorted transitive "
-        "closure of the stepper's fetch(es).",
-        usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "-t",
-        "--num_times",
-        dest="num_times",
-        type=int,
-        default=1,
-        help="Number of times to step (>=1)")
-    self.arg_parsers["step"] = ap
-
-    # Parser for "print_tensor".
-    ap = argparse.ArgumentParser(
-        description="Print the value of a tensor, from cached TensorHandle or "
-        "client-provided overrides.",
-        usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "tensor_name",
-        type=str,
-        help="Name of the tensor, followed by any slicing indices, "
-        "e.g., hidden1/Wx_plus_b/MatMul:0, "
-        "hidden1/Wx_plus_b/MatMul:0[1, :]")
-    ap.add_argument(
-        "-r",
-        "--ranges",
-        dest="ranges",
-        type=str,
-        default="",
-        help="Numerical ranges to highlight tensor elements in. "
-        "Examples: -r 0,1e-8, -r [-0.1,0.1], "
-        "-r \"[[-inf, -0.1], [0.1, inf]]\"")
-    ap.add_argument(
-        "-a",
-        "--all",
-        dest="print_all",
-        action="store_true",
-        help="Print the tensor in its entirety, i.e., do not use ellipses.")
-    self.arg_parsers["print_tensor"] = ap
-
-    # Parser for inject_value.
-    ap = argparse.ArgumentParser(
-        description="Inject (override) the value of a Tensor.",
-        usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "tensor_name",
-        type=str,
-        help="Name of the Tensor of which the value is to be overridden.")
-    ap.add_argument(
-        "tensor_value_str",
-        type=str,
-        help="A string representing the value of the tensor, without any "
-        "whitespaces, e.g., np.zeros([10,100])")
-    self.arg_parsers["inject_value"] = ap
-
-    self._initialize_state()
-
-  def _initialize_state(self):
-    """Initialize the state of this stepper CLI."""
-
-    # Get the elements in the sorted transitive closure, as a list of str.
-    self._sorted_nodes = self._node_stepper.sorted_nodes()
-    self._closure_elements = self._node_stepper.closure_elements()
-    self._placeholders = self._node_stepper.placeholders()
-    self._completed_nodes = set()
-
-    self._calculate_next()
-
-  def _calculate_next(self):
-    """Calculate the next target for "step" action based on current state."""
-
-    override_names = self._node_stepper.override_names()
-
-    next_i = -1
-    for i in xrange(len(self._sorted_nodes)):
-      if (i > next_i and (self._sorted_nodes[i] in self._completed_nodes) or
-          (self._sorted_nodes[i] in override_names)):
-        next_i = i
-
-    next_i += 1
-    self._next = next_i
-
-  def list_sorted_nodes(self, args, screen_info=None):
-    """List the sorted transitive closure of the stepper's fetches."""
-
-    # TODO(cais): Use pattern such as del args, del screen_info python/debug.
-    _ = args
-    _ = screen_info
-
-    parsed = self.arg_parsers["list_sorted_nodes"].parse_args(args)
-
-    if parsed.lower_bound != -1 and parsed.upper_bound != -1:
-      index_range = [
-          max(0, parsed.lower_bound),
-          min(len(self._sorted_nodes), parsed.upper_bound)
-      ]
-      verbose = False
-    else:
-      index_range = [0, len(self._sorted_nodes)]
-      verbose = True
-
-    handle_node_names = self._node_stepper.handle_node_names()
-    intermediate_tensor_names = self._node_stepper.intermediate_tensor_names()
-    override_names = self._node_stepper.override_names()
-    dirty_variable_names = [
-        dirty_variable.split(":")[0]
-        for dirty_variable in self._node_stepper.dirty_variables()
-    ]
-
-    lines = []
-    if verbose:
-      lines.extend(
-          ["Topologically-sorted transitive input(s) and fetch(es):", ""])
-
-    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
-    self._add_deprecation_warning(output)
-
-    for i, element_name in enumerate(self._sorted_nodes):
-      if i < index_range[0] or i >= index_range[1]:
-        continue
-
-      # TODO(cais): Use fixed-width text to show node index.
-      if i == self._next:
-        node_prefix = RL("  ") + RL(self.NEXT_NODE_POINTER_STR, "bold")
-      else:
-        node_prefix = RL("     ")
-
-      node_prefix += "(%d / %d)" % (i + 1, len(self._sorted_nodes)) + "  ["
-      node_prefix += self._get_status_labels(
-          element_name,
-          handle_node_names,
-          intermediate_tensor_names,
-          override_names,
-          dirty_variable_names)
-
-      output.append_rich_line(node_prefix + "] " + element_name)
-
-    if verbose:
-      output.extend(self._node_status_label_legend())
-
-    return output
-
-  def _add_deprecation_warning(self, message):
-    """Add deprecation warning as RichTextLines."""
-    color = "yellow"
-    message.append_rich_line(
-        debugger_cli_common.RichLine(
-            "WARNING: the invoke_stepper feature of tfdbg has been deprecated ",
-            color))
-    message.append_rich_line(
-        debugger_cli_common.RichLine(
-            "and will be removed in the next release of TensorFlow.",
-            color))
-    message.append_rich_line(debugger_cli_common.RichLine("", color))
-    message.append_rich_line(
-        debugger_cli_common.RichLine(
-            "There now exist better alternatives of stepping debugging, "
-            "including:",
-            color))
-    message.append_rich_line(
-        debugger_cli_common.RichLine("- TensorBoard Debugger Plugin", color))
-    message.append_rich_line(
-        debugger_cli_common.RichLine("- Eager Execution", color))
-    message.append_rich_line(debugger_cli_common.RichLine("", color))
-
-  def _get_status_labels(self,
-                         element_name,
-                         handle_node_names,
-                         intermediate_tensor_names,
-                         override_names,
-                         dirty_variable_names):
-    """Get a string of status labels for a graph element.
-
-    A status label indicates that a node has a certain state in this
-    node-stepper CLI invocation. For example, 1) that the node has been
-    continued-to and a handle to its output tensor is available to the node
-    stepper; 2) the node is a Variable and its value has been altered, e.g.,
-    by continuing to a variable-updating node, since the beginning of this
-    node-stepper invocation (i.e., "dirty variable").
-
-    Args:
-      element_name: (str) name of the graph element.
-      handle_node_names: (list of str) Names of the nodes of which the output
-        tensors' handles are available.
-      intermediate_tensor_names: (list of str) Names of the intermediate tensor
-        dumps generated from the graph element.
-      override_names: (list of str) Names of the tensors of which the values
-        are overridden.
-      dirty_variable_names: (list of str) Names of the dirty variables.
-
-    Returns:
-      (RichLine) The rich text string of status labels that currently apply to
-        the graph element.
-    """
-
-    status = RL()
-
-    node_name = element_name.split(":")[0]
-    status += (RL(self.STATE_IS_PLACEHOLDER,
-                  self._STATE_COLORS[self.STATE_IS_PLACEHOLDER])
-               if node_name in self._placeholders else " ")
-    status += (RL(self.STATE_UNFEEDABLE,
-                  self._STATE_COLORS[self.STATE_UNFEEDABLE])
-               if not self._node_stepper.is_feedable(str(element_name))
-               else " ")
-    status += (RL(self.STATE_CONT, self._STATE_COLORS[self.STATE_CONT])
-               if element_name in handle_node_names else " ")
-
-    intermediate_node_names = [
-        tensor_name.split(":")[0] for tensor_name in intermediate_tensor_names]
-    status += (RL(self.STATE_DUMPED_INTERMEDIATE,
-                  self._STATE_COLORS[self.STATE_DUMPED_INTERMEDIATE])
-               if element_name in intermediate_node_names else " ")
-
-    slots = self._node_stepper.output_slots_in_closure(element_name)
-    has_override = any(element_name + ":%d" % slot in override_names
-                       for slot in slots)
-    status += (RL(self.STATE_OVERRIDDEN,
-                  self._STATE_COLORS[self.STATE_OVERRIDDEN])
-               if has_override else " ")
-    status += (RL(self.STATE_DIRTY_VARIABLE,
-                  self._STATE_COLORS[self.STATE_DIRTY_VARIABLE])
-               if element_name in dirty_variable_names else " ")
-
-    return status
-
-  def _node_status_label_legend(self):
-    """Get legend for node-status labels.
-
-    Returns:
-      (debugger_cli_common.RichTextLines) Legend text.
-    """
-
-    return debugger_cli_common.rich_text_lines_from_rich_line_list([
-        "",
-        "Legend:",
-        (RL("  ") +
-         RL(self.STATE_IS_PLACEHOLDER,
-            self._STATE_COLORS[self.STATE_IS_PLACEHOLDER]) +
-         " - Placeholder"),
-        (RL("  ") +
-         RL(self.STATE_UNFEEDABLE,
-            self._STATE_COLORS[self.STATE_UNFEEDABLE]) +
-         " - Unfeedable"),
-        (RL("  ") +
-         RL(self.STATE_CONT,
-            self._STATE_COLORS[self.STATE_CONT]) +
-         " - Already continued-to; Tensor handle available from output "
-         "slot(s)"),
-        (RL("  ") +
-         RL(self.STATE_DUMPED_INTERMEDIATE,
-            self._STATE_COLORS[self.STATE_DUMPED_INTERMEDIATE]) +
-         " - Unfeedable"),
-        (RL("  ") +
-         RL(self.STATE_OVERRIDDEN,
-            self._STATE_COLORS[self.STATE_OVERRIDDEN]) +
-         " - Has overriding (injected) tensor value"),
-        (RL("  ") +
-         RL(self.STATE_DIRTY_VARIABLE,
-            self._STATE_COLORS[self.STATE_DIRTY_VARIABLE]) +
-         " - Dirty variable: Variable already updated this node stepper.")])
-
-  def cont(self, args, screen_info=None):
-    """Continue-to action on the graph."""
-
-    _ = screen_info
-
-    parsed = self.arg_parsers["cont"].parse_args(args)
-
-    # Determine which node is being continued to, so the _next pointer can be
-    # set properly.
-    node_name = parsed.target_name.split(":")[0]
-    if node_name not in self._sorted_nodes:
-      return cli_shared.error(self._MESSAGE_TEMPLATES["NOT_IN_CLOSURE"] %
-                              parsed.target_name)
-    self._next = self._sorted_nodes.index(node_name)
-
-    cont_result = self._node_stepper.cont(
-        parsed.target_name,
-        invalidate_from_updated_variables=(
-            parsed.invalidate_from_updated_variables),
-        restore_variable_values=parsed.restore_variable_values)
-    self._completed_nodes.add(parsed.target_name.split(":")[0])
-
-    screen_output = debugger_cli_common.RichTextLines(
-        ["Continued to %s:" % parsed.target_name, ""])
-    screen_output.extend(self._report_last_feed_types())
-    screen_output.extend(self._report_last_updated())
-    screen_output.extend(
-        tensor_format.format_tensor(
-            cont_result, parsed.target_name, include_metadata=True))
-
-    # Generate windowed view of the sorted transitive closure on which the
-    # stepping is occurring.
-    lower_bound = max(0, self._next - 2)
-    upper_bound = min(len(self._sorted_nodes), self._next + 3)
-
-    final_output = self.list_sorted_nodes(
-        ["-l", str(lower_bound), "-u", str(upper_bound)])
-    final_output.extend(debugger_cli_common.RichTextLines([""]))
-    final_output.extend(screen_output)
-
-    # Re-calculate the target of the next "step" action.
-    self._calculate_next()
-
-    return final_output
-
-  def _report_last_feed_types(self):
-    """Generate a report of the feed types used in the cont/step call.
-
-    Returns:
-      (debugger_cli_common.RichTextLines) A RichTextLines representation of the
-        feeds used in the last cont/step call.
-    """
-    feed_types = self._node_stepper.last_feed_types()
-
-    out = ["Stepper used feeds:"]
-    if feed_types:
-      for feed_name in feed_types:
-        feed_info = RL("  %s : " % feed_name)
-        feed_info += RL(feed_types[feed_name],
-                        self._FEED_COLORS[feed_types[feed_name]])
-        out.append(feed_info)
-    else:
-      out.append("  (No feeds)")
-    out.append("")
-
-    return debugger_cli_common.rich_text_lines_from_rich_line_list(out)
-
-  def _report_last_updated(self):
-    """Generate a report of the variables updated in the last cont/step call.
-
-    Returns:
-      (debugger_cli_common.RichTextLines) A RichTextLines representation of the
-        variables updated in the last cont/step call.
-    """
-
-    last_updated = self._node_stepper.last_updated()
-    if not last_updated:
-      return debugger_cli_common.RichTextLines([])
-
-    rich_lines = [RL("Updated:", self._UPDATED_ATTRIBUTE)]
-    sorted_last_updated = sorted(list(last_updated))
-    for updated in sorted_last_updated:
-      rich_lines.append("  %s" % updated)
-    rich_lines.append("")
-    return debugger_cli_common.rich_text_lines_from_rich_line_list(rich_lines)
-
-  def step(self, args, screen_info=None):
-    """Step once.
-
-    Args:
-      args: (list of str) command-line arguments for the "step" command.
-      screen_info: Information about screen.
-
-    Returns:
-      (RichTextLines) Screen output for the result of the stepping action.
-    """
-
-    parsed = self.arg_parsers["step"].parse_args(args)
-
-    if parsed.num_times < 0:
-      return debugger_cli_common.RichTextLines(
-          "ERROR: Invalid number of times to step: %d" % parsed.num_times)
-
-    for _ in xrange(parsed.num_times):
-      if self._next >= len(self._sorted_nodes):
-        return debugger_cli_common.RichTextLines(
-            "ERROR: Cannot step any further because the end of the sorted "
-            "transitive closure has been reached.")
-      else:
-        screen_output = self.cont([self._sorted_nodes[self._next]], screen_info)
-
-    return screen_output
-
-  def print_tensor(self, args, screen_info=None):
-    """Print the value of a tensor that the stepper has access to."""
-
-    parsed = self.arg_parsers["print_tensor"].parse_args(args)
-
-    if screen_info and "cols" in screen_info:
-      np_printoptions = {"linewidth": screen_info["cols"]}
-    else:
-      np_printoptions = {}
-
-    # Determine if any range-highlighting is required.
-    highlight_options = cli_shared.parse_ranges_highlight(parsed.ranges)
-
-    tensor_name, tensor_slicing = (
-        command_parser.parse_tensor_name_with_slicing(parsed.tensor_name))
-
-    tensor_names = self._resolve_tensor_names(tensor_name)
-    if not tensor_names:
-      return cli_shared.error(
-          self._MESSAGE_TEMPLATES["NOT_IN_CLOSURE"] % tensor_name)
-    elif len(tensor_names) > 1:
-      return cli_shared.error(
-          self._MESSAGE_TEMPLATES["MULTIPLE_TENSORS"] % tensor_name)
-    else:
-      tensor_name = tensor_names[0]
-
-    try:
-      tensor_value = self._node_stepper.get_tensor_value(tensor_name)
-    except ValueError as e:
-      return debugger_cli_common.RichTextLines([str(e)])
-
-    return cli_shared.format_tensor(
-        tensor_value,
-        tensor_name,
-        np_printoptions,
-        print_all=parsed.print_all,
-        tensor_slicing=tensor_slicing,
-        highlight_options=highlight_options)
-
-  def inject_value(self, args, screen_info=None):
-    """Inject value to a given tensor.
-
-    Args:
-      args: (list of str) command-line arguments for the "step" command.
-      screen_info: Information about screen.
-
-    Returns:
-      (RichTextLines) Screen output for the result of the stepping action.
-    """
-
-    _ = screen_info  # Currently unused.
-
-    if screen_info and "cols" in screen_info:
-      np_printoptions = {"linewidth": screen_info["cols"]}
-    else:
-      np_printoptions = {}
-
-    parsed = self.arg_parsers["inject_value"].parse_args(args)
-
-    tensor_names = self._resolve_tensor_names(parsed.tensor_name)
-    if not tensor_names:
-      return cli_shared.error(
-          self._MESSAGE_TEMPLATES["NOT_IN_CLOSURE"] % parsed.tensor_name)
-    elif len(tensor_names) > 1:
-      return cli_shared.error(
-          self._MESSAGE_TEMPLATES["MULTIPLE_TENSORS"] % parsed.tensor_name)
-    else:
-      tensor_name = tensor_names[0]
-
-    tensor_value = eval(parsed.tensor_value_str)  # pylint: disable=eval-used
-
-    try:
-      self._node_stepper.override_tensor(tensor_name, tensor_value)
-      lines = [
-          "Injected value \"%s\"" % parsed.tensor_value_str,
-          "  to tensor \"%s\":" % tensor_name, ""
-      ]
-
-      tensor_lines = tensor_format.format_tensor(
-          tensor_value,
-          tensor_name,
-          include_metadata=True,
-          np_printoptions=np_printoptions).lines
-      lines.extend(tensor_lines)
-
-    except ValueError:
-      lines = [
-          "ERROR: Failed to inject value to tensor %s" % parsed.tensor_name
-      ]
-
-    return debugger_cli_common.RichTextLines(lines)
-
-  # TODO(cais): Implement list_inputs
-  # TODO(cais): Implement list_outputs
-  # TODO(cais): Implement node_info
-
-  def _resolve_tensor_names(self, element_name):
-    """Resolve tensor name from graph element name.
-
-    Args:
-      element_name: (str) Name of the graph element to resolve.
-
-    Returns:
-      (list) Name of the tensor(s). If element_name is the name of a tensor in
-      the transitive closure, return [element_name]. If element_name is the
-      name of a node in the transitive closure, return the list of output
-      tensors from the node that are in the transitive closure. Otherwise,
-      return empty list.
-    """
-
-    if element_name in self._closure_elements and ":" in element_name:
-      return [element_name]
-    if (element_name in self._sorted_nodes or
-        (element_name in self._closure_elements and ":" not in element_name)):
-      slots = self._node_stepper.output_slots_in_closure(element_name)
-      return [(element_name + ":%d" % slot) for slot in slots]
-    else:
-      return []
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
deleted file mode 100644
index c728373..0000000
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ /dev/null
@@ -1,533 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests of the Stepper CLI Backend."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.debug.cli import stepper_cli
-from tensorflow.python.debug.lib import stepper
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import gradient_descent
-
-# Regex pattern for a node line in the stepper CLI output.
-NODE_LINE_PATTERN = re.compile(r".*\(.*\).*\[.*\].*")
-
-
-def _parse_sorted_nodes_list(lines):
-  """Parsed a list of lines to extract the node list.
-
-  Args:
-    lines: (list of str) Lines from which the node list and associated
-      information will be extracted.
-
-  Returns:
-    (list of str) The list of node names.
-    (list of str) The list of status labels.
-    (int) 0-based index among the nodes for the node pointed by the next-node
-      pointer. If no such node exists, -1.
-  """
-
-  node_names = []
-  status_labels = []
-  node_pointer = -1
-
-  node_line_counter = 0
-  for line in lines:
-    if NODE_LINE_PATTERN.match(line):
-      node_names.append(line.split(" ")[-1])
-
-      idx_left_bracket = line.index("[")
-      idx_right_bracket = line.index("]")
-      status_labels.append(line[idx_left_bracket + 1:idx_right_bracket])
-      if line.strip().startswith(
-          stepper_cli.NodeStepperCLI.NEXT_NODE_POINTER_STR):
-        node_pointer = node_line_counter
-
-      node_line_counter += 1
-
-  return node_names, status_labels, node_pointer
-
-
-def _parsed_used_feeds(lines):
-  feed_types = {}
-
-  begin_line = -1
-  for i, line in enumerate(lines):
-    if line.startswith("Stepper used feeds:"):
-      begin_line = i + 1
-      break
-
-  if begin_line == -1:
-    return feed_types
-
-  for line in lines[begin_line:]:
-    line = line.strip()
-    if not line:
-      return feed_types
-    else:
-      feed_name = line.split(" : ")[0].strip()
-      feed_type = line.split(" : ")[1].strip()
-      feed_types[feed_name] = feed_type
-
-
-def _parse_updated(lines):
-  """Parse the Updated section in the output text lines.
-
-  Args:
-    lines: (list of str) The output text lines to be parsed.
-
-  Returns:
-    If the Updated section does not exist, returns None.
-    Otherwise, returns the Tensor names included in the section.
-  """
-  updated = None
-
-  begin_line = -1
-  for i, line in enumerate(lines):
-    if line.startswith("Updated:"):
-      updated = []
-      begin_line = i + 1
-      break
-
-  if begin_line == -1:
-    return updated
-
-  for line in lines[begin_line:]:
-    line = line.strip()
-    if not line:
-      return updated
-    else:
-      updated.append(line.strip())
-
-  return updated
-
-
-@test_util.run_v1_only("b/120545219")
-class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.a = variables.VariableV1(10.0, name="a")
-    self.b = variables.VariableV1(20.0, name="b")
-
-    self.c = math_ops.add(self.a, self.b, name="c")  # Should be 30.0.
-    self.d = math_ops.subtract(self.a, self.c, name="d")  # Should be -20.0.
-    self.e = math_ops.multiply(self.c, self.d, name="e")  # Should be -600.0.
-
-    self.ph = array_ops.placeholder(dtypes.float32, shape=(2, 2), name="ph")
-    self.f = math_ops.multiply(self.e, self.ph, name="f")
-
-    self.opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
-        self.e, name="opt")
-
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True)
-    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-    config = config_pb2.ConfigProto(graph_options=graph_options)
-    self.sess = session.Session(config=config)
-
-    self.sess.run(self.a.initializer)
-    self.sess.run(self.b.initializer)
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  def _assert_nodes_topologically_sorted_with_target_e(self, node_names):
-    """Check the topologically sorted order of the node names."""
-
-    self.assertGreaterEqual(len(node_names), 7)
-    self.assertLess(node_names.index("a"), node_names.index("a/read"))
-    self.assertLess(node_names.index("b"), node_names.index("b/read"))
-    self.assertLess(node_names.index("a/read"), node_names.index("c"))
-    self.assertLess(node_names.index("b/read"), node_names.index("c"))
-    self.assertLess(node_names.index("a/read"), node_names.index("d"))
-    self.assertLess(node_names.index("c"), node_names.index("d"))
-    self.assertLess(node_names.index("c"), node_names.index("e"))
-    self.assertLess(node_names.index("d"), node_names.index("e"))
-
-  def _assert_nodes_topologically_sorted_with_target_f(self, node_names):
-    self._assert_nodes_topologically_sorted_with_target_e(node_names)
-
-    self.assertGreaterEqual(len(node_names), 9)
-    self.assertLess(node_names.index("ph"), node_names.index("f"))
-    self.assertLess(node_names.index("e"), node_names.index("f"))
-
-  def testListingSortedNodesPresentsTransitveClosure(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      self._assert_nodes_topologically_sorted_with_target_e(node_names)
-      self.assertEqual(len(node_names), len(stat_labels))
-      for stat_label in stat_labels:
-        self.assertEqual("      ", stat_label)
-      self.assertEqual(0, node_pointer)
-
-  def testListingSortedNodesLabelsPlaceholders(self):
-    with stepper.NodeStepper(self.sess, self.f) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      self._assert_nodes_topologically_sorted_with_target_f(node_names)
-
-      index_ph = node_names.index("ph")
-      self.assertEqual(len(node_names), len(stat_labels))
-      for i in xrange(len(stat_labels)):
-        if index_ph == i:
-          self.assertIn(stepper_cli.NodeStepperCLI.STATE_IS_PLACEHOLDER,
-                        stat_labels[i])
-        else:
-          self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_IS_PLACEHOLDER,
-                           stat_labels[i])
-
-      self.assertEqual(0, node_pointer)
-
-  def testContToNonexistentNodeShouldError(self):
-    with stepper.NodeStepper(self.sess, self.f) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.cont(["foobar"])
-      self.assertEqual([
-          "ERROR: foobar is not in the transitive closure of this stepper "
-          "instance."
-      ], output.lines)
-
-  def testContToNodeOutsideTransitiveClosureShouldError(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.cont(["f"])
-      self.assertEqual([
-          "ERROR: f is not in the transitive closure of this stepper "
-          "instance."
-      ], output.lines)
-
-  def testContToValidNodeShouldUpdateStatus(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      index_c = node_names.index("c")
-      self.assertEqual("      ", stat_labels[index_c])
-      self.assertEqual(0, node_pointer)
-
-      output = cli.cont("c")
-      self.assertIsNone(_parse_updated(output.lines))
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      self.assertGreaterEqual(len(node_names), 3)
-      self.assertIn("c", node_names)
-      index_c = node_names.index("c")
-      self.assertEqual(index_c, node_pointer)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[index_c])
-
-      output = cli.cont("d")
-      self.assertIsNone(_parse_updated(output.lines))
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      used_feed_types = _parsed_used_feeds(output.lines)
-      self.assertEqual({
-          "c:0": stepper.NodeStepper.FEED_TYPE_HANDLE,
-          "a/read:0": stepper.NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, used_feed_types)
-
-      self.assertGreaterEqual(len(node_names), 3)
-      self.assertIn("d", node_names)
-      index_d = node_names.index("d")
-      self.assertEqual(index_d, node_pointer)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[index_d])
-
-  def testSteppingOneStepAtATimeShouldUpdateStatus(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.list_sorted_nodes([])
-      orig_node_names, _, node_pointer = _parse_sorted_nodes_list(output.lines)
-      self.assertEqual(0, node_pointer)
-
-      for i in xrange(len(orig_node_names)):
-        output = cli.step([])
-        node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-            output.lines)
-
-        next_node_name = node_names[node_pointer]
-        self.assertEqual(orig_node_names[i], next_node_name)
-
-        self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT,
-                      stat_labels[node_pointer])
-
-        # The order in which the nodes are listed should not change as the
-        # stepping happens.
-        output = cli.list_sorted_nodes([])
-        node_names, _, node_pointer = _parse_sorted_nodes_list(output.lines)
-        self.assertEqual(orig_node_names, node_names)
-
-        if i < len(orig_node_names) - 1:
-          self.assertEqual(i + 1, node_pointer)
-        else:
-          # Stepped over the limit. Pointer should be at -1.
-          self.assertEqual(-1, node_pointer)
-
-      # Attempt to step once more after the end has been reached should error
-      # out.
-      output = cli.step([])
-      self.assertEqual([
-          "ERROR: Cannot step any further because the end of the sorted "
-          "transitive closure has been reached."
-      ], output.lines)
-
-  def testSteppingMultipleStepsUpdatesStatus(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.list_sorted_nodes([])
-      orig_node_names, _, _ = _parse_sorted_nodes_list(output.lines)
-
-      output = cli.step(["-t", "3"])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      self.assertEqual(orig_node_names[2], node_names[node_pointer])
-
-      for i in xrange(node_pointer):
-        self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[i])
-
-      for i in xrange(node_pointer + 1, len(stat_labels)):
-        self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[i])
-
-  def testContToNodeWithoutOutputTensorInClosureShowsNoHandleCached(self):
-    with stepper.NodeStepper(self.sess, self.opt) as node_stepper:
-      sorted_nodes = node_stepper.sorted_nodes()
-      closure_elements = node_stepper.closure_elements()
-
-      # Find a node which is in the list of sorted nodes, but whose output
-      # Tensor is not in the transitive closure.
-      no_output_node = None
-      for node in sorted_nodes:
-        if (node + ":0" not in closure_elements and
-            node + ":1" not in closure_elements):
-          no_output_node = node
-          break
-
-      self.assertIsNotNone(no_output_node)
-
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-      output = cli.cont([no_output_node])
-      self.assertIsNone(_parse_updated(output.lines))
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      self.assertEqual(no_output_node, node_names[node_pointer])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_CONT,
-                       stat_labels[node_pointer])
-
-  def testContToUpdateNodeWithTrackingLeadsToDirtyVariableLabel(self):
-    with stepper.NodeStepper(self.sess, self.opt) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-      output = cli.cont(["opt/update_b/ApplyGradientDescent", "-i"])
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, _ = _parse_sorted_nodes_list(output.lines)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                    stat_labels[node_names.index("b")])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                       stat_labels[node_names.index("a")])
-
-  def testContToUpdateNodeWithoutTrackingLeadsToNoDirtyVariableLabel(self):
-    with stepper.NodeStepper(self.sess, self.opt) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-      output = cli.cont(["opt/update_b/ApplyGradientDescent"])
-
-      self.assertItemsEqual([self.b.name], _parse_updated(output.lines))
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, _ = _parse_sorted_nodes_list(output.lines)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                    stat_labels[node_names.index("b")])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                       stat_labels[node_names.index("a")])
-
-  def testContWithRestoreVariablesOptionShouldRestoreVariableValue(self):
-    with stepper.NodeStepper(self.sess, self.opt) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-      output = cli.cont(["opt/update_a/ApplyGradientDescent",
-                         "--invalidate_from_updated_variables"])
-
-      self.assertItemsEqual([self.a.name], _parse_updated(output.lines))
-
-      # After cont() call on .../update_a/..., Variable a should have been
-      # marked as dirty, whereas b should not have.
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, _ = _parse_sorted_nodes_list(output.lines)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                    stat_labels[node_names.index("a")])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                       stat_labels[node_names.index("b")])
-
-      output = cli.cont(["opt/update_b/ApplyGradientDescent", "-r", "-i"])
-
-      self.assertItemsEqual([self.b.name], _parse_updated(output.lines))
-
-      # After cont() call on .../update_b/... with the -r flag, Variable b
-      # should have been marked as dirty, whereas Variable a should not be
-      # because it should have been restored.
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, _ = _parse_sorted_nodes_list(output.lines)
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                    stat_labels[node_names.index("b")])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_DIRTY_VARIABLE,
-                       stat_labels[node_names.index("a")])
-
-  def testPrintTensorShouldWorkWithTensorName(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      cli.cont("d")
-      output = cli.print_tensor(["d:0"])
-
-      self.assertEqual("Tensor \"d:0\":", output.lines[0])
-      self.assertEqual("-20.0", output.lines[-1])
-
-  def testPrintTensorShouldWorkWithNodeNameWithOutputTensor(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      cli.cont("d")
-      output = cli.print_tensor(["d"])
-
-      self.assertEqual("Tensor \"d:0\":", output.lines[0])
-      self.assertEqual("-20.0", output.lines[-1])
-
-  def testPrintTensorShouldWorkSlicingString(self):
-    ph_value = np.array([[1.0, 0.0], [0.0, 2.0]])
-    with stepper.NodeStepper(
-        self.sess, self.f, feed_dict={self.ph: ph_value}) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.print_tensor(["ph:0[:, 1]"])
-      self.assertEqual("Tensor \"ph:0[:, 1]\":", output.lines[0])
-      self.assertEqual(repr(ph_value[:, 1]), output.lines[-1])
-
-      output = cli.print_tensor(["ph[:, 1]"])
-      self.assertEqual("Tensor \"ph:0[:, 1]\":", output.lines[0])
-      self.assertEqual(repr(ph_value[:, 1]), output.lines[-1])
-
-  def testPrintTensorWithNonexistentTensorShouldError(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.print_tensor(["foobar"])
-      self.assertEqual([
-          "ERROR: foobar is not in the transitive closure of this stepper "
-          "instance."
-      ], output.lines)
-
-  def testPrintTensorWithNoHandleShouldError(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.print_tensor("e")
-      self.assertEqual([
-          "This stepper instance does not have access to the value of tensor "
-          "\"e:0\""
-      ], output.lines)
-
-  def testInjectTensorValueByTensorNameShouldBeReflected(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.cont(["d"])
-      node_names, _, node_pointer = _parse_sorted_nodes_list(output.lines)
-      self.assertEqual("d", node_names[node_pointer])
-
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      index_d = node_names.index("d")
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[index_d])
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_OVERRIDDEN,
-                       stat_labels[index_d])
-
-      self.assertAllClose(-20.0, node_stepper.get_tensor_value("d:0"))
-
-      output = cli.inject_value(["d:0", "20.0"])
-
-      # Verify that the override is available.
-      self.assertEqual(["d:0"], node_stepper.override_names())
-
-      # Verify that the list of sorted nodes reflects the existence of the value
-      # override (i.e., injection).
-      output = cli.list_sorted_nodes([])
-      node_names, stat_labels, node_pointer = _parse_sorted_nodes_list(
-          output.lines)
-
-      index_d = node_names.index("d")
-      self.assertNotIn(stepper_cli.NodeStepperCLI.STATE_CONT,
-                       stat_labels[index_d])
-      self.assertIn(stepper_cli.NodeStepperCLI.STATE_OVERRIDDEN,
-                    stat_labels[index_d])
-
-  def testInjectTensorValueByNodeNameShouldBeReflected(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      cli.inject_value(["d", "20.0"])
-      self.assertEqual(["d:0"], node_stepper.override_names())
-
-  def testInjectToNonexistentTensorShouldError(self):
-    with stepper.NodeStepper(self.sess, self.e) as node_stepper:
-      cli = stepper_cli.NodeStepperCLI(node_stepper)
-
-      output = cli.inject_value(["foobar:0", "20.0"])
-      self.assertEqual([
-          "ERROR: foobar:0 is not in the transitive closure of this stepper "
-          "instance."
-      ], output.lines)
-
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/debug/lib/stepper.py b/tensorflow/python/debug/lib/stepper.py
deleted file mode 100644
index 7b1e525..0000000
--- a/tensorflow/python/debug/lib/stepper.py
+++ /dev/null
@@ -1,966 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorFlow Debugger (tfdbg) Stepper Module."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import copy
-import os
-import shutil
-import tempfile
-import time
-
-import six
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.debug.lib import debug_data
-from tensorflow.python.debug.lib import debug_graphs
-from tensorflow.python.debug.lib import debug_utils
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import session_ops
-
-
-# TODO(cais): Use nest.flatten once it handles nest Dicts correctly.
-def _flatten_fetches(fetches):
-  """Flatten list, tuple of fetches, or a single fetch into a list of fetches.
-
-  Args:
-    fetches: The fetches to flatten: Can be a single Tensor, Op, or a
-      potentially nested list, tuple or dict of such individual fetches.
-
-  Returns:
-    The fetches flattened to a list.
-  """
-
-  flattened = []
-  if isinstance(fetches, (list, tuple)):
-    for fetch in fetches:
-      flattened.extend(_flatten_fetches(fetch))
-  elif isinstance(fetches, dict):
-    for key in fetches:
-      flattened.extend(_flatten_fetches(fetches[key]))
-  else:
-    flattened.append(fetches)
-
-  return flattened
-
-
-class NodeStepper(object):
-  """TensorFlow Debugger (tfdbg) stepper.
-
-  The stepper provides ability to perform "continue to" actions on a graph,
-  given fetch and feeds. The stepper calculates the transitive closure of the
-  fetch. cont() (continue to) calls can only be performed on members of the
-  transitive closure.
-
-  On a cont() call, the stepper performs depth-first tracing of the input
-  tree of the target. When it reaches an input where one of the following is
-  available, it will supply the available value to the feed_dict of the cont()
-  call:
-    (1) Overriding (injected) values from the client.
-    (2) TensorHandles from previous cont() calls.
-    (3) Dumped intermediate Tensors from previous cont() calls.
-    (4) Feeds supplied during the construction of the stepper instance.
-
-  During the cont() call, intermediate Tensors are dumped to temporary
-  directories. The dumped Tensor values will be used in subsequent cont() calls
-  when they are required as data dependencies.
-
-  The temporary directories are automatically clean when the NodeStepper
-  instance exits as a context manager.
-
-  Once the tracing is complete, it will issue a run() call on the
-  underlying session, using the aforementioned feed_dict prepared by the input
-  tracing, to achieve the "continue-to" action. The above process takes into
-  account whether the transitive closure of an input contains Variables that
-  are updated during previous cont() calls on this stepper instance. If such
-  updates exist, we say the transitive closure is "dirty" and the stepper
-  can restore the "clean" state of the Variable and avoid using the
-  TensorHandle.
-
-  Example of basic usage:
-    a = tf.Variable(1.0, name="a")
-    b = tf.Variable(2.0, anme="b")
-    c = tf.add(a, b, name="c")
-    d = tf.multiply(a, c, name="d")
-
-    sess = tf.compat.v1.Session()
-    sess.run(tf.initialize_all_varialbes())
-    stepper = NodeStepper(sess, d)
-
-    stepper.cont(c)  # Caches the handle to Tensor c:0.
-    stepper.cont(d)  # Uses handle to Tensor c:0, avoiding recomputing c.
-  """
-
-  # Possible types of feed used during cont() calls.
-  FEED_TYPE_CLIENT = "client"
-  FEED_TYPE_HANDLE = "handle"
-  FEED_TYPE_OVERRIDE = "override"
-  FEED_TYPE_DUMPED_INTERMEDIATE = "dumped_intermediate"
-
-  def __init__(self, sess, fetches, feed_dict=None):
-    """Constructor for Debugger.
-
-    Args:
-      sess: (Session) the TensorFlow Session to step in.
-      fetches: Same as the fetches input argument to `Session.run()`.
-      feed_dict: Same as the feed_dict input argument to `Session.run()`.
-    """
-
-    self._sess = sess
-
-    self._fetches = fetches
-    flattened_fetches = _flatten_fetches(fetches)
-
-    self._fetch_names, self._fetch_list = self._get_fetch_and_name_lists(
-        flattened_fetches)
-
-    # A map from Variable name to initializer op.
-    self._variable_initializers = {}
-
-    # A map from Variable name to initial value, used when overriding or
-    # restoring Variable values.
-    self._variable_initial_values = {}
-
-    # Initialize the map for output recipients (targets).
-    self._output_targets = {}
-
-    # Sorted transitive closure of the fetched node.
-    # We also collect the list of the names of the reference-type Tensors,
-    # because we later need to avoid using intermediate dumps for such Tensors.
-    (self._sorted_nodes,
-     self._closure_elements,
-     self._ref_tensor_names) = self._dfs_visit(self._sess.graph,
-                                               self._fetch_list)
-
-    self._transitive_closure_set = set(self._sorted_nodes)
-
-    # A map from Variable name to the old values (before any cont() calls).
-    self._cached_variable_values = {}
-
-    # A cache map from tensor name to what variables may invalidate the tensor
-    self._cached_invalidation_path = {}
-
-    # Keep track of which variables are in a dirty state.
-    self._dirty_variables = set()
-
-    # Variables updated in the last cont() call.
-    self._last_updated = None
-
-    # Cached tensor handles: a dict with keys as tensor names and values as
-    # tensor handles.
-    self._tensor_handles = {}
-
-    # Cached intermediate tensor values: a dict mapping tensor names to
-    # DebugTensorDatum.
-    self._dumped_intermediate_tensors = {}
-    self._dump_session_root = tempfile.mkdtemp(prefix="tfdbg_stepper_")
-
-    # Feed dict from the client.
-    self._client_feed_dict = {}
-    if feed_dict:
-      for key in feed_dict:
-        if isinstance(key, ops.Tensor):
-          self._client_feed_dict[key.name] = feed_dict[key]
-        else:
-          self._client_feed_dict[key] = feed_dict[key]
-
-    # Overriding tensor values.
-    self._override_tensors = {}
-
-    # What the feed types were used by the last cont() call.
-    self._last_feed_types = {}
-
-  def __enter__(self):
-    return self
-
-  def __exit__(self, exc_type, exc_value, exc_traceback):
-    if os.path.isdir(self._dump_session_root):
-      shutil.rmtree(self._dump_session_root)
-
-  def _get_fetch_and_name_lists(self, flattened_fetches):
-    """Get the lists of fetches and their names.
-
-    Args:
-      flattened_fetches: A list of fetches or their names. Can mix fetches and
-        names.
-
-    Returns:
-      (list of str): A list of the names of the fetches.
-      (list): A list of the fetches.
-    """
-
-    fetch_names = []
-    fetch_list = []
-    for fetch in flattened_fetches:
-      if isinstance(fetch, six.string_types):
-        fetch_names.append(fetch)
-        fetch_list.append(self._sess.graph.as_graph_element(fetch))
-      else:
-        fetch_names.append(fetch.name)
-        fetch_list.append(fetch)
-
-    return fetch_names, fetch_list
-
-  def _dfs_visit(self, graph, elem_list):
-    """Trace back the input of a graph element, using depth-first search.
-
-    Uses non-recursive implementation to prevent stack overflow for deep
-    graphs.
-
-    Also performs the following action(s):
-      1) When encountering a Variable, obtain its initializer op, to
-         facilitate possible subsequent restoration / overriding of variable
-         value.
-
-    Args:
-      graph: A TF graph instance.
-      elem_list: list of graph elements: a Tensor or an Operation.
-
-    Returns:
-      (list of str) A topologically-sorted list of all nodes (not tensors)
-        in the transitive closure of elem_list. Obviously, the topological sort
-         is not unique in general. The return value here is just an arbitrary
-         one of potentially many possible topological sorts.
-      (list of str) A list of all graph elements (nodes and/or tensors) in the
-        transitive closure.
-    """
-
-    # These set should hold only strings, i.e, names of the nodes.
-    done = set()  # Keep track of visited graph elements.
-
-    # A list of str: Names of the topologically-sorted graph elements.
-    node_inputs = {}  # New: Input map of nodes in the transitive closure.
-
-    elem_stack = copy.copy(elem_list)
-
-    # Graph elements in the transitive closure, including the nodes and tensors.
-    closure_elements = [elem.name for elem in elem_list]
-
-    ref_tensor_names = set()
-    for element in elem_list:
-      if isinstance(element, ops.Tensor) and element.dtype._is_ref_dtype:  # pylint: disable=protected-access
-        ref_tensor_names.add(element.name)
-
-    while elem_stack:
-      curr_elem = elem_stack.pop()
-      curr_node = self._get_node(curr_elem)
-
-      done.add(curr_node.name)
-
-      non_control_inputs = [inp for inp in curr_node.inputs]
-      control_inputs = [inp for inp in curr_node.control_inputs]
-      all_inputs = set(non_control_inputs + control_inputs)
-
-      if curr_node.name not in node_inputs:
-        all_input_nodes = set()
-        for inp in all_inputs:
-          all_input_nodes.add(self._get_node(inp).name)
-        node_inputs[curr_node.name] = all_input_nodes
-
-      # Iterate through the (non-control) inputs.
-      for inp in all_inputs:
-        # Set up the non-control output map.
-        # if is_non_control_input:
-        if inp.name not in self._output_targets:
-          self._output_targets[inp.name] = set([curr_elem.name])
-        else:
-          self._output_targets[inp.name].add(curr_elem.name)
-
-        if (isinstance(inp, ops.Tensor) and
-            inp.op.type in ["Variable", "VariableV2"] and
-            inp.name not in self._variable_initializers):
-          # Obtain the initializer op of the variable, in case the Variable's
-          # value needs to be restored later.
-          initializer = graph.as_graph_element(inp.op.name + "/Assign")
-          self._variable_initializers[inp.name] = initializer
-          self._variable_initial_values[inp.name] = initializer.inputs[1]
-
-        inp_node = self._get_node(inp)
-        if inp_node.name in done:
-          # Already visited.
-          continue
-
-        elem_stack.append(inp)
-        closure_elements.append(inp.name)
-        if isinstance(inp, ops.Tensor) and inp.dtype._is_ref_dtype:  # pylint: disable=protected-access
-          ref_tensor_names.add(inp.name)
-
-    # Now that we have traversed the transitive closure and obtained the
-    # node-input map, we can topologically sort them.
-    sorted_nodes = []
-    stack = []
-    for node in node_inputs:
-      if not node_inputs[node]:
-        stack.append(node)
-    for node in stack:
-      del node_inputs[node]
-
-    while stack:
-      curr_node = stack.pop()
-      sorted_nodes.append(curr_node)
-
-      # Iterate through the node-input map and remove the child.
-      pushes = []
-      for node in node_inputs:
-        if curr_node in node_inputs[node]:
-          node_inputs[node].remove(curr_node)
-          if not node_inputs[node]:
-            pushes.append(node)
-
-      # Delete new pushes from node-input map.
-      for node in pushes:
-        del node_inputs[node]
-
-      stack.extend(pushes)
-
-    return sorted_nodes, closure_elements, ref_tensor_names
-
-  def sorted_nodes(self):
-    """Get a topologically-sorted list of node names of the stepper.
-
-    These are the names of the nodes (i.e., not Tensors) in the transitive
-    closure of the stepper, in a topologically-sorted order.
-
-    Returns:
-      (list of str): Sorted transitive inputs to the fetch of the stepper
-        instance. The fetch itself is included in the list.
-    """
-
-    return self._sorted_nodes
-
-  def closure_elements(self):
-    """Get a name list of the graph elements of the stepper.
-
-    Returns:
-      (list of str): names of the graph elements (i.e., nodes and tensors) in
-    the transitive closure of the stepper, in a random order.
-    """
-
-    return self._closure_elements
-
-  def output_slots_in_closure(self, node_name):
-    """Get the output tensors in the transitive closure from node.
-
-    Args:
-      node_name: (str) Name of the node in question.
-
-    Returns:
-      (list of int) Output slots of the output tensors of the node that are in
-        the transitive closure of the stepper.
-    """
-
-    node = self._sess.graph.as_graph_element(node_name)
-
-    tensor_slots = []
-    for i, _ in enumerate(node.outputs):
-      tensor_name = node_name + ":%d" % i
-      if tensor_name in self._closure_elements:
-        tensor_slots.append(i)
-
-    return tensor_slots
-
-  def is_feedable(self, name):
-    """Determine if a graph element if feedable.
-
-    Args:
-      name: (str) name of the graph element (Tensor or Operation)
-
-    Returns:
-      (bool) whether the graph element is feedable.
-    """
-
-    if not isinstance(name, six.string_types):
-      raise TypeError("Expected type str; got type %s" % type(name))
-
-    elem = self._sess.graph.as_graph_element(name)
-    return self._sess.graph.is_feedable(elem)
-
-  def override_tensor(self, tensor_name, overriding_val):
-    """Override the value of a tensor.
-
-    Args:
-      tensor_name: (str) Name of the tensor to override.
-      overriding_val: (numpy.ndarray) Overriding tensor value.
-
-    Raises:
-      ValueError: If tensor_name does not correspond to a tensor in the input
-        tree to the fetched graph element of this stepper instance.
-    """
-
-    if not isinstance(tensor_name, six.string_types):
-      raise TypeError("Expected type str; got type %s" % type(tensor_name))
-
-    node_name = self._get_node_name(tensor_name)
-    if node_name not in self._transitive_closure_set:
-      raise ValueError(
-          "Cannot override tensor \"%s\" because it does not exist in the "
-          "input tree to the fetch \"%s\"" %
-          (tensor_name, repr(self._fetch_names)))
-
-    self._override_tensors[tensor_name] = overriding_val
-
-    # Invalidate cache by tracing outputs.
-    self._invalidate_transitively_outgoing_cache(tensor_name)
-
-  def remove_override(self, tensor_name):
-    """Remove the overriding value on a tensor.
-
-    Args:
-      tensor_name: (str) name of the tensor to remove the overriding value
-        from.
-
-    Raises:
-      ValueError: If no overriding value exists for tensor_name.
-    """
-
-    if tensor_name not in self._override_tensors:
-      raise ValueError("No overriding value exists for tensor \"%s\"." %
-                       tensor_name)
-
-    del self._override_tensors[tensor_name]
-
-    # Invalidate cache by tracing outputs.
-    self._invalidate_transitively_outgoing_cache(tensor_name)
-
-  def last_feed_types(self):
-    """Obtain information about the feed in the last cont() call.
-
-    Returns:
-      (dict) A dict mapping tensor names to feed types.
-    """
-
-    return self._last_feed_types
-
-  def cont(self,
-           target,
-           use_tensor_handles=True,
-           use_dumped_intermediates=True,
-           use_overrides=True,
-           invalidate_from_updated_variables=False,
-           restore_variable_values=False):
-    """Continue till the completion of the specified target tensor.
-
-    Args:
-      target: A single fetched Tensor or Op, or a name (str) representing the
-        Tensor or Op. In the case of a name str, the graph will be searched
-        to find the corresponding Tensor or Op.
-        # TODO(cais): Support multiple fetches as in Session.run() interface.
-      use_tensor_handles: (bool) Whether this cont() run will use cached tensor
-        handles to avoid recomputation. Default: True.
-      use_dumped_intermediates: (bool) Whether this cont() call will use dumped
-        intermediate tensors to avoid recomputation.
-      use_overrides: (bool) Whether the overriding tensor values supplied by
-        the client are to be used in this cont() call. Default: True.
-      invalidate_from_updated_variables: (bool) Whether to invalidate the
-        tensor handles and intermediate tensor handles affected by the
-        Variable updates that happen in this cont() call.
-      restore_variable_values: (bool) Whether the old values of the variables
-        (before any cont() calls in this object) are to be restored.
-
-    Returns:
-      Value from Session.run() of the target.
-
-    Raises:
-      ValueError: If the target is specified as a string and the string does
-        not correspond to any tensors in the Session graph.
-        Or if the target of this cont() is not in the input list of the Stepper
-        object's target.
-        Or if target is a Placeholder.
-    """
-
-    self._last_feed_types = {}
-
-    if isinstance(target, six.string_types):
-      # Fetch target is a string. Assume it is the name of the Tensor or Op and
-      # will attempt to find it in the Session's graph.
-      target_name = target
-    else:
-      target_name = target.name
-
-    graph_element = self._sess.graph.as_graph_element(target_name)
-    # Any additional tensor handles to obtain in this cont() action.
-    additional_handle_requests = []
-
-    if (isinstance(graph_element, ops.Tensor) and
-        graph_element.op.type == "Placeholder"):
-      self._last_feed_types[graph_element.name] = self.FEED_TYPE_CLIENT
-      return self._client_feed_dict[graph_element.name]
-    elif (isinstance(graph_element, ops.Operation) and
-          graph_element.type == "Placeholder"):
-      tensor_name = graph_element.name + ":0"
-      self._last_feed_types[tensor_name] = self.FEED_TYPE_CLIENT
-      return self._client_feed_dict[tensor_name]
-
-    if isinstance(graph_element, ops.Operation) and graph_element.outputs:
-      # Check if this op has any output tensors that also fall into this
-      # stepper's transitive closure.
-      node_outputs = [
-          output.name for output in graph_element.outputs
-          if output.name in self._closure_elements
-      ]
-      if node_outputs:
-        # The target is an op with at least one output within the transitive
-        # closure. The cont() action will amount to using the 0-th
-        # output Tensor as the target, as well as obtaining handles to it
-        # and to the rest of the outputs tensors in the transitive closure
-        # (if any).
-        target_name = node_outputs[0]
-        additional_handle_requests = node_outputs[1:]
-
-    # Verify that the target is in the transitive closure of the stepper's
-    # fetch.
-    target_node_name = self._get_node_name(target_name)
-    if target_node_name not in self._transitive_closure_set:
-      raise ValueError(
-          "Target \"%s\" is not in the transitive closure for the fetch of the "
-          "stepper: \"%s\"." % (target_name, repr(self._fetch_names)))
-
-    # Check if a cached tensor handle can be used on the fetch directly.
-    if use_tensor_handles and target_name in self._tensor_handles:
-      self._last_feed_types[target_name] = self.FEED_TYPE_HANDLE
-      return self._tensor_handles[target_name].eval()
-
-    # Check if a dumped intermediate tensor can be used on the fetch directly.
-    if (use_dumped_intermediates and
-        target_name in self._dumped_intermediate_tensors):
-      self._last_feed_types[target_name] = self.FEED_TYPE_DUMPED_INTERMEDIATE
-      return self._dumped_intermediate_tensors[target_name].get_tensor()
-
-    # Check if an overriding tensor value can be used directly.
-    if use_overrides and target_name in self._override_tensors:
-      # Override is available. Return the value right away.
-      self._last_feed_types[target_name] = self.FEED_TYPE_OVERRIDE
-      return self._override_tensors[target_name]
-
-    # Keep track of which variables are restored in this cont() call.
-    restored_variables = set()
-
-    # Keep track of which variables are "touched" (i.e., possibly updated) in
-    # this cont() call.
-    self._last_updated = set()
-
-    # =========================================================================
-    # Use a non-recursive method to trace the inputs from the node and set up
-    # the feeds.
-    feeds = {}  # The feeds to be used in the Session.run() call.
-    fetched = self._sess.graph.as_graph_element(target_name)
-    elem_stack = [fetched]
-    done = set()
-
-    while elem_stack:
-      curr_elem = elem_stack.pop()
-      curr_node = self._get_node(curr_elem)
-
-      done.add(curr_node.name)
-
-      non_control_inputs = [inp for inp in curr_node.inputs]
-      control_inputs = [inp for inp in curr_node.control_inputs]
-      all_inputs = set(non_control_inputs + control_inputs)
-
-      # Iterate through the (non-control) inputs.
-      for inp in all_inputs:
-        # Determine whether the input is feedable. Reference-type tensors,
-        # e.g., Variables, should not be fed, because they can change.
-        if isinstance(inp, ops.Tensor):
-          is_inp_ref = inp.dtype._is_ref_dtype  # pylint: disable=protected-access
-          can_feed = self._sess.graph.is_feedable(inp) and not is_inp_ref
-        else:
-          is_inp_ref = False
-          can_feed = False
-
-        if (restore_variable_values and inp.name in self._dirty_variables and
-            inp.name not in restored_variables and
-            inp.name not in self._last_updated):
-          # Do not restore Variables touched or restored previously in this
-          # cont() call.
-          initializer_op = self._variable_initializers[inp.name]
-          initial_value_tensor = self._variable_initial_values[inp.name]
-          self._sess.run(initializer_op,
-                         feed_dict={
-                             initial_value_tensor:
-                                 self._cached_variable_values[inp.name]
-                         })
-
-          # Mark the variable as restored.
-          restored_variables.add(inp.name)
-
-        # Determine if this is a reference-type input from a variable, and
-        # the recipient node is not Identity. In that case, the Variable
-        # needs to be marked as dirty and its current value recorded, due to
-        # the fact that the receiving op may mutate the value of the Variable.
-        if (is_inp_ref and inp.op.type in ["Variable", "VariableV2"] and
-            curr_node.type != "Identity"):
-          # Mark the variable as dirty.
-          self._last_updated.add(inp.name)
-
-          # Obtain the old value of the variable and cache it.
-          if inp.name not in self._cached_variable_values:
-            old_value = self._sess.run(inp)
-            self._cached_variable_values[inp.name] = old_value
-
-        # N.B.: The order of the logical branches matters. For example,
-        # _client_feed_dict comes after _tensor_handles, so that tensor
-        # handles stored in cont() calls can override the original client
-        # feeds. Also for example, _override_tensors comes the first, so
-        # the manual overriding, if exists, can always take effect.
-        if use_overrides and can_feed and inp.name in self._override_tensors:
-          # Use client-supplied overriding tensor value.
-          feeds[inp] = self._override_tensors[inp.name]
-          self._last_feed_types[inp.name] = self.FEED_TYPE_OVERRIDE
-        elif (can_feed and inp not in feeds and
-              use_tensor_handles and inp.name in self._tensor_handles):
-          # Tensor handle found in cache.
-          feeds[inp] = self._tensor_handles[inp.name]
-          self._last_feed_types[inp.name] = self.FEED_TYPE_HANDLE
-        elif (can_feed and inp not in feeds and
-              use_dumped_intermediates and
-              inp.name in self._dumped_intermediate_tensors):
-          # Dumped intermediate Tensor found.
-          feeds[inp] = self._dumped_intermediate_tensors[inp.name].get_tensor()
-          self._last_feed_types[inp.name] = self.FEED_TYPE_DUMPED_INTERMEDIATE
-        elif inp.name in self._client_feed_dict:
-          # This input is available in the client feed_dict.
-          feeds[inp] = self._client_feed_dict[inp.name]
-          self._last_feed_types[inp.name] = self.FEED_TYPE_CLIENT
-        else:
-          # There is no feed available for this input. So keep tracing its
-          # input(s).
-          inp_node = self._get_node(inp)
-          if inp_node.name in done:
-            # Already visited.
-            continue
-
-          elem_stack.append(inp)
-          done.add(inp_node.name)
-
-    # =========================================================================
-
-    if self._last_updated:
-      self._dirty_variables.update(self._last_updated)
-
-    for variable in restored_variables:
-      self._dirty_variables.remove(variable)
-
-    (dump_path,
-     run_options) = self._prepare_cont_call_dump_path_and_run_options()
-    if isinstance(fetched, ops.Operation):
-      # The fetched is an Operation: Will not get tensor handle.
-      self._sess.run(fetched, feed_dict=feeds, options=run_options)
-      return_value = None
-    else:
-      # This is a Tensor: Will get tensor handle and cache it.
-      # Will also get the additional requested tensor handles (if any).
-      tensors_to_get_handles_for = [fetched]
-      handle_names = [target_name]
-
-      tensors_to_get_handles_for.extend([
-          self._sess.graph.as_graph_element(h)
-          for h in additional_handle_requests
-      ])
-      handle_names.extend(additional_handle_requests)
-
-      handles = self._sess.run(
-          [session_ops.get_session_handle(tensor) for tensor in
-           tensors_to_get_handles_for],
-          feed_dict=feeds,
-          options=run_options)
-      for handle_name, handle in zip(handle_names, handles):
-        self._tensor_handles[handle_name] = handle
-
-      return_value = self._tensor_handles[target_name].eval()
-
-    self._load_dumped_intermediate_tensors(dump_path, target_name)
-
-    if invalidate_from_updated_variables:
-      # Invalidate caches at the end.
-      for last_updated_variable in self._last_updated:
-        self._invalidate_transitively_outgoing_cache(last_updated_variable)
-
-    return return_value
-
-  def _prepare_cont_call_dump_path_and_run_options(self):
-    """Prepare the dump path and RunOptions for next cont() call.
-
-    Returns:
-      dump_path: (str) Directory path to which the intermediate tensor will be
-        dumped.
-      run_options: (config_pb2.RunOptions) The RunOptions containing the tensor
-        watch options for this graph.
-    """
-    run_options = config_pb2.RunOptions()
-    dump_path = self._cont_call_dump_path()
-    for element_name in self._closure_elements:
-      if ":" in element_name:
-        debug_utils.add_debug_tensor_watch(
-            run_options,
-            debug_graphs.get_node_name(element_name),
-            output_slot=debug_graphs.get_output_slot(element_name),
-            debug_urls=["file://" + dump_path])
-
-    return dump_path, run_options
-
-  def _cont_call_dump_path(self):
-    return os.path.join(self._dump_session_root,
-                        "cont_%d" % int(time.time() * 1e6))
-
-  def _load_dumped_intermediate_tensors(self, dump_path, target_name):
-    dump_dir = debug_data.DebugDumpDir(dump_path, validate=False)
-    for dump in dump_dir.dumped_tensor_data:
-      if (dump.tensor_name not in self._ref_tensor_names and
-          dump.tensor_name not in self._tensor_handles and
-          dump.tensor_name not in self._override_tensors and
-          dump.tensor_name != target_name):
-        self._dumped_intermediate_tensors[dump.tensor_name] = dump
-
-  def _get_node_name(self, graph_element_name):
-    return graph_element_name.split(":")[0]
-
-  def _invalidate_transitively_outgoing_cache(self, source_element):
-    """Invalidate the cached tensor handles by tracing output.
-
-    This method is used to invalidate caches such as cached TensorHandles
-    and intermediate tensor values when Variable mutation happens or when
-    client overrides tensor values.
-
-    Uses non-recursive implementation to avoid stack overflow on deep networks.
-
-    Args:
-      source_element: The source graph element (e.g., a Variable output slot)
-        to trace the output from.
-    """
-
-    if not self._tensor_handles and not self._dumped_intermediate_tensors:
-      return
-
-    # First, use cached invalidation paths to eliminate some cached tensor
-    # handles and intermediate tensors.
-    to_delete_handles = []
-    for handle_name in self._tensor_handles:
-      if (handle_name in self._cached_invalidation_path and
-          source_element in self._cached_invalidation_path[handle_name]):
-        to_delete_handles.append(handle_name)
-    for handle_name in to_delete_handles:
-      del self._tensor_handles[handle_name]
-
-    to_delete_intermediates = []
-    for intm_tensor_name in self._dumped_intermediate_tensors:
-      if (intm_tensor_name in self._cached_invalidation_path and
-          source_element in self._cached_invalidation_path[intm_tensor_name]):
-        to_delete_intermediates.append(intm_tensor_name)
-    for intermediate in to_delete_intermediates:
-      del self._dumped_intermediate_tensors[intermediate]
-
-    if not self._tensor_handles and not self._dumped_intermediate_tensors:
-      return
-
-    stack = [source_element]
-    done = set()
-
-    while stack:
-      curr_element = stack.pop()
-      done.add(curr_element)
-
-      if (curr_element in self._tensor_handles or
-          curr_element in self._dumped_intermediate_tensors):
-        # Cache the invalidation path for potential future use.
-        if curr_element not in self._cached_invalidation_path:
-          self._cached_invalidation_path[curr_element] = set([source_element])
-        else:
-          self._cached_invalidation_path[curr_element].add(source_element)
-
-        if curr_element in self._tensor_handles:
-          del self._tensor_handles[curr_element]
-        else:
-          del self._dumped_intermediate_tensors[curr_element]
-
-      targets = self._output_targets.get(curr_element, [])
-      for target in targets:
-        if target in done:
-          continue
-        else:
-          stack.append(target)
-
-  def finalize(self):
-    """Run the final fetch(es).
-
-    Restore the dirty variables; ignore the client-supplied overriding tensor
-    values.
-
-    Returns:
-      The same return value as self.cont() as called on the final fetch.
-    """
-
-    self.restore_variable_values()
-    return self._sess.run(self._fetches, feed_dict=self._client_feed_dict)
-
-  def restore_variable_values(self):
-    """Restore variables to the initial values.
-
-    "Initial value" refers to the value when this NodeStepper instance was
-    first constructed.
-    """
-
-    for var_name in self._dirty_variables:
-      self._sess.run(self._variable_initializers[var_name],
-                     feed_dict={
-                         self._variable_initial_values[var_name]:
-                             self._cached_variable_values[var_name]
-                     })
-
-  def handle_names(self):
-    """Return names of the TensorHandles that the debugger is holding.
-
-    Returns:
-      (list of str) Name of the tensors for which TensorHandle is available.
-    """
-
-    return [name for name in self._tensor_handles]
-
-  def handle_node_names(self):
-    """Get list of names of the nodes for which handles are available.
-
-    Returns:
-      (set of str) List of names of the nodes.
-    """
-
-    return set([self._get_node_name(name) for name in self._tensor_handles])
-
-  def intermediate_tensor_names(self):
-    """Get list of the names of the Tensors for which dumps are available.
-
-    Returns:
-      (list of str) List of the names of the Tensors for which intermediate
-        dumps are available.
-    """
-
-    return self._dumped_intermediate_tensors.keys()
-
-  def last_updated(self):
-    """Get the names of the variables updated in the last cont() call.
-
-    Returns:
-      A set of the variable names updated in the previous cont() call.
-      If no cont() call has occurred before, returns None.
-    """
-
-    return self._last_updated
-
-  def dirty_variables(self):
-    """Get the set of variables that are currently "dirty".
-
-    "dirty" means:
-      previous cont() calls have updated the value of the Variable,
-      and the Variable's old value (the value before any cont() calls
-      happened) was not restored.
-
-    Returns:
-      (set) A set of dirty variables.
-    """
-
-    return self._dirty_variables
-
-  def is_placeholder(self, graph_element_name):
-    """Check whether a graph element is a Placeholder, by name.
-
-    Args:
-      graph_element_name: (str) Name of the tensor or op to be tested.
-
-    Returns:
-      (bool) Whether the graph element of the specified name is a Placeholder
-        op or the output Tensor of a Placeholder op.
-
-    Raises:
-      ValueError: If graph_element_name is not in the transitive closure of the
-        stepper instance.
-    """
-
-    node_name = self._get_node_name(graph_element_name)
-    if node_name not in self.sorted_nodes():
-      raise ValueError(
-          "%s is not in the transitive closure of this NodeStepper "
-          "instance" % graph_element_name)
-
-    graph_element = self._sess.graph.as_graph_element(graph_element_name)
-    if not isinstance(graph_element, ops.Operation):
-      graph_element = graph_element.op
-    return graph_element.type == "Placeholder"
-
-  def placeholders(self):
-    """Get the list of Placeholder Tensors in the transitive closure.
-
-    Returns:
-      (list of str) A list of Placeholder Tensors or ops in the transitive
-        closure.
-    """
-
-    placeholders = []
-    for item in self.sorted_nodes():
-      if self.is_placeholder(item):
-        placeholders.append(item)
-
-    return placeholders
-
-  def get_tensor_value(self, tensor_name):
-    """Get the value of a tensor that the stepper has access to.
-
-    Args:
-      tensor_name: (str) Name of the tensor.
-
-    Returns:
-      Value of the tensor, from overriding values or cached tensor handles.
-
-    Raises:
-      ValueError: If the value is not available as an overriding value
-        or through a TensorHandle.
-    """
-
-    if self.is_placeholder(tensor_name):
-      if ":" not in tensor_name:
-        tensor_name += ":0"
-      return self._client_feed_dict[tensor_name]
-    elif tensor_name in self._override_tensors:
-      return self._override_tensors[tensor_name]
-    elif tensor_name in self._tensor_handles:
-      return self._tensor_handles[tensor_name].eval()
-    elif tensor_name in self._dumped_intermediate_tensors:
-      return self._dumped_intermediate_tensors[tensor_name].get_tensor()
-    else:
-      raise ValueError(
-          "This stepper instance does not have access to the value of "
-          "tensor \"%s\"" % tensor_name)
-
-  def override_names(self):
-    """Return names of the TensorHandles that the debugger is holding.
-
-    Returns:
-      (list of str) Name of the tensor for which overriding tensor values are
-        available.
-    """
-    return [name for name in self._override_tensors]
-
-  def _get_node(self, element):
-    """Get the node of a graph element.
-
-    Args:
-      element: A graph element (Op, Tensor or Node)
-
-    Returns:
-      The node associated with element in the graph.
-    """
-
-    node_name, _ = debug_graphs.parse_node_or_tensor_name(element.name)
-    return self._sess.graph.as_graph_element(node_name)
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
deleted file mode 100644
index bec858a..0000000
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ /dev/null
@@ -1,1110 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Unit tests of the tfdbg Stepper."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.debug.lib.stepper import NodeStepper
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import gradient_descent
-
-
-@test_util.run_v1_only("b/120545219")
-class StepperTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.a = variables.VariableV1(2.0, name="a")
-    self.b = variables.VariableV1(3.0, name="b")
-
-    self.c = math_ops.multiply(self.a, self.b, name="c")  # Should be 6.0.
-    self.d = math_ops.multiply(self.a, self.a, name="d")  # Should be 4.0.
-
-    self.e = math_ops.multiply(self.d, self.c, name="e")  # Should be 24.0.
-
-    self.f_y = constant_op.constant(0.30, name="f_y")
-    self.f = math_ops.div(self.b, self.f_y, name="f")  # Should be 10.0.
-
-    # The there nodes x, y and z form a graph with "cross-links" in. I.e., x
-    # and y are both direct inputs to z, but x is also a direct input to y.
-    self.x = variables.VariableV1(2.0, name="x")  # Should be 2.0
-    self.y = math_ops.negative(self.x, name="y")  # Should be -2.0.
-
-    self.z = math_ops.multiply(self.x, self.y, name="z")  # Should be -4.0.
-
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
-    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-    config = config_pb2.ConfigProto(graph_options=graph_options)
-    self.sess = session.Session(config=config)
-    self.sess.run(variables.global_variables_initializer())
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  def testContToFetchNotInTransitiveClosureShouldError(self):
-    with NodeStepper(self.sess, "e:0") as stepper:
-      sorted_nodes = stepper.sorted_nodes()
-      self.assertEqual(7, len(sorted_nodes))
-      self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("a/read"))
-      self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("b/read"))
-      self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("c"))
-      self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("c"))
-      self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("d"))
-      self.assertLess(sorted_nodes.index("d"), sorted_nodes.index("e"))
-      self.assertLess(sorted_nodes.index("c"), sorted_nodes.index("e"))
-
-      self.assertSetEqual(
-          {"e:0", "d:0", "c:0", "a/read:0", "b/read:0", "b:0", "a:0"},
-          set(stepper.closure_elements()))
-
-      with self.assertRaisesRegexp(
-          ValueError,
-          "Target \"f:0\" is not in the transitive closure for the fetch of "
-          "the stepper"):
-        stepper.cont("f:0")
-
-  def testContToNodeNameShouldReturnTensorValue(self):
-    with NodeStepper(self.sess, "e:0") as stepper:
-      self.assertAllClose(6.0, stepper.cont("c"))
-
-  def testUsingNamesNotUsingIntermediateTensors(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(self.sess, "e:0") as stepper:
-      # The first cont() call should have used no feeds.
-      result = stepper.cont("c:0")
-      self.assertAllClose(6.0, result)
-      self.assertItemsEqual(["a/read:0", "b/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(2.0, stepper.get_tensor_value("a/read:0"))
-      self.assertAllClose(3.0, stepper.get_tensor_value("b/read:0"))
-      self.assertEqual({}, stepper.last_feed_types())
-
-      # The second cont() call should have used the tensor handle from the
-      # previous cont() call.
-      result = stepper.cont("e:0")
-      self.assertAllClose(24.0, result)
-      self.assertItemsEqual(["a/read:0", "b/read:0", "d:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(2.0, stepper.get_tensor_value("a/read:0"))
-      self.assertAllClose(3.0, stepper.get_tensor_value("b/read:0"))
-      self.assertAllClose(4.0, stepper.get_tensor_value("d:0"))
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_HANDLE,
-          "a/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-  def testUsingNodesNotUsingIntermediateTensors(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(self.sess, self.e) as stepper:
-      # There should be no handles before any cont() calls.
-      self.assertEqual([], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.handle_node_names())
-
-      # Before the cont() call, the stepper should not have access to the value
-      # of c:0.
-      with self.assertRaisesRegexp(
-          ValueError,
-          "This stepper instance does not have access to the value of tensor "
-          "\"c:0\""):
-        stepper.get_tensor_value("c:0")
-
-      # Using the node/tensor itself, instead of the name str, should work on
-      # cont().
-      result = stepper.cont(self.c)
-      self.assertItemsEqual(["a/read:0", "b/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(6.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-
-      self.assertEqual(["c:0"], stepper.handle_names())
-      self.assertEqual({"c"}, stepper.handle_node_names())
-
-      # After the cont() call, the stepper should have access to the value of
-      # c:0 via a tensor handle.
-      self.assertAllClose(6.0, stepper.get_tensor_value("c:0"))
-
-      result = stepper.cont(self.e)
-      self.assertAllClose(24.0, result)
-      self.assertItemsEqual(["a/read:0", "b/read:0", "d:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_HANDLE,
-          "a/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-  def testContToTensorWithIntermediateDumpShouldUseDump(self):
-    with NodeStepper(self.sess, ["e:0", "f:0"]) as stepper:
-      stepper.cont("c:0")
-      self.assertItemsEqual(["a/read:0", "b/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(2.0, stepper.get_tensor_value("a/read:0"))
-      self.assertAllClose(3.0, stepper.get_tensor_value("b/read:0"))
-
-      self.assertAllClose(2.0, stepper.cont("a/read:0"))
-      self.assertEqual({
-          "a/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE
-      }, stepper.last_feed_types())
-
-      self.assertAllClose(10.0, stepper.cont("f:0"))
-      self.assertEqual({
-          "b/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE
-      }, stepper.last_feed_types())
-
-  def testDisablingUseDumpedIntermediatesWorks(self):
-    with NodeStepper(self.sess, ["e:0", "f:0"]) as stepper:
-      stepper.cont("c:0")
-      self.assertItemsEqual(["a/read:0", "b/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(2.0, stepper.get_tensor_value("a/read:0"))
-      self.assertAllClose(3.0, stepper.get_tensor_value("b/read:0"))
-
-      self.assertAllClose(10.0,
-                          stepper.cont("f:0", use_dumped_intermediates=False))
-      self.assertEqual({}, stepper.last_feed_types())
-
-  def testIsFeedableShouldGiveCorrectAnswers(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      self.assertTrue(stepper.is_feedable("a/read:0"))
-      self.assertTrue(stepper.is_feedable("b/read:0"))
-      self.assertTrue(stepper.is_feedable("c:0"))
-      self.assertTrue(stepper.is_feedable("d:0"))
-
-  def testOverrideValue(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      result = stepper.cont(self.c)
-      self.assertAllClose(6.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-
-      # There should be no overrides before any cont() calls.
-      self.assertEqual([], stepper.override_names())
-
-      # Calling cont() on c again should lead to use of the handle.
-      result = stepper.cont(self.c)
-      self.assertAllClose(6.0, result)
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_HANDLE
-      }, stepper.last_feed_types())
-
-      # Override c:0.
-      stepper.override_tensor("c:0", 7.0)
-
-      # After the overriding, calling get_tensor_value() on c:0 should yield the
-      # overriding value.
-      self.assertEqual(7.0, stepper.get_tensor_value("c:0"))
-
-      # Now c:0 should have only an override value, but no cached handle,
-      # because the handle should have been invalidated.
-      self.assertEqual([], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.handle_node_names())
-      self.assertEqual(["c:0"], stepper.override_names())
-
-      # Run a downstream tensor after the value override.
-      result = stepper.cont(self.e)
-      self.assertAllClose(28.0, result)  # Should reflect the overriding value.
-
-      # Should use override, instead of the handle.
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_OVERRIDE,
-          "a/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-  def testOverrideValueTwice(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      # Override once.
-      stepper.override_tensor("c:0", 7.0)
-      self.assertAllClose(28.0, stepper.cont(self.e))
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_OVERRIDE
-      }, stepper.last_feed_types())
-
-      self.assertEqual(["e:0"], stepper.handle_names())
-      self.assertSetEqual({"e"}, stepper.handle_node_names())
-      self.assertEqual(["c:0"], stepper.override_names())
-
-      # Calling cont(self.e) again. This time the cached tensor handle of e
-      # should be used.
-      self.assertEqual(28.0, stepper.cont(self.e))
-      self.assertEqual({
-          "e:0": NodeStepper.FEED_TYPE_HANDLE
-      }, stepper.last_feed_types())
-
-      # Override c again. This should have invalidated the cache for e.
-      stepper.override_tensor("c:0", 8.0)
-
-      self.assertEqual([], stepper.handle_names())
-      self.assertEqual(set(), stepper.handle_node_names())
-      self.assertEqual(["c:0"], stepper.override_names())
-
-      self.assertAllClose(32.0, stepper.cont(self.e))
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_OVERRIDE,
-          "d:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-  def testRemoveOverrideValue(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      result = stepper.cont(self.c)
-      self.assertAllClose(6.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-
-      # The previous cont() step should have generated a cached tensor handle.
-      self.assertEqual(["c:0"], stepper.handle_names())
-      self.assertSetEqual({"c"}, stepper.handle_node_names())
-
-      # Override c:0.
-      stepper.override_tensor("c:0", 7.0)
-
-      # The overriding should have invalidated the tensor handle.
-      self.assertEqual([], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.handle_node_names())
-      self.assertEqual(["c:0"], stepper.override_names())
-
-      result = stepper.cont(self.e)
-      self.assertAllClose(28.0, result)  # Should reflect the overriding value.
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_OVERRIDE,
-          "a/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-      # The handle to tensor e:0 should have been cached, even though its
-      # transitive closure contains an override.
-      self.assertIn("e:0", stepper.handle_names())
-      self.assertSetEqual({"e"}, stepper.handle_node_names())
-
-      # Remove the override.
-      stepper.remove_override("c:0")
-      # c:0 should not be in the overrides anymore.
-      self.assertEqual([], stepper.override_names())
-
-      # Removing the override should have invalidated the tensor handle for c.
-      self.assertNotIn("e:0", stepper.handle_names())
-      self.assertNotIn("e", stepper.handle_node_names())
-
-      # Should reflect the non-overriding value.
-      self.assertAllClose(24.0, stepper.cont(self.e))
-
-      # This time, the handle to tensor e:0 should have been cached again, even
-      # thought its transitive closure contains an override.
-      self.assertIn("e:0", stepper.handle_names())
-      self.assertIn("e", stepper.handle_node_names())
-
-      # Calling cont(self.e) again should have used the tensor handle to e:0.
-      self.assertAllClose(24.0, stepper.cont(self.e))
-      self.assertEqual({
-          "e:0": NodeStepper.FEED_TYPE_HANDLE,
-      }, stepper.last_feed_types())
-
-  def testOverrideAndContToSameTensor(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      result = stepper.cont(self.c)
-      self.assertAllClose(6.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-      self.assertEqual(["c:0"], stepper.handle_names())
-      self.assertSetEqual({"c"}, stepper.handle_node_names())
-
-      self.assertAllClose(6.0, stepper.cont(self.c))
-
-      # The last cont() call should use the tensor handle directly.
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_HANDLE
-      }, stepper.last_feed_types())
-
-      # Override c:0.
-      stepper.override_tensor("c:0", 7.0)
-
-      # As a result of the override, the tensor handle should have been
-      # invalidated.
-      self.assertEqual([], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.handle_node_names())
-
-      result = stepper.cont(self.c)
-      self.assertAllClose(7.0, result)
-
-      self.assertEqual({
-          "c:0": NodeStepper.FEED_TYPE_OVERRIDE
-      }, stepper.last_feed_types())
-
-  def testFinalizeWithPreviousOverrides(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      stepper.override_tensor("a/read:0", 20.0)
-      self.assertEqual(["a/read:0"], stepper.override_names())
-
-      # Should reflect the overriding value.
-      self.assertAllClose(24000.0, stepper.cont("e:0"))
-      self.assertEqual({
-          "a/read:0": NodeStepper.FEED_TYPE_OVERRIDE
-      }, stepper.last_feed_types())
-
-      # Finalize call should have ignored the overriding value.
-      self.assertAllClose(24.0, stepper.finalize())
-
-  def testRemoveNonexistentOverrideValue(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      self.assertEqual([], stepper.override_names())
-      with self.assertRaisesRegexp(
-          ValueError, "No overriding value exists for tensor \"c:0\""):
-        stepper.remove_override("c:0")
-
-  def testAttemptToOverrideInvalidTensor(self):
-    stepper = NodeStepper(self.sess, self.e)
-
-    with self.assertRaisesRegexp(ValueError, "Cannot override tensor \"f:0\""):
-      stepper.override_tensor("f:0", 42.0)
-
-  def testInvalidOverrideArgumentType(self):
-    with NodeStepper(self.sess, self.e) as stepper:
-      with self.assertRaisesRegexp(TypeError, "Expected type str; got type"):
-        stepper.override_tensor(self.a, 42.0)
-
-  def testTransitiveClosureWithCrossLinksShouldHaveCorrectOrder(self):
-    with NodeStepper(self.sess, "z:0") as stepper:
-      sorted_nodes = stepper.sorted_nodes()
-      self.assertEqual(4, len(sorted_nodes))
-      self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("x/read"))
-      self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("y"))
-      self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("z"))
-      self.assertLess(sorted_nodes.index("y"), sorted_nodes.index("z"))
-
-  def testNodeStepperConstructorShouldAllowListOrTupleOrDictOfFetches(self):
-    for i in range(6):
-      if i == 0:
-        fetches = [self.e, [self.f, self.z]]
-      elif i == 1:
-        fetches = (self.e, (self.f, self.z))
-      elif i == 2:
-        fetches = {"e": self.e, "fz": {"f": self.f, "z": self.z}}
-      elif i == 3:
-        fetches = ["e:0", ["f:0", "z:0"]]
-      elif i == 4:
-        fetches = ("e:0", ("f:0", "z:0"))
-      elif i == 5:
-        fetches = {"e": "e:0", "fz": {"f": "f:0", "z": "z:0"}}
-
-      with NodeStepper(self.sess, fetches) as stepper:
-        sorted_nodes = stepper.sorted_nodes()
-        self.assertEqual(13, len(sorted_nodes))
-
-        # Check the topological order of the sorted nodes.
-        self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("x/read"))
-        self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("y"))
-        self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("z"))
-        self.assertLess(sorted_nodes.index("y"), sorted_nodes.index("z"))
-
-        self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("a/read"))
-        self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("b/read"))
-        self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("c"))
-        self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("c"))
-        self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("d"))
-        self.assertLess(sorted_nodes.index("d"), sorted_nodes.index("e"))
-        self.assertLess(sorted_nodes.index("c"), sorted_nodes.index("e"))
-        self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("f"))
-        self.assertLess(sorted_nodes.index("f_y"), sorted_nodes.index("f"))
-
-        closure_elements = stepper.closure_elements()
-        self.assertIn("x/read:0", closure_elements)
-        self.assertIn("e:0", closure_elements)
-        self.assertIn("f:0", closure_elements)
-
-        self.assertEqual([0], stepper.output_slots_in_closure("x/read"))
-        self.assertEqual([0], stepper.output_slots_in_closure("e"))
-        self.assertEqual([0], stepper.output_slots_in_closure("f"))
-
-        result = stepper.finalize()
-        if i == 0 or i == 1 or i == 3 or i == 4:
-          self.assertAllClose(24.0, result[0])
-          self.assertAllClose(10.0, result[1][0])
-          self.assertAllClose(-4.0, result[1][1])
-        elif i == 2 or i == 5:
-          self.assertAllClose(24.0, result["e"])
-          self.assertAllClose(10.0, result["fz"]["f"])
-          self.assertAllClose(-4.0, result["fz"]["z"])
-
-
-@test_util.run_v1_only("b/120545219")
-class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.ph0 = array_ops.placeholder(dtypes.float32, shape=(2, 2), name="ph0")
-    self.ph1 = array_ops.placeholder(dtypes.float32, shape=(2, 1), name="ph1")
-
-    self.x = math_ops.matmul(self.ph0, self.ph1, name="x")
-    self.y = math_ops.add(self.x, self.ph1, name="y")
-
-    self.sess = session.Session()
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  def testGetTensorValueWorksOnPlaceholder(self):
-    with NodeStepper(
-        self.sess,
-        self.y,
-        feed_dict={
-            self.ph0: [[1.0, 2.0], [-3.0, 5.0]],
-            self.ph1: [[-1.0], [0.5]]
-        }) as stepper:
-      self.assertAllClose([[1.0, 2.0], [-3.0, 5.0]],
-                          stepper.get_tensor_value("ph0"))
-      self.assertAllClose([[1.0, 2.0], [-3.0, 5.0]],
-                          stepper.get_tensor_value("ph0:0"))
-      with self.assertRaisesRegexp(
-          KeyError,
-          r"The name 'ph0:1' refers to a Tensor which does not exist"):
-        stepper.get_tensor_value("ph0:1")
-
-  def testIsPlaceholdersShouldGiveCorrectAnswers(self):
-    with NodeStepper(self.sess, self.y) as stepper:
-      self.assertTrue(stepper.is_placeholder(self.ph0.name))
-      self.assertTrue(stepper.is_placeholder(self.ph1.name))
-
-      self.assertFalse(stepper.is_placeholder(self.x.name))
-      self.assertFalse(stepper.is_placeholder(self.y.name))
-
-      with self.assertRaisesRegexp(ValueError,
-                                   "A is not in the transitive closure"):
-        self.assertFalse(stepper.is_placeholder("A"))
-
-  def testPlaceholdersShouldGiveCorrectAnswers(self):
-    with NodeStepper(self.sess, self.y) as stepper:
-      self.assertSetEqual({"ph0", "ph1"}, set(stepper.placeholders()))
-
-  def testContWithPlaceholders(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(
-        self.sess,
-        self.y,
-        feed_dict={
-            self.ph0: [[1.0, 2.0], [-3.0, 5.0]],
-            self.ph1: [[-1.0], [0.5]]
-        }) as stepper:
-      self.assertEqual(4, len(stepper.sorted_nodes()))
-      self.assertSetEqual({"ph0:0", "ph1:0", "x:0", "y:0"},
-                          set(stepper.closure_elements()))
-
-      result = stepper.cont(self.x)
-      self.assertAllClose([[0.0], [5.5]], result)
-      self.assertEqual({
-          "ph0:0": NodeStepper.FEED_TYPE_CLIENT,
-          "ph1:0": NodeStepper.FEED_TYPE_CLIENT,
-      }, stepper.last_feed_types())
-
-      self.assertEqual(["x:0"], stepper.handle_names())
-      self.assertSetEqual({"x"}, stepper.handle_node_names())
-
-      result = stepper.cont(self.y)
-      self.assertAllClose([[-1.0], [6.0]], result)
-      self.assertEqual({
-          "x:0": NodeStepper.FEED_TYPE_HANDLE,
-          "ph1:0": NodeStepper.FEED_TYPE_CLIENT,
-      }, stepper.last_feed_types())
-
-  def testAttemptToContToPlaceholderWithTensorFeedKeysShouldWork(self):
-    """Continuing to a placeholder should be allowed, using client feed."""
-
-    ph0_feed = [[1.0, 2.0], [-3.0, 5.0]]
-    ph1_feed = [[-1.0], [0.5]]
-    with NodeStepper(
-        self.sess, self.y, feed_dict={
-            self.ph0: ph0_feed,
-            self.ph1: ph1_feed,
-        }) as stepper:
-      self.assertAllClose(ph0_feed, stepper.cont(self.ph0))
-      self.assertEqual({
-          self.ph0.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      self.assertAllClose(ph1_feed, stepper.cont(self.ph1))
-      self.assertEqual({
-          self.ph1.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      ph0_node = self.sess.graph.as_graph_element("ph0")
-      self.assertAllClose(ph0_feed, stepper.cont(ph0_node))
-      self.assertEqual({
-          self.ph0.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      self.assertAllClose([[-1.0], [6.0]], stepper.finalize())
-
-  def testAttemptToContToPlaceholderWithTensorNameFeedKeysShouldWork(self):
-
-    ph0_feed = [[1.0, 2.0], [-3.0, 5.0]]
-    ph1_feed = [[-1.0], [0.5]]
-    with NodeStepper(
-        self.sess,
-        self.y,
-        feed_dict={
-            self.ph0.name: ph0_feed,
-            self.ph1.name: ph1_feed,
-        }) as stepper:
-      self.assertAllClose(ph0_feed, stepper.cont(self.ph0))
-      self.assertEqual({
-          self.ph0.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      self.assertAllClose(ph1_feed, stepper.cont(self.ph1))
-      self.assertEqual({
-          self.ph1.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      ph0_node = self.sess.graph.as_graph_element("ph0")
-      self.assertAllClose(ph0_feed, stepper.cont(ph0_node))
-      self.assertEqual({
-          self.ph0.name: NodeStepper.FEED_TYPE_CLIENT
-      }, stepper.last_feed_types())
-
-      self.assertAllClose([[-1.0], [6.0]], stepper.finalize())
-
-
-@test_util.run_v1_only("b/120545219")
-class StepperAssignAddTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.v = variables.VariableV1(10.0, name="v")
-    self.p = math_ops.add(self.v, self.v, name="p")
-    self.q = math_ops.multiply(self.p, self.p, name="q")
-    self.delta = constant_op.constant(2.0, name="delta")
-    self.v_add = state_ops.assign_add(self.v, self.delta, name="v_add")
-    self.v_add_plus_one = math_ops.add(self.v_add,
-                                       1.0,
-                                       name="v_add_plus_one")
-
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
-    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-    config = config_pb2.ConfigProto(graph_options=graph_options)
-    self.sess = session.Session(config=config)
-    self.sess.run(self.v.initializer)
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  def testLastUpdatedVariablesReturnsNoneBeforeAnyContCalls(self):
-    with NodeStepper(self.sess, [self.q, self.v_add]) as stepper:
-      self.assertIsNone(stepper.last_updated())
-
-  def testContToUpdateInvalidatesDumpedIntermediates(self):
-    with NodeStepper(self.sess, [self.q, self.v_add]) as stepper:
-      self.assertAllClose(400.0, stepper.cont("q:0"))
-      self.assertItemsEqual(["v/read:0", "p:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(10.0, stepper.get_tensor_value("v/read:0"))
-      self.assertAllClose(20.0, stepper.get_tensor_value("p:0"))
-
-      self.assertAllClose(
-          12.0, stepper.cont(
-              self.v_add, invalidate_from_updated_variables=True))
-      self.assertAllClose(12.0, self.sess.run(self.v))
-      self.assertSetEqual({self.v.name}, stepper.last_updated())
-      self.assertItemsEqual(["v:0"], stepper.dirty_variables())
-      # Updating the value of v by calling v_add should have invalidated the
-      # dumped intermediate tensors for v/read:0 and p:0.
-      self.assertItemsEqual(["delta:0"], stepper.intermediate_tensor_names())
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"This stepper instance does not have access to the value of tensor "
-          r"\"p:0\""):
-        stepper.get_tensor_value("p:0")
-
-      # The next cont to q should not have used any dumped intermediate tensors
-      # and its result should reflect the updated value.
-      self.assertAllClose(576.0, stepper.cont("q:0"))
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual({}, stepper.last_feed_types())
-
-  def testOverridingUpstreamTensorInvalidatesDumpedIntermediates(self):
-    with NodeStepper(self.sess, self.q) as stepper:
-      self.assertAllClose(400.0, stepper.cont("q:0"))
-      self.assertItemsEqual(["v/read:0", "p:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertAllClose(10.0, stepper.get_tensor_value("v/read:0"))
-      self.assertAllClose(20.0, stepper.get_tensor_value("p:0"))
-
-      stepper.override_tensor("v/read:0", 11.0)
-      self.assertItemsEqual(["v/read:0"], stepper.override_names())
-      # Overriding the upstream v/read:0 should have invalidated the dumped
-      # intermediate tensor for the downstream p:0.
-      self.assertItemsEqual([], stepper.intermediate_tensor_names())
-
-      # The next cont to q should not have used any dumped intermediate tensors
-      # and its result should reflect the overriding value.
-      self.assertAllClose(484.0, stepper.cont("q:0"))
-      self.assertEqual({
-          "v/read:0": NodeStepper.FEED_TYPE_OVERRIDE
-      }, stepper.last_feed_types())
-
-  def testRemovingOverrideToUpstreamTensorInvalidatesDumpedIntermediates(self):
-    with NodeStepper(self.sess, self.q) as stepper:
-      stepper.override_tensor("v/read:0", 9.0)
-      self.assertItemsEqual(["v/read:0"], stepper.override_names())
-
-      self.assertAllClose(324.0, stepper.cont(self.q))
-      self.assertItemsEqual(["p:0"], stepper.intermediate_tensor_names())
-
-      stepper.remove_override("v/read:0")
-      self.assertItemsEqual([], stepper.override_names())
-      # Removing the pre-existing override to v/read:0 should have invalidated
-      # the dumped intermediate tensor.
-      self.assertItemsEqual([], stepper.intermediate_tensor_names())
-
-  def testRepeatedCallsToAssignAddDoesNotUpdateVariableAgain(self):
-    with NodeStepper(self.sess, self.v_add) as stepper:
-      stepper.cont(self.v_add)
-      self.assertSetEqual({self.v.name}, stepper.last_updated())
-      self.assertAllClose(12.0, stepper.cont(self.v))
-      stepper.cont(self.v_add)
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual({"v_add:0": NodeStepper.FEED_TYPE_HANDLE},
-                       stepper.last_feed_types())
-      self.assertAllClose(12.0, stepper.cont(self.v))
-
-  def testRepeatedCallsToAssignAddDownStreamDoesNotUpdateVariableAgain(self):
-    with NodeStepper(self.sess, self.v_add_plus_one) as stepper:
-      stepper.cont(self.v_add_plus_one)
-      self.assertSetEqual({self.v.name}, stepper.last_updated())
-      self.assertAllClose(12.0, stepper.cont(self.v))
-      stepper.cont(self.v_add_plus_one)
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual({"v_add_plus_one:0": NodeStepper.FEED_TYPE_HANDLE},
-                       stepper.last_feed_types())
-      self.assertAllClose(12.0, stepper.cont(self.v))
-
-
-@test_util.run_v1_only("b/120545219")
-class StepperBackwardRunTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    """Test setup.
-
-    Structure of the forward graph:
-              f
-             | |
-        -----   -----
-        |           |
-        d           e
-       | |         | |
-    ---   ---------  ---
-    |         |        |
-    a         b        c
-
-    Construct a backward graph using the GradientDescentOptimizer.
-    """
-
-    self.a = variables.VariableV1(1.0, name="a")
-    self.b = variables.VariableV1(2.0, name="b")
-    self.c = variables.VariableV1(4.0, name="c")
-    self.d = math_ops.multiply(self.a, self.b, name="d")
-    self.e = math_ops.multiply(self.b, self.c, name="e")
-    self.f = math_ops.multiply(self.d, self.e, name="f")
-
-    # Gradient descent optimizer that minimizes g.
-    gradient_descent.GradientDescentOptimizer(0.01).minimize(
-        self.f, name="optim")
-
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
-    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-    config = config_pb2.ConfigProto(graph_options=graph_options)
-    self.sess = session.Session(config=config)
-    self.sess.run(variables.global_variables_initializer())
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  def testContToUpdateA(self):
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      result = stepper.cont("a:0")
-      self.assertAllClose(1.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-
-      result = stepper.cont("optim/learning_rate:0")
-      self.assertAllClose(0.01, result)
-      self.assertEqual({}, stepper.last_feed_types())
-
-      # Before any cont calls on ApplyGradientDescent, there should be no
-      # "dirty" variables.
-      self.assertEqual(set(), stepper.dirty_variables())
-
-      # First, all the two control inputs to optim.
-      result = stepper.cont("optim/update_a/ApplyGradientDescent",
-                            invalidate_from_updated_variables=True)
-
-      # Now variable a should have been marked as dirty due to the update
-      # by optim/update_a/ApplyGradientDescent.
-      self.assertSetEqual({"a:0"}, stepper.last_updated())
-      self.assertEqual({"a:0"}, stepper.dirty_variables())
-      self.assertIsNone(result)
-      self.assertEqual({
-          "optim/learning_rate:0": NodeStepper.FEED_TYPE_HANDLE
-      }, stepper.last_feed_types())
-
-      # Check that Variable "a" has been updated properly, but "b", "c" and "d"
-      # remain the same.
-      # For backprop on Variable a:
-      #   Because f = a * b * b * c, df / da = b * b * c.
-      #   1.0 - learning_rate * b * b * c
-      #     = 1.0 -  0.01 * 2.0 * 2.0 * 4.0 = 0.84.
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(2.0, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testContToUpdateB(self):
-    with NodeStepper(self.sess, "optim") as stepper:
-      result = stepper.cont("optim/update_b/ApplyGradientDescent",
-                            invalidate_from_updated_variables=True)
-      self.assertIsNone(result)
-      self.assertSetEqual({"b:0"}, stepper.last_updated())
-      self.assertEqual(set(["b:0"]), stepper.dirty_variables())
-
-      # For backprop on Variable b:
-      #   Because f = a * b * b * c, df / da = 2 * a * b * c.
-      #   2.0 - learning_rate * 2 * a * b * c
-      #     = 2.0 - 0.01 * 2 * 1.0 * 2.0 * 4.0 = 1.84
-      self.assertAllClose(1.0, self.sess.run(self.a))
-      self.assertAllClose(1.84, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testContAfterUpdateWithoutRestoringVariableValue(self):
-    with NodeStepper(self.sess, "optim") as stepper:
-      # First, update Variable a from 1.0 to 0.84.
-      result = stepper.cont(
-          "optim/update_a/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertIsNone(result)
-      self.assertSetEqual({"a:0"}, stepper.last_updated())
-      self.assertEqual(set(["a:0"]), stepper.dirty_variables())
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(2.0, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-      # Tracking of the updated variables should have invalidated all
-      # intermediate tensors downstream to a:0.
-      self.assertNotIn("a/read:0", stepper.intermediate_tensor_names())
-      self.assertNotIn("d:0", stepper.intermediate_tensor_names())
-
-      # Second, update Variable b without the default restore_variable_values.
-      result = stepper.cont(
-          "optim/update_b/ApplyGradientDescent", restore_variable_values=False)
-      self.assertIsNone(result)
-      # For the backprop on Variable b under the updated value of a:
-      #   2.0 - learning_rate * 2 * a' * b * c
-      #     = 2.0 - 0.01 * 2 * 0.84 * 2.0 * 4.0 = 1.8656
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(1.8656, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testContNotInvalidatingFromVariableUpdatesWorksForNextUpdate(self):
-    with NodeStepper(self.sess, "optim") as stepper:
-      self.assertIsNone(stepper.cont(
-          "optim/update_a/ApplyGradientDescent",
-          invalidate_from_updated_variables=False))
-      # Even though invalidate_from_updated_variables is set to False, dirty
-      # variables should still have been tracked.
-      self.assertSetEqual({"a:0"}, stepper.last_updated())
-      self.assertEqual({"a:0"}, stepper.dirty_variables())
-      self.assertIn("a/read:0", stepper.intermediate_tensor_names())
-      self.assertIn("b/read:0", stepper.intermediate_tensor_names())
-      self.assertIn("c/read:0", stepper.intermediate_tensor_names())
-      self.assertIn("d:0", stepper.intermediate_tensor_names())
-      self.assertIn("e:0", stepper.intermediate_tensor_names())
-      self.assertIn("optim/learning_rate:0",
-                    stepper.intermediate_tensor_names())
-      self.assertNotIn("a:0", stepper.intermediate_tensor_names())
-      self.assertNotIn("b:0", stepper.intermediate_tensor_names())
-      self.assertNotIn("c:0", stepper.intermediate_tensor_names())
-
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(2.0, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-      # For the backprop on Variable b, the result should reflect the original
-      # value of Variable a, even though Variable a has actually been updated.
-      #   2.0 - learning_rate * 2 * a * b * c
-      #     = 2.0 - 0.01 * 2 * 1.0 * 2.0 * 4.0 = 1.84
-      self.assertIsNone(stepper.cont(
-          "optim/update_b/ApplyGradientDescent",
-          invalidate_from_updated_variables=False,
-          restore_variable_values=False))
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(1.84, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testUpdateTwiceRestoreVariable(self):
-    with NodeStepper(self.sess, "optim") as stepper:
-      result = stepper.cont(
-          "optim/update_a/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertIsNone(result)
-      self.assertSetEqual({"a:0"}, stepper.last_updated())
-      self.assertEqual({"a:0"}, stepper.dirty_variables())
-
-      result = stepper.cont(
-          "optim/update_b/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertIsNone(result)
-      # Variables a and c should have been restored and hence no longer dirty.
-      # Variable b should have been marked as dirty.
-      self.assertSetEqual({"b:0"}, stepper.last_updated())
-      self.assertEqual({"b:0"}, stepper.dirty_variables())
-
-    # The result of the update should be identitcal to as if only update_b is
-    # run.
-    self.assertAllClose(1.0, self.sess.run(self.a))
-    self.assertAllClose(1.84, self.sess.run(self.b))
-    self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testSelectiveHandleUsageDependingOnTransitiveCleanliness(self):
-    """Test tensor handlers are using only during clean transitive closure.
-
-    "clean" means no Variables have been updated by preceding cont() calls.
-    """
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      # First, call cont() on the two tensors on the intermediate level: e and
-      # f.
-      result = stepper.cont("d:0")
-      self.assertAllClose(2.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-      self.assertItemsEqual(["a/read:0", "b/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertItemsEqual(["d:0"], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual(set(), stepper.dirty_variables())
-
-      result = stepper.cont("e:0")
-      self.assertAllClose(8.0, result)
-      self.assertEqual({
-          "b/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE
-      }, stepper.last_feed_types())
-      self.assertItemsEqual(["d:0", "e:0"], stepper.handle_names())
-      self.assertItemsEqual(["a/read:0", "b/read:0", "c/read:0"],
-                            stepper.intermediate_tensor_names())
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual(set(), stepper.dirty_variables())
-
-      # Now run update_a, so as to let Variable a be dirty.
-      result = stepper.cont(
-          "optim/update_a/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertIsNone(result)
-      # Due to the update to the value of a:0, the dumped intermediate a/read:0
-      # should have been invalidated.
-      self.assertNotIn("a/read:0", stepper.intermediate_tensor_names())
-      self.assertSetEqual({"a:0"}, stepper.last_updated())
-      self.assertEqual({"a:0"}, stepper.dirty_variables())
-
-      # Now, run update_b.
-      result = stepper.cont(
-          "optim/update_b/ApplyGradientDescent", restore_variable_values=True)
-      self.assertIsNone(result)
-
-      # The last cont() run should have use the handle of tensor e, but not the
-      # handle of tensor d, because the transitive closure of e is clean,
-      # whereas that of d is dirty due to the update to a in the previous cont()
-      # call.
-      last_feed_types = stepper.last_feed_types()
-      self.assertNotIn("d:0", last_feed_types)
-      self.assertEqual(NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-                       last_feed_types["b/read:0"])
-      self.assertEqual(NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-                       last_feed_types["c/read:0"])
-
-      # The result of the update_b should be identical to as if no other
-      # update_* cont() calls have occurred before.
-      self.assertAllClose(1.0, self.sess.run(self.a))
-      self.assertAllClose(1.84, self.sess.run(self.b))
-      self.assertAllClose(4.0, self.sess.run(self.c))
-
-  def testRestoreVariableValues(self):
-    """Test restore_variable_values() restores the old values of variables."""
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      stepper.cont(
-          "optim/update_b/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertAllClose(1.84, self.sess.run(self.b))
-
-      stepper.restore_variable_values()
-      self.assertAllClose(2.0, self.sess.run(self.b))
-
-  def testFinalize(self):
-    """Test finalize() to restore variables and run the original fetch."""
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      # Invoke update_b before calling finalize.
-      stepper.cont(
-          "optim/update_b/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-
-      result = stepper.finalize()
-      self.assertIsNone(result)
-
-      # The results of the Variable updates should be the same as if no cont()
-      # call has occurred on update_b.
-      self.assertAllClose(0.84, self.sess.run(self.a))
-      self.assertAllClose(1.84, self.sess.run(self.b))
-      self.assertAllClose(3.96, self.sess.run(self.c))
-
-  def testOverrideThenContToUpdateThenRemoveOverrideThenUpdateAgain(self):
-    """Test cont() to update nodes after overriding tensor values."""
-    if test_util.is_gpu_available():
-      self.skipTest("b/123446705 this causes a segfault on GPU")
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      result = stepper.cont("d:0")
-      self.assertAllClose(2.0, result)
-      self.assertEqual({}, stepper.last_feed_types())
-      self.assertSetEqual(set(), stepper.last_updated())
-      self.assertEqual(set(), stepper.dirty_variables())
-      self.assertEqual(["d:0"], stepper.handle_names())
-      self.assertSetEqual({"d"}, stepper.handle_node_names())
-
-      # Override the value from 1.0 to 10.0.
-      stepper.override_tensor("a/read:0", 10.0)
-
-      self.assertEqual(["a/read:0"], stepper.override_names())
-
-      result = stepper.cont(
-          "optim/update_c/ApplyGradientDescent",
-          invalidate_from_updated_variables=True,
-          restore_variable_values=True)
-      self.assertIsNone(result)
-
-      # The last cont() call should have not used the tensor handle to d:0,
-      # because the transitive closure of d:0 contains an override tensor.
-      self.assertEqual({
-          "a/read:0": NodeStepper.FEED_TYPE_OVERRIDE,
-          "b/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-      # The tensor handle to d:0 should have been removed due to the dirty
-      # transitive closure.
-      self.assertEqual([], stepper.handle_names())
-      self.assertSetEqual(set(), stepper.handle_node_names())
-
-      # For this backprop on c, the overriding value of a/read:0 should have
-      # been used:
-      #   4.0 - learning_rate * a * b * b
-      #     = 4.0 - 0.01 * 10.0 * 2.0 * 2.0 = 3.6.
-      self.assertAllClose(3.6, self.sess.run(self.c))
-
-      # Now remove the overriding value of a/read:0.
-      stepper.remove_override("a/read:0")
-      self.assertEqual([], stepper.override_names())
-
-      # Obtain the tensor handle to d:0 again.
-      result = stepper.cont("d:0")
-      self.assertAllClose(2.0, result)
-      self.assertEqual(["d:0"], stepper.handle_names())
-      self.assertSetEqual({"d"}, stepper.handle_node_names())
-      self.assertNotIn("a/read:0", stepper.last_feed_types())
-
-      # Then call update_c again, without restoring c.
-      result = stepper.cont("optim/update_c/ApplyGradientDescent",
-                            restore_variable_values=False)
-      self.assertIsNone(result)
-      self.assertNotIn("a/read:0", stepper.last_feed_types())
-
-      # This time, the d:0 tensor handle should have been used, because its
-      # transitive closure is clean.
-      self.assertEqual({
-          "b/read:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-          "d:0": NodeStepper.FEED_TYPE_HANDLE,
-          "optim/learning_rate:0": NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE,
-      }, stepper.last_feed_types())
-
-      # For this backprop on c, the overriding value of a/read:0 should have
-      # been used:
-      #   3.6 - learning_rate * a * b * b
-      #     = 3.6 - 0.01 * 1.0 * 2.0 * 2.0 = 3.56.
-      self.assertAllClose(3.56, self.sess.run(self.c))
-
-  def testContToNodeWithOutputTensors(self):
-    """cont() to an op should cache its output tensors if appropriate."""
-
-    with NodeStepper(self.sess, "optim") as stepper:
-      # In the transitive closure of the stepper, look for an op of which the
-      # output tensor also is in the transitive closure.
-      # Do not assume a specific op, e.g., ""gradients/e_grad/Reshape_1",
-      # because it may vary between builds.
-      closure_elements = stepper.closure_elements()
-      op_with_output_in_closure = None
-      for element_name in closure_elements:
-        if element_name + ":0" in closure_elements:
-          op_with_output_in_closure = str(element_name)
-          break
-
-      self.assertEqual(
-          [0], stepper.output_slots_in_closure(op_with_output_in_closure))
-
-      self.assertIsNotNone(op_with_output_in_closure)
-      output_tensor = op_with_output_in_closure + ":0"
-
-      # The op "gradients/?_grad/Reshape_1" is in the transitive closure of the
-      # stepper, because it is the control input to another o. However, its
-      # output tensor "gradients/?_grad/Reshape_1:0" is also in the transitive
-      # closure, because it is the (non-control) input of certain ops. Calling
-      # cont() on the op should lead to the caching of the tensor handle for
-      # the output tensor.
-      stepper.cont(op_with_output_in_closure)
-
-      self.assertEqual([output_tensor], stepper.handle_names())
-      self.assertSetEqual({op_with_output_in_closure},
-                          stepper.handle_node_names())
-
-      # Do a cont() call that uses the cached tensor of
-      # "gradients/?_grad/Reshape_1:0".
-      stepper.cont(output_tensor)
-      self.assertEqual({
-          output_tensor: NodeStepper.FEED_TYPE_HANDLE
-      }, stepper.last_feed_types())
-
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 42e3b09..ea8f31a 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -25,7 +25,6 @@
 
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
-from tensorflow.python.debug.lib import stepper
 from tensorflow.python.debug.wrappers import dumping_wrapper
 from tensorflow.python.debug.wrappers import framework
 from tensorflow.python.debug.wrappers import hooks
@@ -380,16 +379,6 @@
     self.assertEqual(1, dump.size)
     self.assertEqual("delta", dump.dumped_tensor_data[0].node_name)
 
-  def testCallingInvokeNodeStepperOnDumpingWrapperRaisesException(self):
-    sess = dumping_wrapper.DumpingDebugWrapperSession(
-        self.sess, session_root=self.session_root, log_usage=False)
-    node_stepper = stepper.NodeStepper(self.sess, self.inc_v)
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r"NonInteractiveDebugWrapperSession does not support node-stepper "
-        r"mode\."):
-      sess.invoke_node_stepper(node_stepper)
-
   def testDumpingWrapperWithEmptyFetchWorks(self):
     sess = dumping_wrapper.DumpingDebugWrapperSession(
         self.sess, session_root=self.session_root, log_usage=False)
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 6cc9c67..986c91d 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -25,10 +25,7 @@
    launching a UI to let users inspect the intermediate tensors and partition
    graphs from the run() call.
 
-c) (To be implemented) Intercept a run() call and give control to DebugStepper
-   to let it perform stepping / continuing-to actions on the graph.
-
-b) (To be implemented in a future CL) Enter an instruction loop to let an
+c) (To be implemented in a future CL) Enter an instruction loop to let an
    external object (e.g., remote client) launch run() and cont() calls
    remotely.
 
@@ -70,14 +67,6 @@
 
     If the action is NON_DEBUG_RUN, a non-debug (normal) run will ensue.
 
-    If the action is INVOKE_STEPPER, no run() call will be issued to the
-    wrapped session. But instead, a DebugStepper (i.e., "continuation
-    debugger") will be used to perform stepping / continue-to actions on
-    the graph.
-
-TODO(cais): The event loop for the DebugStepper will request additional
-   callbacks including on_cont_start() and on_cont_end(). Add those.
-
 A2) Right before the run() returns, the on_run_end() callback is invoked,
     with an OnRunEndRequest object as the argument, which carries information
     including the actual action performed in the warpper run() call and the
@@ -93,9 +82,7 @@
     OnInstrStartResponse object with an action field which can order one of
     the following actions:
         i) a run() call with fetches, feeds and debug_urls specified.
-       ii) a DebugStepper cont() call with target specified.
-      iii) value overrides in the cached tensors from the DebugStepper.
-       iv) exit the instruction loop.
+       ii) exit the instruction loop.
 
 B2) The wrapper session carries out the action specified above.
 
@@ -112,6 +99,7 @@
 from __future__ import print_function
 
 import abc
+import collections
 import re
 import threading
 
@@ -120,7 +108,6 @@
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_utils
-from tensorflow.python.debug.lib import stepper
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
@@ -234,10 +221,6 @@
   # Run without debug tensor-watching.
   NON_DEBUG_RUN = "non_debug_run"
 
-  # Instead of running the fetches as a whole, as would normally happen, invoke
-  # the (to-be-implemented) debug stepper.
-  # TODO(cais): Remove "to-be-implemented".
-  INVOKE_STEPPER = "invoke_stepper"
 
 
 class OnRunStartResponse(object):
@@ -339,9 +322,6 @@
   methods such as on_session_init, on_run_start and on_run_end.
   """
 
-  # TODO(cais): Add on_cont_start and on_cont_end callbacks once the stepper is
-  # is available.
-
   def __init__(self, sess, thread_name_filter=None,
                pass_through_operrors=False):
     """Constructor of `BaseDebugWrapperSession`.
@@ -460,7 +440,19 @@
           "but are used simultaneously.")
 
     self.increment_run_call_count()
-    empty_fetches = not nest.flatten(fetches)
+
+    def is_empty(x):
+      """Tell whether `x` is a nested structure that holds no leaf values.
+
+      A value that is not nested counts as non-empty; a mapping is judged by
+      its values; any other nested structure is empty iff all its items are.
+      """
+      if not nest.is_nested(x):
+        return False
+      children = x.values() if isinstance(x, collections.Mapping) else x
+      return all(is_empty(child) for child in children)
+
+    empty_fetches = is_empty(fetches)
     if empty_fetches:
       tf_logging.info(
           "Due to empty fetches, tfdbg Session wrapper is letting a "
@@ -568,19 +560,7 @@
           run_start_resp.action,
           run_metadata=run_metadata,
           client_graph_def=self._sess.graph.as_graph_def())
-    elif (run_start_resp.action == OnRunStartAction.NON_DEBUG_RUN or
-          run_start_resp.action == OnRunStartAction.INVOKE_STEPPER):
-      if callable_runner:
-        raise NotImplementedError(
-            "Stepper mode is not implemented for callables created by "
-            "Session.make_callable().")
-
-      if run_start_resp.action == OnRunStartAction.INVOKE_STEPPER:
-        with stepper.NodeStepper(
-            self._sess, fetches, feed_dict) as node_stepper:
-          retvals = self.invoke_node_stepper(
-              node_stepper, restore_variable_values_on_exit=True)
-
+    elif run_start_resp.action == OnRunStartAction.NON_DEBUG_RUN:
       # Invoke run() method of the wrapped session.
       retvals = self._sess.run(
           fetches,
@@ -748,9 +728,7 @@
 
     Returns:
       An instance of `OnRunStartResponse`, carrying information to
-        1) direct the wrapper session to perform a specified action (e.g., run
-          with or without debug tensor watching, invoking the stepper.)
-        2) debug URLs used to watch the tensors.
+        choose the run action and the debug URLs used to watch the tensors.
     """
 
   @abc.abstractmethod
@@ -791,26 +769,6 @@
   # TODO(cais): Add _node_name_regex_whitelist and
   #   _node_op_type_regex_whitelist.
 
-  def invoke_node_stepper(self,
-                          node_stepper,
-                          restore_variable_values_on_exit=True):
-    """Callback invoked when the client intends to step through graph nodes.
-
-    Args:
-      node_stepper: (stepper.NodeStepper) An instance of NodeStepper to be used
-        in this stepping session.
-      restore_variable_values_on_exit: (bool) Whether any variables whose values
-        have been altered during this node-stepper invocation should be restored
-        to their old values when this invocation ends.
-
-    Returns:
-      The same return values as the `Session.run()` call on the same fetches as
-        the NodeStepper.
-    """
-    raise NotImplementedError(
-        self.__class__.__name__ + " does not support node-stepper mode.")
-
-
   def should_stop(self):
     if hasattr(self._sess, "should_stop"):
       return self._sess.should_stop()
@@ -974,11 +932,3 @@
     """See doc of BaseDebugWrapperSession.on_run_end."""
 
     return OnRunEndResponse()
-
-  def invoke_node_stepper(self,
-                          node_stepper,
-                          restore_variable_values_on_exit=True):
-    """See doc of BaseDebugWrapperSession.invoke_node_stepper."""
-
-    raise NotImplementedError(
-        "NonInteractiveDebugWrapperSession does not support node-stepper mode.")
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index d948320..ba93e22 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -20,7 +20,6 @@
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.debug.lib import debug_utils
-from tensorflow.python.debug.lib import stepper
 from tensorflow.python.debug.wrappers import dumping_wrapper
 from tensorflow.python.debug.wrappers import framework
 from tensorflow.python.debug.wrappers import grpc_wrapper
@@ -137,12 +136,6 @@
       run_context.session.graph._finalized = False
       # pylint: enable=protected-access
 
-      with stepper.NodeStepper(
-          run_context.session, run_context.original_args.fetches,
-          run_context.original_args.feed_dict) as node_stepper:
-        self._session_wrapper.invoke_node_stepper(
-            node_stepper, restore_variable_values_on_exit=True)
-
     return run_args
 
   def after_run(self, run_context, run_values):
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index a3ce4d3..c9dd47f 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -29,7 +29,6 @@
 from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.cli import profile_analyzer_cli
-from tensorflow.python.debug.cli import stepper_cli
 from tensorflow.python.debug.cli import ui_factory
 from tensorflow.python.debug.lib import common
 from tensorflow.python.debug.lib import debug_data
@@ -193,11 +192,6 @@
     self._argparsers["run"] = ap
 
     ap = argparse.ArgumentParser(
-        description="Invoke stepper (cont, step, breakpoint, etc.)",
-        usage=argparse.SUPPRESS)
-    self._argparsers["invoke_stepper"] = ap
-
-    ap = argparse.ArgumentParser(
         description="Display information about this Session.run() call.",
         usage=argparse.SUPPRESS)
     self._argparsers["run_info"] = ap
@@ -232,9 +226,6 @@
   def on_run_start(self, request):
     """Overrides on-run-start callback.
 
-    Invoke the CLI to let user choose what action to take:
-      `run` / `invoke_stepper`.
-
     Args:
       request: An instance of `OnRunStartRequest`.
 
@@ -582,11 +573,6 @@
         self._argparsers["run"].format_help(),
         prefix_aliases=["r"])
     curses_cli.register_command_handler(
-        "invoke_stepper",
-        self._on_run_start_step_handler,
-        self._argparsers["invoke_stepper"].format_help(),
-        prefix_aliases=["s"])
-    curses_cli.register_command_handler(
         "run_info",
         self._run_info_handler,
         self._argparsers["run_info"].format_help(),
@@ -607,19 +593,6 @@
                    for key in self._feed_dict.keys()]
       curses_cli.register_tab_comp_context(["print_feed", "pf"], feed_keys)
 
-  def _on_run_start_step_handler(self, args, screen_info=None):
-    """Command handler for "invoke_stepper" command during on-run-start."""
-
-    _ = screen_info  # Currently unused.
-
-    # No parsing is currently necessary for invoke_stepper. This may change
-    # in the future when the command has arguments.
-
-    # Raise CommandLineExit exception to cause the CLI to exit.
-    raise debugger_cli_common.CommandLineExit(
-        exit_token=framework.OnRunStartResponse(
-            framework.OnRunStartAction.INVOKE_STEPPER, []))
-
   def _get_run_debug_urls(self):
     """Get the debug_urls value for the current run() call.
 
@@ -663,72 +636,3 @@
         feed_dict,
         self._tensor_filters,
         is_callable_runner=is_callable_runner)
-
-  def invoke_node_stepper(self,
-                          node_stepper,
-                          restore_variable_values_on_exit=True):
-    """Overrides method in base class to implement interactive node stepper.
-
-    Args:
-      node_stepper: (`stepper.NodeStepper`) The underlying NodeStepper API
-        object.
-      restore_variable_values_on_exit: (`bool`) Whether any variables whose
-        values have been altered during this node-stepper invocation should be
-        restored to their old values when this invocation ends.
-
-    Returns:
-      The same return values as the `Session.run()` call on the same fetches as
-        the NodeStepper.
-    """
-
-    stepper = stepper_cli.NodeStepperCLI(node_stepper)
-
-    # On exiting the node-stepper CLI, the finalize method of the node_stepper
-    # object will be called, ensuring that the state of the graph will be the
-    # same as if the stepping did not happen.
-    # TODO(cais): Perhaps some users will want the effect of the interactive
-    # stepping and value injection to persist. When that happens, make the call
-    # to finalize optional.
-    stepper_ui = ui_factory.get_ui(
-        self._ui_type,
-        on_ui_exit=(node_stepper.restore_variable_values if
-                    restore_variable_values_on_exit else None))
-
-    stepper_ui.register_command_handler(
-        "list_sorted_nodes",
-        stepper.list_sorted_nodes,
-        stepper.arg_parsers["list_sorted_nodes"].format_help(),
-        prefix_aliases=["lt", "lsn"])
-    stepper_ui.register_command_handler(
-        "cont",
-        stepper.cont,
-        stepper.arg_parsers["cont"].format_help(),
-        prefix_aliases=["ct", "c"])
-    stepper_ui.register_command_handler(
-        "step",
-        stepper.step,
-        stepper.arg_parsers["step"].format_help(),
-        prefix_aliases=["st", "s"])
-    stepper_ui.register_command_handler(
-        "print_tensor",
-        stepper.print_tensor,
-        stepper.arg_parsers["print_tensor"].format_help(),
-        prefix_aliases=["pt"])
-    stepper_ui.register_command_handler(
-        "inject_value",
-        stepper.inject_value,
-        stepper.arg_parsers["inject_value"].format_help(),
-        prefix_aliases=["inject", "override_value", "override"])
-
-    # Register tab completion candidates.
-    stepper_ui.register_tab_comp_context([
-        "cont", "ct", "c", "pt", "inject_value", "inject", "override_value",
-        "override"
-    ], [str(elem) for elem in node_stepper.sorted_nodes()])
-    # TODO(cais): Tie up register_tab_comp_context to a single alias to shorten
-    # calls like this.
-
-    return stepper_ui.run_ui(
-        init_command="lt",
-        title="Node Stepper: " + self._run_description,
-        title_color="blue_on_white")
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index e38df86..c48a582 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -45,6 +45,7 @@
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
+from tensorflow.python.training import session_run_hook
 
 
 class LocalCLIDebuggerWrapperSessionForTest(
@@ -829,6 +830,39 @@
     run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()})
     self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output)
 
+  def testSessionRunHook(self):
+    """Tests the CLI wrapper inside a MonitoredSession with run hooks."""
+    a = array_ops.placeholder(dtypes.float32, [10])
+    b = a + 1
+    c = b * 2
+
+    class Hook(session_run_hook.SessionRunHook):
+      """Hook whose before_run asks for tensor `c` to be fetched."""
+      def before_run(self, _):
+        return session_run_hook.SessionRunArgs(fetches=c)
+
+    class Hook2(session_run_hook.SessionRunHook):
+      """Hook whose before_run asks for tensor `b` to be fetched."""
+      def before_run(self, _):
+        return session_run_hook.SessionRunArgs(fetches=b)
+
+    sess = session.Session()
+    sess = LocalCLIDebuggerWrapperSessionForTest([["run"], ["run"]], sess)
+
+    class SessionCreator(object):
+      def create_session(self):
+        return sess
+
+    final_sess = monitored_session.MonitoredSession(
+        session_creator=SessionCreator(), hooks=[Hook(), Hook2()])
+    # Expect exactly one debug dump, and that it contains the node of `b`.
+    final_sess.run(b, feed_dict={a: np.arange(10)})
+    debug_dumps = sess.observers["debug_dumps"]
+    self.assertEqual(1, len(debug_dumps))
+    debug_dump = debug_dumps[0]
+    node_names = [datum.node_name for datum in debug_dump.dumped_tensor_data]
+    self.assertIn(b.op.name, node_names)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 22576a3..c710410 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -191,7 +191,7 @@
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:platform",
         "//tensorflow/python:session",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_lib",
     ],
 )
 
@@ -199,6 +199,9 @@
     name = "distribute_coordinator_test",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",  # b/131691139
+    ],
     deps = [
         ":distribute_coordinator",
         "//tensorflow/core:protos_all_py",
@@ -338,7 +341,7 @@
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_lib",
     ],
 )
 
@@ -559,6 +562,16 @@
     ],
 )
 
+py_library(
+    name = "model_combinations",
+    srcs = ["model_combinations.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":combinations",
+        ":simple_models",
+    ],
+)
+
 py_test(
     name = "combinations_test",
     srcs = ["combinations_test.py"],
@@ -690,6 +703,9 @@
         "//tensorflow/python/eager:test",
     ],
     grpc_enabled = True,
+    tags = [
+        "no_pip",  # b/131691139
+    ],
 )
 
 py_library(
@@ -749,19 +765,20 @@
     ],
 )
 
-cuda_py_test(
+distribute_py_test(
     name = "moving_averages_test",
     srcs = ["moving_averages_test.py"],
-    additional_deps = [
-        "//tensorflow/python/distribute:combinations",
-        "//tensorflow/python/distribute:strategy_combinations",
-        "@absl_py//absl/testing:parameterized",
-        "//tensorflow/python/eager:test",
+    main = "moving_averages_test.py",
+    deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:strategy_combinations",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -789,6 +806,7 @@
     main = "minimize_loss_test.py",
     tags = [
         "multi_and_single_gpu",
+        "no_pip",  # b/131691139
     ],
     deps = [
         ":mirrored_strategy",
@@ -841,6 +859,7 @@
     main = "step_fn_test.py",
     tags = [
         "multi_and_single_gpu",
+        "no_pip",  # b/131691139
     ],
     deps = [
         ":single_loss_example",
@@ -877,10 +896,10 @@
     name = "mirrored_strategy_test",
     srcs = ["mirrored_strategy_test.py"],
     additional_deps = [
-        "//tensorflow/python/distribute:combinations",
-        "//tensorflow/python/distribute:strategy_combinations",
+        ":combinations",
+        ":strategy_combinations",
         ":mirrored_strategy",
-        "//tensorflow/python/distribute:multi_worker_test_base",
+        ":multi_worker_test_base",
         ":strategy_test_lib",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -888,6 +907,7 @@
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/distribute:distribute_lib",
@@ -899,6 +919,7 @@
     tags = [
         "guitar",
         "multi_and_single_gpu",
+        "no_pip",  # b/131691139
         "no_windows_gpu",  # TODO(b/130551176)
     ],
 )
@@ -934,3 +955,37 @@
         "//tensorflow/python:layers",
     ],
 )
+
+py_library(
+    name = "model_collection_base",
+    srcs = ["model_collection/model_collection_base.py"],
+)
+
+py_library(
+    name = "simple_models",
+    srcs = ["model_collection/simple_models.py"],
+    deps = [
+        ":model_collection_base",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/keras",
+    ],
+)
+
+distribute_py_test(
+    name = "saved_model_test",
+    size = "medium",
+    srcs = ["saved_model_test.py"],
+    main = "saved_model_test.py",
+    tags = [
+        "no_pip",  # b/131691139
+    ],
+    deps = [
+        ":combinations",
+        ":model_combinations",
+        ":strategy_combinations",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/saved_model",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index 6d0e3b8..c6c9a19 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -108,13 +108,11 @@
 
     if self._discovery_url:
       return discovery.build(
-          'tpu', 'v1alpha1',
-          credentials=credentials,
-          discoveryServiceUrl=self._discovery_url)
+          'tpu', 'v1alpha1', credentials=credentials,
+          discoveryServiceUrl=self._discovery_url, cache_discovery=False)
     else:
       return discovery.build(
-          'tpu', 'v1alpha1',
-          credentials=credentials)
+          'tpu', 'v1alpha1', credentials=credentials, cache_discovery=False)
 
   def _request_compute_metadata(self, path):
     req = Request('%s/computeMetadata/v1/%s' % (_GCE_METADATA_ENDPOINT, path),
diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py
index 4b5eae0..04cbb58 100644
--- a/tensorflow/python/distribute/collective_all_reduce_strategy.py
+++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py
@@ -380,7 +380,7 @@
     """Configures the object.
 
     Args:
-      session_config: a `tf.ConfigProto`
+      session_config: a `tf.compat.v1.ConfigProto`
       cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
         cluster configurations.
       task_type: the current task type, such as "worker".
diff --git a/tensorflow/python/distribute/combinations.py b/tensorflow/python/distribute/combinations.py
index 6ef51b0..6e264de 100644
--- a/tensorflow/python/distribute/combinations.py
+++ b/tensorflow/python/distribute/combinations.py
@@ -150,20 +150,7 @@
       required_gpus = distribution.required_gpus
       required_tpu = distribution.required_tpu
 
-    if required_tpu and not TPU_TEST:
-      self.skipTest("Test requires a TPU, but it's not available.")
-    if not required_tpu and TPU_TEST:
-      self.skipTest("Test that doesn't require a TPU.")
-
-    if not required_gpus:
-      if GPU_TEST:
-        self.skipTest("Test that doesn't require GPUs.")
-    elif context.num_gpus() < required_gpus:
-      # TODO(priyag): Consider allowing tests in graph mode using soft
-      # placement.
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(required_gpus, context.num_gpus()))
+    maybe_skip_test(self, required_tpu, required_gpus)
 
     # At this point, `kwargs` doesn't have `required_gpus` or `required_tpu`
     # that the user might have specified.  `kwargs` still has `mode`, which
@@ -199,6 +186,23 @@
   return decorated
 
 
+def maybe_skip_test(test_case, is_tpu_required, num_gpus_required):
+  """Skips `test_case` when its TPU/GPU requirements don't fit this run."""
+  if is_tpu_required:
+    if not TPU_TEST:
+      test_case.skipTest("Test requires a TPU, but it's not available.")
+  elif TPU_TEST:
+    test_case.skipTest("Test that doesn't require a TPU.")
+  if num_gpus_required:
+    if context.num_gpus() < num_gpus_required:
+      # TODO(priyag): Consider allowing tests in graph mode using soft
+      # placement.
+      msg = "{} GPUs are not available for this test. {} GPUs are available"
+      test_case.skipTest(msg.format(num_gpus_required, context.num_gpus()))
+  elif GPU_TEST:
+    test_case.skipTest("Test that doesn't require GPUs.")
+
+
 def combine(**kwargs):
   """Generate combinations based on its keyword arguments.
 
@@ -310,3 +314,43 @@
   @property
   def required_tpu(self):
     return self._required_tpu
+
+
+class NamedDistributionPair(object):
+  """Pairs two `NamedDistribution`s and merges their device requirements."""
+
+  def __init__(self, named_distribution_1, named_distribution_2):
+    self._named_distribution_1 = named_distribution_1
+    self._named_distribution_2 = named_distribution_2
+    self._name = "_".join(
+        [str(named_distribution_1), str(named_distribution_2)])
+
+    # If only one side states a GPU count use it; if both do, use the larger.
+    gpus_1 = named_distribution_1.required_gpus
+    gpus_2 = named_distribution_2.required_gpus
+    if gpus_1 and gpus_2:
+      self._required_gpus = max(gpus_1, gpus_2)
+    else:
+      self._required_gpus = gpus_1 or gpus_2
+
+    self._required_tpu = (
+        named_distribution_1.required_tpu or named_distribution_2.required_tpu)
+
+  def __repr__(self):
+    return self._name
+
+  @property
+  def strategy_1(self):
+    return self._named_distribution_1._distribution_fn()
+
+  @property
+  def strategy_2(self):
+    return self._named_distribution_2._distribution_fn()
+
+  @property
+  def num_gpus_required(self):
+    return self._required_gpus
+
+  @property
+  def is_tpu_required(self):
+    return self._required_tpu
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 1a4c7c5..fd23058 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -1154,8 +1154,8 @@
 
   Args:
     devices: a list of devices passed to `tf.distribute.Strategy`.
-    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make
-      decision based on all local devices.
+    session_config: a `tf.compat.v1.ConfigProto` or `None`. If `None`, it will
+      make decision based on all local devices.
 
   Returns:
     A subclass of `CrossDeviceOps`.
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index 9ba1b95..8f73fdb 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -128,7 +128,7 @@
         replicated training.
       task_id: an integer indicating id of the corresponding task. It can be
         None if it is local training or in-graph replicated training.
-      session_config: an optional `tf.ConfigProto` object.
+      session_config: an optional `tf.compat.v1.ConfigProto` object.
       rpc_layer: optional string specifying the RPC protocol for communication
         with worker masters. If None or empty, hosts in the `cluster_spec` will
         be used directly.
@@ -578,11 +578,11 @@
   "grpc".
 
   Args:
-    session_config: an optional `tf.ConfigProto` object. Users can pass in
-      the session config object to configure server-local devices.
+    session_config: an optional `tf.compat.v1.ConfigProto` object. Users can
+      pass in the session config object to configure server-local devices.
 
   Returns:
-    a `tf.train.Server` object which has already been started.
+    a `tf.distribute.Server` object which has already been started.
 
   Raises:
     ValueError: if the "TF_CONFIG" environment is not complete.
@@ -736,8 +736,8 @@
       in a cluster. If not set or empty, fall back to local training.
     task_type: the current task type, optional if this is a client.
     task_id: the current task id, optional if this is a client.
-    session_config: an optional `tf.ConfigProto` object which will be passed
-      to `strategy`'s `configure` method and used to create a session.
+    session_config: an optional `tf.compat.v1.ConfigProto` object which will be
+      passed to `strategy`'s `configure` method and used to create a session.
     rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
 
   Raises:
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index 2299716..eb8daa7 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -39,6 +39,7 @@
 from tensorflow.python.client import session
 from tensorflow.python.distribute import distribute_coordinator
 from tensorflow.python.distribute import distribute_coordinator_context
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
@@ -46,6 +47,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_manager
 
@@ -181,6 +183,7 @@
     self._strategy_property = {}
     self._std_servers = {}
     self._barrier = distribute_coordinator._Barrier(NUM_WORKERS)
+    self._coord = coordinator.Coordinator()
 
   @contextlib.contextmanager
   def _test_session(self, target):
@@ -236,10 +239,16 @@
     if result_value == expected:
       self._result_correct += 1
 
+  def _wrapped_worker_fn(self, worker_fn):
+    def wrapped(*args, **kwargs):
+      with self._coord.stop_on_exception():
+        return worker_fn(*args, **kwargs)
+    return wrapped
+
   def _run_coordinator_in_thread(self, worker_fn, strategy, **kwargs):
     t = threading.Thread(
         target=distribute_coordinator.run_distribute_coordinator,
-        args=(worker_fn, strategy),
+        args=(self._wrapped_worker_fn(worker_fn), strategy),
         kwargs=kwargs)
     t.start()
     return t
@@ -260,6 +269,15 @@
         threads[task_type].append(t)
     return threads
 
+  def _join_threads(self, threads):
+    try:
+      self._coord.join(threads)
+    except errors.UnknownError as e:
+      if "Could not start gRPC server" in e.message:
+        self.skipTest("Cannot start std servers.")
+      else:
+        raise
+
   def _between_graph_worker_fn(self, strategy):
     context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
@@ -584,7 +602,7 @@
         MockStrategy(between_graph=False),
         cluster_spec,
         mode=INDEPENDENT_WORKER)
-    threads[WORKER][0].join()
+    self._join_threads([threads[WORKER][0]])
     self.assertEqual(self._result_correct, 1)
 
   def testBetweenGraph(self):
@@ -595,8 +613,7 @@
         MockStrategy(between_graph=True),
         cluster_spec,
         mode=INDEPENDENT_WORKER)
-    for task_id in range(NUM_WORKERS):
-      threads[WORKER][task_id].join()
+    self._join_threads(threads[WORKER])
 
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
@@ -610,8 +627,7 @@
         MockStrategy(between_graph=True),
         cluster_spec,
         mode=INDEPENDENT_WORKER)
-    for task_id in range(NUM_WORKERS):
-      threads[WORKER][task_id].join()
+    self._join_threads(threads[WORKER])
 
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
@@ -627,8 +643,7 @@
           cluster_spec,
           mode=INDEPENDENT_WORKER,
           rpc_layer=None)
-      for task_id in range(NUM_WORKERS):
-        threads[WORKER][task_id].join()
+      self._join_threads(threads[WORKER])
 
     # There is only one type of task and three such tasks.
     self.assertEqual(len(self._worker_context), 1)
@@ -666,8 +681,7 @@
           cluster_spec,
           mode=INDEPENDENT_WORKER,
           rpc_layer=None)
-      for task_id in range(NUM_WORKERS):
-        threads[WORKER][task_id].join()
+      self._join_threads(threads[WORKER])
 
     # There is only one type of task and there three such tasks.
     self.assertEqual(len(self._strategy_property), 1)
@@ -691,8 +705,7 @@
           cluster_spec,
           mode=INDEPENDENT_WORKER,
           rpc_layer=None)
-      for task_id in range(NUM_WORKERS):
-        threads[WORKER][task_id].join()
+      self._join_threads(threads[WORKER])
 
     # There is only a "None" task in the dumped task context.
     self.assertEqual(len(self._worker_context), 1)
@@ -727,9 +740,8 @@
           cluster_spec,
           mode=INDEPENDENT_WORKER,
           rpc_layer=None)
-      for task_id in range(NUM_WORKERS):
-        threads[WORKER][task_id].join()
-      threads[EVALUATOR][0].join()
+      self._join_threads(threads[WORKER])
+      self._join_threads([threads[EVALUATOR][0]])
 
     # There are one "None" task and one EVALUATOR task.
     self.assertEqual(len(self._worker_context), 2)
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 844b068..fce8aca 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -33,6 +33,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
@@ -432,9 +433,27 @@
   def experimental_distribute_dataset(self, dataset):
     """Distributes a tf.data.Dataset instance provided via `dataset`.
 
-    Data from the given dataset will be distributed evenly across all the
-    compute replicas. This function assumes that the input dataset is batched
-    by the global batch size.
+    In a multi-worker setting, we will first attempt to distribute the dataset
+    by attempting to detect whether the dataset is being created out of
+    ReaderDatasets (e.g. TFRecordDataset, TextLineDataset, etc.) and if so,
+    attempting to shard the input files. Note that there has to be at least one
+    input file per worker. If you have less than one input file per worker, we
+    suggest that you should disable distributing your dataset using the method
+    below.
+
+    If that attempt is unsuccessful (e.g. the dataset is created from a
+    Dataset.range), we will shard the dataset evenly at the end by appending a
+    `.shard` operation to the end of the processing pipeline. This will cause
+    the entire preprocessing pipeline for all the data to be run on every
+    worker, and each worker will do redundant work. We will print a warning
+    if this method of sharding is selected.
+
+    You can disable dataset distribution using the `auto_shard` option in
+    `tf.data.experimental.DistributeOptions`.
+
+    Within each host, we will also split the data among all the worker devices
+    (if more than one is present), and this will happen even if multi-worker
+    sharding is disabled using the method above.
 
     The following is an example:
 
@@ -442,7 +461,8 @@
     strategy = tf.distribute.MirroredStrategy()
 
     # Create a dataset
-    dataset = dataset_ops.Dataset.range(10).batch(2)
+    dataset = tf.data.TFRecordDataset([
+      "/a/1.tfr", "/a/2.tfr", "/a/3.tfr", "/a/4.tfr"])
 
     # Distribute that dataset
     dist_dataset = strategy.experimental_distribute_dataset(dataset)
@@ -453,8 +473,8 @@
     ```
 
     Args:
-      dataset: `tf.data.Dataset` that will be distributed evenly across all
-        replicas.
+      dataset: `tf.data.Dataset` that will be sharded across all replicas using
+        the rules stated above.
 
     Returns:
       A `DistributedDataset` which returns inputs for each step of the
@@ -561,14 +581,15 @@
           raise ValueError(
               "`axis` = %r out of range for `value` with rank %d" %
               (axis, v.shape.rank))
-        # TODO(anjalisridhar): Added a second condition to handle the case of
-        # dynamic shapes when using tf.functions. We might want to remove this
-        # static shape case and always calculate the shape of v.
-        if (v.shape[axis] is not None and
-            [x for x in v.get_shape().as_list() if x]):
+        # TF v2 returns `None` for unknown dimensions and an integer for
+        # known dimension, whereas TF v1 returns tensor_shape.Dimension(None)
+        # or tensor_shape.Dimension(integer). `dimension_value` hides this
+        # difference, always returning `None` or an integer.
+        dim = tensor_shape.dimension_value(v.shape[axis])
+        if dim is not None:
           # By returning a python value in the static shape case, we can
           # maybe get a fast path for reducing the denominator.
-          return numer, v.shape[axis]
+          return numer, dim
       elif axis < 0:
         axis = axis + array_ops.rank(v)
       denom = array_ops.shape_v2(v, out_type=dtypes.int64)[axis]
@@ -984,13 +1005,13 @@
     for `d`
   * `with d.extended.colocate_vars_with(v)`: in replica/cross-replica context,
     variables will be created with locality V(`v`). That is, if we write
-    `with d.extended.colocate_vars_with(v1): v2 = tf.get_variable(...)`,
-    then `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
-    V(`v1`).
+    `with d.extended.colocate_vars_with(v1):
+    v2 = tf.Variable(...)`, then `v2` will have locality V(`v1`),
+    i.e. locality V(`v2`) will equal V(`v1`).
   * `with d.extended.colocate_vars_with(d.extended.non_slot_devices(...))`: in
     replica/cross-replica context, variables will be created with locality N
-  * `v = tf.get_variable(...)`: in replica/cross-replica context, creates
-    a variable (which by definition will have locality V(`v`), though
+  * `v = tf.Variable(...)`: in replica/cross-replica context,
+    creates a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
   * `d.make_dataset_iterator(dataset)`: in cross-replica
@@ -1128,7 +1149,7 @@
     No operations should be added to the graph inside this scope, it
     should only be used when creating variables (some implementations
     work by changing variable creation, others work by using a
-    tf.colocate_with() scope).
+    tf.compat.v1.colocate_with() scope).
 
     This may only be used inside `self.scope()`.
 
@@ -1136,11 +1157,11 @@
 
     ```
     with strategy.scope():
-      var1 = tf.get_variable(...)
+      var1 = tf.Variable(...)
       with strategy.extended.colocate_vars_with(var1):
         # var2 and var3 will be created on the same device(s) as var1
-        var2 = tf.get_variable(...)
-        var3 = tf.get_variable(...)
+        var2 = tf.Variable(...)
+        var3 = tf.Variable(...)
 
       def fn(v1, v2, v3):
         # operates on v1 from var1, v2 from var2, and v3 from var3
@@ -1839,7 +1860,7 @@
   def _update_non_slot(self, colocate_with, fn, args, kwargs, should_group):
     # TODO(josh11b): Figure out what we should be passing to UpdateContext()
     # once that value is used for something.
-    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
+    with UpdateContext(colocate_with):
       result = fn(*args, **kwargs)
       if should_group:
         return result
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 7d98f95..65447dd 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -159,7 +159,7 @@
 
   # Don't use distribute coordinator if it is local training or cluster has a
   # MASTER job or `train_distribute` is not specifed.
-  if (not tf_config or 'master' in cluster_spec.jobs or
+  if (not cluster_spec or 'master' in cluster_spec.jobs or
       not config._train_distribute):
     config._distribute_coordinator_mode = None
     config._init_distributed_setting_from_environment_var(tf_config)
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
index ffd033b..d1da5ba 100644
--- a/tensorflow/python/distribute/input_lib.py
+++ b/tensorflow/python/distribute/input_lib.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import distribute
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
 from tensorflow.python.data.util import structure
@@ -327,7 +328,7 @@
     # pipeline and only receive its own shard of the dataset.
     assert isinstance(input_workers, InputWorkers)
     if split_batch_by:
-      dataset = batching._RebatchDataset(dataset, split_batch_by)  # pylint: disable=protected-access
+      dataset = distribute._RebatchDataset(dataset, split_batch_by)  # pylint: disable=protected-access
 
     self._cloned_datasets = []
     if input_context:
diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py
index fb92045..3d61cd5 100644
--- a/tensorflow/python/distribute/input_lib_test.py
+++ b/tensorflow/python/distribute/input_lib_test.py
@@ -298,6 +298,34 @@
 
   @combinations.generate(combinations.combine(
       mode=["graph"],
+      input_type=["dataset"],
+      api_type=["wrap_into_iterator", "wrap_into_dataset"],
+      iteration_type=["get_next", "for_loop"],
+      autoshard=[True, False]))
+  def testAutoshardingOption(self, input_type, api_type, iteration_type,
+                             autoshard):
+    ds_option = dataset_ops.Options()
+    ds_option.experimental_distribute.auto_shard = autoshard
+
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      if tf2.enabled():
+        dataset_fn = (
+            lambda _: dataset_ops.DatasetV2.range(4).with_options(ds_option))
+      else:
+        dataset_fn = (
+            lambda _: dataset_ops.Dataset.range(4).with_options(ds_option))
+
+      if autoshard:
+        expected_values = [[0, 1], [2, 3]]
+      else:
+        expected_values = [[0, 0], [1, 1], [2, 2], [3, 3]]
+      self._test_input_iteration(input_type, api_type, iteration_type,
+                                 dataset_fn, worker_devices,
+                                 expected_values, sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
       input_type=["input_fn", "dataset"],
       api_type=["wrap_into_iterator", "wrap_into_dataset"],
       iteration_type=["get_next", "for_loop"]))
diff --git a/tensorflow/python/distribute/input_ops.py b/tensorflow/python/distribute/input_ops.py
index 8764b1f..836bfae 100644
--- a/tensorflow/python/distribute/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -42,10 +42,13 @@
     files. The input dataset will be returned if we cannot automatically
     determine a good way to shard the input dataset.
   """
-  if isinstance(dataset, dataset_ops.DatasetV1):
-    return distribute._AutoShardDatasetV1(dataset, num_shards, index)
+  if dataset.options().experimental_distribute.auto_shard:
+    if isinstance(dataset, dataset_ops.DatasetV1):
+      return distribute._AutoShardDatasetV1(dataset, num_shards, index)
+    else:
+      return distribute._AutoShardDataset(dataset, num_shards, index)
   else:
-    return distribute._AutoShardDataset(dataset, num_shards, index)
+    return dataset
 
 
 def _clone_dataset(dataset):
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index eac56e4..a7f811e 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -429,9 +429,10 @@
   The multi-worker version will be added in the future.
 
   Args:
-    devices: a list of device strings.
+    devices: a list of device strings. If `None`, all available GPUs are used.
+      If no GPUs are found, CPU is used.
     cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
-      set, nccl will be use by default.
+      set, nccl will be used by default.
   """
 
   def __init__(self, devices=None, cross_device_ops=None):
@@ -956,20 +957,20 @@
     # `tf.function` and there is a merge_call in `fn`. This breaks because each
     # thread tries to create a distinct tf.function. Each tf.function creation
     # takes a lock, and so if there is a merge call in the middle, the lock is
-    # never releases and subsequent replica threads cannot proceed to define
+    # never released and subsequent replica threads cannot proceed to define
     # their own functions. Checking for the graph being the same is one way for
     # us to check this didn't happen.
     if ops.get_default_graph() != t.graph:
       raise RuntimeError(
-          "`merge_call` called while defining a new graph. "
-          "This can happen if the function `fn` passed to "
-          "`strategy.experimental_run()` or "
-          "`strategy.extended.call_for_each_replica()` is decorated with "
-          "`@tf.function`. In this case, wrap the call to "
-          "`strategy.experimental_run()` or "
-          "`strategy.extended.call_for_each_replica()` with `@tf.function` "
-          "instead of `fn`. This will avoid mismatching graphs and also "
-          "improve performance.")
+          "`merge_call` called while defining a new graph or a tf.function. "
+          "This can often happen if the function `fn` passed to "
+          "`strategy.experimental_run()` is decorated with "
+          "`@tf.function` (or contains a nested `@tf.function`), and `fn` "
+          "contains a synchronization point, such as aggregating gradients. "
+          "This behavior is not yet supported. Instead, please wrap the entire "
+          "call `strategy.experimental_run(fn)` in a `@tf.function`, and avoid "
+          "nested `tf.function`s that may potentially cross a synchronization "
+          "boundary.")
 
     t.has_paused.set()
     t.should_run.wait()
diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py
index 308ef22..7308354 100644
--- a/tensorflow/python/distribute/mirrored_strategy_test.py
+++ b/tensorflow/python/distribute/mirrored_strategy_test.py
@@ -44,6 +44,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.keras.layers import core as keras_core
@@ -107,6 +108,20 @@
       expected = sum(range(distribution.num_replicas_in_sync))
       self.assertEqual(expected, self.evaluate(reduced))
 
+  def reduce_axis_helper(self, distribution, replica_squared_fn):
+    with distribution.scope():
+      num_replicas = distribution.num_replicas_in_sync
+      result = distribution.extended.call_for_each_replica(replica_squared_fn)
+      # sum
+      reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result, axis=0)
+      expected = sum(x * (x + 1) for x in range(num_replicas))
+      self.assertNear(expected, self.evaluate(reduced), 0.00001)
+
+      # mean
+      reduced = distribution.reduce(reduce_util.ReduceOp.MEAN, result, axis=0)
+      expected /= sum(x + 1 for x in range(num_replicas))
+      self.assertNear(expected, self.evaluate(reduced), 0.00001)
+
   def testReduceAxisToCpu(self, distribution):
     for dtype in (dtypes.float32, dtypes.int32):
       def replica_squared_fn(dtype=dtype):
@@ -114,18 +129,31 @@
         replica_id = _replica_id_as_int()
         return math_ops.cast([replica_id] * (replica_id + 1), dtype)
 
-      with distribution.scope():
-        num_replicas = distribution.num_replicas_in_sync
-        result = distribution.extended.call_for_each_replica(replica_squared_fn)
-        # sum
-        reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result, axis=0)
-        expected = sum(x * (x + 1) for x in range(num_replicas))
-        self.assertNear(expected, self.evaluate(reduced), 0.00001)
+      self.reduce_axis_helper(distribution, replica_squared_fn)
 
-        # mean
-        reduced = distribution.reduce(reduce_util.ReduceOp.MEAN, result, axis=0)
-        expected /= sum(x + 1 for x in range(num_replicas))
-        self.assertNear(expected, self.evaluate(reduced), 0.00001)
+  def set_v2_tensorshape(self, v2):
+    if v2:
+      tensor_shape.enable_v2_tensorshape()
+    else:
+      tensor_shape.disable_v2_tensorshape()
+
+  def testReduceAxisToCpuUnknownShape(self, distribution):
+    original_v2 = tensor_shape._TENSORSHAPE_V2_OVERRIDE  # pylint: disable=protected-access
+    try:
+      for v2 in (False, True):
+        self.set_v2_tensorshape(v2)
+        for dtype in (dtypes.float32, dtypes.int32):
+          for shape in ((None,), None):  # Test both unknown size and rank.
+            def replica_squared_fn(dtype=dtype, shape=shape):
+              # Lists with different lengths on different replicas.
+              replica_id = _replica_id_as_int()
+              tensor = math_ops.cast([replica_id] * (replica_id + 1), dtype)
+              # Erase shape information
+              return array_ops.placeholder_with_default(tensor, shape=shape)
+            self.reduce_axis_helper(distribution, replica_squared_fn)
+    finally:
+      # Restore verbatim; set_v2_tensorshape() would coerce None to disabled.
+      tensor_shape._TENSORSHAPE_V2_OVERRIDE = original_v2  # pylint: disable=protected-access
 
   def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
@@ -140,8 +168,7 @@
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
-  # TODO(b/124344198): Re-enable after fixing this flaky test.
-  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+  def testMakeInputFnIteratorWithCallable(self, distribution):
     def fn():
       dataset = dataset_ops.Dataset.range(2).interleave(
           (lambda _: dataset_ops.Dataset.range(10)), cycle_length=2)
@@ -156,7 +183,8 @@
         expected_input_pipeline_id=0)
     iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
-                                 expected_values, test_reinitialize=False)
+                                 expected_values, test_reinitialize=False,
+                                 ignore_order=True)
 
   def testNumpyDataset(self, distribution):
     self._test_numpy_dataset(distribution)
@@ -1544,7 +1572,7 @@
       self._test_input_fn_iterator(
           iterator, distribution.extended.worker_devices, expected_values, sess)
 
-  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+  def testMakeInputFnIteratorWithCallable(self, distribution):
     self._configure_distribution_strategy(distribution)
     def fn():
       dataset = dataset_ops.Dataset.range(100)
@@ -1568,7 +1596,7 @@
       iterator = distribution.make_input_fn_iterator(input_fn)
       self._test_input_fn_iterator(
           iterator, distribution.extended.worker_devices, expected_values, sess,
-          test_reinitialize=False)
+          test_reinitialize=False, ignore_order=True)
 
   def testUpdateConfigProto(self, distribution):
     distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]})
diff --git a/tensorflow/python/distribute/model_collection/model_collection_base.py b/tensorflow/python/distribute/model_collection/model_collection_base.py
new file mode 100644
index 0000000..bbfae29
--- /dev/null
+++ b/tensorflow/python/distribute/model_collection/model_collection_base.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A base class to provide a model and corresponding input data for testing."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class ModelAndInput(object):
+  """Base class to provide model and its corresponding inputs."""
+
+  def get_model(self):
+    """Returns a compiled keras model object, together with output name.
+
+    Returns:
+      model: a keras model object
+      output_name: a string for the name of the output layer
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def get_data(self):
+    """Returns data for training and predicting.
+
+    Returns:
+      x_train: data used for training
+      y_train: label used for training
+      x_predict: data used for predicting
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def get_batch_size(self):
+    """Returns the batch_size used by the model."""
+    raise NotImplementedError("must be implemented in descendants")
diff --git a/tensorflow/python/distribute/model_collection/simple_models.py b/tensorflow/python/distribute/model_collection/simple_models.py
new file mode 100644
index 0000000..1248ba6
--- /dev/null
+++ b/tensorflow/python/distribute/model_collection/simple_models.py
@@ -0,0 +1,111 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple functional keras model with one layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.distribute.model_collection import model_collection_base
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+
+_BATCH_SIZE = 10
+
+
+def _get_data_for_simple_models():
+  x_train = constant_op.constant(np.random.rand(1, 3), dtype=dtypes.float32)
+  y_train = constant_op.constant(np.random.rand(1, 5), dtype=dtypes.float32)
+  x_predict = constant_op.constant(np.random.rand(1, 3), dtype=dtypes.float32)
+
+  return x_train, y_train, x_predict
+
+
+class SimpleFunctionalModel(model_collection_base.ModelAndInput):
+  """A simple functional model and its inputs."""
+
+  def get_model(self, **kwargs):
+    output_name = 'output_layer'
+
+    x = keras.layers.Input(shape=(3,), dtype=dtypes.float32)
+    y = keras.layers.Dense(5, dtype=dtypes.float32, name=output_name)(x)
+
+    model = keras.Model(inputs=x, outputs=y)
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    model.compile(loss='mse', metrics=['mae'], optimizer=optimizer)
+
+    return model, output_name
+
+  def get_data(self):
+    return _get_data_for_simple_models()
+
+  def get_batch_size(self):
+    return _BATCH_SIZE
+
+
+class SimpleSequentialModel(model_collection_base.ModelAndInput):
+  """A simple sequential model and its inputs."""
+
+  def get_model(self, **kwargs):
+    output_name = 'output_layer'
+
+    model = keras.Sequential()
+    y = keras.layers.Dense(
+        5, dtype=dtypes.float32, name=output_name, input_dim=3)
+    model.add(y)
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    model.compile(loss='mse', metrics=['mae'], optimizer=optimizer)
+
+    return model, output_name
+
+  def get_data(self):
+    return _get_data_for_simple_models()
+
+  def get_batch_size(self):
+    return _BATCH_SIZE
+
+
+class _SimpleModel(keras.Model):
+  output_name = 'output_layer'
+
+  def __init__(self):
+    super(_SimpleModel, self).__init__()  # Must precede any layer assignment.
+    self._dense_layer = keras.layers.Dense(
+        5, dtype=dtypes.float32, name=self.output_name)
+
+  def call(self, inputs):
+    return self._dense_layer(inputs)
+
+
+class SimpleSubclassModel(model_collection_base.ModelAndInput):
+  """A simple subclass model and its data."""
+
+  def get_model(self, **kwargs):
+    model = _SimpleModel()
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    model.compile(
+        loss='mse', metrics=['mae'], cloning=False, optimizer=optimizer)
+
+    return model, model.output_name
+
+  def get_data(self):
+    return _get_data_for_simple_models()
+
+  def get_batch_size(self):
+    return _BATCH_SIZE
diff --git a/tensorflow/python/distribute/model_combinations.py b/tensorflow/python/distribute/model_combinations.py
new file mode 100644
index 0000000..798bf11
--- /dev/null
+++ b/tensorflow/python/distribute/model_combinations.py
@@ -0,0 +1,31 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model combinations for combinations.combine()."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute.model_collection import simple_models
+
+simple_functional_model = combinations.NamedObject(
+    "SimpleFunctionalModel", simple_models.SimpleFunctionalModel())
+
+simple_sequential_model = combinations.NamedObject(
+    "SimpleSequentialModel", simple_models.SimpleSequentialModel())
+
+simple_subclass_model = combinations.NamedObject(
+    "SimpleSubclassModel", simple_models.SimpleSubclassModel())
diff --git a/tensorflow/python/distribute/moving_averages_test.py b/tensorflow/python/distribute/moving_averages_test.py
index db9c492..97626ed 100644
--- a/tensorflow/python/distribute/moving_averages_test.py
+++ b/tensorflow/python/distribute/moving_averages_test.py
@@ -35,6 +35,7 @@
         strategy_combinations.default_strategy,
         strategy_combinations.one_device_strategy,
         strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+        strategy_combinations.tpu_strategy,
     ],
     mode=["graph"])
 
diff --git a/tensorflow/python/distribute/multi_worker_test_base.py b/tensorflow/python/distribute/multi_worker_test_base.py
index 21b7db3..caedfbb 100644
--- a/tensorflow/python/distribute/multi_worker_test_base.py
+++ b/tensorflow/python/distribute/multi_worker_test_base.py
@@ -23,6 +23,8 @@
 import copy
 import json
 import os
+import subprocess
+import sys
 import threading
 import numpy as np
 
@@ -40,11 +42,13 @@
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import server_lib
+from tensorflow.python.util import nest
 
 
 original_run_std_server = dc._run_std_server  # pylint: disable=protected-access
@@ -286,9 +290,14 @@
 
     return config
 
-  def _run_client(self, client_fn, task_type, task_id, num_gpus, *args,
-                  **kwargs):
-    result = client_fn(task_type, task_id, num_gpus, *args, **kwargs)
+  def _run_client(self, client_fn, task_type, task_id, num_gpus, eager_mode,
+                  *args, **kwargs):
+    if eager_mode:
+      with context.eager_mode():
+        result = client_fn(task_type, task_id, num_gpus, *args, **kwargs)
+    else:
+      with context.graph_mode():
+        result = client_fn(task_type, task_id, num_gpus, *args, **kwargs)
     if np.all(result):
       with self._lock:
         self._result += 1
@@ -310,7 +319,8 @@
       for task_id in range(len(cluster_spec.get(task_type, []))):
         t = threading.Thread(
             target=self._run_client,
-            args=(client_fn, task_type, task_id, num_gpus) + args,
+            args=(client_fn, task_type, task_id, num_gpus,
+                  context.executing_eagerly()) + args,
             kwargs=kwargs)
         t.start()
         threads.append(t)
@@ -369,7 +379,6 @@
   """Testing infra for independent workers."""
 
   def _make_mock_run_std_server(self):
-    thread_local = threading.local()
 
     def _mock_run_std_server(*args, **kwargs):
       ret = original_run_std_server(*args, **kwargs)
@@ -377,9 +386,9 @@
       # of remote sessions taking local ports that have been assigned to std
       # servers. Only call this barrier the first time this function is run for
       # each thread.
-      if not getattr(thread_local, 'server_started', False):
+      if not getattr(self._thread_local, 'server_started', False):
         self._barrier.wait()
-      thread_local.server_started = True
+      self._thread_local.server_started = True
       return ret
 
     return _mock_run_std_server
@@ -391,6 +400,8 @@
     self._coord = coordinator.Coordinator()
     super(IndependentWorkerTestBase, self).setUp()
     self._mock_context.__enter__()
+    # threading local object to be shared by all threads
+    self._thread_local = threading.local()
 
   def tearDown(self):
     self._mock_context.__exit__(None, None, None)
@@ -411,18 +422,39 @@
 
   def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id,
                           *args, **kwargs):
-    if task_type:
-      tf_config = {
-          'cluster': cluster_spec,
-          'task': {
-              'type': task_type,
-              'index': task_id
-          }
-      }
-    else:
-      tf_config = {
-          'cluster': cluster_spec,
-      }
+    """Run tasks in a thread.
+
+    If `tf_config` is provided, use it for the new thread; if not, construct one
+    from `cluster_spec`, `task_type`, and `task_id`, and provide it to the new
+    thread to be set as `TF_CONFIG` environment.
+
+    Arguments:
+      task_fn: The function to run in the new thread.
+      cluster_spec: The cluster spec.
+      task_type: The task type.
+      task_id: The task id.
+      *args: Additional positional arguments to provide to the thread's task_fn.
+      **kwargs: Additional keyword arguments to provide to the thread's task_fn.
+        If `tf_config` is provided, that dict will be used for the TF_CONFIG for
+        the new thread.
+
+    Returns:
+      The thread that has started.
+    """
+    tf_config = kwargs.pop('tf_config', None)
+    if tf_config is None:
+      if task_type:
+        tf_config = {
+            'cluster': cluster_spec,
+            'task': {
+                'type': task_type,
+                'index': task_id
+            }
+        }
+      else:
+        tf_config = {
+            'cluster': cluster_spec,
+        }
     t = threading.Thread(
         target=self._task_thread,
         args=(task_fn, tf_config, context.executing_eagerly()) + args,
@@ -443,7 +475,69 @@
     return threads
 
   def join_independent_workers(self, worker_threads):
-    self._coord.join(worker_threads)
+    try:
+      self._coord.join(worker_threads)
+    except errors.UnknownError as e:
+      if 'Could not start gRPC server' in e.message:
+        self.skipTest('Cannot start std servers.')
+      else:
+        raise
+
+
+class MultiWorkerMultiProcessTest(test.TestCase):
+  """Testing infra for independent workers using multiple processes."""
+
+  def _run_task_in_process(self, cmd_args, cluster_spec, task_type, task_id):
+    env = os.environ.copy()
+    env['TF_CONFIG'] = json.dumps({
+        'cluster': cluster_spec,
+        'task': {
+            'type': task_type,
+            'index': task_id
+        }
+    })
+    return subprocess.Popen(
+        cmd_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
+
+  def run_multiple_tasks_in_processes(self, cmd_args, cluster_spec):
+    """Run `cmd_args` in a process for each task in `cluster_spec`."""
+    processes = {}
+    for task_type in cluster_spec.keys():
+      processes[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        p = self._run_task_in_process(cmd_args, cluster_spec, task_type,
+                                      task_id)
+        processes[task_type].append(p)
+    return processes
+
+  def join_independent_workers(self, worker_processes):
+    return_codes = []
+    for p in nest.flatten(worker_processes):
+      try:
+        # Calling p.wait() will hang if we don't consume its output.
+        p.communicate()
+      except ValueError:
+        # The output of the process may have been consumed, in which case
+        # calling `p.communicate()` will raise a ValueError.
+        pass
+      finally:
+        return_codes.append(p.returncode)
+    for return_code in return_codes:
+      self.assertEqual(return_code, 0)
+
+  def stream_stderr(self, process):
+    # TODO(yuefengz): calling stream_stderr on a single process will probably
+    # make all processes hang if they have too much output e.g. adding
+    # --vmodule=execute=2 to cmd_args. But this method is useful for debugging
+    # purposes. We should figure out the hanging problem, probably by consuming
+    # outputs of all processes at the same time.
+    while True:
+      output = process.stderr.readline()
+      if not output and process.poll() is not None:
+        break
+      if output:
+        print(output.strip())
+        sys.stdout.flush()
 
 
 def get_tf_config_task():
diff --git a/tensorflow/python/distribute/one_device_strategy_test.py b/tensorflow/python/distribute/one_device_strategy_test.py
index 377f37c..023b2ba 100644
--- a/tensorflow/python/distribute/one_device_strategy_test.py
+++ b/tensorflow/python/distribute/one_device_strategy_test.py
@@ -74,7 +74,7 @@
     iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(
         iterator, distribution.extended.worker_devices, expected_values,
-        test_reinitialize=False)
+        test_reinitialize=False, ignore_order=True)
 
   def testNumpyDataset(self, distribution):
     self._test_numpy_dataset(distribution)
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
index 89812a1..26eafa3 100644
--- a/tensorflow/python/distribute/parameter_server_strategy.py
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -81,9 +81,9 @@
   variables.
 
   2) It is also not recommended to open a colocation scope (i.e. calling
-  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
-  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
-  possibly create conflicts of device assignment.
+  `tf.compat.v1.colocate_with`) under the strategy's scope. For colocating
+  variables, use `strategy.extended.colocate_vars_with` instead. Colocation of
+  ops will possibly create conflicts of device assignment.
   """
 
   def __init__(self, cluster_resolver=None):
diff --git a/tensorflow/python/distribute/saved_model_test.py b/tensorflow/python/distribute/saved_model_test.py
new file mode 100644
index 0000000..eb04451
--- /dev/null
+++ b/tensorflow/python/distribute/saved_model_test.py
@@ -0,0 +1,200 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test saved_model with distribution strategies."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import model_combinations
+from tensorflow.python.distribute import strategy_combinations
+from tensorflow.python.eager import test
+from tensorflow.python.framework import random_seed
+from tensorflow.python.saved_model import saved_model
+
+_RANDOM_SEED = 1337
+_DEFAULT_FUNCTION_KEY = 'serving_default'
+_IN_SCOPE_SAVE_DIR = 'in_scope/'
+_OUT_OF_SCOPE_SAVE_DIR = 'out_of_scope/'
+
+simple_models = [
+    model_combinations.simple_functional_model,
+    model_combinations.simple_sequential_model,
+
+    # TODO(b/131715604): figure out why subclass model does not work
+    # model_combinations.simple_subclass_model,
+]
+
+
+def get_strategy_cross_product():
+  result = []
+  for strategy_1 in strategy_combinations.strategies_minus_tpu:
+    for strategy_2 in strategy_combinations.strategies_minus_tpu:
+      result.append(combinations.NamedDistributionPair(strategy_1, strategy_2))
+
+  return result
+
+
+def simple_models_with_strategies():
+  return combinations.combine(
+      model_and_input=simple_models,
+      distribution=strategy_combinations.strategies_minus_tpu,
+      mode=['eager'])
+
+
+class TestSavedModel(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    np.random.seed(_RANDOM_SEED)
+    random_seed.set_random_seed(_RANDOM_SEED)
+    super(TestSavedModel, self).setUp()
+
+  def _train_model(self, model, x_train, y_train, batch_size):
+    training_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (x_train, y_train))
+    training_dataset = training_dataset.repeat()
+    training_dataset = training_dataset.batch(batch_size)
+
+    # Train the model for 1 step
+    model.fit(x=training_dataset, epochs=1, steps_per_epoch=1)
+
+  def _load_and_run_model(self, saved_dir, x_predict):
+    func = saved_model.load(saved_dir)
+    return func.signatures[_DEFAULT_FUNCTION_KEY](x_predict)
+
+  def _get_predict_dataset(self, x_predict, batch_size):
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = predict_dataset.batch(batch_size)
+    return predict_dataset
+
+  @combinations.generate(simple_models_with_strategies())
+  def test_save_no_dist_restore_dist(self, model_and_input, distribution):
+    """Save a model without DS, and restore it with DS."""
+
+    self.skipTest('Loading model with DS is not supported yet')
+
+    saved_dir = os.path.join(self.get_temp_dir(),
+                             'test_save_no_dist_restore_dist')
+
+    model, output_name = model_and_input.get_model()
+    x_train, y_train, x_predict = model_and_input.get_data()
+    batch_size = model_and_input.get_batch_size()
+
+    self._train_model(model, x_train, y_train, batch_size)
+    predict_dataset = self._get_predict_dataset(x_predict, batch_size)
+    result_before_save = model.predict(predict_dataset)
+
+    saved_model.save(model, saved_dir)
+
+    with distribution.scope():
+      predict_dataset = distribution.experimental_distribute_dataset(
+          predict_dataset)
+      actual_data = next(iter(predict_dataset))
+      result_after_save = self._load_and_run_model(saved_dir, actual_data)
+
+    self.assertAllEqual(result_before_save, result_after_save[output_name])
+
+  @combinations.generate(simple_models_with_strategies())
+  def test_save_dist_restore_no_dist(self, model_and_input, distribution):
+    """Save a model with DS, and restore it without DS."""
+
+    self.skipTest('Saving model with DS is not supported yet')
+
+    saved_dir = os.path.join(self.get_temp_dir(),
+                             'test_save_no_dist_restore_dist')
+    saved_dir_in_scope = os.path.join(saved_dir, _IN_SCOPE_SAVE_DIR)
+    saved_dir_out_of_scope = os.path.join(saved_dir, _OUT_OF_SCOPE_SAVE_DIR)
+
+    with distribution.scope():
+      model, output_name = model_and_input.get_model()
+      x_train, y_train, x_predict = model_and_input.get_data()
+      batch_size = model_and_input.get_batch_size()
+
+      self._train_model(model, x_train, y_train, batch_size)
+      predict_dataset = self._get_predict_dataset(x_predict, batch_size)
+      result_before_save = model.predict(predict_dataset)
+
+      # save the model both in and out of the DS scope
+      saved_model.save(model, saved_dir_in_scope)
+    saved_model.save(model, saved_dir_out_of_scope)
+
+    actual_data = next(iter(predict_dataset))
+    result_load_from_save_in_scope = self._load_and_run_model(
+        saved_dir_in_scope, actual_data)
+    result_load_from_save_out_of_scope = self._load_and_run_model(
+        saved_dir_out_of_scope, actual_data)
+
+    self.assertAllEqual(result_before_save,
+                        result_load_from_save_in_scope[output_name])
+    self.assertAllEqual(result_before_save,
+                        result_load_from_save_out_of_scope[output_name])
+
+  @combinations.generate(
+      combinations.combine(
+          model_and_input=simple_models,
+          distribution_pair=get_strategy_cross_product(),
+          mode=['eager']))
+  def test_save_dist_restore_dist(self, model_and_input, distribution_pair):
+    """Save a model with DS, and restore it with potentially different DS."""
+
+    self.skipTest('Saving model with DS is not supported yet')
+
+    combinations.maybe_skip_test(self, distribution_pair.is_tpu_required,
+                                 distribution_pair.num_gpus_required)
+
+    saved_dir = os.path.join(self.get_temp_dir(), 'test_save_dist_restore_dist')
+    saved_dir_in_scope = os.path.join(saved_dir, _IN_SCOPE_SAVE_DIR)
+    saved_dir_out_of_scope = os.path.join(saved_dir, _OUT_OF_SCOPE_SAVE_DIR)
+
+    dist_for_save = distribution_pair.strategy_1
+    dist_for_restore = distribution_pair.strategy_2
+
+    with dist_for_save.scope():
+      model, output_name = model_and_input.get_model()
+      x_train, y_train, x_predict = model_and_input.get_data()
+      batch_size = model_and_input.get_batch_size()
+
+      self._train_model(model, x_train, y_train, batch_size)
+      predict_dataset = self._get_predict_dataset(x_predict, batch_size)
+      result_before_save = model.predict(predict_dataset)
+
+      # save the model both in and out of the DS scope
+      saved_model.save(model, saved_dir_in_scope)
+    saved_model.save(model, saved_dir_out_of_scope)
+
+    with dist_for_restore.scope():
+      predict_dataset = dist_for_restore.experimental_distribute_dataset(
+          predict_dataset)
+      actual_data = next(iter(predict_dataset))
+
+      result_load_from_save_in_scope = self._load_and_run_model(
+          saved_dir_in_scope, actual_data)
+      result_load_from_save_out_of_scope = self._load_and_run_model(
+          saved_dir_out_of_scope, actual_data)
+
+    self.assertAllEqual(result_before_save,
+                        result_load_from_save_in_scope[output_name])
+    self.assertAllEqual(result_before_save,
+                        result_load_from_save_out_of_scope[output_name])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py
index 2a336c2..8c54047 100644
--- a/tensorflow/python/distribute/strategy_combinations.py
+++ b/tensorflow/python/distribute/strategy_combinations.py
@@ -137,9 +137,6 @@
 graph_and_eager_modes = ["graph", "eager"]
 
 
-optimizers_v1_and_v2 = optimizers_v1 + optimizers_v2
-
-
 def distributions_and_v1_optimizers():
   """A common set of combination with DistributionStrategies and Optimizers."""
   return combinations.combine(
diff --git a/tensorflow/python/distribute/strategy_test_lib.py b/tensorflow/python/distribute/strategy_test_lib.py
index ab3e9ab..b47dbf3 100644
--- a/tensorflow/python/distribute/strategy_test_lib.py
+++ b/tensorflow/python/distribute/strategy_test_lib.py
@@ -311,7 +311,8 @@
                               devices,
                               expected_values,
                               sess=None,
-                              test_reinitialize=True):
+                              test_reinitialize=True,
+                              ignore_order=False):
     evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
     evaluate(iterator.initialize())
 
@@ -319,7 +320,10 @@
       next_element = iterator.get_next()
       computed_value = evaluate(
           [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertEqual(expected_value, computed_value)
+      if ignore_order:
+        self.assertCountEqual(expected_value, computed_value)
+      else:
+        self.assertEqual(expected_value, computed_value)
 
     with self.assertRaises(errors.OutOfRangeError):
       next_element = iterator.get_next()
@@ -335,7 +339,10 @@
         computed_value = evaluate([
             values.select_replica(r, next_element) for r in range(len(devices))
         ])
-        self.assertEqual(expected_value, computed_value)
+        if ignore_order:
+          self.assertCountEqual(expected_value, computed_value)
+        else:
+          self.assertEqual(expected_value, computed_value)
 
   def _test_global_step_update(self, strategy):
     with strategy.scope():
diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py
index fdfde16..e9544c0 100644
--- a/tensorflow/python/distribute/tpu_strategy.py
+++ b/tensorflow/python/distribute/tpu_strategy.py
@@ -179,8 +179,21 @@
          values.select_replica(i, args),
          values.select_replica(i, kwargs)])
 
+  # Construct and pass `maximum_shapes` so that dynamic shapes can be
+  # supported via the dynamic padder.
+  if replicate_inputs:
+    maximum_shapes = []
+    flattened_list = nest.flatten(replicate_inputs[0])
+    for input_tensor in flattened_list:
+      maximum_shapes.append(input_tensor.get_shape())
+    maximum_shapes = nest.pack_sequence_as(replicate_inputs[0],
+                                           maximum_shapes)
+  else:
+    maximum_shapes = None
+
   with strategy.scope():
-    replicate_outputs = tpu.replicate(replicated_fn, replicate_inputs)
+    replicate_outputs = tpu.replicate(replicated_fn, replicate_inputs,
+                                      maximum_shapes=maximum_shapes)
 
   # Remove all no ops that may have been added during 'tpu.replicate()'
   if isinstance(result[0], list):
@@ -304,7 +317,8 @@
   def _make_dataset_iterator(self, dataset):
     """Make iterators for each of the TPU hosts."""
     return input_lib.DatasetIterator(dataset, self._input_workers,
-                                     self._num_replicas_in_sync)
+                                     self._num_replicas_in_sync,
+                                     _enable_get_next_as_optional=True)
 
   def _make_input_fn_iterator(
       self,
@@ -318,7 +332,8 @@
           input_pipeline_id=i,
           num_replicas_in_sync=self._num_replicas_in_sync))
     return input_lib.InputFunctionIterator(
-        input_fn, self._input_workers, input_contexts)
+        input_fn, self._input_workers, input_contexts,
+        _enable_get_next_as_optional=True)
 
   def _experimental_make_numpy_dataset(self, numpy_input, session):
     return numpy_dataset.one_host_numpy_dataset(
@@ -334,14 +349,6 @@
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
   def _experimental_run_steps_on_iterator(
       self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
-    output_shapes = multi_worker_iterator.output_shapes
-    shapes = nest.flatten(output_shapes)
-    if any(not s.is_fully_defined() for s in shapes):
-      raise ValueError(
-          "TPU currently requires fully defined shapes. Either use "
-          "set_shape() on the input tensors or use "
-          "dataset.batch(..., drop_remainder=True).")
-
     # Wrap `fn` for repeat.
     if initial_loop_values is None:
       initial_loop_values = {}
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index ecd2919..7cf1a00 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -29,14 +29,15 @@
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import saver
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
@@ -232,6 +233,69 @@
     "LogicalDeviceSpec", ("device_map", "logical_device"))
 
 
+class WorkerDeviceMap(DeviceMap):
+  """A device map for one value per worker."""
+
+  def __init__(self, devices, num_replicas_per_worker):
+    """Initialize a `WorkerDeviceMap`.
+
+    Args:
+      devices: `devices[i]` is the string device for worker `i` in the in-graph
+        replication case; `devices` is a single-element list for its
+        corresponding worker in the between-graph case.
+      num_replicas_per_worker: number of replicas per worker, useful in in-graph
+        replication case.
+    """
+    self._devices = tuple(device_util.canonicalize(d) for d in devices)
+    if len(set(self._devices)) != len(self._devices):
+      raise ValueError("Duplicate devices in %s, after canonicalization: %s" %
+                       (devices, self._devices))
+    self._num_replicas_per_worker = num_replicas_per_worker
+
+  @property
+  def all_devices(self):
+    return self._devices
+
+  @property
+  def devices_by_replica(self):
+    raise ValueError("`WorkerDeviceMap` is not indexed by replicas")
+
+  @property
+  def num_logical_devices(self):
+    return 1
+
+  @property
+  def num_replicas_in_graph(self):
+    return len(self._devices)
+
+  def logical_device_from_values(self, values):
+    del values
+    return 0
+
+  def logical_to_actual_devices(self, logical_device_id):
+    assert logical_device_id == 0
+    return self._devices
+
+  def select_for_current_replica(self, values, replica_context):
+    return values[replica_context.replica_id_in_sync_group //
+                  self._num_replicas_per_worker]
+
+  def replica_for_device(self, device):
+    raise ValueError("`WorkerDeviceMap` not indexed by replicas")
+
+  def select_for_device(self, values, device):
+    # TODO(yuefengz): this should map from any device to the value on its
+    # corresponding worker.
+    return values[self._devices.index(device_util.canonicalize(device))]
+
+  def is_device_in_replica(self, device, replica_id):
+    raise ValueError("WorkerDeviceMap not indexed by replicas")
+
+  def __repr__(self):
+    return "%s(%r, num_replicas_per_worker=%d)" % (
+        self.__class__.__name__, self._devices, self._num_replicas_per_worker)
+
+
 class DistributedValues(object):
   """Holds a map from device to values. Either PerReplica or Mirrored."""
 
@@ -380,13 +444,38 @@
   # TODO(josh11b): Even more operator overloads.
 
 
-class PerReplica(DistributedValues):
+class PerReplica(DistributedValues, composite_tensor.CompositeTensor):
   """Holds a map from device to unsynchronized values."""
-  pass
+
+  def _to_components(self):
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context is not None and replica_context.num_replicas_in_sync > 1:
+      raise ValueError(
+          "Flattening a PerReplica to components is not supported in replica "
+          "context.")
+    return self._values
+
+  def _component_metadata(self):
+    return self._device_map, self._logical_device
+
+  @classmethod
+  def _from_components(cls, components, metadata):
+    device_map, logical_device = metadata
+    return PerReplica(device_map, components, logical_device=logical_device)
+
+  def _is_graph_tensor(self):
+    return any(hasattr(t, "graph") for t in self._values)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      return tuple(v.shape for v in self._values)
+    else:
+      return tuple(shape for _ in self._values)
 
 
 # Note that unlike PerReplica, Mirrored values inherit from
 # DistributedDelegate and so can be used directly in cross-replica mode.
+# TODO(tomhennigan) Should this extend CompositeTensor?
 class Mirrored(DistributedDelegate):
   """Holds a map from device to values which are kept in sync."""
 
@@ -436,7 +525,7 @@
     "DistributedVarOp", ["name", "graph", "type"])
 
 
-class DistributedVariable(DistributedDelegate, variables_lib.Variable):
+class DistributedVariable(DistributedDelegate, variables_lib.AbstractVariable):
   """Holds a map from device to variables."""
   # TODO(josh11b): Support changing the set of variables if e.g. if new
   # devices are joining or a device is to leave.
@@ -851,6 +940,14 @@
         "numpy() is only available when eager execution is enabled.")
 
   @property
+  def initialized_value(self):
+    return self.primary.initialized_value()
+
+  @property
+  def initial_value(self):
+    return self.primary.initial_value
+
+  @property
   def primary(self):
     """Returns a representative component."""
     return self._values[0]
@@ -1091,7 +1188,7 @@
 
   @property
   def constraint(self):
-    return None
+    return self.primary.constraint
 
   @property
   def initializer(self):
@@ -1565,6 +1662,14 @@
     return self._v.initializer
 
   @property
+  def initialized_value(self):
+    return self._v.initialized_value()
+
+  @property
+  def initial_value(self):
+    return self._v.initial_value
+
+  @property
   def op(self):
     return self._v.op
 
diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py
index f1a51b5..2312973 100644
--- a/tensorflow/python/distribute/values_test.py
+++ b/tensorflow/python/distribute/values_test.py
@@ -26,6 +26,7 @@
 from tensorflow.python.distribute import strategy_combinations
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
@@ -262,6 +263,14 @@
     self.assertEqual(_nested_value("2"),
                      values.select_device_mirrored(_device_str(1), result))
 
+  def testWrapAListOfTwoTuples(self):
+    device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
+    result = values.regroup(device_map, [("1", "2"), ("3", "4")])
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(2, len(result))
+    self._is_per_replica(result[0], ("1", "3"), values.PerReplica)
+    self._is_per_replica(result[1], ("2", "4"), values.PerReplica)
+
   def testMirroredContainer(self):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
@@ -854,5 +863,146 @@
     self._restore_replica_local_sum(save_path, distribution)
 
 
+class PerReplicaTest(test.TestCase):
+
+  def testToComponents(self):
+    device_map = values.SingleDeviceMap("CPU")
+    vals = (constant_op.constant(1.),)
+    per_replica = values.PerReplica(device_map, vals)
+    logical_device = 0
+    self.assertEqual(per_replica._to_components(), vals)
+    self.assertEqual(per_replica._component_metadata(), (device_map,
+                                                         logical_device))
+
+  def testFromComponents(self):
+    device_map = values.SingleDeviceMap("CPU")
+    vals = (constant_op.constant(1.),)
+    logical_device = 0
+    metadata = device_map, logical_device
+    per_replica = values.PerReplica._from_components(vals, metadata)
+    self.assertEqual(per_replica._device_map, device_map)
+    self.assertEqual(per_replica._values, vals)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testIsGraphTensor(self):
+    per_replica = values.PerReplica(values.SingleDeviceMap("CPU"),
+                                    (constant_op.constant(1.),))
+    self.assertEqual(per_replica._is_graph_tensor(),
+                     not context.executing_eagerly())
+
+  def testShapeInvariantToComponents(self):
+    v1 = constant_op.constant(1.)
+    v2 = constant_op.constant(2.)
+    per_replica = values.PerReplica(values.SingleDeviceMap("CPU"), (v1, v2))
+    self.assertEqual(per_replica._shape_invariant_to_components(),
+                     (v1.shape, v2.shape))
+
+  def testShapeInvariantToComponentsExplicitShape(self):
+    v1 = constant_op.constant([1., 1., 1.])
+    v2 = constant_op.constant([2., 2., 2.])
+    per_replica = values.PerReplica(values.SingleDeviceMap("CPU"), (v1, v2))
+    shape = [None]
+    self.assertEqual(per_replica._shape_invariant_to_components(shape=shape),
+                     (shape, shape))
+
+  def testDoesNotTriggerFunctionTracing(self):
+    traces = []
+
+    @def_function.function
+    def f(x):
+      traces.append(None)  # Only happens on trace.
+      return x
+
+    per_replica = values.PerReplica(
+        values.SingleDeviceMap("CPU"), (constant_op.constant(1.),))
+
+    # Trace once.
+    f(per_replica)
+    self.assertNotEmpty(traces)
+    del traces[:]
+
+    metadata = per_replica._component_metadata()
+    for _ in range(5):
+      vals = per_replica._to_components()
+      vals = [v * 2 for v in vals]
+      per_replica = values.PerReplica._from_components(vals, metadata)
+
+      output = f(per_replica)
+      self.assertIsInstance(output, values.PerReplica)
+      self.assertAllEqual(output._values, per_replica._values)
+      self.assertAllEqual(output._device_map, per_replica._device_map)
+      self.assertAllEqual(output._logical_device, per_replica._logical_device)
+      self.assertEmpty(traces)  # Make sure we're not re-tracing `f`.
+
+  def testFunctionCanReturnPerReplica(self):
+    f = def_function.function(lambda x: x)
+    x = values.PerReplica(
+        values.SingleDeviceMap("CPU"), (constant_op.constant(1.),))
+    y = f(x)
+    self.assertIsNot(x, y)
+    for a, b in zip(x._to_components(), y._to_components()):
+      self.assertAllEqual(a, b)
+    self.assertEqual(x._component_metadata(), y._component_metadata())
+
+
+class WorkerDeviceMapTest(test.TestCase):
+
+  class ReplicaContext(object):
+
+    def __init__(self, replica_id_in_sync_group):
+      self.replica_id_in_sync_group = replica_id_in_sync_group
+
+  def testBasic(self):
+    devices = [
+        "/job:worker/replica:0/task:0/device:CPU:0",
+        "/job:worker/replica:0/task:2/device:CPU:0"
+    ]
+    device_map = values.WorkerDeviceMap(devices, 1)
+    self.assertAllEqual(devices, device_map.all_devices)
+
+    # pylint:disable=pointless-statement
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "`WorkerDeviceMap` is not indexed by replicas"):
+      device_map.devices_by_replica
+
+    self.assertEqual(1, device_map.num_logical_devices)
+
+    self.assertEqual(2, device_map.num_replicas_in_graph)
+
+    self.assertEqual(0, device_map.logical_device_from_values(["a", "b"]))
+
+    self.assertAllEqual(devices, device_map.logical_to_actual_devices(0))
+
+    replica_context = WorkerDeviceMapTest.ReplicaContext(1)
+    self.assertEqual(
+        "b", device_map.select_for_current_replica(["a", "b"], replica_context))
+
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "`WorkerDeviceMap` not indexed by replicas"):
+      device_map.replica_for_device(devices[1])
+
+    self.assertEqual("b", device_map.select_for_device(["a", "b"], devices[1]))
+
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "WorkerDeviceMap not indexed by replicas"):
+      device_map.is_device_in_replica(devices[1], 1)
+
+    self.assertEqual(
+        "WorkerDeviceMap(('/job:worker/replica:0/task:0/device:CPU:0', "
+        "'/job:worker/replica:0/task:2/device:CPU:0'), "
+        "num_replicas_per_worker=1)", repr(device_map))
+
+  def testMultipleReplicasPerWorker(self):
+    devices = [
+        "/job:worker/replica:0/task:0/device:CPU:0",
+        "/job:worker/replica:0/task:2/device:CPU:0"
+    ]
+    device_map = values.WorkerDeviceMap(devices, 2)
+
+    replica_context = WorkerDeviceMapTest.ReplicaContext(3)
+    self.assertEqual(
+        "b", device_map.select_for_current_replica(["a", "b"], replica_context))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/zero_batch_test.py b/tensorflow/python/distribute/zero_batch_test.py
index cb8ce07..bebec16c 100644
--- a/tensorflow/python/distribute/zero_batch_test.py
+++ b/tensorflow/python/distribute/zero_batch_test.py
@@ -23,6 +23,8 @@
 
 from tensorflow.python.distribute import combinations
 from tensorflow.python.distribute import strategy_combinations
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import normalization
@@ -32,17 +34,16 @@
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
-all_combinations = combinations.combine(
-    distribution=[
-        strategy_combinations.one_device_strategy,
-    ], mode=["graph"])
-
 
 class NormalizationTest(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(
-      combinations.times(all_combinations,
-                         combinations.combine(fused=[True, False])))
+      combinations.combine(
+          distribution=[
+              strategy_combinations.one_device_strategy,
+          ],
+          mode=["graph"],
+          fused=[True, False]))
   def disabled_testBNWithZeroBatchInput(self, distribution, fused):
     with distribution.scope(), self.cached_session() as sess:
       bn_list = []
@@ -106,6 +107,58 @@
       np_output = sess.run(predict_op, {inputs_placeholder: inputs})
       self.assertEqual([], np_output.tolist())
 
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.one_device_strategy,
+          ],
+          mode=["eager"],
+          fused=[True, False]))
+  def testBNWithZeroBatchInput(self, distribution, fused):
+    """Batch norm must train and predict on an empty batch in eager mode."""
+    with distribution.scope():
+      inputs = np.random.random((0, 4, 4, 3)).astype(np.float32) + 100
+      targets = np.random.random((0, 4, 4, 3)).astype(np.float32)
+      bn = normalization.BatchNormalization(
+          axis=3, epsilon=1e-3, momentum=0.9, fused=fused)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.01)
+
+      @def_function.function
+      def train_step():
+        def step_fn(inputs, targets):
+          with backprop.GradientTape() as tape:
+            outputs = bn.apply(inputs, training=True)
+            loss = losses.mean_squared_error(targets, outputs)
+          grads = tape.gradient(loss, bn.variables)
+          optimizer.apply_gradients(zip(grads, bn.variables))
+          return loss
+
+        return distribution.experimental_run_v2(
+            step_fn, args=(inputs, targets))
+
+      for _ in range(100):
+        np_output = train_step().numpy()
+        self.assertEqual(0.0, np_output)
+
+      # Verify that the statistics and weights are not changed after training.
+      self.assertAllEqual([0, 0, 0], bn.moving_mean.numpy())
+      self.assertAllEqual([1, 1, 1], bn.moving_variance.numpy())
+      self.assertAllEqual([1, 1, 1], bn.gamma.numpy())
+      self.assertAllEqual([0, 0, 0], bn.beta.numpy())
+
+      @def_function.function
+      def test_step():
+        def step_fn(inputs):
+          outputs = bn.apply(inputs, training=False)
+          return outputs
+
+        return distribution.experimental_run_v2(
+            step_fn, args=(inputs,))
+
+      # Test inference.
+      self.assertAllEqual(np.zeros(shape=(0, 4, 4, 3), dtype=np.float32),
+                          test_step().numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 17d0383..458ac15 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_py_test", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load(
     "//tensorflow/tools/test:performance.bzl",
@@ -102,7 +102,9 @@
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
+        "//tensorflow/python:c_api_util",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index aada963..af126d3 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -568,8 +568,8 @@
       if isinstance(grad, ops.Tensor):
         indexed_slices = ops.IndexedSlices(
             grad,
-            math_ops.range(grad.shape[0]),
-            constant_op.constant(grad.shape.as_list()))
+            math_ops.range(array_ops.shape(grad)[0]),
+            array_ops.shape(grad))
         indexed_slices_list.append(indexed_slices)
       else:
         indexed_slices_list.append(grad)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 3e24f44..7149245 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -959,13 +959,31 @@
         self.set_virtual_device_configuration(
             cpus[0], [VirtualDeviceConfiguration() for _ in range(num_cpus)])
 
+    # Parse GPU options
     gpus = [d for d in self._physical_devices if d.device_type == "GPU"]
+
+    # If there are no GPUs detected, simply ignore all the GPU options passed in
+    # rather than doing any validation checks.
+    if not gpus:
+      return
+
     gpu_count = self._config.device_count.get("GPU", None)
-    if gpu_count == 0:
-      self.set_visible_devices([], "GPU")
-    elif gpu_count is not None:
-      # TODO(gjn): Handle importing existing virtual GPU configuration
-      self.set_visible_devices(gpus[:gpu_count], "GPU")
+
+    visible_gpus = []
+    # TODO(gjn): Handle importing existing virtual GPU configuration
+    visible_indices = self._config.gpu_options.visible_device_list
+    if visible_indices:
+      for index in visible_indices.split(","):
+        if not 0 <= int(index) < len(gpus):  # Negative indices are invalid.
+          raise ValueError("Invalid visible device index: %s" % index)
+        visible_gpus.append(gpus[int(index)])
+    else:
+      visible_gpus = gpus
+
+    if gpu_count is not None:
+      visible_gpus = visible_gpus[:gpu_count]
+
+    self.set_visible_devices(visible_gpus, "GPU")
 
   def list_logical_devices(self, device_type=None):
     """Return logical devices."""
@@ -1176,6 +1194,9 @@
 
   @log_device_placement.setter
   def log_device_placement(self, enabled):
+    if self._log_device_placement == enabled:
+      return
+
     if self._context_handle is not None:
       raise RuntimeError(
           "Device placement logging must be set at program startup")
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index c0ed2f5..bdcc928 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -30,7 +30,6 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
@@ -38,7 +37,7 @@
 from tensorflow.python.util.tf_export import tf_export
 
 
-class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
+class UnliftedInitializerVariable(resource_variable_ops.UninitializedVariable):
   """Variable which does not lift its initializer out of function context.
 
   Instances of this variable, when created, build a graph which runs their
@@ -48,7 +47,7 @@
   mode. That is, non-function-building graphs are not supported.
   """
 
-  def __init__(self,  # pylint: disable=super-init-not-called
+  def __init__(self,
                initial_value=None,
                trainable=None,
                caching_device=None,
@@ -116,8 +115,6 @@
           caching_device=caching_device, name=name, dtype=dtype,
           constraint=constraint)
       return
-    with ops.init_scope():
-      self._in_graph_mode = not context.executing_eagerly()
     if initial_value is None:
       raise ValueError("initial_value must be specified.")
     init_from_fn = callable(initial_value)
@@ -130,44 +127,27 @@
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
-    synchronization, aggregation, trainable = (
-        variables.validate_synchronization_aggregation_trainable(
-            synchronization, aggregation, trainable, name))
-    self._trainable = trainable
-    self._synchronization = synchronization
-    self._aggregation = aggregation
-    self._save_slice_info = None
-    self._initial_value = None
-    self._initializer_op = None
-    self._is_initialized_op = None
-    self._graph_element = None
-    self._cached_value = None
-    # Store the graph key so optimizers know how to only retrieve variables from
-    # this graph. Guaranteed to be the same as the eager graph_key.
-    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     with ops.name_scope(name, "Variable", []
                         if init_from_fn else [initial_value]) as name:
-      # pylint: disable=protected-access
-      with ops.init_scope():
-        handle_name = ops.name_from_scope_name(name)
-        unique_id = "%s_%d" % (handle_name, ops.uid())
-        shared_name = context.shared_name(unique_id)
       with ops.name_scope("Initializer"), ops.device(None):
         initial_value = ops.convert_to_tensor(
             initial_value() if init_from_fn else initial_value,
             name="initial_value", dtype=dtype)
-      with ops.init_scope():
-        self._handle = resource_variable_ops.eager_safe_variable_handle(
-            initial_value=initial_value,
-            shared_name=shared_name,
-            name=name,
-            graph_mode=self._in_graph_mode)
-      self._shape = initial_value.shape
-      self._unique_id = unique_id
-      self._handle_name = handle_name + ":0"
-      self._dtype = initial_value.dtype.base_dtype
-      self._constraint = constraint
       assert initial_value is not None
+
+      # Use the constructor for UninitializedVariable to start.
+      super(UnliftedInitializerVariable, self).__init__(
+          trainable=trainable,
+          caching_device=caching_device,
+          name=name,
+          shape=initial_value.shape,
+          dtype=initial_value.dtype,
+          constraint=constraint,
+          synchronization=synchronization,
+          aggregation=aggregation,
+          extra_handle_data=initial_value,
+          **unused_kwargs)
+
       if self._in_graph_mode:
         with ops.init_scope():
           outer_graph = ops.get_default_graph()
@@ -188,13 +168,6 @@
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
               self._initializer_op = resource_variable_ops.assign_variable_op(
                   self._handle, lifted_initializer, name=n)
-          with ops.name_scope("Read"), ops.colocate_with(self._handle):
-            # Manually assign reads to the handle's device to avoid log
-            # messages.
-            with ops.device(self._handle.device):
-              value = self._read_variable_op()
-            self._graph_element = value
-          ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self)
       else:
         if add_initializers_to is not None:
           add_initializers_to[self] = initial_value
@@ -214,16 +187,6 @@
             resource_variable_ops.var_is_initialized_op(self._handle),
             not_assign_fn, assign_fn)
 
-    # After the handle has been created, set up a way to clean it up when
-    # executing eagerly. We'll hold the only reference to the deleter, so that
-    # when this object is garbage collected the deleter will be too. This
-    # means ResourceVariables can be part of reference cycles without those
-    # cycles being uncollectable.
-    if not self._in_graph_mode:
-      self._handle_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._handle, handle_device=self._handle.device)
-    self._cached_shape_as_list = None
-
 
 RUN_FUNCTIONS_EAGERLY = False
 
@@ -529,7 +492,7 @@
 
     # Note: using defun here avoids an infinite recursion.
     # Note: there is no reason not to autograph once the overhead is negligible.
-    @function_lib.defun(autograph=False)  # tf.function internal, pure graph
+    @function_lib.defun
     def initialize_variables():
       for v, init in initializer_map.items():
         with ops.init_scope():
@@ -607,7 +570,7 @@
     # This is run only at serialization time on likely very small inputs so we
     # are not concerned about O(n^2) runtime.
     for concrete_function in concrete_functions:
-      signature, _ = concrete_function.structured_input_signature
+      signature = concrete_function.structured_input_signature
       flattened = nest.flatten(signature)
       if any(
           isinstance(arg, func_graph_module.UnknownArgument)
@@ -915,9 +878,9 @@
 
   _Tracing and staging_
 
-  When `autograph` is `True`, all Python code that depends on `Tensor` values is
-  staged into a TensorFlow graph. When `autograph` is `False`, the function is
-  traced and control flow is not allowed to depend on data.
+  When `autograph` is `True`, all Python control flow that depends on `Tensor`
+  values is staged into a TensorFlow graph. When `autograph` is `False`, the
+  function is traced and control flow is not allowed to depend on data.
 
   Note that `function` only stages TensorFlow operations, all Python code that
   `func` executes and does not depend on data will shape the _construction_ of
@@ -954,6 +917,33 @@
   such as a loop. If your code uses side effects that are not intended to
   control graph construction, wrap them inside `tf.compat.v1.py_func`.
 
+  _Retracing_
+
+  A single `tf.function` object might need to map to multiple computation graphs
+  under the hood. This should only affect performance (tracing graphs has a
+  nonzero computational and memory cost) and should not affect the correctness
+  of the program. A traced function should return the same result as it would
+  when run eagerly, assuming no unintended Python side-effects.
+
+  Calling a `tf.function` with tensor arguments of different dtypes should lead
+  to at least one computational graph per distinct set of dtypes. Conversely,
+  always calling a `tf.function` with tensor arguments of the same shapes and
+  dtypes and the same non-tensor arguments should not lead to additional
+  retracings of your function.
+
+  Other than that, TensorFlow reserves the right to retrace functions as many
+  times as needed, to ensure that traced functions behave as they would when run
+  eagerly and to provide the best end-to-end performance. For example, the
+  behavior of how many traces TensorFlow will do when the function is repeatedly
+  called with different python scalars as arguments is left undefined to allow
+  for future optimizations.
+
+  To control the tracing behavior, use the following tools:
+   - different `tf.function` objects are guaranteed to not share traces; and
+   - specifying a signature or using concrete function objects returned from
+     `get_concrete_function()` guarantees that only one function graph will be
+     built.
+
   Args:
     func: function to be compiled. If `func` is None, returns a decorator that
       can be invoked with a single argument - `func`. The end result is
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 3bedb42..df8da0f 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -17,6 +17,8 @@
 from __future__ import division
 from __future__ import print_function
 
+from six.moves import range
+
 import functools
 import weakref
 
@@ -113,6 +115,14 @@
     with self.assertRaises(ValueError):
       fn(1.0)
 
+  def testRange(self):
+
+    @def_function.function
+    def f(unused_x):
+      return 1.0
+
+    self.assertAllEqual(f(range(5)), 1.0)
+
   def testCorrectVariableCreation(self):
 
     state = []
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index b5abda9..b947946 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -588,8 +588,10 @@
       raise AssertionError("Expected all args to be Tensors or Variables; "
                            "but got CompositeTensor: %r" % args)
 
-    for v in self._func_graph.variables:
-      resource_variable_ops.variable_accessed(v)
+    if (tape.could_possibly_record() or
+        hasattr(ops.get_default_graph(), "watch_variable")):
+      for v in self._func_graph.variables:
+        resource_variable_ops.variable_accessed(v)
 
     tensor_inputs = []
     variables_used = set([])
@@ -966,6 +968,8 @@
     #   - remove the corresponding arguments,
     #   - remove the corresponding keywords.
     _, unwrapped = tf_decorator.unwrap(python_function)
+    # TODO(b/131153379): Consider Python3's fullargspec.kwonlyargs and
+    # fullargspec.kwonlydefaults.
     if isinstance(unwrapped, functools.partial):
       # Also consider the Python3 case with kwonlydefaults.
       if fullargspec.defaults or fullargspec.kwonlydefaults:
@@ -1037,7 +1041,7 @@
 
       self._input_signature = tuple(input_signature)
       self._flat_input_signature = tuple(nest.flatten(input_signature,
-                                                      expand_composites=False))
+                                                      expand_composites=True))
 
   @property
   def fullargspec(self):
@@ -1654,7 +1658,10 @@
 
 def validate_signature(signature):
+  """Raises TypeError unless `signature` flattens to only `TensorSpec`s."""
   if any(not isinstance(arg, tensor_spec.TensorSpec)
-         for arg in nest.flatten(signature, expand_composites=False)):
-    raise TypeError("Invalid input_signature %s; input_signature must be "
-                    "a possibly nested sequence of TensorSpec objects.")
+         for arg in nest.flatten(signature, expand_composites=True)):
+    # `signature` may itself be a tuple, so wrap it to format it as one value.
+    raise TypeError("Invalid input_signature %s; input_signature must be "
+                    "a possibly nested sequence of TensorSpec objects." %
+                    (signature,))
 
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
index 7cf7757..98dec0b 100644
--- a/tensorflow/python/eager/function_gradients_test.py
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -226,8 +226,7 @@
     self.assertAllEqual(g, 1.0)
 
   def testGradient(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     def sq(x):
       return matmul(x, x, transpose_a=True)
@@ -697,8 +696,7 @@
     self.assertAllEqual(g2, 2.0)
 
   def testGradientWithKeywordArguments(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     def sq(x):
       return matmul(a=x, b=x, transpose_a=True)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 02130b0..52ab4c9 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -41,6 +41,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import sparse_tensor_spec
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_ops
@@ -65,7 +66,9 @@
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_spec
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_ops
 from tensorflow.python.util import compat
@@ -116,8 +119,7 @@
 class FunctionTest(test.TestCase, parameterized.TestCase):
 
   def testBasic(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
     sq = matmul(t, t, transpose_a=True)
     sq2 = matmul(sq, t, transpose_a=True)
@@ -289,8 +291,7 @@
     self.assertEqual(add_2._name, 'add_2')
 
   def testBasicGraphMode(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     @def_function.function
     def sq(a):
@@ -301,8 +302,7 @@
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedInputsGraphMode(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -316,8 +316,7 @@
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedOutputsGraphMode(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -346,8 +345,7 @@
       self.assertEqual(f().shape, ())
 
   def testBasicGraphFunction(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     @def_function.function
     def sq(a):
@@ -361,8 +359,7 @@
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testInputSpecGraphFunction(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     @def_function.function
     def sq(a):
@@ -381,8 +378,7 @@
     self.assertAllEqual(out2, math_ops.matmul(t2, t2).numpy())
 
   def testNestedInputSpecGraphFunction(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     @def_function.function
     def sq(mats):
@@ -476,8 +472,7 @@
     self.assertAllEqual(f(), x)
 
   def testNestedInputsGraphFunction(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -494,8 +489,7 @@
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedOutputGraphFunction(self):
-    # TODO(b/121134877): Remove the autograph override.
-    matmul = def_function.function(math_ops.matmul, autograph=False)
+    matmul = def_function.function(math_ops.matmul)
 
     @def_function.function
     def sq(a):
@@ -964,6 +958,16 @@
        {'flat_values': [1, 2, 3], 'nested_row_lengths': [[1, 2], [2, 0, 1]]}),
       (sparse_tensor.SparseTensor,
        {'values': [1, 2, 3], 'indices': [[0], [8], [10]], 'dense_shape': [20]}),
+      (ragged_tensor.RaggedTensor.from_row_lengths,
+       {'values': [1, 2, 3], 'row_lengths': [2, 0, 1]},
+       [ragged_tensor_spec.ragged_tensor_spec([None, None], dtypes.int32)]),
+      (ragged_tensor.RaggedTensor.from_nested_row_lengths,
+       {'flat_values': [1, 2, 3], 'nested_row_lengths': [[1, 2], [2, 0, 1]]},
+       [ragged_tensor_spec.ragged_tensor_spec([None, None, None],
+                                              dtypes.int32)]),
+      (sparse_tensor.SparseTensor,
+       {'values': [1, 2, 3], 'indices': [[0], [8], [10]], 'dense_shape': [20]},
+       [sparse_tensor_spec.sparse_tensor_spec([None], dtypes.int32)]),
   ])  # pyformat: disable
   def testCompositeAsArgumentTensorWithDefun(self,
                                              factory_fn,
@@ -988,8 +992,7 @@
   @test_util.run_gpu_only
   def testFunctionOnDevice(self):
     x = constant_op.constant([1.]).gpu()
-    # TODO(b/121134877): Remove the autograph override.
-    f = def_function.function(math_ops.add, autograph=False)
+    f = def_function.function(math_ops.add)
     y = f(x, x).cpu()
     self.assertAllEqual(y, [2.])
 
@@ -1053,8 +1056,7 @@
   @test_util.run_gpu_only
   def testFunctionHandlesInputsOnDifferentDevices(self):
     # The Reshape op requires the shape tensor to be placed in host memory.
-    # TODO(b/121134877): Remove the autograph override.
-    reshape = def_function.function(array_ops.reshape, autograph=False)
+    reshape = def_function.function(array_ops.reshape)
     value = constant_op.constant([1., 2.]).gpu()
     shape = constant_op.constant([2, 1])
     reshaped = reshape(value, shape).cpu()
@@ -1063,8 +1065,7 @@
   @test_util.run_gpu_only
   def testFunctionHandlesInputsPlacedOnTheWrongDeviceGracefully(self):
     # The Reshape op requires the shape tensor to be placed in host memory.
-    # TODO(b/121134877): Remove the autograph override.
-    reshape = def_function.function(array_ops.reshape, autograph=False)
+    reshape = def_function.function(array_ops.reshape)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
     reshape(value, shape)  # No error is raised
@@ -1123,9 +1124,7 @@
       self.assertEqual(1, int(self.evaluate(read())))
 
   def testSequenceInputs(self):
-    # TODO(b/121134877): Remove the autograph override.
-    clip_by_global_norm = def_function.function(
-        clip_ops.clip_by_global_norm, autograph=False)
+    clip_by_global_norm = def_function.function(clip_ops.clip_by_global_norm)
     t_list = [constant_op.constant(1.0), constant_op.constant(2.0)]
     clipped_list, global_norm = clip_by_global_norm(t_list,
                                                     constant_op.constant(.2))
@@ -1713,7 +1712,6 @@
     self.assertAllEqual(x.numpy(), foo(x).numpy())
 
   def testInputSignatureWithPartialFunction(self):
-    self.skipTest('b/124441704')
     def full_function(a, b, c=3.0):
       return a, b, c
 
@@ -1791,6 +1789,43 @@
     result = x(constant_op.constant(5.0), constant_op.constant(5))
     self.assertAllEqual(result, [5.0, 5])
 
+  def testInputSignatureWithCompositeTensors(self):
+    def f(rt):
+      self.assertEqual(rt.values.shape.as_list(), [None])
+      self.assertEqual(rt.row_splits.shape.as_list(), [4])
+      return rt
+
+    signature = [ragged_tensor_spec.ragged_tensor_spec(
+        shape=[3, None], dtype=dtypes.int32)]
+    defined = function.defun(f, input_signature=signature)
+    rt1 = ragged_factory_ops.constant([[1], [], [2, 3, 4]])
+    out1 = defined(rt1)
+    self.assertLen(total_function_cache(defined), 1)
+    self.assertAllEqual(out1.values, rt1.values)
+    self.assertAllEqual(out1.row_splits, rt1.row_splits)
+
+    # Changing the row lengths shouldn't create a new function.
+    rt2 = ragged_factory_ops.constant([[1, 2], [3, 4], [5]])
+    out2 = defined(rt2)
+    self.assertLen(total_function_cache(defined), 1)
+    self.assertAllEqual(out2.values, rt2.values)
+    self.assertAllEqual(out2.row_splits, rt2.row_splits)
+
+    # Different number of rows
+    rt3 = ragged_factory_ops.constant([[1, 2], [3, 4], [5], [6]])
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      defined(rt3)
+
+    # Different dtype
+    rt4 = ragged_factory_ops.constant([[1.0, 2.0], [], [3.0]])
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      defined(rt4)
+
+    # Different rank
+    rt5 = ragged_factory_ops.constant([[[1]], [[2]], [[3]]])
+    with self.assertRaisesRegexp(ValueError, 'do not match'):
+      defined(rt5)
+
   def testTensorKeywordArguments(self):
 
     def foo(a, b):
@@ -2580,6 +2615,20 @@
     # Tracing more than twice per input doesn't make sense.
     self.assertLess(trace_count[0], 13)
 
+  def testLimitedRetracingWithCompositeTensors(self):
+    trace_count = [0]
+
+    @def_function.function
+    def f(x):
+      trace_count[0] += 1
+      return x
+
+    for i in range(10):
+      f(ragged_factory_ops.constant([[1, 2], [i]]))
+      f(ragged_factory_ops.constant([[1, 2], [], [3, 4, 5]]))
+      f(ragged_factory_ops.constant([[[1, 2], [3]], [[4, 5, 6]]]))
+      self.assertEqual(trace_count[0], 3)
+
   def test_concrete_function_shape_mismatch(self):
 
     @def_function.function
@@ -2940,6 +2989,32 @@
 
     train()
 
+  def testEarlyStoppingTrainingLoopInFunction(self):
+    layer = core.Dense(2)
+    dataset = (
+        dataset_ops.DatasetV2.from_tensors(
+            (array_ops.ones([784]), array_ops.ones([], dtypes.int32)))
+        .map(lambda x, y: (x, y))
+        .repeat(10)
+        .batch(32))
+    optimizer = adam.Adam()
+
+    @def_function.function
+    def train():
+      for x, y in dataset:
+        with backprop.GradientTape() as tape:
+          out = layer(x)
+          loss = math_ops.reduce_mean(
+              nn_ops.sparse_softmax_cross_entropy_with_logits(
+                  logits=out, labels=y))
+        layer_variables = layer.trainable_variables
+        gradients = tape.gradient(loss, layer_variables)
+        optimizer.apply_gradients(zip(gradients, layer_variables))
+        if optimizer.iterations > 3:
+          break
+
+    train()
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
diff --git a/tensorflow/python/eager/monitoring.py b/tensorflow/python/eager/monitoring.py
index 335c2fd..fd4717f 100644
--- a/tensorflow/python/eager/monitoring.py
+++ b/tensorflow/python/eager/monitoring.py
@@ -20,7 +20,10 @@
 
 import collections
 
+from tensorflow.core.framework import summary_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.util import compat
 
 _MetricMethod = collections.namedtuple('MetricMethod', 'create delete get_cell')
 _counter_methods = [
@@ -37,32 +40,62 @@
         delete=pywrap_tensorflow.TFE_MonitoringDeleteCounter2,
         get_cell=pywrap_tensorflow.TFE_MonitoringGetCellCounter2),
 ]
-
-
-def gauge(name, label, value):
-  """Set the value of a Gauge metric.
-
-  If the metric with this name does not exist, it will create a new metric.
-
-  Args:
-    name: metric name
-    label: long label
-    value: a int64 value
-  """
-  pywrap_tensorflow.TFE_MonitoringSetGauge(name, label, value)
-
-
-def sampler(name, label, value):
-  """Add the value of a Sampler metric.
-
-  If the metric with this name does not exist, it will create a new metric.
-
-  Args:
-    name: metric name
-    label: metric label
-    value: a double value
-  """
-  pywrap_tensorflow.TFE_MonitoringAddSampler(name, label, value)
+_int_gauge_methods = [
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewIntGauge0,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteIntGauge0,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellIntGauge0),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewIntGauge1,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteIntGauge1,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellIntGauge1),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewIntGauge2,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteIntGauge2,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellIntGauge2),
+]
+_string_gauge_methods = [
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewStringGauge0,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteStringGauge0,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellStringGauge0),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewStringGauge1,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteStringGauge1,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellStringGauge1),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewStringGauge2,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteStringGauge2,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellStringGauge2),
+]
+_bool_gauge_methods = [
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewBoolGauge0,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteBoolGauge0,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellBoolGauge0),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewBoolGauge1,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteBoolGauge1,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellBoolGauge1),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewBoolGauge2,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteBoolGauge2,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellBoolGauge2),
+]
+_sampler_methods = [
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewSampler0,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteSampler0,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellSampler0),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewSampler1,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteSampler1,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellSampler1),
+    _MetricMethod(
+        create=pywrap_tensorflow.TFE_MonitoringNewSampler2,
+        delete=pywrap_tensorflow.TFE_MonitoringDeleteSampler2,
+        get_cell=pywrap_tensorflow.TFE_MonitoringGetCellSampler2),
+]
 
 
 class Metric(object):
@@ -88,7 +121,8 @@
     self._metric = self._metric_methods[self._label_length].create(*args)
 
   def __del__(self):
-    self._metric_methods[self._label_length].delete(self._metric)
+    if hasattr(self, '_metric'):
+      self._metric_methods[self._label_length].delete(self._metric)
 
   def get_cell(self, *labels):
     """Retrieves the cell."""
@@ -124,7 +158,12 @@
 
 
 class Counter(Metric):
-  """A stateful class for updating a cumulative integer metric."""
+  """A stateful class for updating a cumulative integer metric.
+
+  This class encapsulates a set of values (or a single value for a label-less
+  metric). Each value is identified by a tuple of labels. The class allows the
+  user to increment each value.
+  """
 
   def __init__(self, name, description, *labels):
     """Creates a new Counter.
@@ -132,7 +171,7 @@
     Args:
       name: name of the new metric.
       description: description of the new metric.
-      *labels: The label list of the new metrics
+      *labels: The label list of the new metric.
     """
     super(Counter, self).__init__('Counter', _counter_methods, len(labels),
                                   name, description, *labels)
@@ -140,3 +179,248 @@
   def get_cell(self, *labels):
     """Retrieves the cell."""
     return CounterCell(super(Counter, self).get_cell(*labels))
+
+
+class IntGaugeCell(object):
+  """A single integer value stored in an `IntGauge`."""
+
+  def __init__(self, cell):
+    """Creates a new IntGaugeCell.
+
+    Args:
+      cell: A c pointer of TFE_MonitoringIntGaugeCell.
+    """
+    self._cell = cell
+
+  def set(self, value):
+    """Atomically set the value.
+
+    Args:
+      value: integer value.
+    """
+    pywrap_tensorflow.TFE_MonitoringIntGaugeCellSet(self._cell, value)
+
+  def value(self):
+    """Retrieves the current value."""
+    return pywrap_tensorflow.TFE_MonitoringIntGaugeCellValue(self._cell)
+
+
+class IntGauge(Metric):
+  """A stateful class for updating a gauge-like integer metric.
+
+  This class encapsulates a set of integer values (or a single value for a
+  label-less metric). Each value is identified by a tuple of labels. The class
+  allows the user to set each value.
+  """
+
+  def __init__(self, name, description, *labels):
+    """Creates a new IntGauge.
+
+    Args:
+      name: name of the new metric.
+      description: description of the new metric.
+      *labels: The label list of the new metric.
+    """
+    super(IntGauge, self).__init__('IntGauge', _int_gauge_methods, len(labels),
+                                   name, description, *labels)
+
+  def get_cell(self, *labels):
+    """Retrieves the cell."""
+    return IntGaugeCell(super(IntGauge, self).get_cell(*labels))
+
+
+class StringGaugeCell(object):
+  """A single string value stored in a `StringGauge`."""
+
+  def __init__(self, cell):
+    """Creates a new StringGaugeCell.
+
+    Args:
+      cell: A c pointer of TFE_MonitoringStringGaugeCell.
+    """
+    self._cell = cell
+
+  def set(self, value):
+    """Atomically set the value.
+
+    Args:
+      value: string value.
+    """
+    pywrap_tensorflow.TFE_MonitoringStringGaugeCellSet(self._cell, value)
+
+  def value(self):
+    """Retrieves the current value."""
+    with c_api_util.tf_buffer() as buffer_:
+      pywrap_tensorflow.TFE_MonitoringStringGaugeCellValue(self._cell, buffer_)
+      value = pywrap_tensorflow.TF_GetBuffer(buffer_).decode('utf-8')
+    return value
+
+
+class StringGauge(Metric):
+  """A stateful class for updating a gauge-like string metric.
+
+  This class encapsulates a set of string values (or a single value for a
+  label-less metric). Each value is identified by a tuple of labels. The class
+  allows the user to set each value.
+  """
+
+  def __init__(self, name, description, *labels):
+    """Creates a new StringGauge.
+
+    Args:
+      name: name of the new metric.
+      description: description of the new metric.
+      *labels: The label list of the new metric.
+    """
+    super(StringGauge, self).__init__('StringGauge', _string_gauge_methods,
+                                      len(labels), name, description, *labels)
+
+  def get_cell(self, *labels):
+    """Retrieves the cell."""
+    return StringGaugeCell(super(StringGauge, self).get_cell(*labels))
+
+
+class BoolGaugeCell(object):
+  """A single boolean value stored in a `BoolGauge`."""
+
+  def __init__(self, cell):
+    """Creates a new BoolGaugeCell.
+
+    Args:
+      cell: A c pointer of TFE_MonitoringBoolGaugeCell.
+    """
+    self._cell = cell
+
+  def set(self, value):
+    """Atomically set the value.
+
+    Args:
+      value: bool value.
+    """
+    pywrap_tensorflow.TFE_MonitoringBoolGaugeCellSet(self._cell, value)
+
+  def value(self):
+    """Retrieves the current value."""
+    return pywrap_tensorflow.TFE_MonitoringBoolGaugeCellValue(self._cell)
+
+
+class BoolGauge(Metric):
+  """A stateful class for updating a gauge-like bool metric.
+
+  This class encapsulates a set of boolean values (or a single value for a
+  label-less metric). Each value is identified by a tuple of labels. The class
+  allows the user to set each value.
+  """
+
+  def __init__(self, name, description, *labels):
+    """Creates a new BoolGauge.
+
+    Args:
+      name: name of the new metric.
+      description: description of the new metric.
+      *labels: The label list of the new metric.
+    """
+    super(BoolGauge, self).__init__('BoolGauge', _bool_gauge_methods,
+                                    len(labels), name, description, *labels)
+
+  def get_cell(self, *labels):
+    """Retrieves the cell."""
+    return BoolGaugeCell(super(BoolGauge, self).get_cell(*labels))
+
+
+class SamplerCell(object):
+  """SamplerCell stores each value of a Sampler."""
+
+  def __init__(self, cell):
+    """Creates a new SamplerCell.
+
+    Args:
+      cell: A c pointer of TFE_MonitoringSamplerCell.
+    """
+    self._cell = cell
+
+  def add(self, value):
+    """Atomically add a sample.
+
+    Args:
+      value: float value.
+    """
+    pywrap_tensorflow.TFE_MonitoringSamplerCellAdd(self._cell, value)
+
+  def value(self):
+    """Retrieves the current distribution of samples.
+
+    Returns:
+      A HistogramProto describing the distribution of samples.
+    """
+    with c_api_util.tf_buffer() as buffer_:
+      pywrap_tensorflow.TFE_MonitoringSamplerCellValue(self._cell, buffer_)
+      proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
+    histogram_proto = summary_pb2.HistogramProto()
+    histogram_proto.ParseFromString(compat.as_bytes(proto_data))
+    return histogram_proto
+
+
+class Buckets(object):
+  """Bucketing strategies for the samplers."""
+
+  def __init__(self, buckets):
+    """Creates a new Buckets.
+
+    Args:
+      buckets: A c pointer of TFE_MonitoringBuckets.
+    """
+    self.buckets = buckets
+
+  def __del__(self):
+    # `buckets` is never assigned if construction failed before `__init__` ran
+    # (e.g. a subclass raised while creating the underlying C buckets).
+    if hasattr(self, 'buckets'):
+      pywrap_tensorflow.TFE_MonitoringDeleteBuckets(self.buckets)
+
+
+class ExponentialBuckets(Buckets):
+  """Exponential bucketing strategy.
+
+  Sets up buckets of the form:
+      [-DBL_MAX, ..., scale * growth_factor^i,
+       scale * growth_factor^(i + 1), ..., DBL_MAX].
+  """
+
+  def __init__(self, scale, growth_factor, bucket_count):
+    """Creates a new exponential Buckets.
+
+    Args:
+      scale: float
+      growth_factor: float
+      bucket_count: integer
+    """
+    super(ExponentialBuckets, self).__init__(
+        pywrap_tensorflow.TFE_MonitoringNewExponentialBuckets(
+            scale, growth_factor, bucket_count))
+
+
+class Sampler(Metric):
+  """A stateful class for updating a cumulative histogram metric.
+
+  This class encapsulates a set of histograms (or a single histogram for a
+  label-less metric) configured with a list of increasing bucket boundaries.
+  Each histogram is identified by a tuple of labels. The class allows the
+  user to add a sample to each histogram value.
+  """
+
+  def __init__(self, name, buckets, description, *labels):
+    """Creates a new Sampler.
+
+    Args:
+      name: name of the new metric.
+      buckets: bucketing strategy of the new metric.
+      description: description of the new metric.
+      *labels: The label list of the new metric.
+    """
+    super(Sampler, self).__init__('Sampler', _sampler_methods, len(labels),
+                                  name, buckets.buckets, description, *labels)
+
+  def get_cell(self, *labels):
+    """Retrieves the cell."""
+    return SamplerCell(super(Sampler, self).get_cell(*labels))
diff --git a/tensorflow/python/eager/monitoring_test.py b/tensorflow/python/eager/monitoring_test.py
index cc3ef39..3f60173 100644
--- a/tensorflow/python/eager/monitoring_test.py
+++ b/tensorflow/python/eager/monitoring_test.py
@@ -26,11 +26,6 @@
 
 class MonitoringTest(test_util.TensorFlowTestCase):
 
-  def test_monitoring(self):
-    # These methods should not throw any exception.
-    monitoring.gauge('test/gauge', 'label', 1)
-    monitoring.sampler('test/sampler', 'label', 1.0)
-
   def test_counter(self):
     counter = monitoring.Counter('test/counter', 'test counter')
     counter.get_cell().increase_by(1)
@@ -52,6 +47,59 @@
     with self.assertRaises(errors.AlreadyExistsError):
       counter2 = monitoring.Counter('test/same_counter', 'test counter')  # pylint: disable=unused-variable
 
+  def test_int_gauge(self):
+    gauge = monitoring.IntGauge('test/gauge', 'test gauge')
+    gauge.get_cell().set(1)
+    self.assertEqual(gauge.get_cell().value(), 1)
+    gauge.get_cell().set(5)
+    self.assertEqual(gauge.get_cell().value(), 5)
+
+    gauge1 = monitoring.IntGauge('test/gauge1', 'test gauge1', 'label1')
+    gauge1.get_cell('foo').set(2)
+    self.assertEqual(gauge1.get_cell('foo').value(), 2)
+
+  def test_string_gauge(self):
+    gauge = monitoring.StringGauge('test/str_gauge', 'test gauge')
+    gauge.get_cell().set('left')
+    self.assertEqual(gauge.get_cell().value(), 'left')
+    gauge.get_cell().set('right')
+    self.assertEqual(gauge.get_cell().value(), 'right')
+
+    gauge1 = monitoring.StringGauge('test/str_gauge1', 'test gauge1', 'label1')
+    gauge1.get_cell('foo').set('start')
+    self.assertEqual(gauge1.get_cell('foo').value(), 'start')
+
+  def test_bool_gauge(self):
+    gauge = monitoring.BoolGauge('test/bool_gauge', 'test gauge')
+    gauge.get_cell().set(True)
+    self.assertTrue(gauge.get_cell().value())
+    gauge.get_cell().set(False)
+    self.assertFalse(gauge.get_cell().value())
+
+    gauge1 = monitoring.BoolGauge('test/bool_gauge1', 'test gauge1', 'label1')
+    gauge1.get_cell('foo').set(True)
+    self.assertTrue(gauge1.get_cell('foo').value())
+
+  def test_sampler(self):
+    buckets = monitoring.ExponentialBuckets(1.0, 2.0, 2)
+    sampler = monitoring.Sampler('test/sampler', buckets, 'test sampler')
+    sampler.get_cell().add(1.0)
+    sampler.get_cell().add(5.0)
+    histogram_proto = sampler.get_cell().value()
+    self.assertEqual(histogram_proto.min, 1.0)
+    self.assertEqual(histogram_proto.num, 2.0)
+    self.assertEqual(histogram_proto.sum, 6.0)
+
+    sampler1 = monitoring.Sampler('test/sampler1', buckets, 'test sampler',
+                                  'label1')
+    sampler1.get_cell('foo').add(2.0)
+    sampler1.get_cell('foo').add(4.0)
+    sampler1.get_cell('bar').add(8.0)
+    histogram_proto1 = sampler1.get_cell('foo').value()
+    self.assertEqual(histogram_proto1.max, 4.0)
+    self.assertEqual(histogram_proto1.num, 2.0)
+    self.assertEqual(histogram_proto1.sum, 6.0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 4190c5b..0f9da66 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -283,9 +283,6 @@
   // cycles, and hence don't provide GC support for it.
   PyObject* handle_data;
 
-  // This stores `_keras_mask` object and is set by Tensorflow layers.
-  PyObject* keras_mask;
-
   // This stores `_tensor_shape`, a cached `TensorShape` object, and is set the
   // first time that `_EagerTensorBase`'s `shape` property is called.
   PyObject* tensor_shape;
@@ -349,8 +346,6 @@
   Py_INCREF(Py_None);
   self->handle_data = Py_None;
   Py_INCREF(Py_None);
-  self->keras_mask = Py_None;
-  Py_INCREF(Py_None);
   self->tensor_shape = Py_None;
   self->status = TF_NewStatus();
   self->dict = nullptr;
@@ -498,7 +493,6 @@
 
   TF_DeleteStatus(self->status);
   Py_DECREF(self->handle_data);
-  Py_DECREF(self->keras_mask);
   Py_DECREF(self->tensor_shape);
   // If an attribute dictionary has been created, release it. Note that this
   // is only ever created by CPython's attribute setting methods; we don't
@@ -593,19 +587,6 @@
   return 0;
 }
 
-static PyObject* EagerTensor_keras_mask(EagerTensor* self, void* unused) {
-  Py_INCREF(self->keras_mask);
-  return self->keras_mask;
-}
-
-static int EagerTensor_setkeras_mask(EagerTensor* self, PyObject* value,
-                                     void* unused) {
-  Py_DECREF(self->keras_mask);
-  Py_INCREF(value);
-  self->keras_mask = value;
-  return 0;
-}
-
 static PyObject* EagerTensor_tensor_shape(EagerTensor* self, void* unused) {
   Py_INCREF(self->tensor_shape);
   return self->tensor_shape;
@@ -646,7 +627,36 @@
     PyErr_SetString(PyExc_RuntimeError, TF_Message(status.get()));
     return nullptr;
   }
+
+  // HACK(slebedev): The following explains why TensorToNdarray never
+  // reuses the storage.
+  //
+  // TF_TensorToPyArray copies the storage unless its
+  // refcount is 1. For DT_STRING and DT_RESOURCE TF_TensorFromTensor
+  // has to copy so the refcount of the original storage is unchanged.
+  // However, if the storage can be reused by TF_TensorFromTensor its
+  // refcount is +1'd and hence TF_TensorToPyArray no longer can reuse it.
+  //
+  // Here we attempt a direct conversion without an intermediate TF_Tensor
+  // and fall-back to the slow path on failure.
   PyObject* ret = nullptr;
+  if (t->dtype() != tensorflow::DT_STRING &&
+      t->dtype() != tensorflow::DT_RESOURCE) {
+    tensorflow::gtl::InlinedVector<npy_intp, 4> dims(t->dims());
+    for (int d = 0; d < t->dims(); ++d) {
+      dims[d] = t->dim_size(d);
+    }
+
+    auto* copy = new tensorflow::Tensor(*t);
+    char* data = const_cast<char*>(copy->tensor_data().data());
+    if (tensorflow::ArrayFromMemory(
+            dims.size(), dims.data(), data, t->dtype(), [copy] { delete copy; },
+            &ret)
+            .ok()) {
+      return ret;
+    }
+  }
+
   auto cppstatus = tensorflow::TensorToNdarray(*t, &ret);
   if (MaybeRaiseExceptionFromStatus(cppstatus, PyExc_RuntimeError)) {
     Py_XDECREF(ret);
@@ -697,9 +707,6 @@
     {const_cast<char*>("_handle_data"), (getter)EagerTensor_tensor_handle,
      (setter)EagerTensor_settensor_handle, const_cast<char*>("_tensor_handle"),
      nullptr},
-    {const_cast<char*>("_keras_mask"), (getter)EagerTensor_keras_mask,
-     (setter)EagerTensor_setkeras_mask, const_cast<char*>("_keras_mask"),
-     nullptr},
     {const_cast<char*>("_tensor_shape"), (getter)EagerTensor_tensor_shape,
      (setter)EagerTensor_settensor_shape, const_cast<char*>("_tensor_shape"),
      nullptr},
@@ -824,8 +831,6 @@
     Py_INCREF(Py_None);
     t->handle_data = Py_None;
     Py_INCREF(Py_None);
-    t->keras_mask = Py_None;
-    Py_INCREF(Py_None);
     t->tensor_shape = Py_None;
     t->handle = handle;
     t->status = TF_NewStatus();
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index efffe95..7993abc 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1802,7 +1802,13 @@
 }
 
 bool CheckResourceVariable(PyObject* item) {
-  return PyObject_TypeCheck(item, resource_variable_type);
+  if (!PyObject_TypeCheck(item, resource_variable_type)) return false;
+  tensorflow::Safe_PyObjectPtr handle(PyObject_GetAttrString(item, "_handle"));
+  if (handle == nullptr) {
+    PyErr_Clear();  // No `_handle` attribute: cannot be an eager variable.
+    return false;
+  }
+  return EagerTensor_CheckExact(handle.get());
 }
 
 bool IsNumberType(PyObject* item) {
@@ -2861,7 +2867,6 @@
 
 namespace {
 const char kTensor[] = "T";
-const char kIndexedSlices[] = "I";
 const char kList[] = "L";
 const char kListEnd[] = "l";
 const char kTuple[] = "U";
@@ -2872,6 +2877,7 @@
 const char kShapeDelim[] = "-";
 const char kDType[] = "d";
 const char kNone[] = "n";
+const char kCompositeTensor[] = "C";
 
 struct EncodeResult {
   string str;
@@ -3012,38 +3018,6 @@
     absl::StrAppend(&result->str, kTensor);
     TF_RETURN_IF_ERROR(
         TFE_Py_EncodeTensor(arg, include_tensor_ranks_only, result));
-  } else if (tensorflow::swig::IsIndexedSlices(arg)) {
-    absl::StrAppend(&result->str, kIndexedSlices);
-    tensorflow::Safe_PyObjectPtr values(PyObject_GetAttrString(arg, "values"));
-    if (values == nullptr) {
-      PyErr_Clear();
-      return tensorflow::errors::InvalidArgument(
-          "IndexedSlices does not have a values attr");
-    }
-    TF_RETURN_IF_ERROR(
-        TFE_Py_EncodeTensor(values.get(), include_tensor_ranks_only, result));
-
-    tensorflow::Safe_PyObjectPtr indices(
-        PyObject_GetAttrString(arg, "indices"));
-    if (indices == nullptr) {
-      PyErr_Clear();
-      return tensorflow::errors::InvalidArgument(
-          "IndexedSlices does not have a indices attr");
-    }
-    TF_RETURN_IF_ERROR(
-        TFE_Py_EncodeTensor(indices.get(), include_tensor_ranks_only, result));
-
-    tensorflow::Safe_PyObjectPtr dense_shape(
-        PyObject_GetAttrString(arg, "dense_shape"));
-    if (dense_shape == nullptr) {
-      PyErr_Clear();
-      return tensorflow::errors::InvalidArgument(
-          "IndexedSlices does not have a dense_shape attr");
-    }
-    if (dense_shape.get() != Py_None) {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(
-          dense_shape.get(), include_tensor_ranks_only, result));
-    }
   } else if (PyList_Check(arg)) {
     TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
         arg, kList, kListEnd, include_tensor_ranks_only, result));
@@ -3067,6 +3041,28 @@
       TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(
           value.get(), include_tensor_ranks_only, result));
     }
+  } else if (tensorflow::swig::IsCompositeTensor(arg)) {
+    absl::StrAppend(&result->str, kCompositeTensor);
+
+    static char _to_components[] = "_to_components";
+    tensorflow::Safe_PyObjectPtr components(
+        PyObject_CallMethod(arg, _to_components, nullptr));
+    if (components == nullptr) {
+      return tensorflow::errors::InvalidArgument(
+          "Error while calling CompositeTensor._to_components().");
+    }
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(
+        components.get(), include_tensor_ranks_only, result));
+
+    static char _component_metadata[] = "_component_metadata";
+    tensorflow::Safe_PyObjectPtr metadata(
+        PyObject_CallMethod(arg, _component_metadata, nullptr));
+    if (metadata == nullptr) {
+      return tensorflow::errors::InvalidArgument(
+          "Error while calling CompositeTensor._component_metadata().");
+    }
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(
+        metadata.get(), include_tensor_ranks_only, result));
   } else {
     PyObject* object = PyWeakref_NewRef(arg, nullptr);
 
@@ -3125,7 +3121,6 @@
     for (int i = 0; i < len; i += CHUNK_SIZE) {
       PySys_WriteStdout("%s", string_msg.substr(i, CHUNK_SIZE).c_str());
     }
-    PySys_WriteStdout("\n");
 
     // Force flushing to make sure print newlines aren't interleaved in
     // some colab environments
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 9bded96..5299d1e 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -26,6 +26,7 @@
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -255,6 +256,18 @@
                                  "Expected list for 'values' argument"):
       _ = array_ops.stack(value, axis=1)
 
+  def testGraphResourceVariableRaisesFallback(self):
+    with ops.Graph().as_default():
+      a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+      m = resource_variable_ops.ResourceVariable(a_2_by_2)
+    ctx = context.context()
+    ctx.ensure_initialized()
+    with self.assertRaises(core._FallbackException):
+      pywrap_tensorflow.TFE_Py_FastPathExecute(ctx._handle, ctx.device_name,
+                                               "MatMul", None, None, m, m,
+                                               "transpose_a", False,
+                                               "transpose_b", False)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index f8e3f64..b583371 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -368,6 +368,13 @@
         "Provided value.*Requested dtype.*"):
       _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
 
+  def testEagerLargeConstant(self):
+    for t in [dtypes.uint64, dtypes.uint32, dtypes.int32, dtypes.int64]:
+      self.assertEqual(
+          constant_op.constant(t.max, dtype=t).numpy(), t.max)
+      self.assertEqual(
+          constant_op.constant(t.min, dtype=t).numpy(), t.min)
+
 
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
@@ -480,6 +487,11 @@
         ValueError, "non-rectangular Python sequence"):
       constant_op.constant(l)
 
+  def test_numpyIsView(self):
+    t = constant_op.constant([0.0])
+    t._numpy()[0] = 42.0
+    self.assertAllClose(t, constant_op.constant([42.0]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 8f4ca1f..9d7fed6 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -21,13 +21,11 @@
 
 import weakref
 
-from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training.tracking import data_structures
@@ -86,68 +84,110 @@
     return wrapped
 
 
+def _lift_single_variable(old_variable, graph, variable_holder):
+  """Lifts `old_variable` out of the `FuncGraph` `graph`."""
+  new_variable = resource_variable_ops.UninitializedVariable(
+      shape=old_variable.shape,
+      dtype=old_variable.dtype,
+      name=old_variable.op.name,
+      trainable=old_variable.trainable,
+      extra_handle_data=old_variable.handle)
+  new_variable._initializer_op = old_variable._initializer_op  # pylint: disable=protected-access
+  graph.inputs.append(old_variable.handle)
+  graph.captures[new_variable.handle] = old_variable.handle
+  # Now that we've added the new variable to graph.captures,
+  # graph.capture will use that cached value and do some post-processing
+  # on the capture like recording it on the tape.
+  graph.capture(new_variable.handle)
+  # pylint: disable=protected-access
+  variable_name = new_variable.name.split(":")[0]
+  variable_holder._variables_by_name[variable_name] = new_variable
+  graph._weak_variables.append(weakref.ref(new_variable))
+  # pylint: enable=protected-access
+  graph.watch_variable(new_variable)
+  return new_variable
+
+
+def _lift_unlifted_variables(graph, variable_holder):
+  """Finds resource variables and lifts them into the outer context.
+
+  When we import a GraphDef inside a wrap_function, no Python graph building
+  code runs. This means we get VarHandleOps which create variable resources,
+  but no corresponding Python objects. Leaving them like this works but gives
+  the user no way to interact with or modify the variables outside the graph.
+
+  This function searches for variables and lifts them out as regular variable
+  objects when possible, indicating to the FuncGraph that they are captures.
+
+  Args:
+    graph: The FuncGraph to lift variables from.
+    variable_holder: A VariableHolder to record the lifted variables in.
+  """
+  with graph.as_default():
+    global_collection_variables = ops.get_collection(
+        ops.GraphKeys.GLOBAL_VARIABLES)
+    local_collection_variables = ops.get_collection(
+        ops.GraphKeys.LOCAL_VARIABLES)
+    existing_captures = set(graph.internal_captures)
+    lifted_variables = {}
+
+    def _should_lift_variable(v):
+      return ((v._in_graph_mode  # pylint: disable=protected-access
+               and v.graph.building_function)
+              and isinstance(v, resource_variable_ops.ResourceVariable)
+              and v.handle not in existing_captures)
+
+    for old_variable in global_collection_variables:
+      if _should_lift_variable(old_variable):
+        new_variable = _lift_single_variable(
+            old_variable, graph, variable_holder)
+        lifted_variables[old_variable] = new_variable
+        existing_captures.add(old_variable.handle)
+
+    for old_variable in local_collection_variables:
+      if _should_lift_variable(old_variable):
+        new_variable = _lift_single_variable(
+            old_variable, graph, variable_holder)
+        lifted_variables[old_variable] = new_variable
+        existing_captures.add(old_variable.handle)
+        if new_variable._in_graph_mode:  # pylint: disable=protected-access
+          outer_graph = new_variable.graph
+          # Variables are added to the global collection by default. In this
+          # case we only want the variable in the local collection, so we'll pop
+          # it out.
+          global_collection = outer_graph.get_collection_ref(
+              ops.GraphKeys.GLOBAL_VARIABLES)
+          global_collection.remove(new_variable)
+          outer_graph.add_to_collection(
+              ops.GraphKeys.LOCAL_VARIABLES, new_variable)
+
+    # Update the FuncGraph's collections, partly for the user and partly so this
+    # function is idempotent when it runs again in prune() calls.
+    for collection_name in [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.LOCAL_VARIABLES
+    ]:
+      mutable_collection = ops.get_collection_ref(collection_name)
+      for index, current in enumerate(mutable_collection):
+        mutable_collection[index] = lifted_variables.get(current, current)
+
+
 # TODO(allenl): make this trackable
 class WrappedFunction(function.ConcreteFunction):
   """Wraps a tf V1 piece of code in a function."""
 
   def __init__(self, fn_graph, variable_holder, attrs=None, signature=None):
+    self._variable_holder = variable_holder
+    _lift_unlifted_variables(fn_graph, variable_holder)
+    # We call __init__ after lifting variables so that the function's signature
+    # properly reflects the new captured inputs.
     super(WrappedFunction, self).__init__(
         fn_graph, attrs=attrs, signature=signature)
-    self._variable_holder = variable_holder
-    if ops.executing_eagerly_outside_functions():
-      # TODO(allenl): Make this work in 1.x?
-      self._lift_unlifted_variables()
-
-  def _lift_unlifted_variables(self):
-    """Finds resource variables and lifts them into the outer context.
-
-    When we import a GraphDef inside a wrap_function, no Python graph building
-    code runs. This means we get VarHandleOps which create variable resources,
-    but no corresponding Python objects. Leaving them like this works but gives
-    the user no way to interact with or modify the variables outside the graph.
-
-    This method searches for variables and lifts them out as regular variable
-    objects when possible, indicating to the FuncGraph that they are captures.
-    """
-    with self.graph.as_default():
-      collection_variables = (
-          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) +
-          ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
-      existing_captures = set(self.graph.internal_captures)
-      lifted_variables = {}
-      for old_variable in collection_variables:
-        if (old_variable._in_graph_mode  # pylint: disable=protected-access
-            and
-            isinstance(old_variable, resource_variable_ops.ResourceVariable)):
-          if old_variable.handle in existing_captures:
-            continue
-          new_variable = def_function.UnliftedInitializerVariable(
-              array_ops.placeholder(
-                  name="unused_{}_initializer".format(old_variable.op.name),
-                  shape=old_variable.shape,
-                  dtype=old_variable.dtype),
-              name=old_variable.op.name,
-              trainable=old_variable.trainable)
-          self.graph.captures[new_variable.handle] = old_variable.handle
-          existing_captures.add(old_variable.handle)
-          lifted_variables[old_variable] = new_variable
-          # pylint: disable=protected-access
-          variable_name = new_variable.name.split(":")[0]
-          self._variable_holder._variables_by_name[variable_name] = new_variable
-          self.graph._weak_variables.append(weakref.ref(new_variable))
-          # pylint: enable=protected-access
-      # Update the graph's collections, partly for the user and partly so this
-      # function is idempotent when it runs again in prune() calls.
-      for collection_name in [
-          ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.LOCAL_VARIABLES
-      ]:
-        mutable_collection = ops.get_collection_ref(collection_name)
-        for index, current in enumerate(mutable_collection):
-          mutable_collection[index] = lifted_variables.get(current, current)
 
   def prune(self, feeds, fetches, name=None, input_signature=None):
     # TODO(b/129646028): Add support for CompositeTensors.
     name = name or "pruned"
+    feeds = nest.map_structure(self.graph.as_graph_element, feeds)
+    fetches = nest.map_structure(self.graph.as_graph_element, fetches)
     flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
     for f in flat_feeds:
       if not isinstance(f, ops.Tensor):
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index adbc132..b1511a6 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -17,6 +17,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import os
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
@@ -31,6 +32,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
 
 
 class WrapFunctionTest(test.TestCase):
@@ -77,6 +79,63 @@
     f_pruned = f_wrapped.prune(x_in[0], [x_out[0]])
     self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0])
 
+  def _assert_single_captured_variable_argument(self, graph_def):
+    # The single FunctionDef should have one argument, a captured variable
+    function_def, = graph_def.library.function
+    self.assertLen(function_def.signature.input_arg, 1)
+    function_arg, = function_def.signature.input_arg
+    self.assertEqual(dtypes.resource, dtypes.as_dtype(function_arg.type))
+
+  def testVariableLifting(self):
+    save_prefix = os.path.join(self.get_temp_dir(), 'meta_graph_test')
+
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      v = variables.Variable(1.)
+      array_ops.identity(v + 1., name='output')
+      saver = saver_lib.Saver([v])
+      with self.test_session() as session:
+        session.run(v.initializer)
+        saver.save(session, save_prefix)
+
+    def importer():
+      saver_lib.import_meta_graph(save_prefix + '.meta')
+      return ops.get_default_graph().as_graph_element('output:0')
+
+    wrapped = wrap_function.wrap_function(importer, [])
+    lifted_variables = list(wrapped.graph.variables)
+    self.assertLen(lifted_variables, 1)
+    initializer = wrapped.prune(
+        [], wrapped.graph.as_graph_element(v.initializer.name))
+    self.assertEqual(lifted_variables, list(initializer.graph.variables))
+    self.assertEqual(initializer.graph.external_captures,
+                     wrapped.graph.external_captures)
+
+    @def_function.function
+    def wraps_initializer():
+      initializer()
+
+    wraps_initializer()
+    self.assertEqual(1., lifted_variables[0].numpy())
+    wrapped_initializer_graphdef = (
+        wraps_initializer.get_concrete_function().graph.as_graph_def())
+    self._assert_single_captured_variable_argument(wrapped_initializer_graphdef)
+
+    @def_function.function
+    def wraps_wrapped():
+      return wrapped()
+
+    # Verify that the original graph also has the correct signature.
+    wrapped_wrapped_graphdef = (
+        wraps_wrapped.get_concrete_function().graph.as_graph_def())
+    self._assert_single_captured_variable_argument(wrapped_wrapped_graphdef)
+    # Now check that the graph runs wrapped, from eager, and when pruned.
+    self.assertAllEqual(wraps_wrapped().numpy(),
+                        lifted_variables[0].numpy() + 1.)
+    self.assertAllEqual(wrapped().numpy(), lifted_variables[0].numpy() + 1.)
+    pruned = wrapped.prune([], wrapped.graph.as_graph_element('output:0'))
+    self.assertAllEqual(wrapped().numpy(), pruned().numpy())
+
   def testNoArguments(self):
 
     def f():
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index a8ba4ea..9aae859 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -335,7 +335,7 @@
 
     # Ensure all ops which must run do run
     self.ops_which_must_run.update(ops_which_must_run)
-    for r in self._returned_tensors:
+    for r in nest.flatten(list(self._returned_tensors), expand_composites=True):
       if self.ops_which_must_run:
         r.op._add_control_inputs(  # pylint: disable=protected-access
             [o for o in self.ops_which_must_run
diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py
index 522e81c..657c47c 100644
--- a/tensorflow/python/framework/composite_tensor.py
+++ b/tensorflow/python/framework/composite_tensor.py
@@ -42,30 +42,40 @@
   ct = ...  # Create a composite tensor.
   flat_list_of_tensors = nest.flatten(ct, expand_composites=True)
   transformed_list_of_tensors = ...  # do something with the flat tensors.
-  result = nest.pack_sequence_as(ct, transformed_list_of_tensors)
+  result = nest.pack_sequence_as(ct, transformed_list_of_tensors,
+                                 expand_composites=True)
   ```
   """
 
   @abc.abstractmethod
   def _to_components(self):
-    """Decomposes this composite tensor into its components.
+    """Decomposes this composite tensor into its component tensors.
 
     Returns:
-      The components that comprise this composite tensor: a nested structure
-      (as defined by `tf.python.util.nest`) whose values are `tf.Tensor`s or
-      `CompositeTensor`s.
+      A nested structure of `tf.Tensor`s and `CompositeTensor`s that can be
+      used to reconstruct this composite tensor (along with metadata returned
+      by `_component_metadata`).
     """
     raise NotImplementedError("CompositeTensor._to_components")
 
+  def _component_metadata(self):
+    """Returns any non-tensor metadata needed to reconstruct a composite tensor.
+
+    Returns:
+      A nested structure of metadata that can be used to reconstruct this
+      composite tensor (along with the tensors returned by `_to_components`).
+    """
+    return None
+
   @abc.abstractmethod
-  def _from_components(cls, components):  # pylint: disable=no-self-argument
+  def _from_components(cls, components, metadata):  # pylint: disable=no-self-argument
     """Creates a composite tensor of type `cls` from components.
 
     Args:
-      components: The components that should be used to form the
-        composite tensor: a nested structure (as defined by
-        `tf.python.util.nest`) whose values are tf.Tensors or composite
-        tensors.
+      components: A nested structure whose values are `tf.Tensor`s or
+        `tf.CompositeTensor`s (as returned by `_to_components`).
+      metadata: A nested structure containing any additional metadata needed to
+        reconstruct the composite tensor (as returned by `_component_metadata`).
 
     Returns:
       A `CompositeTensor` of type `cls`.
@@ -90,9 +100,9 @@
   @abc.abstractproperty
   def _is_graph_tensor(self):
     """Returns True if this tensor's components belong to a TF graph."""
-    raise NotImplementedError("CompositeTensor._is_symbolic_tensor")
+    raise NotImplementedError("CompositeTensor._is_graph_tensor")
 
-  def consumers(self):
+  def _consumers(self):
     """Returns a list of `Operation`s that consume this `CompositeTensor`.
 
     Returns:
diff --git a/tensorflow/python/framework/composite_tensor_test.py b/tensorflow/python/framework/composite_tensor_test.py
index 65518bf..3f2d792 100644
--- a/tensorflow/python/framework/composite_tensor_test.py
+++ b/tensorflow/python/framework/composite_tensor_test.py
@@ -18,25 +18,33 @@
 from __future__ import division
 from __future__ import print_function
 
+import gc
+import sys
+import weakref
+from absl.testing import parameterized
+
 from tensorflow.python.framework import composite_tensor
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
 from tensorflow.python.util import nest
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class TestCompositeTensor(composite_tensor.CompositeTensor):
+class CT(composite_tensor.CompositeTensor):
+  """A generic CompositeTensor, used for constructing tests."""
 
-  def __init__(self, *components):
-    self._components = components
+  def __init__(self, components, metadata=None):
+    self.components = components
+    self.metadata = metadata
 
   def _to_components(self):
-    return self._components
+    return self.components
+
+  def _component_metadata(self):
+    return self.metadata
 
   @classmethod
-  def _from_components(cls, components):
-    return cls(*components)
+  def _from_components(cls, components, metadata):
+    return cls(components, metadata)
 
   def _shape_invariant_to_components(self, shape=None):
     raise NotImplementedError('CompositeTensor._shape_invariant_to_components')
@@ -45,242 +53,315 @@
     return False
 
   def __repr__(self):
-    return 'TestCompositeTensor%r' % (self._components,)
+    return '%s(%r, %r)' % (type(self).__name__, self.components, self.metadata)
 
   def __eq__(self, other):
-    return (isinstance(other, TestCompositeTensor) and
-            self._components == other._components)
+    return (type(self) is type(other) and
+            self.components == other.components and
+            self.metadata == other.metadata)
 
 
-class CompositeTensorTest(test_util.TensorFlowTestCase):
+class CT2(CT):
+  """Another test CompositeTensor class.
 
-  def assertNestEqual(self, a, b):
-    if isinstance(a, dict):
-      self.assertIsInstance(b, dict)
-      self.assertEqual(set(a), set(b))
-      for key in a:
-        self.assertNestEqual(a[key], b[key])
-    elif isinstance(a, (list, tuple)):
-      self.assertIsInstance(b, (list, tuple))
-      self.assertEqual(len(a), len(b))
-      for a_val, b_val in zip(a, b):
-        self.assertNestEqual(a_val, b_val)
-    elif isinstance(a, composite_tensor.CompositeTensor):
-      self.assertIsInstance(b, composite_tensor.CompositeTensor)
-      self.assertNestEqual(a._to_components(), b._to_components())
-    else:
-      self.assertAllEqual(a, b)
+  `tf.nest` should treat different CT classes as different structure types.
+  """
+  pass
 
-  def testNestFlatten(self):
-    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
-    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
-    structure = [[st1], 'foo', {'y': [st2]}]
-    x = nest.flatten(structure, expand_composites=True)
-    self.assertNestEqual(x, [
-        st1.indices, st1.values, st1.dense_shape, 'foo', st2.indices,
-        st2.values, st2.dense_shape
-    ])
 
-  def testNestPackSequenceAs(self):
-    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
-    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
-    structure1 = [[st1], 'foo', {'y': [st2]}]
-    flat = [
-        st2.indices, st2.values, st2.dense_shape, 'bar', st1.indices,
-        st1.values, st1.dense_shape
-    ]
-    result = nest.pack_sequence_as(structure1, flat, expand_composites=True)
-    expected = [[st2], 'bar', {'y': [st1]}]
-    self.assertNestEqual(expected, result)
+@test_util.run_all_in_graph_and_eager_modes
+class CompositeTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
-  def testNestAssertSameStructure(self):
-    st1 = sparse_tensor.SparseTensor([[0]], [0], [100])
-    st2 = sparse_tensor.SparseTensor([[0, 3]], ['x'], [100, 100])
-    test = TestCompositeTensor(st1.indices, st1.values, st1.dense_shape)
-    nest.assert_same_structure(st1, st2, expand_composites=False)
-    nest.assert_same_structure(st1, st2, expand_composites=True)
-    nest.assert_same_structure(st1, test, expand_composites=False)
-    with self.assertRaises(TypeError):
-      nest.assert_same_structure(st1, test, expand_composites=True)
-
-  def testNestMapStructure(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-
-    def func(x):
-      return x + 10
-
-    result = nest.map_structure(func, structure, expand_composites=True)
-    expected = [[TestCompositeTensor(11, 12, 13)], 110, {
-        'y': TestCompositeTensor(TestCompositeTensor(14, 15), 16)
-    }]
+  @parameterized.parameters([
+      {'structure': CT(0),
+       'expected': [0],
+       'paths': [('CT',)]},
+      {'structure': CT('a'),
+       'expected': ['a'],
+       'paths': [('CT',)]},
+      {'structure': CT(['a', 'b', 'c']),
+       'expected': ['a', 'b', 'c'],
+       'paths': [('CT', 0), ('CT', 1), ('CT', 2)]},
+      {'structure': CT({'x': 'a', 'y': 'b', 'z': 'c'}),
+       'expected': ['a', 'b', 'c'],
+       'paths': [('CT', 'x'), ('CT', 'y'), ('CT', 'z')]},
+      {'structure': [{'k1': CT('a')}, CT(['b', {'x': CT({'y': 'c'})}])],
+       'expected': ['a', 'b', 'c'],
+       'paths': [(0, 'k1', 'CT'), (1, 'CT', 0), (1, 'CT', 1, 'x', 'CT', 'y')]},
+      {'structure': CT(0),
+       'expand_composites': False,
+       'expected': [CT(0)],
+       'paths': [()]},
+      {'structure': [{'k1': CT('a')}, CT(['b', {'x': CT({'y': 'c'})}])],
+       'expand_composites': False,
+       'expected': [CT('a'), CT(['b', {'x': CT({'y': 'c'})}])],
+       'paths': [(0, 'k1'), (1,)]},
+  ])  # pyformat: disable
+  def testNestFlatten(self, structure, expected, paths, expand_composites=True):
+    result = nest.flatten(structure, expand_composites=expand_composites)
     self.assertEqual(result, expected)
 
-  def testNestMapStructureWithPaths(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
+    result_with_paths = nest.flatten_with_tuple_paths(
+        structure, expand_composites=expand_composites)
+    self.assertEqual(result_with_paths, list(zip(paths, expected)))
 
-    def func(path, x):
+    string_paths = ['/'.join(str(p) for p in path) for path in paths]  # pylint: disable=g-complex-comprehension
+    result_with_string_paths = nest.flatten_with_joined_string_paths(
+        structure, expand_composites=expand_composites)
+    self.assertEqual(result_with_string_paths,
+                     list(zip(string_paths, expected)))
+
+    flat_paths_result = list(
+        nest.yield_flat_paths(structure, expand_composites=expand_composites))
+    self.assertEqual(flat_paths_result, paths)
+
+  @parameterized.parameters([
+      {'s1': [1, 2, 3],
+       's2': [CT(['a', 'b']), 'c', 'd'],
+       'expand_composites': False,
+       'expected': [CT(['a', 'b']), 'c', 'd'],
+       'paths': [(0,), (1,), (2,)]},
+      {'s1': [CT([1, 2, 3])],
+       's2': [5],
+       'expand_composites': False,
+       'expected': [5],
+       'paths': [(0,)]},
+      {'s1': [[CT([9, 9, 9])], 999, {'y': CT([9, 9])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expected': [1, 2, 3, 100, CT([4, 5]), 6],
+       'paths': [(0, 0, 'CT', 0), (0, 0, 'CT', 1), (0, 0, 'CT', 2),
+                 (1,), (2, 'y', 'CT', 0), (2, 'y', 'CT', 1)]},
+      {'s1': [[CT([9, 9, 9])], 999, {'y': CT([9, 9])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expand_composites': False,
+       'expected': [CT([1, 2, 3]), 100, CT([CT([4, 5]), 6])],
+       'paths': [(0, 0), (1,), (2, 'y')]},
+      {'s1': [[CT([9, 9, 9])], 999, {'y': CT([CT([9, 9]), 9])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}],
+       'expand_composites': False,
+       'expected': [CT([1, 2, 3]), 100, CT([5, 6])],
+       'paths': [(0, 0), (1,), (2, 'y')]},
+  ])  # pyformat: disable
+  def testNestFlattenUpTo(self, s1, s2, expected, paths,
+                          expand_composites=True):
+    result = nest.flatten_up_to(s1, s2, expand_composites=expand_composites)
+    self.assertEqual(expected, result)
+
+    result_with_paths = nest.flatten_with_tuple_paths_up_to(
+        s1, s2, expand_composites=expand_composites)
+    self.assertEqual(result_with_paths, list(zip(paths, expected)))
+
+  @parameterized.parameters([
+      {'structure': CT(0),
+       'sequence': [5],
+       'expected': CT(5)},
+      {'structure': CT(['a', 'b', 'c']),
+       'sequence': ['A', CT(['b']), {'x': 'y'}],
+       'expected': CT(['A', CT(['b']), {'x': 'y'}])},
+      {'structure': [{'k1': CT('a')}, CT(['b', {'x': CT({'y': 'c'})}])],
+       'sequence': ['A', 'B', 'C'],
+       'expected': [{'k1': CT('A')}, CT(['B', {'x': CT({'y': 'C'})}])]},
+      {'structure': [{'k1': CT('a')}, CT(['b', {'x': CT({'y': 'c'})}])],
+       'sequence': ['A', 'B'],
+       'expand_composites': False,
+       'expected': [{'k1': 'A'}, 'B']},
+      {'structure': CT(0, metadata='abc'),
+       'sequence': [5],
+       'expected': CT(5, metadata='abc')},
+  ])  # pyformat: disable
+  def testNestPackSequenceAs(self,
+                             structure,
+                             sequence,
+                             expected,
+                             expand_composites=True):
+    result = nest.pack_sequence_as(
+        structure, sequence, expand_composites=expand_composites)
+    self.assertEqual(result, expected)
+
+  @parameterized.parameters([
+      {'s1': CT(0), 's2': CT('xyz')},
+      {'s1': CT(['a', 'b', 'c']), 's2': CT(['d', 'e', 'f'])},
+      {'s1': [1, CT(['a']), CT('b', metadata='xyz')],
+       's2': [8, CT([55]), CT(100, metadata='xyz')]},
+  ])  # pyformat: disable
+  def testNestAssertSameStructure(self, s1, s2, expand_composites=True):
+    nest.assert_same_structure(s1, s2, expand_composites=expand_composites)
+    nest.assert_shallow_structure(s1, s2, expand_composites=expand_composites)
+
+  @parameterized.parameters([
+      {'s1': CT(0), 's2': CT(['x'])},
+      {'s1': CT([1]), 's2': CT([1, 2])},
+      {'s1': CT({'x': 1}), 's2': CT({'y': 1})},
+      {'s1': CT(0), 's2': CT(0, metadata='xyz')},
+      {'s1': CT(0, metadata='xyz'), 's2': CT(0)},
+      {'s1': CT(0, metadata='xyz'), 's2': CT(0, metadata='abc')},
+      {'s1': CT(['a', 'b', 'c']), 's2': CT(['d', 'e'])},
+      {'s1': [1, CT(['a']), CT('b', metadata='xyz')],
+       's2': [8, CT([55, 66]), CT(100, metadata='abc')]},
+      {'s1': CT(0), 's2': CT2(0), 'error': TypeError},
+      {'s1': CT((1, 2)), 's2': CT([1, 2]), 'error': TypeError},
+  ])  # pyformat: disable
+  def testNestAssertSameStructureCompositeMismatch(self,
+                                                   s1,
+                                                   s2,
+                                                   error=ValueError):
+    # s1 and s2 have the same structure if expand_composites=False, but
+    # different structures if expand_composites=True.
+    nest.assert_same_structure(s1, s2, expand_composites=False)
+    nest.assert_shallow_structure(s1, s2, expand_composites=False)
+    with self.assertRaises(error):  # pylint: disable=g-error-prone-assert-raises
+      nest.assert_same_structure(s1, s2, expand_composites=True)
+
+  @parameterized.parameters([
+      # Note: there are additional test cases in testNestAssertSameStructure.
+      {'s1': CT(1), 's2': CT([1])},
+      {'s1': CT(1), 's2': CT(CT(1))},
+      {'s1': [1], 's2': [CT(1)]},
+      {'s1': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}]},
+      {'s1': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}],
+       'expand_composites': False},
+  ])  # pyformat: disable
+  def testNestAssertShallowStructure(self, s1, s2, expand_composites=True):
+    nest.assert_shallow_structure(s1, s2, expand_composites=expand_composites)
+
+  @parameterized.parameters([
+      # Note: there are additional test cases in
+      # testNestAssertSameStructureCompositeMismatch.
+      {'s1': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}]},
+      {'s1': CT([1, 2, 3]),
+       's2': [1, 2, 3],
+       'check_types': False},
+  ])  # pyformat: disable
+  def testNestAssertShallowStructureCompositeMismatch(self,
+                                                      s1,
+                                                      s2,
+                                                      check_types=True):
+    with self.assertRaises(TypeError):  # pylint: disable=g-error-prone-assert-raises
+      nest.assert_shallow_structure(
+          s1, s2, expand_composites=True, check_types=check_types)
+
+  @parameterized.parameters([
+      {'structure': CT(1, metadata=2),
+       'expected': CT(11, metadata=2)},
+      {'structure': CT({'x': 1, 'y': [2, 3]}, metadata=2),
+       'expected': CT({'x': 11, 'y': [12, 13]}, metadata=2)},
+      {'structure': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expected': [[CT([11, 12, 13])], 110, {'y': CT([CT([14, 15]), 16])}]},
+  ])  # pyformat: disable
+  def testNestMapStructure(self, structure, expected, expand_composites=True):
+    func = lambda x: x + 10
+    result = nest.map_structure(
+        func, structure, expand_composites=expand_composites)
+    self.assertEqual(result, expected)
+
+  @parameterized.parameters([
+      {'s1': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expected': [[CT([11, 12, 13])], 110, {'y': CT([CT([4, 5]), 16])}]}
+  ])  # pyformat: disable
+  def testNestMapStructureUpTo(self, s1, s2, expected):
+    func = lambda x: x + 10 if isinstance(x, int) else x
+    result = nest.map_structure_up_to(s1, func, s2, expand_composites=True)
+    self.assertEqual(result, expected)
+
+  @parameterized.parameters([
+      {'structure': CT('a'),
+       'expected': CT('CT:a')},
+      {'structure': CT(['a', 'b']),
+       'expected': CT(['CT/0:a', 'CT/1:b'])},
+      {'structure': CT({'x': 'a', 'y': 'b'}),
+       'expected': CT({'x': 'CT/x:a', 'y': 'CT/y:b'})},
+      {'structure': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expected': [
+           [CT(['0/0/CT/0:1', '0/0/CT/1:2', '0/0/CT/2:3'])],
+           '1:100',
+           {'y': CT([CT(['2/y/CT/0/CT/0:4', '2/y/CT/0/CT/1:5']),
+                     '2/y/CT/1:6'])}]},
+  ])  # pyformat: disable
+  def testNestMapStructureWithPaths(self,
+                                    structure,
+                                    expected,
+                                    expand_composites=True):
+
+    def func1(path, x):
       return '%s:%s' % (path, x)
 
     result = nest.map_structure_with_paths(
-        func, structure, expand_composites=True)
-    expected = [[TestCompositeTensor('0/0/0:1', '0/0/1:2', '0/0/2:3')], '1:100',
-                {
-                    'y':
-                        TestCompositeTensor(
-                            TestCompositeTensor('2/y/0/0:4', '2/y/0/1:5'),
-                            '2/y/1:6')
-                }]
+        func1, structure, expand_composites=expand_composites)
     self.assertEqual(result, expected)
 
-  def testNestMapStructureWithTuplePaths(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-
-    def func(path, x):
-      return (path, x)
+    # Use the same test cases for map_structure_with_tuple_paths.
+    def func2(tuple_path, x):
+      return '%s:%s' % ('/'.join(str(v) for v in tuple_path), x)
 
     result = nest.map_structure_with_tuple_paths(
-        func, structure, expand_composites=True)
-    expected = [[
-        TestCompositeTensor(((0, 0, 0), 1), ((0, 0, 1), 2), ((0, 0, 2), 3))
-    ], ((1,), 100), {
-        'y':
-            TestCompositeTensor(
-                TestCompositeTensor(((2, 'y', 0, 0), 4), ((2, 'y', 0, 1), 5)),
-                ((2, 'y', 1), 6))
-    }]
+        func2, structure, expand_composites=expand_composites)
     self.assertEqual(result, expected)
 
-  def testNestAssertShallowStructure(self):
-    s1 = [[TestCompositeTensor(1, 2, 3)], 100, {'y': TestCompositeTensor(5, 6)}]
-    s2 = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    nest.assert_shallow_structure(s1, s2, expand_composites=False)
-    nest.assert_shallow_structure(s1, s2, expand_composites=True)
-    nest.assert_shallow_structure(s2, s1, expand_composites=False)
-    with self.assertRaises(TypeError):
-      nest.assert_shallow_structure(s2, s1, expand_composites=True)
+  @parameterized.parameters([
+      {'s1': [[CT([1, 2, 3])], 100, {'y': CT([5, 6])}],
+       's2': [[CT([1, 2, 3])], 100, {'y': CT([CT([4, 5]), 6])}],
+       'expected': [
+           [CT(['0/0/CT/0:1', '0/0/CT/1:2', '0/0/CT/2:3'])],
+           ('1:100'),
+           {'y': CT(['2/y/CT/0:CT([4, 5], None)', '2/y/CT/1:6'])}]},
+  ])  # pyformat: disable
+  def testNestMapStructureWithTuplePathsUpTo(self, s1, s2, expected):
 
-  def testNestFlattenUpTo(self):
-    s1 = [[TestCompositeTensor(1, 2, 3)], 100, {'y': TestCompositeTensor(5, 6)}]
-    s2 = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    result1 = nest.flatten_up_to(s1, s2, expand_composites=True)
-    expected1 = [1, 2, 3, 100, TestCompositeTensor(4, 5), 6]
-    self.assertEqual(result1, expected1)
-
-    result2 = nest.flatten_up_to(s1, s2, expand_composites=False)
-    expected2 = [
-        TestCompositeTensor(1, 2, 3), 100,
-        TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    ]
-    self.assertEqual(result2, expected2)
-
-  def testNestFlattenWithTuplePathsUpTo(self):
-    s1 = [[TestCompositeTensor(1, 2, 3)], 100, {'y': TestCompositeTensor(5, 6)}]
-    s2 = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    result1 = nest.flatten_with_tuple_paths_up_to(
-        s1, s2, expand_composites=True)
-    expected1 = [((0, 0, 0), 1), ((0, 0, 1), 2), ((0, 0, 2), 3), ((1,), 100),
-                 ((2, 'y', 0), TestCompositeTensor(4, 5)), ((2, 'y', 1), 6)]
-    self.assertEqual(result1, expected1)
-
-    result2 = nest.flatten_with_tuple_paths_up_to(
-        s1, s2, expand_composites=False)
-    expected2 = [((0, 0), TestCompositeTensor(1, 2, 3)), ((1,), 100),
-                 ((2, 'y'), TestCompositeTensor(TestCompositeTensor(4, 5), 6))]
-    self.assertEqual(result2, expected2)
-
-  def testNestMapStructureUpTo(self):
-    s1 = [[TestCompositeTensor(1, 2, 3)], 100, {'y': TestCompositeTensor(5, 6)}]
-    s2 = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-
-    def func(x):
-      return x + 10 if isinstance(x, int) else x
-
-    result = nest.map_structure_up_to(s1, func, s2, expand_composites=True)
-    expected = [[TestCompositeTensor(11, 12, 13)], 110, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 16)
-    }]
-    self.assertEqual(result, expected)
-
-  def testNestMapStructureWithTuplePathsUpTo(self):
-    s1 = [[TestCompositeTensor(1, 2, 3)], 100, {'y': TestCompositeTensor(5, 6)}]
-    s2 = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-
-    def func(path, x):
-      return (path, x)
+    def func(tuple_path, x):
+      return '%s:%s' % ('/'.join(str(v) for v in tuple_path), x)
 
     result = nest.map_structure_with_tuple_paths_up_to(
         s1, func, s2, expand_composites=True)
-    expected = [[
-        TestCompositeTensor(((0, 0, 0), 1), ((0, 0, 1), 2), ((0, 0, 2), 3))
-    ], ((1,), 100), {
-        'y':
-            TestCompositeTensor(((2, 'y', 0), TestCompositeTensor(4, 5)),
-                                ((2, 'y', 1), 6))
-    }]
     self.assertEqual(result, expected)
 
   def testNestGetTraverseShallowStructure(self):
-    pass
+    func = lambda t: not (isinstance(t, CT) and t.metadata == 'B')
+    structure = [CT([1, 2], metadata='A'), CT([CT(3)], metadata='B')]
 
-  def testNestYieldFlatPaths(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    result1 = list(nest.yield_flat_paths(structure, expand_composites=True))
-    expected1 = [(0, 0, 0), (0, 0, 1), (0, 0, 2), (1,), (2, 'y', 0, 0),
-                 (2, 'y', 0, 1), (2, 'y', 1)]
-    self.assertEqual(result1, expected1)
+    result = nest.get_traverse_shallow_structure(
+        func, structure, expand_composites=True)
+    expected = [CT([True, True], metadata='A'), False]
+    self.assertEqual(result, expected)
 
-    result2 = list(nest.yield_flat_paths(structure, expand_composites=False))
-    expected2 = [(0, 0), (1,), (2, 'y')]
-    self.assertEqual(result2, expected2)
+  def testMemoryIsFreed(self):
+    # Note: we use `set` values for components and metadata because we need
+    # to construct weakrefs to them.  Other builtin types, such as `list` and
+    # `tuple`, do not support weakrefs.
+    ct1 = CT(set([1, 2]), set(['no', 'leaks']))
+    ct2 = CT(set([3, 4]), set(['no', 'leaks']))
+    ct3 = CT(set([5, 6]), set(['other', 'metadata']))
 
-  def testNestFlattenWithJoinedStringPaths(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    result1 = nest.flatten_with_joined_string_paths(
-        structure, expand_composites=True)
-    expected1 = [('0/0/0', 1), ('0/0/1', 2), ('0/0/2', 3), ('1', 100),
-                 ('2/y/0/0', 4), ('2/y/0/1', 5), ('2/y/1', 6)]
-    self.assertEqual(result1, expected1)
+    # Note: map_structure exercises flatten, pack_sequence_as, and
+    # assert_same_structure.
+    func = lambda x, y: x | y
+    ct4 = nest.map_structure(func, ct1, ct2, expand_composites=True)
 
-    result2 = nest.flatten_with_joined_string_paths(
-        structure, expand_composites=False)
-    expected2 = [('0/0', TestCompositeTensor(1, 2, 3)), ('1', 100),
-                 ('2/y', TestCompositeTensor(TestCompositeTensor(4, 5), 6))]
-    self.assertEqual(result2, expected2)
+    # Check that the exception-raising path in assert_same_structure
+    # doesn't leak any objects.
+    with self.assertRaisesRegexp(ValueError,
+                                 ".*don't have the same nested structure.*"):
+      nest.map_structure(func, ct2, ct3, expand_composites=True)
+    if hasattr(sys, 'exc_clear'):
+      sys.exc_clear()  # Remove any references in exception stack traces.
 
-  def testNestFlattenWithTuplePaths(self):
-    structure = [[TestCompositeTensor(1, 2, 3)], 100, {
-        'y': TestCompositeTensor(TestCompositeTensor(4, 5), 6)
-    }]
-    result1 = nest.flatten_with_tuple_paths(structure, expand_composites=True)
-    expected1 = [((0, 0, 0), 1), ((0, 0, 1), 2), ((0, 0, 2), 3), ((1,), 100),
-                 ((2, 'y', 0, 0), 4), ((2, 'y', 0, 1), 5), ((2, 'y', 1), 6)]
-    self.assertEqual(result1, expected1)
+    refs = []
+    for ct in [ct1, ct2, ct3, ct4]:
+      refs.append(weakref.ref(ct))
+      refs.append(weakref.ref(ct.components))
+      refs.append(weakref.ref(ct.metadata))
+    del ct  # pylint: disable=undefined-loop-variable
 
-    result2 = nest.flatten_with_tuple_paths(structure, expand_composites=False)
-    expected2 = [((0, 0), TestCompositeTensor(1, 2, 3)), ((1,), 100),
-                 ((2, 'y'), TestCompositeTensor(TestCompositeTensor(4, 5), 6))]
-    self.assertEqual(result2, expected2)
+    for ref in refs:
+      self.assertIsNotNone(ref())
 
+    del ct1, ct2, ct3, ct4
+    gc.collect()
+    for ref in refs:
+      self.assertIsNone(ref())
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py
index 4fddb86..b64407d 100644
--- a/tensorflow/python/framework/config_test.py
+++ b/tensorflow/python/framework/config_test.py
@@ -198,8 +198,10 @@
 
     with self.assertRaises(RuntimeError):
       context.set_log_device_placement(True)
-    with self.assertRaises(RuntimeError):
-      context.set_log_device_placement(False)
+
+    # If setting the device placement is a no-op, do not throw a runtime
+    # exception.
+    context.set_log_device_placement(False)
 
   @test_util.run_gpu_only
   @reset_eager
@@ -493,6 +495,19 @@
     for gpu in gpus:
       self.assertIsNotNone(gpu.name)
 
+  @reset_eager
+  def testV1CompatibilityDummyInivisibleDeviceList(self):
+    gpus = config.list_physical_devices('GPU')
+    if gpus:
+      self.skipTest('Test requires no GPUs')
+
+    # Ensure GPU options are left untouched on CPU-only environments.
+    context.context()._physical_devices = None
+    context.context()._config = config_pb2.ConfigProto(
+        gpu_options=config_pb2.GPUOptions(visible_device_list='0'))
+    new_config = context.context().config
+    self.assertEqual(new_config.gpu_options.visible_device_list, '0')
+
   @test_util.run_gpu_only
   @reset_eager
   def testV1Compatibility(self):
@@ -503,20 +518,38 @@
     context.context()._physical_devices = None
 
     # Ensure CPU is split
-    context.context()._config = config_pb2.ConfigProto(device_count={'CPU': 2},)
+    context.context()._config = config_pb2.ConfigProto(device_count={'CPU': 2})
     new_config = context.context().config
     self.assertEqual(new_config.device_count['CPU'], 2)
     context.context()._physical_devices = None
 
-    # Ensure Handle visible device list parsing
+    # Handle empty visible device list
     context.context()._config = config_pb2.ConfigProto(
-        gpu_options=config_pb2.GPUOptions(visible_device_list='',),)
+        gpu_options=config_pb2.GPUOptions(visible_device_list=''))
     gpus = config.list_physical_devices('GPU')
+    gpu_count = len(gpus)
     new_config = context.context().config
     self.assertEqual(new_config.gpu_options.visible_device_list,
                      ','.join(str(i) for i in range(len(gpus))))
     context.context()._physical_devices = None
 
+    # Handle invalid visible device list
+    context.context()._config = config_pb2.ConfigProto(
+        gpu_options=config_pb2.GPUOptions(visible_device_list=str(gpu_count)))
+    with self.assertRaisesRegexp(ValueError, 'Invalid visible device index'):
+      gpus = config.list_physical_devices('GPU')
+      new_config = context.context().config
+    context.context()._physical_devices = None
+
+    # Handle single visible device list
+    context.context()._config = config_pb2.ConfigProto(
+        gpu_options=config_pb2.GPUOptions(visible_device_list=str(gpu_count-1)))
+    gpus = config.list_physical_devices('GPU')
+    new_config = context.context().config
+    self.assertEqual(new_config.gpu_options.visible_device_list,
+                     str(gpu_count-1))
+    context.context()._physical_devices = None
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/framework/convert_to_constants.py b/tensorflow/python/framework/convert_to_constants.py
index 6352ef1..43b713d 100644
--- a/tensorflow/python/framework/convert_to_constants.py
+++ b/tensorflow/python/framework/convert_to_constants.py
@@ -109,14 +109,31 @@
   input_tensors = func.inputs[-len(func.captured_inputs):]
   for var in func.graph.variables:
     index = func.captured_inputs.index(var.handle)
-    tensor = input_tensors[index]
-    node_name = get_name(tensor.name)
-    tensor_data[node_name] = var.numpy()
-    map_name_to_handle[node_name] = var.handle
+    tensor_name = get_name(input_tensors[index].name)
+    tensor_data[tensor_name] = var.numpy()
+    map_name_to_handle[tensor_name] = var.handle
+
+  # Get mapping from input name to value for non-variable placeholders.
+  map_name_to_value = {}
+  for name_tensor, value_tensor in zip(input_tensors, func.captured_inputs):
+    tensor_name = get_name(name_tensor.name)
+    if tensor_name not in map_name_to_handle:
+      map_name_to_value[tensor_name] = value_tensor
 
   resource_identities = {}
-  resource_placeholders = {}
+  placeholders = {}
+  converted_input_indices = set()
   for node in graph_def.node:
+    if node.name in map_name_to_value:
+      # Get the dtype and data for the Placeholders whose values are stored as
+      # Tensors. This is the case for values that were originally Const ops.
+      tensor = map_name_to_value[node.name]
+      placeholders[node.name] = {
+          "dtype": node.attr["dtype"],
+          "data": tensor.numpy(),
+      }
+      converted_input_indices.add(
+          func.captured_inputs.index(map_name_to_value[node.name]))
     if node.op == "ReadVariableOp":
       # Get name of Placeholder op associated with ReadVariableOp. There can be
       # an Identity in between the ReadVariableOp and Placeholder. Store the
@@ -130,22 +147,23 @@
                          "to the ReadVariableOp.")
       # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
       # variable's dtype and data.
-      resource_placeholders[input_name] = {
+      placeholders[input_name] = {
           "dtype": node.attr["dtype"],
           "data": tensor_data[input_name],
       }
+      converted_input_indices.add(
+          func.captured_inputs.index(map_name_to_handle[input_name]))
 
   # Reconstruct the graph with constants in place of variables.
   output_graph_def = graph_pb2.GraphDef()
   how_many_converted = 0
 
-  converted_input_indices = set([])
   for input_node in graph_def.node:
     output_node = output_graph_def.node.add()
-    # Convert Placeholder ops that are inputs to ReadVariableOps into Const ops.
-    if input_node.name in resource_placeholders:
-      dtype = resource_placeholders[input_node.name]["dtype"]
-      data = resource_placeholders[input_node.name]["data"]
+    # Convert Placeholder ops to Const ops.
+    if input_node.name in placeholders:
+      dtype = placeholders[input_node.name]["dtype"]
+      data = placeholders[input_node.name]["data"]
 
       output_node.op = "Const"
       output_node.name = input_node.name
@@ -154,8 +172,6 @@
           tensor_util.make_tensor_proto(
               data, dtype=dtype.type, shape=data.shape))
       how_many_converted += 1
-      converted_input_indices.add(
-          func.captured_inputs.index(map_name_to_handle[input_node.name]))
     # Change the dtype for Identity ops that are inputs to ReadVariableOps.
     elif input_node.name in resource_identities:
       output_node.CopyFrom(input_node)
diff --git a/tensorflow/python/framework/device_spec.py b/tensorflow/python/framework/device_spec.py
index 4978162..0d78e47 100644
--- a/tensorflow/python/framework/device_spec.py
+++ b/tensorflow/python/framework/device_spec.py
@@ -357,13 +357,15 @@
 
   def __eq__(self, other):
     """Checks if the `other` DeviceSpec is same as the current instance, eg have
+
        same value for all the internal fields.
 
     Args:
       other: Another DeviceSpec
 
     Returns:
-      Return `True` if `other` is also a DeviceSpec instance and has same value as the current instance.
+      Return `True` if `other` is also a DeviceSpec instance and has same value
+      as the current instance.
       Return `False` otherwise.
     """
     return (isinstance(other, self.__class__) and
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 1c47848..5311f66 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -27,7 +27,6 @@
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
-from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
@@ -79,7 +78,6 @@
     Identical structure that has TensorSpec objects instead of Tensors and
     UknownArgument instead of any unsupported types.
   """
-  structure = composite_tensor.replace_composites_with_components(structure)
   def encode_arg(arg, path):
     """A representation for this argument, for converting into signatures."""
     if isinstance(arg, ops.Tensor):
@@ -370,7 +368,6 @@
       name=None,
       attrs=None,
       op_def=None,
-      compute_shapes=True,
       compute_device=True):
     # When capturing by value, do the read outside
     reverse_captures = dict((v, k) for k, v in self.captures.items())
@@ -383,8 +380,14 @@
             context.context())
       else:
         op = ops.get_default_graph().create_op(
-            op_type, uncaptured_inputs, dtypes, input_types, name, attrs,
-            op_def, compute_shapes, compute_device)
+            op_type,
+            uncaptured_inputs,
+            dtypes,
+            input_types,
+            name,
+            attrs,
+            op_def,
+            compute_device=compute_device)
         value = op.outputs[0]
     captured_value = self.capture(value)
     return captured_value.op
@@ -432,11 +435,11 @@
     Returns:
       An `Operation` object.
     """
+    del compute_shapes
     if self.capture_by_value and op_type in ["ReadVariableOp",
                                              "ResourceGather"]:
-      return self._capture_by_value(
-          op_type, inputs, dtypes, input_types, name, attrs, op_def,
-          compute_shapes, compute_device)
+      return self._capture_by_value(op_type, inputs, dtypes, input_types, name,
+                                    attrs, op_def, compute_device)
 
     # This capturing logic interacts poorly with control flow contexts which
     # want to replace inputs of ops far too late in the process. This can lead
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index b46af06..d287ea2 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -352,6 +352,21 @@
     if self._definition is not None or self._c_func is not None:
       return
 
+    # Copy variable collections (by reference) from the parent graph such that
+    # name based variable sharing (e.g. via tf.make_template) works between the
+    # func graph and parent graph.
+    variable_keys = []
+    variable_keys.extend(ops.GraphKeys._VARIABLE_COLLECTIONS)  # pylint: disable=protected-access
+    variable_keys.append(vs._VARSTORE_KEY)  # pylint: disable=protected-access
+
+    collections_ref = {}
+    parent_collections_ref = ops.get_default_graph()._collections  # pylint: disable=protected-access
+    for key in variable_keys:
+      if key not in parent_collections_ref:
+        parent_collections_ref[key] = collections_ref[key] = []
+      else:
+        collections_ref[key] = parent_collections_ref[key]
+
     temp_graph = func_graph_from_py_func(
         self._func,
         self._arg_names,
@@ -359,6 +374,7 @@
         self._func_name,
         self._capture_by_value,
         self._caller_device,
+        collections_ref=collections_ref,
         whitelisted_stateful_ops=self._whitelisted_stateful_ops,
         capture_resource_var_by_value=self._capture_resource_var_by_value)
 
@@ -1014,13 +1030,7 @@
   attrs = _parse_kwargs_as_attrs(func_name, **kwargs)
   output_types = [dtypes.DType(x.type) for x in sig.output_arg]
   op = g.create_op(
-      func_name,
-      list(inputs),
-      output_types,
-      name=name,
-      attrs=attrs,
-      op_def=sig,
-      compute_shapes=False)
+      func_name, list(inputs), output_types, name=name, attrs=attrs, op_def=sig)
   if op.outputs:
     if len(op.outputs) == 1:
       ret = op.outputs[0]
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 4b323e9..57f50b8 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -19,7 +19,6 @@
 from __future__ import print_function
 
 import re
-import sys
 import time
 
 import numpy as np
@@ -48,6 +47,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -825,19 +825,6 @@
       self.assertEqual(self.evaluate(y), 1)
       self.assertEqual(self.evaluate(z), 2)
 
-  def testStableName(self):
-
-    @function.Defun()
-    def Foo(x, y, z):
-      return math_ops.tanh(math_ops.matmul(x, y) + z)
-
-    if sys.byteorder == "big":
-      self.assertEqual("Foo_kEdkAG8SJvg",
-                       Foo.instantiate([dtypes.float32] * 3).name)
-    else:
-      self.assertEqual("Foo_aCYSbwBkR5A",
-                       Foo.instantiate([dtypes.float32] * 3).name)
-
   @test_util.run_deprecated_v1
   def testSignatureHash(self):
     # Foo.Inner and Bar.Inner have identical function body but have
@@ -1767,13 +1754,64 @@
 
   @test_util.run_deprecated_v1
   def testBasic(self):
-    self._testSimpleModel(True)
     self._testSimpleModel(False)
+    self._testSimpleModel(True)
 
   @test_util.run_deprecated_v1
   def testBasicResource(self):
-    self._testSimpleModel(True, use_resource=True)
     self._testSimpleModel(False, use_resource=True)
+    self._testSimpleModel(True, use_resource=True)
+
+
+class TemplateTest(test.TestCase):
+
+  @test_util.run_v1_only("make_template not supported in TF2")
+  def testBasic(self):
+    self.assertTemplateVariableSharing(use_resource=True, defun_first=False)
+
+  @test_util.run_v1_only("make_template not supported in TF2")
+  def testBasicRef(self):
+    self.assertTemplateVariableSharing(use_resource=False, defun_first=False)
+
+  @test_util.run_v1_only("make_template not supported in TF2")
+  def testBasicDefunFirst(self):
+    self.assertTemplateVariableSharing(use_resource=True, defun_first=True)
+
+  @test_util.run_v1_only("make_template not supported in TF2")
+  def testBasicRefDefunFirst(self):
+    self.assertTemplateVariableSharing(use_resource=False, defun_first=True)
+
+  def assertTemplateVariableSharing(self, use_resource, defun_first):
+    parameters = []
+
+    def MakeModel(x):
+      w = variable_scope.get_variable(
+          "w", (64, 64),
+          initializer=init_ops.random_uniform_initializer(seed=312),
+          use_resource=use_resource)
+      b = variable_scope.get_variable(
+          "b", (64),
+          initializer=init_ops.zeros_initializer(),
+          use_resource=use_resource)
+      parameters.extend((w, b))
+      return math_ops.sigmoid(math_ops.matmul(x, w) + b)
+
+    model = template.make_template("f", MakeModel, create_scope_now_=True)
+
+    @function.Defun()
+    def ModelDefun(x):
+      return model(x)
+
+    x = array_ops.placeholder(dtypes.float32)
+    if defun_first:
+      ModelDefun(x)
+      model(x)
+    else:
+      model(x)
+      ModelDefun(x)
+    w1, b1, w2, b2 = parameters  # pylint: disable=unbalanced-tuple-unpacking
+    self.assertIs(w1, w2)
+    self.assertIs(b1, b2)
 
 
 class DevicePlacementTest(test.TestCase):
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index d8ceb43..68d7f35 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -68,7 +68,8 @@
   path = os.path.join(logdir, name)
   if as_text:
     file_io.atomic_write_string_to_file(path,
-                                        text_format.MessageToString(graph_def))
+                                        text_format.MessageToString(
+                                            graph_def, float_format=''))
   else:
     file_io.atomic_write_string_to_file(path, graph_def.SerializeToString())
   return path
diff --git a/tensorflow/python/framework/indexed_slices_tensor_spec.py b/tensorflow/python/framework/indexed_slices_tensor_spec.py
new file mode 100644
index 0000000..965e092
--- /dev/null
+++ b/tensorflow/python/framework/indexed_slices_tensor_spec.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorSpec factory for IndexedSlices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+
+
+def indexed_slices_tensor_spec(shape=None,
+                               dtype=dtypes.float32,
+                               num_slices=None,
+                               has_dense_shape=True,
+                               name=None):
+  """Returns a tensor specification for an IndexedSlices.
+
+  Returns an object which can be passed to `tf.function` (or other
+  functions that expect `TensorSpec`s) to specify shape constraints
+  for an `IndexedSlices` argument.
+
+  Args:
+    shape: The shape of the IndexedSlices, or `None` to allow any shape.
+      The returned specification object depends only on `shape[1:]`.
+    dtype: Data type of values in the IndexedSlices.
+    num_slices: Number of slices.  Default allows for any number of slices.
+    has_dense_shape: Indicates whether the IndexedSlices is expected to have a
+      `dense_shape` component.
+    name: Optional name prefix for the `TensorSpec`s.
+
+  Returns:
+    An object describing the `values`, `indices` and `dense_shape` tensors
+    that comprise the `IndexedSlices`.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  shape = tensor_shape.TensorShape(shape)
+  # Rank-1 shape holding the slice count; it leads `values` and `indices`.
+  num_slices = tensor_shape.TensorShape([num_slices])
+
+  values = tensor_spec.TensorSpec(
+      num_slices.concatenate(shape[1:]), dtype, name)
+  indices = tensor_spec.TensorSpec(num_slices, dtypes.int64,
+                                   ("%s.indices" % name) if name else None)
+  dense_shape = None
+  if has_dense_shape:
+    dense_shape = tensor_spec.TensorSpec(
+        [shape.ndims], dtypes.int64,
+        ("%s.dense_shape" % name) if name else None)
+  return ops.IndexedSlices(values, indices, dense_shape)
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index a311460..304745f 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -80,6 +80,8 @@
   module.LIB_HANDLE = lib_handle
   # OpDefs of the list of ops defined in the library.
   module.OP_LIST = op_list
+  # Allow this to be recognized by AutoGraph.
+  setattr(module, '_IS_TENSORFLOW_PLUGIN', True)
   sys.modules[module_name] = module
   return module
 
@@ -132,7 +134,7 @@
   """Loads a TensorFlow plugin.
 
   "library_location" can be a path to a specific shared object, or a folder.
-  If it is a folder, all sahred objects that are named "libtfkernel*" will be
+  If it is a folder, all shared objects that are named "libtfkernel*" will be
   loaded. When the library is loaded, kernels registered in the library via the
   `REGISTER_*` macros are made available in the TensorFlow process.
 
diff --git a/tensorflow/python/framework/op_def_registry.py b/tensorflow/python/framework/op_def_registry.py
index 6cbe590..8ded8f6 100644
--- a/tensorflow/python/framework/op_def_registry.py
+++ b/tensorflow/python/framework/op_def_registry.py
@@ -32,7 +32,10 @@
                     (op_list, type(op_list)))
   for op_def in op_list.op:
     if op_def.name in _registered_ops:
-      assert _registered_ops[op_def.name] == op_def
+      if _registered_ops[op_def.name] != op_def:
+        raise ValueError(
+            "Registered op_def for %s (%s) not equal to op_def to register (%s)"
+            % (op_def.name, _registered_ops[op_def.name], op_def))
     else:
       _registered_ops[op_def.name] = op_def
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 605b181..62eb993 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -62,8 +62,14 @@
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_stack
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
+# This is to avoid a circular dependency: ops -> tensor_spec -> ops
+tensor_spec = LazyLoader(
+    "tensor_spec", globals(),
+    "tensorflow.python.framework.tensor_spec")
+
 # Temporary global switches determining if we should enable the work-in-progress
 # calls to the C API. These will be removed once all functionality is supported.
 _USE_C_API = True
@@ -752,7 +758,8 @@
     """
     if self.dtype == dtypes.resource:
       raise ValueError("Resource handles are not convertible to numpy.")
-    return self._cpu_nograd()._numpy()  # pylint: disable=protected-access
+    maybe_arr = self._cpu_nograd()._numpy()  # pylint: disable=protected-access
+    return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
 
   # __int__, __float__ and __index__ may copy the tensor to CPU and
   # only work for scalars; values are cast as per numpy.
@@ -766,7 +773,7 @@
     return int(self.numpy())
 
   def __array__(self, dtype=None):
-    return np.array(self.numpy(), dtype=dtype)
+    return np.asarray(self.numpy(), dtype=dtype)
 
   def __format__(self, format_spec):
     return self.numpy().__format__(format_spec)
@@ -880,7 +887,10 @@
       self_device = self.device
 
       def grad_fun(dresult):
-        return [dresult._copy(device_name=self_device)]
+        return [
+            dresult._copy(device_name=self_device)
+            if hasattr(dresult, "_copy") else dresult
+        ]
 
       tape.record_operation("_copy", [new_tensor], [self], grad_fun)
     return new_tensor
@@ -1683,7 +1693,8 @@
 
   def __init__(self, values, indices, dense_shape=None):
     """Creates an `IndexedSlices`."""
-    _get_graph_from_inputs([values, indices, dense_shape])
+    if not isinstance(values, tensor_spec.TensorSpec):
+      _get_graph_from_inputs([values, indices, dense_shape])
     self._values = values
     self._indices = indices
     self._dense_shape = dense_shape
@@ -1744,22 +1755,25 @@
       return (self._values, self._indices, self._dense_shape)
 
   @classmethod
-  def _from_components(cls, components):
+  def _from_components(cls, components, metadata):
     return cls(*components)
 
   def _shape_invariant_to_components(self, shape=None):
     if shape is None:
       shape = self._values.shape
     if self._dense_shape is None:
-      return [shape, shape[:1]]  # values, indices
+      return (shape, shape[:1])  # values, indices
     else:
       # values, indices, dense_shape
-      return [shape, shape[:1], tensor_shape.TensorShape([shape.ndims])]
+      return (shape, shape[:1], tensor_shape.TensorShape([shape.ndims]))
 
   @property
   def _is_graph_tensor(self):
     return hasattr(self._values, "graph")
 
+  def consumers(self):
+    return self._consumers()
+
 
 IndexedSlicesValue = collections.namedtuple(
     "IndexedSlicesValue", ["values", "indices", "dense_shape"])
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 76342ab..2ae7d29 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -26,6 +26,7 @@
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import common_shapes
@@ -47,6 +48,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import resources
+from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.gradients  # pylint: disable=unused-import
@@ -156,6 +158,18 @@
     tensor = ops.convert_to_tensor(x, name="tensor")
     self.assertAllEqual(self.evaluate(tensor), [[2, 3], [0, 0], [5, 7]])
 
+  @test_util.run_gpu_only
+  def testEagerCopy(self):
+    with context.eager_mode():
+      var = variables.Variable([[0.0], [0.0], [0.0], [0.0]], name="tensor")
+      with backprop.GradientTape() as tape:
+        a = array_ops.gather(array_ops.gather(var, [0, 1]), [0, 1])
+        b = array_ops.gather(array_ops.gather(var, [2, 3]), [0, 1])
+        r = special_math_ops.einsum("ij,ij->i", a, b)
+      g = tape.gradient(r, [var])[0]  # May be IndexedSlices or a dense tensor.
+      values = g.values if isinstance(g, ops.IndexedSlices) else g
+      self.assertAllEqual(values.get_shape(), [4, 1])  # Same shape as `var`.
+
   @test_util.run_deprecated_v1
   def testNegation(self):
     with self.cached_session():
@@ -3056,7 +3070,7 @@
     return self._components
 
   @classmethod
-  def _from_components(cls, components):
+  def _from_components(cls, components, metadata):
     return cls(*components)
 
   def _shape_invariant_to_components(self, shape=None):
@@ -3102,7 +3116,7 @@
     """Tests that a user can register a CompositeTensor converter."""
     x = _MyTuple((1, [2., 3.], [[4, 5], [6, 7]]))
     y = ops.convert_to_tensor_or_composite(x)
-    self.assertTrue(tensor_util.is_tensor(y))
+    self.assertFalse(tensor_util.is_tensor(y))
     self.assertIsInstance(y, _TupleTensor)
     self.assertLen(y, len(x))
     for x_, y_ in zip(x, y):
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 9f37632..0678b2c 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -25,6 +25,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.util.tf_export import tf_export
 
@@ -114,21 +115,29 @@
       values: A 1-D tensor of any type and shape `[N]`.
       dense_shape: A 1-D int64 tensor of shape `[ndims]`.
     """
-    with ops.name_scope(None, "SparseTensor", [indices, values, dense_shape]):
-      indices = ops.convert_to_tensor(
-          indices, name="indices", dtype=dtypes.int64)
-      # TODO(touts): Consider adding mutable_values() when 'values'
-      # is a VariableOp and updating users of SparseTensor.
-      values = ops.internal_convert_to_tensor(values, name="values")
-      dense_shape = ops.convert_to_tensor(
-          dense_shape, name="dense_shape", dtype=dtypes.int64)
+    if isinstance(indices, tensor_spec.TensorSpec):
+      if not isinstance(values, tensor_spec.TensorSpec):
+        raise TypeError("Expected values to be a TensorSpec")
+      if not isinstance(dense_shape, tensor_spec.TensorSpec):
+        raise TypeError("Expected dense_shape to be a TensorSpec")
+      if indices.dtype != dtypes.int64 or dense_shape.dtype != dtypes.int64:
+        raise TypeError("indices and dense_shape must have dtype=int64")
+    else:
+      with ops.name_scope(None, "SparseTensor", [indices, values, dense_shape]):
+        indices = ops.convert_to_tensor(
+            indices, name="indices", dtype=dtypes.int64)
+        # TODO(touts): Consider adding mutable_values() when 'values'
+        # is a VariableOp and updating users of SparseTensor.
+        values = ops.internal_convert_to_tensor(values, name="values")
+        dense_shape = ops.convert_to_tensor(
+            dense_shape, name="dense_shape", dtype=dtypes.int64)
     self._indices = indices
     self._values = values
     self._dense_shape = dense_shape
 
-    indices_shape = indices.get_shape().with_rank(2)
-    values_shape = values.get_shape().with_rank(1)
-    dense_shape_shape = dense_shape.get_shape().with_rank(1)
+    indices_shape = indices.shape.with_rank(2)
+    values_shape = values.shape.with_rank(1)
+    dense_shape_shape = dense_shape.shape.with_rank(1)
 
     # Assert number of rows in indices match the number of elements in values.
     indices_shape.dims[0].merge_with(values_shape.dims[0])
@@ -166,7 +175,7 @@
   @property
   def op(self):
     """The `Operation` that produces `values` as an output."""
-    return self.values.op
+    return self._values.op
 
   @property
   def dtype(self):
@@ -229,7 +238,7 @@
     return (self._indices, self._values, self._dense_shape)
 
   @classmethod
-  def _from_components(cls, components):
+  def _from_components(cls, components, metadata):
     return cls(*components)
 
   def _shape_invariant_to_components(self, shape=None):
@@ -241,16 +250,19 @@
       raise ValueError("Shape invariant for SparseTensor must have the form "
                        "TensorShape([r]), got %r" % shape)
     rank = tensor_shape.dimension_value(shape[0])
-    return [
+    return (
         tensor_shape.TensorShape([None, rank]),  # indices
         tensor_shape.TensorShape([None]),  # values
-        tensor_shape.TensorShape([rank])
-    ]  # dense_shape
+        tensor_shape.TensorShape([rank])  # dense_shape
+        )
 
   @property
   def _is_graph_tensor(self):
     return hasattr(self._values, "graph")
 
+  def consumers(self):
+    return self._consumers()
+
 
 SparseTensorValue = collections.namedtuple("SparseTensorValue",
                                            ["indices", "values", "dense_shape"])
diff --git a/tensorflow/python/framework/sparse_tensor_spec.py b/tensorflow/python/framework/sparse_tensor_spec.py
new file mode 100644
index 0000000..4c9f163
--- /dev/null
+++ b/tensorflow/python/framework/sparse_tensor_spec.py
@@ -0,0 +1,56 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorSpec factory for sparse tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+
+
+def sparse_tensor_spec(shape=None,
+                       dtype=dtypes.float32,
+                       num_values=None,
+                       name=None):
+  """Returns a tensor specification for a SparseTensor.
+
+  Returns an object which can be passed to `tf.function` (or other
+  functions that expect `TensorSpec`s) to specify shape constraints
+  for a `SparseTensor` argument.
+
+  Args:
+    shape: The shape of the SparseTensor, or `None` to allow any shape. The
+      returned specification object depends only on `shape.ndims`.
+    dtype: Data type of values in the SparseTensor.
+    num_values: The number of values in the SparseTensor, or `None` to allow any
+      number of values.
+    name: Optional name of the `values` spec and prefix of the other specs.
+
+  Returns:
+    A `SparseTensor` whose `indices`, `values` and `dense_shape` components
+    are `TensorSpec`s built from the arguments above.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  rank = tensor_shape.TensorShape(shape).rank  # Only the rank is used below.
+  indices = tensor_spec.TensorSpec([num_values, rank], dtypes.int64,
+                                   ("%s.indices" % name) if name else None)
+  values = tensor_spec.TensorSpec([num_values], dtype, name)
+  dense_shape = tensor_spec.TensorSpec(
+      [rank], dtypes.int64, ("%s.dense_shape" % name) if name else None)
+  return sparse_tensor.SparseTensor(indices, values, dense_shape)
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 06308a2..b6a470d 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -245,7 +245,10 @@
     Returns:
       True if this Dimension and `other` are compatible.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     return (self._value is None or other.value is None or
             self._value == other.value)
 
@@ -293,7 +296,10 @@
       ValueError: If `self` and `other` are not compatible (see
         is_compatible_with).
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     self.assert_is_compatible_with(other)
     if self._value is None:
       return Dimension(other.value)
@@ -322,7 +328,10 @@
     Returns:
       A Dimension whose value is the sum of `self` and `other`.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     if self._value is None or other.value is None:
       return Dimension(None)
     else:
@@ -361,7 +370,10 @@
     Returns:
       A Dimension whose value is the subtraction of `other` from `self`.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     if self._value is None or other.value is None:
       return Dimension(None)
     else:
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index b3621a4..3cebed7 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,7 +22,6 @@
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -939,20 +938,16 @@
 
 @tf_export("is_tensor")
 def is_tensor(x):  # pylint: disable=invalid-name
-  """Check whether `x` is of tensor type.
+  """Checks whether `x` is a tensor or "tensor-like".
 
-  Check whether an object is a tensor or a composite tensor. This check is
-  equivalent to calling
-  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.RaggedTensor, tf.Variable))`
-  and also checks if all the component variables of a MirroredVariable or a
-  SyncOnReadVariable are tensors.
+  If `is_tensor(x)` returns `True`, it is safe to assume that `x` is a tensor or
+  can be converted to a tensor using `ops.convert_to_tensor(x)`.
 
   Args:
     x: A python object to check.
 
   Returns:
-    `True` if `x` is a tensor, `False` if not.
+    `True` if `x` is a tensor or "tensor-like", `False` if not.
   """
   return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or  # pylint: disable=protected-access
-          isinstance(x, composite_tensor.CompositeTensor) or
-          (hasattr(x, "is_tensor_like") and x.is_tensor_like))
+          getattr(x, "is_tensor_like", False))
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index b28de54..3d24a30 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -256,8 +256,12 @@
   return pywrap_tensorflow.IsGoogleCudaEnabled()
 
 
-def CudaSupportsHalfMatMulAndConv():
-  return pywrap_tensorflow.CudaSupportsHalfMatMulAndConv()
+def IsBuiltWithROCm():
+  return pywrap_tensorflow.IsBuiltWithROCm()
+
+
+def GpuSupportsHalfMatMulAndConv():
+  return pywrap_tensorflow.GpuSupportsHalfMatMulAndConv()
 
 
 def IsMklEnabled():
@@ -1333,13 +1337,17 @@
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
 
+  Warning: if a non-GPU version of the package is installed, the function would
+  also return False. Use `tf.test.is_built_with_cuda` to validate if TensorFlow
+  was built with CUDA support.
+
   Args:
-    cuda_only: limit the search to CUDA gpus.
+    cuda_only: limit the search to CUDA GPUs.
     min_cuda_compute_capability: a (major,minor) pair that indicates the minimum
       CUDA compute capability required, or None if no requirement.
 
   Returns:
-    True if a gpu device of the requested kind is available.
+    True if a GPU device of the requested kind is available.
   """
 
   def compute_capability_from_device_desc(device_desc):
@@ -1546,20 +1554,86 @@
   return disable_xla_impl
 
 
-# The description is just for documentation purposes.
-def disable_all_xla(description):
+def for_all_test_methods(decorator, *args, **kwargs):
+  """Generate class-level decorator from given method-level decorator.
 
-  def disable_all_impl(cls):
-    """Execute all test methods in this class only if xla is not enabled."""
-    base_decorator = disable_xla
+  It is expected for the given decorator to take some arguments and return
+  a method that is then called on the test method to produce a decorated
+  method.
+
+  Args:
+    decorator: The decorator to apply.
+    *args: Positional arguments forwarded to `decorator`.
+    **kwargs: Keyword arguments forwarded to `decorator`.
+  Returns: Function that will decorate a given class's test methods with the
+    decorator.
+  """
+
+  def all_test_methods_impl(cls):
+    """Apply decorator to all test methods in class."""
     for name in dir(cls):
       value = getattr(cls, name)
       if callable(value) and name.startswith(
-          "test") and not name == "test_session":
-        setattr(cls, name, base_decorator(description)(value))
+          "test") and (name != "test_session"):
+        setattr(cls, name, decorator(*args, **kwargs)(value))
     return cls
 
-  return disable_all_impl
+  return all_test_methods_impl
+
+
+# The description is just for documentation purposes.
+def no_xla_auto_jit(description):  # pylint: disable=unused-argument
+  """Returns a decorator that turns a test into a no-op when XLA is enabled."""
+  def no_xla_auto_jit_impl(func):
+    """This test is not intended to be run with XLA auto jit enabled."""
+
+    def decorator(func):
+
+      def decorated(self, *args, **kwargs):
+        if is_xla_enabled():
+          # XLA is forced on: return without running the test body.
+          return
+        else:
+          return func(self, *args, **kwargs)
+
+      return decorated
+
+    if func is not None:
+      return decorator(func)
+
+    return decorator
+
+  return no_xla_auto_jit_impl
+
+
+# The description is just for documentation purposes.
+def xla_allow_fallback(description):  # pylint: disable=unused-argument
+  """Returns a decorator that lets a test fall back to TF classic under XLA."""
+  def xla_allow_fallback_impl(func):
+    """Allow fallback to TF even though testing xla."""
+
+    def decorator(func):
+
+      def decorated(self, *args, **kwargs):
+        if is_xla_enabled():
+          # Enable lazy compilation in the global XLABuildOpsPassFlags so the
+          # compiler can fall back to TF classic; `finally` restores the flag.
+          old_value = pywrap_tensorflow.TF_SetXlaEnableLazyCompilation(True)
+          try:
+            return func(self, *args, **kwargs)
+          finally:
+            pywrap_tensorflow.TF_SetXlaEnableLazyCompilation(old_value)
+        else:
+          return func(self, *args, **kwargs)
+
+      return decorated
+
+    if func is not None:
+      return decorator(func)
+
+    return decorator
+
+  return xla_allow_fallback_impl
 
 
 class EagerSessionWarner(object):
@@ -1581,10 +1655,10 @@
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TensorFlowTestCase, self).__init__(methodName)
     if is_xla_enabled():
-      os.putenv(
-          "TF_XLA_FLAGS", "--tf_xla_auto_jit=2 --tf_xla_min_cluster_size=1 "
-          "--tf_xla_enable_lazy_compilation=false " +
-          os.getenv("TF_XLA_FLAGS", ""))
+      pywrap_tensorflow.TF_SetXLaAutoJitMode("2")
+      pywrap_tensorflow.TF_SetXlaMinClusterSize(1)
+      pywrap_tensorflow.TF_SetXlaEnableLazyCompilation(False)
+
     self._threads = []
     self._tempdir = None
     self._cached_session = None
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 3b6d2ce..7a7761b 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -201,6 +201,8 @@
       'VecPermuteNCHWToNHWC-LayoutOptimizer')
 
 
+@test_util.for_all_test_methods(test_util.no_xla_auto_jit,
+                                'Test does not apply in XLA setting')
 class LayoutOptimizerTest(test.TestCase):
   """Tests the Grappler layout optimizer."""
 
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 881395e..4e63547 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -117,7 +117,7 @@
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_lib",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:distribute_coordinator",
@@ -194,8 +194,10 @@
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/eager:monitoring",
         "//tensorflow/python/keras/distribute",
         "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable",
+        "//tensorflow/python/keras/mixed_precision/experimental:loss_scale_optimizer",
         "//tensorflow/python/keras/mixed_precision/experimental:policy",
         "//tensorflow/python/module",
         "//tensorflow/python/training/tracking:data_structures",
@@ -698,6 +700,19 @@
 )
 
 tf_py_test(
+    name = "subclassed_layers_test",
+    size = "medium",
+    srcs = ["layers/subclassed_layers_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 3,
+)
+
+tf_py_test(
     name = "dense_attention_test",
     size = "medium",
     srcs = ["layers/dense_attention_test.py"],
@@ -1256,6 +1271,7 @@
     shard_count = 6,
     tags = [
         "no_oss",
+        "noasan",  # TODO(b/132183295): Re-enable this.
         "notsan",
     ],
 )
@@ -1315,9 +1331,23 @@
 )
 
 tf_py_test(
-    name = "topology_test",
+    name = "custom_training_loop_test",
     size = "medium",
-    srcs = ["engine/topology_test.py"],
+    srcs = ["custom_training_loop_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "network_test",
+    size = "medium",
+    srcs = ["engine/network_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index b7ec638..64fa731 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -21,6 +21,8 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import tf2
+
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import applications
 from tensorflow.python.keras import backend
@@ -46,7 +48,10 @@
 
 from tensorflow.python.util.tf_export import keras_export
 
-__version__ = '2.2.4-tf'
+if tf2.enabled():
+  __version__ = '2.3.0-tf'
+else:
+  __version__ = '2.2.4-tf'
 
 keras_export('keras.__version__').export_constant(__name__, '__version__')
 
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index aa1cf5b..5f7ade6 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -95,12 +95,38 @@
 def selu(x):
   """Scaled Exponential Linear Unit (SELU).
 
-  SELU is equal to: `scale * elu(x, alpha)`, where alpha and scale
-  are pre-defined constants. The values of `alpha` and `scale` are
+  The Scaled Exponential Linear Unit (SELU) activation function is:
+  `scale * x` if `x > 0` and `scale * alpha * (exp(x) - 1)` if `x <= 0`
+  where `alpha` and `scale` are pre-defined constants
+  (`alpha = 1.67326324`
+  and `scale = 1.05070098`).
+  The SELU activation function multiplies `scale` > 1 with the
+  [elu](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/activations/elu)
+  (Exponential Linear Unit (ELU)) to ensure a slope larger than one
+  for positive net inputs.
+
+  The values of `alpha` and `scale` are
   chosen so that the mean and variance of the inputs are preserved
   between two consecutive layers as long as the weights are initialized
-  correctly (see `lecun_normal` initialization) and the number of inputs
-  is "large enough" (see references for more information).
+  correctly (see [`lecun_normal` initialization]
+  (https://www.tensorflow.org/api_docs/python/tf/keras/initializers/lecun_normal))
+  and the number of inputs is "large enough"
+  (see references for more information).
+
+  ![](https://cdn-images-1.medium.com/max/1600/1*m0e8lZU_Zrkh4ESfQkY2Pw.png)
+  (Courtesy: Blog on Towards DataScience at
+  https://towardsdatascience.com/selu-make-fnns-great-again-snn-8d61526802a9)
+
+  Example Usage:
+  ```python3
+  n_classes = 10 #10-class problem
+  model = models.Sequential()
+  model.add(Dense(64, kernel_initializer='lecun_normal', activation='selu',
+  input_shape=(28, 28, 1)))
+  model.add(Dense(32, kernel_initializer='lecun_normal', activation='selu'))
+  model.add(Dense(16, kernel_initializer='lecun_normal', activation='selu'))
+  model.add(Dense(n_classes, activation='softmax'))
+  ```
 
   Arguments:
       x: A tensor or variable to compute the activation function for.
@@ -109,11 +135,14 @@
       The scaled exponential unit activation: `scale * elu(x, alpha)`.
 
   # Note
-      - To be used together with the initialization "lecun_normal".
-      - To be used together with the dropout variant "AlphaDropout".
+      - To be used together with the initialization "[lecun_normal]
+      (https://www.tensorflow.org/api_docs/python/tf/keras/initializers/lecun_normal)".
+      - To be used together with the dropout variant "[AlphaDropout]
+      (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AlphaDropout)".
 
   References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+      [Self-Normalizing Neural Networks (Klambauer et al, 2017)]
+      (https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 5bf4d88..21f6f29 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -95,6 +95,9 @@
 # either train mode (learning_phase == 1) or test mode (learning_phase == 0).
 _GRAPH_LEARNING_PHASES = weakref.WeakKeyDictionary()
 
+# This dictionary holds a mapping {graph: set_of_freezable_variables}.
+# Each set tracks objects created via `freezable_variable` in the graph.
+_FREEZABLE_VARS = weakref.WeakKeyDictionary()
 
 # _DUMMY_EAGER_GRAPH is used as a key in _GRAPH_LEARNING_PHASES.
 # We keep a separate reference to it to make sure it does not get removed from
@@ -221,18 +224,22 @@
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   global _GRAPH_VARIABLES  # pylint: disable=global-variable-not-assigned
   global _GRAPH_TF_OPTIMIZERS  # pylint: disable=global-variable-not-assigned
+  global _GRAPH
+  global _FREEZABLE_VARS
+  _GRAPH = None
   ops.reset_default_graph()
   reset_uids()
   _SESSION.session = None
   graph = get_graph()
   with graph.as_default():
-    with ops.name_scope(''):
+    with name_scope(''):
       phase = array_ops.placeholder_with_default(
           False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES = {}
     _GRAPH_LEARNING_PHASES[graph] = phase
     _GRAPH_VARIABLES.pop(graph, None)
     _GRAPH_TF_OPTIMIZERS.pop(graph, None)
+    _FREEZABLE_VARS.pop(graph, None)
 
 
 @keras_export('keras.backend.manual_variable_initialization')
@@ -243,7 +250,7 @@
   variables should be initialized
   as they are instantiated (default), or if
   the user should handle the initialization
-  (e.g. via `tf.initialize_all_variables()`).
+  (e.g. via `tf.compat.v1.initialize_all_variables()`).
 
   Arguments:
       value: Python boolean.
@@ -287,7 +294,7 @@
   graph = get_graph()
   with graph.as_default():
     if graph not in _GRAPH_LEARNING_PHASES:
-      with ops.name_scope(''):
+      with name_scope(''):
         phase = array_ops.placeholder_with_default(
             False, shape=(), name='keras_learning_phase')
       _GRAPH_LEARNING_PHASES[graph] = phase
@@ -386,7 +393,7 @@
   """
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   assert value in {0, 1}
-  assert context.executing_eagerly()
+  assert ops.executing_eagerly_outside_functions()
   previous_value = learning_phase()
   try:
     _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
@@ -692,7 +699,32 @@
     return tensor
 
 
-name_scope = ops.name_scope
+@keras_export('keras.backend.name_scope', v1=[])
+def name_scope(name):
+  """Returns a context manager for use when defining a Python op.
+
+  This context manager pushes a name scope, which will make the name of all
+  operations added within it have a prefix.
+
+  For example, to define a new Python op called `my_op`:
+
+  ```python
+  def my_op(a):
+    with tf.name_scope("MyOp") as scope:
+      a = tf.convert_to_tensor(a, name="a")
+      # Define some computation that uses `a`.
+      return foo_op(..., name=scope)
+  ```
+
+  When executed, the Tensor `a` will have the name `MyOp/a`.
+
+  Args:
+    name: The prefix to use on all names created within the name scope.
+
+  Returns:
+    The name scope context manager created by `ops.name_scope_v2(name)`.
+  """
+  return ops.name_scope_v2(name)
 
 
 @keras_export('keras.backend.variable')
@@ -812,13 +844,6 @@
   if dtype is None:
     dtype = floatx()
 
-  # If the outer context is eager but we are executing under the keras
-  # FuncGraph, we create EagerTensors and use them as constants.
-  if (ops.executing_eagerly_outside_functions() and
-      getattr(get_graph(), 'name', '') == 'keras_graph'):
-    with ops.init_scope():
-      return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
-
   return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
 
 
@@ -846,7 +871,7 @@
       >>> np_var = numpy.array([1, 2])
       >>> K.is_keras_tensor(np_var) # A numpy array is not a symbolic tensor.
       ValueError
-      >>> k_var = tf.placeholder('float32', shape=(1,1))
+      >>> k_var = tf.compat.v1.placeholder('float32', shape=(1,1))
       >>> K.is_keras_tensor(k_var) # A variable indirectly created outside of
       keras is not a Keras tensor.
       False
@@ -931,6 +956,55 @@
     return False
 
 
+def freezable_variable(value, shape=None, name=None):
+  """A tensor-like object whose value can be updated only up until execution.
+
+  After creating the freezable variable, you can update its value by calling
+  `var.update_value(new_value)` (similar to a regular variable).
+  Unlike an actual variable, the value used during execution is the current
+  value at the time the execution function (`backend.function()`) was created.
+
+  This is an internal API, expected to be temporary. It is used to implement a
+  mutable `trainable` property for `BatchNormalization` layers, with a frozen
+  value after model compilation.
+
+  We don't use a plain variable in this case because we need the value used
+  in a specific model to be frozen after `compile` has been called
+  (e.g. GAN use case).
+
+  Arguments:
+    value: The initial value for the tensor-like object.
+    shape: The shape for the tensor-like object (cannot be changed).
+    name: The name for the tensor-like object.
+
+  Returns:
+    A tensor-like object with a static value that can be updated via
+    `x.update_value(new_value)`, up until creating an execution function
+    (afterwards the value is fixed).
+  """
+  graph = get_graph()
+  with graph.as_default():
+    x = array_ops.placeholder_with_default(
+        value, shape=shape, name=name)
+    x._initial_value = value
+    x._current_value = value
+    # Accessors for the Python-side value stored on `x` as `_current_value`.
+    def update_value(new_value):
+      x._current_value = new_value
+
+    def get_value():
+      return x._current_value
+    # Expose the accessors as attributes of the returned object.
+    x.update_value = update_value
+    x.get_value = get_value
+    # Register `x` in the per-graph weak set of freezable variables.
+    global _FREEZABLE_VARS
+    if graph not in _FREEZABLE_VARS:
+      _FREEZABLE_VARS[graph] = weakref.WeakSet()
+    _FREEZABLE_VARS[graph].add(x)
+  return x
+
+
 @keras_export('keras.backend.shape')
 def shape(x):
   """Returns the symbolic shape of a tensor or variable.
@@ -1351,21 +1425,20 @@
   Returns:
       Keras tensor with dtype `dtype`.
 
-  Example:
+  Examples:
+      Cast a float32 variable to a float64 tensor
+
   ```python
-      >>> from keras import backend as K
-      >>> input = K.placeholder((2, 3), dtype='float32')
-      >>> input
-      <tf.Tensor 'Placeholder_2:0' shape=(2, 3) dtype=float32>
-      # It doesn't work in-place as below.
-      >>> K.cast(input, dtype='float16')
-      <tf.Tensor 'Cast_1:0' shape=(2, 3) dtype=float16>
-      >>> input
-      <tf.Tensor 'Placeholder_2:0' shape=(2, 3) dtype=float32>
-      # you need to assign it.
-      >>> input = K.cast(input, dtype='float16')
-      >>> input
-      <tf.Tensor 'Cast_2:0' shape=(2, 3) dtype=float16>
+      >>> import tensorflow as tf
+      >>> from tensorflow.keras import backend as K
+      >>> input = K.ones(shape=(1,3))
+      >>> print(input)
+      >>> cast_input = K.cast(input, dtype='float64')
+      >>> print(cast_input)
+
+      <tf.Variable 'Variable:0' shape=(1, 3) dtype=float32,
+           numpy=array([[1., 1., 1.]], dtype=float32)>
+      tf.Tensor([[1. 1. 1.]], shape=(1, 3), dtype=float64)
   ```
   """
   return math_ops.cast(x, dtype)
@@ -2642,6 +2715,17 @@
 
   Returns:
       A tensor.
+
+  Examples:
+    Flattening a 4D tensor to 2D by collapsing all non-batch dimensions.
+
+  ```python
+      >>> from tensorflow.keras import backend as K
+      >>> x_batch = K.ones(shape=(2, 3, 4, 5))
+      >>> x_batch_flatten = K.batch_flatten(x_batch)
+      >>> K.int_shape(x_batch_flatten)
+      (2, 60)
+  ```
   """
   x = array_ops.reshape(x, array_ops.stack([-1, prod(shape(x)[1:])]))
   return x
@@ -3214,6 +3298,9 @@
           # `update.op` may have been None in certain cases.
           updates_ops.append(update)
 
+    self._freezable_vars_to_feed = []
+    self._freezable_vars_values = []
+    freezable_vars_from_keras_graph = _FREEZABLE_VARS.get(global_graph, {})
     with _scratch_graph() as exec_graph:
       global_graph = get_graph()
       if source_graph not in (exec_graph, global_graph):
@@ -3234,6 +3321,18 @@
         legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new))
                              for p, p_new in legacy_update_ops]
 
+        # Keep track of the value to feed to any "freezable variables"
+        # created in this graph.
+        for old_op, new_op in lifted_map.items():
+          if old_op in freezable_vars_from_keras_graph:
+            frozen_var = old_op
+            if frozen_var._initial_value != frozen_var._current_value:
+              # We only feed a frozen_variable if its value has changed;
+              # otherwise it can rely on the default value of the
+              # underlying placeholder_with_default.
+              self._freezable_vars_to_feed.append(new_op)
+              self._freezable_vars_values.append(frozen_var._current_value)
+
     # Consolidate updates
     with exec_graph.as_default():
       outputs = cast_variables_to_tensor(outputs)
@@ -3242,14 +3341,16 @@
           updates_ops.append(state_ops.assign(p, p_new))
 
       self.inputs, self.outputs = inputs, outputs
+      self._input_references = self.inputs + self._freezable_vars_to_feed
       with ops.control_dependencies(updates_ops):
         self.outputs[0] = array_ops.identity(self.outputs[0])
 
-      exec_graph.inputs = self.inputs + list(exec_graph.captures.values())
+      exec_graph.inputs = self._input_references + list(
+          exec_graph.captures.values())
       exec_graph.outputs = self.outputs
       graph_fn = eager_function.ConcreteFunction(exec_graph)
 
-    graph_fn._num_positional_args = len(self.inputs)
+    graph_fn._num_positional_args = len(self._input_references)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
 
@@ -3263,9 +3364,11 @@
               x.op.inputs[0])
 
   def __call__(self, inputs):
-    inputs = nest.flatten(inputs)
+    input_values = nest.flatten(inputs)
+    if self._freezable_vars_values:
+      input_values = input_values + self._freezable_vars_values
     converted_inputs = []
-    for tensor, value in zip(self.inputs, inputs):
+    for tensor, value in zip(self._input_references, input_values):
       if value is None:
         # Assume `value` is a placeholder with default
         value = self._placeholder_default_values.get(tensor, None)
@@ -5381,6 +5484,8 @@
 
 def in_multi_worker_mode():
   """Whether we are operating in a Multi-Worker setting."""
+  # TODO(rchao): Consider a warning if user uses multiple `model` method
+  # calls in multi-worker setting.
   tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
   cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
   return tf_config and 'master' not in cluster_spec.jobs
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 4c3a864..e7b1a55 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -92,6 +92,21 @@
                          str(keras_output))
 
 
+class BackendResetTest(test.TestCase, parameterized.TestCase):
+  """Checks that `clear_session()` makes the backend hand out a new graph."""
+
+  # We can't use the normal parameterized decorator because the test session
+  # will block graph clearing.
+  @parameterized.named_parameters(('_v1', context.graph_mode),
+                                  ('_v2', context.eager_mode))
+  def test_new_graph(self, test_context):
+    with test_context():
+      g_old = keras.backend.get_graph()
+      keras.backend.clear_session()
+      g = keras.backend.get_graph()
+      self.assertIsNot(g_old, g)
+
+
 @test_util.run_all_in_graph_and_eager_modes
 class BackendUtilsTest(test.TestCase):
 
@@ -1743,8 +1758,8 @@
 
   @test_util.run_deprecated_v1
   def test_function_tf_fetches(self):
-    # Additional operations can be passed to tf.Session().run() via its
-    # `fetches` arguments. In contrast to `updates` argument of
+    # Additional operations can be passed to tf.compat.v1.Session().run() via
+    # its `fetches` arguments. In contrast to `updates` argument of
     # keras.backend.function() these do not have control dependency on `outputs`
     # so they can run in parallel. Also they should not contribute to output of
     # keras.backend.function().
@@ -1766,9 +1781,9 @@
 
   @test_util.run_deprecated_v1
   def test_function_tf_feed_dict(self):
-    # Additional substitutions can be passed to `tf.Session().run()` via its
-    # `feed_dict` arguments. Note that the feed_dict is passed once in the
-    # constructor but we can modify the values in the dictionary. Through
+    # Additional substitutions can be passed to `tf.compat.v1.Session().run()`
+    # via its `feed_dict` arguments. Note that the feed_dict is passed once in
+    # the constructor but we can modify the values in the dictionary. Through
     # this feed_dict we can provide additional substitutions besides Keras
     # inputs.
     with self.cached_session():
@@ -1865,5 +1880,39 @@
       self.assertIsNot(session, keras.backend.get_session())
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class ControlOpsTests(test.TestCase):
+  """Tests for `keras.backend.switch`."""
+
+  def test_function_switch_basics(self):
+    """A scalar condition picks which of the two callables is evaluated."""
+    lo = array_ops.constant(2.0)
+    hi = array_ops.constant(3.0)
+
+    def lo_pow_hi():
+      return keras.backend.pow(lo, hi)
+
+    def hi_pow_lo():
+      return keras.backend.pow(hi, lo)
+
+    for compare, expected in ((keras.backend.less, [8.0]),
+                              (keras.backend.greater, [9.0])):
+      tensor = keras.backend.switch(compare(lo, hi), lo_pow_hi, hi_pow_lo)
+      self.assertEqual(keras.backend.eval(tensor), expected)
+
+  def test_unequal_rank(self):
+    """A rank-2 condition with a rank-1 first branch raises `ValueError`."""
+    matrix = ops.convert_to_tensor(
+        np.array([[1, 2, 3], [4, 5, 6]]), dtype='float32')
+    vector = ops.convert_to_tensor(np.array([1, 2, 3]), dtype='float32')
+
+    expected_error = 'Rank of `condition` should be less than'
+    with self.assertRaisesRegexp(ValueError, expected_error):
+      # The rank-1 vector is the "then" branch and the matrix the "else" one.
+      keras.backend.switch(
+          keras.backend.equal(matrix, matrix), lambda: vector, lambda: matrix)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 6374c8f..ffe0501 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -25,6 +25,7 @@
 import io
 import json
 import os
+import re
 import tempfile
 import time
 
@@ -42,6 +43,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpoint_management
 from tensorflow.python.util.tf_export import keras_export
 
 try:
@@ -49,6 +51,11 @@
 except ImportError:
   requests = None
 
+# Constant for `tf.keras.Model` to store the epoch at which the most recently
+# saved checkpoint was saved. See `Model._get_updated_initial_epoch()`'s
+# docstring for more information.
+CKPT_SAVED_EPOCH = '_ckpt_saved_epoch'
+
 
 def configure_callbacks(callbacks,
                         model,
@@ -109,6 +116,18 @@
       mode=mode)
 
   callback_list.model.stop_training = False
+  # pylint: disable=protected-access
+  if callback_list.model._ckpt_saved_epoch is not None:
+    # The attribute `_ckpt_saved_epoch` is supposed to be None at the start of
+    # training (it should be made None at the end of successful multi-worker
+    # training), unless the user's `fit()` does not end successfully before
+    # making another `fit()` call.
+    raise ValueError(
+        '`tf.Keras.Model._ckpt_saved_epoch` attr should be None at '
+        'callback setup time. Please ensure `fit()` in multi-worker '
+        'training finishes successfully before starting a new one. If the '
+        'issue persists, try using only one `model.fit()` in multi-worker '
+        'training.')
   return callback_list
 
 
@@ -421,6 +440,7 @@
           (eg. verbosity, batch size, number of epochs...).
       model: instance of `keras.models.Model`.
           Reference of the model being trained.
+      validation_data: Deprecated. Do not use.
 
   The `logs` dictionary that callback methods
   take as argument will contain keys for quantities relevant to
@@ -904,11 +924,34 @@
     if (self.load_weights_on_restart and self.filepath is not None and
         os.path.exists(self.filepath)):
       try:
-        self.model.load_weights(self.filepath)
+        # `filepath` may contain placeholders such as `{epoch:02d}`, and thus
+        # it attempts to load the most recently modified file with file name
+        # matching the pattern.
+        self.model.load_weights(
+            self._get_most_recently_modified_file_matching_pattern(
+                self.filepath))
       except (IOError, ValueError) as e:
         raise ValueError('Error loading file from {}. Reason: {}'.format(
             self.filepath, e))
 
+  def on_train_end(self, logs=None):
+    """Clears `model._ckpt_saved_epoch` and may save the weights once more."""
+    logs = logs or {}
+    # pylint: disable=protected-access
+    saved_epoch = self.model._ckpt_saved_epoch
+    if saved_epoch is None:
+      return
+    # The attribute is only used during training, so clear it. Fault tolerance
+    # across several `model.fit()` or other `model` calls is not supported.
+    self.model._ckpt_saved_epoch = None
+    # TODO(rchao): Support all `save_weights_only` and `save_best_only` cases.
+    # This will be done with the help of a decoupled training state file that
+    # contains both epoch and model weights.
+    if self.save_weights_only and not self.save_best_only:
+      handle, path = self._get_file_handle_and_path(saved_epoch, logs)
+      self.model.save_weights(path, overwrite=True)
+      self._maybe_remove_file(handle, path)
+
   def on_batch_end(self, batch, logs=None):
     logs = logs or {}
     if isinstance(self.save_freq, int):
@@ -937,23 +980,7 @@
     if isinstance(self.save_freq,
                   int) or self.epochs_since_last_save >= self.period:
       self.epochs_since_last_save = 0
-
-      # TODO(rchao): Replace dc_context reference with
-      # distributed_training_utils.should_current_worker_checkpoint() once
-      # distributed_training_utils.py no longer depends on callbacks.py.
-      if not K.in_multi_worker_mode() or dc_context.get_current_worker_context(
-      ).should_checkpoint:
-        filepath = self.filepath.format(epoch=epoch + 1, **logs)
-      else:
-        # If this is multi-worker training, and this worker should not
-        # save checkpoint, we replace the filepath with a dummy filepath so
-        # it writes to a file that will be removed at the end of _save_model()
-        # call. This is because the SyncOnReadVariable needs to be synced across
-        # all the workers in order to be read, and all workers need to initiate
-        # that.
-        file_handle, temp_file_name = tempfile.mkstemp()
-        extension = os.path.splitext(self.filepath)[1]
-        filepath = temp_file_name + '.' + extension
+      file_handle, filepath = self._get_file_handle_and_path(epoch, logs)
 
       if self.save_best_only:
         current = logs.get(self.monitor)
@@ -979,16 +1006,101 @@
         if self.verbose > 0:
           print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
         if self.save_weights_only:
+          if K.in_multi_worker_mode():
+            # TODO(rchao): Save to an additional training state file for FT,
+            # instead of adding an attr to weight file. With this we can support
+            # the cases of all combinations with `save_weights_only`,
+            # `save_best_only`, and `save_format` parameters.
+            # pylint: disable=protected-access
+            self.model._ckpt_saved_epoch = epoch
           self.model.save_weights(filepath, overwrite=True)
         else:
           self.model.save(filepath, overwrite=True)
 
-      # Remove the file in multi-worker training where this worker should
-      # not checkpoint.
-      if K.in_multi_worker_mode(
-      ) and not dc_context.get_current_worker_context().should_checkpoint:
-        os.close(file_handle)
-        os.remove(filepath)
+      self._maybe_remove_file(file_handle, filepath)
+
+  def _get_file_handle_and_path(self, epoch, logs):
+    """Returns `(file_handle, filepath)` for the checkpoint about to be saved.
+
+    `file_handle` is None when `filepath` is the real, formatted checkpoint
+    path, and an open file descriptor when `filepath` is a dummy temp file.
+    """
+    # TODO(rchao): Replace dc_context reference with
+    # distributed_training_utils.should_current_worker_checkpoint() once
+    # distributed_training_utils.py no longer depends on callbacks.py.
+    if not K.in_multi_worker_mode() or dc_context.get_current_worker_context(
+    ).should_checkpoint:
+      return None, self.filepath.format(epoch=epoch + 1, **logs)
+    # A worker that should not checkpoint still has to save (the
+    # SyncOnReadVariable must be synced across all workers to be read), so it
+    # writes to a dummy file. `splitext` keeps the leading dot, hence `suffix`;
+    # the path returned is the temp file itself, which is removed after saving.
+    extension = os.path.splitext(self.filepath)[1]
+    return tempfile.mkstemp(suffix=extension)
+
+  def _maybe_remove_file(self, file_handle, filepath):
+    """Closes and deletes the dummy file of a non-checkpointing worker."""
+    # Outside multi-worker training, or on the worker that should checkpoint,
+    # `filepath` is a real checkpoint and nothing is removed.
+    if (K.in_multi_worker_mode() and
+        not dc_context.get_current_worker_context().should_checkpoint):
+      os.close(file_handle)
+      os.remove(filepath)
+
+  def _get_most_recently_modified_file_matching_pattern(self, pattern):
+    """Returns the most recently modified filepath matching pattern.
+
+    Pattern may contain python formatting placeholders. If
+    `tf.train.latest_checkpoint()` returns a checkpoint whose name matches the
+    pattern, use that; otherwise, check for the most recently modified file
+    that matches the pattern. This utility function is best demonstrated via
+    an example:
+
+    ```python
+    file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5'
+    test_dir = self.get_temp_dir()
+    path_pattern = os.path.join(test_dir, file_pattern)
+    file_paths = [
+        os.path.join(test_dir, file_name) for file_name in
+        ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.batch01epoch01.h5']
+    ]
+    for file_path in file_paths:
+      # Write something to each of the files
+    self.assertEqual(
+        _get_most_recently_modified_file_matching_pattern(path_pattern),
+        file_paths[-1])
+    ```
+
+    Arguments:
+        pattern: The file pattern that may optionally contain python
+            placeholders such as `{epoch:02d}`.
+
+    Returns:
+        The most recently modified file's full filepath matching `pattern`. If
+        `pattern` does not contain any placeholder, this returns the filepath
+        that exactly matches `pattern`. Returns `None` if no match is found.
+    """
+    dir_name = os.path.dirname(pattern)
+    # Escape the literal pieces and let each `{...}` placeholder match any
+    # text, so that e.g. the `.` before an extension is not a wildcard.
+    literals = re.split(r'{.*?}', os.path.basename(pattern))
+    base_name_regex = '^' + '.*'.join(re.escape(s) for s in literals) + '$'
+
+    # If tf.train.latest_checkpoint tells us there exists a latest checkpoint,
+    # use that as it is more robust than `os.path.getmtime()`.
+    latest_tf_checkpoint = checkpoint_management.latest_checkpoint(dir_name)
+    if latest_tf_checkpoint is not None and re.match(
+        base_name_regex, os.path.basename(latest_tf_checkpoint)):
+      return latest_tf_checkpoint
+
+    # A pattern without a directory part refers to the current directory.
+    candidates = [
+        os.path.join(dir_name, file_name)
+        for file_name in os.listdir(dir_name or os.curdir)
+        if re.match(base_name_regex, file_name)
+    ]
+    # `max` keeps the first of equally recent files, like a strict `>` scan.
+    return max(candidates, key=os.path.getmtime) if candidates else None
 
 
 @keras_export('keras.callbacks.EarlyStopping')
@@ -1018,6 +1130,16 @@
           the epoch with the best value of the monitored quantity.
           If False, the model weights obtained at the last step of
           training are used.
+
+  Example:
+
+  ```python
+  callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
+  # This callback will stop the training when there is no improvement in
+  # the validation loss for three consecutive epochs.
+  model.fit(data, labels, epochs=100, callbacks=[callback],
+      validation_data=(val_data, val_labels))
+  ```
   """
 
   def __init__(self,
@@ -1167,6 +1289,20 @@
           (integer, indexed from 0) and returns a new
           learning rate as output (float).
       verbose: int. 0: quiet, 1: update messages.
+
+  ```python
+  # This function keeps the learning rate at 0.001 for the first ten epochs
+  # and decreases it exponentially after that.
+  def scheduler(epoch):
+    if epoch < 10:
+      return 0.001
+    else:
+      return 0.001 * tf.math.exp(0.1 * (10 - epoch))
+
+  callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
+  model.fit(data, labels, epochs=100, callbacks=[callback],
+            validation_data=(val_data, val_labels))
+  ```
   """
 
   def __init__(self, schedule, verbose=0):
@@ -1238,6 +1374,14 @@
       profile_batch: Profile the batch to sample compute characteristics. By
         default, it will profile the second batch. Set profile_batch=0 to
         disable profiling. Must run in TensorFlow eager mode.
+      embeddings_freq: frequency (in epochs) at which embedding layers will
+        be visualized. If set to 0, embeddings won't be visualized.
+      embeddings_metadata: a dictionary which maps layer name to a file name in
+        which metadata for this embedding layer is saved. See the
+        [details](
+          https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
+        about the metadata file format. If the same metadata file is used
+        for all embedding layers, a single string can be passed.
 
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
@@ -1252,6 +1396,8 @@
                write_images=False,
                update_freq='epoch',
                profile_batch=2,
+               embeddings_freq=0,
+               embeddings_metadata=None,
                **kwargs):
     super(TensorBoard, self).__init__()
     self._validate_kwargs(kwargs)
@@ -1264,6 +1410,8 @@
       self.update_freq = 1
     else:
       self.update_freq = update_freq
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_metadata = embeddings_metadata
 
     self._samples_seen = 0
     self._samples_seen_at_last_write = 0
@@ -1292,17 +1440,21 @@
     if kwargs.get('write_grads', False):
       logging.warning('`write_grads` will be ignored in TensorFlow 2.0 '
                       'for the `TensorBoard` Callback.')
-    if kwargs.get('embeddings_freq', False):
-      logging.warning('Embeddings will be ignored in TensorFlow 2.0 '
-                      'for the `TensorBoard` Callback.')
     if kwargs.get('batch_size', False):
       logging.warning('`batch_size` is no longer needed in the '
                       '`TensorBoard` Callback and will be ignored '
                       'in TensorFlow 2.0.')
+    if kwargs.get('embeddings_layer_names', False):
+      logging.warning('`embeddings_layer_names` is not supported in '
+                      'TensorFlow 2.0. Instead, all `Embedding` layers '
+                      'will be visualized.')
+    if kwargs.get('embeddings_data', False):
+      logging.warning('`embeddings_data` is not supported in TensorFlow '
+                      '2.0. Instead, all `Embedding` variables will be '
+                      'visualized.')
 
     unrecognized_kwargs = set(kwargs.keys()) - {
-        'write_grads', 'embeddings_freq', 'embeddings_layer_names',
-        'embeddings_metadata', 'embeddings_data', 'batch_size'
+        'write_grads', 'embeddings_layer_names', 'embeddings_data', 'batch_size'
     }
 
     # Only allow kwargs that were supported in V1.
@@ -1327,6 +1479,48 @@
             if summary_writable:
               summary_ops_v2.keras_model('keras', self.model, step=0)
 
+    if self.embeddings_freq:
+      self._configure_embeddings()
+
+  def _configure_embeddings(self):
+    """Writes the Projector config for every `Embedding` layer of the model.
+
+    Raises ImportError (no projector plugin) or ValueError (unknown layer name).
+    """
+    # TODO(omalleyt): Add integration tests.
+    from tensorflow.python.keras.layers import embeddings
+    try:
+      from tensorboard.plugins import projector
+    except ImportError:
+      raise ImportError('Failed to import TensorBoard. Please make sure that '
+                        'TensorBoard integration is complete.')
+    metadata = self.embeddings_metadata
+    # Copy a per-layer mapping so `pop` does not consume the caller's dict.
+    unused = {} if isinstance(metadata, str) else dict(metadata or {})
+    config = projector.ProjectorConfig()
+    for layer in self.model.layers:
+      if isinstance(layer, embeddings.Embedding):
+        embedding = config.embeddings.add()
+        embedding.tensor_name = layer.embeddings.name
+        if isinstance(metadata, str):
+          embedding.metadata_path = metadata
+        elif layer.name in unused:
+          embedding.metadata_path = unused.pop(layer.name)
+    if unused:
+      raise ValueError('Unrecognized `Embedding` layer names passed to '
+                       '`keras.callbacks.TensorBoard` `embeddings_metadata` '
+                       'argument: ' + str(unused.keys()))
+
+    class DummyWriter(object):
+      """Dummy writer to conform to `Projector` API."""
+      def __init__(self, logdir):
+        self.logdir = logdir
+
+      def get_logdir(self):
+        return self.logdir
+
+    projector.visualize_embeddings(DummyWriter(self.log_dir), config)
+
   def _close_writers(self):
     """Close all remaining open file writers owned by this callback.
 
@@ -1342,7 +1536,7 @@
 
     A writer will be created if it does not yet exist.
 
-    Args:
+    Arguments:
       writer_name: The name of the directory for which to create or
         retrieve a writer. Should be either `self._train_run_name` or
         `self._validation_run_name`.
@@ -1365,6 +1559,10 @@
     """Writes scalar summaries for metrics on every training batch.
 
     Performs profiling if current batch is in profiler_batches.
+
+    Arguments:
+      batch: Integer, index of batch within the current epoch.
+      logs: Dict. Metric results for this batch.
     """
     # Don't output batch_size and batch number as TensorBoard summaries
     logs = logs or {}
@@ -1388,6 +1586,9 @@
     if self.histogram_freq and epoch % self.histogram_freq == 0:
       self._log_weights(epoch)
 
+    if self.embeddings_freq and epoch % self.embeddings_freq == 0:
+      self._log_embeddings(epoch)
+
   def on_train_end(self, logs=None):
     if self._is_tracing:
       self._log_trace()
@@ -1492,6 +1693,11 @@
     if len(shape) == 4 and shape[-1] in [1, 3, 4]:
       summary_ops_v2.image(weight_name, w_img, step=epoch)
 
+  def _log_embeddings(self, epoch):
+    """Saves the model weights under `log_dir/train` as an epoch checkpoint."""
+    ckpt_name = 'keras_embedding.ckpt-{}'.format(epoch)
+    self.model.save_weights(os.path.join(self.log_dir, 'train', ckpt_name))
+
 
 @keras_export('keras.callbacks.ReduceLROnPlateau')
 class ReduceLROnPlateau(Callback):
@@ -1512,22 +1718,20 @@
 
   Arguments:
       monitor: quantity to be monitored.
-      factor: factor by which the learning rate will
-          be reduced. new_lr = lr * factor
-      patience: number of epochs with no improvement
-          after which learning rate will be reduced.
+      factor: factor by which the learning rate will be reduced. new_lr = lr *
+        factor
+      patience: number of epochs with no improvement after which learning rate
+        will be reduced.
       verbose: int. 0: quiet, 1: update messages.
-      mode: one of {auto, min, max}. In `min` mode,
-          lr will be reduced when the quantity
-          monitored has stopped decreasing; in `max`
-          mode it will be reduced when the quantity
-          monitored has stopped increasing; in `auto`
-          mode, the direction is automatically inferred
-          from the name of the monitored quantity.
-      min_delta: threshold for measuring the new optimum,
-          to only focus on significant changes.
-      cooldown: number of epochs to wait before resuming
-          normal operation after lr has been reduced.
+      mode: one of {auto, min, max}. In `min` mode, lr will be reduced when the
+        quantity monitored has stopped decreasing; in `max` mode it will be
+        reduced when the quantity monitored has stopped increasing; in `auto`
+        mode, the direction is automatically inferred from the name of the
+        monitored quantity.
+      min_delta: threshold for measuring the new optimum, to only focus on
+        significant changes.
+      cooldown: number of epochs to wait before resuming normal operation after
+        lr has been reduced.
       min_lr: lower bound on the learning rate.
   """
 
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 8d8b361..5f86641 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -25,6 +25,7 @@
 import shutil
 import sys
 import threading
+import time
 import unittest
 
 from absl.testing import parameterized
@@ -43,6 +44,7 @@
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.training import adam
+from tensorflow.python.training import checkpoint_management
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -543,25 +545,31 @@
     model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])
     train_ds = get_input_datasets()
 
-    filepath = os.path.join(self.get_temp_dir(), 'checkpoint.h5')
+    temp_dir = self.get_temp_dir()
+    filepath = os.path.join(temp_dir, 'checkpoint.epoch{epoch:02d}.h5')
+    initial_epochs = 3
 
     # The filepath shouldn't exist at the beginning.
     self.assertFalse(os.path.exists(filepath))
-    model.fit(
-        train_ds,
-        epochs=3,
-        callbacks=[
-            keras.callbacks.ModelCheckpoint(
-                filepath=filepath, save_weights_only=True)
-        ])
+    callback = keras.callbacks.ModelCheckpoint(
+        filepath=filepath, save_weights_only=True)
+    model.fit(train_ds, epochs=initial_epochs, callbacks=[callback])
 
-    # The filepath should exist after fitting with callback.
-    self.assertTrue(os.path.exists(filepath))
+    # The files should exist after fitting with callback.
+    for epoch in range(initial_epochs):
+      self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1)))
+    self.assertFalse(os.path.exists(filepath.format(epoch=initial_epochs + 1)))
+    self.skipTest('b/131852849')
+    self.assertEqual(
+        callback._get_most_recently_modified_file_matching_pattern(filepath),
+        filepath.format(epoch=initial_epochs))
+
     model.fit(train_ds, epochs=1)
     weights_after_one_more_epoch = model.get_weights()
 
     # The filepath should continue to exist after fitting without callback.
-    self.assertTrue(os.path.exists(filepath))
+    for epoch in range(initial_epochs):
+      self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1)))
 
     return model, train_ds, filepath, weights_after_one_more_epoch
 
@@ -572,6 +580,17 @@
       (model, train_ds, filepath, weights_after_one_more_epoch
       ) = self._run_load_weights_on_restart_test_common_iterations()
 
+      callback = keras.callbacks.ModelCheckpoint(
+          filepath=filepath,
+          save_weights_only=save_weights_only,
+          load_weights_on_restart=True)
+      model.fit(train_ds, epochs=1, callbacks=[callback])
+      weights_after_model_restoring_and_one_more_epoch = model.get_weights()
+
+      self.assertEqual(
+          callback._get_most_recently_modified_file_matching_pattern(filepath),
+          filepath.format(epoch=1))
+
       model.fit(
           train_ds,
           epochs=1,
@@ -581,7 +600,7 @@
                   save_weights_only=save_weights_only,
                   load_weights_on_restart=True)
           ])
-      weights_after_model_restoring_and_one_more_epoch = model.get_weights()
+      weights_with_one_final_extra_epoch = model.get_weights()
 
       # Asserting the weights one epoch after initial fitting and another epoch
       # after that are closed, if a ModelCheckpoint with
@@ -590,6 +609,9 @@
       self.assertAllClose(weights_after_one_more_epoch,
                           weights_after_model_restoring_and_one_more_epoch)
 
+      self.assertNotAllClose(weights_after_one_more_epoch,
+                             weights_with_one_final_extra_epoch)
+
     return func
 
   @staticmethod
@@ -633,16 +655,14 @@
     (model, train_ds, filepath,
      _) = self._run_load_weights_on_restart_test_common_iterations()
 
-    model.load_weights(filepath)
+    callback = keras.callbacks.ModelCheckpoint(
+        filepath=filepath, save_weights_only=True)
+    model.load_weights(
+        callback._get_most_recently_modified_file_matching_pattern(filepath))
     weights_before_additional_fit = model.get_weights()
-    model.fit(
-        train_ds,
-        epochs=1,
-        callbacks=[
-            keras.callbacks.ModelCheckpoint(
-                filepath=filepath, save_weights_only=True)
-        ])
-    model.load_weights(filepath)
+    model.fit(train_ds, epochs=1, callbacks=[callback])
+    model.load_weights(
+        callback._get_most_recently_modified_file_matching_pattern(filepath))
     weights_after_additional_fit = model.get_weights()
 
     self.assertNotAllClose(weights_before_additional_fit,
@@ -1260,7 +1280,8 @@
         keras.layers.Dense(1)
     ]
     model = testing_utils.get_model_from_layers(layers, input_shape=(10, 10, 1))
-    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    opt = gradient_descent.SGD(learning_rate=0.001)
+    model.compile(opt, 'mse', run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   def test_TensorBoard_default_logdir(self):
@@ -1489,7 +1510,8 @@
         keras.layers.Flatten(),
         keras.layers.Dense(1),
     ])
-    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    opt = gradient_descent.SGD(learning_rate=0.001)
+    model.compile(opt, 'mse', run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   def fitModelAndAssertKerasModelWritten(self, model):
@@ -1602,5 +1624,98 @@
     self.assertEmpty(summary_file.tensors)
 
 
+class MostRecentlyModifiedFileMatchingPatternTest(test.TestCase):
+  """Tests for `_get_most_recently_modified_file_matching_pattern`."""
+
+  def test_get_most_recently_modified_file_matching_pattern(self):
+    """The most recently written file matching the pattern is returned."""
+    file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5'
+    test_dir = self.get_temp_dir()
+    path_pattern = os.path.join(test_dir, file_pattern)
+    file_paths = [
+        os.path.join(test_dir, file_name) for file_name in
+        ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.batch01epoch01.h5']
+    ]
+    for file_path in file_paths:
+      with open(file_path, 'w') as f:
+        # Space out the writes so the files get distinct modification times.
+        time.sleep(2)
+        f.write('foo bar')
+    # Ensure the files have actually been written.
+    self.assertEqual(
+        {os.path.join(test_dir, name) for name in os.listdir(test_dir)},
+        set(file_paths))
+    self.assertEqual(
+        keras.callbacks.ModelCheckpoint(None)
+        ._get_most_recently_modified_file_matching_pattern(path_pattern),
+        file_paths[-1])
+
+  def test_some_file_not_matching_pattern(self):
+    """Files that do not match the pattern are ignored, even if newer."""
+    file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5'
+    test_dir = self.get_temp_dir()
+    path_pattern = os.path.join(test_dir, file_pattern)
+    file_paths = [
+        os.path.join(test_dir, file_name) for file_name in
+        ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.baatch01epoch01.h5']
+    ]
+    for file_path in file_paths:
+      with open(file_path, 'w') as f:
+        # Space out the writes so the files get distinct modification times.
+        time.sleep(2)
+        f.write('foo bar')
+    self.assertEqual(
+        keras.callbacks.ModelCheckpoint(None)
+        ._get_most_recently_modified_file_matching_pattern(path_pattern),
+        file_paths[-2])
+
+  def test_get_same_file_if_file_name_equals_pattern(self):
+    file_name = 'f.batch02.h5'
+    test_dir = self.get_temp_dir()
+    file_path = os.path.join(test_dir, file_name)
+    with open(file_path, 'w') as f:
+      f.write('foo bar')
+    self.assertEqual(os.path.join(test_dir, os.listdir(test_dir)[0]), file_path)
+    self.assertEqual(
+        keras.callbacks.ModelCheckpoint(
+            None)._get_most_recently_modified_file_matching_pattern(file_path),
+        file_path)
+
+  def test_get_none_if_file_does_not_exist(self):
+    file_name = 'f.batch02.h5'
+    test_dir = self.get_temp_dir()
+    file_path = os.path.join(test_dir, file_name)
+    self.assertLen(os.listdir(test_dir), 0)
+    self.assertIsNone(
+        keras.callbacks.ModelCheckpoint(
+            None)._get_most_recently_modified_file_matching_pattern(file_path))
+
+  def test_using_checkpoint_management_latest_checkpoint(self):
+    file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}'
+    ckpt_file_name = 'f.batchXepochY'
+    test_dir = self.get_temp_dir()
+    path_pattern = os.path.join(test_dir, file_pattern)
+    ckpt_file_path = os.path.join(test_dir, ckpt_file_name)
+    with open(ckpt_file_path, 'w') as f:
+      f.write('dummy ckpt')
+    checkpoint_management.update_checkpoint_state_internal(
+        test_dir, ckpt_file_path)
+
+    file_paths = [
+        os.path.join(test_dir, file_name)
+        for file_name in ['f.batch03epoch02', 'f.batch02epoch02']
+    ]
+    for file_path in file_paths:
+      with open(file_path, 'w') as f:
+        f.write('foo bar')
+
+    # The result returned from checkpoint_management.latest_checkpoint takes
+    # priority, so even if it was written earlier, we should still return that.
+    self.assertEqual(
+        keras.callbacks.ModelCheckpoint(None)
+        ._get_most_recently_modified_file_matching_pattern(path_pattern),
+        ckpt_file_path)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/custom_training_loop_test.py b/tensorflow/python/keras/custom_training_loop_test.py
new file mode 100644
index 0000000..1084fbf
--- /dev/null
+++ b/tensorflow/python/keras/custom_training_loop_test.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class LayerWithLosses(keras.layers.Layer):
+  """Multiplies inputs by a scalar weight and registers an extra loss."""
+
+  def build(self, input_shape):
+    l1_penalty = keras.regularizers.l1(100)
+    self.v = self.add_weight(
+        name='hey', shape=(), initializer='ones', regularizer=l1_penalty)
+
+  def call(self, inputs):
+    input_total = math_ops.reduce_sum(inputs)
+    self.add_loss(input_total)
+    return self.v * inputs
+
+
+def add_loss_step(defun):
+  """Runs one Adam step on a LayerWithLosses model and returns the loss."""
+  optimizer = keras.optimizer_v2.adam.Adam()
+  model = testing_utils.get_model_from_layers(
+      [LayerWithLosses()], input_shape=(10,))
+
+  def train_step(x):
+    with backprop.GradientTape() as tape:
+      model(x)
+      # One loss from the weight regularizer, one from `add_loss` in `call`.
+      assert len(model.losses) == 2
+      loss = math_ops.reduce_sum(model.losses)
+    weights = model.trainable_weights
+    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
+    return loss
+
+  step_fn = def_function.function(train_step) if defun else train_step
+  x = array_ops.ones((10, 10))
+  return step_fn(x)
+
+
+def batch_norm_step(defun):
+  """Runs one Adadelta step on a BatchNormalization + Dense model."""
+  optimizer = keras.optimizer_v2.adadelta.Adadelta()
+  layers = [
+      keras.layers.BatchNormalization(momentum=0.9),
+      keras.layers.Dense(1, kernel_initializer='zeros', activation='softmax')
+  ]
+  model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+
+  def train_step(x, y):
+    with backprop.GradientTape() as tape:
+      y_pred = model(x, training=True)
+      loss = keras.losses.binary_crossentropy(y, y_pred)
+    weights = model.trainable_weights
+    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
+    # Also return inference-mode predictions computed after the update.
+    return loss, model(x, training=False)
+
+  step_fn = def_function.function(train_step) if defun else train_step
+  features, labels = array_ops.ones((10, 10)), array_ops.ones((10, 1))
+  return step_fn(features, labels)
+
+
+@keras_parameterized.run_with_all_model_types
+class CustomTrainingLoopTest(keras_parameterized.TestCase):
+  """Compares each step run eagerly against the same step in a function."""
+
+  @parameterized.named_parameters(
+      ('add_loss_step', add_loss_step), ('batch_norm_step', batch_norm_step))
+  def test_eager_and_tf_function(self, train_step):
+    expected, actual = train_step(defun=False), train_step(defun=True)
+    self.assertAllClose(expected, actual)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()  # Turn on eager mode before running tests.
+  test.main()
diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py
index 022a9b7..e3a03c8 100644
--- a/tensorflow/python/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/datasets/imdb.py
@@ -82,7 +82,7 @@
       path,
       origin=origin_folder + 'imdb.npz',
       file_hash='599dadb1135973df5b59232a0e9a887c')
-  with np.load(path) as f:
+  with np.load(path, allow_pickle=True) as f:
     x_train, labels_train = f['x_train'], f['y_train']
     x_test, labels_test = f['x_test'], f['y_test']
 
diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD
index 7bfab1a..dc7b1e0 100644
--- a/tensorflow/python/keras/distribute/BUILD
+++ b/tensorflow/python/keras/distribute/BUILD
@@ -23,7 +23,10 @@
         "//tensorflow/python/distribute:distribute_coordinator",
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:one_device_strategy",
         "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/keras:activations",
         "//tensorflow/python/keras:backend",
         "//tensorflow/python/keras:callbacks",
@@ -52,9 +55,11 @@
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/distribute:mirrored_strategy",
         "//tensorflow/python/distribute:strategy_combinations",
         "//tensorflow/python/distribute:tpu_strategy",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
@@ -68,7 +73,7 @@
     srcs = ["distribute_strategy_test.py"],
     full_precision = True,
     main = "distribute_strategy_test.py",
-    shard_count = 32,
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
@@ -108,11 +113,14 @@
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/distribute:mirrored_strategy",
         "//tensorflow/python/distribute:strategy_combinations",
         "//tensorflow/python/distribute:tpu_strategy",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras",
+        "//tensorflow/python/keras:backend",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -144,9 +152,7 @@
     srcs = ["keras_embedding_model_correctness_test.py"],
     full_precision = True,
     main = "keras_embedding_model_correctness_test.py",
-    # Shard count is set to an odd number to distribute tasks across
-    # shards more evenly.
-    shard_count = 31,
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # b/129793413
@@ -164,9 +170,7 @@
     srcs = ["keras_image_model_correctness_test.py"],
     full_precision = True,
     main = "keras_image_model_correctness_test.py",
-    # Shard count is set to an odd number to distribute tasks across
-    # shards more evenly.
-    shard_count = 31,
+    shard_count = 8,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # b/129793413
@@ -191,6 +195,7 @@
         "multi_and_single_gpu",
         "no_oss",  # b/129793413
         "no_windows_gpu",
+        "notap",  # b/131937016
         "notsan",
     ],
     deps = [
@@ -204,9 +209,7 @@
     srcs = ["keras_stateful_lstm_model_correctness_test.py"],
     full_precision = True,
     main = "keras_stateful_lstm_model_correctness_test.py",
-    # Shard count is set to an odd number to distribute tasks across
-    # shards more evenly.
-    shard_count = 31,
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
@@ -223,7 +226,7 @@
     srcs = ["keras_utils_test.py"],
     full_precision = True,
     main = "keras_utils_test.py",
-    shard_count = 32,
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # b/129793413
@@ -270,28 +273,6 @@
     ],
 )
 
-distribute_py_test(
-    name = "keras_two_input_layers_correctness_test",
-    size = "medium",
-    srcs = ["keras_two_input_layers_correctness_test.py"],
-    full_precision = True,
-    main = "keras_two_input_layers_correctness_test.py",
-    # Shard count is set to an odd number to distribute tasks across
-    # shards more evenly.
-    shard_count = 5,
-    tags = [
-        "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_windows_gpu",
-        "notsan",
-    ],
-    deps = [
-        ":keras_correctness_test_lib",
-        "//tensorflow/python/distribute:combinations",
-        "//tensorflow/python/distribute:strategy_combinations",
-    ],
-)
-
 cuda_py_test(
     name = "multi_worker_test",
     srcs = ["multi_worker_test.py"],
@@ -299,7 +280,7 @@
     shard_count = 32,
     tags = [
         "no_oss",  # TODO(b/130369494): Investigate why it times out on OSS.
-        # TODO(b/124344198): Add "multi_and_single_gpu",
+        # TODO(b/123307453): Add "multi_and_single_gpu",
     ],
 )
 
@@ -346,7 +327,66 @@
         "//tensorflow/python/distribute:distribute_coordinator",
         "//tensorflow/python/keras",
     ],
-    shard_count = 12,
+    shard_count = 14,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/132384649): Flakily times out.
+    ],
+)
+
+py_binary(
+    name = "mnist_multi_worker",
+    srcs = ["mnist_multi_worker.py"],  # Same file as the _lib target's srcs.
+    python_version = "PY2",
+    deps = [":mnist_multi_worker_lib"],
+)
+
+py_library(
+    name = "mnist_multi_worker_lib",
+    srcs = [
+        "mnist_multi_worker.py",  # Shared with the :mnist_multi_worker binary.
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:multi_worker_test_base",
+        "//tensorflow/python/distribute:parameter_server_strategy",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/keras:backend",
+        "//tensorflow/python/keras:callbacks",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:optimizers",
+        "//tensorflow/python/keras/optimizer_v2",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "multi_worker_optimizer_comparison_test",
+    srcs = ["multi_worker_optimizer_comparison_test.py"],
+    additional_deps = [
+        ":mnist_multi_worker_lib",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/keras/distribute:multi_worker_test",
+        "//tensorflow/python/distribute:multi_worker_test_base",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/keras",
+    ],
     tags = [
         "multi_and_single_gpu",
     ],
diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py
index 079adb0..9954067 100644
--- a/tensorflow/python/keras/distribute/distribute_strategy_test.py
+++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.distribute import strategy_combinations
 from tensorflow.python.distribute import tpu_strategy
@@ -34,6 +35,7 @@
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.distribute import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_keras
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
@@ -114,7 +116,7 @@
       inputs=[input_a, input_b, input_m], outputs=[output_c, output_d])
   model.compile(
       loss='categorical_crossentropy',
-      optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+      optimizer=gradient_descent_keras.SGD(learning_rate=0.001),
       metrics={
           'dense_2': 'categorical_accuracy',
           'dense_3': 'categorical_accuracy'
@@ -289,6 +291,18 @@
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
+def all_strategy_combinations_plus_cloning():
+  """Returns non-TPU and TPU strategy combinations with cloning on and off."""
+  # Non-TPU strategies use graph and eager modes; TPU ones use graph only.
+  non_tpu = combinations.combine(
+      distribution=strategies_minus_tpu,
+      mode=['graph', 'eager'],
+      cloning=[True, False])
+  tpu = combinations.combine(
+      distribution=tpu_strategies, mode=['graph'], cloning=[True, False])
+  return non_tpu + tpu
+
+
 def all_strategy_minus_default_and_tpu_combinations():
   return combinations.combine(
       distribution=[
@@ -308,16 +322,23 @@
 def strategy_and_optimizer_combinations():
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(optimizer=[
-          strategy_combinations.adagrad_optimizer_v1_fn,
-          strategy_combinations.adagrad_optimizer_keras_v2_fn,
-          strategy_combinations.adam_optimizer_v1_fn,
-          strategy_combinations.adam_optimizer_keras_v2_fn,
-          strategy_combinations.gradient_descent_optimizer_v1_fn,
-          strategy_combinations.gradient_descent_optimizer_keras_v2_fn,
-          strategy_combinations.rmsprop_optimizer_v1_fn,
-          strategy_combinations.rmsprop_optimizer_keras_v2_fn
-      ]))
+      # TODO(b/130808953):  Simplify when optimizers v1 work with cloning=False.
+      combinations.combine(
+          optimizer=[
+              strategy_combinations.adagrad_optimizer_v1_fn,
+              strategy_combinations.adam_optimizer_v1_fn,
+              strategy_combinations.gradient_descent_optimizer_v1_fn,
+              strategy_combinations.rmsprop_optimizer_v1_fn,
+          ],
+          cloning=True) +
+      combinations.combine(
+          optimizer=[
+              strategy_combinations.adagrad_optimizer_keras_v2_fn,
+              strategy_combinations.adam_optimizer_keras_v2_fn,
+              strategy_combinations.gradient_descent_optimizer_keras_v2_fn,
+              strategy_combinations.rmsprop_optimizer_keras_v2_fn
+          ],
+          cloning=[True, False]))
 
 
 class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
@@ -343,25 +364,29 @@
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
               strategy_combinations.mirrored_strategy_with_two_gpus
           ],
-          mode=['graph']))
-  def test_train_functional_with_distribution_strategy(self, distribution):
+          mode=['graph'],
+          cloning=[True, False]))
+  def test_train_functional_with_distribution_strategy(self, distribution,
+                                                       cloning):
     keras_model = simple_functional_model()
     keras_model.compile(
         loss='categorical_crossentropy',
         metrics=[keras.metrics.CategoricalAccuracy()],
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution,
-                                      eval_distribute=distribution)
+        optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
+        cloning=cloning)
+    config = run_config_lib.RunConfig(
+        tf_random_seed=_RANDOM_SEED,
+        model_dir=self._base_dir,
+        train_distribute=distribution,
+        eval_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
       before_eval_results = est_keras.evaluate(
           input_fn=get_ds_test_input_fn, steps=1)
       est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn,
-                                              steps=1)
+      after_eval_results = est_keras.evaluate(
+          input_fn=get_ds_test_input_fn, steps=1)
       self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
 
     writer_cache.FileWriterCache.clear()
@@ -373,24 +398,28 @@
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
               strategy_combinations.mirrored_strategy_with_two_gpus
           ],
-          mode=['graph']))
-  def test_train_sequential_with_distribution_strategy(self, distribution):
+          mode=['graph'],
+          cloning=[True, False]))
+  def test_train_sequential_with_distribution_strategy(self, distribution,
+                                                       cloning):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
         metrics=[keras.metrics.CategoricalAccuracy()],
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution)
+        optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
+        cloning=cloning)
+    config = run_config_lib.RunConfig(
+        tf_random_seed=_RANDOM_SEED,
+        model_dir=self._base_dir,
+        train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
       before_eval_results = est_keras.evaluate(
           input_fn=get_ds_test_input_fn, steps=1)
       est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn,
-                                              steps=1)
+      after_eval_results = est_keras.evaluate(
+          input_fn=get_ds_test_input_fn, steps=1)
       self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
 
     writer_cache.FileWriterCache.clear()
@@ -449,32 +478,6 @@
       eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
       self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
 
-  @combinations.generate(
-      combinations.combine(
-          distribution=[
-              strategy_combinations.mirrored_strategy_with_gpu_and_cpu
-          ],
-          mode=['graph']))
-  def test_keras_optimizer_with_distribution_strategy(self, distribution):
-    keras_model = simple_sequential_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=keras.optimizers.rmsprop(lr=0.01))
-
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution)
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
-                                               config=config)
-      with self.assertRaisesRegexp(ValueError,
-                                   'Only TensorFlow native optimizers are '
-                                   'supported with DistributionStrategy.'):
-        est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-
-    writer_cache.FileWriterCache.clear()
-    gfile.DeleteRecursively(self._config.model_dir)
-
 
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
@@ -607,15 +610,21 @@
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=10, batch_size=13)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_calling_model_with_numpy_arrays(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_calling_model_with_numpy_arrays(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning or not distribution_strategy_context.has_strategy()
+            else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.001)
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
         inputs = np.zeros((64, 3), dtype=np.float32)
         targets = np.zeros((64, 4), dtype=np.float32)
@@ -638,15 +647,19 @@
         # with batch_size
         model.predict(inputs, batch_size=8)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_calling_model_with_nested_numpy_arrays(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_calling_model_with_nested_numpy_arrays(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         model = multi_input_output_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(
-            learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
       input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32)
@@ -674,14 +687,22 @@
       model.predict(inputs, batch_size=8)
 
   @combinations.generate(
-      combinations.combine(distribution=strategies_minus_tpu, mode=['graph']))
-  def test_numpy_with_sample_weights(self, distribution):
+      combinations.combine(
+          distribution=strategies_minus_tpu,
+          mode=['graph'],
+          cloning=[True, False]))
+  def test_numpy_with_sample_weights(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            rmsprop.RMSPropOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         model = get_model()
-        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       inputs = np.zeros((20, 3), np.float32)
       targets = np.zeros((20, 4), np.float32)
@@ -690,15 +711,19 @@
       model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
                 steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_flatten_predict_outputs(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_flatten_predict_outputs(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = multi_input_output_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(
-            learning_rate=0.001)
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       # We take 6 input samples with each input having a dimension of 3 or 5.
       input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32)
@@ -715,15 +740,60 @@
       self.assertAllEqual([6, 7], outs[0].shape)
       self.assertAllEqual([6, 7], outs[1].shape)
 
-  @combinations.generate(tpu_strategy_combinations())
-  def test_predict_with_partial_batch(self, distribution):
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(batch_size=[4, 6])))
+  def test_evaluate_with_partial_batch(self, distribution, batch_size):
+    """Checks `evaluate()` metrics when the final batch is a partial batch."""
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+
+      with distribution.scope():
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss, metrics=metrics)
+
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss, metrics=metrics)
+
+      x = np.random.random((10, 3)).astype('float32')
+      y = np.random.random((10, 4)).astype('float32')
+
+      # Ten samples are not a multiple of `batch_size` (4 or 6), so the last
+      # batch is partial. Per the original authors, `evaluate()` on numpy
+      # inputs without a distribution strategy uses the entire sample as a
+      # single batch, so `batch_size` and `steps` are omitted for `cpu_model`.
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      evaluate_ground_truth = cpu_model.evaluate(x, y)
+
+      # Skip the loss (index 0): it is not computed as a metric, so its value
+      # is skewed by the extra weight given to samples in the partial batch.
+      steps = np.ceil(10.0 / batch_size)  # Includes the partial batch.
+      self.assertAllClose(
+          model_with_ds_strategy.evaluate(
+              x, y, batch_size=batch_size, steps=steps)[1:],
+          evaluate_ground_truth[1:],
+          atol=1e-5,
+          rtol=1e-5)
+      # Test that `steps` is inferred correctly when final partial batch exists.
+      self.assertAllClose(
+          model_with_ds_strategy.evaluate(x, y, batch_size=batch_size)[1:],
+          evaluate_ground_truth[1:],
+          atol=1e-5,
+          rtol=1e-5)
+
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_predict_with_partial_batch(self, distribution, cloning):
     with self.cached_session():
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
 
       with distribution.scope():
         model_with_ds_strategy = get_model()
-        model_with_ds_strategy.compile(optimizer, loss)
+        model_with_ds_strategy.compile(optimizer, loss, cloning=cloning)
 
       cpu_model = get_model()
       cpu_model.compile(optimizer, loss)
@@ -749,15 +819,42 @@
           rtol=1e-5)
 
   @combinations.generate(tpu_strategy_combinations())
+  def test_no_target_model(self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+
+      class MyLayer(keras.layers.Layer):  # Pass-through layer with add_loss.
+
+        def call(self, inputs, training=None):
+          self.add_loss(math_ops.reduce_sum(inputs), inputs=True)
+          return inputs
+
+      with distribution.scope():
+        model = keras.models.Sequential()
+        model.add(keras.layers.Dense(16, activation='relu',
+                                     input_shape=_INPUT_SIZE))
+        model.add(MyLayer())
+        model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax'))
+
+        model.compile(optimizer)  # No loss passed; MyLayer adds one.
+        inputs = np.zeros((20, 10), np.float32)
+
+        model.fit(inputs, epochs=1, steps_per_epoch=2)  # No targets given.
+        model.predict(inputs, steps=1)
+        model.evaluate(inputs, steps=1)
+
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
   def test_predict_multi_output_model_with_partial_batch(
-      self, distribution):
+      self, distribution, cloning):
     with self.cached_session():
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
 
       with distribution.scope():
         model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
-        model_with_ds_strategy.compile(optimizer, loss)
+        model_with_ds_strategy.compile(optimizer, loss, cloning=cloning)
 
       cpu_model = simple_multi_inputs_multi_outputs_model()
       cpu_model.compile(optimizer, loss)
@@ -782,15 +879,20 @@
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
 
-  @combinations.generate(all_strategy_combinations())
-  def test_calling_model_on_same_dataset(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_calling_model_on_same_dataset(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.001)
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       dataset = get_dataset(distribution)
 
@@ -801,22 +903,30 @@
                 validation_data=dataset, validation_steps=2)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_model_interleaved_eval_same_as_direct_eval(self, distribution,
+                                                      cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
         user_controlled_model = get_model()
         user_controlled_model.compile(
-            gradient_descent.GradientDescentOptimizer(0.001),
+            optimizer_fn(0.001),
             loss='mse',
-            metrics=['mae', keras.metrics.CategoricalAccuracy()])
+            metrics=['mae', keras.metrics.CategoricalAccuracy()],
+            cloning=cloning)
 
         interleaved_model = get_model()
         interleaved_model.set_weights(user_controlled_model.get_weights())
         interleaved_model.compile(
-            gradient_descent.GradientDescentOptimizer(0.001),
+            optimizer_fn(0.001),
             loss='mse',
-            metrics=['mae', keras.metrics.CategoricalAccuracy()])
+            metrics=['mae', keras.metrics.CategoricalAccuracy()],
+            cloning=cloning)
 
       dataset = get_dataset(distribution)
 
@@ -848,22 +958,25 @@
   # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
   # as clone_model's input_tensors argument only seems to accept list and not
   # tuples or dict.
-
   @combinations.generate(
       combinations.combine(
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu
           ],
-          mode=['graph', 'eager']))
-  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
+          mode=['graph', 'eager'], cloning=[True, False]))
+  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         model = multi_input_output_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(
-            learning_rate=0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       input_a_np = np.random.random((10, 3))
       input_b_np = np.random.random((10, 5))
@@ -887,16 +1000,21 @@
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(all_strategy_combinations())
+  @combinations.generate(all_strategy_combinations_plus_cloning())
   def test_fit_eval_and_predict_methods_on_dataset_without_steps(
-      self, distribution):
+      self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.001)
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       inputs = np.zeros((1000, 3), dtype=np.float32)
       targets = np.zeros((1000, 4), dtype=np.float32)
@@ -921,16 +1039,23 @@
       self.assertAllClose(
           predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
 
-  @combinations.generate(strategy_minus_tpu_combinations())
+  @combinations.generate(
+      combinations.times(strategy_minus_tpu_combinations(),
+                         combinations.combine(cloning=[True, False])))
   def test_on_dataset_with_unknown_cardinality_without_steps(
-      self, distribution):
+      self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.001)
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       inputs = np.zeros((1000, 3), dtype=np.float32)
       targets = np.zeros((1000, 4), dtype=np.float32)
@@ -970,15 +1095,20 @@
           fit_with_numpy_multiple_epochs,
           fit_with_ds_multiple_epochs, atol=1e-4, rtol=1e-4)
 
-  @combinations.generate(tpu_strategy_combinations())
-  def test_on_dataset_with_unknown_cardinality(self, distribution):
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_on_dataset_with_unknown_cardinality(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(
+            gradient_descent.GradientDescentOptimizer(0.001),
+            loss,
+            metrics=metrics,
+            cloning=cloning)
 
       inputs = np.zeros((1000, 3), dtype=np.float32)
       targets = np.zeros((1000, 4), dtype=np.float32)
@@ -992,15 +1122,16 @@
       predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality(
           inputs)
 
-      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
-          dataset)), cardinality.UNKNOWN)
-      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
-          predict_dataset)), cardinality.UNKNOWN)
+      self.assertEqual(
+          keras.backend.get_value(cardinality.cardinality(dataset)),
+          cardinality.UNKNOWN)
+      self.assertEqual(
+          keras.backend.get_value(cardinality.cardinality(predict_dataset)),
+          cardinality.UNKNOWN)
 
       eval_with_ds = model.evaluate(dataset, steps=100)
       predict_with_ds = model.predict(predict_dataset, steps=100)
-      self.assertAllClose(
-          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
       self.assertAllClose(
           predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
 
@@ -1008,15 +1139,20 @@
                                    'Number of steps could not be infered'):
         model.fit(dataset, epochs=1)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_fit_eval_and_predict_methods_on_dataset(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.001)
         model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       dataset = get_dataset(distribution)
 
@@ -1025,12 +1161,14 @@
       model.predict(get_predict_dataset(distribution), steps=2)
 
   @combinations.generate(strategy_and_optimizer_combinations())
-  def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer):
+  def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer,
+                                               cloning):
     with self.cached_session():
       with distribution.scope():
+
         model = get_model()
         loss = 'mse'
-        model.compile(optimizer(), loss)
+        model.compile(optimizer(), loss, cloning=cloning)
 
       dataset = get_dataset(distribution)
 
@@ -1038,14 +1176,21 @@
       model.evaluate(dataset, steps=2, verbose=1)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  @combinations.generate(strategy_minus_tpu_combinations())
-  def test_dataset_with_sample_weights(self, distribution):
+  @combinations.generate(
+      combinations.times(strategy_minus_tpu_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_dataset_with_sample_weights(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = get_model()
-        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            rmsprop.RMSPropOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       inputs = np.zeros((10, 3), np.float32)
       targets = np.zeros((10, 4), np.float32)
@@ -1064,16 +1209,21 @@
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu
           ],
-          mode=['graph', 'eager']))
+          mode=['graph', 'eager'], cloning=[True, False]))
   # TODO(b/120943676, b/120957836): Re-enable once the validation code is
   # restored.
-  def DISABLED_test_dataset_wrong_input_shape(self, distribution):
+  def DISABLED_test_dataset_wrong_input_shape(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            rmsprop.RMSPropOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(learning_rate=0.001)
         model = get_model()
-        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       # Wrong input shape
       inputs = np.zeros((10, 5), dtype=np.float32)
@@ -1091,16 +1241,18 @@
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu
           ],
-          mode=['graph', 'eager']))
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
   # TODO(b/120943676, b/120957836): Re-enable once the validation code is
   # restored.
-  def DISABLED_test_dataset_no_batch_input_validation(self, distribution):
+  def DISABLED_test_dataset_no_batch_input_validation(self, distribution,
+                                                      cloning):
     with self.cached_session():
       with distribution.scope():
         model = get_model()
         optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       # User forgets to batch the dataset
       inputs = np.zeros((10, 3), dtype=np.float32)
@@ -1113,32 +1265,12 @@
 
   @combinations.generate(
       combinations.combine(
-          distribution=[strategy_combinations.tpu_strategy_one_step],
-          mode=['graph']))
-  def test_dataset_input_shape_fully_defined(self, distribution):
-    with self.cached_session():
-      with distribution.scope():
-        model = get_model()
-        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-        loss = 'mse'
-        model.compile(optimizer, loss)
-
-      dataset = get_dataset(distribution)
-      # Input shapes are not fully known. Batch dimension is unknown as we are
-      # not using the drop_remainder argument.
-      dataset = dataset.repeat(100).batch(10)
-
-      with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
-
-  @combinations.generate(
-      combinations.combine(
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
               strategy_combinations.mirrored_strategy_with_two_gpus
           ],
-          mode=['graph', 'eager']))
-  def test_learning_phase_value(self, distribution):
+          mode=['graph', 'eager'], cloning=[True, False]))
+  def test_learning_phase_value(self, distribution, cloning):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
@@ -1150,10 +1282,15 @@
         model = keras.Model(x, z)
         initial_weights = model.get_weights()
 
-        optimizer = gradient_descent.GradientDescentOptimizer(0.005)
+        # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+        # mirrored.
+        optimizer_fn = (
+            gradient_descent.GradientDescentOptimizer
+            if cloning else gradient_descent_keras.SGD)
+        optimizer = optimizer_fn(0.005)
         loss = 'mse'
         metrics = ['acc']
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       batch_size = 8
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
@@ -1182,14 +1319,14 @@
       ref_output = np.ones((160, 1), dtype=np.float32)
       self.assertArrayNear(output, ref_output, 1e-1)
 
-  @combinations.generate(all_strategy_combinations())
-  def testOptimizerWithCallbacks(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def testOptimizerWithCallbacks(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = get_model()
         optimizer = gradient_descent_keras.SGD(0.01)
         loss = 'mse'
-        model.compile(optimizer, loss)
+        model.compile(optimizer, loss, cloning=cloning)
 
       dataset = get_dataset(distribution)
 
@@ -1200,15 +1337,58 @@
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
       self.assertAllClose(0.001, keras.backend.get_value(model.optimizer.lr))
 
-  @combinations.generate(tpu_strategy_combinations())
-  def test_predict_with_dataset_with_partial_batch(self, distribution):
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(batch_size=[4, 6])))
+  def test_evaluate_with_dataset_with_partial_batch(self, distribution,
+                                                    batch_size):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+
+      with distribution.scope():
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss, metrics=metrics)
+
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss, metrics=metrics)
+
+      x = np.random.random((10, 3)).astype('float32')
+      y = np.random.random((10, 4)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+
+      # With 10 samples, batch sizes of 4 and 6 both leave a partial last batch.
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      dataset_with_partial_batch = dataset.batch(batch_size)
+
+      # We don't compare the loss: it is currently not computed as a metric in
+      # Keras, so its value is inaccurate for the last partial batch because
+      # the samples in that batch carry more weight.
+      steps = int(np.ceil(10.0 / batch_size))  # np.ceil returns a float.
+      self.assertAllClose(
+          model_with_ds_strategy.evaluate(
+              dataset_with_partial_batch, steps=steps)[1:],
+          cpu_model.evaluate(dataset_with_partial_batch, steps=steps)[1:],
+          atol=1e-5,
+          rtol=1e-5)
+      self.assertAllClose(  # No `steps`: it must be inferred from the dataset.
+          model_with_ds_strategy.evaluate(dataset_with_partial_batch)[1:],
+          cpu_model.evaluate(dataset_with_partial_batch)[1:],
+          atol=1e-5,
+          rtol=1e-5)
+
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_predict_with_dataset_with_partial_batch(self, distribution, cloning):
     with self.cached_session():
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
 
       with distribution.scope():
         model_with_ds_strategy = get_model()
-        model_with_ds_strategy.compile(optimizer, loss)
+        model_with_ds_strategy.compile(optimizer, loss, cloning=cloning)
 
       cpu_model = get_model()
       cpu_model.compile(optimizer, loss)
@@ -1224,18 +1404,21 @@
       self.assertAllClose(
           model_with_ds_strategy.predict(dataset_with_partial_batch, steps=3),
           cpu_model.predict(dataset_with_partial_batch, steps=3),
-          atol=1e-5, rtol=1e-5)
+          atol=1e-5,
+          rtol=1e-5)
 
-  @combinations.generate(tpu_strategy_combinations())
+  @combinations.generate(
+      combinations.times(tpu_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
   def test_predict_multi_output_model_with_dataset_with_partial_batch(
-      self, distribution):
+      self, distribution, cloning):
     with self.cached_session():
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
 
       with distribution.scope():
         model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
-        model_with_ds_strategy.compile(optimizer, loss)
+        model_with_ds_strategy.compile(optimizer, loss, cloning=cloning)
 
       cpu_model = simple_multi_inputs_multi_outputs_model()
       cpu_model.compile(optimizer, loss)
@@ -1258,6 +1441,59 @@
           cpu_model.predict(dataset_with_partial_batch, steps=12),
           atol=1e-4, rtol=1e-4)
 
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_match_model_input_matches_with_dataset_tensors(self, distribution):
+
+    def _create_model_input_output_tensors():
+      input_a = keras.layers.Input(shape=(16,), name='z_input_sorted_last')
+      input_b = keras.layers.Input(shape=(32,), name='a_input_sorted_first')
+      intermediate_a = keras.layers.Dense(10)(input_a)
+      intermediate_b = keras.layers.Dense(10)(input_b)
+      merged = keras.layers.Add()([intermediate_a, intermediate_b])
+      output = keras.layers.Dense(2)(merged)
+      return input_a, input_b, output
+
+    input_dict = {
+        'z_input_sorted_last': np.random.rand(32, 16).astype(np.float32),
+        'a_input_sorted_first': np.random.rand(32, 32).astype(np.float32)
+    }
+    target = np.ones((32, 2), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((input_dict, target))
+    dataset = dataset.batch(4, drop_remainder=True)
+
+    with self.cached_session():
+      with distribution.scope():
+        input_a, input_b, output = _create_model_input_output_tensors()
+        # `input_a`, whose input name comes last in alphanumeric order, is
+        # the first of the model's input layers. If the tensors from
+        # `input_dict` were blindly flattened and passed to the model inputs,
+        # the `input_a` input layer would be matched with the
+        # `a_input_sorted_first` tensor, which would result in a shape
+        # mismatch.
+        model_with_array_input = keras.models.Model(
+            inputs=[input_a, input_b], outputs=output)
+        model_with_array_input.compile('sgd', 'mse')
+        model_weights = model_with_array_input.get_weights()  # Before fit.
+        model_with_array_input_fit = model_with_array_input.fit(
+            dataset, steps_per_epoch=1, epochs=1).history
+
+        input_a, input_b, output = _create_model_input_output_tensors()
+        model_with_dict_input = keras.models.Model(
+            inputs={
+                'z_input_sorted_last': input_a,
+                'a_input_sorted_first': input_b,
+            },
+            outputs=output)
+        model_with_dict_input.compile('sgd', 'mse')
+        model_with_dict_input.set_weights(model_weights)
+        model_with_dict_input_fit = model_with_dict_input.fit(
+            dataset, steps_per_epoch=1, epochs=1).history
+        self.assertAllClose(
+            model_with_dict_input_fit,
+            model_with_array_input_fit,
+            atol=1e-4,
+            rtol=1e-4)
+
 
 class TestRegularizerLoss(test.TestCase, parameterized.TestCase):
   class IdentityRegularizer(keras.regularizers.Regularizer):
@@ -1280,8 +1516,10 @@
     return math_ops.reduce_mean(y_pred)
 
   @combinations.generate(
-      strategy_combinations.all_strategy_combinations_minus_default())
-  def test_regularizer_loss(self, distribution):
+      combinations.times(
+          strategy_combinations.all_strategy_combinations_minus_default(),
+          combinations.combine(cloning=[True, False])))
+  def test_regularizer_loss(self, distribution, cloning):
     batch_size = 2
     if not distributed_training_utils.global_batch_size_supported(distribution):
       batch_size //= distribution.num_replicas_in_sync
@@ -1296,11 +1534,11 @@
       # replicas, the variable value will be incorrect when number of replicas
       # >1. For e.g. it will be -2 if num replicas = 2.
     with distribution.scope():
-      x = keras.layers.Input(shape=(), batch_size=batch_size)
+      x = keras.layers.Input(shape=(1,), batch_size=batch_size)
       y = TestRegularizerLoss.AddLayer()(x)
       model = keras.models.Model(inputs=x, outputs=y)
       opt = gradient_descent_keras.SGD(1.)
-      model.compile(opt, loss=TestRegularizerLoss.loss_fn)
+      model.compile(opt, loss=TestRegularizerLoss.loss_fn, cloning=cloning)
       model.fit(
           x=np.array([[1.], [1.]], dtype=np.float32),
           y=np.array([[1.], [1.]], dtype=np.float32),
@@ -1312,13 +1550,18 @@
 class TestDistributionStrategyWithKerasModels(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(all_strategy_combinations())
-  def test_distribution_strategy_on_sequential_model(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_distribution_strategy_on_sequential_model(self, distribution,
+                                                     cloning):
     with distribution.scope():
+      # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+      # mirrored.
+      optimizer_fn = (
+          rmsprop.RMSPropOptimizer if cloning else gradient_descent_keras.SGD)
+      optimizer = optimizer_fn(learning_rate=0.001)
       model = simple_sequential_model()
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      model.compile(optimizer, loss)
+      model.compile(optimizer, loss, cloning=cloning)
 
       inputs = np.zeros((20, 10), np.float32)
       targets = np.zeros((20, 2), np.float32)
@@ -1327,13 +1570,18 @@
     model.predict(inputs, steps=1)
     model.evaluate(inputs, targets, steps=1)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_distribution_strategy_on_functional_model(self, distribution):
+  @combinations.generate(all_strategy_combinations_plus_cloning())
+  def test_distribution_strategy_on_functional_model(self, distribution,
+                                                     cloning):
     with distribution.scope():
+      # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+      # mirrored.
+      optimizer_fn = (
+          rmsprop.RMSPropOptimizer if cloning else gradient_descent_keras.SGD)
+      optimizer = optimizer_fn(learning_rate=0.001)
       model = get_model()
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      model.compile(optimizer, loss)
+      model.compile(optimizer, loss, cloning=cloning)
 
       inputs = np.zeros((64, 3), dtype=np.float32)
       targets = np.zeros((64, 4), dtype=np.float32)
@@ -1343,8 +1591,11 @@
     model.evaluate(inputs, targets, steps=1)
 
   @combinations.generate(
-      strategy_combinations.all_strategy_combinations_minus_default())
-  def test_distribution_strategy_one_dimensional(self, distribution):
+      combinations.times(
+          all_strategy_minus_default_and_tpu_combinations() +
+          tpu_strategy_combinations(),
+          combinations.combine(cloning=[True, False])))
+  def test_distribution_strategy_one_dimensional(self, distribution, cloning):
     with distribution.scope():
       inp = keras.layers.Input(shape=(10,))
       out = keras.layers.Dense(3, activation='softmax')(inp)
@@ -1353,15 +1604,18 @@
           optimizer='rmsprop',
           loss='sparse_categorical_crossentropy',
           metrics=['sparse_categorical_accuracy'],
-      )
+          cloning=cloning)
 
       x = np.random.random((64, 10)).astype('float32')
       y = np.random.randint(3, size=64)
 
       model.fit(x, y, epochs=1, steps_per_epoch=2)
 
-  @combinations.generate(all_strategy_minus_default_and_tpu_combinations())
-  def test_distribution_strategy_with_symbolic_add_loss(self, distribution):
+  @combinations.generate(
+      combinations.times(all_strategy_minus_default_and_tpu_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_distribution_strategy_with_symbolic_add_loss(self, distribution,
+                                                        cloning):
 
     def _make_model_with_add_loss():
       inputs = keras.Input((10,))
@@ -1381,7 +1635,7 @@
 
     with distribution.scope():
       ds_model = _make_model_with_add_loss()
-      ds_model.compile('sgd')
+      ds_model.compile('sgd', cloning=cloning)
       ds_history = ds_model.fit(x, steps_per_epoch=2, epochs=1)
 
     self.assertAllClose(history.history, ds_history.history)
@@ -1418,8 +1672,11 @@
 
     self.assertAllClose(history.history, ds_history.history)
 
-  @combinations.generate(all_strategy_minus_default_and_tpu_combinations())
-  def test_distribution_strategy_with_add_metric_in_call(self, distribution):
+  @combinations.generate(
+      combinations.times(all_strategy_minus_default_and_tpu_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_distribution_strategy_with_add_metric_in_call(
+      self, distribution, cloning):
 
     class Bias(keras.layers.Layer):
 
@@ -1457,7 +1714,7 @@
     with distribution.scope():
       ds_model = _make_model_with_add_metric()
       self.assertLen(ds_model.metrics, 1)
-      ds_model.compile('sgd', 'mse')
+      ds_model.compile('sgd', 'mse', cloning=cloning)
       ds_history = ds_model.fit(
           x,
           y,
@@ -1469,9 +1726,11 @@
 
     self.assertAllClose(history.history, ds_history.history)
 
-  @combinations.generate(all_strategy_minus_default_and_tpu_combinations())
+  @combinations.generate(
+      combinations.times(all_strategy_minus_default_and_tpu_combinations(),
+                         combinations.combine(cloning=[True, False])))
   def test_distribution_strategy_with_add_metric_outside_call(
-      self, distribution):
+      self, distribution, cloning):
 
     def _make_model_with_add_metric():
       inputs = keras.Input((10,))
@@ -1500,7 +1759,7 @@
     with distribution.scope():
       ds_model = _make_model_with_add_metric()
       self.assertLen(ds_model.metrics, 1)
-      ds_model.compile('sgd', 'mse')
+      ds_model.compile('sgd', 'mse', cloning=cloning)
       ds_history = ds_model.fit(
           x,
           y,
diff --git a/tensorflow/python/keras/distribute/distributed_training_utils.py b/tensorflow/python/keras/distribute/distributed_training_utils.py
index cbd8027..5ecdf88 100644
--- a/tensorflow/python/keras/distribute/distributed_training_utils.py
+++ b/tensorflow/python/keras/distribute/distributed_training_utils.py
@@ -18,6 +18,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
@@ -26,6 +28,7 @@
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -75,9 +78,9 @@
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                   grouped_updates=None, grouped_session_args=None,
                   with_loss_tensor=False):
-  """Unwrap and return the list of values contained in the PerDevice parameters.
+  """Unwrap the list of values contained in the PerReplica parameters.
 
-  This function calls `flatten_perdevice_values` to parse each of the input
+  This function calls `flatten_per_replica_values` to parse each of the input
   parameters into a list of values on the different devices. If we set
   `with_loss_tensor` to be True, we also call `reduce` on the list of losses on
   the different devices to give us one loss tensor.
@@ -85,39 +88,31 @@
   Args:
     distribution_strategy: DistributionStrategy used to distribute training and
         validation.
-    grouped_inputs: PerDevice inputs returned from the train or test function
+    grouped_inputs: PerReplica inputs returned from the train or test function
         that we ran on each device.
-    grouped_outputs: PerDevice outputs returned from the train or test function
+    grouped_outputs: PerReplica outputs returned from the train or test function
         that we ran on each device.
-    grouped_updates: PerDevice updates returned from the train or test function
+    grouped_updates: PerReplica updates returned from the train or test function
         that we ran on each device.
-    grouped_session_args: PerDevice session args returned from the train or
+    grouped_session_args: PerReplica session args returned from the train or
         test function that we ran on each device.
     with_loss_tensor: Boolean that indicates if we need to add the reduced loss
         tensor as one of the outputs.
 
   Returns:
-    Values of each of the PerDevice parameters.
+    Values of each of the PerReplica parameters.
 
   """
   # Unwrap per device values returned from each model's train function.
   # This will be used to construct the main train function.
-  all_inputs = flatten_perdevice_values(distribution_strategy,
-                                        grouped_inputs)
-  if with_loss_tensor:
-    # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
-                                        grouped_outputs[0], axis=None)
-    all_outputs = flatten_perdevice_values(distribution_strategy,
-                                           grouped_outputs[1:])
-    all_outputs = [loss] + all_outputs
-  else:
-    all_outputs = flatten_perdevice_values(distribution_strategy,
-                                           grouped_outputs)
+  all_inputs = flatten_per_replica_values(distribution_strategy,
+                                          grouped_inputs)
+  all_outputs = unwrap_outputs(distribution_strategy, grouped_outputs,
+                               with_loss_tensor)
 
   if grouped_updates:
-    all_updates = flatten_perdevice_values(distribution_strategy,
-                                           grouped_updates)
+    all_updates = flatten_per_replica_values(distribution_strategy,
+                                             grouped_updates)
   else:
     all_updates = None
 
@@ -125,38 +120,74 @@
   if grouped_session_args:
     grouped_feed_dict = grouped_session_args.get('feed_dict')
     if grouped_feed_dict:
-      all_session_args['feed_dict'] = flatten_perdevice_values(
+      all_session_args['feed_dict'] = flatten_per_replica_values(
           distribution_strategy, grouped_feed_dict)
 
     grouped_fetches = grouped_session_args.get('fetches')
     if grouped_fetches:
-      all_session_args['fetches'] = flatten_perdevice_values(
+      all_session_args['fetches'] = flatten_per_replica_values(
           distribution_strategy, grouped_fetches)
 
   # TODO(priyag): Return only non empty/None values
   return all_inputs, all_outputs, all_updates, all_session_args
 
 
-def flatten_perdevice_values(distribution_strategy, perdevice_values):
-  """Unwraps and flattens a nest of PerDevice parameters.
+def unwrap_outputs(distribution_strategy, grouped_outputs,
+                   with_loss_tensor=False):
+  """Unwrap the list of outputs contained in the PerReplica parameters.
 
-  PerDevice values have one value associated with each device. Each entry in
-  the PerDevice dict has a device `key` and the corresponding value on the
-  device as the `value`. In this function we take a PerDevice value or a list of
-  PerDevice values and return all the values in the PerDevice dict.
+  This function calls `flatten_per_replica_values` to parse each of the input
+  parameters into a list of outputs on the different devices. If we set
+  `with_loss_tensor` to be True, we also call `reduce` on the list of losses on
+  the different devices to give us one loss tensor.
 
   Args:
     distribution_strategy: DistributionStrategy used to distribute training and
         validation.
-    perdevice_values: List of PerDevice object or a single PerDevice object.
+    grouped_outputs: PerReplica outputs returned from the train or test function
+        that we ran on each device.
+    with_loss_tensor: Boolean that indicates if we need to add the reduced loss
+        tensor as one of the outputs.
 
   Returns:
-    List of values of all the PerDevice objects.
+    Values of each of the PerReplica outputs.
 
   """
-  # This function takes a PerDevice object or a list of PerDevice objects and
+  if not with_loss_tensor:
+    return flatten_per_replica_values(distribution_strategy,
+                                      grouped_outputs)
+
+  if not isinstance(grouped_outputs, list):
+    grouped_outputs = [grouped_outputs]
+  # reduce loss tensor before adding it to the list of fetches
+  loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
+                                      grouped_outputs[0], axis=None)
+  all_outputs = flatten_per_replica_values(distribution_strategy,
+                                           grouped_outputs[1:])
+  return [loss] + all_outputs
+
+
+def flatten_per_replica_values(distribution_strategy, per_replica_values):
+  """Unwraps and flattens a nest of PerReplica parameters.
+
+  PerReplica values have one value associated with each device. Each entry in
+  the PerReplica dict has a device `key` and the corresponding value on the
+  device as the `value`. In this function we take a PerReplica value or a list
+  of PerReplica values and return all the values in the PerReplica dict.
+
+  Args:
+    distribution_strategy: DistributionStrategy used to distribute training and
+      validation.
+    per_replica_values: A PerReplica object or a list of PerReplica objects.
+
+  Returns:
+    List of values of all the PerReplica objects.
+
+  """
+  # pylint: disable=g-complex-comprehension
+  # This function takes a PerReplica object or a list of PerReplica objects and
   # returns all the values associated with it.
-  return [e for flattened in nest.flatten(perdevice_values)
+  return [e for flattened in nest.flatten(per_replica_values)
           for e in distribution_strategy.unwrap(flattened)]
 
 
@@ -175,19 +206,6 @@
   """
   if input_callbacks:
     for callback in input_callbacks:
-      if not isinstance(callback,
-                        (callbacks.TensorBoard, callbacks.ReduceLROnPlateau,
-                         callbacks.LearningRateScheduler, callbacks.CSVLogger,
-                         callbacks.EarlyStopping, callbacks.ModelCheckpoint,
-                         callbacks.TerminateOnNaN, callbacks.ProgbarLogger,
-                         callbacks.History, callbacks.RemoteMonitor)):
-        logging.warning('Your input callback is not one of the predefined '
-                        'Callbacks that supports DistributionStrategy. You '
-                        'might encounter an error if you access one of the '
-                        'model\'s attributes as part of the callback since '
-                        'these attributes are not set. You can access each of '
-                        'the individual distributed models using the '
-                        '`_grouped_model` attribute of your original model.')
       if isinstance(callback, (callbacks.LearningRateScheduler,
                                callbacks.ReduceLROnPlateau)):
 
@@ -223,15 +241,15 @@
     distribution_strategy: The current DistributionStrategy used to call
         `fit`/`evaluate`.
     x: Input Dataset DistributedValue object. For example, when we use
-        `MirroredStrategy` this is a PerDevice object with a tensor for each
+        `MirroredStrategy` this is a PerReplica object with a tensor for each
         device set in the dict. x can also be a tuple or dict. The keys of the
         dict should match the names of the input layers of the model.
     y: Target Dataset DistributedValue object. For example, when we use
-        `MirroredStrategy` this is a PerDevice object with a tensor for each
+        `MirroredStrategy` this is a PerReplica object with a tensor for each
         device set in the dict. y can also be a tuple or dict. The keys of the
         dict should match the names of the output layers of the model.
     sample_weights: Sample weights Dataset DistributedValue object. For example,
-        when we use `MirroredStrategy` this is a PerDevice object with a tensor
+        when we use `MirroredStrategy` this is a PerReplica object with a tensor
         for each device set in the dict.
 
   Returns:
@@ -248,16 +266,16 @@
 
   # If each element of x and y are not tensors, we cannot standardize and
   # validate the input and targets.
-  x_values_list = validate_per_device_inputs(distribution_strategy, x)
+  x_values_list = validate_per_replica_inputs(distribution_strategy, x)
 
   if y is not None:
-    y_values_list = validate_per_device_inputs(distribution_strategy, y)
+    y_values_list = validate_per_replica_inputs(distribution_strategy, y)
   else:
     y_values_list = None
 
   if sample_weights is not None:
-    sample_weights_list = validate_per_device_inputs(distribution_strategy,
-                                                     sample_weights)
+    sample_weights_list = validate_per_replica_inputs(distribution_strategy,
+                                                      sample_weights)
   else:
     sample_weights_list = None
 
@@ -265,27 +283,27 @@
   return x_values_list, y_values_list, sample_weights_list
 
 
-def validate_per_device_inputs(distribution_strategy, x):
-  """Validates PerDevice dataset input list.
+def validate_per_replica_inputs(distribution_strategy, x):
+  """Validates PerReplica dataset input list.
 
   Args:
     distribution_strategy: The current DistributionStrategy used to call
       `fit`, `evaluate` and `predict`.
-    x: A list of PerDevice objects that represent the input or
+    x: A list of PerReplica objects that represent the input or
       target values.
 
   Returns:
-    List containing the first element of each of the PerDevice objects in
+    List containing the first element of each of the PerReplica objects in
     the input list.
 
   Raises:
-    ValueError: If any of the objects in the `per_device_list` is not a tensor.
+    ValueError: If any of the objects in the `per_replica_list` is not a tensor.
 
   """
-  # Convert the inputs and targets into a list of PerDevice objects.
-  per_device_list = nest.flatten(x)
+  # Convert the inputs and targets into a list of PerReplica objects.
+  per_replica_list = nest.flatten(x)
   x_values_list = []
-  for x in per_device_list:
+  for x in per_replica_list:
     if not tensor_util.is_tensor(x):
       raise ValueError('Dataset input to the model should be tensors instead '
                        'they are of type {}'.format(type(x)))
@@ -294,8 +312,9 @@
     # structure.
     x_values = distribution_strategy.unwrap(x)
 
-    # Validate that the shape and dtype of all the elements in x are the same.
-    validate_all_tensor_shapes(x, x_values)
+    if not context.executing_eagerly():
+      # Validate that the shape and dtype of all the elements in x are the same.
+      validate_all_tensor_shapes(x, x_values)
     validate_all_tensor_types(x, x_values)
 
     x_values_list.append(x_values[0])
@@ -353,16 +372,12 @@
     _wait_for_variable_initialization(session)
 
 
-def validate_inputs(x, y, distribution_strategy, allow_partial_batch=False):
+def validate_inputs(x, y):
   """Validate inputs when using DistributionStrategy.
 
   Args:
     x: Model Inputs.
     y: Model Targets.
-    distribution_strategy: The DistributionStrategy with which the model is
-      compiled.
-    allow_partial_batch: Boolean. If false, datasets must have fully
-      defined shapes.
 
   Raises:
     ValueError: if input is not a Dataset or a numpy array(when we use
@@ -374,16 +389,6 @@
                      'Iterator. You must pass a `tf.data.Dataset` object or a '
                      'numpy array as input.')
 
-  if is_tpu_strategy(distribution_strategy):
-    for i in [x, y]:
-      if (isinstance(i, dataset_ops.DatasetV2) and not allow_partial_batch):
-        if not is_dataset_shape_fully_defined(i):
-          raise ValueError(
-              'Using TPUs currently requires fully defined shapes. Either use '
-              'set_shape() on the input tensors or use '
-              'dataset.batch(..., drop_remainder=True).'
-              'Found unknown shape in input {}.'.format(i))
-
 
 # TODO(b/118776054): Currently we support global batch size for TPUStrategy and
 # core MirroredStrategy only. Remove this check when contrib MirroredStrategy is
@@ -440,9 +445,9 @@
   # Partial batches are allowed for training as we repeat the
   # dataset when converting numpy arrays into a dataset.
   # For other modes uneven batch sizes are not allowed except
-  # for `predict()` on TPUStrategy.
+  # for `test()` and `predict()` on TPUStrategy.
   allow_partial_batch = (mode == ModeKeys.TRAIN or
-                         (mode == ModeKeys.PREDICT
+                         ((mode == ModeKeys.PREDICT or mode == ModeKeys.TEST)
                           and is_tpu_strategy(distribution_strategy)))
 
   if steps is None:
@@ -574,26 +579,48 @@
   # correct order.
   if isinstance(inputs, dict):
     inputs = [inputs[key] for key in model._feed_input_names]
-  inputs = flatten_perdevice_values(strategy, inputs)
-  targets = flatten_perdevice_values(strategy, targets)
+  if is_distributing_by_cloning(model):
+    inputs = flatten_per_replica_values(strategy, inputs)
+    targets = flatten_per_replica_values(strategy, targets)
+    # Expand 1-dimensional inputs.
+    # TODO(b/124535720): Remove once this standardize data logic is shared with
+    # main flow.
+    inputs, targets = nest.map_structure(
+        training_utils.standardize_single_array, (inputs, targets))
 
-  # Expand 1-dimensional inputs.
-  # TODO(b/124535720): Remove once this standarize data logic is shared with
-  # main flow.
-  inputs, targets = nest.map_structure(training_utils.standardize_single_array,
-                                       (inputs, targets))
   if mode == ModeKeys.PREDICT:
     sample_weights = []
     targets = []
+  elif not is_distributing_by_cloning(model):
+    sample_weights = None  # b/129503665
   else:
     sample_weights = [
         None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
     ]
-  ins = inputs + targets + sample_weights
+  ins = [inputs, targets, sample_weights]
   if mode == ModeKeys.TRAIN and not isinstance(K.symbolic_learning_phase(),
                                                int):
     ins += [True]
-  return ins
+  return tuple(ins)
+
+
+def is_distributing_by_cloning(model):
+  """Decide whether this model is going to be distributed via cloning.
+
+  Cloning is used unless the user opted out with `cloning=False` in
+  `Model.compile()`; that opt-out is ignored for models with
+  `_compile_distribution` set, in graph mode, and on TPU strategies.
+
+  Args:
+    model: Keras model to distribute.
+
+  Returns:
+    A truthy value if the `model` is going to be distributed using cloning and
+    a falsy value otherwise.
+  """
+  return (model._cloning or model._compile_distribution or
+          not context.executing_eagerly() or
+          K.is_tpu_strategy(model._distribution_strategy))
 
 
 def _custom_compile_for_predict(model):
@@ -744,6 +771,68 @@
 
 def _make_execution_function(model, mode):
   """Makes or reuses function to run one step of distributed model execution."""
+  if is_distributing_by_cloning(model):
+    return _make_execution_function_with_cloning(model, mode)
+
+  # Without cloning, build the step function once per mode and then reuse it.
+  step_function = get_distributed_function(model, mode)
+  if not step_function:
+    step_function = _make_execution_function_without_cloning(model, mode)
+    set_distributed_function(model, mode, step_function)
+
+  return step_function
+
+
+def _make_execution_function_without_cloning(model, mode):
+  """Builds a callable running one distributed step of the uncloned model."""
+  strategy = model._distribution_strategy
+
+  with strategy.scope():
+    replica_step = _make_replica_execution_function(model, mode)
+
+    @def_function.function
+    def distributed_function(x, y, sample_weights, learning_phase=None):
+      """A single step of the distributed execution across replicas."""
+      del learning_phase
+      # Run `Model.{train,test,predict}_on_batch` on every replica. Inside the
+      # call each PerReplica argument resolves to that replica's value, and
+      # the results come back as PerReplicas as well.
+      per_replica_outputs = strategy.experimental_run_v2(
+          replica_step, args=(x, y, sample_weights))
+      # Reduce or pick the values to return out of the PerReplica outputs;
+      # the first output is reduced as the loss in every mode but PREDICT.
+      return unwrap_outputs(
+          strategy, per_replica_outputs,
+          with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    # Calling `numpy` turns the returned Tensors into values in Eager mode.
+    return lambda inputs: [out.numpy() for out in distributed_function(*inputs)]
+
+
+def _make_replica_execution_function(model, mode):
+  """Returns the `model` callable that runs one step of `mode` on a replica."""
+
+  def predict_on_batch(x, y=None, sample_weights=None):
+    del y, sample_weights  # Unused when predicting.
+    return model.predict_on_batch(x)
+
+  if mode == ModeKeys.TRAIN:
+    step_fn = model.train_on_batch
+  elif mode == ModeKeys.TEST:
+    step_fn = model.test_on_batch
+  else:
+    step_fn = predict_on_batch
+
+  if mode == ModeKeys.PREDICT:
+    return step_fn
+
+  # Binding `reset_metrics=False` keeps stateful metrics accumulating across
+  # batch-level calls.
+  return functools.partial(step_fn, reset_metrics=False)
+
+
+def _make_execution_function_with_cloning(model, mode):
+  """Clones or re-uses models to run one step of distributed model execution."""
   strategy = model._distribution_strategy
 
   distributed_model = get_distributed_model(model, mode)
@@ -778,17 +867,17 @@
 def _make_graph_execution_function(model, mode):
   """Makes function to run one step of distributed model in graph mode."""
 
-  def _per_device_function(model):
+  def _per_replica_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
 
   strategy = model._distribution_strategy
   with strategy.scope():
     # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
+    # `_per_replica_function`.
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_device_function, args=(get_distributed_model(model, mode),))
+         _per_replica_function, args=(get_distributed_model(model, mode),))
 
     # Initialize the variables in the replicated model. This is necessary for
     # multi-worker training because on some workers, initialization is not
@@ -818,7 +907,7 @@
 
 def _make_eager_execution_function(model, mode):
   """Makes function to run one step of distributed model eager execution."""
-  def _per_device_function(model):
+  def _per_replica_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs)
 
@@ -833,9 +922,9 @@
     # lift to a separate graph when creating the per-replica functions.
     with K._scratch_graph(global_graph):
       # Create train ops on each of the devices when we call
-      # `_per_device_fit_function`.
+      # `_per_replica_function`.
       grouped = strategy.extended.call_for_each_replica(
-          _per_device_function, args=(get_distributed_model(model, mode),))
+          _per_replica_function, args=(get_distributed_model(model, mode),))
       grouped_inputs, grouped_outputs = grouped
 
       # Unwrap all the per device values returned from `call_for_each_replica`.
@@ -877,8 +966,8 @@
     model.set_weights(updated_weights)
 
 
-def _per_device_aggregate_batch(batch_outs, model, mode):
-  """Aggregates the per-device batch-level outputs from a distributed step."""
+def _per_replica_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-replica batch-level outputs from a distributed step."""
   if model._distribution_strategy is not None and mode == ModeKeys.PREDICT:
     total_batch_outs = []
     for i in range(len(model.outputs)):
@@ -908,6 +997,16 @@
   model._distributed_model_cache[key] = distributed_model
 
 
+def get_distributed_function(model, mode):
+  # Yields None when no function has been cached for `mode` yet.
+  return model._distributed_function_cache.get(_generate_cache_key(mode), None)
+
+
+def set_distributed_function(model, mode, distributed_function):
+  cache = model._distributed_function_cache  # Keyed by the hashed mode.
+  cache[_generate_cache_key(mode)] = distributed_function
+
+
 def _generate_cache_key(mode):
   key = hash(mode)
   return key
diff --git a/tensorflow/python/keras/distribute/distributed_training_utils_test.py b/tensorflow/python/keras/distribute/distributed_training_utils_test.py
index 0ea777f..4adc8b5 100644
--- a/tensorflow/python/keras/distribute/distributed_training_utils_test.py
+++ b/tensorflow/python/keras/distribute/distributed_training_utils_test.py
@@ -57,30 +57,6 @@
 
     self.assertEqual(0, mock_warning.call_count)
 
-  @test.mock.patch.object(logging, 'warning', autospec=True)
-  def test_validate_callbacks_custom_callback(self, mock_warning):
-
-    class CustomCallback(callbacks.Callback):
-      pass
-
-    distributed_training_utils.validate_callbacks([CustomCallback()],
-                                                  adam.Adam())
-
-    self.assertEqual(1, mock_warning.call_count)
-
-    call_args, call_kwargs = mock_warning.call_args
-
-    self.assertEqual(('Your input callback is not one of the predefined '
-                      'Callbacks that supports DistributionStrategy. You '
-                      'might encounter an error if you access one of the '
-                      'model\'s attributes as part of the callback since '
-                      'these attributes are not set. You can access each of '
-                      'the individual distributed models using the '
-                      '`_grouped_model` attribute of your original model.',),
-                     call_args)
-
-    self.assertEqual(0, len(call_kwargs))
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/distribute/keras_correctness_test_base.py b/tensorflow/python/keras/distribute/keras_correctness_test_base.py
index 1da42dd..faf82f9 100644
--- a/tensorflow/python/keras/distribute/keras_correctness_test_base.py
+++ b/tensorflow/python/keras/distribute/keras_correctness_test_base.py
@@ -32,6 +32,7 @@
 from tensorflow.python.eager import test
 from tensorflow.python.framework import random_seed
 from tensorflow.python.keras.distribute import distributed_training_utils
+from tensorflow.python.util import nest
 
 _RANDOM_SEED = 1337
 _EVAL_STEPS = 20
@@ -40,7 +41,6 @@
 # Note: Please make sure the tests in this file are also covered in
 # keras_backward_compat_test for features that are supported with both APIs.
 
-
 all_strategies = [
     strategy_combinations.default_strategy,
     strategy_combinations.one_device_strategy,
@@ -63,16 +63,9 @@
 
 
 def all_strategy_and_input_config_combinations():
-  return (
-      combinations.times(
-          combinations.combine(distribution=all_strategies),
-          eager_mode_test_configuration() + graph_mode_test_configuration()))
-
-
-def all_strategies_excluding_tpu_and_input_config_combinations():
   return (combinations.times(
       combinations.combine(
-          distribution=strategy_combinations.strategies_minus_tpu),
+          distribution=all_strategies, cloning=[True, False]),
       eager_mode_test_configuration() + graph_mode_test_configuration()))
 
 
@@ -90,12 +83,11 @@
 
 
 def test_combinations_for_embedding_model():
-  return (
-      combinations.times(
-          combinations.combine(distribution=
-                               strategies_for_embedding_models()),
-          (graph_mode_test_configuration() +
-           eager_mode_test_configuration())))
+  return (combinations.times(
+      combinations.combine(
+          distribution=strategies_for_embedding_models(),
+          cloning=[True, False]),
+      (graph_mode_test_configuration() + eager_mode_test_configuration())))
 
 
 def test_combinations_with_tpu_strategies():
@@ -128,16 +120,10 @@
       self._scope = None
 
 
-def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+def batch_wrapper(dataset, batch_size, repeat=None):
   if repeat:
     dataset = dataset.repeat(repeat)
-  # TPUs currently require fully defined input shapes, drop_remainder ensures
-  # the input will have fully defined shapes.
-  if isinstance(distribution, (tpu_strategy.TPUStrategy,
-                               tpu_strategy.TPUStrategyV1)):
-    return dataset.batch(batch_size, drop_remainder=True)
-  else:
-    return dataset.batch(batch_size)
+  return dataset.batch(batch_size)
 
 
 def get_batch_size(global_batch_size, distribution):
@@ -164,10 +150,23 @@
   return len(six.next(six.itervalues(data)))
 
 
-def get_correctness_test_inputs(use_numpy, use_validation_data,
-                                with_distribution, x_train, y_train, x_predict):
+def get_shapes(data):
+  """Returns per-leaf shapes of `data`, or None if a leaf has no `shape`."""
+  if not all(hasattr(leaf, 'shape') for leaf in nest.flatten(data)):
+    return None
+  return nest.map_structure(lambda leaf: leaf.shape, data)
+
+
+def get_correctness_test_inputs(use_numpy,
+                                use_validation_data,
+                                with_distribution,
+                                x_train,
+                                y_train,
+                                x_eval,
+                                y_eval,
+                                x_predict,
+                                training_epochs):
   """Generates the inputs for correctness check when enable Keras with DS."""
-  training_epochs = 2
   global_batch_size = _GLOBAL_BATCH_SIZE
   batch_size = get_batch_size(global_batch_size, with_distribution)
 
@@ -182,56 +181,53 @@
 
     if use_validation_data:
       eval_inputs = None
-      training_inputs['validation_data'] = (x_train, y_train)
+      training_inputs['validation_data'] = (x_eval, y_eval)
     else:
       eval_inputs = {
           'batch_size': batch_size,
-          'x': x_train,
-          'y': y_train,
+          'x': x_eval,
+          'y': y_eval,
       }
     predict_inputs = {
         'x': x_predict
     }
   else:
     training_data_size = get_data_size(x_train)
-    if training_data_size < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
-      # Currently, we cannot detect the size of a dataset. So, the eval steps is
-      # hard coded.
-      raise ValueError('x_train must have at least '
-                       '_GLOBAL_BATCH_SIZE * _EVAL_STEPS samples')
     # For dataset inputs, we do not pass batch_size to
     # keras.fit/evaluate/predict. The batch size is part of the dataset.
     train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
-    x = batch_wrapper(train_dataset, batch_size, with_distribution,
-                      repeat=training_epochs)
+    x = batch_wrapper(train_dataset, batch_size, repeat=training_epochs)
 
+    steps_per_epoch = int(np.ceil(1.0 * training_data_size / global_batch_size))
     training_inputs = {
         'batch_size': None,
         'x': x,
         'y': None,
         'epochs': training_epochs,
         'shuffle': False,
-        'steps_per_epoch': training_data_size // global_batch_size,
+        'steps_per_epoch': steps_per_epoch
     }
     if use_validation_data:
       eval_inputs = None  # Remove the eval_inputs
-      eval_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
-      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices((x_eval, y_eval))
+      x = batch_wrapper(eval_dataset, batch_size)
       training_inputs['validation_data'] = x
       training_inputs['validation_steps'] = 5
     else:
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices((x_eval, y_eval))
+      x = batch_wrapper(eval_dataset, batch_size)
+      eval_steps = int(np.ceil(1.0 * get_data_size(x_eval) / global_batch_size))
       eval_inputs = {
           'batch_size': None,
           'x': x,
           'y': None,
-          'steps': _EVAL_STEPS,
+          'steps': eval_steps,
       }
 
     predict_batch_size = get_batch_size(get_data_size(x_predict),
                                         with_distribution)
     predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
-    predict_dataset = batch_wrapper(predict_dataset, predict_batch_size,
-                                    with_distribution)
+    predict_dataset = batch_wrapper(predict_dataset, predict_batch_size)
     predict_inputs = {
         'steps': 1,
         'x': predict_dataset,
@@ -240,11 +236,19 @@
   return training_inputs, eval_inputs, predict_inputs
 
 
-def fit_eval_and_predict(initial_weights, input_fn, model_fn,
-                         distribution=None, is_stateful_model=False):
+def fit_eval_and_predict(initial_weights,
+                         input_fn,
+                         model_fn,
+                         cloning=None,
+                         distribution=None,
+                         is_stateful_model=False):
   """Generates results for fit/predict/evaluate for given model."""
-  model = model_fn(initial_weights=initial_weights, distribution=distribution)
   training_inputs, eval_inputs, predict_inputs = input_fn()
+  model = model_fn(
+      cloning=cloning,
+      initial_weights=initial_weights,
+      distribution=distribution,
+      input_shapes=get_shapes(training_inputs['x']))
 
   result = {}
   result['training_history_1'] = model.fit(**training_inputs).history
@@ -277,8 +281,11 @@
   return result
 
 
-def compare_results(results_with_ds, results_without_ds, distribution,
-                    testcase):
+def compare_results(results_with_ds,
+                    results_without_ds,
+                    distribution,
+                    testcase,
+                    partial_last_batch=False):
   """Compares results of model compiled with/without distribution strategy."""
 
   default_tolerance = 1e-5
@@ -306,6 +313,18 @@
       continue
 
     tolerance = _get_compare_result_tolerance(key)
+
+    # We don't compare the loss because it is currently not computed as a
+    # metric in Keras: the loss value is inaccurate for a partial last batch
+    # because the samples in that batch carry more weight.
+    if partial_last_batch:
+      if key.startswith('eval_result'):
+        results_with_ds[key] = results_with_ds[key][1:]
+        results_without_ds[key] = results_without_ds[key][1:]
+      if key.startswith('training_history'):
+        results_with_ds[key]['val_loss'] = 0
+        results_without_ds[key]['val_loss'] = 0
+
     testcase.assertAllClose(
         results_with_ds[key],
         results_without_ds[key],
@@ -357,6 +376,10 @@
     y_train = x_train
     return (x_train.astype('float32'), y_train.astype('float32'), None)
 
+  def get_data_with_partial_last_batch(self):
+    x_train, y_train, x_predict = self.get_data()
+    return  x_train, y_train, x_train, y_train, x_predict
+
   def get_input_for_correctness_test(self, **kwargs):
     """Generates inputs that are dictionaries.
 
@@ -373,7 +396,7 @@
 
     return get_correctness_test_inputs(**kwargs)
 
-  def get_model(self, distribution=None):
+  def get_model(self, distribution=None, cloning=None, input_shapes=None):
     raise NotImplementedError
 
   def skip_unsupported_test_configuration(self, distribution):
@@ -385,19 +408,29 @@
                            distribution,
                            use_numpy,
                            use_validation_data,
+                           cloning=None,
                            with_batch_norm=False,
-                           is_stateful_model=False):
+                           is_stateful_model=False,
+                           partial_last_batch=False,
+                           training_epochs=2):
     with self.cached_session():
       self.set_up_test_config(use_numpy, use_validation_data, with_batch_norm)
       self.skip_unsupported_test_configuration(distribution)
 
       # Train, eval, and predict datasets are created with the same input numpy
       # arrays.
-      x_train, y_train, x_predict = self.get_data()
+      if partial_last_batch:
+        x_train, y_train, x_eval, y_eval, x_predict = (
+            self.get_data_with_partial_last_batch())
+      else:
+        x_train, y_train, x_predict = self.get_data()
+        x_eval = x_train
+        y_eval = y_train
+
       # The model is built once and the initial weights are saved.
       # This is used to initialize the model for both the distribution and
       # non-distribution run.
-      model = self.get_model()
+      model = self.get_model(cloning=cloning, input_shapes=get_shapes(x_train))
       initial_weights = model.get_weights()
 
       ds_input_fn = functools.partial(
@@ -407,7 +440,10 @@
           with_distribution=distribution,
           x_train=x_train,
           y_train=y_train,
-          x_predict=x_predict)
+          x_eval=x_eval,
+          y_eval=y_eval,
+          x_predict=x_predict,
+          training_epochs=training_epochs)
 
       nods_input_fn = functools.partial(
           self.get_input_for_correctness_test,
@@ -416,31 +452,44 @@
           with_distribution=None,
           x_train=x_train,
           y_train=y_train,
-          x_predict=x_predict)
+          x_eval=x_eval,
+          y_eval=y_eval,
+          x_predict=x_predict,
+          training_epochs=training_epochs)
 
       results_with_ds = fit_eval_and_predict(
           initial_weights,
           input_fn=ds_input_fn,
           model_fn=self.get_model,
+          cloning=cloning,
           distribution=distribution,
           is_stateful_model=is_stateful_model)
       results_without_ds = fit_eval_and_predict(
           initial_weights,
           input_fn=nods_input_fn,
           model_fn=self.get_model,
+          cloning=cloning,
           distribution=None,
           is_stateful_model=is_stateful_model)
 
-      # First, special case, for multi-replica distributed training, batch norm
-      # is not aggregated globally. So it is expected to have different weights.
-      if (self.with_batch_norm and
-          distribution.num_replicas_in_sync > 1):
+      # Special case: for multi-replica distributed training, batch norm is
+      # not aggregated globally, so the distributed and non-distributed runs
+      # are expected to have different weights.
+      if (self.with_batch_norm and distribution.num_replicas_in_sync > 1):
         with self.assertRaises(AssertionError):
-          compare_results(results_with_ds, results_without_ds, distribution,
-                          testcase=self)
+          compare_results(
+              results_with_ds,
+              results_without_ds,
+              distribution,
+              testcase=self,
+              partial_last_batch=partial_last_batch)
       else:
-        compare_results(results_with_ds, results_without_ds, distribution,
-                        testcase=self)
+        compare_results(
+            results_with_ds,
+            results_without_ds,
+            distribution,
+            testcase=self,
+            partial_last_batch=partial_last_batch)
 
   def get_input_for_dynamic_lr_test(self, **kwargs):
     """Generates inputs that are dictionaries.
@@ -459,13 +508,13 @@
     training_input = kwargs
     return training_input, None, None
 
-  def run_dynamic_lr_test(self, distribution):
+  def run_dynamic_lr_test(self, distribution, cloning=None):
     with self.cached_session():
       self.set_up_test_config()
       self.skip_unsupported_test_configuration(distribution)
 
       x_train, y_train, _ = self.get_data()
-      model = self.get_model()
+      model = self.get_model(cloning=cloning, input_shapes=get_shapes(x_train))
       initial_weights = model.get_weights()
       update_freq = None
 
@@ -506,11 +555,13 @@
           initial_weights,
           input_fn=ds_input_fn,
           model_fn=self.get_model,
+          cloning=cloning,
           distribution=distribution)
       results_without_ds = fit_eval_and_predict(
           initial_weights,
           input_fn=nods_input_fn,
           model_fn=self.get_model,
+          cloning=cloning,
           distribution=None)
       compare_results(results_with_ds, results_without_ds, distribution,
                       testcase=self)
diff --git a/tensorflow/python/keras/distribute/keras_dnn_correctness_test.py b/tensorflow/python/keras/distribute/keras_dnn_correctness_test.py
index 1f0c5d5..844c516 100644
--- a/tensorflow/python/keras/distribute/keras_dnn_correctness_test.py
+++ b/tensorflow/python/keras/distribute/keras_dnn_correctness_test.py
@@ -21,27 +21,42 @@
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.eager import context
 from tensorflow.python.eager import test
+from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.distribute import keras_correctness_test_base
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.training import gradient_descent
 
 
 def all_strategy_combinations_with_eager_and_graph_modes():
-  return combinations.combine(distribution=keras_correctness_test_base.
-                              all_strategies,
-                              mode=['graph', 'eager'])
+  return (combinations.combine(
+      distribution=keras_correctness_test_base.all_strategies,
+      mode=['graph', 'eager'],
+      cloning=[True, False]))
 
 
 def all_strategy_combinations_with_graph_mode():
-  return combinations.combine(distribution=keras_correctness_test_base.
-                              all_strategies, mode=['graph'])
+  return (combinations.combine(
+      distribution=keras_correctness_test_base.all_strategies,
+      mode=['graph'],
+      cloning=[True, False]))
+
+
+def is_default_strategy(strategy):
+  with strategy.scope():
+    return not distribution_strategy_context.has_strategy()
 
 
 class TestDistributionStrategyDnnCorrectness(
     keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
 
-  def get_model(self, initial_weights=None, distribution=None):
+  def get_model(self,
+                cloning,
+                initial_weights=None,
+                distribution=None,
+                input_shapes=None):
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       # We add few non-linear layers to make it non-trivial.
       model = keras.Sequential()
@@ -57,35 +72,50 @@
 
       model.compile(
           loss=keras.losses.mean_squared_error,
-          optimizer=gradient_descent_keras.SGD(0.5),
-          metrics=['mse'])
+          optimizer=gradient_descent_keras.SGD(0.05),
+          metrics=['mse'],
+          cloning=cloning)
       return model
 
   def get_data(self):
-    # TODO(xiejw): Change this back to 10000, once we support final partial
-    # batch.
-    num_samples = 9984
-    x_train = np.random.rand(num_samples, 1)
+    x_train = np.random.rand(9984, 1).astype('float32')
     y_train = 3 * x_train
-    x_train = x_train.astype('float32')
-    y_train = y_train.astype('float32')
     x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32)
     return x_train, y_train, x_predict
 
+  def get_data_with_partial_last_batch(self):
+    x_train = np.random.rand(9984, 1).astype('float32')
+    y_train = 3 * x_train
+    x_eval = np.random.rand(10000, 1).astype('float32')
+    y_eval = 3 * x_eval
+    x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32)
+    return x_train, y_train, x_eval, y_eval, x_predict
+
   @combinations.generate(keras_correctness_test_base.
                          all_strategy_and_input_config_combinations())
-  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data):
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data,
+                           cloning):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
+
+  @combinations.generate(
+      keras_correctness_test_base
+      .test_combinations_with_tpu_strategies())
+  def test_dnn_correctness_with_partial_last_batch(self, distribution,
+                                                   use_numpy,
+                                                   use_validation_data):
+    self.run_correctness_test(
+        distribution, use_numpy, use_validation_data, partial_last_batch=True)
 
   @combinations.generate(all_strategy_combinations_with_graph_mode())
-  def test_dnn_with_dynamic_learning_rate(self, distribution):
-    self.run_dynamic_lr_test(distribution)
+  def test_dnn_with_dynamic_learning_rate(self, distribution, cloning):
+    self.run_dynamic_lr_test(distribution, cloning)
 
 
 class TestDistributionStrategyDnnMetricCorrectness(
     keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
 
-  def get_model(self, distribution=None):
+  def get_model(self, cloning, distribution=None, input_shapes=None):
     with distribution.scope():
       model = keras.Sequential()
       model.add(keras.layers.Dense(1,
@@ -93,37 +123,40 @@
                                    kernel_initializer='ones'))
       model.compile(
           loss=keras.losses.mean_squared_error,
-          optimizer=gradient_descent.GradientDescentOptimizer(0.5),
-          metrics=[keras.metrics.BinaryAccuracy()])
+          # TODO(b/130808953): Switch back to the V1 optimizer after
+          # global_step is made mirrored.
+          optimizer=gradient_descent_keras.SGD(0.5),
+          metrics=[keras.metrics.BinaryAccuracy()],
+          cloning=cloning)
     return model
 
-  def run_metric_correctness_test(self, distribution):
+  def run_metric_correctness_test(self, distribution, cloning):
     with self.cached_session():
       self.set_up_test_config()
       self.skip_unsupported_test_configuration(distribution)
 
       x_train, y_train, _ = self.get_data()
-      model = self.get_model(distribution=distribution)
+      model = self.get_model(cloning, distribution=distribution)
 
       batch_size = 64
       batch_size = (keras_correctness_test_base.
                     get_batch_size(batch_size, distribution))
       train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       train_dataset = (keras_correctness_test_base.
-                       batch_wrapper(train_dataset, batch_size, distribution))
+                       batch_wrapper(train_dataset, batch_size))
 
       history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
       self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
 
   @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
-  def test_simple_dnn_metric_correctness(self, distribution):
-    self.run_metric_correctness_test(distribution)
+  def test_simple_dnn_metric_correctness(self, distribution, cloning):
+    self.run_metric_correctness_test(distribution, cloning)
 
 
 class TestDistributionStrategyDnnMetricEvalCorrectness(
     keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
 
-  def get_model(self, distribution=None):
+  def get_model(self, cloning, distribution=None, input_shapes=None):
     with distribution.scope():
       model = keras.Sequential()
       model.add(
@@ -135,37 +168,125 @@
       model.compile(
           loss='mae',
           metrics=['accuracy', keras.metrics.BinaryAccuracy()],
-          optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+          cloning=cloning)
     return model
 
-  def run_eval_metrics_correctness_test(self, distribution):
+  def run_eval_metrics_correctness_test(self, distribution, cloning):
     with self.cached_session():
       self.set_up_test_config()
       self.skip_unsupported_test_configuration(distribution)
 
-      model = self.get_model(distribution=distribution)
+      model = self.get_model(cloning, distribution=distribution)
 
       # verify correctness of stateful and stateless metrics.
       x = np.ones((100, 4)).astype('float32')
       y = np.ones((100, 1)).astype('float32')
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = (keras_correctness_test_base.
-                 batch_wrapper(dataset, 4, distribution))
+      dataset = keras_correctness_test_base.batch_wrapper(dataset, 4)
       outs = model.evaluate(dataset, steps=10)
       self.assertEqual(outs[1], 1.)
       self.assertEqual(outs[2], 1.)
 
       y = np.zeros((100, 1)).astype('float32')
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = (keras_correctness_test_base.
-                 batch_wrapper(dataset, 4, distribution))
+      dataset = keras_correctness_test_base.batch_wrapper(dataset, 4)
       outs = model.evaluate(dataset, steps=10)
       self.assertEqual(outs[1], 0.)
       self.assertEqual(outs[2], 0.)
 
   @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
-  def test_identity_model_metric_eval_correctness(self, distribution):
-    self.run_eval_metrics_correctness_test(distribution)
+  def test_identity_model_metric_eval_correctness(self, distribution, cloning):
+    self.run_eval_metrics_correctness_test(distribution, cloning)
+
+
+class SubclassedModel(keras.Model):
+
+  def __init__(self, initial_weights, input_shapes):
+    super(SubclassedModel, self).__init__()
+    self.dense1 = keras.layers.Dense(10, activation='relu', input_shape=(1,))
+    self.dense2 = keras.layers.Dense(
+        10, activation='relu', kernel_regularizer=keras.regularizers.l2(1e-4))
+    self.dense3 = keras.layers.Dense(10, activation='relu')
+    self.dense4 = keras.layers.Dense(1)
+    if input_shapes:
+      self.build(input_shapes)
+    else:
+      # This covers cases when the input is DatasetV1Adapter.
+      self.build((None, 1))
+    if initial_weights:
+      self.set_weights(initial_weights)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.dense2(x)
+    x = self.dense3(x)
+    return self.dense4(x)
+
+
+class TestDistributionStrategyDnnCorrectnessWithSubclassedModel(
+    TestDistributionStrategyDnnCorrectness):
+
+  def get_model(self,
+                cloning,
+                initial_weights=None,
+                distribution=None,
+                input_shapes=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      model = SubclassedModel(initial_weights, input_shapes)
+
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent_keras.SGD(0.5),
+          metrics=['mse'],
+          cloning=cloning)
+      return model
+
+  @combinations.generate(
+      keras_correctness_test_base.all_strategy_and_input_config_combinations())
+  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data,
+                           cloning):
+    if ((not cloning and context.executing_eagerly() and
+         not K.is_tpu_strategy(distribution)) or
+        is_default_strategy(distribution)):
+      self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                                cloning)
+    else:
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We currently do not support distribution strategy with a '
+          '`Sequential` model that is created without `input_shape`/'
+          '`input_dim` set in its first layer or a subclassed model.'):
+        self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                                  cloning)
+
+  @combinations.generate(all_strategy_combinations_with_graph_mode())
+  def test_dnn_with_dynamic_learning_rate(self, distribution, cloning):
+    if ((not cloning and context.executing_eagerly() and
+         not K.is_tpu_strategy(distribution)) or
+        is_default_strategy(distribution)):
+      self.run_dynamic_lr_test(distribution, cloning)
+    else:
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We currently do not support distribution strategy with a '
+          '`Sequential` model that is created without `input_shape`/'
+          '`input_dim` set in its first layer or a subclassed model.'):
+        self.run_dynamic_lr_test(distribution, cloning)
+
+  @combinations.generate(
+      keras_correctness_test_base
+      .test_combinations_with_tpu_strategies())
+  def test_dnn_correctness_with_partial_last_batch(self, distribution,
+                                                   use_numpy,
+                                                   use_validation_data):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'We currently do not support distribution strategy with a '
+        '`Sequential` model that is created without `input_shape`/'
+        '`input_dim` set in its first layer or a subclassed model.'):
+      self.run_correctness_test(
+          distribution, use_numpy, use_validation_data, partial_last_batch=True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/distribute/keras_embedding_model_correctness_test.py b/tensorflow/python/keras/distribute/keras_embedding_model_correctness_test.py
index ae2651b..396a43b 100644
--- a/tensorflow/python/keras/distribute/keras_embedding_model_correctness_test.py
+++ b/tensorflow/python/keras/distribute/keras_embedding_model_correctness_test.py
@@ -22,14 +22,20 @@
 from tensorflow.python.distribute import combinations
 from tensorflow.python.eager import test
 from tensorflow.python.keras.distribute import keras_correctness_test_base
-from tensorflow.python.training import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 
 
 class DistributionStrategyEmbeddingModelCorrectnessTest(
     keras_correctness_test_base.
     TestDistributionStrategyEmbeddingModelCorrectnessBase):
 
-  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+  def get_model(self,
+                max_words=10,
+                initial_weights=None,
+                distribution=None,
+                cloning=None,
+                input_shapes=None):
+    del input_shapes
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       word_ids = keras.layers.Input(
           shape=(max_words,), dtype=np.int32, name='words')
@@ -46,35 +52,45 @@
         model.set_weights(initial_weights)
 
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
-              learning_rate=0.1),
+          # TODO(b/130808953): Switch back to the V1 optimizer once global_step
+          # is mirrored.
+          optimizer=gradient_descent_keras.SGD(learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
-          metrics=['sparse_categorical_accuracy'])
+          metrics=['sparse_categorical_accuracy'],
+          cloning=cloning)
     return model
 
   @combinations.generate(keras_correctness_test_base.
                          test_combinations_for_embedding_model())
   def test_embedding_model_correctness(self, distribution, use_numpy,
-                                       use_validation_data):
+                                       use_validation_data, cloning):
 
     self.use_distributed_dense = False
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
 
   @combinations.generate(keras_correctness_test_base.
                          test_combinations_for_embedding_model())
-  def test_embedding_time_distributed_model_correctness(self,
-                                                        distribution,
+  def test_embedding_time_distributed_model_correctness(self, distribution,
                                                         use_numpy,
-                                                        use_validation_data):
+                                                        use_validation_data,
+                                                        cloning):
     self.use_distributed_dense = True
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
 
 
 class DistributionStrategySiameseEmbeddingModelCorrectnessTest(
     keras_correctness_test_base.
     TestDistributionStrategyEmbeddingModelCorrectnessBase):
 
-  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+  def get_model(self,
+                max_words=10,
+                initial_weights=None,
+                distribution=None,
+                cloning=None,
+                input_shapes=None):
+    del input_shapes
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       word_ids_a = keras.layers.Input(
           shape=(max_words,), dtype=np.int32, name='words_a')
@@ -101,10 +117,12 @@
       if initial_weights:
         model.set_weights(initial_weights)
 
+      # TODO(b/130808953): Switch back to the V1 optimizer after global_step
+      # is made mirrored.
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
-              learning_rate=0.1),
+          optimizer=gradient_descent_keras.SGD(learning_rate=0.1),
           loss='mse',
+          cloning=cloning,
           metrics=['mse'])
     return model
 
@@ -141,8 +159,9 @@
   @combinations.generate(keras_correctness_test_base.
                          test_combinations_for_embedding_model())
   def test_siamese_embedding_model_correctness(self, distribution, use_numpy,
-                                               use_validation_data):
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+                                               use_validation_data, cloning):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/distribute/keras_image_model_correctness_test.py b/tensorflow/python/keras/distribute/keras_image_model_correctness_test.py
index c05d405..816258b 100644
--- a/tensorflow/python/keras/distribute/keras_image_model_correctness_test.py
+++ b/tensorflow/python/keras/distribute/keras_image_model_correctness_test.py
@@ -28,7 +28,12 @@
 class DistributionStrategyCnnCorrectnessTest(
     keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
 
-  def get_model(self, initial_weights=None, distribution=None):
+  def get_model(self,
+                initial_weights=None,
+                distribution=None,
+                cloning=None,
+                input_shapes=None):
+    del input_shapes
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       image = keras.layers.Input(shape=(28, 28, 3), name='image')
       c1 = keras.layers.Conv2D(
@@ -50,15 +55,12 @@
           optimizer=gradient_descent.SGD(
               learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
-          metrics=['sparse_categorical_accuracy'])
+          metrics=['sparse_categorical_accuracy'],
+          cloning=cloning)
 
     return model
 
-  def get_data(self,
-               count=keras_correctness_test_base._GLOBAL_BATCH_SIZE
-               * keras_correctness_test_base._EVAL_STEPS,
-               shape=(28, 28, 3),
-               num_classes=10):
+  def _get_data(self, count, shape=(28, 28, 3), num_classes=10):
     centers = np.random.randn(num_classes, *shape)
 
     features = []
@@ -70,22 +72,60 @@
       labels.append(label)
       features.append(centers[label] + offset)
 
-    x_train = np.asarray(features, dtype=np.float32)
-    y_train = np.asarray(labels, dtype=np.float32).reshape((count, 1))
+    x = np.asarray(features, dtype=np.float32)
+    y = np.asarray(labels, dtype=np.float32).reshape((count, 1))
+    return x, y
+
+  def get_data(self):
+    x_train, y_train = self._get_data(
+        count=keras_correctness_test_base._GLOBAL_BATCH_SIZE *
+        keras_correctness_test_base._EVAL_STEPS)
     x_predict = x_train
     return x_train, y_train, x_predict
 
+  def get_data_with_partial_last_batch(self):
+    x_train, y_train = self._get_data(count=1280)
+    x_eval, y_eval = self._get_data(count=1000)
+    return x_train, y_train, x_eval, y_eval, x_eval
+
   @combinations.generate(keras_correctness_test_base.
                          all_strategy_and_input_config_combinations())
-  def test_cnn_correctness(self, distribution, use_numpy, use_validation_data):
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+  def test_cnn_correctness(self, distribution, use_numpy, use_validation_data,
+                           cloning):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
 
   @combinations.generate(keras_correctness_test_base.
                          all_strategy_and_input_config_combinations())
   def test_cnn_with_batch_norm_correctness(self, distribution, use_numpy,
-                                           use_validation_data):
+                                           use_validation_data, cloning):
     self.run_correctness_test(distribution, use_numpy, use_validation_data,
-                              with_batch_norm=True)
+                              with_batch_norm=True, cloning=cloning)
+
+  @combinations.generate(
+      keras_correctness_test_base
+      .test_combinations_with_tpu_strategies())
+  def test_cnn_correctness_with_partial_last_batch(self, distribution,
+                                                   use_numpy,
+                                                   use_validation_data):
+    self.run_correctness_test(
+        distribution,
+        use_numpy,
+        use_validation_data,
+        partial_last_batch=True,
+        training_epochs=1)
+
+  @combinations.generate(
+      keras_correctness_test_base
+      .test_combinations_with_tpu_strategies())
+  def test_cnn_with_batch_norm_correctness_and_partial_last_batch(
+      self, distribution, use_numpy, use_validation_data):
+    self.run_correctness_test(
+        distribution,
+        use_numpy,
+        use_validation_data,
+        with_batch_norm=True,
+        partial_last_batch=True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/distribute/keras_lstm_model_correctness_test.py b/tensorflow/python/keras/distribute/keras_lstm_model_correctness_test.py
index d33ac42..af0cd0b 100644
--- a/tensorflow/python/keras/distribute/keras_lstm_model_correctness_test.py
+++ b/tensorflow/python/keras/distribute/keras_lstm_model_correctness_test.py
@@ -22,6 +22,7 @@
 from tensorflow.python.distribute import combinations
 from tensorflow.python.eager import test
 from tensorflow.python.keras.distribute import keras_correctness_test_base
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.training import gradient_descent
 
 
@@ -29,7 +30,13 @@
     keras_correctness_test_base.
     TestDistributionStrategyEmbeddingModelCorrectnessBase):
 
-  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+  def get_model(self,
+                max_words=10,
+                initial_weights=None,
+                distribution=None,
+                cloning=None,
+                input_shapes=None):
+    del input_shapes
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       word_ids = keras.layers.Input(
           shape=(max_words,), dtype=np.int32, name='words')
@@ -44,20 +51,24 @@
       if initial_weights:
         model.set_weights(initial_weights)
 
+      # TODO(b/130808953): Re-enable the V1 optimizer after iterations is
+      # mirrored.
+      optimizer_fn = (
+          gradient_descent.GradientDescentOptimizer
+          if cloning else gradient_descent_keras.SGD)
+
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
-              learning_rate=0.1),
+          optimizer=optimizer_fn(learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
-          metrics=['sparse_categorical_accuracy'])
+          metrics=['sparse_categorical_accuracy'], cloning=cloning)
     return model
 
   @combinations.generate(keras_correctness_test_base.
                          test_combinations_for_embedding_model())
-  def test_lstm_model_correctness(self,
-                                  distribution,
-                                  use_numpy,
-                                  use_validation_data):
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+  def test_lstm_model_correctness(self, distribution, use_numpy,
+                                  use_validation_data, cloning):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              cloning)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/distribute/keras_optimizer_v2_test.py b/tensorflow/python/keras/distribute/keras_optimizer_v2_test.py
index 7f2e5d9..d1cd8a2 100644
--- a/tensorflow/python/keras/distribute/keras_optimizer_v2_test.py
+++ b/tensorflow/python/keras/distribute/keras_optimizer_v2_test.py
@@ -107,8 +107,9 @@
           distribution=[
               strategy_combinations.central_storage_strategy_with_two_gpus,
           ],
-          mode=['graph', 'eager']))
-  def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
+  def testOptimizerWithKerasModelAndNumpyArrays(self, distribution, cloning):
     self.skipTest('b/130309197')
     with self.cached_session():
       with distribution.scope():
@@ -116,7 +117,8 @@
         optimizer = gradient_descent.SGD(0.001)
         loss = 'mse'
         metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics,
+                      cloning=cloning)
 
       inputs = np.zeros((64, 3), dtype=np.float32)
       targets = np.zeros((64, 4), dtype=np.float32)
diff --git a/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py b/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py
index d363d28..a1bedb3 100644
--- a/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py
+++ b/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py
@@ -23,6 +23,7 @@
 from tensorflow.python.distribute import strategy_combinations
 from tensorflow.python.eager import test
 from tensorflow.python.keras.distribute import keras_correctness_test_base
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.training import gradient_descent
 
 
@@ -41,7 +42,8 @@
           distribution=strategies_for_stateful_embedding_model(),
           mode='graph',
           use_numpy=False,
-          use_validation_data=False
+          use_validation_data=False,
+          cloning=[True, False]
       ))
 
 
@@ -49,7 +51,13 @@
     keras_correctness_test_base.
     TestDistributionStrategyEmbeddingModelCorrectnessBase):
 
-  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+  def get_model(self,
+                max_words=10,
+                initial_weights=None,
+                distribution=None,
+                cloning=None,
+                input_shapes=None):
+    del input_shapes
     batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE
 
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
@@ -69,9 +77,14 @@
       if initial_weights:
         model.set_weights(initial_weights)
 
+      # TODO(b/130808953): Re-enable the V1 optimizer after iterations
+      # is mirrored.
+      optimizer_fn = (
+          gradient_descent.GradientDescentOptimizer
+          if cloning else gradient_descent_keras.SGD)
+
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
-              learning_rate=0.1),
+          optimizer=optimizer_fn(learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
-          metrics=['sparse_categorical_accuracy'])
+          metrics=['sparse_categorical_accuracy'], cloning=cloning)
     return model
@@ -80,21 +93,27 @@
   def test_stateful_lstm_model_correctness(self,
                                            distribution,
                                            use_numpy,
-                                           use_validation_data):
+                                           use_validation_data,
+                                           cloning):
     self.run_correctness_test(distribution, use_numpy, use_validation_data,
-                              is_stateful_model=True)
+                              is_stateful_model=True, cloning=cloning)
 
-  @combinations.generate(keras_correctness_test_base.
-                         test_combinations_with_tpu_strategies())
+  @combinations.generate(
+      combinations.times(
+          keras_correctness_test_base.test_combinations_with_tpu_strategies(),
+          combinations.combine(cloning=[True, False])))
   def test_incorrectly_use_multiple_cores_for_stateful_lstm_model(
-      self, distribution, use_numpy, use_validation_data):
-    with self.assertRaisesRegexp(ValueError,
-                                 'Single core must be used for computation '
-                                 'on stateful models. Consider adding '
-                                 '`device_assignment` parameter to '
-                                 'TPUStrategy'):
-      self.run_correctness_test(distribution, use_numpy, use_validation_data,
-                                is_stateful_model=True)
+      self, distribution, use_numpy, use_validation_data, cloning):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Single core must be used for computation on stateful models. Consider '
+        'adding `device_assignment` parameter to TPUStrategy'):
+      self.run_correctness_test(
+          distribution,
+          use_numpy,
+          use_validation_data,
+          is_stateful_model=True,
+          cloning=cloning)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/distribute/keras_two_input_layers_correctness_test.py b/tensorflow/python/keras/distribute/keras_two_input_layers_correctness_test.py
deleted file mode 100644
index 1f61788..0000000
--- a/tensorflow/python/keras/distribute/keras_two_input_layers_correctness_test.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Correctness tests for tf.keras DNN model using DistributionStrategy."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations
-from tensorflow.python.distribute import tpu_strategy
-from tensorflow.python.eager import test
-from tensorflow.python.framework import dtypes
-from tensorflow.python.keras.distribute import keras_correctness_test_base
-from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
-from tensorflow.python.ops import math_ops
-
-
-class TestTwoInputLayersCorrectness(
-    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
-
-  _batch_size = 8000
-  _num_users = 7000
-  _num_items = 7000
-
-  def get_model(self, initial_weights=None, distribution=None):
-    with keras_correctness_test_base.MaybeDistributionScope(distribution):
-
-      batch_size = self._distribution_to_test.num_replicas_in_sync
-
-      user_input = keras.layers.Input(
-          shape=(self._batch_size,),
-          batch_size=batch_size,
-          name="users",
-          dtype=dtypes.int32)
-
-      item_input = keras.layers.Input(
-          shape=(self._batch_size,),
-          batch_size=batch_size,
-          name="items",
-          dtype=dtypes.int32)
-
-      concat = keras.layers.concatenate([user_input, item_input], axis=-1)
-      logits = keras.layers.Dense(
-          1, name="rating")(
-              math_ops.cast(concat, dtypes.float32))
-
-      keras_model = keras.Model(inputs=[user_input, item_input], outputs=logits)
-
-      if initial_weights:
-        keras_model.set_weights(initial_weights)
-
-      keras_model.compile(loss="mse", optimizer=gradient_descent_keras.SGD(0.5))
-      return keras_model
-
-  def get_data(self):
-    users, items, labels = self._get_raw_data()
-    x_train = {"users": users, "items": items}
-    y_train = labels
-    data = x_train, y_train
-    dataset = dataset_ops.Dataset.from_tensors(data).repeat()
-    dataset = dataset.batch(self._distribution_to_test.num_replicas_in_sync)
-    return dataset, None, None
-
-  def _get_raw_data(self):
-    np.random.seed(1337)
-
-    users = np.random.randint(0, self._num_users, size=(self._batch_size,))
-    items = np.random.randint(0, self._num_users, size=(self._batch_size,))
-    labels = np.random.randint(0, 10000, size=(self._batch_size,))
-
-    users = users.astype("int32")
-    items = items.astype("int32")
-    labels = labels.astype("int32")
-
-    return users, items, labels
-
-  def get_input_for_correctness_test(self, **kwargs):
-    update_freq = None
-    if (isinstance(self._distribution_to_test,
-                   (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)) and
-        self._distribution_to_test.extended.steps_per_run > 1):
-      # For TPUStrategy with steps_per_run > 1, the callback is not invoked
-      # every step. So, to compare the CPU/TPU, we let the CPU to behave the
-      # same as TPU.
-      update_freq = self._distribution_to_test.extended.steps_per_run
-
-    dataset, _, _ = self.get_data()
-    learning_rate_scheduler = (
-        keras_correctness_test_base.LearningRateBatchScheduler(update_freq))
-    training_inputs = {
-        "x": dataset,
-        "epochs": 1,
-        "steps_per_epoch": 1,
-        "verbose": 2,
-        "callbacks": [learning_rate_scheduler]
-    }
-
-    return training_inputs, None, None
-
-  def get_input_for_dynamic_lr_test(self, **kwargs):
-    dataset, _, _ = self.get_data()
-    training_inputs = {
-        "x": dataset,
-        "epochs": 1,
-        "steps_per_epoch": 1,
-        "verbose": 2
-    }
-
-    return training_inputs, None, None
-
-  def run_correctness_test(self,
-                           distribution,
-                           use_numpy,
-                           use_validation_data,
-                           with_batch_norm=False,
-                           is_stateful_model=False):
-    self._distribution_to_test = distribution
-    super(TestTwoInputLayersCorrectness, self).run_correctness_test(
-        distribution,
-        use_numpy,
-        use_validation_data,
-        with_batch_norm=False,
-        is_stateful_model=False)
-
-  @combinations.generate(
-      keras_correctness_test_base
-      .all_strategies_excluding_tpu_and_input_config_combinations())
-  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data):
-    self.run_correctness_test(distribution, use_numpy, use_validation_data)
-
-  def run_dynamic_lr_test(self, distribution):
-    self._distribution_to_test = distribution
-    super(TestTwoInputLayersCorrectness, self).run_dynamic_lr_test(distribution)
-
-  @combinations.generate(
-      keras_correctness_test_base
-      .all_strategies_excluding_tpu_and_input_config_combinations())
-  def test_dnn_with_dynamic_learning_rate(self, distribution, use_numpy,
-                                          use_validation_data):
-    self.run_dynamic_lr_test(distribution)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/keras/distribute/keras_utils_test.py b/tensorflow/python/keras/distribute/keras_utils_test.py
index a72d892..ae2c762 100644
--- a/tensorflow/python/keras/distribute/keras_utils_test.py
+++ b/tensorflow/python/keras/distribute/keras_utils_test.py
@@ -28,12 +28,13 @@
 from tensorflow.python.distribute import strategy_combinations
 from tensorflow.python.distribute import tpu_strategy
 from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import losses
 from tensorflow.python.keras.distribute import distribute_strategy_test as keras_test_lib
 from tensorflow.python.keras.distribute import distributed_training_utils
-from tensorflow.python.keras.optimizer_v2 import rmsprop as rms_prop_keras
 from tensorflow.python.training import gradient_descent
 
 
@@ -70,11 +71,14 @@
 class TestDistributionStrategyWithCallbacks(test.TestCase,
                                             parameterized.TestCase):
 
-  @combinations.generate(keras_test_lib.all_strategy_combinations())
-  def test_callbacks_in_fit(self, distribution):
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_callbacks_in_fit(self, distribution, cloning):
     with distribution.scope():
       model = keras_test_lib.get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+      model.compile(
+          optimizer='sgd', loss='mse', metrics=['mae'], cloning=cloning)
 
     dataset = keras_test_lib.get_dataset(distribution)
     counter = Counter()
@@ -119,11 +123,14 @@
             'on_train_end': 1
         })
 
-  @combinations.generate(keras_test_lib.all_strategy_combinations())
-  def test_callbacks_in_eval(self, distribution):
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_callbacks_in_eval(self, distribution, cloning):
     with distribution.scope():
       model = keras_test_lib.get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+      model.compile(
+          optimizer='sgd', loss='mse', metrics=['mae'], cloning=cloning)
 
     dataset = keras_test_lib.get_dataset(distribution)
     counter = Counter()
@@ -138,11 +145,14 @@
             'on_test_end': 1
         })
 
-  @combinations.generate(keras_test_lib.all_strategy_combinations())
-  def test_callbacks_in_predict(self, distribution):
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(cloning=[True, False])))
+  def test_callbacks_in_predict(self, distribution, cloning):
     with distribution.scope():
       model = keras_test_lib.get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+      model.compile(
+          optimizer='sgd', loss='mse', metrics=['mae'], cloning=cloning)
 
     dataset = keras_test_lib.get_dataset(distribution)
     counter = Counter()
@@ -168,7 +178,7 @@
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
           ],
-          mode=['graph', 'eager']))
+          mode=['graph']))
   def test_validating_dataset_input_tensors_with_shape_mismatch(
       self, distribution):
     with self.cached_session():
@@ -218,15 +228,16 @@
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
           ],
-          mode=['graph', 'eager']))
-  def test_unsupported_features(self, distribution):
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
+  def test_unsupported_features(self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = keras_test_lib.get_model()
         optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       dataset = keras_test_lib.get_dataset(distribution)
 
@@ -280,15 +291,17 @@
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
           ],
-          mode=['graph', 'eager']))
-  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
+  def test_calling_with_unsupported_predefined_callbacks(
+      self, distribution, cloning):
     with self.cached_session():
       with distribution.scope():
         model = keras_test_lib.get_model()
         optimizer = gradient_descent.GradientDescentOptimizer(0.001)
         loss = 'mse'
         metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+        model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
       dataset = keras_test_lib.get_dataset(distribution)
 
@@ -318,8 +331,9 @@
   @combinations.generate(
       combinations.combine(
           distribution=[strategy_combinations.one_device_strategy],
-          mode=['eager']))
-  def test_distribution_strategy_with_run_eagerly(self, distribution):
+          mode=['eager'],
+          cloning=[True, False]))
+  def test_distribution_strategy_with_run_eagerly(self, distribution, cloning):
     with distribution.scope():
       x = keras.layers.Input(shape=(1,))
       y = keras.layers.Dense(1, kernel_initializer='ones')(x)
@@ -328,7 +342,7 @@
       err_msg = ('We currently do not support enabling `run_eagerly` with '
                  'distribution strategy.')
       with self.assertRaisesRegex(ValueError, err_msg):
-        model.compile('sgd', run_eagerly=True)
+        model.compile('sgd', run_eagerly=True, cloning=cloning)
 
   # TODO(b/124377929): Remove error assertions once subclassed models
   # are supported in DistributedStrategy.
@@ -336,9 +350,12 @@
       combinations.combine(
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+              strategy_combinations.one_device_strategy,
           ],
-          mode=['graph', 'eager']))
-  def test_distribution_strategy_on_subclassed_model(self, distribution):
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
+  def test_distribution_strategy_on_subclassed_model(self, distribution,
+                                                     cloning):
     with distribution.scope():
 
       class _SimpleMLP(keras.Model):
@@ -352,34 +369,53 @@
 
       model = _SimpleMLP(3)
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          'We currently do not support distribution strategy with a '
-          '`Sequential` model that is created without '
-          '`input_shape`/`input_dim` set in its first layer or '
-          'a subclassed model.'):
-        model.compile('sgd')
+      if cloning or not context.executing_eagerly():
+        with self.assertRaisesRegexp(
+            ValueError,
+            'We currently do not support distribution strategy with a '
+            '`Sequential` model that is created without `input_shape`/'
+            '`input_dim` set in its first layer or a subclassed model.'):
+          model.compile('sgd', cloning=cloning)
+      else:
+        model.compile('sgd', cloning=cloning)
 
   @combinations.generate(
       combinations.combine(
           distribution=[
               strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+              strategy_combinations.one_device_strategy,
           ],
-          mode=['graph', 'eager']))
+          mode=['graph', 'eager'],
+          cloning=[True, False]))
   def test_distribution_strategy_on_deferred_sequential_model(
-      self, distribution):
+      self, distribution, cloning):
     with distribution.scope():
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(16, activation='relu'))
       model.add(keras.layers.Dense(3, activation='softmax'))
 
+      if not cloning and context.executing_eagerly():
+        model.compile('sgd', cloning=cloning)
+      else:
+        with self.assertRaisesRegexp(
+            ValueError,
+            'We currently do not support distribution strategy with a '
+            '`Sequential` model that is created without '
+            '`input_shape`/`input_dim` set in its first layer or '
+            'a subclassed model.'):
+          model.compile('sgd', cloning=cloning)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_standalone_loss_without_loss_reduction(self, distribution):
+    with distribution.scope():
+      loss_object = losses.MeanSquaredError()
+
       with self.assertRaisesRegexp(
-          ValueError,
-          'We currently do not support distribution strategy with a '
-          '`Sequential` model that is created without '
-          '`input_shape`/`input_dim` set in its first layer or '
-          'a subclassed model.'):
-        model.compile('sgd')
+          ValueError, 'Please use `tf.keras.losses.Reduction.SUM` or '
+          '`tf.keras.losses.Reduction.NONE`'):
+        y = np.asarray([1, 0])
+        loss_object(y, y)
 
 
 class TestDistributionStrategyWithLossMasking(test.TestCase,
@@ -388,12 +424,20 @@
   # TODO(priyag): Enable all strategies for this test. Currently it does not
   # work for TPU due to some invalid datatype.
   @combinations.generate(
-      combinations.combine(
-          distribution=[
-              strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
-          ],
-          mode=['graph', 'eager']))
-  def test_masking(self, distribution):
+      combinations.times(
+          combinations.combine(
+              distribution=[
+                  strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+              ],
+              mode=['graph', 'eager']),
+          combinations.combine(
+              cloning=True,
+              optimizer=strategy_combinations.gradient_descent_optimizer_v1_fn)
+          + combinations.combine(
+              cloning=False,
+              optimizer=strategy_combinations
+              .gradient_descent_optimizer_keras_v2_fn)))
+  def test_masking(self, distribution, cloning, optimizer):
     with self.cached_session():
       np.random.seed(1337)
       x = np.array([[[1], [1]], [[0], [0]]])
@@ -403,9 +447,7 @@
         model.add(
             keras.layers.TimeDistributed(
                 keras.layers.Dense(1, kernel_initializer='one')))
-        model.compile(
-            loss='mse',
-            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+        model.compile(loss='mse', optimizer=optimizer(), cloning=cloning)
       y = np.array([[[1], [1]], [[1], [1]]])
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
@@ -418,9 +460,17 @@
                                                      parameterized.TestCase):
 
   @combinations.generate(
-      combinations.times(keras_test_lib.all_strategy_combinations(),
-                         combinations.combine(fused=[True, False])))
-  def test_batchnorm_correctness(self, distribution, fused):
+      combinations.times(
+          keras_test_lib.all_strategy_combinations(),
+          combinations.combine(fused=[True, False]),
+          combinations.combine(
+              cloning=True,
+              optimizer=strategy_combinations.gradient_descent_optimizer_v1_fn)
+          + combinations.combine(
+              cloning=False,
+              optimizer=strategy_combinations
+              .gradient_descent_optimizer_keras_v2_fn)))
+  def test_batchnorm_correctness(self, distribution, fused, optimizer, cloning):
     with self.cached_session():
       with distribution.scope():
         model = keras.models.Sequential()
@@ -431,9 +481,7 @@
                 30,
             ), momentum=0.8, fused=fused)
         model.add(norm)
-        model.compile(
-            loss='mse',
-            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+        model.compile(loss='mse', optimizer=optimizer(), cloning=cloning)
 
       # centered on 5.0, variance 10.0
       x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 20, 30))
@@ -459,28 +507,42 @@
                                               parameterized.TestCase):
 
   @combinations.generate(
-      keras_test_lib.all_strategy_combinations_minus_default())
-  def test_save_load_h5(self, distribution):
+      combinations.times(
+          keras_test_lib.all_strategy_combinations_minus_default(),
+          combinations.combine(
+              cloning=True,
+              optimizer=strategy_combinations.rmsprop_optimizer_v1_fn) +
+          combinations.combine(
+              cloning=False,
+              optimizer=strategy_combinations.rmsprop_optimizer_keras_v2_fn)))
+  def test_save_load_h5(self, distribution, optimizer, cloning):
     with self.cached_session():
       dataset = keras_test_lib.get_dataset(distribution)
       with distribution.scope():
         model = keras_test_lib.get_model()
-        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model.compile(optimizer(), 'mse', cloning=cloning)
         model.fit(dataset, epochs=1, steps_per_epoch=1)
 
         weights_file = tempfile.mktemp('.h5')
         model.save_weights(weights_file)
 
         model_2 = keras_test_lib.get_model()
-        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model_2.compile(optimizer(), 'mse', cloning=cloning)
         model_2.load_weights(weights_file)
         model_2.predict(
             keras_test_lib.get_predict_dataset(distribution), steps=2)
         model_2.fit(dataset, epochs=1, steps_per_epoch=1)
 
   @combinations.generate(
-      keras_test_lib.all_strategy_combinations_minus_default())
-  def test_save_load_trackable(self, distribution):
+      combinations.times(
+          keras_test_lib.all_strategy_combinations_minus_default(),
+          combinations.combine(
+              cloning=True,
+              optimizer=strategy_combinations.rmsprop_optimizer_v1_fn) +
+          combinations.combine(
+              cloning=False,
+              optimizer=strategy_combinations.rmsprop_optimizer_keras_v2_fn)))
+  def test_save_load_trackable(self, distribution, optimizer, cloning):
     # TODO(b/123533246): Enable the test for TPU once bug is fixed
     if (isinstance(distribution, (tpu_strategy.TPUStrategy,
                                   tpu_strategy.TPUStrategyV1)) and
@@ -490,14 +552,14 @@
       dataset = keras_test_lib.get_dataset(distribution)
       with distribution.scope():
         model = keras_test_lib.get_model()
-        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model.compile(optimizer(), 'mse', cloning=cloning)
         model.fit(dataset, epochs=1, steps_per_epoch=1)
 
         weights_file = tempfile.mktemp()
         model.save_weights(weights_file)
 
         model_2 = keras_test_lib.get_model()
-        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model_2.compile(optimizer(), 'mse', cloning=cloning)
         model_2.load_weights(weights_file)
         model_2.predict(
             keras_test_lib.get_predict_dataset(distribution), steps=2)
@@ -507,8 +569,10 @@
 class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(
-      keras_test_lib.all_strategy_combinations_minus_default())
-  def test_layer_outside_scope(self, distribution):
+      combinations.times(
+          keras_test_lib.all_strategy_combinations_minus_default(),
+          combinations.combine(cloning=[True, False])))
+  def test_layer_outside_scope(self, distribution, cloning):
     with self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'was not created in the distribution strategy'):
@@ -519,11 +583,13 @@
           optimizer = gradient_descent.GradientDescentOptimizer(0.001)
           loss = 'mse'
           metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+          model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
   @combinations.generate(
-      keras_test_lib.all_strategy_combinations_minus_default())
-  def test_model_outside_scope(self, distribution):
+      combinations.times(
+          keras_test_lib.all_strategy_combinations_minus_default(),
+          combinations.combine(cloning=[True, False])))
+  def test_model_outside_scope(self, distribution, cloning):
     with self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'was not created in the distribution strategy'):
@@ -534,7 +600,7 @@
           optimizer = gradient_descent.GradientDescentOptimizer(0.001)
           loss = 'mse'
           metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+          model.compile(optimizer, loss, metrics=metrics, cloning=cloning)
 
 
 class TestDistributionStrategyWithStaticShapes(test.TestCase,
diff --git a/tensorflow/python/keras/distribute/mnist_multi_worker.py b/tensorflow/python/keras/distribute/mnist_multi_worker.py
new file mode 100644
index 0000000..70e6d36
--- /dev/null
+++ b/tensorflow/python/keras/distribute/mnist_multi_worker.py
@@ -0,0 +1,197 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Trains a Keras Model using MultiWorkerMirroredStrategy and native APIs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import collective_all_reduce_strategy as collective_strategy
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import utils
+from tensorflow.python.keras.datasets import mnist
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import app
+from tensorflow.python.platform import tf_logging as logging
+
+NUM_CLASSES = 10
+
+flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
+flags.DEFINE_enum('distribution_strategy', None, ['multi_worker_mirrored'],
+                  'The Distribution Strategy to use.')
+flags.DEFINE_string('model_dir', None, 'Directory for TensorBoard/Checkpoint.')
+
+
+# TODO(rchao): Use multi_worker_util.maybe_shard_dataset() once that is provided
+# there.
+def maybe_shard_dataset(dataset):
+  """Shard the dataset if running in multi-node environment."""
+  cluster_resolver = TFConfigClusterResolver()
+  cluster_spec = cluster_resolver.cluster_spec().as_dict()
+  if cluster_spec:
+    dataset = dataset.shard(
+        multi_worker_util.worker_count(cluster_spec,
+                                       cluster_resolver.task_type),
+        multi_worker_util.id_in_cluster(
+            cluster_spec, cluster_resolver.task_type, cluster_resolver.task_id))
+  return dataset
+
+
+def get_data_shape():
+  # input image dimensions
+  img_rows, img_cols = 28, 28
+  if backend.image_data_format() == 'channels_first':
+    return 1, img_rows, img_cols
+  else:
+    return img_rows, img_cols, 1
+
+
+def get_input_datasets(use_bfloat16=False):
+  """Downloads the MNIST dataset and creates train and eval dataset objects.
+
+  Args:
+    use_bfloat16: Boolean to determine if input should be cast to bfloat16
+
+  Returns:
+    Train dataset and eval dataset. The dataset doesn't include batch dim.
+
+  """
+  cast_dtype = dtypes.bfloat16 if use_bfloat16 else dtypes.float32
+
+  # the data, split between train and test sets
+  (x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+  train_data_shape = (x_train.shape[0],) + get_data_shape()
+  test_data_shape = (x_test.shape[0],) + get_data_shape()
+  if backend.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(train_data_shape)
+    x_test = x_test.reshape(test_data_shape)
+  else:
+    x_train = x_train.reshape(train_data_shape)
+    x_test = x_test.reshape(test_data_shape)
+
+  x_train = x_train.astype('float32')
+  x_test = x_test.astype('float32')
+  x_train /= 255
+  x_test /= 255
+
+  # convert class vectors to binary class matrices
+  y_train = utils.to_categorical(y_train, NUM_CLASSES)
+  y_test = utils.to_categorical(y_test, NUM_CLASSES)
+
+  # train dataset
+  train_ds = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+  # TODO(rchao): Remove maybe_shard_dataset() once auto-sharding is done.
+  train_ds = maybe_shard_dataset(train_ds)
+  train_ds = train_ds.repeat()
+  train_ds = train_ds.map(lambda x, y: (math_ops.cast(x, cast_dtype), y))
+  train_ds = train_ds.batch(64, drop_remainder=True)
+
+  # eval dataset
+  eval_ds = dataset_ops.Dataset.from_tensor_slices((x_test, y_test))
+  # TODO(rchao): Remove maybe_shard_dataset() once auto-sharding is done.
+  eval_ds = maybe_shard_dataset(eval_ds)
+  eval_ds = eval_ds.repeat()
+  eval_ds = eval_ds.map(lambda x, y: (math_ops.cast(x, cast_dtype), y))
+  eval_ds = eval_ds.batch(64, drop_remainder=True)
+
+  return train_ds, eval_ds
+
+
+def get_model(index=0):
+  """Builds a Sequential CNN model to recognize MNIST digits.
+
+  Args:
+    index: The worker index. Defaults to 0.
+
+  Returns:
+    a CNN Keras model used for MNIST
+
+  """
+
+  # Define a CNN model to recognize MNIST digits.
+  model = keras.models.Sequential()
+  model.add(
+      keras.layers.Conv2D(
+          32,
+          kernel_size=(3, 3),
+          activation='relu',
+          input_shape=get_data_shape()))
+  model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
+  model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
+  model.add(keras.layers.Dropout(0.25, name='dropout_worker%s_first' % index))
+  model.add(keras.layers.Flatten())
+  model.add(keras.layers.Dense(128, activation='relu'))
+  model.add(keras.layers.Dropout(0.5, name='dropout_worker%s_second' % index))
+  model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+  return model
+
+
+def main(_):
+  if flags.FLAGS.enable_eager:
+    ops.enable_eager_execution()
+    logging.info('Eager execution enabled for MNIST Multi-Worker.')
+  else:
+    logging.info('Eager execution not enabled for MNIST Multi-Worker.')
+
+  # Build the train and eval datasets from the MNIST data.
+  train_ds, eval_ds = get_input_datasets()
+
+  if flags.FLAGS.distribution_strategy == 'multi_worker_mirrored':
+    # MultiWorkerMirroredStrategy for multi-worker distributed MNIST training.
+    strategy = collective_strategy.CollectiveAllReduceStrategy()
+  else:
+    raise ValueError('Only `multi_worker_mirrored` is supported strategy '
+                     'in Keras MNIST example at this time. Strategy passed '
+                     'in is %s' % flags.FLAGS.distribution_strategy)
+
+  # Create and compile the model under Distribution strategy scope.
+  # `fit`, `evaluate` and `predict` will be distributed based on the strategy
+  # model was compiled with.
+  with strategy.scope():
+    model = get_model()
+    optimizer = rmsprop.RMSProp(learning_rate=0.001)
+    model.compile(
+        loss=keras.losses.categorical_crossentropy,
+        optimizer=optimizer,
+        metrics=['accuracy'])
+
+  # Train the model with the train dataset.
+  tensorboard_callback = keras.callbacks.TensorBoard(
+      log_dir=flags.FLAGS.model_dir)
+  model.fit(
+      x=train_ds,
+      epochs=20,
+      steps_per_epoch=468,
+      callbacks=[tensorboard_callback])
+
+  # Evaluate the model with the eval dataset.
+  score = model.evaluate(eval_ds, steps=10, verbose=0)
+  logging.info('Test loss:{}'.format(score[0]))
+  logging.info('Test accuracy:{}'.format(score[1]))
+
+
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  app.run()
diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_test.py
index fa54fe7..efa7d9b 100644
--- a/tensorflow/python/keras/distribute/multi_worker_callback_test.py
+++ b/tensorflow/python/keras/distribute/multi_worker_callback_test.py
@@ -17,9 +17,11 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
+import json
 import os
 import sys
+import tempfile
+import threading
 
 from absl.testing import parameterized
 
@@ -240,20 +242,37 @@
   def callableForTestLoadWeightFromModelCheckpoint(model, test_obj, train_ds,
                                                    num_epoch, steps, strategy,
                                                    saving_filepath):
+    filepaths = []
+    real_mkstemp = tempfile.mkstemp
+    def mocked_mkstemp():
+      # Only non-chief should call tempfile.mkstemp() inside fit() in sync
+      # training.
+      assert not test_base.is_chief()
+      file_handle, temp_file_name = real_mkstemp()
+      extension = os.path.splitext(saving_filepath)[1]
+      temp_filepath = temp_file_name + extension
+      filepaths.append(temp_filepath)
+      return file_handle, temp_file_name
 
-    saving_filepath, history_after_one_more_epoch = \
-        KerasMultiWorkerCallbackTest.initialFitting(
-            test_obj, model, train_ds, num_epoch, steps, saving_filepath)
+    # Mock tempfile.mkstemp() so the filepaths can be stored and verified later.
+    with test.mock.patch.object(tempfile, 'mkstemp', mocked_mkstemp):
+      saving_filepath, history_after_one_more_epoch = \
+          KerasMultiWorkerCallbackTest.initialFitting(
+              test_obj, model, train_ds, num_epoch, steps, saving_filepath)
 
-    with strategy.scope():
-      model.load_weights(saving_filepath)
+      with strategy.scope():
+        model.load_weights(saving_filepath)
 
-    history_after_loading_weight_and_one_more_epoch = model.fit(
-        x=train_ds, epochs=1, steps_per_epoch=steps)
+      history_after_loading_weight_and_one_more_epoch = model.fit(
+          x=train_ds, epochs=1, steps_per_epoch=steps)
 
-    test_obj.assertAllClose(
-        history_after_one_more_epoch.history,
-        history_after_loading_weight_and_one_more_epoch.history)
+      test_obj.assertAllClose(
+          history_after_one_more_epoch.history,
+          history_after_loading_weight_and_one_more_epoch.history)
+
+    # Verify the temp files are indeed removed (no trace left behind).
+    for filepath in filepaths:
+      assert not os.path.exists(filepath)
 
   @staticmethod
   def callableForTestModelRestoreCallback(model, test_obj, train_ds, num_epoch,
@@ -318,8 +337,7 @@
       model.compile(
           loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
-    # TODO(b/129779608): Fix the flakiness of the following check.
-    # test_obj.assertTrue(os.path.exists(saving_filepath))
+    test_obj.assertTrue(os.path.exists(saving_filepath))
 
     # Unmatched format. Should raise ValueError.
     with test_obj.assertRaisesRegexp(ValueError, 'Error loading file from'):
@@ -334,6 +352,258 @@
                   load_weights_on_restart=True)
           ])
 
+  class PreemptionAtBatchBoundarySimulatingCallback(callbacks.Callback):
+    """Callback to simulate preemption at batch boundary."""
+
+    def on_epoch_begin(self, epoch, logs=None):
+      self._current_epoch = epoch
+
+    def on_batch_begin(self, batch, logs=None):
+      if self._current_epoch == 1 and batch == 1 and not test_base.is_chief():
+        # Simulate preemption at the start of second batch of second epoch.
+        raise RuntimeError('Preemption!')
+
+    def on_batch_end(self, batch, logs=None):
+      assert self._current_epoch < 1 or batch < 1
+
+    def on_epoch_end(self, epoch, logs=None):
+      assert epoch < 1
+
+  class PreemptionAtEpochBoundarySimulatingCallback(callbacks.Callback):
+    """Callback to simulate preemption at epoch boundary."""
+
+    def on_epoch_begin(self, epoch, logs=None):
+      if epoch == 1 and not test_base.is_chief():
+        # Simulate preemption at the start of second epoch.
+        raise RuntimeError('Preemption!')
+
+    def on_epoch_end(self, epoch, logs=None):
+      assert epoch < 1
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
+          required_gpus=[0, 1],
+          file_format=['h5'],  # TODO(rchao): Support TF format.
+          preemption_callback=[
+              PreemptionAtEpochBoundarySimulatingCallback,
+              PreemptionAtBatchBoundarySimulatingCallback
+          ]))
+  def testFaultToleranceInSyncStrategy(self, strategy_cls, file_format,
+                                       preemption_callback):
+    """Test fault-tolerance with multi-threading using sync dist-strat.
+
+    This test simulates multi-worker training that is interrupted by a
+    preemption, by having two threads, one representing the chief and one a
+    non-chief worker, where the non-chief raises an error in the middle of
+    training loop. Upon catching the error, a new thread with a new cluster
+    spec is created to simulate the recovered non-chief worker. Meanwhile, the
+    chief worker cannot proceed and hangs since the non-chief worker has
+    crashed. To simulate a restart of the chief, a new thread has been prepared
+    to run to take over chief with the help of a condition variable. It is
+    expected that after the restart of both chief and non-chief workers, the
+    training continues from the epoch they previously failed at. The test
+    concludes by verifying the preemption-interrupted training can finish with
+    the same loss and accuracy had the preemption not occurred.
+
+    Arguments:
+      strategy_cls: The strategy class to use.
+      file_format: `h5` or `tf`.
+      preemption_callback: The callback to simulate preemption.
+    """
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+        # Condition variable that blocks the thread that represents the
+        # restarted chief.
+        cv = kwargs.get('cv', None)
+        # `before_restart` is True for the threads that represent the original
+        # chief and non-chief worker, and False for threads that represent the
+        # restarted chief and non-chief workers.
+        before_restart = kwargs['before_restart']
+        if kwargs['new_chief']:
+          # `new_chief` is only True for the restarted chief thread. It waits
+          # until non-chief is preempted and restarted to simulate the causality
+          # where chief's restart results from non-chief's failure.
+          cv.acquire()
+          while not hasattr(cv, 'preempted'):
+            cv.wait()
+          cv.release()
+
+        # Model building under strategy scope. Following is the code we expect
+        # the user runs on every worker.
+        strategy = get_strategy_object(strategy_cls)
+        batch_size = 64
+        steps = 3
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        with strategy.scope():
+          model = _get_model((28, 28, 1))
+
+        # Function to start a new thread. This will be called twice in the
+        # following code: one represents the restart of the non-chief, and one
+        # represents the restart of the chief as a result of the restart of the
+        # non-chief (so the training can continue in sync).
+        def start_new_thread(new_chief=False):
+          new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
+          new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports']
+          return self._run_task_in_thread(
+              task_fn=_independent_worker_fn,
+              cluster_spec=None,
+              task_type=None,
+              task_id=None,
+              tf_config=new_thread_tf_config,
+              before_restart=False,
+              cv=cv,
+              new_chief=new_chief)
+
+        if test_base.is_chief() and before_restart:
+          # Chief to start a new thread (that will be blocked by a condition
+          # variable until the non-chief's new thread is started). The thread
+          # for (recovered) chief is started before entering `fit()` because
+          # the original chief thread will eventually hang and be ignored.
+          start_new_thread(new_chief=True)
+
+        try:
+
+          class CkptSavedEpochAssertingCallback(callbacks.Callback):
+
+            def __init__(self, test_obj):
+              super(CkptSavedEpochAssertingCallback, self).__init__()
+              self.test_obj = test_obj
+
+            def on_epoch_begin(self, epoch, logs=None):
+              # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
+              self.test_obj.assertEqual(self.model._ckpt_saved_epoch is None,
+                                        epoch == 0)
+
+          callbacks_list = [
+              callbacks.ModelCheckpoint(
+                  filepath=saving_filepath,
+                  save_weights_only=True,
+                  load_weights_on_restart=True),
+              CkptSavedEpochAssertingCallback(self)
+          ]
+          if before_restart:
+            callbacks_list.append(preemption_callback())
+
+          self.assertIsNone(model._ckpt_saved_epoch)
+          history = model.fit(
+              x=train_ds,
+              epochs=num_epoch,
+              steps_per_epoch=steps,
+              callbacks=callbacks_list)
+          self.assertIsNone(model._ckpt_saved_epoch)
+
+          # `history` of the training result is collected to be compared against
+          # each other. It is expected that the training results (loss and
+          # accuracy) are the same with or without preemption.
+          self._histories.append(history.history)
+
+        except RuntimeError:
+          # pylint: disable=g-assert-in-except
+          self.assertTrue(before_restart)
+          # Reset the barrier so the new threads simulating recovery can
+          # continue.
+          self._barrier._counter = 0
+          self._barrier._flag = False
+
+          # Now that the non-chief has been preempted, it notifies the thread
+          # that simulates the restarted chief to start so they can be back in
+          # sync.
+          cv.acquire()
+          cv.preempted = True
+          cv.notify()
+          cv.release()
+
+          # At this point we should discard the original non-chief thread, and
+          # start the new thread that simulates the restarted non-chief, hence
+          # joining the thread and return.
+          self.join_independent_workers([start_new_thread()])
+          return
+
+        # Successful end of a `fit()` call.
+        self._successful_thread_ends += 1
+        self.assertFalse(before_restart)
+
+    # Common parameters
+    num_workers = 2
+    num_epoch = 3
+    # History list storing the results for preemption and no preemption cases.
+    self._histories = []
+    # Pass `saving_filepath` from the parent thread to ensure every worker has
+    # the same filepath to save.
+    saving_filepath = os.path.join(self.get_temp_dir(),
+                                   'checkpoint.' + file_format)
+    strategy = get_strategy_object(strategy_cls)
+
+    # Case 1: Training for `num_epoch` without preemptions.
+    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
+    self._barrier = dc._Barrier(2)
+    self._successful_thread_ends = 0
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        saving_filepath=saving_filepath,
+        before_restart=False,
+        new_chief=False)
+    if os.path.exists(saving_filepath):
+      os.remove(saving_filepath)
+    threads_to_join = []
+    if strategy.extended.experimental_between_graph:
+      for ts in threads.values():
+        threads_to_join.extend(ts)
+    else:
+      threads_to_join = [threads['worker'][0]]
+    self.join_independent_workers(threads_to_join)
+    self.assertEqual(self._successful_thread_ends, 2)
+
+    # Case 2: Training for `num_epoch` epoch with preemptions.
+    # The preemption is simulated at both epoch boundary and batch boundary.
+    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
+    cv = threading.Condition()
+    self._barrier = dc._Barrier(2)
+    # Ports reserved for new threads simulating recovery.
+    reserved_ports = [
+        'localhost:%s' % test_base.pick_unused_port()
+        for _ in range(num_workers)
+    ]
+    self._successful_thread_ends = 0
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        saving_filepath=saving_filepath,
+        reserved_ports=reserved_ports,
+        before_restart=True,
+        cv=cv,
+        new_chief=False)
+    if os.path.exists(saving_filepath):
+      os.remove(saving_filepath)
+    threads_to_join = []
+    if strategy.extended.experimental_between_graph:
+      # Only join the non-chief thread since the first thread for chief will
+      # eventually hang and be ignored.
+      threads_to_join = [threads['worker'][1]]
+    else:
+      threads_to_join = [threads['worker'][0]]
+    self.join_independent_workers(threads_to_join)
+    self.assertEqual(self._successful_thread_ends, 2)
+
+    def assert_all_elements_are_identical(list_to_check):
+      first_item = list_to_check[0]
+      for item in list_to_check[1:]:
+        self.assertAllClose(first_item, item, rtol=1e-5, atol=1e-5)
+
+    # Important: the results from preemption interrupted and non-interrupted
+    # cases should give the same final results.
+    assert_all_elements_are_identical(
+        [history['acc'][-1] for history in self._histories])
+    assert_all_elements_are_identical(
+        [history['loss'][-1] for history in self._histories])
+    # The length of `self._histories` would be num_workers (2) * num_runs (2).
+    self.assertLen(self._histories, 4)
+
   # The actual testing methods go here.
   test_chief_only_callback = generate_callback_test_function(
       callableForTestChiefOnlyCallback.__func__)
diff --git a/tensorflow/python/keras/distribute/multi_worker_optimizer_comparison_test.py b/tensorflow/python/keras/distribute/multi_worker_optimizer_comparison_test.py
new file mode 100644
index 0000000..27f0e1a
--- /dev/null
+++ b/tensorflow/python/keras/distribute/multi_worker_optimizer_comparison_test.py
@@ -0,0 +1,159 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests comparing Keras v1 and v2 optimizers in multi-worker training."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import threading
+
+from absl.testing import parameterized
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import collective_all_reduce_strategy as collective_strategy
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import multi_worker_test_base as test_base
+from tensorflow.python.keras.distribute import mnist_multi_worker
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent as gradient_descent_v1
+from tensorflow.python.training import rmsprop as rmsprop_v1
+
+
+# TODO(rchao): Move maybe_shard_dataset to shared util.
+maybe_shard_dataset = mnist_multi_worker.maybe_shard_dataset
+
+
+class KerasMultiWorkerOptimizerTest(test_base.IndependentWorkerTestBase,
+                                    parameterized.TestCase):
+
+  def run_optimizer_comparison_with_simple_bias_model(
+      self, strategy_cls, optimizer_class_1, optimizer_class_2):
+
+    def get_input_datasets():
+      # Simple training input.
+      train_input = [[1]] * 16
+      train_label = [[0]] * 16
+      ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label))
+      ds = maybe_shard_dataset(ds)
+      # TODO(rchao): Investigate to figure out the reason for having 8 workers
+      # instead of 2 as expected.
+      return ds.batch(8, drop_remainder=True)
+
+    def get_simple_bias_model():
+
+      class Bias(base_layer.Layer):
+
+        def build(self, input_shape):
+          self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+        def call(self, inputs):
+          return inputs + self.bias
+
+      model = sequential.Sequential()
+      model.add(Bias(input_shape=(1,)))
+
+      return model
+
+    self._lock = threading.Lock()
+    cluster_spec = test_base.create_cluster_spec(num_workers=2)
+    self._barrier = dc._Barrier(2)
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      """Simulates an Independent Worker inside a thread."""
+      # TODO(rchao): Refactor to abstract the common boilerplate out.
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+
+        model = get_simple_bias_model()
+
+        initial_weights = model.get_weights()
+
+        def _get_model_results(optimizer, initial_weights):
+
+          # Clear Keras session to reset device assignment
+          keras.backend._SESSION.session = None
+          strategy = strategy_cls()
+
+          with strategy.scope():
+            train_ds = get_input_datasets()
+            model = get_simple_bias_model()
+            model.set_weights(initial_weights)
+            model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])
+
+          return {
+              'trained_loss_and_accuracy':
+                  model.fit(x=train_ds, epochs=20).history,
+              'trained_weights':
+                  model.get_weights(),
+          }
+
+        results1 = _get_model_results(optimizer_class_1(0.01), initial_weights)
+        results2 = _get_model_results(optimizer_class_2(0.01), initial_weights)
+
+        for key in results1:
+          self.assertAllClose(
+              results1[key],
+              results2[key],
+              atol=1e-5,
+              rtol=1e-5,
+              msg='Fail to assert {}'.format(key))
+
+    threads = self.run_multiple_tasks_in_threads(_independent_worker_fn,
+                                                 cluster_spec)
+
+    threads_to_join = []
+    strategy = strategy_cls()
+    if strategy.extended.experimental_between_graph:
+      for ts in threads.values():
+        threads_to_join.extend(ts)
+    else:
+      threads_to_join = [threads['worker'][0]]
+    self.join_independent_workers(threads_to_join)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
+          required_gpus=[0, 1]))
+  def test_sgd_optimizer_v1_v2_comparison(self, strategy_cls):
+    self.run_optimizer_comparison_with_simple_bias_model(
+        strategy_cls, gradient_descent.SGD,
+        gradient_descent_v1.GradientDescentOptimizer)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
+          required_gpus=[0, 1]))
+  def test_rmsprop_optimizer_v1_v2_comparison(self, strategy_cls):
+    self.skipTest('There is an issue in collective ops (b/127700538) that '
+                  'prevent us from running this test with rmsprop optimizers.')
+    self.run_optimizer_comparison_with_simple_bias_model(
+        strategy_cls, rmsprop.RMSprop, rmsprop_v1.RMSPropOptimizer)
+
+
+if __name__ == '__main__':
+  with test.mock.patch.object(sys, 'exit', os._exit):
+    test.main()
diff --git a/tensorflow/python/keras/distribute/multi_worker_test.py b/tensorflow/python/keras/distribute/multi_worker_test.py
index 6abf231..411d02e 100644
--- a/tensorflow/python/keras/distribute/multi_worker_test.py
+++ b/tensorflow/python/keras/distribute/multi_worker_test.py
@@ -296,13 +296,6 @@
       cluster_spec=cluster_spec)
 
 
-# TODO(yuefengz): remove this function once
-# keras_multi_worker_optimizer_comparison_test no longer depends on it.
-def get_strategy_object(strategy_cls):
-  # CollectiveAllReduceStrategy and ParameterServerStrategy.
-  return strategy_cls()
-
-
 class KerasMultiWorkerTestStandaloneClient(test.TestCase,
                                            parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index c382816..01335e5 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -27,10 +27,12 @@
 
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python import autograph
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import values as distribute_values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import function
+from tensorflow.python.framework import auto_control_deps
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
@@ -66,6 +68,9 @@
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
+# Prefix that is added to the TF op layer names.
+_TF_OP_LAYER_NAME_PREFIX = 'tf_op_layer_'
+
 
 @keras_export('keras.layers.Layer')
 class Layer(module.Module):
@@ -148,14 +153,12 @@
         'activity_regularizer',
     }
     # Validate optional keyword arguments.
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
+    generic_utils.validate_kwargs(kwargs, allowed_kwargs)
 
     # Mutable properties
     # Indicates whether the layer's weights are updated during training
-    # and whether the layer's updates are run during training
-    self.trainable = trainable
+    # and whether the layer's updates are run during training.
+    self._trainable = trainable
     # A stateful layer is a layer whose updates are run during inference too,
     # for instance stateful RNNs.
     self.stateful = False
@@ -193,10 +196,6 @@
     self._metrics_tensors = {}
 
     self._set_dtype_and_policy(dtype)
-
-    self._call_fn_args = function_utils.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in self._call_fn_args or
-                                   hasattr(self, 'compute_mask'))
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Dependencies tracked via attribute assignment.
@@ -376,7 +375,7 @@
         initializer=initializer,
         dtype=dtype,
         constraint=constraint,
-        trainable=trainable and self.trainable,
+        trainable=trainable,
         partitioner=partitioner,
         use_resource=use_resource,
         collections=collections,
@@ -562,11 +561,8 @@
 
     # Handle Keras mask propagation from previous layer to current layer.
     previous_mask = None
-    if (not hasattr(self, '_compute_previous_mask') or
-        self._compute_previous_mask):
+    if self._should_compute_mask:
       previous_mask = base_layer_utils.collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = function_utils.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
           not generic_utils.is_all_none(previous_mask)):
         # The previous layer generated a mask, and mask was not explicitly
@@ -576,7 +572,7 @@
     # Clear eager losses on top level model call.
     # We are clearing the losses only on the top level model call and not on
     # every layer/mode call because layer/model may be reused.
-    if (context.executing_eagerly() and
+    if (base_layer_utils.is_in_eager_or_tf_function() and
         not base_layer_utils.is_in_call_context()):
       self._clear_losses()
 
@@ -588,7 +584,7 @@
         input_spec.assert_input_compatibility(self.input_spec, inputs,
                                               self.name)
         graph = backend.get_graph()
-        with graph.as_default(), ops.name_scope(self._name_scope()):
+        with graph.as_default(), backend.name_scope(self._name_scope()):
           # Build layer if applicable (if the `build` method has been
           # overridden).
           self._maybe_build(inputs)
@@ -598,11 +594,7 @@
           # autograph is strictly needed only for subclassed layers.
           if base_layer_utils.is_subclassed(self):
             decorators, original_func = tf_decorator.unwrap(self.call)
-            # TODO(psv): Remove optional_features param from the call here
-            # after b/129001876 is fixed.
-            converted_func = autograph.convert(
-                recursive=True, optional_features=None)(
-                    original_func)
+            converted_func = autograph.convert(recursive=True)(original_func)
             if decorators:
               call_fn = tf_decorator.rewrap(self.call, original_func,
                                             converted_func)
@@ -620,18 +612,25 @@
           if (self._expects_training_arg and
               not base_layer_utils.training_arg_passed_to_call(
                   tf_inspect.getfullargspec(self.call), args, kwargs) and
-              getattr(graph, 'name', None) == 'keras_graph'):
+              base_layer_utils.is_in_keras_graph()):
             learning_phase_passed_by_framework = True
             kwargs['training'] = backend.learning_phase()
           if not self.dynamic:
             try:
               with base_layer_utils.autocast_context_manager(
                   input_list,
-                  self._mixed_precision_policy.should_cast_variables), (
-                      base_layer_utils.AutoAddUpdates(self,
-                                                      inputs)) as auto_updater:
-                outputs = call_fn(inputs, *args, **kwargs)
-                auto_updater.set_outputs(outputs)
+                  self._mixed_precision_policy.should_cast_variables):
+                # Add auto_control_deps in V2 when they are not already added by
+                # a `tf.function`.
+                if (ops.executing_eagerly_outside_functions() and
+                    not base_layer_utils.is_in_eager_or_tf_function()):
+                  with auto_control_deps.AutomaticControlDependencies() as acd:
+                    outputs = call_fn(inputs, *args, **kwargs)
+                    # Wrap Tensors in `outputs` in `tf.identity` to avoid
+                    # circular dependencies.
+                    outputs = base_layer_utils.mark_as_return(outputs, acd)
+                else:
+                  outputs = call_fn(inputs, *args, **kwargs)
 
             except TypeError as e:
               exception_str = str(e)
@@ -672,7 +671,7 @@
             self._set_inputs(inputs, outputs)
       else:
         # Eager execution on data tensors.
-        with ops.name_scope(self._name_scope()):
+        with backend.name_scope(self._name_scope()):
           self._maybe_build(inputs)
           with base_layer_utils.autocast_context_manager(
               input_list, self._mixed_precision_policy.should_cast_variables):
@@ -703,6 +702,16 @@
     return self._dynamic
 
   @property
+  def trainable(self):
+    return self._trainable
+
+  @trainable.setter
+  def trainable(self, value):
+    self._trainable = value
+    for layer in getattr(self, '_layers', []):
+      layer.trainable = value
+
+  @property
   def activity_regularizer(self):
     """Optional regularizer function for the output of this layer."""
     return self._activity_regularizer
@@ -740,7 +749,27 @@
 
   @property
   def updates(self):
-    return self._get_unfiltered_updates(check_trainable=True)
+    if not self.trainable and not self.stateful:
+      return []
+    with backend.get_graph().as_default():
+      updates = []
+      for u in self._updates:
+        # Filter out updates created in a cross-replica context when in a
+        # replica context and vice versa.
+        if (getattr(u, '_in_cross_replica_context', False) !=
+            ds_context.in_cross_replica_context()):
+          continue
+        if callable(u):
+          try:
+            u = u()
+          except ValueError as e:
+            if 'Trying to capture a tensor from an inner function' in str(e):
+              base_layer_utils.check_graph_consistency(
+                  method='add_update', force_raise=True)
+            raise
+        base_layer_utils.check_graph_consistency(u, method='add_update')
+        updates.append(u)
+    return updates + self._gather_children_attribute('updates')
 
   @property
   def losses(self):
@@ -788,7 +817,7 @@
     class MyLayer(tf.keras.layers.Layer):
       def call(inputs, self):
         self.add_loss(tf.abs(tf.reduce_mean(inputs)), inputs=True)
-        return 2*inputs
+        return inputs
     ```
 
     This method can also be called directly on a Functional Model during
@@ -805,7 +834,7 @@
     model = tf.keras.Model(inputs, outputs)
     # Actvity regularization.
     model.add_loss(tf.abs(tf.reduce_mean(x)))
-    ````
+    ```
 
     If this is not the case for your loss (if, for example, your loss references
     a `Variable` of one of the model's layers), you can wrap your loss in a
@@ -844,6 +873,7 @@
         return None  # Will be filtered out when computing the .losses property
       if not tensor_util.is_tensor(loss):
         loss = ops.convert_to_tensor(loss, dtype=backend.floatx())
+      base_layer_utils.check_graph_consistency(loss, method='add_loss')
       loss._unconditional_loss = (inputs is None)  # pylint: disable=protected-access
       return loss
 
@@ -855,12 +885,17 @@
     for loss in losses:
       if callable(loss):
         callable_losses.append(functools.partial(_tag_unconditional, loss))
-      elif tf_utils.is_symbolic_tensor(loss):
+        continue
+      if loss is None:
+        continue
+      if not tensor_util.is_tensor(loss):
+        loss = ops.convert_to_tensor(loss, dtype=backend.floatx())
+      # TF Functions should take the eager path.
+      if (tf_utils.is_symbolic_tensor(loss) and
+          not base_layer_utils.is_in_tf_function()):
         symbolic_losses.append(_tag_unconditional(loss))
       elif tensor_util.is_tensor(loss):
         eager_losses.append(_tag_unconditional(loss))
-      elif loss is not None:  # `None` is valid but should be ignored.
-        raise ValueError('Found non-Tensor loss: ' + str(loss))
 
     self._callable_losses += callable_losses
 
@@ -1008,33 +1043,47 @@
     """
     updates = generic_utils.to_list(updates)
 
-    if context.executing_eagerly():
-      # Don't run callable updates if currently executing inside the `call`
-      # of a Layer/Model with `trainable=False`.
+    # All updates can be run immediately in Eager or in a tf.function.
+    if base_layer_utils.is_in_eager_or_tf_function():
       if not base_layer_utils.is_in_frozen_context():
         for update in updates:
           if callable(update):
             update()
-      return  # Updates already applied when in eager mode.
+      return
 
     def process_update(x):
+      """Standardize update ops.
+
+      Arguments:
+        x: Tensor, op, or callable.
+
+      Returns:
+        An update op.
+      """
       if callable(x):
-        x = x()
-      if isinstance(x, ops.Operation):
-        return x
+        update = lambda: process_update(x())
+        if not ops.executing_eagerly_outside_functions():
+          # In V1 mode, call the callable right away and process. This is needed
+          # for TPU strategy.
+          return update()
+      elif isinstance(x, ops.Operation):
+        update = x
       elif hasattr(x, 'op'):
-        return x.op
+        update = x.op
       else:
-        return ops.convert_to_tensor(x)
+        update = ops.convert_to_tensor(x)
+      update._unconditional_update = (inputs is None)
+      update._in_cross_replica_context = (
+          ds_context.has_strategy() and ds_context.in_cross_replica_context())
+      return update
 
     updates = [process_update(x) for x in updates]
+    # Non-callable Updates are run automatically inside `call` in V2, so
+    # they do not need to be tracked later.
+    if (ops.executing_eagerly_outside_functions() and
+        base_layer_utils.is_in_call_context()):
+      updates = [u for u in updates if callable(u)]
     self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
-    else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
 
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
@@ -1086,25 +1135,16 @@
 
     Returns:
       List of update ops of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
     """
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
-
     if inputs is None:
       # Requesting unconditional updates.
-      return [
-          x for x in self._get_unfiltered_updates() if x._unconditional_update  # pylint: disable=protected-access
-      ]
+      return [u for u in self.updates if u._unconditional_update]
 
     # Requesting input-conditional updates.
+    updates = [u for u in self.updates if not u._unconditional_update]
     inputs = nest.flatten(inputs)
-    reachable = tf_utils.get_reachable_from_inputs(
-        inputs, self._get_unfiltered_updates())
-    return [u for u in self._get_unfiltered_updates() if u in reachable]  # pylint: disable=protected-access
+    reachable = tf_utils.get_reachable_from_inputs(inputs, updates)
+    return [u for u in updates if u in reachable]
 
   def get_losses_for(self, inputs):
     """Retrieves losses relevant to a specific set of inputs.
@@ -1114,26 +1154,16 @@
 
     Returns:
       List of loss tensors of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
     """
     if inputs is None:
       # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+      return [l for l in self.losses if l._unconditional_loss]
 
     # Requesting input-conditional losses.
+    losses = [l for l in self.losses if not l._unconditional_loss]
     inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
+    reachable = tf_utils.get_reachable_from_inputs(inputs, losses)
+    return [l for l in losses if l in reachable]
 
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
@@ -1375,7 +1405,8 @@
     """
     if not self.built:
       if self.__class__.__name__ == 'Sequential':
-        self.build()  # pylint: disable=no-value-for-parameter
+        with tf_utils.maybe_init_scope(self):
+          self.build()  # pylint: disable=no-value-for-parameter
       else:
         raise ValueError('You tried to call `count_params` on ' + self.name +
                          ', but the layer isn\'t built. '
@@ -1533,6 +1564,7 @@
     self._metrics.append(metric_obj)
 
   def _symbolic_add_metric(self, value, aggregation=None, name=None):
+    base_layer_utils.check_graph_consistency(value, method='add_metric')
     match = self._get_existing_metric(name)
     if aggregation is None:
       # Iterate over the metrics and check if the given metric exists already.
@@ -1574,7 +1606,7 @@
 
     def _loss_for_variable(v):
       """Creates a regularization loss `Tensor` for variable `v`."""
-      with ops.name_scope(name + '/Regularizer'):
+      with backend.name_scope(name + '/Regularizer'):
         regularization = regularizer(v)
       return regularization
 
@@ -1590,7 +1622,7 @@
     # output, since it is output-specific.
     if self._activity_regularizer:
       output_list = nest.flatten(outputs)
-      with ops.name_scope('ActivityRegularizer'):
+      with backend.name_scope('ActivityRegularizer'):
         for output in output_list:
           activity_loss = self._activity_regularizer(output)
           batch_size = math_ops.cast(
@@ -1660,7 +1692,7 @@
     output_ls_copy = []
     for x in output_ls:
       if x in inputs_ls:
-        with ops.name_scope(self.name):
+        with backend.name_scope(self.name):
           x = array_ops.identity(x)
       output_ls_copy.append(x)
     outputs = nest.pack_sequence_as(outputs, output_ls_copy)
@@ -1839,7 +1871,11 @@
       input_shapes = nest.map_structure(lambda x: x.shape, inputs)
     # Only call `build` if the user has manually overridden the build method.
     if not hasattr(self.build, '_is_default'):
-      self.build(input_shapes)
+      # Any setup work performed only once should happen in an `init_scope`
+      # to avoid creating symbolic Tensors that will later pollute any eager
+      # operations.
+      with tf_utils.maybe_init_scope(self):
+        self.build(input_shapes)
     # We must set self.built since user defined build functions are not
     # constrained to set self.built.
     self.built = True
@@ -1926,7 +1962,13 @@
         getattr(self, '_is_graph_network', False) or
         # Exclude @property.setters from tracking
         hasattr(self.__class__, name)):
-      super(tracking.AutoTrackable, self).__setattr__(name, value)
+      try:
+        super(tracking.AutoTrackable, self).__setattr__(name, value)
+      except AttributeError:
+        raise AttributeError(
+            ('Can\'t set the attribute "{}", likely because it conflicts with '
+             'an existing read-only @property of the object. Please choose a '
+             'different name.').format(name))
       return
 
     # Keep track of trackable objects, for the needs of `Network.save_weights`.
@@ -2000,10 +2042,16 @@
   def _is_layer(self):
     return True
 
-  def _get_unfiltered_updates(self, check_trainable=True):
-    if check_trainable and not self.trainable and not self.stateful:
-      return []
-    return self._updates + self._gather_children_attribute('updates')
+  @property
+  def _call_fn_args(self):
+    if getattr(self, '_cached_call_fn_args', None) is None:
+      self._cached_call_fn_args = function_utils.fn_args(self.call)
+    return self._cached_call_fn_args
+
+  @property
+  def _should_compute_mask(self):
+    return ('mask' in self._call_fn_args or
+            getattr(self, 'compute_mask', None) is not None)
 
 
 class Node(object):
@@ -2163,7 +2211,7 @@
                trainable=True,
                dtype=None):
     super(TensorFlowOpLayer, self).__init__(
-        name=name, trainable=trainable, dtype=dtype)
+        name=_TF_OP_LAYER_NAME_PREFIX + name, trainable=trainable, dtype=dtype)
     self.node_def = node_def_pb2.NodeDef.FromString(node_def)
     self.constants = constants or {}
     # Layer uses original op unless it is called on new inputs.
@@ -2279,4 +2327,3 @@
 # Avoid breaking users who directly import this symbol from this file.
 # TODO(fchollet): remove this.
 InputSpec = input_spec.InputSpec  # pylint:disable=invalid-name
-
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index 5b83829..8083dc5 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -20,6 +20,7 @@
 
 import collections
 import itertools as it
+import os
 import sys
 import traceback
 from absl.testing import parameterized
@@ -37,12 +38,16 @@
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.layers import core as legacy_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.summary import summary_iterator
 
 
 class DynamicLayer(base_layer.Layer):
@@ -170,12 +175,67 @@
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
     self.assertEqual(model.outputs[0].shape.as_list(), [None, 3])
 
+  @keras_parameterized.run_all_keras_modes
+  def test_add_loss_correctness(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs, training=None):
+        self.add_loss(math_ops.reduce_sum(inputs))
+        return inputs
+
+    inputs = keras.Input((3,))
+    layer = MyLayer()
+    outputs = layer(inputs)
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(len(model.losses), 1)
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(loss, 2 * 3)
+
   @test_util.run_in_graph_and_eager_modes
   def test_invalid_forward_pass(self):
     inputs = keras.Input((3,))
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
       _ = InvalidLayer()(inputs)
 
+  def test_no_legacy_model(self):
+    inputs = keras.Input((1,))
+    legacy_dense_0 = legacy_core.Dense(1, name='legacy_dense_0')
+    legacy_dense_1 = legacy_core.Dense(1, name='legacy_dense_1')
+
+    layer = legacy_dense_0(inputs)
+    layer = keras.layers.Dense(1)(layer)
+    layer = legacy_dense_1(layer)
+
+    expected_regex = (r'The following are legacy tf\.layers\.Layers:\n  '
+                      '{}\n  {}'.format(legacy_dense_0, legacy_dense_1))
+
+    with self.assertRaisesRegexp(TypeError, expected_regex):
+      _ = keras.models.Model(inputs=[inputs], outputs=[layer])
+
+    model = keras.models.Model(inputs=[inputs], outputs=[inputs])
+    with self.assertRaisesRegexp(TypeError, expected_regex):
+      model._insert_layers([legacy_dense_0, legacy_dense_1])
+
+  def test_no_legacy_sequential(self):
+    layers = [
+        keras.layers.Dense(1),
+        legacy_core.Dense(1, name='legacy_dense_0')
+    ]
+
+    expected_regex = r'legacy tf\.layers\.Layers:\n  {}'.format(layers[1])
+    with self.assertRaisesRegexp(TypeError, expected_regex):
+      _ = keras.models.Sequential(layers)
+
+    with self.assertRaisesRegexp(TypeError, expected_regex):
+      _ = keras.models.Sequential([keras.layers.Input(shape=(4,))] + layers)
+
+    model = keras.models.Sequential()
+    with self.assertRaisesRegexp(TypeError, expected_regex):
+      for l in layers:
+        model.add(l)
+
   @keras_parameterized.run_with_all_model_types
   @test_util.run_in_graph_and_eager_modes
   def test_build_with_numpy_data(self):
@@ -323,6 +383,35 @@
     # Checks that variables get initialized.
     model.fit(x, y, batch_size=2, epochs=2)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_layer_names(self):
+    inputs = keras.layers.Input(shape=[2])
+    add1 = inputs + inputs
+    add2 = keras.layers.Add()([inputs, inputs])
+    add3 = inputs + inputs
+    add4 = keras.layers.Add()([inputs, inputs])
+    model = keras.models.Model(
+        inputs=[inputs], outputs=[add1, add2, add3, add4])
+    self.assertEqual(
+        [l.name for l in model.layers],
+        ['input_1', 'tf_op_layer_add', 'add', 'tf_op_layer_add_2', 'add_1'])
+
+  def test_add_trainable_weight_on_frozen_layer(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.w = self.add_weight(shape=(), trainable=True)
+
+      def call(self, inputs):
+        return self.w * inputs
+
+    layer = TestLayer()
+    layer.trainable = False
+    layer.build(None)
+    layer.trainable = True
+    self.assertListEqual(layer.trainable_weights, [layer.w])
+
 
 class SymbolicSupportTest(test.TestCase):
 
@@ -416,6 +505,37 @@
       function_name = last_entry[2]
       self.assertEqual(function_name, 'easily_identifiable_name')
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_summaries_in_tf_function(self):
+    if not context.executing_eagerly():
+      return
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        summary_ops_v2.scalar('mean', math_ops.reduce_mean(inputs))
+        return inputs
+
+    tmp_dir = self.get_temp_dir()
+    writer = summary_ops_v2.create_file_writer_v2(tmp_dir)
+    with writer.as_default(), summary_ops_v2.always_record_summaries():
+      my_layer = MyLayer()
+      x = array_ops.ones((10, 10))
+
+      def my_fn(x):
+        return my_layer(x)
+
+      _ = my_fn(x)
+
+    event_file = gfile.Glob(os.path.join(tmp_dir, 'events*'))
+    self.assertLen(event_file, 1)
+    event_file = event_file[0]
+    tags = set()
+    for e in summary_iterator.summary_iterator(event_file):
+      for val in e.summary.value:
+        tags.add(val.tag)
+    self.assertEqual(set(['my_layer/mean']), tags)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class NestedTrackingTest(test.TestCase):
@@ -566,13 +686,20 @@
     self.assertEqual(layer.kernel.name, 'MyName/kernel:0')
 
   def test_name_scope_sublayer(self):
+
+    class NameScopeTracker(keras.layers.Layer):
+
+      def call(self, inputs):
+        self.active_name_scope = ops.get_name_scope()
+        return inputs
+
     x = keras.backend.placeholder(shape=(10, 10))
-    layer = keras.layers.Dense(
-        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName2')
-    y = layer(x)
+    sublayer = NameScopeTracker(name='Sublayer')
+    layer = keras.layers.Dense(10, activation=sublayer, name='MyName2')
+    layer(x)
     self.assertEqual(layer.bias.name, 'MyName2/bias:0')
     self.assertEqual(layer.kernel.name, 'MyName2/kernel:0')
-    self.assertEqual(y.name, 'MyName2/MyAct/Relu:0')
+    self.assertEqual(sublayer.active_name_scope, 'MyName2/Sublayer')
 
   def test_name_scope_tf_tensor(self):
     x = ops.convert_to_tensor(np.ones((10, 10)))
@@ -583,6 +710,204 @@
     self.assertEqual(layer.kernel.name, 'MyName3/kernel:0')
 
 
+class AutographControlFlowTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_if_training_pattern_output(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs, training=None):
+        if training:
+          return inputs * 1.
+        return inputs * 0.
+
+    inputs = keras.Input((3,))
+    outputs = MyLayer()(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse', run_eagerly=eager)
+    train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(train_loss, 0.)
+    test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(test_loss, 1.)
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_if_training_pattern_loss(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs, training=None):
+        if training:
+          loss = math_ops.reduce_sum(inputs)
+        else:
+          loss = 0.
+        self.add_loss(loss)
+        return inputs
+
+    inputs = keras.Input((3,))
+    outputs = MyLayer()(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse', run_eagerly=eager)
+    train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(train_loss, 2 * 3)
+    test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(test_loss, 0)
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_if_training_pattern_metric(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs, training=None):
+        if training:
+          metric = math_ops.reduce_sum(inputs)
+        else:
+          metric = 0.
+        self.add_metric(metric, name='my_metric', aggregation='mean')
+        return inputs
+
+    inputs = keras.Input((3,))
+    outputs = MyLayer()(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse', run_eagerly=eager)
+    _, train_metric = model.train_on_batch(np.ones((2, 3)),
+                                           np.ones((2, 3)))
+    self.assertEqual(train_metric, 2 * 3)
+    _, test_metric = model.test_on_batch(np.ones((2, 3)),
+                                         np.ones((2, 3)))
+    self.assertEqual(test_metric, 0)
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_if_training_pattern_update(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.counter = self.add_weight(
+            shape=(), trainable=False, initializer='zeros')
+
+      def call(self, inputs, training=None):
+        if training:
+          increment = 1.
+        else:
+          increment = 0.
+        self.counter.assign_add(increment)
+        return inputs
+
+    inputs = keras.Input((3,))
+    layer = MyLayer()
+    outputs = layer(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse', run_eagerly=eager)
+    model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(keras.backend.get_value(layer.counter), 1.)
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_conditional_updates_in_call(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(MyLayer, self).__init__(dynamic=eager)
+
+      def build(self, input_shape):
+        self.counter = self.add_weight(
+            shape=(), trainable=False, initializer='zeros')
+
+      def call(self, inputs, training=None):
+        if training:
+          z = math_ops.reduce_sum(inputs)
+          self.add_update(lambda: self.counter.assign_add(z))
+        return inputs
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    if eager:
+      inputs = keras.Input((3,))
+      layer = MyLayer()
+      outputs = layer(inputs)
+      model = keras.Model(inputs, outputs)
+      model.compile('sgd', 'mse', run_eagerly=eager)
+      model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+      self.assertEqual(keras.backend.get_value(layer.counter), 6.)
+    else:
+      # TODO(fchollet): support the same workflow in graph mode.
+      with self.assertRaisesRegexp(RuntimeError,
+                                   '`add_update` in a control flow branch'):
+        layer = MyLayer()
+        layer(keras.Input((3,)))
+        _ = layer.updates
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_conditional_losses_in_call(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(MyLayer, self).__init__(dynamic=eager)
+
+      def call(self, inputs, training=None):
+        if training:
+          self.add_loss(math_ops.reduce_sum(inputs))
+        return inputs
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    if eager:
+      inputs = keras.Input((3,))
+      layer = MyLayer()
+      outputs = layer(inputs)
+      model = keras.Model(inputs, outputs)
+      model.compile('sgd', 'mse')
+      loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+      self.assertEqual(loss, 2 * 3)
+    else:
+      with self.assertRaisesRegexp(RuntimeError,
+                                   '`add_loss` in a control flow branch'):
+        layer = MyLayer()(keras.Input((3,)))
+
+  @parameterized.named_parameters(('eager', True),
+                                  ('symbolic', False))
+  def test_conditional_metrics_in_call(self, eager):
+
+    class MyLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(MyLayer, self).__init__(dynamic=eager)
+
+      def call(self, inputs, training=None):
+        if training:
+          self.add_metric(math_ops.reduce_sum(inputs),
+                          name='sum',
+                          aggregation='mean')
+        return inputs
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    if eager:
+      inputs = keras.Input((3,))
+      layer = MyLayer()
+      outputs = layer(inputs)
+      model = keras.Model(inputs, outputs)
+      model.compile('sgd', 'mse')
+      history = model.fit(np.ones((2, 3)), np.ones((2, 3)))
+      self.assertEqual(history.history['sum'][-1], 2 * 3)
+    else:
+      # TODO(fchollet): support the same workflow in graph mode.
+      with self.assertRaisesRegexp(RuntimeError,
+                                   '`add_metric` in a control flow branch'):
+        layer = MyLayer()(keras.Input((3,)))
+
+
 _LAYERS_TO_TEST = [
     (keras.layers.Dense, (1,), collections.OrderedDict(units=[1])),
     (keras.layers.Activation, (2, 2),
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index 61622bf..0b2d3f7 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -23,13 +23,12 @@
 
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
-from tensorflow.python.framework import auto_control_deps
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import control_flow_util_v2
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
@@ -350,6 +349,27 @@
   return getattr(_call_context, 'frozen', False)
 
 
+def is_in_keras_graph():
+  """Returns if currently executing inside of a Keras graph."""
+  # Returns True even if in a subgraph of the Keras graph, such as those
+  # created by control flow ops.
+  if context.executing_eagerly():
+    return False
+  return (getattr(backend.get_graph(), 'name', None) == 'keras_graph' or
+          getattr(_call_context, 'in_keras_graph', False))
+
+
+def is_in_eager_or_tf_function():
+  """Returns if in eager mode or inside of a tf.function."""
+  return context.executing_eagerly() or is_in_tf_function()
+
+
+def is_in_tf_function():
+  """Returns if inside of a tf.function."""
+  return (ops.executing_eagerly_outside_functions() and
+          not context.executing_eagerly() and not is_in_keras_graph())
+
+
 def uses_keras_history(tensors):
   """Check if at least one Tensor originates from a `keras.Input`.
 
@@ -412,7 +432,11 @@
   """Scope that marks when we are currently inside a Layer/Model's `call`."""
   was_in_call = is_in_call_context()
   was_frozen = is_in_frozen_context()
+  was_in_keras_graph = getattr(_call_context, 'in_keras_graph', False)
   _call_context.in_call = True
+  _call_context.in_keras_graph = (
+      was_in_keras_graph or
+      getattr(backend.get_graph(), 'name', None) == 'keras_graph')
   if not layer.trainable:
     _call_context.frozen = True
   try:
@@ -420,6 +444,7 @@
   finally:
     _call_context.in_call = was_in_call
     _call_context.frozen = was_frozen
+    _call_context.in_keras_graph = was_in_keras_graph
 
 
 def training_arg_passed_to_call(argspec, args, kwargs):
@@ -430,123 +455,6 @@
   return 'training' in full_args
 
 
-class AutoAddUpdates(object):
-  """Automatically track stateful ops with `add_update`.
-
-  This context manager is used to automatically add stateful ops to a Layer
-  or Model's `.updates`. This ensures that stateful ops are run in the Keras
-  training loop. It also allows for these stateful ops to be disabled by
-  setting `trainable=False`.
-
-  Example:
-
-  ```
-  with AutoAddUpdates(layer, inputs) as auto_updates:
-    outputs = layer.call(inputs)
-    auto_updates.set_outputs(outputs)
-  ```
-
-  Attributes:
-    layer: Layer or Model instance to add the updates to.
-    inputs: The inputs to this Layer or Model, to be used for input-conditional
-      updates.
-    outputs: The outputs of this Layer or Model.
-  """
-
-  def __init__(self, layer, inputs):
-    self.layer = layer
-    self.inputs = inputs
-    self.outputs = []
-
-  def set_outputs(self, outputs):
-    if self.outputs:
-      raise RuntimeError('`set_outputs` should only be called once on an'
-                         '`AutoAddUpdates` instance.')
-    self.outputs = outputs
-
-  def __enter__(self):
-    # Only run in V2 Function mode.
-    if (context.executing_eagerly() or
-        not ops.executing_eagerly_outside_functions()):
-      return self
-
-    self._graph = ops.get_default_graph()
-    self._num_operations = len(self._graph.get_operations())
-    return self
-
-  def __exit__(self, error_type, unused_value, unused_traceback):
-    if error_type:
-      # Allow errors that occurred inside this context manager to pass through
-      # normally.
-      return
-
-    # Only run in V2 Function mode.
-    if (context.executing_eagerly() or
-        not ops.executing_eagerly_outside_functions()):
-      return
-
-    if (self._graph is not ops.get_default_graph() or
-        self._graph.name != 'keras_graph'):
-      # Only auto-track updates when the Keras Graph is the only one used.
-      return
-
-    new_operations = self._graph.get_operations()[self._num_operations:]
-    new_stateful_ops = set()
-
-    # pylint: disable=protected-access
-    for op in new_operations:
-      # While loop is not supported in general for automatic control
-      # dependencies.
-      if control_flow_util.IsInWhileLoop(op):
-        continue
-
-      # Track stateful ops via `add_update`.
-      is_stateful_op = (
-          op.type not in self._graph._registered_ops or
-          auto_control_deps.op_is_stateful(
-              self._graph._registered_ops[op.type]))
-
-      # Ignore ReadVariableOps as they are not needed to be run separately.
-      # This ensures existing Layers don't get extra updates.
-      if is_stateful_op and op.type != 'ReadVariableOp':
-        new_stateful_ops.add(op)
-
-    explicit_updates = set([
-        u for u in self.layer._get_unfiltered_updates(check_trainable=False)
-        if not isinstance(u, tuple)
-    ])
-    # pylint: enable=protected-access
-
-    # Don't add updates that will already be run by virtue of being consumed by
-    # other stateful ops or by the Layer's outputs. This ensures that existing
-    # Layers like `BatchNormalization` continue to return the same values for
-    # `.update` calls.
-    minimum_ops = set()
-    targets = new_stateful_ops.union(
-        set(nest.flatten(self.outputs)), explicit_updates)
-    for op in new_stateful_ops:
-      # Scrub any ops that are consumed by the outputs or other stateful ops.
-      reachable = tf_utils.get_reachable_from_inputs(op)
-      if not (targets - {op}).intersection(reachable):
-        minimum_ops.add(op)
-    new_stateful_ops = minimum_ops
-
-    # Don't double-track updates added via explicitly calling `add_update`.
-    # Also don't double-track updates already tracked in sublayers.
-    new_stateful_ops = new_stateful_ops - explicit_updates
-
-    # Decide whether to track as input-conditional or unconditional.
-    input_reachable_ops = tf_utils.get_reachable_from_inputs(
-        self.inputs, targets=new_stateful_ops)
-    unconditional_updates = new_stateful_ops - input_reachable_ops
-    conditional_updates = new_stateful_ops - unconditional_updates
-
-    if unconditional_updates:
-      self.layer.add_update(list(unconditional_updates))
-    if conditional_updates:
-      self.layer.add_update(list(conditional_updates), inputs=self.inputs)
-
-
 def _get_var_read_dtype(input_list, should_cast):
   """Gets the dtype that AutoCastVariables should be read in."""
   if should_cast and input_list and input_list[0].dtype.is_floating:
@@ -578,3 +486,120 @@
 def is_subclassed(layer):
   return (layer.__module__.find('keras.engine') == -1 and
           layer.__module__.find('keras.layers') == -1)
+
+
+def check_graph_consistency(tensor=None, method='add_loss', force_raise=False):
+  """Checks that tensors passed to `add_*` methods match the Keras graph.
+
+  When one of the `add_*` methods is called inside V2 control flow (a `cond`
+  branch or a `while` loop cond/body), the tensor lives in a FuncGraph managed
+  by control_flow_v2. We need to raise clear error messages in such cases.
+
+  Arguments:
+    tensor: Tensor to check. Objects without a `graph` attribute (including
+      the default `None`) never trigger the error unless `force_raise` is set.
+    method: Caller method, one of {'add_metric', 'add_loss', 'add_update'}.
+    force_raise: If an error should be raised regardless of `tensor`.
+
+  Raises:
+    RuntimeError: If `force_raise` is set or `tensor` is out-of-graph.
+  """
+  if (force_raise or (ops.executing_eagerly_outside_functions() and
+                      hasattr(tensor, 'graph') and
+                      isinstance(tensor.graph,
+                                 (control_flow_util_v2.CondBranchFuncGraph,
+                                  control_flow_util_v2.WhileCondFuncGraph,
+                                  control_flow_util_v2.WhileBodyFuncGraph)))):
+    if method == 'add_metric':
+      bad_example = """
+      def call(self, inputs, training=None):
+        if training:
+          metric = compute_metric(inputs)
+          self.add_metric(metric, name='my_metric', aggregation='mean')
+        return inputs
+      """
+      correct_example = """
+      def call(self, inputs, training=None):
+        if training:
+          metric = compute_metric(inputs)
+        else:
+          metric = 0.
+        self.add_metric(metric, name='my_metric', aggregation='mean')
+        return inputs
+      """
+    elif method == 'add_loss':
+      bad_example = """
+      def call(self, inputs, training=None):
+        if training:
+          loss = compute_loss(inputs)
+          self.add_loss(loss)
+        return inputs
+      """
+      correct_example = """
+      def call(self, inputs, training=None):
+        if training:
+          loss = compute_loss(inputs)
+        else:
+          loss = 0.
+        self.add_loss(loss)
+        return inputs
+      """
+    else:
+      bad_example = """
+      def call(self, inputs, training=None):
+        if training:
+          self.add_update(self.w.assign_add(1))
+        return inputs
+      """
+      correct_example = """
+      def call(self, inputs, training=None):
+        if training:
+          increment = 1
+        else:
+          increment = 0
+        self.add_update(self.w.assign_add(increment))
+        return inputs
+      """
+    raise RuntimeError(
+        'You are using the method `{method}` in a control flow branch '
+        'in your layer, e.g.:\n{bad_example}\n'
+        'This is not currently supported. '
+        'You should either use static control flow (`tf.cond`) '
+        'or move your call to {method} out of the control flow branch, '
+        'e.g.:\n{correct_example}\n'
+        'You can also resolve this by marking your layer '
+        'as dynamic (eager-only) by passing '
+        '`dynamic=True` to the layer constructor. '
+        'Any kind of control flow is supported with dynamic layers. '
+        'Note that using `dynamic=True` requires you '
+        'to implement static shape inference '
+        'in the `compute_output_shape(input_shape)` method.'.format(
+            method=method,
+            bad_example=bad_example,
+            correct_example=correct_example))
+
+
+def mark_as_return(outputs, acd):
+  """Marks tensors in `outputs` as return values via `acd.mark_as_return`."""
+
+  def _mark_as_return(tensor):
+    """Marks `tensor` as the return value for automatic control deps."""
+    if not tensor_util.is_tensor(tensor):
+      return tensor
+
+    # pylint: disable=protected-access
+    return_tensor = acd.mark_as_return(tensor)
+    if getattr(tensor, '_keras_mask', None) is not None:
+      return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask)
+    else:
+      return_tensor._keras_mask = None
+
+    # Handle TensorFlow Probability attached metadata.
+    # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`.
+    if getattr(tensor, '_tfp_distribution', None) is not None:
+      return_tensor._tfp_distribution = tensor._tfp_distribution
+
+    return return_tensor
+    # pylint: enable=protected-access
+
+  return nest.map_structure(_mark_as_return, outputs)
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
index b0d1157..f456f71 100644
--- a/tensorflow/python/keras/engine/input_spec.py
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -91,10 +91,9 @@
   """
   if not input_spec:
     return
-  if not isinstance(input_spec, (list, tuple)):
-    input_spec = nest.flatten(input_spec)
 
   inputs = nest.flatten(inputs)
+  input_spec = nest.flatten(input_spec)
   if len(inputs) != len(input_spec):
     raise ValueError('Layer ' + layer_name + ' expects ' +
                      str(len(input_spec)) + ' inputs, '
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 91c5bc0..1b9ac7d 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -35,6 +35,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import saving
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
@@ -129,6 +130,18 @@
     def call(self, inputs):
       return self.layer1(inputs)
   ```
+
+  Allowed args in `super().__init__`:
+    name: String name of the model.
+    dynamic: (Subclassed models only) Set this to `True` if your model should
+      only be run eagerly, and should not be used to generate a static
+      computation graph. This attribute is automatically set for Functional API
+      models.
+    trainable: Boolean, whether the model's variables should be trainable.
+    dtype: (Subclassed models only) Default dtype of the model's weights
+      (default of `None` means use the type of the first input). This attribute
+      has no effect on Functional API models, which do not have weights of their
+      own.
   """
 
   # See tf.Module for the usage of this property.
@@ -150,6 +163,8 @@
       # Subclassed network
       self._init_subclassed_network(**kwargs)
 
+    tf_utils.assert_no_legacy_layers(self.layers)
+
   # Several Network methods have "no_automatic_dependency_tracking"
   # annotations. Since Network does automatic dependency tracking on attribute
   # assignment, including for common data structures such as lists, by default
@@ -164,7 +179,7 @@
   # checkpoints, but may cause "all Python objects matched" assertions to fail
   # (in which case less strict assertions may be substituted if necessary).
   @trackable.no_automatic_dependency_tracking
-  def _base_init(self, name=None):
+  def _base_init(self, name=None, **kwargs):
     # The following are implemented as property functions:
     # self.trainable_weights
     # self.non_trainable_weights
@@ -172,14 +187,18 @@
     # self.losses
     # self.updates
 
+    generic_utils.validate_kwargs(kwargs, {'trainable', 'dtype', 'dynamic'})
+
     self._init_set_name(name, zero_based=True)
     self._activity_regularizer = None
     # This acts just like the `trainable` attribute of any layer instance.
-    # It does not affect users of the underlying layers, only users of the
-    # Network instance.
-    self.trainable = True
+    self._trainable = kwargs.get('trainable', True)
+    # This attribute has no effect if the model is created using the Functional
+    # API. Instead, `model.dynamic` is determined based on the internal layers.
+    self._dynamic = kwargs.get('dynamic', False)
     self._is_compiled = False
     self._expects_training_arg = False
+    self._layers = []
 
     # This is True for Sequential networks and Functional networks.
     self._compute_output_and_mask_jointly = False
@@ -208,7 +227,7 @@
     else:
       self._graph = ops.get_default_graph()  # Used in symbolic mode only.
       # A Network does not create weights of its own, thus has no dtype.
-    self._dtype = None
+    self._dtype = kwargs.get('dtype', None)
 
     # All layers in order of horizontal graph traversal.
     # Entries are unique. Includes input and output layers.
@@ -228,7 +247,11 @@
     self._mixed_precision_policy = policy.Policy('infer')
 
   @trackable.no_automatic_dependency_tracking
-  def _init_graph_network(self, inputs, outputs, name=None):
+  def _init_graph_network(self, inputs, outputs, name=None, **kwargs):
+    generic_utils.validate_kwargs(
+        kwargs, {'trainable'},
+        'Functional models may only specify `name` and `trainable` keyword '
+        'arguments during initialization. Got an unexpected argument:')
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Normalize and set self.inputs, self.outputs.
@@ -244,18 +267,14 @@
     if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs):
       base_layer_utils.create_keras_history(self._nested_outputs)
 
-    self._base_init(name=name)
+    self._base_init(name=name, **kwargs)
     self._validate_graph_inputs_and_outputs()
 
-    self._compute_previous_mask = (
-        'mask' in tf_inspect.getfullargspec(self.call).args or
-        hasattr(self, 'compute_mask'))
     # A Network does not create weights of its own, thus it is already
     # built.
     self.built = True
     self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
-    self._dynamic = False
     # `_expects_training_arg` is True since the `training` argument is always
     # present in the signature of the `call` method of a graph network.
     self._expects_training_arg = True
@@ -313,8 +332,8 @@
         output_tensors=self._nested_outputs)
 
     # Build self.input_names and self.output_names.
+    self._set_output_names()
     self.input_names = []
-    self.output_names = []
     self._feed_input_names = []
     self._feed_inputs = []
     self._feed_input_shapes = []
@@ -324,14 +343,30 @@
         self._feed_input_names.append(layer.name)
         self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
         self._feed_inputs.append(layer.input)
+
+  def _set_output_names(self):
+    """Assigns unique names to the Network's outputs.
+
+    Output layers with multiple output tensors would otherwise lead to duplicate
+    names in `self.output_names`; repeated names get a `_<count>` suffix.
+    """
+    uniquified = []
+    output_names = set()
+    prefix_count = {}
     for layer in self._output_layers:
-      self.output_names.append(layer.name)
+      proposal = layer.name
+      while proposal in output_names:
+        existing_count = prefix_count.get(layer.name, 1)
+        proposal = '{}_{}'.format(layer.name, existing_count)
+        prefix_count[layer.name] = existing_count + 1
+      output_names.add(proposal)
+      uniquified.append(proposal)
+    self.output_names = uniquified
 
   @trackable.no_automatic_dependency_tracking
-  def _init_subclassed_network(self, name=None, dynamic=False):
-    self._base_init(name=name)
+  def _init_subclassed_network(self, name=None, **kwargs):
+    self._base_init(name=name, **kwargs)
     self._is_graph_network = False
-    self._dynamic = dynamic
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
@@ -391,14 +426,20 @@
     """Add Trackable dependencies on a list of Layers."""
     weight_layer_index = 0
     for layer_index, layer in enumerate(layers):
-      if layer.weights:
-        # Keep a separate index for layers which have weights. This allows users
-        # to insert Layers without weights anywhere in the network without
-        # breaking checkpoints.
-        self._track_trackable(
-            layer, name='layer_with_weights-%d' % weight_layer_index,
-            overwrite=True)
-        weight_layer_index += 1
+      try:
+        if layer.weights:
+          # Keep a separate index for layers which have weights. This allows
+          # users to insert Layers without weights anywhere in the network
+          # without breaking checkpoints.
+          self._track_trackable(
+              layer, name='layer_with_weights-%d' % weight_layer_index,
+              overwrite=True)
+          weight_layer_index += 1
+      except ValueError:
+        # The layer might have weights, but may not be built yet. We just treat
+        # it as layer without weight.
+        pass
+
       # Even if it doesn't have weights, we should still track everything in
       # case it has/will have Trackable dependencies.
       self._track_trackable(
@@ -464,12 +505,17 @@
     Returns:
       A list of variables.
     """
+    self._assert_weights_created()
     weights = []
     for layer in self._layers:
       weights += layer.weights
     weights += (self._trainable_weights + self._non_trainable_weights)
     return weights
 
+  @property
+  def _should_compute_mask(self):  # Falsy for non-graph networks.
+    return self._is_graph_network and super(Network, self)._should_compute_mask
+
   def compute_mask(self, inputs, mask):
     if not self._is_graph_network:
       return None
@@ -518,38 +564,6 @@
         return layer
     raise ValueError('No such layer: ' + name)
 
-  def _get_unfiltered_updates(self, check_trainable=True):
-    if check_trainable and not self.trainable and not self.stateful:
-      return []
-    updates = []
-    for layer in self.layers:
-      updates += layer._get_unfiltered_updates(check_trainable=check_trainable)
-    updates += list(self._updates)
-    return updates
-
-  @property
-  def _unfiltered_losses(self):
-    losses = []
-
-    # If any eager losses are present, we assume the model to be part of an
-    # eager training loop (either a custom one or the one used when
-    # `run_eagerly=True`), and so we always return just the eager losses in that
-    # case.
-    if self._eager_losses:
-      losses.extend(self._eager_losses)
-    else:
-      losses.extend(self._losses)
-    for regularizer in self._callable_losses:
-      loss_tensor = regularizer()
-      if loss_tensor is not None:
-        losses.append(loss_tensor)
-    for layer in self.layers:
-      if isinstance(layer, Network):
-        losses += layer._unfiltered_losses
-      else:
-        losses += layer.losses
-    return losses
-
   @trackable.no_automatic_dependency_tracking
   def _clear_losses(self):
     """Used every step in eager to reset losses."""
@@ -558,134 +572,8 @@
       layer._clear_losses()
 
   @property
-  def updates(self):
-    """Retrieves the network's updates.
-
-    Will only include updates that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include updates that were created by layers of this model
-    outside of the model).
-
-    When the network has no registered inputs, all updates are returned.
-
-    Effectively, `network.updates` behaves like `layer.updates`.
-
-    Concrete example:
-
-    ```python
-      bn = keras.layers.BatchNormalization()
-      x1 = keras.layers.Input(shape=(10,))
-      _ = bn(x1)  # This creates 2 updates.
-
-      x2 = keras.layers.Input(shape=(10,))
-      y2 = bn(x2)  # This creates 2 more updates.
-
-      # The BN layer has now 4 updates.
-      self.assertEqual(len(bn.updates), 4)
-
-      # Let's create a model from x2 to y2.
-      model = keras.models.Model(x2, y2)
-
-      # The model does not list all updates from its underlying layers,
-      # but only the updates that are relevant to it. Updates created by layers
-      # outside of the model are discarded.
-      self.assertEqual(len(model.updates), 2)
-
-      # If you keep calling the model, you append to its updates, just like
-      # what happens for a layer.
-      x3 = keras.layers.Input(shape=(10,))
-      y3 = model(x3)
-      self.assertEqual(len(model.updates), 4)
-
-      # But if you call the inner BN layer independently, you don't affect
-      # the model's updates.
-      x4 = keras.layers.Input(shape=(10,))
-      _ = bn(x4)
-      self.assertEqual(len(model.updates), 4)
-    ```
-
-    Returns:
-        A list of update ops.
-    """
-
-    updates = self._get_unfiltered_updates(check_trainable=True)
-
-    # `updates` might contain irrelevant updates, so it needs to be filtered
-    # with respect to inputs the model has been called on.
-    relevant_inputs = []
-    for i in range(0, len(self._inbound_nodes)):
-      inputs = self.get_input_at(i)
-      if isinstance(inputs, list):
-        relevant_inputs += inputs
-      else:
-        relevant_inputs.append(inputs)
-    if not relevant_inputs:
-      return list(set(updates))
-
-    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
-    relevant_conditional_updates = [x for x in updates if x in reachable]
-    unconditional_updates = [
-        x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
-    # A layer could be used multiple times in a nested structure,
-    # so the updates list must be de-duped.
-    return list(set(relevant_conditional_updates + unconditional_updates))
-
-  @property
-  def losses(self):
-    """Retrieves the network's losses.
-
-    Will only include losses that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include losses that depend on tensors
-    that aren't inputs to this model).
-
-    When the network has no registered inputs, all losses are returned.
-
-    Returns:
-        A list of loss tensors.
-    """
-    losses = self._unfiltered_losses
-
-    if context.executing_eagerly():
-      return losses
-
-    # TODO(kaftan/fchollet): Clean this up / make it obsolete.
-    # This is a super ugly, confusing check necessary to
-    # handle the case where we are executing in a function graph in eager mode
-    # but the model was constructed symbolically in a separate graph scope.
-    # We need to capture the losses created in the current graph function,
-    # and filter out the incorrect loss tensors created when symbolically
-    # building the graph.
-    # We have to use this check because the code after it that checks
-    # for reachable inputs only captures the part of the model that was
-    # built symbolically, and captures the wrong tensors from a different
-    # func graph (causing a crash later on when trying to execute the
-    # graph function)
-    if ops.executing_eagerly_outside_functions():
-      return [
-          loss for loss in losses
-          if getattr(loss, 'graph', None) == ops.get_default_graph()
-      ]
-
-    relevant_inputs = []
-    for i in range(0, len(self._inbound_nodes)):
-      inputs = self.get_input_at(i)
-      if isinstance(inputs, list):
-        relevant_inputs += inputs
-      else:
-        relevant_inputs.append(inputs)
-    if not relevant_inputs:
-      return losses
-
-    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, losses)
-    relevant_conditional_losses = [x for x in losses if x in reachable]
-    unconditional_losses = [
-        x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
-    return list(set(
-        relevant_conditional_losses + unconditional_losses + self._losses))
-
-  @property
   def trainable_weights(self):
+    self._assert_weights_created()
     return trackable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
@@ -693,6 +581,7 @@
 
   @property
   def non_trainable_weights(self):
+    self._assert_weights_created()
     return trackable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
@@ -993,8 +882,9 @@
           if 'training' in argspec:
             kwargs.setdefault('training', training)
           if 'mask' in argspec:
-            computed_masks = nest.map_structure(lambda t: t._keras_mask,
-                                                computed_tensors)
+            computed_masks = nest.map_structure(
+                lambda t: getattr(t, '_keras_mask', None),
+                computed_tensors)
             kwargs.setdefault('mask', computed_masks)
 
           # Compute outputs.
@@ -1344,6 +1234,23 @@
     constructor. See the documentation of `tf.train.Checkpoint` and
     `tf.keras.Model` for details.
 
+    While the formats are the same, do not mix `save_weights` and
+    `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be
+    loaded using `Model.load_weights`. Checkpoints saved using
+    `tf.train.Checkpoint.save` should be restored using the corresponding
+    `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over
+    `save_weights` for training checkpoints.
+
+    The TensorFlow format matches objects and variables by starting at a root
+    object, `self` for `save_weights`, and greedily matching attribute
+    names. For `Model.save` this is the `Model`, and for `Checkpoint.save` this
+    is the `Checkpoint` even if the `Checkpoint` has a model attached. This
+    means saving a `tf.keras.Model` using `save_weights` and loading into a
+    `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match
+    the `Model`'s variables. See the [guide to training
+    checkpoints](https://www.tensorflow.org/alpha/guide/checkpoints) for details
+    on the TensorFlow format.
+
     Arguments:
         filepath: String, path to the file to save the weights to. When saving
             in TensorFlow format, this is the prefix used for checkpoint files
@@ -1360,6 +1267,7 @@
             format.
         ValueError: For invalid/unknown format arguments.
     """
+    self._assert_weights_created()
     filepath_is_h5 = _is_hdf5_filepath(filepath)
     if save_format is None:
       if filepath_is_h5:
@@ -1398,6 +1306,11 @@
     if save_format == 'h5':
       with h5py.File(filepath, 'w') as f:
         saving.save_weights_to_hdf5_group(f, self.layers)
+        # TODO(rchao): Save this attribute in a decoupled checkpoint file
+        # that is solely for the purpose of fault tolerance.
+        if self._ckpt_saved_epoch is not None:
+          f.attrs[callbacks.CKPT_SAVED_EPOCH] = str(
+              self._ckpt_saved_epoch).encode('utf8')
     else:
       if context.executing_eagerly():
         session = None
@@ -1493,9 +1406,16 @@
           'Unable to load weights saved in HDF5 format into a subclassed '
           'Model which has not created its variables yet. Call the Model '
           'first, then load the weights.')
+    self._assert_weights_created()
     with h5py.File(filepath, 'r') as f:
       if 'layer_names' not in f.attrs and 'model_weights' in f:
         f = f['model_weights']
+      # TODO(rchao): Load this attribute from a decoupled metadata+checkpoint
+      # file that is solely for the purpose of fault tolerance. Decide if we
+      # should use TF or HDF5 format for the metadata.
+      if callbacks.CKPT_SAVED_EPOCH in f.attrs:
+        self._ckpt_saved_epoch = f.attrs[callbacks.CKPT_SAVED_EPOCH].decode(
+            'utf8')
       if by_name:
         saving.load_weights_from_hdf5_group_by_name(f, self.layers)
       else:
@@ -1665,6 +1585,7 @@
       ValueError: If the layers depend on `Input`s not found in this Model.
     """
     layers = nest.flatten(layers)
+    tf_utils.assert_no_legacy_layers(layers)
     node_to_depth = {}
     for depth, nodes in self._nodes_by_depth.items():
       node_to_depth.update({node: depth for node in nodes})
@@ -1723,6 +1644,32 @@
       self._layers.append(layer)
       self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call)
 
+  def _assert_weights_created(self):
+    """Asserts that all the weights for the network have been created.
+
+    For a non-dynamic network, the weights must already be created after the
+    layer has been called. For a dynamic network, the exact list of weights can
+    never be known for certain since it may change at any time during execution.
+
+    We run this check right before accessing weights or getting the Numpy value
+    for the current weights. Otherwise, if the layer has never been called,
+    the user would just get an empty list, which is misleading.
+
+    Raises:
+      ValueError: if the weights of the network have not yet been created.
+    """
+    if self.dynamic:
+      return
+    if (not self._is_graph_network and
+        'build' in self.__class__.__dict__ and
+        not self.built):
+      # Covers sequential and subclassed models whose own class defines
+      # `build()` (inherited overrides are not detected) but are not built yet.
+      raise ValueError('Weights for model %s have not yet been created. '
+                       'Weights are created when the Model is first called on '
+                       'inputs or `build()` is called with an `input_shape`.' %
+                       self.name)
+
 
 def _is_hdf5_filepath(filepath):
   return (filepath.endswith('.h5') or filepath.endswith('.keras') or
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/network_test.py
similarity index 92%
rename from tensorflow/python/keras/engine/topology_test.py
rename to tensorflow/python/keras/engine/network_test.py
index 4729b53..d6d4ef6 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/network_test.py
@@ -30,6 +30,7 @@
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer as input_layer_lib
 from tensorflow.python.keras.engine import network as network_lib
+from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -41,7 +42,7 @@
   yaml = None
 
 
-class TopologyConstructionTest(keras_parameterized.TestCase):
+class NetworkConstructionTest(keras_parameterized.TestCase):
 
   @test_util.run_deprecated_v1
   def test_get_updates(self):
@@ -84,27 +85,27 @@
     self.assertEqual(len(layer.get_updates_for(None)), 1)
 
     network = network_lib.Network(x2, y2)
-    self.assertEqual(len(network.updates), 2)
+    self.assertEqual(len(network.updates), 3)
     self.assertEqual(len(network.get_updates_for(x2)), 1)
     self.assertEqual(len(network.get_updates_for(None)), 1)
 
     x3 = input_layer_lib.Input(shape=(1,))
     _ = layer.apply(x3)
-    self.assertEqual(len(network.updates), 2)
+    self.assertEqual(len(network.updates), 4)
 
     x4 = input_layer_lib.Input(shape=(1,))
     _ = network(x4)
-    self.assertEqual(len(network.updates), 3)
+    self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x2)), 1)
     self.assertEqual(len(network.get_updates_for(x4)), 1)
     self.assertEqual(len(network.get_updates_for(None)), 1)
 
     network.add_update(state_ops.assign_add(layer.a, [[1]]))
-    self.assertEqual(len(network.updates), 4)
+    self.assertEqual(len(network.updates), 6)
     self.assertEqual(len(network.get_updates_for(None)), 2)
 
     network.add_update(state_ops.assign_add(layer.b, x4), inputs=True)
-    self.assertEqual(len(network.updates), 5)
+    self.assertEqual(len(network.updates), 7)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -156,18 +157,18 @@
     self.assertEqual(len(layer.get_losses_for(None)), 1)
 
     network = network_lib.Network(x2, y2)
-    self.assertEqual(len(network.losses), 2)
-    self.assertEqual(len(network.get_losses_for(x1)), 0)
+    self.assertEqual(len(network.losses), 3)
+    self.assertEqual(len(network.get_losses_for(x1)), 1)
     self.assertEqual(len(network.get_losses_for(x2)), 1)
     self.assertEqual(len(network.get_losses_for(None)), 1)
 
     x3 = input_layer_lib.Input(shape=(1,))
     _ = layer.apply(x3)
-    self.assertEqual(len(network.losses), 2)
+    self.assertEqual(len(network.losses), 4)
 
     x4 = input_layer_lib.Input(shape=(1,))
     _ = network(x4)
-    self.assertEqual(len(network.losses), 3)
+    self.assertEqual(len(network.losses), 5)
     self.assertEqual(len(network.get_losses_for(x2)), 1)
     self.assertEqual(len(network.get_losses_for(x4)), 1)
     self.assertEqual(len(network.get_losses_for(None)), 1)
@@ -493,6 +494,19 @@
       fn_outputs = fn([input_a_np, input_b_np])
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
 
+  def test_multi_output_layer_output_names(self):
+    """A layer returning two tensors yields outputs named 'out' and 'out_1'."""
+    class _MultiOutput(keras.layers.Layer):
+
+      def call(self, x):
+        return x + 1., x + 2.
+
+    source = keras.layers.Input(
+        name='inp', shape=(None,), dtype=dtypes.float32)
+    model = keras.models.Model(source, _MultiOutput(name='out')(source))
+    self.assertEqual(['out', 'out_1'], model.output_names)
+    self.assertAllClose([2., 3.], model(1.))
+
   @test_util.run_deprecated_v1
   def test_recursion(self):
     with self.cached_session():
@@ -927,6 +941,33 @@
     inputs_with_batch = keras.Input(batch_size=20, shape=5)
     self.assertEqual([20, 5], inputs_with_batch.shape.as_list())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_model_initialization(self):
+    """Checks which constructor kwargs functional/subclassed models accept."""
+    # Functional model
+    inputs = input_layer_lib.Input(shape=(32,))
+    outputs = keras.layers.Dense(4)(inputs)
+    # `dtype` and `dynamic` are each rejected for a functional model.
+    for rejected_kwarg in ({'dtype': 'int64'}, {'dynamic': False}):
+      with self.assertRaisesRegexp(TypeError, 'unexpected argument'):
+        training.Model(inputs, outputs, name='m', trainable=False,
+                       **rejected_kwarg)
+
+    functional = training.Model(inputs, outputs, name='m', trainable=False)
+    self.assertEqual('m', functional.name)
+    self.assertFalse(functional.trainable)
+    self.assertFalse(functional.dynamic)
+
+    # Subclassed model
+    subclassed = training.Model(name='subclassed', trainable=True,
+                                dtype='int64', dynamic=True)
+    self.assertEqual('subclassed', subclassed.name)
+    self.assertTrue(subclassed.dynamic)
+    self.assertTrue(subclassed.trainable)
+    w = subclassed.add_weight(
+        'w', [], initializer=keras.initializers.Constant(1))
+    self.assertEqual(dtypes.int64, w.dtype)
+
 
 class DeferredModeTest(test.TestCase):
 
@@ -1378,5 +1419,67 @@
     self.assertAllClose(model.get_weights(), model2.get_weights())
 
 
+@keras_parameterized.run_all_keras_modes
+class WeightAccessTest(keras_parameterized.TestCase):
+  """Tests when `model.weights` is readable and when reading it raises."""
+
+  def test_functional_model(self):
+    inputs = keras.Input((10,))
+    hidden = keras.layers.Dense(10)(inputs)
+    hidden = keras.layers.Dense(10)(hidden)
+    model = keras.Model(inputs, keras.layers.Dense(1)(hidden))
+    self.assertEqual(len(model.weights), 6)
+
+  def test_sequential_model_with_input_shape(self):
+    # The first layer carries an `input_shape`.
+    model = keras.models.Sequential([
+        keras.layers.Dense(10, input_shape=(10,)),
+        keras.layers.Dense(10),
+        keras.layers.Dense(1),
+    ])
+    self.assertEqual(len(model.weights), 6)
+
+  def test_sequential_model_without_input_shape(self):
+    # No layer carries an `input_shape`.
+    model = keras.models.Sequential([
+        keras.layers.Dense(10),
+        keras.layers.Dense(10),
+        keras.layers.Dense(1),
+    ])
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for model .* have not yet been created'):
+      _ = model.weights
+
+  def test_subclass_model_with_build_method(self):
+    class SubclassModel(keras.models.Model):
+      # The only weight is created in `build()`.
+      def build(self, input_shape):
+        self.w = self.add_weight(shape=input_shape[-1], initializer='ones')
+
+      def call(self, inputs):
+        return inputs * self.w
+
+    model = SubclassModel()
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for model .* have not yet been created'):
+      _ = model.weights
+
+    model(keras.Input((10,)))
+    self.assertEqual(len(model.weights), 1)
+
+  def test_subclass_model_without_build_method(self):
+    class SubclassModel(keras.models.Model):
+      # The only weight is created in `__init__()`.
+      def __init__(self):
+        super(SubclassModel, self).__init__()
+        self.w = self.add_weight(shape=(), initializer='ones')
+
+      def call(self, inputs):
+        return inputs * self.w
+
+    model = SubclassModel()
+    self.assertEqual(len(model.weights), 1)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 4090e13..2ddcb93 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -27,6 +27,7 @@
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import layer_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
@@ -104,6 +105,7 @@
 
     # Add to the model any layers passed to the constructor.
     if layers:
+      tf_utils.assert_no_legacy_layers(layers)
       for layer in layers:
         self.add(layer)
 
@@ -150,6 +152,9 @@
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
+
+    tf_utils.assert_no_legacy_layers([layer])
+
     self.built = False
     set_inputs = False
     if not self._layers:
@@ -348,9 +353,6 @@
       model.add(layer)
     if not model.inputs and build_input_shape:
       model.build(build_input_shape)
-    if not model._is_graph_network:
-      # Still needs to be built when passed input data.
-      model.built = False
     return model
 
   @property
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index abc5735..30f28b4 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -112,7 +112,9 @@
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
-    self.assertEqual(len(model.weights), 0)
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for model .* have not yet been created'):
+      len(model.weights)
     self.assertFalse(model.built)
 
     x = np.random.random((batch_size, input_dim))
@@ -137,7 +139,9 @@
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
-    self.assertEqual(len(model.weights), 0)
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for model .* have not yet been created'):
+      len(model.weights)
     self.assertFalse(model.built)
 
     x = array_ops.ones((num_samples, input_dim))
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 0296c40..78a9102 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -27,6 +27,7 @@
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
+from tensorflow.python.eager import monitoring
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -53,6 +54,9 @@
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
+_keras_api_gauge = monitoring.BoolGauge('/tensorflow/api/keras',
+                                        'keras api usage', 'method')
+
 
 @keras_export('keras.models.Model', 'keras.Model')
 class Model(network.Network):
@@ -129,7 +133,11 @@
     # under distribution strategy scope.
     self._compile_distribution = False
 
-    self.run_eagerly = None
+    self._run_eagerly = None
+
+    # The epoch at which the checkpoint is saved. Used for fault-tolerance.
+    # See `_maybe_load_initial_epoch_from_ckpt()` for more information.
+    self._ckpt_saved_epoch = None
 
   def get_weights(self):
     """Retrieves the weights of the model.
@@ -215,10 +223,8 @@
         ValueError: In case of invalid arguments for
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
-    run_eagerly = kwargs.pop('run_eagerly', None)
-
-    self._run_eagerly = run_eagerly
-    optimizer = optimizers.get(optimizer)
+    _keras_api_gauge.get_cell('compile').set(True)
+    self._run_eagerly = kwargs.pop('run_eagerly', None)
 
     if distribute is not None:
       if tf2.enabled():
@@ -239,51 +245,39 @@
           self._distribution_strategy = (
               distribution_strategy_context.get_strategy())
 
-    # Validate that arguments passed by the user to `compile` are supported by
-    # DistributionStrategy.
-    if self._distribution_strategy:
-      if sample_weight_mode:
-        raise NotImplementedError('sample_weight_mode is not supported with '
-                                  'DistributionStrategy.')
-      if weighted_metrics:
-        raise NotImplementedError('weighted_metrics is not supported with '
-                                  'DistributionStrategy.')
-      if target_tensors:
-        raise ValueError('target_tensors is not supported with '
-                         'DistributionStrategy.')
+    # Check whether the experimental feature of distributing the Model without
+    # cloning is requested.
+    # TODO(b/124517980, b/124377929): Remove this temporary undocumented way
+    # of enabling the feature and graduate it to the main distributed code path.
+    self._cloning = kwargs.pop('cloning', False)
 
-      if run_eagerly:
-        raise ValueError(
-            'We currently do not support enabling `run_eagerly` with '
-            'distribution strategy.')
-
-      if not self.built or not self.inputs or not self.outputs:
-        raise ValueError(
-            'We currently do not support distribution strategy with a '
-            '`Sequential` model that is created without `input_shape`/'
-            '`input_dim` set in its first layer or a subclassed model.')
-
-    loss = loss or {}
-
-    self.optimizer = optimizer
+    self._validate_compile_param_for_distribution_strategy(self.run_eagerly,
+                                                           sample_weight_mode,
+                                                           target_tensors,
+                                                           weighted_metrics)
+    self.optimizer = optimizers.get(optimizer)
     # We've disabled automatic dependency tracking for this method, but do want
     # to add a checkpoint dependency on the optimizer if it's trackable.
     if isinstance(self.optimizer, trackable.Trackable):
       self._track_trackable(
           self.optimizer, name='optimizer', overwrite=True)
-    self.loss = loss
-    self._compile_metrics = metrics or []
+    self.loss = loss or {}
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
+    self._compile_metrics = metrics or []
     self._compile_weighted_metrics = weighted_metrics
     if self.run_eagerly and target_tensors is not None:
       raise ValueError(
           'target_tensors argument is not supported when '
           'running a model eagerly.')
-    self.target_tensors = target_tensors
 
-    # Set DistributionStrategy specific parameters.
+    # _training_targets contains a list of _TrainingTarget objects, which hold
+    # all feedable and non-feedable targets of the model and related metadata.
+    self._training_targets = []
+
+    # Set tf.distribute.Strategy specific parameters.
     self._distributed_model_cache = {}
+    self._distributed_function_cache = {}
 
     if (not context.executing_eagerly() and
         self._distribution_strategy is not None):
@@ -301,12 +295,7 @@
 
     # Prepare list of loss functions, same size of model outputs.
     self.loss_functions = training_utils.prepare_loss_functions(
-        loss, self.output_names)
-
-    self._feed_outputs = []
-    self._feed_output_names = []
-    self._feed_output_shapes = []
-    self._feed_loss_fns = []
+        self.loss, self.output_names)
 
     skip_target_indices = self._prepare_skip_target_indices()
     self._skip_target_weighing_indices = skip_target_indices[:]
@@ -317,87 +306,44 @@
 
     # Initialization for Eager mode execution.
     if self.run_eagerly:
-      if isinstance(optimizer, loss_scale_optimizer.LossScaleOptimizer):
-        # TODO(reedwm): Support this.
-        raise ValueError('We currently do not support enabling `run_eagerly` '
-                         'with a LossScaleOptimizer.')
-      # Prepare sample weights.
-      self._set_sample_weight_attributes(sample_weight_mode)
-      # Save all metric attributes per output of the model.
-      self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-      if target_tensors is not None:
-        raise ValueError('target_tensors are not currently supported in Eager '
-                         'mode.')
-      self.total_loss = None
-
-      # Set metric attributes on model.
-      self._set_metric_attributes(skip_target_indices=skip_target_indices)
-
-      self.targets = []
-      for i in range(len(self.outputs)):
-        self._feed_output_names.append(self.output_names[i])
-      self._collected_trainable_weights = self.trainable_weights
+      self._compile_eagerly(metrics, sample_weight_mode, skip_target_indices,
+                            target_tensors, weighted_metrics)
       return
 
     with K.get_graph().as_default():
       # Prepare targets of model.
-      self.targets = []
-      self._feed_targets = []
-      if target_tensors not in (None, []):
-        if isinstance(target_tensors, list):
-          if len(target_tensors) != len(self.outputs):
-            raise ValueError(
-                'When passing a list as `target_tensors`, '
-                'it should have one entry per model output. '
-                'The model has %s outputs, but you passed target_tensors=%s' %
-                (len(self.outputs), target_tensors))
-        elif isinstance(target_tensors, dict):
-          for name in target_tensors:
-            if name not in self.output_names:
-              raise ValueError(
-                  'Unknown entry in `target_tensors` '
-                  'dictionary: "' + name + '". '
-                  'Only expected the following keys: ' + str(self.output_names))
-          tmp_target_tensors = []
-          for name in self.output_names:
-            tmp_target_tensors.append(target_tensors.get(name, None))
-          target_tensors = tmp_target_tensors
-        elif tensor_util.is_tensor(target_tensors):
-          target_tensors = [target_tensors]
-        else:
-          raise TypeError('Expected `target_tensors` to be a list or tuple or '
-                          'dict or a single tensor, but got:', target_tensors)
+      target_tensors = self._process_target_tensor_for_compile(target_tensors)
 
       for i in range(len(self.outputs)):
         if i in skip_target_indices:
-          self.targets.append(None)
+          self._training_targets.append(_TrainingTarget(None))
         else:
-          shape = K.int_shape(self.outputs[i])
+          target = target_tensors[i]
           name = self.output_names[i]
-          if target_tensors not in (None, []):
-            target = target_tensors[i]
-          else:
-            target = None
-          if target is None or K.is_placeholder(target):
-            if target is None:
-              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
-                  self.loss_functions[i],
-                  K.dtype(self.outputs[i]))
+          shape = K.int_shape(self.outputs[i])
+          loss_fn = self.loss_functions[i]
 
-              target = K.placeholder(
-                  ndim=len(shape),
-                  name=name + '_target',
-                  sparse=K.is_sparse(self.outputs[i]),
-                  dtype=target_dtype)
-            self._feed_targets.append(target)
-            self._feed_outputs.append(self.outputs[i])
-            self._feed_output_names.append(name)
-            self._feed_output_shapes.append(shape)
-            self._feed_loss_fns.append(self.loss_functions[i])
-          else:
+          if target is not None and not K.is_placeholder(target):
             self._skip_target_weighing_indices.append(i)
-          self.targets.append(target)
+            feedable = False
+          else:
+            feedable = True
+
+          if target is None:
+            target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
+                loss_fn,
+                K.dtype(self.outputs[i]))
+
+            target = K.placeholder(
+                ndim=len(shape),
+                name=name + '_target',
+                sparse=K.is_sparse(self.outputs[i]),
+                dtype=target_dtype)
+
+          training_target = _TrainingTarget(
+              target, name=name, shape=shape, feedable=feedable,
+              loss_fn=loss_fn)
+          self._training_targets.append(training_target)
 
       # Save all metric attributes per output of the model.
       self._cache_output_metric_attributes(metrics, weighted_metrics)
@@ -409,9 +355,14 @@
       self._handle_metrics(
           self.outputs,
           masks=self._prepare_output_masks(),
-          targets=self.targets,
+          targets=self._targets,
           skip_target_indices=skip_target_indices)
 
+      # Prepare sample weight modes. List with the same length as model outputs.
+      self._sample_weight_modes = training_utils.prepare_sample_weight_modes(
+          self.output_names, sample_weight_mode,
+          self._skip_target_weighing_indices)
+
       # Creates the model loss and weighted metrics sub-graphs.
       self._compile_weights_loss_and_weighted_metrics()
 
@@ -425,8 +376,7 @@
       self.predict_function = None
 
       # Collected trainable weights, sorted in topological order.
-      trainable_weights = self.trainable_weights
-      self._collected_trainable_weights = trainable_weights
+      self._collected_trainable_weights = self.trainable_weights
 
       # Validate all variables were correctly created in distribution scope.
       if self._distribution_strategy and not self._compile_distribution:
@@ -663,6 +613,7 @@
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
+    _keras_api_gauge.get_cell('train').set(True)
     # Legacy support
     if 'nb_epoch' in kwargs:
       logging.warning(
@@ -804,8 +755,13 @@
         split_at = int(len(x[0]) * (1. - validation_split))
       x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
       y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
-      sample_weights, val_sample_weights = (slice_arrays(
-          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
+      if sample_weights:
+        sample_weights, val_sample_weights = (
+            slice_arrays(sample_weights, 0, split_at),
+            slice_arrays(sample_weights, split_at),
+        )
+      else:
+        val_sample_weights = None
     else:
       if validation_steps:
         raise ValueError('`validation_steps` should not be specified if '
@@ -935,6 +891,7 @@
     Raises:
         ValueError: in case of invalid arguments.
     """
+    _keras_api_gauge.get_cell('evaluate').set(True)
     # Case 1: distribution strategy.
     if self._distribution_strategy:
       if K.in_multi_worker_mode():
@@ -1040,7 +997,7 @@
     Computation is done in batches.
 
     Arguments:
-         x: Input samples. It could be:
+        x: Input samples. It could be:
           - A Numpy array (or array-like), or a list of arrays
             (in case the model has multiple inputs).
           - A TensorFlow tensor, or a list of tensors
@@ -1087,6 +1044,7 @@
             or in case a stateful model receives a number of samples
             that is not a multiple of the batch size.
     """
+    _keras_api_gauge.get_cell('predict').set(True)
     # Case 1: distribution strategy.
     if self._distribution_strategy:
       return training_distributed.predict_distributed(self,
@@ -1205,15 +1163,23 @@
     Raises:
       ValueError: In case of invalid user-provided arguments.
     """
-    if self._distribution_strategy:
+    # If at this point we are in the replica context, then it is okay to execute
+    # the Eager code path.  The expected way to get here is to call `fit` that
+    # calls `train_on_batch` on each replica.
+    if (self._distribution_strategy and
+        distribution_strategy_context.in_cross_replica_context()):
       raise NotImplementedError('`train_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
+                                'distributed with tf.distribute.Strategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, class_weight=class_weight,
         extract_tensors_from_dataset=True)
 
-    if self.run_eagerly:
+    # If `self._distribution_strategy` is True, then we are in a replica context
+    # at this point because of the check above.  `train_on_batch` is being run
+    # for each replica by `self._distribution_strategy` and the same code path
+    # as Eager is expected to be taken.
+    if self.run_eagerly or self._distribution_strategy:
       outputs = training_eager.train_on_batch(
           self,
           x,
@@ -1227,6 +1193,7 @@
       if not isinstance(K.symbolic_learning_phase(), int):
         ins += [True]  # Add learning phase value.
 
+      self._update_sample_weight_modes(sample_weights=sample_weights)
       self._make_train_function()
       outputs = self.train_function(ins)  # pylint: disable=not-callable
 
@@ -1276,14 +1243,17 @@
     Raises:
         ValueError: In case of invalid user-provided arguments.
     """
-    if self._distribution_strategy:
+    if (self._distribution_strategy and
+        distribution_strategy_context.in_cross_replica_context()):
       raise NotImplementedError('`test_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
+                                'distributed with tf.distribute.Strategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True)
 
-    if self.run_eagerly:
+    # If `self._distribution_strategy` is True, then we are in a replica context
+    # at this point.
+    if self.run_eagerly or self._distribution_strategy:
       outputs = training_eager.test_on_batch(
           self,
           x,
@@ -1294,6 +1264,7 @@
       x = training_utils.ModelInputs(x).as_list()
       inputs = x + (y or []) + (sample_weights or [])
 
+      self._update_sample_weight_modes(sample_weights=sample_weights)
       self._make_test_function()
       outputs = self.test_function(inputs)  # pylint: disable=not-callable
 
@@ -1322,13 +1293,17 @@
         ValueError: In case of mismatch between given number of inputs and
           expectations of the model.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`predict_on_batch` is not supported for '
-                                'models compiled with DistributionStrategy.')
+    if (self._distribution_strategy and
+        distribution_strategy_context.in_cross_replica_context()):
+      raise NotImplementedError(
+          '`predict_on_batch` is not supported for models distributed with'
+          ' tf.distribute.Strategy.')
     # Validate and standardize user data.
     inputs, _, _ = self._standardize_user_data(
         x, extract_tensors_from_dataset=True)
-    if self.run_eagerly:
+    # If `self._distribution_strategy` is True, then we are in a replica context
+    # at this point.
+    if self.run_eagerly or self._distribution_strategy:
       inputs = training_utils.cast_if_floating_dtype(inputs)
       if isinstance(inputs, collections.Sequence):
         # Unwrap lists with only one input, as we do when training on batch
@@ -1459,7 +1434,8 @@
     """
     if self._distribution_strategy:
       raise NotImplementedError('`fit_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
+                                'models compiled with tf.distribute.Strategy.')
+    _keras_api_gauge.get_cell('train').set(True)
     return training_generator.fit_generator(
         self,
         generator,
@@ -1531,7 +1507,8 @@
     """
     if self._distribution_strategy:
       raise NotImplementedError('`evaluate_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
+                                'models compiled with tf.distribute.Strategy.')
+    _keras_api_gauge.get_cell('evaluate').set(True)
     return training_generator.evaluate_generator(
         self,
         generator,
@@ -1587,7 +1564,8 @@
     """
     if self._distribution_strategy:
       raise NotImplementedError('`predict_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
+                                'models compiled with tf.distribute.Strategy.')
+    _keras_api_gauge.get_cell('predict').set(True)
     return training_generator.predict_generator(
         self,
         generator,
@@ -1598,13 +1576,147 @@
         verbose=verbose,
         callbacks=callbacks)
 
+  def _validate_compile_param_for_distribution_strategy(
+      self, run_eagerly, sample_weight_mode, target_tensors, weighted_metrics):
+    """Rejects `compile` arguments unsupported by tf.distribute.Strategy."""
+    if not self._distribution_strategy:
+      # Nothing to validate when no distribution strategy is set.
+      return
+    if sample_weight_mode:
+      raise NotImplementedError('sample_weight_mode is not supported with '
+                                'tf.distribute.Strategy.')
+    if weighted_metrics:
+      raise NotImplementedError('weighted_metrics is not supported with '
+                                'tf.distribute.Strategy.')
+    if target_tensors:
+      raise ValueError('target_tensors is not supported with '
+                       'tf.distribute.Strategy.')
+    if run_eagerly:
+      raise ValueError(
+          'We currently do not support enabling `run_eagerly` with '
+          'distribution strategy.')
+
+    cloning = distributed_training_utils.is_distributing_by_cloning(self)
+    if cloning and not (self.built and self.inputs and self.outputs):
+      raise ValueError(
+          'We currently do not support distribution strategy with a '
+          '`Sequential` model that is created without `input_shape`/'
+          '`input_dim` set in its first layer or a subclassed model.')
+
+  def _process_target_tensor_for_compile(self, target_tensors):
+    """Returns `target_tensors` as a list with one entry per model output.
+
+    Accepts None/empty (one None per output, so downstream code can skip the
+    None check), a list or tuple, a dict keyed by output name, or one tensor.
+    Raises ValueError on a length mismatch or unknown dict key, and TypeError
+    on an unsupported type.
+    """
+    if target_tensors is None or (
+        isinstance(target_tensors, (list, tuple)) and not target_tensors):
+      return [None for _ in self.output_names]
+    if isinstance(target_tensors, (list, tuple)):
+      if len(target_tensors) != len(self.outputs):
+        raise ValueError(
+            'When passing a list as `target_tensors`, '
+            'it should have one entry per model output. '
+            'The model has %s outputs, but you passed target_tensors=%s' %
+            (len(self.outputs), target_tensors))
+      return list(target_tensors)
+    if isinstance(target_tensors, dict):
+      unexpected_target_tensor_names = set(target_tensors.keys()).difference(
+          self.output_names)
+      if unexpected_target_tensor_names:
+        raise ValueError(
+            'Unknown entry in `target_tensors` dictionary: "{name}". '
+            'Only expected the following keys: {keys}'.format(
+                name=unexpected_target_tensor_names,
+                keys=str(self.output_names)))
+      return [target_tensors.get(name, None) for name in self.output_names]
+    if tensor_util.is_tensor(target_tensors):
+      return [target_tensors]
+    raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                    'dict or a single tensor, but got: %s' % (target_tensors,))
+
+  def _compile_eagerly(self, metrics, sample_weight_mode,
+                       skip_target_indices, target_tensors, weighted_metrics):
+    """Sets up sample weights, metrics and targets for eager execution."""
+    if isinstance(self.optimizer, loss_scale_optimizer.LossScaleOptimizer):
+      # TODO(reedwm): Support this.
+      raise ValueError('We currently do not support enabling `run_eagerly` '
+                       'with a LossScaleOptimizer.')
+    # Prepare sample weight modes (one per model output), then sample weights.
+    self._sample_weight_modes = training_utils.prepare_sample_weight_modes(
+        self.output_names, sample_weight_mode,
+        self._skip_target_weighing_indices)
+    self._prepare_sample_weights()
+    # Save all metric attributes per output of the model.
+    self._cache_output_metric_attributes(metrics, weighted_metrics)
+    if target_tensors is not None:
+      raise ValueError('target_tensors are not currently supported in Eager '
+                       'mode.')
+    self.total_loss = None
+    # Set metric attributes on model, then add one target entry per output.
+    self._set_metric_attributes(skip_target_indices=skip_target_indices)
+    for i in range(len(self.outputs)):
+      self._training_targets.append(
+          _TrainingTarget(None, self.output_names[i], None, True, None))
+    self._collected_trainable_weights = self.trainable_weights
+
+  def _update_sample_weight_modes(self, sample_weights=None):
+    """Reconciles `_sample_weight_modes` with the given sample weights.
+
+    Returns immediately if the model has no (or empty) `_sample_weight_modes`.
+    Otherwise, for each output index:
+      * a 'temporal' mode is left untouched;
+      * a None mode becomes 'samplewise' if a sample weight is given;
+      * a 'samplewise' mode is reset to None if no sample weight is given.
+
+    Args:
+      sample_weights: List of sample weights of the same length as model outputs
+        or None.
+    """
+    # `modes` aliases the list stored on the model, so the item assignments
+    # below update `self._sample_weight_modes` in place.
+    modes = getattr(self, '_sample_weight_modes', [])
+    if not modes:
+      return
+    for index, mode in enumerate(modes):
+      weight = sample_weights[index] if sample_weights else None
+      if mode == 'temporal':
+        # A 'temporal' mode is never overridden here, whatever sample weights
+        # were passed in.
+        continue
+      if mode is None and weight is not None:
+        # Weights were passed for an output without a mode: weight it per
+        # sample.
+        modes[index] = 'samplewise'
+      elif mode == 'samplewise' and weight is None:
+        # No weights were passed for an output in 'samplewise' mode: clear
+        # the mode.
+        modes[index] = None
+
+  def _recompile_weights_loss_and_weighted_metrics(self):
+    """Rebuilds the weight-dependent sub-graphs if the modes went stale.
+
+    Returns True if any output's sample weight mode and sample weight entry
+    disagree on being None (in which case the sub-graphs are recompiled).
+    """
+    stale = any(
+        (mode is None) != (self.sample_weights[i] is None)
+        for i, mode in enumerate(self._sample_weight_modes))
+    if not stale:
+      return False
+    self._compile_weights_loss_and_weighted_metrics()
+    return True
+
+  @trackable.no_automatic_dependency_tracking
   def _compile_weights_loss_and_weighted_metrics(self):
     """Compiles the model loss and weighted metric sub-graphs."""
 
     with K.get_graph().as_default():
 
       # Prepare sample weights.
-      self._set_sample_weight_attributes(self.sample_weight_mode)
+      self._prepare_sample_weights()
 
       masks = self._prepare_output_masks()
       skip_target_indices = self._prepare_skip_target_indices()
@@ -1613,9 +1725,10 @@
       self._handle_metrics(
           self.outputs,
           masks=masks,
-          targets=self.targets,
+          targets=self._targets,
           skip_target_indices=skip_target_indices,
-          sample_weights=self.sample_weights)
+          sample_weights=self.sample_weights,
+          return_weighted_metrics=True)
 
       # Compute total loss.
       # Used to keep track of the total loss value (stateless).
@@ -1661,7 +1774,7 @@
     skip_target_indices = skip_target_indices or []
     total_loss = None
     with K.name_scope('loss'):
-      zipped_inputs = zip(self.targets, self.outputs, self.loss_functions,
+      zipped_inputs = zip(self._targets, self.outputs, self.loss_functions,
                           self.sample_weights, masks, self.loss_weights_list)
       for i, (y_true, y_pred, loss_fn, sample_weight, mask,
               loss_weight) in enumerate(zipped_inputs):
@@ -1681,23 +1794,23 @@
                       mask, None, sample_weight))
               sample_weight *= mask
 
-          # Reset reduction on the loss so that we can get the per sample loss
-          # value. We use this to get both the stateless and stateful loss
-          # values without having to compute the underlying loss function
-          # twice.
-          weighted_losses = None
           if hasattr(loss_fn, 'reduction'):
-            current_loss_reduction = loss_fn.reduction
-            loss_fn.reduction = losses_utils.ReductionV2.NONE
-            weighted_losses = loss_fn(
-                y_true, y_pred, sample_weight=sample_weight)
-            loss_fn.reduction = current_loss_reduction
+            per_sample_losses = loss_fn.call(y_true, y_pred)
+            weighted_losses = losses_utils.compute_weighted_loss(
+                per_sample_losses,
+                sample_weight=sample_weight,
+                reduction=losses_utils.ReductionV2.NONE)
+            loss_reduction = loss_fn.reduction
+
+            # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
+            # compile use cases.
+            if loss_reduction == losses_utils.ReductionV2.AUTO:
+              loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
 
             # Compute the stateless loss value.
             output_loss = losses_utils.reduce_weighted_loss(
-                weighted_losses, reduction=current_loss_reduction)
-            if (current_loss_reduction ==
-                losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE):
+                weighted_losses, reduction=loss_reduction)
+            if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
               output_loss = losses_utils.scale_loss_for_distribution(
                   output_loss)
           else:
@@ -1711,19 +1824,13 @@
             output_loss = losses_utils.scale_loss_for_distribution(output_loss)
 
         if len(self.outputs) > 1:
-          # Keep track of stateful result tensor and function for the loss.
-          # Compute the stateful loss value.
-          if weighted_losses is not None:
-            # TODO(b/120571621): Directly call metric when the bug is fixed.
-            aggregated_output_loss = (
-                distributed_training_utils.call_replica_local_fn(
-                    self._output_loss_metrics[i],
-                    weighted_losses,
-                    strategy=self._distribution_strategy))
-          else:
-            # Custom loss class.
-            aggregated_output_loss = self._call_metric_fn(
-                self._output_loss_metrics[i], y_true, y_pred, sample_weight)
+          # Keep track of stateful result tensor for the loss.
+          # TODO(b/120571621): Directly call metric when the bug is fixed.
+          aggregated_output_loss = (
+              distributed_training_utils.call_replica_local_fn(
+                  self._output_loss_metrics[i],
+                  output_loss,
+                  strategy=self._distribution_strategy))
           self._compile_metrics_tensors[loss_name] = aggregated_output_loss
 
         if total_loss is None:
@@ -1850,26 +1957,38 @@
     return batch_size
 
   def _list_functions_for_serialization(self):
-    return {
-        '_default_save_signature': saving_utils.trace_model_call(self)
-    }
+    """If available, saves a trace of call using self.inputs."""
+    all_functions = super(Model, self)._list_functions_for_serialization()
+    try:
+      # pylint:disable=pointless-statement
+      self.inputs
+      self.input_names
+      # pylint:enable=pointless-statement
+    except AttributeError:
+      # If the model does not have inputs set, because it was not called or its
+      # input shapes were not recorded, we won't have a signature so can't trace
+      # a function. But the user may still save an object with this Model
+      # attached; we won't fail the whole tf.saved_model.save.
+      pass
+    else:
+      if '_default_save_signature' not in all_functions:
+        all_functions['_default_save_signature'] = (
+            saving_utils.trace_model_call(self))
+    return all_functions
 
-  def _set_sample_weight_attributes(self, sample_weight_mode):
-    """Sets sample weight related attributes on the model."""
-    sample_weights, sample_weight_modes = training_utils.prepare_sample_weights(
-        self.output_names, sample_weight_mode,
-        self._skip_target_weighing_indices)
-    self.sample_weights = sample_weights
-    self.sample_weight_modes = sample_weight_modes
-    self._feed_sample_weight_modes = [
-        sample_weight_modes[i]
-        for i in range(len(self.outputs))
-        if i not in self._skip_target_weighing_indices
-    ]
+  def _prepare_sample_weights(self):
+    """Sets sample weight attribute on the model."""
+    # List with the same length as model outputs.
+    self.sample_weights = []
+    for i, name in enumerate(self.output_names):
+      self.sample_weights.append(
+          training_utils.get_output_sample_weight(
+              self._skip_target_weighing_indices, self._sample_weight_modes[i],
+              name, i))
+
+    # Filtering just the placeholders from the above list.
     self._feed_sample_weights = [
-        sample_weights[i]
-        for i in range(len(sample_weights))
-        if i not in self._skip_target_weighing_indices
+        s for s in self.sample_weights if s is not None
     ]
 
   def _cache_output_metric_attributes(self, metrics, weighted_metrics):
@@ -1991,12 +2110,12 @@
           self._set_per_output_metric_attributes(
               self._per_output_weighted_metrics[i], i))
 
-    # Create a metric wrapper for each output loss.
+    # Create a metric wrapper for each output loss. This computes mean of an
+    # output loss across mini-batches (irrespective of how we reduce within a
+    # batch).
     if len(self.outputs) > 1:
       self._output_loss_metrics = [
-          metrics_module.SumOverBatchSize() if hasattr(loss_fn, 'reduction')
-          else metrics_module.SumOverBatchSizeMetricWrapper(loss_fn)
-          for loss_fn in self.loss_functions
+          metrics_module.Mean() for _ in self.loss_functions
       ]
 
     self._per_output_metrics = updated_per_output_metrics
@@ -2049,6 +2168,7 @@
                       targets=None,
                       sample_weights=None,
                       masks=None,
+                      return_weighted_metrics=False,
                       return_weighted_and_unweighted_metrics=False):
     """Handles calling metric functions.
 
@@ -2058,10 +2178,13 @@
       targets: List of targets.
       sample_weights: Optional list of sample weight arrays.
       masks: List of computed output mask values.
+      return_weighted_metrics: Flag that indicates whether weighted metrics
+        should be computed instead of unweighted metrics. This flag is ignored
+        when `return_weighted_and_unweighted_metrics` is enabled.
       return_weighted_and_unweighted_metrics: Flag that is used to indicate
         whether both weighted and unweighted metrics should be computed. When
-        this is not enabled, we use `sample_weights` param to indicate whether
-        weighted or unweighted metrics should be returned.
+        this is not enabled, we use `return_weighted_metrics` param to
+        indicate whether weighted or unweighted metrics should be returned.
 
     Returns:
       A list of metric result tensors.
@@ -2077,18 +2200,19 @@
         target = targets[i] if targets else None
         output_mask = masks[i] if masks else None
 
-        if return_weighted_and_unweighted_metrics or sample_weights is None:
+        if (return_weighted_and_unweighted_metrics or
+            not return_weighted_metrics):
           metric_results.extend(
               self._handle_per_output_metrics(self._per_output_metrics[i],
                                               target, output, output_mask))
-        if return_weighted_and_unweighted_metrics or sample_weights is not None:
+        if return_weighted_and_unweighted_metrics or return_weighted_metrics:
           metric_results.extend(
               self._handle_per_output_metrics(
                   self._per_output_weighted_metrics[i],
                   target,
                   output,
                   output_mask,
-                  weights=sample_weights[i]))
+                  weights=sample_weights[i] if sample_weights else None))
     return metric_results
 
   def _check_trainable_weights_consistency(self):
@@ -2110,13 +2234,17 @@
           ' without calling `model.compile` after ?', 1)
 
   def _make_train_function(self):
+    has_recompiled = self._recompile_weights_loss_and_weighted_metrics()
     metrics_tensors = [
         self._all_metrics_tensors[m] for m in self.metrics_names[1:]
     ]
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
-    if getattr(self, 'train_function') is None:
+    # If we have re-compiled the loss/weighted metric sub-graphs then create
+    # train function even if one exists already. This is because
+    # `_feed_sample_weights` list has been updated on re-compile.
+    if getattr(self, 'train_function') is None or has_recompiled:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
@@ -2144,12 +2272,16 @@
         setattr(self, 'train_function', fn)
 
   def _make_test_function(self):
+    has_recompiled = self._recompile_weights_loss_and_weighted_metrics()
     metrics_tensors = [
         self._all_metrics_tensors[m] for m in self.metrics_names[1:]
     ]
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
-    if getattr(self, 'test_function') is None:
+    # If we have re-compiled the loss/weighted metric sub-graphs then create
+    # test function even if one exists already. This is because
+    # `_feed_sample_weights` list has been updated on re-compile.
+    if getattr(self, 'test_function') is None or has_recompiled:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
@@ -2204,7 +2336,7 @@
                                           allow_partial_batch=False):
     """Runs validation checks on input and target data passed by the user.
 
-    This is called when using DistributionStrategy to train, evaluate or serve
+    This is called when using tf.distribute.Strategy to train, evaluate or serve
     the model.
 
     Args:
@@ -2234,7 +2366,7 @@
     """
     if class_weight:
       raise NotImplementedError('`class_weight` is currently not supported '
-                                'when using DistributionStrategy.')
+                                'when using tf.distribute.Strategy.')
 
     if (sample_weight is not None and sample_weight.all() and
         distributed_training_utils.is_tpu_strategy(
@@ -2303,6 +2435,16 @@
         # input shape which is required for TPUs.
         drop_remainder = (not allow_partial_batch and
                           strategy.extended.experimental_require_static_shapes)
+
+        # TODO(b/131720208): We still drop remainder here if number of examples
+        # is divisible by batch size, as sometimes dynamic padder will time out
+        # with keras.metrics.CategoricalAccuracy() metric.
+        if distributed_training_utils.is_tpu_strategy(
+            strategy) and not drop_remainder:
+          dataset_size = first_x_value.shape[0]
+          if dataset_size % batch_size == 0:
+            drop_remainder = True
+
         x = ds.batch(batch_size, drop_remainder=drop_remainder)
       else:
         assert isinstance(x, dataset_ops.DatasetV2)
@@ -2510,7 +2652,8 @@
             weighted_metrics=self._compile_weighted_metrics,
             loss_weights=self.loss_weights,
             target_tensors=target_tensors,
-            run_eagerly=self.run_eagerly)
+            run_eagerly=self.run_eagerly,
+            cloning=self._cloning)
 
     # In graph mode, if we had just set inputs and targets as symbolic tensors
     # by invoking build and compile on the model respectively, we do not have to
@@ -2520,7 +2663,7 @@
     # mixed symbolic/value inputs.
     if (not self.run_eagerly and is_build_called and is_compile_called and
         not is_dataset  and any(_is_symbolic_tensor(v) for v in all_inputs)):
-      return [], [], []
+      return [], [], None
 
     # What follows is input validation and standardization to list format,
     # in the case where all inputs are value arrays.
@@ -2559,7 +2702,7 @@
         feed_sample_weight_modes = [None for _ in self.outputs]
       else:
         feed_output_names = self._feed_output_names
-        feed_sample_weight_modes = self._feed_sample_weight_modes
+        feed_sample_weight_modes = self._sample_weight_modes
         feed_output_shapes = []
         for output_shape, loss_fn in zip(self._feed_output_shapes,
                                          self._feed_loss_fns):
@@ -2610,9 +2753,18 @@
           # Additional checks to avoid users mistakenly using improper loss fns.
           training_utils.check_loss_and_target_compatibility(
               y, self._feed_loss_fns, feed_output_shapes)
+
+      # If no sample weight mode has been set and the weights are None for all
+      # model outputs, return None: no sample weight placeholders are created
+      # in that case, so there is nothing to feed.
+      is_sample_weight_mode_set = any(
+          s is not None for s in feed_sample_weight_modes)
+      if (not is_sample_weight_mode_set and
+          all(s is None for s in sample_weights)):
+        sample_weights = None  # If the list contains only None, return None
     else:
       y = []
-      sample_weights = []
+      sample_weights = None
 
     if self.stateful and batch_size:
       # Check that for stateful networks, number of samples is a multiple
@@ -2741,9 +2893,56 @@
     self.output_names = training_utils.generic_output_names(outputs)
     self.built = True
 
+  @property
+  def _targets(self):
+    """The output target tensors for the model."""
+    return [t.target for t in self._training_targets]
+
+  @property
+  def _feed_targets(self):
+    return [t.target for t in self._training_targets if t.feedable]
+
+  @property
+  def _feed_output_names(self):
+    return [t.name for t in self._training_targets if t.feedable]
+
+  @property
+  def _feed_output_shapes(self):
+    return [t.shape for t in self._training_targets if t.feedable]
+
+  @property
+  def _feed_loss_fns(self):
+    return [t.loss_fn for t in self._training_targets if t.feedable]
+
+  def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode):
+    """Maybe load initial epoch from ckpt considering possible worker recovery.
+
+    When `_ckpt_saved_epoch` attribute is not None in a `Model` object at the
+    time the training starts, this is under multi-worker training setting and
+    indicates the worker is recovering from previous failure. In this case,
+    infer `initial_epoch` from `self._ckpt_saved_epoch` to continue previous
+    unfinished training from certain epoch.
+
+    Arguments:
+      initial_epoch: The original `initial_epoch` the user passed to `fit()`.
+      mode: The training mode.
+
+    Returns:
+      If the training is recovering from previous failure under multi-worker
+      training setting, return the epoch the training is supposed to continue
+      at. Otherwise, return the `initial_epoch` the user passes in.
+    """
+    # TODO(rchao): Add recovery for validation case
+    # (when mode == ModeKeys.TEST).
+    if mode == ModeKeys.TRAIN and self._ckpt_saved_epoch is not None:
+      # The most recently saved epoch is one epoch prior to the epoch it failed
+      # at, so return '_ckpt_saved_epoch' plus one.
+      return int(self._ckpt_saved_epoch) + 1
+    return initial_epoch
+
 
 class DistributedCallbackModel(Model):
-  """Model that is used for callbacks with DistributionStrategy."""
+  """Model that is used for callbacks with tf.distribute.Strategy."""
 
   def __init__(self, model):
     super(DistributedCallbackModel, self).__init__()
@@ -2775,10 +2974,60 @@
   def __getattr__(self, item):
     # Whitelisted atttributes of the model that can be accessed by the user
     # during a callback.
-    if item not in ['_setattr_tracking']:
+    if item not in ('_setattr_tracking', '_layers'):
       logging.warning('You are accessing attribute ' + item + ' of the '
                       'DistributedCallbackModel that may not have been set '
                       'correctly.')
+    return super(DistributedCallbackModel, self).__getattr__(item)
+
+
+class _TrainingTarget(object):
+  """Container for a target tensor and its metadata (shape, loss...).
+
+  Arguments:
+    target: A target tensor for the model. It may be `None` if the
+      output is excluded from loss computation. It is still kept as None
+      since each output of the model should have a corresponding target. If
+      the target is None, the rest of the attributes will be None as well.
+    name: String, the name of the target tensor.
+    shape: The shape of the target tensor.
+    feedable: Boolean, whether the target is feedable (requires data to be
+      passed in `fit` or `train_on_batch`), or not (model compiled with
+      `target_tensors` argument).
+    loss_fn: The loss function corresponding to this target. May be `None`.
+  """
+
+  def __init__(self,
+               target,
+               name=None,
+               shape=None,
+               feedable=False,
+               loss_fn=None):
+    self._target = target
+    self._name = name
+    self._shape = shape
+    self._feedable = feedable
+    self._loss_fn = loss_fn
+
+  @property
+  def target(self):
+    return self._target
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def shape(self):
+    return self._shape
+
+  @property
+  def feedable(self):
+    return self._feedable
+
+  @property
+  def loss_fn(self):
+    return self._loss_fn
 
 
 def _is_symbolic_tensor(x):
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index c6d11e5..ab890a1 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -140,13 +140,15 @@
   if mode == ModeKeys.TRAIN:
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
 
-  # Enter DistributionStrategy scope.
+  # Enter tf.distribute.Strategy scope.
   if model._distribution_strategy:
     scope = distributed_training_utils.distributed_scope(
         strategy=model._distribution_strategy,
         learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
     scope.__enter__()
 
+  model._update_sample_weight_modes(sample_weights=sample_weights)
+
   # Get step function and loop type.
   f = _make_execution_function(model, mode)
   use_steps = is_dataset or steps_per_epoch is not None
@@ -166,6 +168,9 @@
     ins = inputs
   else:
     ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+    # `ins` is a function when a distribute strategy is used in Eager mode.  In
+    # that case `is_dataset` is True.  The code branches that have requirements
+    # about the type of `ins` do not trigger in the distributed case.
   if not is_dataset:
     num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                      steps_per_epoch)
@@ -230,6 +235,8 @@
   callbacks._call_begin_hook(mode)
   progbar.on_train_begin()
 
+  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)
+
   for epoch in range(initial_epoch, epochs):
     if callbacks.model.stop_training:
       break
@@ -258,7 +265,7 @@
 
         # Get outputs.
         try:
-          # `ins` can be callable in DistributionStrategy + eager case.
+          # `ins` can be callable in tf.distribute.Strategy + eager case.
           actual_inputs = ins() if callable(ins) else ins
           batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
@@ -298,7 +305,7 @@
           batch_outs = [batch_outs]
 
         if model._distribution_strategy:
-          batch_outs = distributed_training_utils._per_device_aggregate_batch(
+          batch_outs = distributed_training_utils._per_replica_aggregate_batch(
               batch_outs, model, mode)
 
         # Aggregate results.
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index b45c645..b125969 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -61,7 +61,7 @@
   """Fit loop for Distribution Strategies."""
   distributed_training_utils.validate_callbacks(callbacks, model.optimizer)
   distributed_training_utils.validate_inputs(
-      x, y, model._distribution_strategy)
+      x, y)
 
   first_x_value = nest.flatten(x)[0]
   if isinstance(first_x_value, np.ndarray):
@@ -82,25 +82,34 @@
       validation_split=validation_split,
       shuffle=shuffle,
       repeat=True)
+  if not distributed_training_utils.is_distributing_by_cloning(model):
+    with model._distribution_strategy.scope():
+      (dataset, _, _) = model._standardize_user_data(
+          dataset,
+          sample_weight=sample_weight,
+          class_weight=class_weight,
+          batch_size=batch_size,
+          validation_split=validation_split,
+          shuffle=shuffle)
 
   val_dataset = None
   if validation_data:
     val_x, val_y, val_sample_weights = model._unpack_validation_data(
         validation_data)
-    distributed_training_utils.validate_inputs(
-        val_x, val_y, model._distribution_strategy)
+    distributed_training_utils.validate_inputs(val_x, val_y)
     first_valx_value = nest.flatten(val_x)[0]
     if isinstance(first_valx_value, np.ndarray):
       validation_steps, _ = distributed_training_utils.get_input_params(
           model._distribution_strategy, first_valx_value, validation_steps,
-          batch_size)
+          batch_size, mode=ModeKeys.TEST)
     val_dataset = model._distribution_standardize_user_data(
         val_x, val_y,
         sample_weight=val_sample_weights,
         class_weight=None,
         batch_size=batch_size,
         validation_split=validation_split,
-        shuffle=shuffle)
+        shuffle=shuffle,
+        allow_partial_batch=True)
   elif validation_split:
     raise ValueError('validation_split argument is not supported with '
                      'distribution strategies.')
@@ -143,16 +152,18 @@
                          steps=None,
                          callbacks=None):
   """Evaluate loop for Distribution Strategies."""
-  distributed_training_utils.validate_inputs(x, y, model._distribution_strategy)
+  distributed_training_utils.validate_inputs(x, y)
   first_x_value = nest.flatten(x)[0]
   if isinstance(first_x_value, np.ndarray):
     steps, batch_size = distributed_training_utils.get_input_params(
-        model._distribution_strategy, first_x_value, steps, batch_size)
+        model._distribution_strategy, first_x_value, steps, batch_size,
+        mode=ModeKeys.TEST)
   batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
   dataset = model._distribution_standardize_user_data(
       x, y,
       sample_weight=sample_weight,
-      batch_size=batch_size)
+      batch_size=batch_size,
+      allow_partial_batch=True)
 
   if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
     return experimental_tpu_test_loop(
@@ -174,8 +185,7 @@
                         steps=None,
                         callbacks=None):
   """Predict loop for Distribution Strategies."""
-  distributed_training_utils.validate_inputs(
-      x, None, model._distribution_strategy, allow_partial_batch=True)
+  distributed_training_utils.validate_inputs(x, None)
   first_x_value = nest.flatten(x)[0]
   if isinstance(first_x_value, np.ndarray):
     steps, batch_size = distributed_training_utils.get_input_params(
@@ -199,7 +209,7 @@
         callbacks=callbacks)
 
 
-def _per_device_execution_function(model, mode):
+def _per_replica_execution_function(model, mode):
   exec_func = model._make_execution_function(mode)
   return (exec_func.inputs, exec_func.outputs, exec_func.updates_op,
           exec_func.session_kwargs)
@@ -229,12 +239,24 @@
 
   def _step_fn(ctx, inputs):
     """A step fn that returns update ops."""
-    inputs, targets = inputs
+    if isinstance(inputs, (tuple, list)) and len(inputs) == 2:
+      inputs, targets = inputs
+    else:
+      targets = None
+
+    # When the input features are a dictionary of tensors, the dictionary is
+    # flattened to a list and passed as the model input. This causes an input
+    # mismatch when the model's input layer names are not in alphabetical
+    # order, because `nest.flatten()` sorts dictionary elements by key. So,
+    # convert the inputs to a list ordered by `model._feed_input_names`.
+    if isinstance(inputs, dict):
+      inputs = [inputs[input_name] for input_name in model._feed_input_names]
+
     _build_model(strategy, model, mode, inputs, targets)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_device_execution_function,
+         _per_replica_execution_function,
          args=(distributed_training_utils.get_distributed_model(model, mode),
                mode))
     (all_inputs, all_outputs, all_updates,
@@ -275,7 +297,7 @@
                               val_dataset=None,
                               validation_steps=None,
                               validation_freq=1):
-  """Fit loop for training with TPU DistributionStrategy.
+  """Fit loop for training with TPU tf.distribute.Strategy.
 
   Arguments:
       model: Keras Model instance.
@@ -371,6 +393,9 @@
   target_steps = len(steps_to_run)
 
   callbacks._call_begin_hook(mode)
+
+  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)
+
   for epoch in range(initial_epoch, epochs):
     distributed_training_utils._reset_metrics(model)
     callbacks.on_epoch_begin(epoch)
@@ -443,7 +468,7 @@
                                verbose=0,
                                steps=None,
                                callbacks=None):
-  """Test loop for evaluating with TPU DistributionStrategy.
+  """Test loop for evaluating with TPU tf.distribute.Strategy.
 
   Arguments:
       model: Keras Model instance.
@@ -475,12 +500,16 @@
 
   def _test_step_fn(inputs):
     """A fn that returns output of single test step."""
-    inputs, targets = inputs
+    if isinstance(inputs, (tuple, list)) and len(inputs) == 2:
+      inputs, targets = inputs
+    else:
+      targets = None
+
     (distribution_strategy_context.get_replica_context().merge_call(
         _build_model, args=(model, mode, inputs, targets)))
 
     (_, outputs, updates, _) = (
-        _per_device_execution_function(
+        _per_replica_execution_function(
             distributed_training_utils.get_distributed_model(model, mode),
             mode))
     with ops.control_dependencies([updates]):
@@ -574,7 +603,7 @@
                                   verbose=0,
                                   steps=None,
                                   callbacks=None):
-  """Predict loop for predicting with TPU DistributionStrategy.
+  """Predict loop for predicting with TPU tf.distribute.Strategy.
 
   Arguments:
       model: Keras Model instance.
@@ -631,7 +660,7 @@
         _build_model, args=(model, mode, inputs)))
 
     (_, outputs, updates, _) = (
-        _per_device_execution_function(
+        _per_replica_execution_function(
             distributed_training_utils.get_distributed_model(model, mode),
             mode))
 
@@ -644,7 +673,7 @@
   predict_input_data = iterator.get_next()
   per_replica_outputs = current_strategy.experimental_run_v2(
       _predict_step_fn, args=(predict_input_data,))
-  output_tensors = distributed_training_utils.flatten_perdevice_values(
+  output_tensors = distributed_training_utils.flatten_per_replica_values(
       current_strategy, per_replica_outputs)
 
   if verbose >= 1:
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index ff9bf41..b2fbdff 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -28,6 +28,7 @@
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer
 from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -72,7 +73,7 @@
       for m in model.metrics
       if m not in model._compile_metric_functions
   ])
-  return [backend.mean(t) for t in metric_results]
+  return metric_results
 
 
 def _model_loss(model,
@@ -99,6 +100,7 @@
      regularization losses and applies masking and sample weighting
      to the loss value.
   """
+  # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.
   # Used to keep track of the total loss value (stateless).
   # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
   #                   loss_weight_2 * output_2_loss_fn(...) +
@@ -119,8 +121,7 @@
   outs = model(inputs, **kwargs)
 
   outs = nest.flatten(outs)
-  # `None` by default for `EagerTensors`.
-  masks = [t._keras_mask for t in outs]
+  masks = [getattr(t, '_keras_mask', None) for t in outs]
   targets = nest.flatten(targets)
 
   # Used to keep track of individual output losses.
@@ -145,21 +146,24 @@
                 losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
             weights *= mask
 
-        # Reset reduction on the loss so that we can get the per sample loss
-        # value. We use this to get both the stateless and stateful loss
-        # values without having to compute the underlying loss function
-        # twice.
         weighted_losses = None
         if hasattr(loss_fn, 'reduction'):
-          current_loss_reduction = loss_fn.reduction
-          loss_fn.reduction = losses_utils.ReductionV2.NONE
-          weighted_losses = loss_fn(targets[i], outs[i], sample_weight=weights)
-          loss_fn.reduction = current_loss_reduction
+          per_sample_losses = loss_fn.call(targets[i], outs[i])
+          weighted_losses = losses_utils.compute_weighted_loss(
+              per_sample_losses,
+              sample_weight=weights,
+              reduction=losses_utils.ReductionV2.NONE)
+          loss_reduction = loss_fn.reduction
+
+          # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
+          # compile use cases.
+          if loss_reduction == losses_utils.ReductionV2.AUTO:
+            loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
 
           # Compute the stateless loss value.
-          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
-          if (current_loss_reduction ==
-              losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE):
+          output_loss = losses_utils.reduce_weighted_loss(
+              weighted_losses, reduction=loss_reduction)
+          if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
             output_loss = losses_utils.scale_loss_for_distribution(output_loss)
         else:
           # Compute the stateless loss value for a custom loss class.
@@ -176,20 +180,11 @@
       # associated with a model, each output's loss is calculated and returned
       # as part of the loss_metrics.
       if len(model.outputs) > 1:
-        # Compute the stateful loss value.
-        if weighted_losses is not None:
-          aggregated_output_loss = output_loss_metrics[i](weighted_losses)
-        else:
-          # Custom loss class.
-          aggregated_output_loss = training_utils.call_metric_function(
-              output_loss_metrics[i], targets[i], outs[i], weights=weights)
         # Keep track of the stateful output loss result.
-        output_losses.append(aggregated_output_loss)
+        output_losses.append(output_loss_metrics[i](output_loss))
 
       total_loss += model.loss_weights_list[i] * output_loss
 
-    if loss_fns:
-      total_loss = backend.mean(total_loss)
     # Add regularization losses
     custom_losses = model.losses
     if custom_losses:
@@ -240,13 +235,24 @@
       if total_loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
+      if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer):
+        # TODO(reedwm): Make loss_scale public instead of accessing private
+        # _loss_scale attribute.
+        loss_scale = model.optimizer._loss_scale()
+        scaled_total_loss = loss_scale_optimizer.scale_loss(total_loss,
+                                                            loss_scale)
+      else:
+        loss_scale = None
+        scaled_total_loss = total_loss
     if training:
       if not model.trainable_weights:
         logging.warning('The list of trainable weights is empty. Make sure that'
                         ' you are not setting model.trainable to False before '
                         'compiling the model.')
       else:
-        grads = tape.gradient(total_loss, model.trainable_weights)
+        grads = tape.gradient(scaled_total_loss, model.trainable_weights)
+        if loss_scale is not None:
+          grads = loss_scale_optimizer.unscale_grads(grads, loss_scale)
         model.optimizer.apply_gradients(zip(grads,
                                             model.trainable_weights))
     return outs, total_loss, output_losses, masks
@@ -272,12 +278,13 @@
   """
   if isinstance(inputs, collections.Sequence):
     if len(inputs) and tensor_util.is_tensor(inputs[0]):
-      inputs = training_utils.cast_if_floating_dtype(inputs)
+      inputs = training_utils.cast_if_floating_to_model_input_dtypes(inputs,
+                                                                     model)
       if targets:
         targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype(
-          [ops.convert_to_tensor(val) for val in inputs])
+      inputs = training_utils.cast_if_floating_to_model_input_dtypes(
+          [ops.convert_to_tensor(val) for val in inputs], model)
       if targets:
         targets = training_utils.cast_if_floating_dtype(
             [ops.convert_to_tensor(val) for val in targets])
@@ -302,7 +309,12 @@
   total_loss = nest.flatten(total_loss)
   results = total_loss + output_losses + metrics_results
 
-  return [tensor_util.constant_value(v) for v in results]
+  return [_non_none_constant_value(v) for v in results]
+
+
+def _non_none_constant_value(v):
+  constant_value = tensor_util.constant_value(v)
+  return constant_value if constant_value is not None else v
 
 
 def test_on_batch(model,
@@ -325,11 +337,12 @@
   """
   if isinstance(inputs, collections.Sequence):
     if len(inputs) and tensor_util.is_tensor(inputs[0]):
-      inputs = training_utils.cast_if_floating_dtype(inputs)
+      inputs = training_utils.cast_if_floating_to_model_input_dtypes(inputs,
+                                                                     model)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype(
-          [ops.convert_to_tensor(val) for val in inputs])
+      inputs = training_utils.cast_if_floating_to_model_input_dtypes(
+          [ops.convert_to_tensor(val) for val in inputs], model)
       targets = training_utils.cast_if_floating_dtype(
           [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
@@ -352,4 +365,4 @@
   total_loss = nest.flatten(total_loss)
   results = total_loss + output_losses + metrics_results
 
-  return [tensor_util.constant_value(v) for v in results]
+  return [_non_none_constant_value(v) for v in results]
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 3c91790..9e9c60a 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -190,6 +190,9 @@
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
   progbar.on_train_begin()
+
+  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)
+
   for epoch in range(initial_epoch, epochs):
     if callbacks.model.stop_training:
       break
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 5e7d828..c2164c5 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -1047,15 +1047,36 @@
     model.add_loss(2 * math_ops.reduce_mean(
         keras.losses.mean_absolute_error(targets, outputs)))
 
+    model.add_loss(keras.losses.MeanAbsoluteError()(targets, outputs))
+
     model.compile(
-        keras.optimizer_v2.gradient_descent.SGD(0.033333),
+        keras.optimizer_v2.gradient_descent.SGD(0.025),
         loss=keras.losses.MeanAbsoluteError(),
         run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.array([[0.], [1.], [2.]])
     y = np.array([[0.5], [2.], [3.5]])
     history = model.fit([x, y], y, batch_size=3, epochs=5)
-    self.assertAllClose(history.history['loss'], [3., 2.7, 2.4, 2.1, 1.8], 1e-3)
+    self.assertAllClose(history.history['loss'], [4., 3.6, 3.2, 2.8, 2.4], 1e-3)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_unconditional_add_loss_correctness(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs, training=None):
+        # Reachable from the inputs but marked as unconditional.
+        self.add_loss(math_ops.reduce_sum(inputs))
+        return inputs
+
+    inputs = keras.Input((3,))
+    layer = MyLayer()
+    outputs = layer(inputs)
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(len(model.losses), 1)
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
+    self.assertEqual(loss, 2 * 3)
 
   @keras_parameterized.run_all_keras_modes
   def test_clear_losses(self):
@@ -1064,7 +1085,7 @@
 
       def __init__(self):
         super(LayerWithSharedNestedLossLayer, self).__init__()
-        self.loss_layer = keras.layers.ActivityRegularization()
+        self.loss_layer = keras.layers.ActivityRegularization(l2=0.001)
         self.add_weight(shape=(1,), regularizer='l2')
 
       def call(self, x):
@@ -1074,12 +1095,20 @@
     inputs = keras.Input(shape=(1,))
     outputs = LayerWithSharedNestedLossLayer()(inputs)
     model = keras.Model(inputs, outputs)
+    # Weight loss + 2 activity losses.
+    self.assertEqual(len(model.losses), 3)
 
-    model(array_ops.ones((1, 1)))
-    self.assertEqual(len(model.losses), 3)  # Weight loss + 2 activity losses.
-
-    model(array_ops.ones((1, 1)))
-    self.assertEqual(len(model.losses), 3)  # Losses are reset upon __call__.
+    x = array_ops.ones((1, 1))
+    model(x)
+    y = array_ops.ones((1, 1))
+    model(y)
+    if context.executing_eagerly():
+      # Eager losses are cleared every `__call__`.
+      self.assertEqual(len(model.losses), 3)
+    else:
+      self.assertEqual(len(model.get_losses_for(x)), 2)
+      self.assertEqual(len(model.get_losses_for(y)), 2)
+      self.assertEqual(len(model.get_losses_for(None)), 1)
 
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
@@ -1101,6 +1130,36 @@
 
     self.assertLen(model.trainable_variables, 3)
 
+  # TODO(b/131372221): Make this work with subclassed models.
+  @keras_parameterized.run_with_all_model_types(exclude_models=['subclass'])
+  @keras_parameterized.run_all_keras_modes
+  def test_model_dtype(self):
+
+    class AssertTypeLayer(keras.layers.Layer):
+
+      def __init__(self, assert_type=None, **kwargs):
+        super(AssertTypeLayer, self).__init__(**kwargs)
+        self.assert_type = assert_type
+
+      def call(self, inputs):
+        assert inputs.dtype.name == self.assert_type, (
+            'Input tensor has type %s which does not match assert type %s' %
+            (inputs.dtype.name, self.assert_type))
+        return inputs + 1.
+
+    for dtype in ('float16', 'float32', 'float64'):
+      model = testing_utils.get_model_from_layers([AssertTypeLayer(dtype)],
+                                                  input_shape=(10,),
+                                                  input_dtype=dtype)
+      model.compile('sgd', 'mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      x = np.ones((10, 10), dtype=dtype)
+      y = np.ones((10, 10), dtype=dtype)
+      model.fit(x, y)
+      model.test_on_batch(x, y)
+      model(x)
+
 
 class TestExceptionsAndWarnings(keras_parameterized.TestCase):
 
@@ -1291,11 +1350,14 @@
         x_train[:batch_size],
         y_train[:batch_size],
         sample_weight=sample_weight[:batch_size])
-    ref_score = model.evaluate(x_test, y_test, verbose=0)
-    if not context.executing_eagerly():
-      score = model.evaluate(
-          x_test[test_ids, :], y_test[test_ids, :], verbose=0)
-      self.assertLess(score[0], ref_score[0])
+    ref_score = model.evaluate(
+        x_test, y_test, verbose=0, sample_weight=sample_weight)
+    score = model.evaluate(
+        x_test[test_ids, :],
+        y_test[test_ids, :],
+        verbose=0,
+        sample_weight=sample_weight[test_ids])
+    self.assertLess(score[0], ref_score[0])
 
   @keras_parameterized.run_all_keras_modes
   def test_temporal_sample_weights(self):
@@ -2548,6 +2610,14 @@
     model = keras.models.Model(x, y)
     model.add_metric(
         math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+
+    if context.executing_eagerly():
+      # This is not a use case in v1 graph mode.
+      mean_result = metrics_module.Mean()(y)
+      with self.assertRaisesRegex(
+          ValueError, 'Expected a symbolic Tensor for the metric value'):
+        model.add_metric(mean_result, name='metric_2')
+
     with self.assertRaisesRegex(
         ValueError, 'Using the result of calling a `Metric` object '):
       with keras.backend.get_graph().as_default():
@@ -2676,6 +2746,13 @@
     model.add_metric(
         math_ops.reduce_sum(y), name='metric_3', aggregation='mean')
 
+    if context.executing_eagerly():
+      # This is not a use case in v1 graph mode.
+      mean_result = metrics_module.Mean()(y)
+      with self.assertRaisesRegex(
+          ValueError, 'Expected a symbolic Tensor for the metric value'):
+        model.add_metric(mean_result, name='metric_4')
+
     with self.assertRaisesRegex(
         ValueError, 'Using the result of calling a `Metric` object '):
       with keras.backend.get_graph().as_default():
@@ -2890,7 +2967,7 @@
     return math_ops.cast(self.counter, inputs.dtype) * inputs
 
 
-class AddUpdateLayer(keras.layers.Layer):
+class LambdaUpdateLayer(keras.layers.Layer):
 
   def build(self, input_shape):
     self.counter = self.add_weight(
@@ -2902,7 +2979,7 @@
 
   def call(self, inputs):
     # Make sure update isn't run twice.
-    self.add_update(state_ops.assign_add(self.counter, 1))
+    self.add_update(lambda: state_ops.assign_add(self.counter, 1))
     return math_ops.cast(self.counter, inputs.dtype) * inputs
 
 
@@ -2920,12 +2997,31 @@
     return self.layer(inputs)
 
 
+class SubgraphUpdateLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.counter = self.add_weight(
+        'counter',
+        dtype='int32',
+        shape=(),
+        initializer='zeros',
+        trainable=False)
+
+  def call(self, inputs, training=None):
+    if training is None:
+      training = keras.backend.learning_phase()
+
+    if training:
+      self.counter.assign(self.counter + 1)
+    return inputs
+
+
 @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
 class TestAutoUpdates(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types
   @parameterized.named_parameters(('bare_update', BareUpdateLayer()),
-                                  ('add_update', AddUpdateLayer()),
+                                  ('lambda_update', LambdaUpdateLayer()),
                                   ('nested_update', NestedUpdateLayer()))
   def test_updates_in_model(self, layer):
     x, y = np.ones((10, 10)), np.ones((10, 1))
@@ -2933,16 +3029,34 @@
         [layer, keras.layers.Dense(1)], input_shape=(10,))
     model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     model.fit(x, y, batch_size=2, epochs=1)
-    if not testing_utils.should_run_eagerly():
-      # Check that `trainable=False` disables updates.
-      layer.trainable = False
-      model.compile(
-          'sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
-      model.fit(x, y, batch_size=2, epochs=1)
+    self.assertEqual(self.evaluate(layer.counter), 5)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_lambda_updates_trainable_false(self):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    layer = LambdaUpdateLayer()
+    model = testing_utils.get_model_from_layers(
+        [layer, keras.layers.Dense(1)], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, batch_size=2, epochs=1)
+    self.assertEqual(self.evaluate(layer.counter), 5)
+    layer.trainable = False
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, batch_size=2, epochs=1)
+    self.assertEqual(self.evaluate(layer.counter), 5)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_subgraph_updates_in_model(self):
+    layer = SubgraphUpdateLayer()
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    model = testing_utils.get_model_from_layers(
+        [layer, keras.layers.Dense(1)], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, batch_size=2, epochs=1)
     self.assertEqual(self.evaluate(layer.counter), 5)
 
   @parameterized.named_parameters(('bare_update', BareUpdateLayer()),
-                                  ('add_update', AddUpdateLayer()),
+                                  ('lambda_update', LambdaUpdateLayer()),
                                   ('nested_update', NestedUpdateLayer()))
   def test_updates_standalone_layer(self, layer):
     y = layer(np.ones((10, 10)))
@@ -2950,23 +3064,23 @@
     self.evaluate(y)
     self.assertEqual(self.evaluate(layer.counter), 1)
 
-  def test_trainable_false(self):
-    x = keras.backend.placeholder(shape=(10, 10), dtype='float32')
-    layer = NestedUpdateLayer()
+  def test_trainable_false_standalone_layer(self):
+    layer = LambdaUpdateLayer()
+    y = layer(np.ones((10, 10)))
+    self.evaluate(layer.counter.initializer)
+    self.evaluate(y)
+    self.assertEqual(self.evaluate(layer.counter), 1)
     layer.trainable = False
-    y = layer(x)
-    func = keras.backend.function([x], [y])
-    x_val = np.ones((10, 10))
-    func(x_val)
-    counter = keras.backend.get_value(layer.counter)
-    self.assertEqual(counter, 0)
+    y = layer(np.ones((10, 10)))
+    self.evaluate(y)
+    self.assertEqual(self.evaluate(layer.counter), 1)
 
   @keras_parameterized.run_with_all_model_types
   def test_batchnorm_trainable_false(self):
     bn = keras.layers.BatchNormalization()
-    bn.trainable = False
     model = testing_utils.get_model_from_layers([bn, keras.layers.Dense(1)],
                                                 input_shape=(10,))
+    bn.trainable = False
     model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     x, y = np.ones((10, 10)), np.ones((10, 1))
     model.fit(x, y, batch_size=2, epochs=1)
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index ad77e89..bd17ad0 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -42,7 +42,7 @@
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.utils import generic_utils
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -683,7 +683,7 @@
   # Iterator may return sample_weight as 1-tuple
   if isinstance(sample_weight, tuple):
     sample_weight = sample_weight[0]
-  if sample_weight_mode is not None:
+  if sample_weight_mode is not None and sample_weight_mode != 'samplewise':
     if sample_weight_mode != 'temporal':
       raise ValueError('"sample_weight_mode '
                        'should be None or "temporal". '
@@ -866,7 +866,8 @@
       weights = mask
     else:
       # Update dimensions of weights to match with mask.
-      mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
+      mask, _, weights = losses_utils.squeeze_or_expand_dimensions(
+          mask, None, weights)
       weights *= mask
 
   if y_pred is not None:
@@ -876,7 +877,7 @@
 
 
 def get_loss_function(loss):
-  """Returns the loss function corresponding to the given loss input."""
+  """Returns the loss corresponding to the loss input in `compile` API."""
   if loss is None or isinstance(loss, losses.Loss):
     return loss
 
@@ -891,7 +892,14 @@
   # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
   # in `LossFunctionWrapper` class.
   loss_fn = losses.get(loss)
-  return losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
+
+  # For losses which are given as strings/functions in the compile API,
+  # we always set the loss reduction type to be `SUM_OVER_BATCH_SIZE`
+  # (both in distribution strategy context and otherwise).
+  return losses.LossFunctionWrapper(
+      loss_fn,
+      name=loss_fn.__name__,
+      reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
 
 
 def validate_dataset_input(x, y, sample_weight, validation_split=None):
@@ -985,10 +993,11 @@
   return False
 
 
-def cast_single_tensor(x):
+def cast_single_tensor(x, dtype=None):
   x = ops.convert_to_tensor(x)
+  dtype = dtype or K.floatx()
   if x.dtype.is_floating:
-    return math_ops.cast(x, dtype=K.floatx())
+    return math_ops.cast(x, dtype=dtype)
   return x
 
 
@@ -1005,34 +1014,50 @@
   return nest.map_structure(cast_single_tensor, x)
 
 
-def get_output_sample_weight_and_mode(skip_target_weighing_indices,
-                                      sample_weight_mode, output_name,
-                                      output_index):
-  """Returns the sample weight and weight mode for a single output."""
-  if output_index in skip_target_weighing_indices:
-    return None, None
+def cast_if_floating_to_model_input_dtypes(x, model):
+  """Casts the given data tensors to the dtypes of the model inputs.
 
+  Casts only if the input is already a floating point type.
+
+  Args:
+    x: tensor or list/tuple of tensors.
+    model: The model whose `inputs` supply the target dtypes.
+
+  Returns:
+    Converted input. Each tensor is cast to the dtype of the corresponding
+    input in `model.inputs`.
+  """
+  # TODO(b/131372221): We should probably cast even if the input is not
+  # floating-point.
+  input_dtypes = nest.map_structure(lambda t: t.dtype, model.inputs)
+  return nest.map_structure(cast_single_tensor, x, input_dtypes)
+
+
+def get_output_sample_weight(skip_target_weighing_indices, sample_weight_mode,
+                             output_name, output_index):
+  """Returns the sample weight placeholder for a single output, or None."""
+  if (output_index in skip_target_weighing_indices or
+      sample_weight_mode is None or context.executing_eagerly()):
+    return None
+
+  assert sample_weight_mode in ['temporal', 'samplewise']
   if sample_weight_mode == 'temporal':
     default_value = [[1.]]
     shape = [None, None]
-    mode = 'temporal'
-  else:
+  elif sample_weight_mode == 'samplewise':
     default_value = [1.]
     shape = [None]
-    mode = None
-  if context.executing_eagerly():
-    weight = None
-  else:
-    weight = array_ops.placeholder_with_default(
-        constant_op.constant(default_value, dtype=K.floatx()),
-        shape=shape,
-        name=output_name + '_sample_weights')
-  return weight, mode
+
+  weight = array_ops.placeholder_with_default(
+      constant_op.constant(default_value, dtype=K.floatx()),
+      shape=shape,
+      name=output_name + '_sample_weights')
+  return weight
 
 
-def prepare_sample_weights(output_names, sample_weight_mode,
-                           skip_target_weighing_indices):
-  """Prepares sample weights for the model.
+def prepare_sample_weight_modes(output_names, sample_weight_mode,
+                                skip_target_weighing_indices):
+  """Prepares sample weight modes for the model.
 
   Args:
     output_names: List of model output names.
@@ -1041,44 +1066,44 @@
       should be skipped.
 
   Returns:
-    A pair of list of sample weights and sample weight modes
-      (one for each output).
+    List of sample weight modes (one for each output).
 
   Raises:
     ValueError: In case of invalid `sample_weight_mode` input.
   """
-  sample_weights = []
-  sample_weight_modes = []
+
   if isinstance(sample_weight_mode, collections.Mapping):
     generic_utils.check_for_unexpected_keys('sample_weight_mode',
                                             sample_weight_mode, output_names)
+
+    sample_weight_modes = []
     for i, name in enumerate(output_names):
-      if (i not in skip_target_weighing_indices and
-          name not in sample_weight_mode):
-        raise ValueError('Output missing from sample_weight_modes dictionary')
-      weight, mode = get_output_sample_weight_and_mode(
-          skip_target_weighing_indices, sample_weight_mode.get(name), name, i)
-      sample_weights.append(weight)
-      sample_weight_modes.append(mode)
-  elif isinstance(sample_weight_mode, list):
+      if i in skip_target_weighing_indices:
+        sample_weight_modes.append(None)
+      elif name not in sample_weight_mode:
+        raise ValueError('Output ' + name +
+                         ' missing from `_sample_weight_modes` dictionary')
+      else:
+        sample_weight_modes.append(sample_weight_mode.get(name))
+    return sample_weight_modes
+
+  if isinstance(sample_weight_mode, (list, tuple)):
     if len(sample_weight_mode) != len(output_names):
       raise ValueError('When passing a list as sample_weight_mode, '
                        'it should have one entry per model output. '
                        'The model has ' + str(len(output_names)) +
                        ' outputs, but you passed ' +
-                       str(len(sample_weight_mode)) + 'sample_weight_modes')
-    for i, name in enumerate(output_names):
-      weight, mode = get_output_sample_weight_and_mode(
-          skip_target_weighing_indices, sample_weight_mode[i], name, i)
-      sample_weights.append(weight)
-      sample_weight_modes.append(mode)
-  else:
-    for i, name in enumerate(output_names):
-      weight, mode = get_output_sample_weight_and_mode(
-          skip_target_weighing_indices, sample_weight_mode, name, i)
-      sample_weights.append(weight)
-      sample_weight_modes.append(mode)
-  return sample_weights, sample_weight_modes
+                       str(len(sample_weight_mode)) + ' sample_weight_modes.')
+
+    return [
+        None if i in skip_target_weighing_indices else sample_weight_mode[i]
+        for i in range(len(output_names))
+    ]
+
+  return [
+      None if i in skip_target_weighing_indices else sample_weight_mode
+      for i in range(len(output_names))
+  ]
 
 
 def prepare_loss_functions(loss, output_names):
@@ -1316,14 +1341,17 @@
 
 def get_iterator(dataset):
   """Create and initialize an iterator from a dataset."""
-  iterator = dataset_ops.make_initializable_iterator(dataset)
+  if context.executing_eagerly():
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+  else:
+    iterator = dataset_ops.make_initializable_iterator(dataset)
   initialize_iterator(iterator)
   return iterator
 
 
 def initialize_iterator(iterator):
-  init_op = iterator.initializer
   if not context.executing_eagerly():
+    init_op = iterator.initializer
     K.get_session((init_op,)).run(init_op)
 
 
diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py
index 86895c1..a7f69f0 100644
--- a/tensorflow/python/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -82,7 +82,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the random
       values to generate. Defaults to 0.05.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: The data type. Only floating point types are supported.
     
   Returns:
@@ -106,7 +106,7 @@
     maxval: A python scalar or a scalar tensor. Upper bound of the range of
       random values to generate. Defaults to 0.05.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: The data type.
     
   Returns:
@@ -131,7 +131,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the random
       values to generate. Defaults to 0.05.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index e9785ac..59682e0 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -226,6 +226,20 @@
         model.get_config(), custom_objects={'my_initializer': my_initializer})
     self.assertEqual(model2.layers[1].kernel_initializer, my_initializer)
 
+  @test_util.run_v2_only
+  def test_load_external_variance_scaling_v2(self):
+    external_serialized_json = {
+        'class_name': 'VarianceScaling',
+        'config': {
+            'distribution': 'normal',
+            'mode': 'fan_avg',
+            'scale': 1.0,
+            'seed': None
+        }
+    }
+    initializer = keras.initializers.deserialize(external_serialized_json)
+    self.assertEqual(initializer.distribution, 'truncated_normal')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 7250db2..cd1049b 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -91,8 +91,8 @@
         metrics=['acc'],
         run_eagerly=testing_utils.should_run_eagerly())
     if not testing_utils.should_run_eagerly():
-      self.assertEqual(len(model.losses), 2)
-      self.assertEqual(len(model.updates), 2)
+      self.assertEqual(len(model.get_losses_for(None)), 2)
+      self.assertEqual(len(model.get_updates_for(x)), 2)
     history = model.fit(x_train, y_train, epochs=10, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 950e9b0..50ca739 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -79,6 +79,7 @@
 from tensorflow.python.keras.layers.core import ActivityRegularization
 
 # Dense Attention layers.
+from tensorflow.python.keras.layers.dense_attention import AdditiveAttention
 from tensorflow.python.keras.layers.dense_attention import Attention
 
 # Embedding layers.
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index ea08878..82238fa 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -421,7 +421,8 @@
       keras.layers.ZeroPadding3D(padding=None)
 
 
-@test_util.disable_all_xla('align_corners=False not supported by XLA')
+@test_util.for_all_test_methods(test_util.disable_xla,
+                                'align_corners=False not supported by XLA')
 @keras_parameterized.run_all_keras_modes
 class UpSamplingTest(keras_parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 73fbe23..a57c4ab 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -717,6 +717,9 @@
         output_shape` If a function, it specifies the entire shape as a function
         of the
       input shape: `output_shape = f(input_shape)`
+    mask: Either None (indicating no masking) or a callable with the same
+      signature as the `compute_mask` layer method, or a tensor that will be
+      returned as output mask regardless of what the input is.
     arguments: Optional dictionary of keyword arguments to be passed to the
       function.
   Input shape: Arbitrary. Use the keyword argument input_shape (tuple of
@@ -800,87 +803,66 @@
     return self.mask
 
   def get_config(self):
-    module = self.function.__module__
-    if isinstance(self.function, python_types.LambdaType):
-      function = generic_utils.func_dump(self.function)
-      function_type = 'lambda'
-    else:
-      function = self.function.__name__
-      function_type = 'function'
-
-    output_shape_module = None
-    if isinstance(self._output_shape, python_types.LambdaType):
-      output_shape = generic_utils.func_dump(self._output_shape)
-      output_shape_type = 'lambda'
-      output_shape_module = self._output_shape.__module__
-    elif callable(self._output_shape):
-      output_shape = self._output_shape.__name__
-      output_shape_type = 'function'
-      output_shape_module = self._output_shape.__module__
-    else:
-      output_shape = self._output_shape
-      output_shape_type = 'raw'
-
+    function_config = self._serialize_function_to_config(self.function)
+    output_shape_config = self._serialize_function_to_config(self._output_shape,
+                                                             allow_raw=True)
     config = {
-        'function': function,
-        'module': module,
-        'function_type': function_type,
-        'output_shape': output_shape,
-        'output_shape_type': output_shape_type,
-        'output_shape_module': output_shape_module,
-        'arguments': self.arguments
+        'function': function_config[0],
+        'function_type': function_config[1],
+        'module': function_config[2],
+        'output_shape': output_shape_config[0],
+        'output_shape_type': output_shape_config[1],
+        'output_shape_module': output_shape_config[2],
     }
+    if self.mask is not None:
+      mask_config = self._serialize_function_to_config(self.mask)
+      config.update({
+          'mask': mask_config[0],
+          'mask_type': mask_config[1],
+          'mask_module': mask_config[2]
+      })
+    config['arguments'] = self.arguments
+
     base_config = super(Lambda, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  def _serialize_function_to_config(self, inputs, allow_raw=False):
+    if isinstance(inputs, python_types.LambdaType):
+      output = generic_utils.func_dump(inputs)
+      output_type = 'lambda'
+      module = inputs.__module__
+    elif callable(inputs):
+      output = inputs.__name__
+      output_type = 'function'
+      module = inputs.__module__
+    elif allow_raw:
+      output = inputs
+      output_type = 'raw'
+      module = None
+    else:
+      raise ValueError(
+          'Invalid input for serialization, type: %s ' % type(inputs))
+
+    return output, output_type, module
+
   @classmethod
   def from_config(cls, config, custom_objects=None):
     config = config.copy()
-    globs = globals()
-    module = config.pop('module', None)
-    if module in sys.modules:
-      globs.update(sys.modules[module].__dict__)
-    elif module is not None:
-      # Note: we don't know the name of the function if it's a lambda.
-      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
-                    'It may cause errors.'.format(module)
-                    , UserWarning)
-    if custom_objects:
-      globs.update(custom_objects)
-    function_type = config.pop('function_type')
-    if function_type == 'function':
-      # Simple lookup in custom objects
-      function = generic_utils.deserialize_keras_object(
-          config['function'],
-          custom_objects=custom_objects,
-          printable_module_name='function in Lambda layer')
-    elif function_type == 'lambda':
-      # Unsafe deserialization from bytecode
-      function = generic_utils.func_load(config['function'], globs=globs)
-    else:
-      raise TypeError('Unknown function type:', function_type)
+    function = cls._parse_function_from_config(
+        config, custom_objects, 'function', 'module', 'function_type')
 
-    output_shape_module = config.pop('output_shape_module', None)
-    if output_shape_module in sys.modules:
-      globs.update(sys.modules[output_shape_module].__dict__)
-    elif output_shape_module is not None:
-      # Note: we don't know the name of the function if it's a lambda.
-      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
-                    'It may cause errors.'.format(output_shape_module)
-                    , UserWarning)
-    output_shape_type = config.pop('output_shape_type')
-    if output_shape_type == 'function':
-      # Simple lookup in custom objects
-      output_shape = generic_utils.deserialize_keras_object(
-          config['output_shape'],
-          custom_objects=custom_objects,
-          printable_module_name='output_shape function in Lambda layer')
-    elif output_shape_type == 'lambda':
-      # Unsafe deserialization from bytecode
-      output_shape = generic_utils.func_load(config['output_shape'],
-                                             globs=globs)
+    output_shape = cls._parse_function_from_config(
+        config, custom_objects, 'output_shape', 'output_shape_module',
+        'output_shape_type')
+    if 'mask' in config:
+      mask = cls._parse_function_from_config(
+          config, custom_objects, 'mask', 'mask_module', 'mask_type')
     else:
-      output_shape = config['output_shape']
+      mask = None
+
+    config['function'] = function
+    config['output_shape'] = output_shape
+    config['mask'] = mask
 
     # If arguments were numpy array, they have been saved as
     # list. We need to recover the ndarray
@@ -892,10 +874,40 @@
             # Overwrite the argument with its numpy translation
             config['arguments'][key] = np.array(arg_dict['value'])
 
-    config['function'] = function
-    config['output_shape'] = output_shape
     return cls(**config)
 
+  @classmethod
+  def _parse_function_from_config(
+      cls, config, custom_objects, func_attr_name, module_attr_name,
+      func_type_attr_name):
+    globs = globals()
+    module = config.pop(module_attr_name, None)
+    if module in sys.modules:
+      globs.update(sys.modules[module].__dict__)
+    elif module is not None:
+      # Note: we don't know the name of the function if it's a lambda.
+      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
+                    'It may cause errors.'.format(module)
+                    , UserWarning)
+    if custom_objects:
+      globs.update(custom_objects)
+    function_type = config.pop(func_type_attr_name)
+    if function_type == 'function':
+      # Simple lookup in custom objects
+      function = generic_utils.deserialize_keras_object(
+          config[func_attr_name],
+          custom_objects=custom_objects,
+          printable_module_name='function in Lambda layer')
+    elif function_type == 'lambda':
+      # Unsafe deserialization from bytecode
+      function = generic_utils.func_load(
+          config[func_attr_name], globs=globs)
+    elif function_type == 'raw':
+      function = config[func_attr_name]
+    else:
+      raise TypeError('Unknown function type:', function_type)
+    return function
+
 
 @keras_export('keras.layers.Dense')
 class Dense(Layer):
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 9f818a5..9e87169 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -107,12 +107,14 @@
         'class_name': 'Lambda',
         'config': config
     })
+    self.assertEqual(ld.function(3), 4)
 
     # test with lambda
     ld = keras.layers.Lambda(
         lambda x: keras.backend.concatenate([math_ops.square(x), x]))
     config = ld.get_config()
     ld = keras.layers.Lambda.from_config(config)
+    self.assertAllEqual(self.evaluate(ld.function([3])), [9, 3])
 
   def test_lambda_multiple_inputs(self):
     ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0])
@@ -184,14 +186,25 @@
 
   def test_lambda_config_serialization(self):
     # Test serialization with output_shape and output_shape_type
-    layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+    layer = keras.layers.Lambda(
+        lambda x: x + 1,
+        output_shape=(1, 1),
+        mask=lambda i, m: m)
     layer(keras.backend.variable(np.ones((1, 1))))
     config = layer.get_config()
+
     layer = keras.layers.deserialize({
         'class_name': 'Lambda',
         'config': config
     })
+    self.assertAllEqual(layer.function(1), 2)
+    self.assertAllEqual(layer._output_shape, (1, 1))
+    self.assertAllEqual(layer.mask(1, True), True)
+
     layer = keras.layers.Lambda.from_config(config)
+    self.assertAllEqual(layer.function(1), 2)
+    self.assertAllEqual(layer._output_shape, (1, 1))
+    self.assertAllEqual(layer.mask(1, True), True)
 
   def test_lambda_with_variable(self):
 
@@ -217,6 +230,29 @@
     self.assertEqual(keras.backend.get_value(train_out), 1.)
     self.assertEqual(keras.backend.get_value(eval_out), 2.)
 
+  def test_lambda_with_mask(self):
+
+    def add_one(inputs):
+      return inputs + 1.0
+
+    def mask(unused_inputs, previous_mask):
+      return previous_mask
+
+    layer = keras.layers.Lambda(add_one, mask=mask)
+    x = np.ones([5, 4, 3])
+    x[:, -1, :] = 0
+    masking = keras.layers.Masking()
+    out = layer(masking(x))
+
+    expected_out = np.full([5, 4, 3], 2.0)
+    expected_out[:, -1, :] = 1.0
+    expected_mask = np.ones([5, 4])
+    expected_mask[:, -1] = 0.0
+
+    self.assertAllClose(self.evaluate(out), expected_out)
+    self.assertIsNotNone(out._keras_mask)
+    self.assertAllClose(self.evaluate(out._keras_mask), expected_mask)
+
 
 class TestStatefulLambda(keras_parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index 193447c..0a8cf97 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -78,7 +78,6 @@
     self.constants_spec = None
     self._states = None
     self._num_constants = None
-    self._num_inputs = None
     self._vector_shape = constant_op.constant([-1])
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index c7d8d82..9cd144a 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -321,6 +321,8 @@
         layers = [keras.layers.InputLayer(input_shape),
                   model] if (i == 1) else [model]
         model = keras.models.Sequential(layers)
+        if i > 1:
+          model.build((None,) + input_shape)
       return model
 
     # example: make_nested_func_model((1,), Dense(10), level=2).summary()
diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index e595b11..a8a27bc 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -23,6 +23,7 @@
 from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
@@ -170,9 +171,9 @@
 class Attention(BaseDenseAttention):
   """Dot-product attention layer, a.k.a. Luong-style attention.
 
-  Inputs are `query` tensor of shape `[batch_size, Tq]`, `value` tensor of shape
-  `[batch_size, Tv]` and `key` tensor of shape `[batch_size, Tv]`.
-  The calculation follows the steps:
+  Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of
+  shape `[batch_size, Tv, dim]` and `key` tensor of shape
+  `[batch_size, Tv, dim]`. The calculation follows the steps:
 
   1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
      product: `scores = tf.matmul(query, key, transpose_b=True)`.
@@ -271,7 +272,7 @@
           shape=(),
           initializer=init_ops.ones_initializer(),
           dtype=self.dtype,
-          trainable=self.trainable)
+          trainable=True)
     else:
       self.scale = None
     super(Attention, self).build(input_shape)
@@ -291,6 +292,143 @@
     return scores
 
 
+@keras_export('keras.layers.AdditiveAttention')
+class AdditiveAttention(BaseDenseAttention):
+  """Additive attention layer, a.k.a. Bahdanau-style attention.
+
+  Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of
+  shape `[batch_size, Tv, dim]` and `key` tensor of shape
+  `[batch_size, Tv, dim]`. The calculation follows the steps:
+
+  1. Reshape `query` and `key` into shapes `[batch_size, Tq, 1, dim]`
+     and `[batch_size, 1, Tv, dim]` respectively.
+  2. Calculate scores with shape `[batch_size, Tq, Tv]` as a non-linear
+     sum: `scores = tf.reduce_sum(tf.tanh(query + key), axis=-1)`
+  3. Use scores to calculate a distribution with shape
+     `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
+  4. Use `distribution` to create a linear combination of `value` with
+     shape `[batch_size, Tq, dim]`:
+     `return tf.matmul(distribution, value)`.
+
+  Args:
+    use_scale: If `True`, will create a variable to scale the attention scores.
+    causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such
+      that position `i` cannot attend to positions `j > i`. This prevents the
+      flow of information from the future towards the past.
+
+  Call Arguments:
+
+    inputs: List of the following tensors:
+      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
+      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
+      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
+        given, will use `value` for both `key` and `value`, which is the
+        most common case.
+    mask: List of the following tensors:
+      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
+        If given, the output will be zero at the positions where
+        `mask==False`.
+      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+  Output shape:
+
+    Attention outputs of shape `[batch_size, Tq, dim]`.
+
+  The meaning of `query`, `value` and `key` depends on the application. In the
+  case of text similarity, for example, `query` is the sequence embeddings of
+  the first piece of text and `value` is the sequence embeddings of the second
+  piece of text. `key` is usually the same tensor as `value`.
+
+  Here is a code example for using `AdditiveAttention` in a CNN+Attention
+  network:
+
+  ```python
+  # Variable-length int sequences.
+  query_input = tf.keras.Input(shape=(None,), dtype='int32')
+  value_input = tf.keras.Input(shape=(None,), dtype='int32')
+
+  # Embedding lookup.
+  token_embedding = tf.keras.layers.Embedding(max_tokens, dimension)
+  # Query embeddings of shape [batch_size, Tq, dimension].
+  query_embeddings = token_embedding(query_input)
+  # Value embeddings of shape [batch_size, Tv, dimension].
+  value_embeddings = token_embedding(value_input)
+
+  # CNN layer.
+  cnn_layer = tf.keras.layers.Conv1D(
+      filters=100,
+      kernel_size=4,
+      # Use 'same' padding so outputs have the same shape as inputs.
+      padding='same')
+  # Query encoding of shape [batch_size, Tq, filters].
+  query_seq_encoding = cnn_layer(query_embeddings)
+  # Value encoding of shape [batch_size, Tv, filters].
+  value_seq_encoding = cnn_layer(value_embeddings)
+
+  # Query-value attention of shape [batch_size, Tq, filters].
+  query_value_attention_seq = tf.keras.layers.AdditiveAttention()(
+      [query_seq_encoding, value_seq_encoding])
+
+  # Reduce over the sequence axis to produce encodings of shape
+  # [batch_size, filters].
+  query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
+      query_seq_encoding)
+  query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
+      query_value_attention_seq)
+
+  # Concatenate query and document encodings to produce a DNN input layer.
+  input_layer = tf.keras.layers.Concatenate()(
+      [query_encoding, query_value_attention])
+
+  # Add DNN layers, and create Model.
+  # ...
+  ```
+  """
+
+  def __init__(self, use_scale=True, **kwargs):
+    super(AdditiveAttention, self).__init__(**kwargs)
+    self.use_scale = use_scale
+
+  def build(self, input_shape):
+    v_shape = tensor_shape.TensorShape(input_shape[1])
+    dim = v_shape[-1]
+    if isinstance(dim, tensor_shape.Dimension):
+      dim = dim.value
+    if self.use_scale:
+      self.scale = self.add_weight(
+          name='scale',
+          shape=[dim],
+          initializer=init_ops.glorot_uniform_initializer(),
+          dtype=self.dtype,
+          trainable=True)
+    else:
+      self.scale = None
+    super(AdditiveAttention, self).build(input_shape)
+
+  def _calculate_scores(self, query, key):
+    """Calculates attention scores as a nonlinear sum of query and key.
+
+    Args:
+      query: Query tensor of shape `[batch_size, Tq, dim]`.
+      key: Key tensor of shape `[batch_size, Tv, dim]`.
+    Returns:
+      Tensor of shape `[batch_size, Tq, Tv]`.
+    """
+    # Reshape tensors to enable broadcasting.
+    # Reshape into [batch_size, Tq, 1, dim].
+    q_reshaped = array_ops.expand_dims(query, axis=-2)
+    # Reshape into [batch_size, 1, Tv, dim].
+    k_reshaped = array_ops.expand_dims(key, axis=-3)
+    if self.use_scale:
+      scale = self.scale
+    else:
+      scale = 1.
+    return math_ops.reduce_sum(
+        scale * math_ops.tanh(q_reshaped + k_reshaped), axis=-1)
+
+
 def _lower_triangular_mask(shape):
   """Creates a lower-triangular boolean mask over the last 2 dimensions."""
   row_index = math_ops.cumsum(
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 66e3525..c77792f 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -415,6 +415,240 @@
 
 
 @test_util.run_all_in_graph_and_eager_modes
+class AdditiveAttentionTest(test.TestCase):
+
+  def test_calculate_scores_one_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Key tensor of shape [1, 1, 1]
+    k = np.array([[[1.6]]], dtype=np.float32)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1]))
+    # Scale tensor of shape [1]
+    attention_layer.scale = np.array([[[0.5]]], dtype=np.float32)
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683
+    expected = np.array([[[0.49550372683]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_multi_dim(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4]))
+    # Scale tensor of shape [4]
+    attention_layer.scale = np.array([[[0.5, 0.6, 0.7, 0.8]]], dtype=np.float32)
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # pylint:disable=line-too-long
+    # expected000 = 0.5*tanh(1.+1.5) + 0.6*tanh(1.1+1.6) + 0.7*tanh(1.2+1.7) + 0.8*tanh(1.3+1.8) = 2.58044532581
+    # expected001 = 0.5*tanh(1.+2.5) + 0.6*tanh(1.1+2.6) + 0.7*tanh(1.2+2.7) + 0.8*tanh(1.3+2.8) = 2.59734317449
+    # expected002 = 0.5*tanh(1.+3.5) + 0.6*tanh(1.1+3.6) + 0.7*tanh(1.2+3.7) + 0.8*tanh(1.3+3.8) = 2.59964024652
+    # expected010 = 0.5*tanh(2.+1.5) + 0.6*tanh(2.1+1.6) + 0.7*tanh(2.2+1.7) + 0.8*tanh(2.3+1.8) = 2.59734317449
+    # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652
+    # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916
+    # pylint:enable=line-too-long
+    expected = np.array(
+        [[[2.58044532581, 2.59734317449, 2.59964024652],
+          [2.59734317449, 2.59964024652, 2.59995130916]]],
+        dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_one_dim_batch_size_two(self):
+    # Query tensor of shape [2, 1, 1]
+    q = np.array([[[1.1]], [[2.1]]], dtype=np.float32)
+    # Key tensor of shape [2, 1, 1]
+    k = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1]))
+    # Scale tensor of shape [1]
+    attention_layer.scale = np.array([[[0.5]]], dtype=np.float32)
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [2, 1, 1].
+    # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683
+    # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277
+    expected = np.array(
+        [[[0.49550372683]], [[0.49991728277]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_shape(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention()
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_shape_no_scale(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention(use_scale=False)
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_shape_with_key(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention()
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_multi_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1]))
+    # Scale tensor of shape [1]
+    attention_layer.scale = np.array([[[0.5]]], dtype=np.float32)
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    # pylint:disable=line-too-long
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)]]]
+    #        = [[[0.49550372683, 0.47340300642, 0.14565630622]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000
+    #      = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642))
+    #      = 0.50552495521
+    #    attention_distribution001
+    #      = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642))
+    #      = 0.49447504478
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8
+    #             = 1.15497245968
+    # pylint:enable=line-too-long
+    expected = np.array([[[1.15497245968]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_with_key(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 1]
+    k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1]))
+    # Scale tensor of shape [1]
+    attention_layer.scale = np.array([[[0.5]]], dtype=np.float32)
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    # pylint:disable=line-too-long
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)]]]
+    #        = [[[0.49550372683, 0.47340300642, 0.14565630622]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000
+    #        = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642))
+    #        = 0.50552495521
+    #    attention_distribution001
+    #        = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642))
+    #        = 0.49447504478
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.50552495521 * 0.5 + 0.49447504478 * 0.8 - 0 * 0.3
+    #             = 0.64834251342
+    # pylint:enable=line-too-long
+    expected = np.array([[[0.64834251342]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_with_query_mask(self):
+    # Query tensor of shape [1, 2, 1]
+    q = np.array([[[1.1], [-0.5]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Query mask tensor of shape [1, 2]
+    q_mask = np.array([[True, False]], dtype=np.bool_)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.AdditiveAttention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1]))
+    # Scale tensor of shape [1]
+    attention_layer.scale = np.array([[[0.5]]], dtype=np.float32)
+    actual = attention_layer([q, v], mask=[q_mask, v_mask])
+
+    # pylint:disable=line-too-long
+    # Expected scores of shape [1, 2, 3]
+    # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)],
+    #            [0.5 * tanh(-0.5 + 1.6), 0.5 * tanh(-0.5 + 0.7), 0.5 * tanh(-0.5 - 0.8)]]]
+    #        = [[[0.49550372683, 0.47340300642, 0.14565630622],
+    #            [0.40024951088, 0.09868766011, -0.43086157965]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000
+    #        = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642))
+    #        = 0.50552495521
+    #    attention_distribution001
+    #        = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642))
+    #        = 0.49447504478
+    #    attention_distribution002 = 0
+    # => attention_distribution010
+    #        = exp(0.40024951088)/(exp(0.40024951088) + exp(0.09868766011))
+    #        = 0.57482427975
+    #    attention_distribution011
+    #        = exp(0.09868766011)/(exp(0.40024951088) + exp(0.09868766011))
+    #        = 0.42517572025
+    #    attention_distribution012 = 0
+    #
+    # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False.
+    # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8
+    #             = 1.15497245968
+    # expected010 = 0
+    # pylint:enable=line-too-long
+    expected = np.array([[[1.15497245968], [0.]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class LowerTriangularMaskTest(test.TestCase):
 
   def test_square_shape(self):
diff --git a/tensorflow/python/keras/layers/gru_v2_test.py b/tensorflow/python/keras/layers/gru_v2_test.py
index ffb2d6d..1fc06d6 100644
--- a/tensorflow/python/keras/layers/gru_v2_test.py
+++ b/tensorflow/python/keras/layers/gru_v2_test.py
@@ -585,9 +585,9 @@
             predict: y_train
         })
         if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
+          self.assertEqual(runtime_value, rnn._RUNTIME_GPU)
         else:
-          self.assertEqual(runtime_value, b'cpu')
+          self.assertEqual(runtime_value, rnn._RUNTIME_CPU)
         # Make sure the loss is updated for every epoch
         # (layer weights properly updated).
         self.assertNotEqual(existing_loss, loss_value)
@@ -622,8 +622,7 @@
           dtypes.float32, shape=(None, output_shape), name='predict')
 
       zeros = array_ops.zeros([batch, output_shape])
-      dummy_runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
+      dummy_runtime = rnn._runtime(rnn._RUNTIME_UNKNOWN)
       a = constant_op.constant(0)
       b = constant_op.constant(1)
       # Will always run the GRU layer.
@@ -644,9 +643,9 @@
             predict: y_train
         })
         if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
+          self.assertEqual(runtime_value, rnn._RUNTIME_GPU)
         else:
-          self.assertEqual(runtime_value, b'cpu')
+          self.assertEqual(runtime_value, rnn._RUNTIME_CPU)
         # Make sure the loss is updated for every epoch
         # (layer weights properly updated).
         self.assertNotEqual(existing_loss, loss_value)
diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py
index 64eaf6e..61d543f 100644
--- a/tensorflow/python/keras/layers/kernelized_test.py
+++ b/tensorflow/python/keras/layers/kernelized_test.py
@@ -27,7 +27,6 @@
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -122,10 +121,6 @@
     self.assertListEqual([3, 10], outputs.shape.as_list())
     num_trainable_vars = 1 if trainable else 0
     self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
-    if not context.executing_eagerly():
-      self.assertLen(
-          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
-          num_trainable_vars)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def test_no_eager_Leak(self):
@@ -258,10 +253,6 @@
       self.assertEqual('random_fourier_features/random_features_scale:0',
                        rff_layer.trainable_variables[0].name)
     self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
-    if not context.executing_eagerly():
-      self.assertLen(
-          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
-          num_trainable_vars)
 
   @parameterized.named_parameters(
       ('gaussian', 10, 'gaussian', 3.0, True),
diff --git a/tensorflow/python/keras/layers/lstm_v2_test.py b/tensorflow/python/keras/layers/lstm_v2_test.py
index ec5c26b..c590b63 100644
--- a/tensorflow/python/keras/layers/lstm_v2_test.py
+++ b/tensorflow/python/keras/layers/lstm_v2_test.py
@@ -734,9 +734,9 @@
             predict: y_train
         })
         if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
+          self.assertEqual(runtime_value, rnn._RUNTIME_GPU)
         else:
-          self.assertEqual(runtime_value, b'cpu')
+          self.assertEqual(runtime_value, rnn._RUNTIME_CPU)
         # Make sure the loss is updated for every epoch
         # (layer weights properly updated).
         self.assertNotEqual(existing_loss, loss_value)
@@ -771,8 +771,7 @@
           dtypes.float32, shape=(None, output_shape), name='predict')
 
       zeros = array_ops.zeros([batch, output_shape])
-      dummy_runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
+      dummy_runtime = rnn._runtime(rnn._RUNTIME_UNKNOWN)
       a = constant_op.constant(0)
       b = constant_op.constant(1)
       # Will always run the lstm layer.
@@ -793,9 +792,9 @@
             predict: y_train
         })
         if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
+          self.assertEqual(runtime_value, rnn._RUNTIME_GPU)
         else:
-          self.assertEqual(runtime_value, b'cpu')
+          self.assertEqual(runtime_value, rnn._RUNTIME_CPU)
         # Make sure the loss is updated for every epoch
         # (layer weights properly updated).
         self.assertNotEqual(existing_loss, loss_value)
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index b497bf4..6e592be 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -235,9 +235,8 @@
       x1 = keras.layers.Dense(8, activation='relu')(input1)
       input2 = keras.layers.Input(shape=(32,))
       x2 = keras.layers.Dense(8, activation='relu')(input2)
-      added = keras.layers.Add()([x1, x2])  # equivalent to added =
-      keras.layers.add([x1, x2])
-
+      # equivalent to `added = keras.layers.add([x1, x2])`
+      added = keras.layers.Add()([x1, x2])
       out = keras.layers.Dense(4)(added)
       model = keras.models.Model(inputs=[input1, input2], outputs=out)
   ```
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index d27dc01..2f8b13c 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -26,6 +26,7 @@
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
@@ -98,8 +99,8 @@
       normalized values (before gamma and beta), only during training. For
       example, if axis==-1,
         `adjustment = lambda shape: (
-          tf.random_uniform(shape[-1:], 0.93, 1.07),
-          tf.random_uniform(shape[-1:], -0.1, 0.1))`
+          tf.random.uniform(shape[-1:], 0.93, 1.07),
+          tf.random.uniform(shape[-1:], -0.1, 0.1))`
       will scale the normalized value by up to 7% up or down, then shift the
       result by up to 0.1 (with independent scaling and bias for each feature
       but shared across all examples), and finally apply gamma and/or beta. If
@@ -126,6 +127,8 @@
   References:
     - [Batch Normalization: Accelerating Deep Network Training by Reducing
       Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
+
+  {{TRAINABLE_ATTRIBUTE_NOTE}}
   """
 
   # By default, the base class uses V2 behavior. The BatchNormalization V1
@@ -156,14 +159,14 @@
                name=None,
                **kwargs):
     super(BatchNormalizationBase, self).__init__(
-        name=name, trainable=trainable, **kwargs)
+        name=name, **kwargs)
     if isinstance(axis, list):
       self.axis = axis[:]
     elif isinstance(axis, int):
       self.axis = axis
     else:
       raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
+                      % type(axis))
     self.momentum = momentum
     self.epsilon = epsilon
     self.center = center
@@ -193,6 +196,8 @@
 
     self.fused = fused
     self._bessels_correction_test_only = True
+    self._trainable_var = None
+    self.trainable = trainable
 
     if renorm:
       renorm_clipping = renorm_clipping or {}
@@ -236,6 +241,22 @@
       return False
 
   @property
+  def trainable(self):  # getter: returns the stored flag
+    return self._trainable
+
+  @trainable.setter
+  def trainable(self, value):
+    self._trainable = value
+    if self._trainable_var is not None:  # keep the lazily created var in sync
+      self._trainable_var.update_value(value)
+
+  def _get_trainable_var(self):
+    if self._trainable_var is None:  # created on first request, then cached
+      self._trainable_var = K.freezable_variable(
+          self._trainable, name=self.name + '_trainable')
+    return self._trainable_var
+
+  @property
   def _param_dtype(self):
     # Raise parameters of fp16 batch norm to fp32
     if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
@@ -423,15 +444,17 @@
         self._scope.set_partitioner(partitioner)
     self.built = True
 
-  def _assign_moving_average(self, variable, value, momentum):
-    with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, momentum]) as scope:
+  def _assign_moving_average(self, variable, value, momentum, inputs_size):
+    with K.name_scope('AssignMovingAvg') as scope:
       with ops.colocate_with(variable):
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
         update_delta = (
             variable - math_ops.cast(value, variable.dtype)) * decay
+        if inputs_size is not None:  # zero-size input => no-op update
+          update_delta = array_ops.where(inputs_size > 0, update_delta,
+                                         K.zeros_like(update_delta))
         return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
@@ -439,6 +462,15 @@
     beta = self.beta if self.center else self._beta_const
     gamma = self.gamma if self.scale else self._gamma_const
 
+    # TODO(b/129279393): Support zero batch input in non DistributionStrategy
+    # code as well.
+    # TODO(b/130185866): Support zero batch input in graph mode.
+    if ops.executing_eagerly_outside_functions(
+    ) and distribution_strategy_context.has_strategy():
+      inputs_size = array_ops.size(inputs)
+    else:
+      inputs_size = None
+
     def _fused_batch_norm_training():
       return nn.fused_batch_norm(
           inputs,
@@ -479,31 +511,32 @@
     if training_value or training_value is None:
       if distribution_strategy_context.in_cross_replica_context():
         strategy = distribution_strategy_context.get_strategy()
-
         def mean_update():
           return strategy.extended.update(self.moving_mean,
                                           self._assign_moving_average,
-                                          (mean, self.momentum))
+                                          (mean, self.momentum, inputs_size))
 
         def variance_update():
-          return strategy.extended.update(self.moving_variance,
-                                          self._assign_moving_average,
-                                          (variance, self.momentum))
+          return strategy.extended.update(
+              self.moving_variance, self._assign_moving_average,
+              (variance, self.momentum, inputs_size))
       else:
 
         def mean_update():
-          return self._assign_moving_average(self.moving_mean, mean, momentum)
+          return self._assign_moving_average(self.moving_mean, mean, momentum,
+                                             inputs_size)
 
         def variance_update():
           return self._assign_moving_average(self.moving_variance, variance,
-                                             momentum)
+                                             momentum, inputs_size)
 
       self.add_update(mean_update, inputs=True)
       self.add_update(variance_update, inputs=True)
 
     return output
 
-  def _renorm_correction_and_moments(self, mean, variance, training):
+  def _renorm_correction_and_moments(self, mean, variance, training,
+                                     inputs_size):
     """Returns the correction and update values for renorm."""
     stddev = math_ops.sqrt(variance + self.epsilon)
     # Compute the average mean and standard deviation, as if they were
@@ -534,7 +567,7 @@
                             lambda: d,
                             lambda: array_ops.zeros_like(d))
 
-    def _update_renorm_variable(var, weight, value):
+    def _update_renorm_variable(var, weight, value, inputs_size):
       """Updates a moving average and weight, returns the unbiased value."""
       value = array_ops.identity(value)
       def _do_update():
@@ -547,9 +580,11 @@
         # Make sure the weight is not updated until before r and d computation.
         with ops.control_dependencies([value]):
           weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum,
+                                              inputs_size)
         new_weight = self._assign_moving_average(weight, weight_value,
-                                                 self.renorm_momentum)
+                                                 self.renorm_momentum,
+                                                 inputs_size)
         # TODO(yuefengz): the updates to var and weighted can not be batched
         # together if we fetch their updated values here. Consider calculating
         # new values and delaying the updates.
@@ -561,20 +596,43 @@
 
     # TODO(yuefengz): colocate the operations
     new_mean = _update_renorm_variable(self.renorm_mean,
-                                       self.renorm_mean_weight, mean)
+                                       self.renorm_mean_weight, mean,
+                                       inputs_size)
     new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                         self.renorm_stddev_weight, stddev)
+                                         self.renorm_stddev_weight, stddev,
+                                         inputs_size)
     # Make sqrt(moving_variance + epsilon) = new_stddev.
     new_variance = math_ops.square(new_stddev) - self.epsilon
 
     return (r, d, new_mean, new_variance)
 
   def _moments(self, inputs, reduction_axes, keep_dims):
-    return nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+    mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+    # TODO(b/129279393): Support zero batch input in non DistributionStrategy
+    # code as well.
+    # TODO(b/130185866): Support zero batch input in graph mode.
+    if (ops.executing_eagerly_outside_functions() and
+        distribution_strategy_context.has_strategy()):
+      inputs_size = array_ops.size(inputs)  # 0 => moments forced to zero
+      mean = array_ops.where(inputs_size > 0, mean, K.zeros_like(mean))
+      variance = array_ops.where(inputs_size > 0, variance,
+                                 K.zeros_like(variance))
+    return mean, variance
 
-  def call(self, inputs, training=None):
+  def _get_training_value(self, training=None):
     if training is None:
       training = K.learning_phase()
+    if self._USE_V2_BEHAVIOR:  # V2: training only if layer is trainable
+      if isinstance(training, int):  # normalize int flags to bool
+        training = bool(training)
+      if base_layer_utils.is_in_keras_graph():
+        training = math_ops.logical_and(training, self._get_trainable_var())
+      else:
+        training = math_ops.logical_and(training, self.trainable)
+    return training
+
+  def call(self, inputs, training=None):
+    training = self._get_training_value(training)
 
     if self.virtual_batch_size is not None:
       # Virtual batches (aka ghost batches) can be simulated by reshaping the
@@ -651,10 +709,11 @@
 
       mean = tf_utils.smart_cond(training,
                                  lambda: mean,
-                                 lambda: moving_mean)
-      variance = tf_utils.smart_cond(training,
-                                     lambda: variance,
-                                     lambda: moving_variance)
+                                 lambda: ops.convert_to_tensor(moving_mean))
+      variance = tf_utils.smart_cond(
+          training,
+          lambda: variance,
+          lambda: ops.convert_to_tensor(moving_variance))
 
       if self.virtual_batch_size is not None:
         # This isn't strictly correct since in ghost batch norm, you are
@@ -667,9 +726,14 @@
       else:
         new_mean, new_variance = mean, variance
 
+      if ops.executing_eagerly_outside_functions(
+      ) and distribution_strategy_context.has_strategy():
+        inputs_size = array_ops.size(inputs)
+      else:
+        inputs_size = None
       if self.renorm:
         r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            new_mean, new_variance, training)
+            new_mean, new_variance, training, inputs_size)
         # When training, the normalized values (say, x) will be transformed as
         # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
         # = x * (r * gamma) + (d * gamma + beta) with renorm.
@@ -683,7 +747,8 @@
         def _do_update(var, value):
           """Compute the updates for mean and variance."""
           return strategy.extended.update(
-              var, self._assign_moving_average, (value, self.momentum),
+              var,
+              self._assign_moving_average, (value, self.momentum, inputs_size),
               group=False)
         # We need to unwrap the moving_mean or moving_variance in the case of
         # training being false to match the output of true_fn and false_fn
@@ -700,7 +765,9 @@
       else:
         def _do_update(var, value):
           """Compute the updates for mean and variance."""
-          return self._assign_moving_average(var, value, self.momentum)
+          return self._assign_moving_average(var, value, self.momentum,
+                                             inputs_size)
+
 
         def mean_update():
           true_branch = lambda: _do_update(self.moving_mean, new_mean)
@@ -778,27 +845,27 @@
     return dict(list(base_config.items()) + list(config.items()))
 
 
-def _replace_in_base_docstring(old, new):
+def replace_in_base_docstring(replacements):
   string = BatchNormalizationBase.__doc__
-  if old not in string:
-    raise ValueError('Could not find following string in BatchNormalizationBase'
-                     ' docstring: "{}"'.format(old))
-  return string.replace(old, new)
+  for old, new in replacements:  # (old, new) pairs, applied in order
+    assert old in string
+    string = string.replace(old, new)  # str is immutable: keep the result
+  return string
 
 
 @keras_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
 class BatchNormalization(BatchNormalizationBase):
 
-  __doc__ = _replace_in_base_docstring(
-      '''
+  __doc__ = replace_in_base_docstring(
+      [('''
     fused: if `True`, use a faster, fused implementation, or raise a ValueError
       if the fused implementation cannot be used. If `None`, use the faster
       implementation if possible. If False, do not used the fused
       implementation.''',
-
-      '''
+        '''
     fused: if `None` or `True`, use a faster, fused implementation if possible.
-      If `False`, use the system recommended implementation.''')
+      If `False`, use the system recommended implementation.'''),
+       ('{{TRAINABLE_ATTRIBUTE_NOTE}}', '')])
 
   _USE_V2_BEHAVIOR = False
 
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index 9372415..1f5c4a2 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -22,12 +22,16 @@
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.layers import normalization
 from tensorflow.python.keras.layers import normalization_v2
 from tensorflow.python.keras.mixed_precision.experimental import policy
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
@@ -154,6 +158,61 @@
     self.assertEqual(norm.beta.dtype.base_dtype, 'float32')
     self.assertEqual(norm.gamma.dtype.base_dtype, 'float32')
 
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_batchnorm_non_trainable_with_fit(self):
+    inputs = keras.Input((3,))
+    bn = normalization_v2.BatchNormalization()
+    outputs = bn(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.random.random((100, 3)), np.random.random((100, 3)))
+
+    test_data = np.random.random((10, 3))
+    test_targets = np.random.random((10, 3))
+    test_loss = model.evaluate(test_data, test_targets)
+
+    bn.trainable = False  # freeze: training loss should equal eval loss
+    model.compile('rmsprop', 'mse',  # recompile after changing trainable
+                  run_eagerly=testing_utils.should_run_eagerly())
+    train_loss = model.train_on_batch(test_data, test_targets)
+    self.assertAlmostEqual(test_loss, train_loss)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_batchnorm_non_trainable_with_tf_function(self):
+    inputs = keras.Input((3,))
+    bn = normalization_v2.BatchNormalization()
+    outputs = bn(inputs)
+    model = keras.Model(inputs, outputs)
+    loss_fn = keras.losses.MeanSquaredError()
+    optimizer = rmsprop_v2.RMSprop()
+
+    @def_function.function()
+    def train_step(x, y):
+      with backprop.GradientTape() as tape:
+        y_pred = model(x, training=True)
+        loss = loss_fn(y, y_pred)
+      grads = tape.gradient(loss, model.trainable_weights)
+      optimizer.apply_gradients(zip(grads, model.trainable_weights))
+      return loss
+
+    @def_function.function()
+    def test_step(x, y):
+      y_pred = model(x, training=False)
+      loss = loss_fn(y, y_pred)
+      return loss
+
+    train_step(np.random.random((100, 3)), np.random.random((100, 3)))
+
+    test_data = np.random.random((10, 3))
+    test_targets = np.random.random((10, 3))
+    test_loss = test_step(test_data, test_targets)
+
+    bn.trainable = False  # frozen BN: train_step loss should match test_step
+    train_loss = train_step(test_data, test_targets)
+    if context.executing_eagerly():  # losses are only compared in eager mode
+      self.assertAlmostEqual(test_loss.numpy(), train_loss.numpy())
+
 
 class BatchNormalizationV1Test(test.TestCase):
 
@@ -291,18 +350,18 @@
       model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       model.train_on_batch(x, x)
 
-      self.assertEqual(len(bn.updates), 4)
-      self.assertEqual(len(model.updates), 2)
-      self.assertEqual(len(model.get_updates_for(x2)), 2)
+      self.assertLen(bn.updates, 4)
+      self.assertLen(bn.get_updates_for(x1), 2)
+      self.assertLen(model.get_updates_for(x2), 2)
 
       # Test model-level reuse
       x3 = keras.layers.Input(shape=(10,))
       y3 = model(x3)
       new_model = keras.models.Model(x3, y3, name='new_model')
 
-      self.assertEqual(len(new_model.updates), 2)
-      self.assertEqual(len(model.updates), 4)
-      self.assertEqual(len(new_model.get_updates_for(x3)), 2)
+      self.assertLen(new_model.updates, 6)
+      self.assertLen(model.updates, 6)
+      self.assertLen(new_model.get_updates_for(x3), 2)
       new_model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       new_model.train_on_batch(x, x)
 
diff --git a/tensorflow/python/keras/layers/normalization_v2.py b/tensorflow/python/keras/layers/normalization_v2.py
index 05501a7..6a1049e 100644
--- a/tensorflow/python/keras/layers/normalization_v2.py
+++ b/tensorflow/python/keras/layers/normalization_v2.py
@@ -18,11 +18,48 @@
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras.layers.normalization import BatchNormalizationBase
+from tensorflow.python.keras.layers import normalization
 from tensorflow.python.util.tf_export import keras_export
 
 
 @keras_export('keras.layers.BatchNormalization', v1=[])  # pylint: disable=missing-docstring
-class BatchNormalization(BatchNormalizationBase):
+class BatchNormalization(normalization.BatchNormalizationBase):
+
+  __doc__ = normalization.replace_in_base_docstring([
+      ('{{TRAINABLE_ATTRIBUTE_NOTE}}',
+       '''
+  **About setting `layer.trainable = False` on a `BatchNormalization` layer:**
+
+  The meaning of setting `layer.trainable = False` is to freeze the layer,
+  i.e. its internal state will not change during training:
+  its trainable weights will not be updated
+  during `fit()` or `train_on_batch()`, and its state updates will not be run.
+
+  Usually, this does not necessarily mean that the layer is run in inference
+  mode (which is normally controlled by the `training` argument that can
+  be passed when calling a layer). "Frozen state" and "inference mode"
+  are two separate concepts.
+
+  However, in the case of the `BatchNormalization` layer, **setting
+  `trainable = False` on the layer means that the layer will be
+  subsequently run in inference mode** (meaning that it will use
+  the moving mean and the moving variance to normalize the current batch,
+  rather than using the mean and variance of the current batch).
+
+  This behavior has been introduced in TensorFlow 2.0, in order
+  to enable `layer.trainable = False` to produce the most commonly
+  expected behavior in the convnet fine-tuning use case.
+
+  Note that:
+    - This behavior only occurs as of TensorFlow 2.0. In 1.*,
+      setting `layer.trainable = False` would freeze the layer but would
+      not switch it to inference mode.
+    - Setting `trainable` on a model containing other layers will
+      recursively set the `trainable` value of all inner layers.
+    - If the value of the `trainable`
+      attribute is changed after calling `compile()` on a model,
+      the new value doesn't take effect for this model
+      until `compile()` is called again.
+      ''')])
 
   _USE_V2_BEHAVIOR = True
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index bc7a9a1..3630599 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -37,7 +37,7 @@
   This class only exists for code reuse. It will never be an exposed API.
 
   Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`.
     pool_size: An integer or tuple/list of a single integer,
       representing the size of the pooling window.
     strides: An integer or tuple/list of a single integer, specifying the
@@ -199,7 +199,7 @@
   This class only exists for code reuse. It will never be an exposed API.
 
   Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`.
     pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
       specifying the size of the pooling window.
       Can be a single integer to specify the same value for
@@ -384,7 +384,7 @@
   This class only exists for code reuse. It will never be an exposed API.
 
   Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`.
     pool_size: An integer or tuple/list of 3 integers:
       (pool_depth, pool_height, pool_width)
       specifying the size of the pooling window.
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 054bbd7..ec5048f 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -199,22 +199,23 @@
         `state_size`.
       - A `get_initial_state(inputs=None, batch_size=None, dtype=None)`
         method that creates a tensor meant to be fed to `call()` as the
-        initial state, if user didn't specify any initial state via other
-        means. The returned initial state should be in shape of
-        [batch, cell.state_size]. Cell might choose to create zero filled
-        tensor, or with other values based on the cell implementations.
+        initial state, if the user didn't specify any initial state via other
+        means. The returned initial state should have a shape of
+        [batch_size, cell.state_size]. The cell might choose to create a
+        tensor full of zeros, or full of other values based on the cell's
+        implementation.
         `inputs` is the input tensor to the RNN layer, which should
         contain the batch size as its shape[0], and also dtype. Note that
-        the shape[0] might be None during the graph construction. Either
-        the `inputs` or the pair of `batch` and `dtype `are provided.
-        `batch` is a scalar tensor that represent the batch size
-        of the input. `dtype` is `tf.dtype` that represent the dtype of
-        the input.
+        the shape[0] might be `None` during the graph construction. Either
+        the `inputs` or the pair of `batch_size` and `dtype` are provided.
+        `batch_size` is a scalar tensor that represents the batch size
+        of the inputs. `dtype` is `tf.DType` that represents the dtype of
+        the inputs.
         For backward compatible reason, if this method is not implemented
-        by the cell, RNN layer will create a zero filled tensors with the
-        size of [batch, cell.state_size].
+        by the cell, the RNN layer will create a zero filled tensor with the
+        size of [batch_size, cell.state_size].
       In the case that `cell` is a list of RNN cell instances, the cells
-      will be stacked on after the other in the RNN, implementing an
+      will be stacked on top of each other in the RNN, resulting in an
       efficient stacked RNN.
     return_sequences: Boolean. Whether to return the last output
       in the output sequence, or the full sequence.
@@ -398,13 +399,13 @@
 
     self.supports_masking = True
     # The input shape is unknown yet, it could have nested tensor inputs, and
-    # the input spec will be the list of specs for flattened inputs.
+    # the input spec will be the list of specs for nested inputs, the structure
+    # of the input_spec will be the same as the input.
     self.input_spec = None
     self.state_spec = None
     self._states = None
     self.constants_spec = None
     self._num_constants = None
-    self._num_inputs = None
 
   @property
   def states(self):
@@ -526,13 +527,11 @@
         self.input_spec = [get_input_spec(input_shape)]
       step_input_shape = get_step_input_shape(input_shape)
     else:
-      flat_input_shapes = nest.flatten(input_shape)
-      flat_input_shapes = nest.map_structure(get_input_spec, flat_input_shapes)
-      assert len(flat_input_shapes) == self._num_inputs
       if self.input_spec is not None:
-        self.input_spec[:self._num_inputs] = flat_input_shapes
+        self.input_spec[0] = nest.map_structure(get_input_spec, input_shape)
       else:
-        self.input_spec = flat_input_shapes
+        self.input_spec = generic_utils.to_list(
+            nest.map_structure(get_input_spec, input_shape))
       step_input_shape = nest.map_structure(get_step_input_shape, input_shape)
 
     # allow cell (if layer) to build before we set or validate state_spec
@@ -574,15 +573,17 @@
         '`cell.state_size`. Received `state_spec`={}; '
         'however `cell.state_size` is '
         '{}'.format(init_state_specs, cell_state_sizes))
-    if len(cell_state_sizes) == len(init_state_specs):
-      for i in range(len(cell_state_sizes)):
-        if not tensor_shape.TensorShape(
-            # Ignore the first axis for init_state which is for batch
-            init_state_specs[i].shape[1:]).is_compatible_with(
-                tensor_shape.TensorShape(cell_state_sizes[i])):
-          raise validation_error
-    else:
+    flat_cell_state_size = nest.flatten(cell_state_sizes)
+    flat_state_spec = nest.flatten(init_state_specs)
+
+    if len(flat_cell_state_size) != len(flat_state_spec):
       raise validation_error
+    for i in range(len(flat_cell_state_size)):
+      if not tensor_shape.TensorShape(
+          # Ignore the first axis for init_state which is for batch
+          flat_state_spec[i].shape[1:]).is_compatible_with(
+              tensor_shape.TensorShape(flat_cell_state_size[i])):
+        raise validation_error
 
   def get_initial_state(self, inputs):
     get_initial_state_fn = getattr(self.cell, 'get_initial_state', None)
@@ -611,12 +612,7 @@
     inputs, initial_state, constants = _standardize_args(inputs,
                                                          initial_state,
                                                          constants,
-                                                         self._num_constants,
-                                                         self._num_inputs)
-    # in case the real inputs is a nested structure, set the size of flatten
-    # input so that we can distinguish between real inputs, initial_state and
-    # constants.
-    self._num_inputs = len(nest.flatten(inputs))
+                                                         self._num_constants)
 
     if initial_state is None and constants is None:
       return super(RNN, self).__call__(inputs, **kwargs)
@@ -629,9 +625,8 @@
     additional_specs = []
     if initial_state is not None:
       additional_inputs += initial_state
-      self.state_spec = [
-          InputSpec(shape=K.int_shape(state)) for state in initial_state
-      ]
+      self.state_spec = nest.map_structure(
+          lambda s: InputSpec(shape=K.int_shape(s)), initial_state)
       additional_specs += self.state_spec
     if constants is not None:
       additional_inputs += constants
@@ -641,8 +636,8 @@
       self._num_constants = len(constants)
       additional_specs += self.constants_spec
     # at this point additional_inputs cannot be empty
-    is_keras_tensor = K.is_keras_tensor(additional_inputs[0])
-    for tensor in additional_inputs:
+    is_keras_tensor = K.is_keras_tensor(nest.flatten(additional_inputs)[0])
+    for tensor in nest.flatten(additional_inputs):
       if K.is_keras_tensor(tensor) != is_keras_tensor:
         raise ValueError('The initial state or constants of an RNN'
                          ' layer cannot be specified with a mix of'
@@ -655,8 +650,8 @@
       full_input = [inputs] + additional_inputs
       # The original input_spec is None since there could be a nested tensor
       # input. Update the input_spec to match the inputs.
-      full_input_spec = [None for _ in range(len(nest.flatten(inputs)))
-                        ] + additional_specs
+      full_input_spec = generic_utils.to_list(
+          nest.map_structure(lambda _: None, inputs)) + additional_specs
       # Perform the call with temporarily replaced input_spec
       self.input_spec = full_input_spec
       output = super(RNN, self).__call__(full_input, **kwargs)
@@ -2392,7 +2387,7 @@
   """Long Short-Term Memory layer - Hochreiter 1997.
 
    Note that this cell is not optimized for performance on GPU. Please use
-  `tf.keras.layers.CuDNNLSTM` for better performance on GPU.
+  `tf.compat.v1.keras.layers.CuDNNLSTM` for better performance on GPU.
 
   Arguments:
     units: Positive integer, dimensionality of the output space.
@@ -2672,8 +2667,7 @@
   return K.in_train_phase(dropped_inputs, ones, training=training)
 
 
-def _standardize_args(
-    inputs, initial_state, constants, num_constants, num_inputs=1):
+def _standardize_args(inputs, initial_state, constants, num_constants):
   """Standardizes `__call__` to a single list of tensor inputs.
 
   When running a model loaded from a file, the input tensors
@@ -2689,8 +2683,6 @@
     constants: Tensor or list of tensors or None, constant tensors.
     num_constants: Expected number of constants (if constants are passed as
       part of the `inputs` list.
-    num_inputs: Expected number of real input tensors (exclude initial_states
-      and constants).
 
   Returns:
     inputs: Single tensor or tuple of tensors.
@@ -2704,21 +2696,19 @@
     # In the eager mode, __call__ will be called twice, once during
     # rnn_layer(inputs=input_t, constants=c_t, ...), and second time will be
     # model.fit/train_on_batch/predict with real np data. In the second case,
-    # the inputs will contain initial_state and constants, and more importantly,
-    # the real inputs will be in a flat list, instead of nested tuple.
+    # the inputs will contain initial_state and constants as eager tensors.
     #
-    # For either case, we will use num_inputs to split the input list, and
-    # restructure the real input into tuple.
+    # In either case, the real input is the first item in the list, and it
+    # could itself be a nested structure. It is followed by the initial states,
+    # which could be a list of items (or a list of lists if the initial_state
+    # is a complex structure), and finally by the constants, a flat list.
     assert initial_state is None and constants is None
-    inputs = nest.flatten(inputs)
     if num_constants is not None:
       constants = inputs[-num_constants:]
       inputs = inputs[:-num_constants]
-    if num_inputs is None:
-      num_inputs = 1
-    if len(inputs) > num_inputs:
-      initial_state = inputs[num_inputs:]
-      inputs = inputs[:num_inputs]
+    if len(inputs) > 1:
+      initial_state = inputs[1:]
+      inputs = inputs[:1]
 
     if len(inputs) > 1:
       inputs = tuple(inputs)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index c8eb9b9..af6b756 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -1312,6 +1312,36 @@
     simple_rnn = keras.layers.SimpleRNN(5, input_length=10)
     self.assertEqual(simple_rnn._batch_input_shape, (None, 10, None))
 
+  @parameterized.parameters(
+      [keras.layers.SimpleRNNCell, keras.layers.GRUCell, keras.layers.LSTMCell])
+  def test_state_spec_with_stack_cell(self, cell):
+    # See https://github.com/tensorflow/tensorflow/issues/27817 for more detail.
+    batch = 12
+    timesteps = 10
+    input_dim = 8
+    output_dim = 8
+
+    def create_cell():
+      return [cell(output_dim),
+              cell(output_dim),
+              cell(output_dim)]
+
+    inputs = keras.Input((timesteps, input_dim))
+    encoder_output = keras.layers.RNN(create_cell(), return_state=True)(inputs)
+
+    states = encoder_output[1:]
+
+    decoder_output = keras.layers.RNN(
+        create_cell())(inputs, initial_state=states)
+
+    model = keras.models.Model(inputs, decoder_output)
+    model.compile(optimizer='rmsprop', loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.train_on_batch(
+        np.zeros((batch, timesteps, input_dim)),
+        np.zeros((batch, output_dim)))
+    model.predict(np.ones((batch, timesteps, input_dim)))
+
 
 class RNNCellWithConstants(keras.layers.Layer):
 
diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py
index 58bf527..903de3d 100644
--- a/tensorflow/python/keras/layers/recurrent_v2.py
+++ b/tensorflow/python/keras/layers/recurrent_v2.py
@@ -44,6 +44,14 @@
 _CPU_DEVICE_NAME = 'CPU'
 _GPU_DEVICE_NAME = 'GPU'
 
+# The following numeric constants represent the runtime of the defun backend
+# function. Since the CPU and GPU implementations are mathematically identical,
+# the function needs a signal that indicates which implementation was executed.
+# This is used in tests to verify that backend function swapping is correct.
+_RUNTIME_UNKNOWN = 0
+_RUNTIME_CPU = 1
+_RUNTIME_GPU = 2
+
 
 @keras_export('keras.layers.GRUCell', v1=[])
 class GRUCell(recurrent.GRUCell):
@@ -329,7 +337,7 @@
           time_major=self.time_major,
           zero_output_for_mask=self.zero_output_for_mask)
       # This is a dummy tensor for testing purpose.
-      runtime = _runtime('unknown')
+      runtime = _runtime(_RUNTIME_UNKNOWN)
     else:
       last_output, outputs, runtime, states = self._defun_gru_call(
           inputs, initial_state, training, mask)
@@ -490,7 +498,7 @@
       mask=mask,
       go_backwards=go_backwards,
       input_length=timesteps)
-  return last_output, outputs, new_states[0], _runtime('cpu')
+  return last_output, outputs, new_states[0], _runtime(_RUNTIME_CPU)
 
 
 def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major,
@@ -554,7 +562,7 @@
   if mask is not None:
     last_output = h
 
-  return last_output, outputs, h, _runtime('cudnn')
+  return last_output, outputs, h, _runtime(_RUNTIME_GPU)
 
 
 @keras_export('keras.layers.LSTMCell', v1=[])
@@ -823,7 +831,7 @@
           input_length=timesteps,
           time_major=self.time_major,
           zero_output_for_mask=self.zero_output_for_mask)
-      runtime = _runtime('unknown')
+      runtime = _runtime(_RUNTIME_UNKNOWN)
     else:
       # Use the new defun approach for backend implementation swap.
       # Note that different implementations need to have same function
@@ -1021,7 +1029,8 @@
       mask=mask,
       go_backwards=go_backwards,
       input_length=timesteps)
-  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
+  return (last_output, outputs, new_states[0], new_states[1],
+          _runtime(_RUNTIME_CPU))
 
 
 def cudnn_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask,
@@ -1100,7 +1109,7 @@
   # the last_output, since it is numerically same as the output.
   if mask is not None:
     last_output = h
-  return last_output, outputs, h, c, _runtime('cudnn')
+  return last_output, outputs, h, c, _runtime(_RUNTIME_GPU)
 
 
 def is_sequence_right_padded(mask, time_major):
@@ -1193,4 +1202,4 @@
 def _runtime(runtime_name):
   with ops.device('/cpu:0'):
     return constant_op.constant(
-        runtime_name, dtype=dtypes.string, name='runtime')
+        runtime_name, dtype=dtypes.float32, name='runtime')
diff --git a/tensorflow/python/keras/layers/subclassed_layers_test.py b/tensorflow/python/keras/layers/subclassed_layers_test.py
new file mode 100644
index 0000000..16dcc97
--- /dev/null
+++ b/tensorflow/python/keras/layers/subclassed_layers_test.py
@@ -0,0 +1,82 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras subclassed layers utilizing desired user syntax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+@keras_parameterized.run_with_all_model_types
+class SubclassedLayersTest(keras_parameterized.TestCase):
+
+  def test_simple_build_with_constant(self):
+
+    class BuildConstantLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.b = ops.convert_to_tensor(2.0)
+
+      def call(self, inputs):
+        return self.b * inputs
+
+    layer = BuildConstantLayer()
+    model = testing_utils.get_model_from_layers(
+        [layer, keras.layers.Dense(1)], input_shape=(1,))
+
+    x = ops.convert_to_tensor([[3.0]])
+    self.assertEqual(
+        tf_utils.is_symbolic_tensor(model(x)), not context.executing_eagerly())
+    self.assertEqual(
+        tf_utils.is_symbolic_tensor(layer(x)), not context.executing_eagerly())
+    self.assertAllClose(keras.backend.get_value(layer(x)), [[6.0]])
+
+  def test_build_with_derived_constant(self):
+
+    class BuildDerivedConstantLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        a = ops.convert_to_tensor(1.0)
+        b = 2.0 * a
+        self.variable = variables.Variable(b)
+        self.constant = ops.convert_to_tensor(self.variable)
+
+      def call(self, inputs):
+        return self.variable * self.constant * inputs
+
+    layer = BuildDerivedConstantLayer()
+    model = testing_utils.get_model_from_layers(
+        [layer, keras.layers.Dense(1)], input_shape=(1,))
+
+    x = ops.convert_to_tensor([[3.0]])
+    self.assertEqual(
+        tf_utils.is_symbolic_tensor(model(x)), not context.executing_eagerly())
+    self.assertEqual(
+        tf_utils.is_symbolic_tensor(layer(x)), not context.executing_eagerly())
+    self.assertAllClose(keras.backend.get_value(layer(x)), [[12.0]])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index 229bad2..e56b3b8 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -206,8 +206,10 @@
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     child_input_shape = tensor_shape.TensorShape([input_shape[0]] +
                                                  input_shape[2:])
-    child_output_shape = self.layer.compute_output_shape(
-        child_input_shape).as_list()
+    child_output_shape = self.layer.compute_output_shape(child_input_shape)
+    if not isinstance(child_output_shape, tensor_shape.TensorShape):
+      child_output_shape = tensor_shape.TensorShape(child_output_shape)
+    child_output_shape = child_output_shape.as_list()
     timesteps = input_shape[1]
     return tensor_shape.TensorShape([child_output_shape[0], timesteps] +
                                     child_output_shape[1:])
@@ -348,47 +350,95 @@
       One of {'sum', 'mul', 'concat', 'ave', None}.
       If None, the outputs will not be combined,
       they will be returned as a list.
+    backward_layer: Optional `Recurrent` instance to be used to handle
+      backwards input processing. If `backward_layer` is not provided,
+      the layer instance passed as the `layer` argument will be used to
+      generate the backward layer automatically.
+      Note that the provided `backward_layer` layer should have properties
+      matching those of the `layer` argument, in particular it should have the
+      same values for `stateful`, `return_state`, `return_sequences`, etc.
+      In addition, `backward_layer` and `layer` should have
+      different `go_backwards` argument values.
+      A `ValueError` will be raised if these requirements are not met.
 
   Call arguments:
     The call arguments for this layer are the same as those of the wrapped RNN
       layer.
 
   Raises:
-    ValueError: If not initialized with a `Layer` instance or
-      In case of invalid `merge_mode` argument.
+    ValueError:
+      1. If `layer` or `backward_layer` is not a `Layer` instance.
+      2. In case of invalid `merge_mode` argument.
+      3. If `backward_layer` has mismatched properties compared to `layer`.
 
   Examples:
 
   ```python
   model = Sequential()
-  model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
-  10)))
+  model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5, 10)))
   model.add(Bidirectional(LSTM(10)))
   model.add(Dense(5))
   model.add(Activation('softmax'))
   model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+  # With custom backward layer
+  model = Sequential()
+  forward_layer = LSTM(10, return_sequences=True)
+  backward_layer = LSTM(10, activation='relu', return_sequences=True,
+                        go_backwards=True)
+  model.add(Bidirectional(forward_layer, backward_layer=backward_layer,
+                          input_shape=(5, 10)))
+  model.add(Dense(5))
+  model.add(Activation('softmax'))
+  model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
   ```
   """
 
-  def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
+  def __init__(self,
+               layer,
+               merge_mode='concat',
+               weights=None,
+               backward_layer=None,
+               **kwargs):
     if not isinstance(layer, Layer):
       raise ValueError(
           'Please initialize `Bidirectional` layer with a '
           '`Layer` instance. You passed: {input}'.format(input=layer))
+    if backward_layer is not None and not isinstance(backward_layer, Layer):
+      raise ValueError('`backward_layer` need to be a `Layer` instance. '
+                       'You passed: {input}'.format(input=backward_layer))
     if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]:
       raise ValueError('Invalid merge mode. '
                        'Merge mode should be one of '
                        '{"sum", "mul", "ave", "concat", None}')
-    if getattr(layer, 'zero_output_for_mask', None) is not None:
-      # Force the zero_output_for_mask to be True if returning sequences.
-      layer.zero_output_for_mask = layer.return_sequences
+    # Recreate the forward layer from the original layer config, so that it will
+    # not carry over any state from the layer.
+    self.forward_layer = layer.__class__.from_config(layer.get_config())
 
-    self.forward_layer = copy.copy(layer)
-    config = layer.get_config()
-    config['go_backwards'] = not config['go_backwards']
-    self.backward_layer = layer.__class__.from_config(config)
+    if backward_layer is None:
+      config = layer.get_config()
+      config['go_backwards'] = not config['go_backwards']
+      self.backward_layer = layer.__class__.from_config(config)
+    else:
+      self.backward_layer = backward_layer
+      # Keep the custom backward layer config, so that we can save it later. The
+      # layer's name might be updated below with prefix 'backward_', and we want
+      # to preserve the original config.
+      self._backward_layer_config = backward_layer.get_config()
+
     self.forward_layer._name = 'forward_' + self.forward_layer.name
     self.backward_layer._name = 'backward_' + self.backward_layer.name
+
+    self._verify_layer_config()
+
+    def force_zero_output_for_mask(layer):
+      # Force the zero_output_for_mask to be True if returning sequences.
+      if getattr(layer, 'zero_output_for_mask', None) is not None:
+        layer.zero_output_for_mask = layer.return_sequences
+
+    force_zero_output_for_mask(self.forward_layer)
+    force_zero_output_for_mask(self.backward_layer)
+
     self.merge_mode = merge_mode
     if weights:
       nw = len(weights)
@@ -407,10 +457,28 @@
     self._setattr_tracking = True
     self.input_spec = layer.input_spec
 
+  def _verify_layer_config(self):
+    """Ensure the forward and backward layers have valid common property."""
+    if self.forward_layer.go_backwards == self.backward_layer.go_backwards:
+      raise ValueError('Forward layer and backward layer should have different '
+                       '`go_backwards` value.')
+
+    common_attributes = ('stateful', 'return_sequences', 'return_state')
+    for a in common_attributes:
+      forward_value = getattr(self.forward_layer, a)
+      backward_value = getattr(self.backward_layer, a)
+      if forward_value != backward_value:
+        raise ValueError(
+            'Forward layer and backward layer are expected to have the same '
+            'value for attribute {attr}, got {forward} and {backward}'.format(
+                attr=a, forward=forward_value, backward=backward_value))
+
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
-    output_shape = tuple(self.forward_layer.compute_output_shape(
-        input_shape).as_list())
+    output_shape = self.forward_layer.compute_output_shape(input_shape)
+    if not isinstance(output_shape, tensor_shape.TensorShape):
+      output_shape = tensor_shape.TensorShape(output_shape)
+    output_shape = tuple(output_shape.as_list())
     if self.return_state:
       state_shape = output_shape[1:]
       output_shape = output_shape[0]
@@ -611,12 +679,27 @@
     config = {'merge_mode': self.merge_mode}
     if self._num_constants is not None:
       config['num_constants'] = self._num_constants
+
+    if hasattr(self, '_backward_layer_config'):
+      config['backward_layer'] = {
+          'class_name': self.backward_layer.__class__.__name__,
+          'config': self._backward_layer_config,
+      }
     base_config = super(Bidirectional, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
+    # Instead of updating the input, create a copy and use that.
+    config = config.copy()
     num_constants = config.pop('num_constants', None)
+    backward_layer_config = config.pop('backward_layer', None)
+    if backward_layer_config is not None:
+      from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+      backward_layer = deserialize_layer(
+          backward_layer_config, custom_objects=custom_objects)
+      config['backward_layer'] = backward_layer
+
     layer = super(Bidirectional, cls).from_config(config,
                                                   custom_objects=custom_objects)
     layer._num_constants = num_constants
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index d394a20..e079852 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -20,12 +20,15 @@
 
 import copy
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.ops.array_ops import concat
 from tensorflow.python.platform import test
 from tensorflow.python.training.tracking import object_identity
 from tensorflow.python.training.tracking import util as trackable_util
@@ -290,8 +293,45 @@
     td3 = keras.layers.TimeDistributed(NoReshapeLayer())
     self.assertFalse(td3._always_use_reshape)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_TimeDistributed_output_shape_return_types(self):
 
-class BidirectionalTest(test.TestCase):
+    class TestLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return concat([inputs, inputs], axis=-1)
+
+      def compute_output_shape(self, input_shape):
+        output_shape = tensor_shape.TensorShape(input_shape).as_list()
+        output_shape[-1] = output_shape[-1] * 2
+        output_shape = tensor_shape.TensorShape(output_shape)
+        return output_shape
+
+    class TestListLayer(TestLayer):
+
+      def compute_output_shape(self, input_shape):
+        shape = super(TestListLayer, self).compute_output_shape(input_shape)
+        return shape.as_list()
+
+    class TestTupleLayer(TestLayer):
+
+      def compute_output_shape(self, input_shape):
+        shape = super(TestTupleLayer, self).compute_output_shape(input_shape)
+        return tuple(shape.as_list())
+
+    # Layers can specify output shape as list/tuple/TensorShape
+    test_layers = [TestLayer, TestListLayer, TestTupleLayer]
+    for layer in test_layers:
+      input_layer = keras.layers.TimeDistributed(layer())
+      inputs = keras.backend.placeholder(shape=(None, 2, 4))
+      output = input_layer(inputs)
+      self.assertEqual(output.shape.as_list(), [None, 2, 8])
+      self.assertEqual(
+          input_layer.compute_output_shape([None, 2, 4]).as_list(),
+          [None, 2, 8])
+
+
+class BidirectionalTest(test.TestCase, parameterized.TestCase):
 
   def test_bidirectional(self):
     rnn = keras.layers.SimpleRNN
@@ -677,6 +717,42 @@
       y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_Bidirectional_output_shape_return_types(self):
+
+    class TestLayer(keras.layers.SimpleRNN):
+
+      def call(self, inputs):
+        return concat([inputs, inputs], axis=-1)
+
+      def compute_output_shape(self, input_shape):
+        output_shape = tensor_shape.TensorShape(input_shape).as_list()
+        output_shape[-1] = output_shape[-1] * 2
+        return tensor_shape.TensorShape(output_shape)
+
+    class TestListLayer(TestLayer):
+
+      def compute_output_shape(self, input_shape):
+        shape = super(TestListLayer, self).compute_output_shape(input_shape)
+        return shape.as_list()
+
+    class TestTupleLayer(TestLayer):
+
+      def compute_output_shape(self, input_shape):
+        shape = super(TestTupleLayer, self).compute_output_shape(input_shape)
+        return tuple(shape.as_list())
+
+    # Layers can specify output shape as list/tuple/TensorShape
+    test_layers = [TestLayer, TestListLayer, TestTupleLayer]
+    for layer in test_layers:
+      input_layer = keras.layers.Bidirectional(layer(1))
+      inputs = keras.backend.placeholder(shape=(None, 2, 4))
+      output = input_layer(inputs)
+      self.assertEqual(output.shape.as_list(), [None, 2, 16])
+      self.assertEqual(
+          input_layer.compute_output_shape([None, 2, 4]).as_list(),
+          [None, 2, 16])
+
   def test_Bidirectional_last_output_with_masking(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -695,12 +771,12 @@
       wrapped = keras.layers.Bidirectional(
           rnn(units, return_state=True), merge_mode=merge_mode)
       outputs = _to_list(wrapped(masked_inputs, training=True))
-      self.assertEqual(len(outputs), 5)
+      self.assertLen(outputs, 5)
       self.assertEqual(outputs[0].shape.as_list(), [None, units * 2])
 
       model = keras.Model(inputs, outputs)
       y = _to_list(model.predict(x))
-      self.assertEqual(len(y), 5)
+      self.assertLen(y, 5)
       self.assertAllClose(y[0], np.concatenate([y[1], y[3]], axis=1))
 
   def test_Bidirectional_sequence_output_with_masking(self):
@@ -722,14 +798,121 @@
           rnn(units, return_sequences=True),
           merge_mode=merge_mode)
       outputs = _to_list(wrapped(masked_inputs, training=True))
-      self.assertEqual(len(outputs), 1)
+      self.assertLen(outputs, 1)
       self.assertEqual(outputs[0].shape.as_list(), [None, timesteps, units * 2])
 
       model = keras.Model(inputs, outputs)
       y = _to_list(model.predict(x))
-      self.assertEqual(len(y), 1)
+      self.assertLen(y, 1)
       self.assertAllClose(y[0][0, 2], np.zeros(units * 2))
 
+  @parameterized.parameters(['sum', 'concat'])
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_custom_backward_layer(self, mode):
+    rnn = keras.layers.SimpleRNN
+    samples = 2
+    dim = 2
+    timesteps = 2
+    output_dim = 2
+
+    x = np.random.random((samples, timesteps, dim))
+    target_dim = 2 * output_dim if mode == 'concat' else output_dim
+    y = np.random.random((samples, target_dim))
+    forward_layer = rnn(output_dim)
+    backward_layer = rnn(output_dim, go_backwards=True)
+
+    # test with Sequential model
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Bidirectional(
+            forward_layer,
+            merge_mode=mode,
+            backward_layer=backward_layer,
+            input_shape=(timesteps, dim)))
+    model.compile(optimizer='rmsprop', loss='mse')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+    # check whether the model variables are present in the
+    # trackable list of objects
+    checkpointed_objects = object_identity.ObjectIdentitySet(
+        trackable_util.list_objects(model))
+    for v in model.variables:
+      self.assertIn(v, checkpointed_objects)
+
+    # test compute output shape
+    ref_shape = model.layers[-1].output.shape
+    shape = model.layers[-1].compute_output_shape((None, timesteps, dim))
+    self.assertListEqual(shape.as_list(), ref_shape.as_list())
+
+    # test config
+    model.get_config()
+    model = keras.models.model_from_json(model.to_json())
+    model.summary()
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_custom_backward_layer_error_check(self):
+    rnn = keras.layers.LSTM
+    units = 2
+
+    forward_layer = rnn(units)
+    backward_layer = rnn(units)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'should have different `go_backwards` value.'):
+      keras.layers.Bidirectional(
+          forward_layer, merge_mode='concat', backward_layer=backward_layer)
+
+    for attr in ('stateful', 'return_sequences', 'return_state'):
+      kwargs = {attr: True}
+      backward_layer = rnn(units, go_backwards=True, **kwargs)
+      with self.assertRaisesRegexp(
+          ValueError, 'expected to have the same value for attribute ' + attr):
+        keras.layers.Bidirectional(
+            forward_layer, merge_mode='concat', backward_layer=backward_layer)
+
+  def test_custom_backward_layer_serialization(self):
+    rnn = keras.layers.LSTM
+    units = 2
+
+    forward_layer = rnn(units)
+    backward_layer = rnn(units, go_backwards=True)
+    layer = keras.layers.Bidirectional(
+        forward_layer, merge_mode='concat', backward_layer=backward_layer)
+    config = layer.get_config()
+    layer_from_config = keras.layers.Bidirectional.from_config(config)
+    new_config = layer_from_config.get_config()
+    self.assertDictEqual(config, new_config)
+
+  def test_rnn_layer_name(self):
+    rnn = keras.layers.LSTM
+    units = 2
+
+    layer = keras.layers.Bidirectional(rnn(units, name='rnn'))
+    config = layer.get_config()
+
+    self.assertEqual(config['layer']['config']['name'], 'rnn')
+
+    layer_from_config = keras.layers.Bidirectional.from_config(config)
+    self.assertEqual(layer_from_config.forward_layer.name, 'forward_rnn')
+    self.assertEqual(layer_from_config.backward_layer.name, 'backward_rnn')
+
+  def test_custom_backward_rnn_layer_name(self):
+    rnn = keras.layers.LSTM
+    units = 2
+
+    forward_layer = rnn(units)
+    backward_layer = rnn(units, go_backwards=True)
+    layer = keras.layers.Bidirectional(
+        forward_layer, merge_mode='concat', backward_layer=backward_layer)
+    config = layer.get_config()
+
+    self.assertEqual(config['layer']['config']['name'], 'lstm')
+    self.assertEqual(config['backward_layer']['config']['name'], 'lstm_1')
+
+    layer_from_config = keras.layers.Bidirectional.from_config(config)
+    self.assertEqual(layer_from_config.forward_layer.name, 'forward_lstm')
+    self.assertEqual(layer_from_config.backward_layer.name, 'backward_lstm_1')
+
 
 def _to_list(ls):
   if isinstance(ls, list):
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 43cf8cb..bcfad56 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -22,13 +22,14 @@
 
 import six
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils import losses_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -53,15 +54,40 @@
       return K.mean(math_ops.square(y_pred - y_true), axis=-1)
   ```
 
+  When used with `tf.distribute.Strategy`, outside of built-in training loops
+  such as `tf.keras` `compile` and `fit`, please use 'SUM' or 'NONE' reduction
+  types, and reduce losses explicitly in your training loop. Using 'AUTO' or
+  'SUM_OVER_BATCH_SIZE' will raise an error.
+
+  Please see
+  https://www.tensorflow.org/alpha/tutorials/distribute/training_loops for more
+  details on this.
+
+  You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like:
+  ```
+  with strategy.scope():
+    loss_obj = tf.keras.losses.CategoricalCrossentropy(
+        reduction=tf.keras.losses.Reduction.NONE)
+    ....
+    loss = (tf.reduce_sum(loss_obj(labels, predictions)) *
+            (1. / global_batch_size))
+  ```
+
   Args:
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: Optional name for the op.
   """
 
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
+  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None):
+    losses_utils.ReductionV2.validate(reduction)
     self.reduction = reduction
     self.name = name
 
@@ -91,11 +117,12 @@
     # If we are wrapping a lambda function strip '<>' from the name as it is not
     # accepted in scope name.
     scope_name = 'lambda' if self.name == '<lambda>' else self.name
-    with ops.name_scope(scope_name, format(self.__class__.__name__),
-                        (y_pred, y_true, sample_weight)):
+    graph_ctx = tf_utils.graph_context_for_symbolic_tensors(
+        y_true, y_pred, sample_weight)
+    with K.name_scope(scope_name or self.__class__.__name__), graph_ctx:
       losses = self.call(y_true, y_pred)
       return losses_utils.compute_weighted_loss(
-          losses, sample_weight, reduction=self.reduction)
+          losses, sample_weight, reduction=self._get_reduction())
 
   @classmethod
   def from_config(cls, config):
@@ -123,6 +150,29 @@
     """
     NotImplementedError('Must be implemented in subclasses.')
 
+  def _get_reduction(self):
+    """Handles `AUTO` reduction cases and returns the reduction value."""
+    if distribution_strategy_context.has_strategy() and (
+        self.reduction == losses_utils.ReductionV2.AUTO or
+        self.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE):
+      raise ValueError(
+          'Please use `tf.keras.losses.Reduction.SUM` or '
+          '`tf.keras.losses.Reduction.NONE` for loss reduction when losses are '
+          'used with `tf.distribute.Strategy` outside of the built-in training '
+          'loops. You can implement '
+          '`tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` using global batch '
+          'size like:\n```\nwith strategy.scope():\n'
+          '    loss_obj = tf.keras.losses.CategoricalCrossentropy('
+          'reduction=tf.keras.losses.Reduction.NONE)\n....\n'
+          '    loss = tf.reduce_sum(loss_obj(labels, predictions)) * '
+          '(1. / global_batch_size)\n```\nPlease see '
+          'https://www.tensorflow.org/alpha/tutorials/distribute/training_loops'
+          ' for more details.')
+    # No error was raised: resolve `AUTO` to its `SUM_OVER_BATCH_SIZE` default.
+    if self.reduction == losses_utils.ReductionV2.AUTO:
+      return losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
+    return self.reduction
+
 
 class LossFunctionWrapper(Loss):
   """Wraps a loss function in the `Loss` class.
@@ -131,14 +181,21 @@
     fn: The loss function to wrap, with signature `fn(y_true, y_pred,
       **kwargs)`.
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: (Optional) name for the loss.
     **kwargs: The keyword arguments that are passed on to `fn`.
   """
 
   def __init__(self,
                fn,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name=None,
                **kwargs):
     super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
@@ -160,7 +217,7 @@
   def get_config(self):
     config = {}
     for k, v in six.iteritems(self._fn_kwargs):
-      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+      config[k] = K.eval(v) if tf_utils.is_tensor_or_variable(v) else v
     base_config = super(LossFunctionWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
@@ -189,7 +246,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='mean_squared_error'):
     super(MeanSquaredError, self).__init__(
         mean_squared_error, name=name, reduction=reduction)
@@ -219,7 +276,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='mean_absolute_error'):
     super(MeanAbsoluteError, self).__init__(
         mean_absolute_error, name=name, reduction=reduction)
@@ -249,7 +306,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='mean_absolute_percentage_error'):
     super(MeanAbsolutePercentageError, self).__init__(
         mean_absolute_percentage_error, name=name, reduction=reduction)
@@ -279,7 +336,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='mean_squared_logarithmic_error'):
     super(MeanSquaredLogarithmicError, self).__init__(
         mean_squared_logarithmic_error, name=name, reduction=reduction)
@@ -320,17 +377,22 @@
       compute the loss between the predicted labels and a smoothed version of
       the true labels, where the smoothing squeezes the labels towards 0.5.
       Larger values of `label_smoothing` correspond to heavier smoothing.
-    reduction: (Optional) The type of `tf.keras.losses.Reduction` to use to
-      combine the computed loss values for the individual examples into a single
-      loss value for the entire batch of examples. Defaults to
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
       `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: (Optional) Name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='binary_crossentropy'):
     super(BinaryCrossentropy, self).__init__(
         binary_crossentropy,
@@ -379,14 +441,21 @@
       `label_smoothing=0.2` means that we will use a value of `0.1` for label
       `0` and `0.9` for label `1`"
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='categorical_crossentropy'):
     super(CategoricalCrossentropy, self).__init__(
         categorical_crossentropy,
@@ -432,13 +501,20 @@
     from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
       we assume that `y_pred` encodes a probability distribution.
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name=None):
     super(SparseCategoricalCrossentropy, self).__init__(
         sparse_categorical_crossentropy,
@@ -473,9 +549,7 @@
   ```
   """
 
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
+  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None):
     super(Hinge, self).__init__(hinge, name=name, reduction=reduction)
 
 
@@ -506,7 +580,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='squared_hinge'):
     super(SquaredHinge, self).__init__(
         squared_hinge, name=name, reduction=reduction)
@@ -533,7 +607,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='categorical_hinge'):
     super(CategoricalHinge, self).__init__(
         categorical_hinge, name=name, reduction=reduction)
@@ -561,9 +635,7 @@
   ```
   """
 
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name='poisson'):
+  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name='poisson'):
     super(Poisson, self).__init__(poisson, name=name, reduction=reduction)
 
 
@@ -589,9 +661,7 @@
   ```
   """
 
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name='logcosh'):
+  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name='logcosh'):
     super(LogCosh, self).__init__(logcosh, name=name, reduction=reduction)
 
 
@@ -618,7 +688,7 @@
   """
 
   def __init__(self,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='kullback_leibler_divergence'):
     super(KLDivergence, self).__init__(
         kullback_leibler_divergence, name=name, reduction=reduction)
@@ -655,13 +725,20 @@
     delta: A float, the point where the Huber loss function changes from a
       quadratic to linear.
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: Optional name for the op.
   """
 
   def __init__(self,
                delta=1.0,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='huber_loss'):
     super(Huber, self).__init__(
         huber_loss, name=name, reduction=reduction, delta=delta)
@@ -961,13 +1038,20 @@
     axis: (Optional) Defaults to -1. The dimension along which the cosine
       similarity is computed.
     reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
-      Default value is `SUM_OVER_BATCH_SIZE`.
+      Default value is `AUTO`. `AUTO` indicates that the reduction option will
+      be determined by the usage context. For almost all cases this defaults to
+      `SUM_OVER_BATCH_SIZE`.
+      When used with `tf.distribute.Strategy`, outside of built-in training
+      loops such as `tf.keras` `compile` and `fit`, using `AUTO` or
+      `SUM_OVER_BATCH_SIZE` will raise an error. Please see
+      https://www.tensorflow.org/alpha/tutorials/distribute/training_loops
+      for more details on this.
     name: Optional name for the op.
   """
 
   def __init__(self,
                axis=-1,
-               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.AUTO,
                name='cosine_similarity'):
     super(CosineSimilarity, self).__init__(
         cosine_similarity, reduction=reduction, name=name, axis=axis)
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index 1bca040..201da56 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -187,8 +187,7 @@
     mse_obj = keras.losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
 
     self.assertEqual(mse_obj.name, 'mean_squared_error')
-    self.assertEqual(mse_obj.reduction,
-                     losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
+    self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.AUTO)
 
     y_true = constant_op.constant([[1., 9.], [2., 5.]])
     y_pred = constant_op.constant([[4., 8.], [12., 3.]])
@@ -201,6 +200,16 @@
     # reduced_weighted_mse = (6 + 26) / 2 =
     self.assertAllClose(self.evaluate(loss), 16, 1e-2)
 
+  def test_invalid_reduction(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid Reduction Key Foo.'):
+      keras.losses.MeanSquaredError(reduction='Foo')
+    # A bad key assigned after construction must be rejected at call time.
+    loss_fn = keras.losses.MeanSquaredError()
+    loss_fn.reduction = 'Bar'
+    dummy = constant_op.constant([1])
+    with self.assertRaisesRegexp(ValueError, 'Invalid Reduction Key Bar.'):
+      loss_fn(dummy, dummy)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class MeanSquaredErrorTest(test.TestCase):
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 775c311..42ece0e 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -84,7 +84,7 @@
   model.add(tf.keras.layers.Dense(64, activation='relu'))
   model.add(tf.keras.layers.Dense(10, activation='softmax'))
 
-  model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
+  model.compile(optimizer=tf.compat.v1.train.RMSPropOptimizer(0.01),
                 loss=tf.keras.losses.categorical_crossentropy,
                 metrics=[tf.keras.metrics.CategoricalAccuracy()])
 
@@ -2290,6 +2290,10 @@
     Returns:
       Update op.
     """
+
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+
     # Flatten the input if its rank > 1.
     if y_pred.shape.ndims > 1:
       y_pred = array_ops.reshape(y_pred, [-1])
diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py
index abef3c4..4f761bf 100644
--- a/tensorflow/python/keras/metrics_correctness_test.py
+++ b/tensorflow/python/keras/metrics_correctness_test.py
@@ -18,32 +18,58 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import tf2
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import layers
+from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops.losses import loss_reduction
 from tensorflow.python.platform import test
 
 
+def get_multi_io_model():
+  """Returns the `testing_utils` multi-IO model for two Dense branches."""
+  first_input = layers.Input(shape=(1,), name='input_1')
+  second_input = layers.Input(shape=(1,), name='input_2')
+  shared = layers.Dense(3, kernel_initializer='ones', trainable=False)
+  frozen_ones = dict(kernel_initializer='ones', trainable=False)
+  first_head = layers.Dense(1, name='output_1', **frozen_ones)
+  second_head = layers.Dense(1, name='output_2', **frozen_ones)
+  # Kernels start at ones and are frozen; a branch is [input, shared, head].
+  return testing_utils.get_multi_io_model(
+      [first_input, shared, first_head],
+      [second_input, shared, second_head])
+
+
+def custom_generator_multi_io():
+  """Yields `(x, y, w)` batches of 2 forever, cycling over 4 fixed samples."""
+  batch_size, num_samples = 2, 4
+  inputs = np.asarray([[1.], [2.], [3.], [4.]])
+  targets = np.asarray([[2.], [4.], [6.], [8.]])
+  w1 = np.asarray([2., 3., 4., 5.])
+  w2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+  step = 0
+  while True:
+    # Wrap around so the same two batches repeat in the same order.
+    start = (step * batch_size) % num_samples
+    stop = start + batch_size
+    step += 1
+    # Both inputs and both targets carry the same data; only weights differ.
+    features = [inputs[start:stop], inputs[start:stop]]
+    labels = [targets[start:stop], targets[start:stop]]
+    yield features, labels, [w1[start:stop], w2[start:stop]]
+
+
 @keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
 @keras_parameterized.run_all_keras_modes
 class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase):
 
-  def _get_multi_io_model(self):
-    inp_1 = layers.Input(shape=(1,), name='input_1')
-    inp_2 = layers.Input(shape=(1,), name='input_2')
-    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
-    out_1 = layers.Dense(
-        1, kernel_initializer='ones', name='output_1', trainable=False)
-    out_2 = layers.Dense(
-        1, kernel_initializer='ones', name='output_2', trainable=False)
-
-    branch_a = [inp_1, x, out_1]
-    branch_b = [inp_2, x, out_2]
-    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+  def _get_compiled_multi_io_model(self):
+    model = get_multi_io_model()
     model.compile(
         optimizer='rmsprop',
         loss='mse',
@@ -135,7 +161,7 @@
     self.expected_batch_result = [41.25, 32.5, 8.75, 7.5, 9.286, 7.5, 4.375]
 
   def test_fit(self):
-    model = self._get_multi_io_model()
+    model = self._get_compiled_multi_io_model()
     history = model.fit([self.x, self.x], [self.y, self.y],
                         sample_weight={
                             'output_1': self.weights_1,
@@ -148,7 +174,7 @@
       self.assertAllClose(history.history[key], value, 1e-3)
 
   def test_eval(self):
-    model = self._get_multi_io_model()
+    model = self._get_compiled_multi_io_model()
     eval_result = model.evaluate([self.x, self.x], [self.y, self.y],
                                  batch_size=2,
                                  sample_weight={
@@ -167,7 +193,7 @@
     self.assertAllClose(mse1, mse2, 1e-3)
 
   def test_train_on_batch(self):
-    model = self._get_multi_io_model()
+    model = self._get_compiled_multi_io_model()
     result = model.train_on_batch([self.x, self.x], [self.y, self.y],
                                   sample_weight={
                                       'output_1': self.weights_1,
@@ -176,7 +202,7 @@
     self.assertAllClose(result, self.expected_batch_result, 1e-3)
 
   def test_test_on_batch(self):
-    model = self._get_multi_io_model()
+    model = self._get_compiled_multi_io_model()
     result = model.test_on_batch([self.x, self.x], [self.y, self.y],
                                  sample_weight={
                                      'output_1': self.weights_1,
@@ -185,15 +211,15 @@
     self.assertAllClose(result, self.expected_batch_result, 1e-3)
 
   def test_fit_generator(self):
-    model = self._get_multi_io_model()
+    model = self._get_compiled_multi_io_model()
     history = model.fit_generator(
-        self._custom_generator(), steps_per_epoch=2, epochs=2)
+        custom_generator_multi_io(), steps_per_epoch=2, epochs=2)
     for key, value in self.expected_fit_result.items():
       self.assertAllClose(history.history[key], value, 1e-3)
 
   def test_eval_generator(self):
-    model = self._get_multi_io_model()
-    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    model = self._get_compiled_multi_io_model()
+    eval_result = model.evaluate_generator(custom_generator_multi_io(), steps=2)
     self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
 
 
@@ -318,5 +344,150 @@
     self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
 
 
+@keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+@keras_parameterized.run_all_keras_modes
+@parameterized.parameters([
+    loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE,
+    loss_reduction.ReductionV2.AUTO,
+    loss_reduction.ReductionV2.SUM
+])
+class TestOutputLossMetrics(keras_parameterized.TestCase):
+
+  def _get_compiled_multi_io_model(self, loss):
+    model = get_multi_io_model()
+    model.compile(
+        optimizer='rmsprop',
+        loss=loss,
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def setUp(self):
+    super(TestOutputLossMetrics, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights_1 = np.asarray([2., 3., 4., 5.])
+    self.weights_2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Loss `output_1`:
+    #   Per-sample weighted losses
+    #   Batch 1 = [(3 - 2)^2 * 2, (6 - 4)^2 * 3] = [2, 12]
+    #   Batch 2 = [(9 - 6)^2 * 4, (12 - 8)^2 * 5] = [36, 80]
+
+    #   Result (reduction=SUM) = ((2 + 12) + (36 + 80))/2 = 65
+    #   Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 130 / 4 = 32.5
+
+    # Loss `output_2`:
+    #   Per-sample weighted losses
+    #   Batch 1 = [(3 - 2)^2 * 3.5, (6 - 4)^2 * 2.5] = [3.5, 10]
+    #   Batch 2 = [(9 - 6)^2 * 1.5, (12 - 8)^2 * 0.5] = [13.5, 8]
+
+    #   Result (reduction=SUM) = ((3.5 + 10) + (13.5 + 8))/2 = 17.5
+    #   Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 35 / 4 = 8.75
+
+    # When reduction is 'NONE' loss value that is passed to the optimizer will
+    # be vector loss but what is reported is a scalar, which is an average of
+    # all the values in all the batch vectors.
+
+    # Total loss = Output_loss_1 + Output_loss_2
+
+    sum_over_batch_size_fit_result = {
+        'loss': [41.25, 41.25],
+        'output_1_loss': [32.5, 32.5],
+        'output_2_loss': [8.75, 8.75],
+    }
+
+    self.expected_fit_result = {
+        loss_reduction.ReductionV2.NONE:
+            sum_over_batch_size_fit_result,
+        loss_reduction.ReductionV2.SUM: {
+            'loss': [82.5, 82.5],
+            'output_1_loss': [65, 65],
+            'output_2_loss': [17.5, 17.5],
+        },
+        loss_reduction.ReductionV2.AUTO:
+            sum_over_batch_size_fit_result,
+        loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE:
+            sum_over_batch_size_fit_result,
+    }
+
+    # In the order: 'loss', 'output_1_loss', 'output_2_loss'.
+    self.expected_batch_result = {
+        loss_reduction.ReductionV2.NONE: [41.25, 32.5, 8.75],
+        loss_reduction.ReductionV2.SUM: [82.5, 65, 17.5],
+        loss_reduction.ReductionV2.AUTO: [41.25, 32.5, 8.75],
+        loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE: [41.25, 32.5, 8.75],
+    }
+
+  def test_fit(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    history = model.fit([self.x, self.x], [self.y, self.y],
+                        sample_weight={
+                            'output_1': self.weights_1,
+                            'output_2': self.weights_2,
+                        },
+                        batch_size=2,
+                        epochs=2,
+                        shuffle=False)
+    for key, value in self.expected_fit_result[reduction].items():
+      self.assertAllClose(history.history[key], value)
+
+  def test_eval(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    eval_result = model.evaluate([self.x, self.x], [self.y, self.y],
+                                 batch_size=2,
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(eval_result, self.expected_batch_result[reduction])
+
+  def test_train_on_batch(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    result = model.train_on_batch([self.x, self.x], [self.y, self.y],
+                                  sample_weight={
+                                      'output_1': self.weights_1,
+                                      'output_2': self.weights_2,
+                                  })
+
+    expected_values = self.expected_batch_result[reduction]
+    if reduction == loss_reduction.ReductionV2.SUM:
+      # We are taking all the data as one batch, so undo the averaging here.
+      expected_values = [x * 2 for x in self.expected_batch_result[reduction]]
+    self.assertAllClose(result, expected_values)
+
+  def test_test_on_batch(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    result = model.test_on_batch([self.x, self.x], [self.y, self.y],
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    expected_values = self.expected_batch_result[reduction]
+    if reduction == loss_reduction.ReductionV2.SUM:
+      # We are taking all the data as one batch, so undo the averaging here.
+      expected_values = [x * 2 for x in self.expected_batch_result[reduction]]
+    self.assertAllClose(result, expected_values)
+
+  def test_fit_generator(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    history = model.fit_generator(
+        custom_generator_multi_io(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result[reduction].items():
+      self.assertAllClose(history.history[key], value)
+
+  def test_eval_generator(self, reduction):
+    model = self._get_compiled_multi_io_model(
+        loss=losses.MeanSquaredError(reduction=reduction))
+    eval_result = model.evaluate_generator(custom_generator_multi_io(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result[reduction])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 89485b2..c8b3a35 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -158,8 +158,8 @@
     self.assertEqual(600., self.evaluate(restore_sum.result()))
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class KerasMeanTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class KerasMeanTest(keras_parameterized.TestCase):
 
   # TODO(b/120949004): Re-enable garbage collection check
   # @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
@@ -294,6 +294,43 @@
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
+  def test_multiple_instances(self):
+    m = metrics.Mean()
+    m2 = metrics.Mean()
+
+    self.assertEqual(m.name, 'mean')
+    self.assertEqual(m2.name, 'mean')
+
+    self.assertEqual([v.name for v in m.variables],
+                     testing_utils.get_expected_metric_variable_names(
+                         ['total', 'count']))
+    self.assertEqual([v.name for v in m2.variables],
+                     testing_utils.get_expected_metric_variable_names(
+                         ['total', 'count'], name_suffix='_1'))
+
+    self.evaluate(variables.variables_initializer(m.variables))
+    self.evaluate(variables.variables_initializer(m2.variables))
+
+    # Initial state: both instances start with zeroed accumulators.
+    self.assertEqual(self.evaluate(m.total), 0)
+    self.assertEqual(self.evaluate(m.count), 0)
+    self.assertEqual(self.evaluate(m2.total), 0)
+    self.assertEqual(self.evaluate(m2.count), 0)
+
+    # Calling `m` updates only `m`; `m2` keeps its initial state.
+    self.assertEqual(self.evaluate(m(100)), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+    self.assertEqual(self.evaluate(m.count), 1)
+    self.assertEqual(self.evaluate(m2.total), 0)
+    self.assertEqual(self.evaluate(m2.count), 0)
+    # Calling `m2` updates only `m2`; `m` keeps its previous state.
+    self.assertEqual(self.evaluate(m2([63, 10])), 36.5)
+    self.assertEqual(self.evaluate(m2.total), 73)
+    self.assertEqual(self.evaluate(m2.count), 2)
+    self.assertEqual(self.evaluate(m.result()), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+    self.assertEqual(self.evaluate(m.count), 1)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class KerasAccuracyTest(test.TestCase):
@@ -1141,8 +1178,8 @@
     self.assertEqual(m_obj2.num_classes, 2)
 
   def test_unweighted(self):
-    y_pred = constant_op.constant([0, 1, 0, 1], dtype=dtypes.float32)
-    y_true = constant_op.constant([0, 0, 1, 1])
+    y_pred = [0, 1, 0, 1]
+    y_true = [0, 0, 1, 1]
 
     m_obj = metrics.MeanIoU(num_classes=2)
     self.evaluate(variables.variables_initializer(m_obj.variables))
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
index 492c067..2f827a3 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -130,8 +130,8 @@
     return self._variable.assign_sub(
         delta, use_locking=use_locking, name=name, read_value=read_value)
 
-  # TODO(reedwm): Support assigning variables with tf.assign(), var.scatter_add,
-  # etc.
+  # TODO(reedwm): Support assigning variables with tf.compat.v1.assign(),
+  # var.scatter_add, etc.
 
   def __getattr__(self, name):
     return getattr(self._variable, name)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py
index 8ff28fb..9c2d76e 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py
@@ -293,9 +293,14 @@
       'testcase_name': 'regularizer',
       'strategy_fn': create_mirrored_strategy,
       'use_regularizer': True
+  }, {
+      'testcase_name': 'nocloning',
+      'strategy_fn': create_mirrored_strategy,
+      'cloning': False
   })
   @test_util.run_in_graph_and_eager_modes
-  def test_model(self, strategy_fn, use_operator=False, use_regularizer=False):
+  def test_model(self, strategy_fn, use_operator=False, use_regularizer=False,
+                 cloning=True):
     regularizer = IdentityRegularizer() if use_regularizer else None
     with strategy_fn().scope():
       with policy.policy_scope('infer_float32_vars'):
@@ -314,7 +319,7 @@
         # the variable will not change. So this tests the learning rate not
         # applied to a float16 value, but instead the float32 variable.
         opt = gradient_descent.SGD(2 ** -14)
-        model.compile(opt, loss=loss_fn)
+        model.compile(opt, loss=loss_fn, cloning=cloning)
 
     self.assertEqual(backend.eval(layer.v), 1)
     x = np.ones((2, 1))
@@ -336,6 +341,53 @@
       'testcase_name': 'distribute',
       'strategy_fn': create_mirrored_strategy,
   }, {
+      'testcase_name': 'nocloning',
+      'strategy_fn': create_mirrored_strategy,
+      'cloning': False,
+  })
+  @test_util.run_in_graph_and_eager_modes
+  def test_fixed_loss_scaling(self, strategy_fn, cloning=True):
+    # Note: We do not test mixed precision in this method, only loss scaling.
+    loss_scale = 8.
+    batch_size = 4
+    with strategy_fn().scope():
+      x = layers.Input(shape=(1,), batch_size=batch_size)
+      layer = AddLayer()
+      y = layer(x)
+
+      # The gradient of 'y' at this point is 1. With loss scaling, the gradient
+      # is 'loss_scale'. We divide by the batch size since the loss is averaged
+      # across batch elements.
+      expected_gradient = loss_scale / batch_size
+      identity_with_grad_check_fn = (
+          mp_test_util.create_identity_with_grad_check_fn([expected_gradient]))
+      y = core.Lambda(identity_with_grad_check_fn)(y)
+      model = models.Model(inputs=x, outputs=y)
+
+      def loss_fn(y_true, y_pred):
+        del y_true
+        return math_ops.reduce_mean(y_pred)
+
+      opt = gradient_descent.SGD(1.)
+      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
+      model.compile(opt, loss=loss_fn, cloning=cloning)
+
+    self.assertEqual(backend.eval(layer.v), 1)
+    x = np.ones((batch_size, 1))
+    y = np.ones((batch_size, 1))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
+    model.fit(dataset)
+    # Variable starts at 1, and should have gradient of 1 subtracted from it.
+    expected = 0
+    self.assertEqual(backend.eval(layer.v), expected)
+
+  @parameterized.named_parameters({
+      'testcase_name': 'base',
+      'strategy_fn': default_strategy_fn
+  }, {
+      'testcase_name': 'distribute',
+      'strategy_fn': create_mirrored_strategy,
+  }, {
       'testcase_name': 'loss_scaling',
       'strategy_fn': create_mirrored_strategy,
       'use_loss_scaling': True
@@ -405,15 +457,21 @@
         # Layer does not have weight regularizer
         self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
 
+  # TODO(reedwm): Add and fix test where cloning=False is passed to
+  # Model.compile. Currently the test fails if cloning=False is passed.
   @parameterized.named_parameters({
       'testcase_name': 'base',
       'strategy_fn': default_strategy_fn
   }, {
       'testcase_name': 'distribute',
       'strategy_fn': create_mirrored_strategy,
+  }, {
+      'testcase_name': 'nocloning',
+      'strategy_fn': create_mirrored_strategy,
+      'cloning': False,
   })
   @test_util.run_in_graph_and_eager_modes
-  def test_dynamic_loss_scaling(self, strategy_fn):
+  def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
     strategy = strategy_fn()
     initial_loss_scale = 2.
     batch_size = 4
@@ -447,12 +505,12 @@
         loss_scale = loss_scale_module.DynamicLossScale(
             initial_loss_scale=initial_loss_scale, increment_period=2)
         opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
-        model.compile(opt, loss=loss_fn)
+        model.compile(opt, loss=loss_fn, cloning=cloning)
 
     self.assertEqual(backend.eval(layer.v), 1)
-    x = np.ones((2, 1))
-    y = np.ones((2, 1))
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
+    x = np.ones((batch_size, 1))
+    y = np.ones((batch_size, 1))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
     model.fit(dataset)
     # The variables starts with 1 and has a gradient of 1, so will go down by 1
     # each step.
diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
index 0327626..1f20122 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
@@ -41,6 +41,20 @@
     self.value = value
 
 
+def scale_loss(loss, loss_scale):
+  """Scales the loss by the loss scale."""
+  if callable(loss):
+    return lambda: loss() * loss_scale
+  else:
+    return loss * loss_scale
+
+
+def unscale_grads(grads, loss_scale):
+  """Unscales the gradients by the loss scale."""
+  loss_scale_reciprocal = 1. / loss_scale
+  return [g * loss_scale_reciprocal if g is not None else None for g in grads]
+
+
 @keras_export('keras.mixed_precision.experimental.LossScaleOptimizer')
 class LossScaleOptimizer(optimizer_v2.OptimizerV2):
   """An optimizer that applies loss scaling.
@@ -101,35 +115,23 @@
     self._track_trackable(self._loss_scale, 'loss_scale')
 
   def _compute_gradients(self, loss, var_list, grad_loss=None):
-    loss = self._scale_loss(loss)
+    loss = scale_loss(loss, self._loss_scale())
     grads_and_vars = self._optimizer._compute_gradients(loss, var_list,  # pylint: disable=protected-access
                                                         grad_loss)
     grads = [g for g, _ in grads_and_vars]
     variables = [v for _, v in grads_and_vars]
-    scaled_grads = self._scale_grads(grads)
-    return list(zip(scaled_grads, variables))
+    unscaled_grads = unscale_grads(grads, self._loss_scale())
+    return list(zip(unscaled_grads, variables))
 
   def get_gradients(self, loss, params):
-    loss = self._scale_loss(loss)
+    loss = scale_loss(loss, self._loss_scale())
     grads = self._optimizer.get_gradients(loss, params)
-    return self._scale_grads(grads)
-
-  def _scale_loss(self, loss):
-    # The loss is callable for `_compute_gradients`, but not `get_gradients`.
-    loss_scale = self._loss_scale()
-    if callable(loss):
-      return lambda: loss() * loss_scale
-    else:
-      return loss * loss_scale
-
-  def _scale_grads(self, grads):
-    loss_scale = self._loss_scale()
-    loss_scale_reciprocal = 1 / loss_scale
-    return [None if g is None else g * loss_scale_reciprocal for g in grads]
+    return unscale_grads(grads, self._loss_scale())
 
   def apply_gradients(self, grads_and_vars, name=None):
     if distribution_strategy_context.in_cross_replica_context():
       raise ValueError('apply_gradients() must be called in a replica context.')
+    grads_and_vars = tuple(grads_and_vars)
     return distribution_strategy_context.get_replica_context().merge_call(
         self._apply_gradients_cross_replica, args=(grads_and_vars, name))
 
diff --git a/tensorflow/python/keras/mixed_precision/experimental/test_util.py b/tensorflow/python/keras/mixed_precision/experimental/test_util.py
index a06b485..1d1e22d 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/test_util.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/test_util.py
@@ -81,7 +81,12 @@
     """Function whose gradient is NaN iff `have_nan_gradients` is True."""
     x = array_ops.identity(x)
     def grad(dx):
-      nan_scalar = constant_op.constant(float('NaN'), dtype=dx.dtype)
+      # We need this control dependency, because otherwise the NaN could be
+      # produced before `dx`. This in turn could cause the final gradient to be
+      # produced before `dx`, causing the loss scale to be updated before `dx`,
+      # which can cause `tf.assert_equal`s to fail.
+      with ops.control_dependencies([dx]):
+        nan_scalar = constant_op.constant(float('NaN'), dtype=dx.dtype)
       return control_flow_ops.cond(
           have_nan_gradients,
           lambda: array_ops.fill(dx.shape, nan_scalar),
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index b2d35b6..4c1c9cf 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -210,6 +210,18 @@
     self.assertTrue(test_model.uses_custom_build, 'Model should use user '
                                                   'defined build when called.')
 
+  def test_attribute_conflict_error(self):
+
+    class ModelWithProperty(keras.Model):
+
+      @property
+      def read_only(self):
+        return 1.
+
+    m = ModelWithProperty()
+    with self.assertRaisesRegexp(AttributeError, 'read_only'):
+      m.read_only = 2.
+
   def test_custom_build_with_fit(self):
 
     class DummyModel(keras.Model):
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index b864be4..6ae8795 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -406,14 +406,9 @@
       attributes_to_cache = [
           'inputs',
           'outputs',
-          '_feed_outputs',
-          '_feed_output_names',
-          '_feed_output_shapes',
-          '_feed_loss_fns',
           'loss_weights_list',
-          'targets',
-          '_feed_targets',
-          'sample_weight_modes',
+          '_training_targets',
+          '_sample_weight_modes',
           'total_loss',
           'sample_weights',
           '_feed_sample_weights',
diff --git a/tensorflow/python/keras/ops.py b/tensorflow/python/keras/ops.py
index b2d8520..6880bfd 100644
--- a/tensorflow/python/keras/ops.py
+++ b/tensorflow/python/keras/ops.py
@@ -98,5 +98,4 @@
     init_ops_v2.TruncatedNormal)
 # pylint: enable=bad-continuation
 
-
-keras_export("keras.backend.name_scope")(ops.name_scope)
+keras_export(v1=["keras.backend.name_scope"])(ops.name_scope)
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index c2d7262..14884d9 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -227,18 +227,18 @@
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "learning_rate_schedule_test",
     size = "medium",
     srcs = ["learning_rate_schedule_test.py"],
-    shard_count = 4,
-    deps = [
+    additional_deps = [
         ":optimizer_v2",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/keras",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 29d3bee..f9ff0a9 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -32,16 +32,16 @@
   """Optimizer that implements the Adam algorithm.
 
   Adam optimization is a stochastic gradient descent method that is based on
-  adaptive estimation of first-order and second-order moments. According to the
-  reference, the method is 'computationally efficient, has little memory
+  adaptive estimation of first-order and second-order moments.
+  According to the paper
+  [Adam: A Method for Stochastic Optimization. Kingma et al.,
+  2014](http://arxiv.org/abs/1412.6980),
+  the method is "*computationally efficient, has little memory
   requirement, invariant to diagonal rescaling of gradients, and is well suited
-  for problems that are large in terms of data/parameters'.
+  for problems that are large in terms of data/parameters*".
 
-  # References
-      See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-        ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
-      For AMSGrad see [Reddi et al., 2-18]
-        (https://openreview.net/pdf?id=ryQu7f-RZ)
+  For AMSGrad see [On The Convergence Of Adam And Beyond.
+  Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index c444f96..c36f6eb 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -60,7 +60,7 @@
   """
 
   def __init__(self,
-               learning_rate=0.001,
+               learning_rate=0.01,
                momentum=0.0,
                nesterov=False,
                name="SGD",
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
index c44263b..d7ae0ba 100644
--- a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
@@ -141,10 +141,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(
-        self.name, "ExponentialDecay",
-        [self.initial_learning_rate, step, self.decay_steps, self.decay_rate]
-    ) as name:
+    with ops.name_scope_v2(self.name or "ExponentialDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -238,8 +235,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "PiecewiseConstant",
-                        [step, self.boundaries, self.values, self.name]):
+    with ops.name_scope_v2(self.name or "PiecewiseConstant"):
       boundaries = ops.convert_n_to_tensor(self.boundaries)
       values = ops.convert_n_to_tensor(self.values)
       x_recomp = ops.convert_to_tensor(step)
@@ -389,11 +385,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(
-        self.name, "PolynomialDecay",
-        [self.initial_learning_rate, step, self.decay_steps,
-         self.end_learning_rate, self.power]
-    ) as name:
+    with ops.name_scope_v2(self.name or "PolynomialDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -512,9 +504,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "InverseTimeDecay",
-                        [self.initial_learning_rate, step, self.decay_rate]
-                       ) as name:
+    with ops.name_scope_v2(self.name or "InverseTimeDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -606,8 +596,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "CosineDecay",
-                        [self.initial_learning_rate, step]):
+    with ops.name_scope_v2(self.name or "CosineDecay"):
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -707,9 +696,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "SGDRDecay",
-                        [self.initial_learning_rate, step]
-                       ) as name:
+    with ops.name_scope_v2(self.name or "SGDRDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -844,8 +831,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "LinearCosineDecay",
-                        [self.initial_learning_rate, step]) as name:
+    with ops.name_scope_v2(self.name or "LinearCosineDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
@@ -969,8 +955,7 @@
     self.name = name
 
   def __call__(self, step):
-    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
-                        [self.initial_learning_rate, step]) as name:
+    with ops.name_scope_v2(self.name or "NoisyLinearCosineDecay") as name:
       initial_learning_rate = ops.convert_to_tensor(
           self.initial_learning_rate, name="initial_learning_rate")
       dtype = initial_learning_rate.dtype
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
index 87b97fa..b0dfd12 100644
--- a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
@@ -21,8 +21,11 @@
 import math
 from absl.testing import parameterized
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
@@ -117,6 +120,26 @@
     self.evaluate(x.assign(999))
     self.assertAllClose(self.evaluate(decayed_lr(x)), 0.001, 1e-6)
 
+  def testPiecewiseFunction(self, serialize):
+    del serialize
+    with context.eager_mode():
+      v = variables.Variable(1.)
+      def loss_fn():
+        return v * v
+      learning_rate = learning_rate_schedule.PiecewiseConstantDecay(
+          [1.], [1., 0.1])
+      opt = gradient_descent.SGD(learning_rate=learning_rate)
+
+      @def_function.function
+      def minimize():
+        with backprop.GradientTape() as tape:
+          loss = loss_fn()
+        g = tape.gradient(loss, [v])
+        opt.apply_gradients(list(zip(g, [v])))
+
+      minimize()
+      self.assertAllEqual(v.read_value(), -1.0)
+
   @test_util.run_in_graph_and_eager_modes
   def testPiecewiseConstantEdgeCases(self, serialize):
     x_int = resource_variable_ops.ResourceVariable(
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 61fdcbf..0adacd2 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -381,14 +381,16 @@
       ValueError: In case any gradient cannot be computed (e.g. if gradient
         function not implemented).
     """
+    params = nest.flatten(params)
     with backend.get_graph().as_default():
       grads = gradients.gradients(loss, params)
-    if None in grads:
-      raise ValueError("An operation has `None` for gradient. "
-                       "Please make sure that all of your ops have a "
-                       "gradient defined (i.e. are differentiable). "
-                       "Common ops without gradient: "
-                       "K.argmax, K.round, K.eval.")
+    for grad, param in zip(grads, params):
+      if grad is None:
+        raise ValueError("Variable {} has `None` for gradient. "
+                         "Please make sure that all of your ops have a "
+                         "gradient defined (i.e. are differentiable). "
+                         "Common ops without gradient: "
+                         "K.argmax, K.round, K.eval.".format(param))
     if hasattr(self, "clipnorm"):
       grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
     if hasattr(self, "clipvalue"):
@@ -456,11 +458,11 @@
         return update_op
 
     update_ops = []
-    with ops.name_scope(name, self._name) as name:
+    with backend.name_scope(name or self._name):
       for grad, var in grads_and_vars:
         scope_name = ("" if ops.executing_eagerly_outside_functions() else
                       "_" + var.op.name)
-        with ops.name_scope("update" + scope_name):
+        with backend.name_scope("update" + scope_name):
           update_ops.extend(
               distribution.extended.update(
                   var, apply_grad_to_update_var, args=(grad,), group=False))
@@ -614,7 +616,7 @@
   @iterations.setter
   def iterations(self, variable):
     if self._iterations is not None:
-      raise RuntimeError("Cannot set `iterations` to a new Variable after"
+      raise RuntimeError("Cannot set `iterations` to a new Variable after "
                          "the Optimizer weights have been created")
     self._iterations = variable
     self._weights.append(self._iterations)
@@ -672,7 +674,7 @@
     if "learning_rate" in config:
       if isinstance(config["learning_rate"], dict):
         config["learning_rate"] = learning_rate_schedule.deserialize(
-            config["learning_rate"])
+            config["learning_rate"], custom_objects=custom_objects)
     return cls(**config)
 
   def _serialize_hyperparameter(self, hyperparameter_name):
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 998f336..2d1170b 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -558,7 +558,9 @@
     loss = lambda: losses.mean_squared_error(model(x), y)
     var_list = lambda: model.trainable_weights
 
-    self.assertLen(var_list(), 0)
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for model .* have not yet been created'):
+      var_list()
     train_op = opt.minimize(loss, var_list)
     if not context.executing_eagerly():
       self.evaluate(variables.global_variables_initializer())
@@ -681,10 +683,10 @@
           num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
       model_tf.set_weights(model_k_v2.get_weights())
 
-      opt_k_v1 = optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True)
+      opt_k_v1 = optimizers.SGD(momentum=0.9, nesterov=True)
       opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
       opt_tf = momentum.MomentumOptimizer(
-          learning_rate=0.001, momentum=0.9, use_nesterov=True)
+          learning_rate=0.01, momentum=0.9, use_nesterov=True)
 
       model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
       model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 03ce3ab..fe37d95 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -118,9 +118,9 @@
   def test_adadelta(self):
     with self.cached_session():
       self._test_optimizer(keras.optimizers.Adadelta(), target=0.6)
-      # Accuracy seems dependent on the initialization. Even adding tf.Print
-      # nodes in the graph seemed to affect the initialization seed, and hence
-      # the accuracy.
+      # Accuracy seems dependent on the initialization. Even adding
+      # tf.compat.v1.Print nodes in the graph seemed to affect the
+      # initialization seed, and hence the accuracy.
       self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
   def test_adam(self):
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
index ffaf02b..866b884 100644
--- a/tensorflow/python/keras/saving/saved_model.py
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -150,11 +150,14 @@
     if isinstance(model, sequential.Sequential):
       # If input shape is not directly set in the model, the exported model
       # will infer the expected shapes of the input from the model.
-      if not model.built and input_signature is None:
-        raise ValueError(
-            'Sequential model\'s input shape is unknown. Please build the '
-            'model, or use the input_signature argument to specify the '
-            'model inputs.')
+      if not model.built:
+        raise ValueError('Weights for sequential model have not yet been '
+                         'created. Weights are created when the Model is first '
+                         'called on inputs or `build()` is called with an '
+                         '`input_shape`, or the first layer in the model has '
+                         '`input_shape` during construction.')
+      # TODO(kathywu): Build the model with input_signature to create the
+      # weights before _export_model_variables().
     else:
       raise NotImplementedError(
           'Subclassed models can only be exported for serving. Please set '
@@ -253,8 +256,8 @@
 
     # Make sure that iterations variable is added to the global step collection,
     # to ensure that, when the SavedModel graph is loaded, the iterations
-    # variable is returned by `tf.train.get_global_step()`. This is required for
-    # compatibility with the SavedModelEstimator.
+    # variable is returned by `tf.compat.v1.train.get_global_step()`. This is
+    # required for compatibility with the SavedModelEstimator.
     if compile_clone:
       g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
 
@@ -295,7 +298,11 @@
       builder.add_meta_graph(
           model_utils.EXPORT_TAG_MAP[mode],
           signature_def_map=_create_signature_def_map(clone, mode),
-          saver=saver_lib.Saver(clone_var_list),
+          saver=saver_lib.Saver(
+              clone_var_list,
+              # Allow saving Models with no variables. This is somewhat odd, but
+              # it's not necessarily a bug.
+              allow_empty=True),
           init_op=variables.local_variables_initializer(),
           train_op=train_op)
     return None
@@ -306,7 +313,7 @@
   inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
   if model.optimizer:
     targets_dict = {x.name.split(':')[0]: x
-                    for x in model.targets if x is not None}
+                    for x in model._targets if x is not None}
     inputs_dict.update(targets_dict)
   outputs_dict = {name: x
                   for name, x in zip(model.output_names, model.outputs)}
diff --git a/tensorflow/python/keras/saving/saved_model_test.py b/tensorflow/python/keras/saving/saved_model_test.py
index 50ddf1f..9542883 100644
--- a/tensorflow/python/keras/saving/saved_model_test.py
+++ b/tensorflow/python/keras/saving/saved_model_test.py
@@ -492,23 +492,20 @@
   def testSaveSequentialModelWithoutInputShapes(self):
     model = sequential_model_without_input_shape(True)
     # A Sequential model that hasn't been built should raise an error.
-    with self.assertRaisesRegexp(ValueError, 'Please build the model'):
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for sequential model have not yet been created'):
       keras_saved_model.export_saved_model(model, '')
 
-    saved_model_dir = self._save_model_dir()
-    keras_saved_model.export_saved_model(
-        model,
-        saved_model_dir,
-        input_signature=tensor_spec.TensorSpec(
-            shape=(10, 11, 12, 13, 14), dtype=dtypes.float32,
-            name='spec_input'))
-
-    with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, saved_model_dir,
-                                      mode_keys.ModeKeys.PREDICT)
-      self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
-      self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
-      self.assertEqual(3, outputs[next(iter(outputs.keys()))].shape[-1])
+    # Even with input_signature, the model's weights have not been created.
+    with self.assertRaisesRegexp(
+        ValueError, 'Weights for sequential model have not yet been created'):
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(
+          model,
+          saved_model_dir,
+          input_signature=tensor_spec.TensorSpec(
+              shape=(10, 11, 12, 13, 14), dtype=dtypes.float32,
+              name='spec_input'))
 
   @parameterized.parameters(
       {
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index dabfe1a..86ecb0f 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -23,6 +23,7 @@
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -405,7 +406,7 @@
     return x
 
 
-def get_model_from_layers(layers, input_shape=None):
+def get_model_from_layers(layers, input_shape=None, input_dtype=None):
   """Builds a model from a sequence of layers."""
   model_type = get_model_type()
   if model_type == 'subclass':
@@ -418,7 +419,8 @@
   if model_type == 'sequential':
     model = keras.models.Sequential()
     if input_shape:
-      model.add(keras.layers.InputLayer(input_shape=input_shape))
+      model.add(keras.layers.InputLayer(input_shape=input_shape,
+                                        dtype=input_dtype))
     for layer in layers:
       model.add(layer)
     return model
@@ -427,7 +429,7 @@
     if not input_shape:
       raise ValueError('Cannot create a functional model from layers with no '
                        'input shape.')
-    inputs = keras.Input(shape=input_shape)
+    inputs = keras.Input(shape=input_shape, dtype=input_dtype)
     outputs = inputs
     for layer in layers:
       outputs = layer(outputs)
@@ -682,3 +684,12 @@
     raise ValueError(
         'Could not find requested v2 optimizer: {}\nValid choices: {}'.format(
             name, list(_V2_OPTIMIZER_MAP.keys())))
+
+
+def get_expected_metric_variable_names(var_names, name_suffix=''):
+  """Returns expected metric variable names given names and prefix/suffix."""
+  if tf2.enabled() or context.executing_eagerly():
+    # In V1 eager mode and in V2, variable names are not made unique.
+    return [n + ':0' for n in var_names]
+  # In V1 graph mode variable names are made unique using a suffix.
+  return [n + name_suffix + ':0' for n in var_names]
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 4000378..ea376b2 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -590,3 +590,11 @@
     raise ValueError('Unknown entries in {} dictionary: {}. Only expected '
                      'following keys: {}'.format(name, list(unknown),
                                                  expected_values))
+
+
+def validate_kwargs(kwargs, allowed_kwargs,
+                    error_message='Keyword argument not understood:'):
+  """Checks that all keyword arguments are in the set of allowed keys."""
+  for kwarg in kwargs:
+    if kwarg not in allowed_kwargs:
+      raise TypeError(error_message, kwarg)
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index 66d3c6e..c3e3d9e 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -144,7 +144,7 @@
 
 def _num_elements(losses):
   """Computes the number of elements in `losses` tensor."""
-  with ops.name_scope(None, 'num_elements', values=[losses]) as scope:
+  with K.name_scope('num_elements') as scope:
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
@@ -182,9 +182,14 @@
     `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
   """
   ReductionV2.validate(reduction)
+
+  # If this function is called directly, then we just default 'AUTO' to
+  # 'SUM_OVER_BATCH_SIZE', e.g. for canned estimator use cases.
+  if reduction == ReductionV2.AUTO:
+    reduction = ReductionV2.SUM_OVER_BATCH_SIZE
   if sample_weight is None:
     sample_weight = 1.0
-  with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
+  with K.name_scope(name or 'weighted_loss'):
     # Save the `reduction` argument for loss normalization when distributing
     # to multiple replicas. Used only for estimator + v1 optimizer flow.
     ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py
index e31ef79..ce1eb3f 100644
--- a/tensorflow/python/keras/utils/metrics_utils.py
+++ b/tensorflow/python/keras/utils/metrics_utils.py
@@ -27,6 +27,7 @@
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.generic_utils import to_list
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
@@ -68,7 +69,8 @@
   def decorated(metric_obj, *args, **kwargs):
     """Decorated function with `add_update()`."""
 
-    update_op = update_state_fn(*args, **kwargs)
+    with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs):
+      update_op = update_state_fn(*args, **kwargs)
     if update_op is not None:  # update_op will be None in eager execution.
       metric_obj.add_update(update_op, inputs=True)
     return update_op
@@ -108,7 +110,7 @@
       # with distribution object as the first parameter. We create a wrapper
       # here so that the result function need not have that parameter.
       def merge_fn_wrapper(distribution, merge_fn, *args):
-        # We will get `PerDevice` merge function. Taking the first one as all
+        # We will get `PerReplica` merge function. Taking the first one as all
         # are identical copies of the function that we had passed below.
         merged_result_fn = (
             distribution.experimental_local_results(merge_fn)[0](*args))
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils.py b/tensorflow/python/keras/utils/multi_gpu_utils.py
index 1668287..9eeade4 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils.py
@@ -219,7 +219,7 @@
   # each getting a slice of the inputs.
   for i, gpu_id in enumerate(target_gpu_ids):
     with ops.device('/gpu:%d' % gpu_id):
-      with ops.name_scope('replica_%d' % gpu_id):
+      with K.name_scope('replica_%d' % gpu_id):
         inputs = []
         # Retrieve a slice of the input.
         for x in model.inputs:
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 9c711bd..38f53c6 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -221,7 +221,7 @@
                                    name='siamese')
       parallel_siamese = keras.utils.multi_gpu_model(siamese, gpus)
       self.assertEqual(parallel_siamese.output_names,
-                       ['add', 'nested_1', 'nested_2'])
+                       ['add', 'nested', 'nested_1'])
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 420a74f..beed7ce 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -25,9 +25,11 @@
 from tensorflow.python.framework import smart_cond as smart_module
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend as K
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 
 
 def smart_cond(pred, true_fn=None, false_fn=None, name=None):
@@ -370,3 +372,57 @@
 
 def is_tensor_or_variable(x):
   return tensor_util.is_tensor(x) or isinstance(x, variables.Variable)
+
+
+def assert_no_legacy_layers(layers):
+  """Prevent tf.layers.Layers from being used with Keras.
+
+  Certain legacy layers inherit from their keras analogs; however they are
+  not supported with keras and can lead to subtle and hard to diagnose bugs.
+
+  Args:
+    layers: A list of layers to check
+
+  Raises:
+    TypeError: If any elements of layers are tf.layers.Layers
+  """
+
+  # isinstance check for tf.layers.Layer introduces a circular dependency.
+  legacy_layers = [l for l in layers if getattr(l, '_is_legacy_layer', None)]
+  if legacy_layers:
+    layer_str = '\n'.join(['  ' + str(l) for l in legacy_layers])
+    raise TypeError(
+        'The following are legacy tf.layers.Layers:\n{}\nTo use keras as a '
+        'framework (for instance using the Network, Model, or Sequential '
+        'classes), please use the tf.keras.layers implementation instead. '
+        '(Or, if writing custom layers, subclass from tf.keras.layers rather '
+        'than tf.layers)'.format(layer_str))
+
+
+@tf_contextlib.contextmanager
+def maybe_init_scope(layer):
+  """Open an `init_scope` if in V2 mode and using the keras graph.
+
+  Arguments:
+    layer: The Layer/Model that is currently active.
+
+  Yields:
+    None
+  """
+  # Don't open an init_scope in V1 mode or when using legacy tf.layers.
+  if (ops.executing_eagerly_outside_functions() and
+      getattr(layer, '_keras_style', True)):
+    with ops.init_scope():
+      yield
+  else:
+    yield
+
+
+@tf_contextlib.contextmanager
+def graph_context_for_symbolic_tensors(*args, **kwargs):
+  """Enters `K.get_graph()` as default if any input is a symbolic tensor."""
+  if any(is_symbolic_tensor(v) for v in list(args) + list(kwargs.values())):
+    with K.get_graph().as_default():
+      yield
+  else:
+    yield
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 38c4735..32951c3 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -131,6 +131,7 @@
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:client_testlib",
@@ -235,7 +236,10 @@
         "//tensorflow/python/ops/linalg",
     ],
     shard_count = 5,
-    tags = ["no_gpu"],
+    tags = [
+        "no_windows_gpu",
+        "nomsan",  # TODO(b/131773093): Re-enable.
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -464,6 +468,15 @@
 )
 
 tf_py_test(
+    name = "fingerprint_op_test",
+    size = "small",
+    srcs = ["fingerprint_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
     name = "fractional_avg_pool_op_test",
     size = "small",
     srcs = ["fractional_avg_pool_op_test.py"],
@@ -1069,6 +1082,7 @@
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:string_ops",
         "//tensorflow/python/ops/ragged:ragged_string_ops",
+        "//tensorflow/python/ops/ragged:ragged_test_util",
     ],
 )
 
@@ -1114,6 +1128,34 @@
 )
 
 tf_py_test(
+    name = "string_lower_op_test",
+    size = "small",
+    srcs = ["string_lower_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
+    name = "string_upper_op_test",
+    size = "small",
+    srcs = ["string_upper_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
     name = "substr_op_test",
     size = "small",
     srcs = ["substr_op_test.py"],
@@ -1715,6 +1757,9 @@
         "//tensorflow/python:while_v2",
     ],
     shard_count = 16,
+    tags = [
+        "notsan",  # TODO(b/132205147): Re-enable this.
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1743,7 +1788,6 @@
         "//tensorflow/python:control_flow_util_v2",
         "//tensorflow/python:while_v2",
     ],
-    tags = ["no_gpu"],  # TODO(b/117796385): runs out of memory
 )
 
 cuda_py_test(
@@ -2271,7 +2315,6 @@
     ],
     shard_count = 6,
     tags = [
-        "no_oss",
         "no_windows_gpu",
     ],
     xla_enable_strict_auto_jit = True,
@@ -2676,6 +2719,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:cond_v2",
         "//tensorflow/python:while_v2",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
@@ -3198,6 +3242,7 @@
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    tags = ["no_oss"],  # b/124474135
     xla_enable_strict_auto_jit = True,
 )
 
@@ -3390,7 +3435,9 @@
         "no_rocm",  # flaky test
         "no_windows",
     ],
-    # b/127344411: xla_enable_strict_auto_jit = True,
+    # TODO(b/127344411): This test passes because XLA does not actually cluster
+    # the self_adjoint_eig op.
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3427,7 +3474,9 @@
         "no_oss",  # b/117185141.
         "nomsan",  # TODO(b/117236102): Re-enable in msan build.
     ],
-    # b/127344411: xla_enable_strict_auto_jit = True,
+    # TODO(b/127344411): This test passes because XLA does not actually cluster
+    # the svd op.
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3447,7 +3496,7 @@
         "no_windows_gpu",
         "nomsan",
     ],
-    # b/127344411: xla_enable_strict_auto_jit = True,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3760,4 +3809,18 @@
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
     ],
+    xla_enable_strict_auto_jit = True,
+)
+
+tf_py_test(
+    name = "tridiagonal_matmul_op_test",
+    size = "medium",
+    srcs = ["tridiagonal_matmul_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+    shard_count = 5,
 )
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 7bf2d1d..a4e3f64 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -23,6 +23,7 @@
 
 import numpy as np
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -263,6 +264,22 @@
       with self.assertRaisesRegexp(ValueError, "incompatible"):
         array_ops.boolean_mask(tensor, mask).eval()
 
+  @test_util.run_deprecated_v1
+  def testStringMask(self):
+    # Reproduces b/111171330, where the optimized boolean_mask graph would
+    # be incorrectly placed on GPU.
+    with ops.Graph().as_default():
+      tile_placeholder = array_ops.placeholder(dtypes.int32, [2])
+      string_tensor = array_ops.tile([["hello"]], tile_placeholder)
+      bool_tensor = array_ops.tile([[True]], tile_placeholder)
+      masked_tensor = array_ops.boolean_mask(string_tensor, bool_tensor)
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.shape_optimization = 1
+      config.gpu_options.per_process_gpu_memory_fraction = 0.3
+      with session.Session(config=config) as sess:
+        result = sess.run(masked_tensor, feed_dict={tile_placeholder: [2, 2]})
+        self.assertAllEqual([b"hello", b"hello", b"hello", b"hello"], result)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class OperatorShapeTest(test_util.TensorFlowTestCase):
@@ -1058,8 +1075,8 @@
         var = variables.Variable(self.x)
       sess.run(variables.variables_initializer([var]))
       val = sess.run(var[index].assign(value))
-      # val_copy is used to check that tf.assign works equivalently to the
-      # assign method above.
+      # val_copy is used to check that tf.compat.v1.assign works equivalently
+      # to the assign method above.
       val_copy = sess.run(state_ops.assign(var[index], value))
       valnp = np.copy(self.x_np)
       valnp[index] = np.array(value)
diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
index 1d40115..4cc4279 100644
--- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
@@ -28,7 +28,6 @@
 from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
@@ -143,7 +142,7 @@
                                       use_static_shape):
 
   def Test(self):
-    with compat.forward_compatibility_horizon(2019, 4, 19):
+    with compat.forward_compatibility_horizon(2019, 4, 26):
       np.random.seed(42)
       self._testBroadcasting(dtype, adjoint_a, adjoint_b, use_static_shape)
 
@@ -200,7 +199,7 @@
     def CheckGradients(self, a_shape, b_shape):
       self._compare(a_shape, b_shape, dtype, adjoint_a, adjoint_b)
 
-    with compat.forward_compatibility_horizon(2019, 4, 19):
+    with compat.forward_compatibility_horizon(2019, 4, 26):
       CheckGradients(self, [1, 5, 2, 3], [7, 1, 3, 2])
       CheckGradients(self, [2, 3], [1, 3, 5])
       CheckGradients(self, [2, 3], [5, 3, 5])
@@ -231,7 +230,7 @@
 
   def benchmarkBatchMatMulBroadcast(self):
     for (a_shape, b_shape) in self.shape_pairs:
-      with compat.forward_compatibility_horizon(2019, 4, 19):
+      with compat.forward_compatibility_horizon(2019, 4, 26):
         with ops.Graph().as_default(), \
             session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
@@ -264,17 +263,6 @@
               name="batch_matmul_manual_broadcast_cpu_{}_{}".format(
                   a_shape, b_shape))
 
-          # Use linear_operator_util.matmul_with_broadcast.
-          name_template = (
-              "batch_matmul_manual_broadcast_with_linear_operator_util"
-              "_cpu_{}_{}"
-          )
-          self.run_op_benchmark(
-              sess,
-              linear_operator_util.matmul_with_broadcast(matrix_a, matrix_b),
-              min_iters=50,
-              name=name_template.format(a_shape, b_shape))
-
 
 if __name__ == "__main__":
   for dtype_ in [
diff --git a/tensorflow/python/kernel_tests/boosted_trees/BUILD b/tensorflow/python/kernel_tests/boosted_trees/BUILD
index d2f6c97..d23795e 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/BUILD
+++ b/tensorflow/python/kernel_tests/boosted_trees/BUILD
@@ -56,9 +56,13 @@
     srcs = ["stats_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
     ],
 )
 
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index f952cee..cda881b 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -392,7 +392,7 @@
     self.assertAllEqual([1, 1], feature_dimensions)
     self.assertAllEqual([_INEQUALITY_DEFAULT_LEFT] * 2, split_types)
 
-  def testCalculateBestGainsWithMinNodeWeightNoSplitOnFeturePossible(self):
+  def testCalculateBestGainsWithMinNodeWeightNoSplitOnFeaturePossible(self):
     """Testing Gain calculation with min node weight and no split."""
     with self.cached_session() as sess:
       max_splits = 7
@@ -694,5 +694,43 @@
     self._verify_precision(length=50000000)
 
 
+class BestFeatureSplitMultiClass(test_util.TensorFlowTestCase):
+  """Placeholder tests for multi-class/multi-regression best splits."""
+
+  def testCalculateBestMultiDimFeatureSplitsWithoutRegularizationMultiClass(
+      self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateMultiDimBestFeatureSplitsWithL2(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateMultiDimBestFeatureSplitsWithMinNodeWeight(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateMultiDimBestFeatureSplitsGradAlmostZero(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateBestMultiDimFeatureSplitsWithL1(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateBestMultiDimFeatureSplitsWithTreeComplexity(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateMultiDimBestSplitsWithMinNodeWeight(self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+  def testCalculateBestMultiDimFeatureSplitsWithNoSplitOnFeaturePossible(
+      self):
+    # TODO(crawles): Implement this test.
+    self.skipTest('Not implemented yet.')
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index 2305c0b..e17a029 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -29,9 +29,7 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -43,10 +41,6 @@
 
 
 # Different gradient implementations for benchmark purposes
-def SpecializedGrad(l, grad):
-  return gen_linalg_ops.cholesky_grad(l, grad)
-
-
 def _GradWithInverseL(l, l_inverse, grad):
   middle = math_ops.matmul(l, grad, adjoint_a=True)
   middle = array_ops.matrix_set_diag(middle,
@@ -251,20 +245,6 @@
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.complex128,), scalarTest=True)
 
-  def testAgainstSpecialized(self):
-    np.random.seed(0)
-    data = np.random.randn(33, 33).astype(np.float32)
-    data = np.matmul(data, data.T)
-    grad_data = np.random.randn(*data.shape).astype(np.float32)
-
-    with ops.Graph().as_default(), self.session(use_gpu=False) as s:
-      x = constant_op.constant(data, dtypes_lib.float32)
-      chol = linalg_ops.cholesky(x)
-      composite_grad = gradients_impl.gradients(chol, x, grad_data)[0]
-      specialized_grad = SpecializedGrad(chol, grad_data)
-      reference, actual = s.run([specialized_grad, composite_grad])
-    self.assertAllClose(reference, actual)
-
   def runFiniteDifferences(self,
                            shapes,
                            dtypes=(dtypes_lib.float32, dtypes_lib.float64,
@@ -403,7 +383,6 @@
                    "/cpu:0")
     _BenchmarkGrad(TriAngSolveCompositeGrad, "composite_triangular_solve",
                    "/cpu:0")
-    _BenchmarkGrad(SpecializedGrad, "specialized", "/cpu:0")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index cc07315..93a3c38 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -486,7 +486,7 @@
 
   def testClipByAverageNormReplacedWithClipByNorm(self):
     # Check clip_by_average_norm(t) is the same as
-    # clip_by_norm(t, clip_norm * tf.to_float(tf.size(t)))
+    # clip_by_norm(t, clip_norm * tf.compat.v1.to_float(tf.size(t)))
     with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 1b1ff0d..34de124 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -42,6 +42,11 @@
 from tensorflow.python.util import compat
 
 
+_OPTIONAL_OPS = frozenset([
+    "OptionalFromValue", "OptionalNone", "OptionalHasValue", "OptionalGetValue"
+])
+
+
 class CondV2Test(test.TestCase):
 
   def _testCond(self, true_fn, false_fn, train_vals, feed_dict=None):
@@ -753,6 +758,47 @@
     # TODO(skyewm): check the actual graphs that are run once we have a way to
     # programmatically access those graphs.
 
+  # b/131355614
+  @test_util.run_deprecated_v1
+  def testNoOptionalsInXla(self):
+    """Checks cond_v2 branch functions built under XLA hold no Optional ops."""
+    @def_function.function
+    def func_with_cond():
+      pred = constant_op.constant(True, name="pred")
+      x = constant_op.constant(1.0, name="x")
+
+      def true_fn():
+        intermediate = x + 1
+        return intermediate * x
+
+      def false_fn():
+        return x + 1
+
+      output = cond_v2.cond_v2(pred, true_fn, false_fn)
+      grad = gradients_impl.gradients(output, x)[0]
+
+      forward_if_op = output.op.inputs[0].op
+      gradient_if_op = grad.op.inputs[0].op
+
+      def verify_no_optional_ops(op, branch_name):
+        branch_function = ops.get_default_graph()._get_function(
+            op.get_attr(branch_name).name)
+        for node_def in branch_function.definition.node_def:
+          self.assertNotIn(node_def.op, _OPTIONAL_OPS)
+
+      for if_op in (forward_if_op, gradient_if_op):
+        for branch_name in ("then_branch", "else_branch"):
+          verify_no_optional_ops(if_op, branch_name)
+
+      return grad
+
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    try:
+      func_with_cond()
+    finally:
+      xla_context.Exit()  # Always leave the XLA context, even on failure.
+
   @test_util.run_deprecated_v1
   def testLoweringDisabledWithSingleThreadedExecutorContext(self):
     with self.session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 583082c..6780011 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -865,7 +865,7 @@
     # Load graph generated from earlier version of TF where
     # placeholder shape was not set.
     #
-    # a = tf.placeholder(tf.float32)
+    # a = tf.compat.v1.placeholder(tf.float32)
     # b = a + 1.0
     #
     # Older graph's default shape is 'shape {}', not 'shape {
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 73d9852..feb1043 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -448,12 +448,8 @@
       values = constant_op.constant(10)
       indices = constant_op.constant(0)
       x = ops.IndexedSlices(values, indices)
-      v1_msg = "The two structures don't have the same nested structure"
-      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
-                "number, type, and overall structure of return values.")
       with self.assertRaisesRegexp(
-          TypeError,
-          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
+          TypeError, "Cannot reconcile tf.cond 0-th outputs"):
         control_flow_ops.cond(
             constant_op.constant(True),
             lambda: ops.IndexedSlices(math_ops.add(x.values, 1), indices),
@@ -516,7 +512,6 @@
       self.assertAllEqual(sess.run(g, {pred: True}), [2.0, 2.0, 2.0])
       self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0])
 
-  @test_util.disable_control_flow_v2("b/113293074")
   @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlicesDifferentTypes(self):
     with self.cached_session():
@@ -814,19 +809,17 @@
       test_result = self.evaluate(r)
       self.assertDictEqual({"a": 210, "b": 210}, test_result)
 
-  @test_util.run_deprecated_v1
   def testEmbeddedListOutput(self):
-    with self.cached_session() as sess:
-      x = constant_op.constant(10)
-      y = constant_op.constant(200)
-      pred = math_ops.less(1, 2)
-      fn1 = lambda: [[math_ops.add(x, y), math_ops.add(x, y)]]
-      fn2 = lambda: [[y, y]]
-      # Pass strict=True flag as cond_v2 allows for tensors to be
-      # in nested output structures as singletons
-      r = control_flow_ops.cond(pred, fn1, fn2, strict=True)
-      test_result = self.evaluate(r)
-      self.assertListEqual([[210, 210]], test_result)
+    x = constant_op.constant(10)
+    y = constant_op.constant(200)
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: [[math_ops.add(x, y), math_ops.add(x, y)]]
+    fn2 = lambda: [[y, y]]
+    # Pass strict=True flag as cond_v2 allows for tensors to be
+    # in nested output structures as singletons
+    r = control_flow_ops.cond(pred, fn1, fn2, strict=True)
+    test_result = self.evaluate(r)
+    self.assertListEqual([[210, 210]], test_result)
 
   def testEmbeddedTupleOutput(self):
     with self.cached_session() as sess:
@@ -1046,7 +1039,6 @@
       result = gradients_impl.gradients(z, x)[0]
       self.assertEqual(1.0, self.evaluate(result))
 
-  @test_util.disable_control_flow_v2("b/113327884")
   @test_util.run_v1_only("b/120545219")
   def testCondGrad_Gather(self):
     with self.cached_session() as sess:
@@ -1790,6 +1782,18 @@
       r = r[1] * array_ops.ones([8, 8])
       self.assertAllEqual(np.ones((8, 8)), self.evaluate(r))
 
+  @test_util.disable_control_flow_v2("b/131265085")
+  @test_util.run_v1_only("b/131265085")
+  def testWhileBadShape(self):
+    x = constant_op.constant([2.0, 4.0], name="values")
+    i = constant_op.constant(0)
+    c = lambda i, _: math_ops.less(i, 10)
+    b = lambda i, x: [i + 1, x + 1]
+    with self.assertRaisesRegexp(ValueError, "is not compatible with"):
+      # Shape of x is [2], but we specify a shape of [5].
+      control_flow_ops.while_loop(
+          c, b, [i, x], [i.shape, tensor_shape.TensorShape([5])])
+
   @test_util.run_deprecated_v1
   def testWhileWithNonTensorInput_Scalar(self):
     with self.cached_session():
@@ -1807,7 +1811,6 @@
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual([10000], self.evaluate(r))
 
-  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1822,19 +1825,23 @@
       r = control_flow_ops.while_loop(
           c, b, [i, m],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
-      self.assertIsNone(r[1].shape.dims[0].value)
-      self.assertEqual(r[1].shape.dims[1], tensor_shape.Dimension(2))
+      self.assertTrue(r[1].shape.is_compatible_with([8, 2]))
 
+  @test_util.run_v1_only("b/120545219")
+  def testWhileShapeInferenceBadShape(self):
+    with self.cached_session():
+      i = constant_op.constant(0)
+      m = array_ops.ones([2, 2])
+      c = lambda i, j: math_ops.less(i, 2)
+      b = lambda i, j: [i + 1, array_ops.concat([j, j], 0)]
       with self.assertRaisesRegexp(
           ValueError,
           r"Input tensor 'ones:0' enters the loop with shape \(2, 2\), but has "
           r"shape \(4, 2\) after one iteration. To allow the shape to vary "
           r"across iterations, use the `shape_invariants` argument of "
           r"tf.while_loop to specify a less-specific shape."):
-        r = control_flow_ops.while_loop(c, b, [i, m])
+        control_flow_ops.while_loop(c, b, [i, m])
 
-  @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
-  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
     values = constant_op.constant([2.0, 4.0], name="values")
     indices = constant_op.constant([[0], [3]],
@@ -1873,61 +1880,72 @@
               array_ops.concat([x.dense_shape, [10]], axis=0))
       ]
 
+    def check_shapes(r, indices, values, dense_shape):
+      self.assertTrue(r.indices.shape.is_compatible_with(indices))
+      self.assertTrue(r.values.shape.is_compatible_with(values))
+      self.assertTrue(r.dense_shape.shape.is_compatible_with(dense_shape))
+
     # Default shape invariant; b1 only modifies values.
     _, r = control_flow_ops.while_loop(c, b1, [i, x])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+    check_shapes(r, indices=[None, 1], values=[None], dense_shape=[1])
 
     # Default shape invariant; b2 adds new values
     _, r = control_flow_ops.while_loop(c, b2, [i, x])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
-
-    # Default shape invariant; b3 modifies rank (which is not allowed).
-    with self.assertRaises(ValueError):
-      _, r = control_flow_ops.while_loop(c, b3, [i, x])
+    check_shapes(r, indices=[None, 1], values=[None], dense_shape=[1])
 
     # Explicit shape invariant, allowing any rank; b1 only modifies values.
     _, r = control_flow_ops.while_loop(
         c, b1, [i, x],
         [i.get_shape(), tensor_shape.TensorShape([None])])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    check_shapes(r, indices=[None, None], values=[None], dense_shape=[None])
 
     # Explicit shape invariant, allowing any rank; b3 modifies rank.
     _, r = control_flow_ops.while_loop(
         c, b3, [i, x],
         [i.get_shape(), tensor_shape.TensorShape([None])])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    check_shapes(r, indices=[None, None], values=[None], dense_shape=[None])
 
     # Shape invariant with ndims=None.  Technically, this isn't supported
     # according to the docs, but we support it for backwards compatibility.
     _, r = control_flow_ops.while_loop(
         c, b1, [i, x],
         [i.get_shape(), tensor_shape.TensorShape(None)])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    check_shapes(r, indices=[None, None], values=[None], dense_shape=[None])
     _, r = control_flow_ops.while_loop(
         c, b3, [i, x],
         [i.get_shape(), tensor_shape.TensorShape(None)])
-    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
-    self.assertEqual(r.values.get_shape().as_list(), [None])
-    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    check_shapes(r, indices=[None, None], values=[None], dense_shape=[None])
+
+  @test_util.disable_control_flow_v2("b/131265085")
+  @test_util.run_v1_only("b/131265085")
+  def testWhileBadShapeSparseTensor(self):
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    i = constant_op.constant(0)
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+    c = lambda i, _: i < 10
+    b1 = lambda i, x: [i+1, x]
+    def b2(i, x):  # modifies rank.  (shape of all components is changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(
+              array_ops.concat([x.indices, [[i], [i]]], axis=1), x.values * 2.0,
+              array_ops.concat([x.dense_shape, [10]], axis=0))
+      ]
 
     # Explicit shape invariant, with a specific (incompatible) rank.
     with self.assertRaisesRegexp(ValueError, "is not compatible with"):
-      _, r = control_flow_ops.while_loop(
+      control_flow_ops.while_loop(
           c, b1, [i, x],
           [i.get_shape(), tensor_shape.TensorShape([5])])
 
-  @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
-  @test_util.run_v1_only("b/120545219")
+    # Default shape invariant, but b2 modifies rank (which is not allowed).
+    with self.assertRaises(ValueError):
+      control_flow_ops.while_loop(c, b2, [i, x])
+
   def testWhileShapeInferenceIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([[2.0, 4.0], [3.0, 5.0]], name="values")
@@ -1953,17 +1971,28 @@
           c, b, [i, x],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertEqual(r.dense_shape.get_shape()[0], 2)
-      self.assertEqual(r.values.get_shape().as_list(), [None, 2])
+      self.assertTrue(r.values.get_shape().is_compatible_with([None, 2]))
 
-      with self.assertRaisesRegexp(ValueError, "is not compatible with"):
-        _, r = control_flow_ops.while_loop(
-            c, b, [i, x],
-            [i.get_shape(), tensor_shape.TensorShape([None, 5])])
+  @test_util.disable_control_flow_v2("b/131265085")
+  @test_util.run_v1_only("b/131265085")
+  def testWhileBadShapeIndexedSlices(self):
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    i = constant_op.constant(0)
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+    c = lambda i, _: 10
+    b = lambda i, x: [i+1, x]
 
-  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+    # Explicit shape invariant, with a specific (incompatible) rank.
+    with self.assertRaisesRegexp(ValueError, "is not compatible with"):
+      control_flow_ops.while_loop(
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([5])])
+
   def testWhileShapeInferenceRaggedTensor(self):
-    if context.executing_eagerly():
-      self.skipTest("b/116328420")
     i = constant_op.constant(0)
     x = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
     c = lambda i, _: i < 10
@@ -1980,11 +2009,13 @@
           array_ops.concat([x, x], axis=0)
       ]
 
+    def check_shapes(r, values, splits):
+      self.assertTrue(r.values.shape.is_compatible_with(values))
+      self.assertTrue(r.row_splits.shape.is_compatible_with(splits))
+
     # Default shape invariant; b1 adds new values to rows.
     _, r = control_flow_ops.while_loop(c, b1, [i, x])
-    self.assertEqual(r.row_splits.shape.as_list(), [4])
-
-    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+    check_shapes(r, values=[None], splits=[4])
 
     # Default shape invariant; b2 adds new rows (not allowed).
     if not context.executing_eagerly():
@@ -1995,20 +2026,15 @@
     _, r = control_flow_ops.while_loop(
         c, b1, [i, x],
         [i.get_shape(), tensor_shape.TensorShape([None, None])])
-    self.assertTrue(r.row_splits.shape.as_list() in ([4], [None]))
-    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+    check_shapes(r, values=[None], splits=[None])
 
     # Explicit shape invariant; b2 adds new rows.
     _, r = control_flow_ops.while_loop(
         c, b2, [i, x],
         [i.get_shape(), tensor_shape.TensorShape([None, None])])
-    self.assertTrue(r.row_splits.shape.as_list() in ([3 * 2**10 + 1], [None]))
-    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+    check_shapes(r, values=[None], splits=[None])
 
-  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
   def testWhileShapeInferenceRaggedTensorRaggedRank2(self):
-    if context.executing_eagerly():
-      self.skipTest("b/116328420")
     i = constant_op.constant(0)
     x = ragged_factory_ops.constant([[[1, 2], [3], [4, 5, 6]],
                                      [[], [8, 9, 10]]])
@@ -2776,7 +2802,6 @@
 
       self.assertEqual(self.evaluate(fn()), 32.)
 
-  @test_util.disable_xla("b/128643381")
   def testWhileGrad_ResourceVarInFunctionCall(self):
 
     @def_function.function
@@ -2797,7 +2822,6 @@
     self.assertIsInstance(grad, ops.IndexedSlicesValue)
     self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 2., 0., 2.])
 
-  @test_util.disable_xla("b/128643461")
   def testWhileGrad_ResourceVarInNestedFunctionCall(self):
 
     @def_function.function
@@ -2848,7 +2872,6 @@
     self.assertIsInstance(grad, ops.IndexedSlicesValue)
     self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 6., 6., 0.])
 
-  @test_util.disable_xla("b/128639858")
   def testWhileCondGrad_ResourceVarInFunctionCall(self):
 
     @def_function.function
@@ -3043,7 +3066,6 @@
     self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(foo()), 9.0)
 
-  @test_util.disable_xla("b/128643398")
   def testNestedResourceAccess(self):
     var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
 
@@ -3477,8 +3499,7 @@
     self.assertEqual(0, value_x)
     self.assertEqual(73, value_x_grad)
 
-  @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
-  @test_util.run_v1_only("b/120545219")
+  @test_util.deprecated_graph_mode_only
   def testWhileGrad_IndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -3500,8 +3521,7 @@
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
-  @test_util.run_v1_only("b/120545219")
+  @test_util.deprecated_graph_mode_only
   def testWhileGrad_SparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -3524,7 +3544,7 @@
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.deprecated_graph_mode_only
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
       i0 = constant_op.constant(0)
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 9e90d10..e136d09 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -51,7 +51,7 @@
 
   def _DtypesToTest(self, use_gpu):
     if use_gpu:
-      if not test_util.CudaSupportsHalfMatMulAndConv():
+      if not test_util.GpuSupportsHalfMatMulAndConv():
         return [dtypes.float64, dtypes.float32]
       else:
         # It is important that float32 comes before float16 here,
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index e780548..1930ad5 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -163,7 +163,7 @@
 class Conv2DTest(test.TestCase):
 
   def _DtypesToTest(self, use_gpu):
-    if use_gpu and not test_util.CudaSupportsHalfMatMulAndConv():
+    if use_gpu and not test_util.GpuSupportsHalfMatMulAndConv():
       return [dtypes.float32, dtypes.float64]
     else:
       # It is important that float32 comes before float16 here,
@@ -403,7 +403,6 @@
         padding,
         expected,
         dilations,
-        gpu_only=True,
         test_grappler_layout_optimizer=test_grappler_layout_optimizer,
         tol=tol,
         fp16_tol=fp16_tol)
@@ -778,7 +777,8 @@
             stride_cols=1,
             test_input=test_input,
             data_format=data_format,
-            use_gpu=True)
+            use_gpu=True,
+            max_err=0.005)
 
   @test_util.deprecated_graph_mode_only
   @test_util.run_cuda_only
@@ -1428,8 +1428,14 @@
                                                 strides,
                                                 padding,
                                                 data_format,
+                                                use_gpu,
                                                 dilations=(1, 1),
                                                 err=2e-5):
+    if use_gpu and not test.is_gpu_available(cuda_only=True):
+      return
+    if not use_gpu and dilations != (1, 1):
+      return  # Non-default dilations are currently not supported on the CPU.
+
     x1 = self._CreateNumpyTensor(filter_sizes)
     x2 = self._CreateNumpyTensor(output_sizes)
     dilations = list(dilations)
@@ -1454,133 +1460,128 @@
         padding,
         expected,
         data_format,
-        use_gpu=True,
+        use_gpu=use_gpu,
         err=err,
         dilations=dilations)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding0x0BackpropInput(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 1, 2, 1],
-            strides=[1, 1],
-            padding=[[0, 0], [0, 0]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 1, 2, 1],
+          strides=[1, 1],
+          padding=[[0, 0], [0, 0]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 3, 4, 2],
-            filter_sizes=[2, 2, 2, 3],
-            output_sizes=[1, 1, 2, 3],
-            strides=[2, 2],
-            padding=[[0, 0], [0, 0]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 3, 4, 2],
+          filter_sizes=[2, 2, 2, 3],
+          output_sizes=[1, 1, 2, 3],
+          strides=[2, 2],
+          padding=[[0, 0], [0, 0]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding1x1BackpropInput(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 2],
-            output_sizes=[1, 3, 4, 2],
-            strides=[1, 1],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format, err=1e-4)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 2],
+          output_sizes=[1, 3, 4, 2],
+          strides=[1, 1],
+          padding=[[1, 1], [1, 1]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          err=1e-4)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 2, 3, 2],
-            filter_sizes=[1, 1, 2, 1],
-            output_sizes=[1, 4, 3, 1],
-            strides=[1, 2],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 2, 3, 2],
+          filter_sizes=[1, 1, 2, 1],
+          output_sizes=[1, 4, 3, 1],
+          strides=[1, 2],
+          padding=[[1, 1], [1, 1]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 4, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 4, 2, 1],
-            strides=[1, 2],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format,
-            dilations=[2, 2])
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 4, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 4, 2, 1],
+          strides=[1, 2],
+          padding=[[1, 1], [1, 1]],
+          data_format=data_format,
+          dilations=[2, 2], use_gpu=use_gpu)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding2x2BackpropInput(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[2, 3, 1, 1],
-            filter_sizes=[2, 1, 1, 1],
-            output_sizes=[2, 2, 5, 1],
-            strides=[3, 1],
-            padding=[[2, 2], [2, 2]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[2, 3, 1, 1],
+          filter_sizes=[2, 1, 1, 1],
+          output_sizes=[2, 2, 5, 1],
+          strides=[3, 1],
+          padding=[[2, 2], [2, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 3, 6, 1],
-            filter_sizes=[3, 2, 1, 1],
-            output_sizes=[1, 3, 4, 1],
-            strides=[1, 2],
-            padding=[[2, 2], [2, 2]],
-            data_format=data_format,
-            dilations=[2, 3])
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 3, 6, 1],
+          filter_sizes=[3, 2, 1, 1],
+          output_sizes=[1, 3, 4, 1],
+          strides=[1, 2],
+          padding=[[2, 2], [2, 2]],
+          data_format=data_format,
+          dilations=[2, 3],
+          use_gpu=use_gpu)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding_1_8_4_1_BackpropInput(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 10, 8, 1],
-            strides=[1, 1],
-            padding=[[1, 8], [4, 2]],
-            data_format=data_format, err=5e-5)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 10, 8, 1],
+          strides=[1, 1],
+          padding=[[1, 8], [4, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          err=5e-5)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 5, 3, 1],
-            filter_sizes=[3, 2, 1, 1],
-            output_sizes=[1, 4, 8, 1],
-            strides=[3, 1],
-            padding=[[1, 8], [4, 2]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 5, 3, 1],
+          filter_sizes=[3, 2, 1, 1],
+          output_sizes=[1, 4, 8, 1],
+          strides=[3, 1],
+          padding=[[1, 8], [4, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding_5_0_2_2_BackpropInput(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 3, 3, 1],
-            filter_sizes=[2, 1, 1, 1],
-            output_sizes=[1, 7, 7, 1],
-            strides=[1, 1],
-            padding=[[5, 0], [2, 2]],
-            data_format=data_format,
-            err=5e-5)
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 3, 3, 1],
+          filter_sizes=[2, 1, 1, 1],
+          output_sizes=[1, 7, 7, 1],
+          strides=[1, 1],
+          padding=[[5, 0], [2, 2]],
+          data_format=data_format,
+          err=5e-5,
+          use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropInputExplicitPadding(
-            input_sizes=[1, 4, 2, 1],
-            filter_sizes=[3, 3, 1, 1],
-            output_sizes=[1, 5, 2, 1],
-            strides=[1, 2],
-            padding=[[5, 0], [2, 2]],
-            data_format=data_format,
-            dilations=[2, 1])
+      self._RunAndVerifyBackpropInputExplicitPadding(
+          input_sizes=[1, 4, 2, 1],
+          filter_sizes=[3, 3, 1, 1],
+          output_sizes=[1, 5, 2, 1],
+          strides=[1, 2],
+          padding=[[5, 0], [2, 2]],
+          data_format=data_format,
+          dilations=[2, 1],
+          use_gpu=use_gpu)
 
   def _RunAndVerifyBackpropFilterExplicitPadding(self,
                                                  input_sizes,
@@ -1589,8 +1590,14 @@
                                                  strides,
                                                  padding,
                                                  data_format,
+                                                 use_gpu,
                                                  dilations=(1, 1),
                                                  err=1e-5):
+    if use_gpu and not test.is_gpu_available(cuda_only=True):
+      return
+    if not use_gpu and dilations != (1, 1):
+      return  # Non-default dilations are currently not supported on the CPU.
+
     x0 = self._CreateNumpyTensor(input_sizes)
     x2 = self._CreateNumpyTensor(output_sizes)
     dilations = list(dilations)
@@ -1612,135 +1619,127 @@
         padding,
         expected,
         data_format,
-        use_gpu=True,
+        use_gpu=use_gpu,
         dilations=dilations,
         err=err)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding0x0BackpropFilter(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 1, 2, 1],
-            strides=[1, 1],
-            padding=[[0, 0], [0, 0]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 1, 2, 1],
+          strides=[1, 1],
+          padding=[[0, 0], [0, 0]],
+          data_format=data_format, use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 3, 4, 2],
-            filter_sizes=[2, 2, 2, 3],
-            output_sizes=[1, 1, 2, 3],
-            strides=[2, 2],
-            padding=[[0, 0], [0, 0]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 3, 4, 2],
+          filter_sizes=[2, 2, 2, 3],
+          output_sizes=[1, 1, 2, 3],
+          strides=[2, 2],
+          padding=[[0, 0], [0, 0]],
+          data_format=data_format, use_gpu=use_gpu)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding1x1BackpropFilter(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 2],
-            output_sizes=[1, 3, 4, 2],
-            strides=[1, 1],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format,
-            err=5e-5)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 2],
+          output_sizes=[1, 3, 4, 2],
+          strides=[1, 1],
+          padding=[[1, 1], [1, 1]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          err=5e-5)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 2, 3, 2],
-            filter_sizes=[1, 1, 2, 1],
-            output_sizes=[1, 4, 3, 1],
-            strides=[1, 2],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 2, 3, 2],
+          filter_sizes=[1, 1, 2, 1],
+          output_sizes=[1, 4, 3, 1],
+          strides=[1, 2],
+          padding=[[1, 1], [1, 1]],
+          use_gpu=use_gpu,
+          data_format=data_format)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 4, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 4, 2, 1],
-            strides=[1, 2],
-            padding=[[1, 1], [1, 1]],
-            data_format=data_format,
-            dilations=[2, 2])
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 4, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 4, 2, 1],
+          strides=[1, 2],
+          padding=[[1, 1], [1, 1]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          dilations=[2, 2])
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding2x2BackpropFilter(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[2, 3, 1, 1],
-            filter_sizes=[2, 1, 1, 1],
-            output_sizes=[2, 2, 5, 1],
-            strides=[3, 1],
-            padding=[[2, 2], [2, 2]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[2, 3, 1, 1],
+          filter_sizes=[2, 1, 1, 1],
+          output_sizes=[2, 2, 5, 1],
+          strides=[3, 1],
+          padding=[[2, 2], [2, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 3, 6, 1],
-            filter_sizes=[3, 2, 1, 1],
-            output_sizes=[1, 3, 4, 1],
-            strides=[1, 2],
-            padding=[[2, 2], [2, 2]],
-            data_format=data_format,
-            dilations=[2, 3])
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 3, 6, 1],
+          filter_sizes=[3, 2, 1, 1],
+          output_sizes=[1, 3, 4, 1],
+          strides=[1, 2],
+          padding=[[2, 2], [2, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          dilations=[2, 3])
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding_1_8_4_1_BackpropFilter(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 2, 3, 1],
-            filter_sizes=[2, 2, 1, 1],
-            output_sizes=[1, 10, 8, 1],
-            strides=[1, 1],
-            padding=[[1, 8], [4, 2]],
-            data_format=data_format,
-            err=1e-4)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[1, 10, 8, 1],
+          strides=[1, 1],
+          padding=[[1, 8], [4, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          err=1e-4)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 5, 3, 1],
-            filter_sizes=[3, 2, 1, 1],
-            output_sizes=[1, 4, 8, 1],
-            strides=[3, 1],
-            padding=[[1, 8], [4, 2]],
-            data_format=data_format)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 5, 3, 1],
+          filter_sizes=[3, 2, 1, 1],
+          output_sizes=[1, 4, 8, 1],
+          strides=[3, 1],
+          padding=[[1, 8], [4, 2]],
+          use_gpu=use_gpu,
+          data_format=data_format)
 
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth1Padding_5_0_2_2_BackpropFilter(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 3, 3, 1],
-            filter_sizes=[2, 1, 1, 1],
-            output_sizes=[1, 7, 7, 1],
-            strides=[1, 1],
-            padding=[[5, 0], [2, 2]],
-            data_format=data_format,
-            err=1e-4)
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 3, 3, 1],
+          filter_sizes=[2, 1, 1, 1],
+          output_sizes=[1, 7, 7, 1],
+          strides=[1, 1],
+          padding=[[5, 0], [2, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          err=1e-4)
 
-        self._RunAndVerifyBackpropFilterExplicitPadding(
-            input_sizes=[1, 4, 2, 1],
-            filter_sizes=[3, 3, 1, 1],
-            output_sizes=[1, 5, 2, 1],
-            strides=[1, 2],
-            padding=[[5, 0], [2, 2]],
-            data_format=data_format,
-            dilations=[2, 1])
+      self._RunAndVerifyBackpropFilterExplicitPadding(
+          input_sizes=[1, 4, 2, 1],
+          filter_sizes=[3, 3, 1, 1],
+          output_sizes=[1, 5, 2, 1],
+          strides=[1, 2],
+          padding=[[5, 0], [2, 2]],
+          data_format=data_format,
+          use_gpu=use_gpu,
+          dilations=[2, 1])
 
   # Gradient checkers
   def ConstructAndTestGradient(self,
@@ -2106,257 +2105,221 @@
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient1x1PaddingStrideOne(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=5,
-            input_cols=4,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=1,
-            stride_cols=1,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu,
-            max_err=0.0025)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=5,
+          input_cols=4,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=1,
+          stride_cols=1,
+          padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu,
+          max_err=0.0025)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient1x1PaddingStrideOne(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=5,
-            input_cols=4,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=1,
-            stride_cols=1,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=5,
+          input_cols=4,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=1,
+          stride_cols=1,
+          padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient1x1PaddingStrideTwo(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=4,
-            input_cols=5,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=2,
-            stride_cols=2,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=4,
+          input_cols=5,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=2,
+          stride_cols=2,
+          padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient1x1PaddingStrideTwo(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=4,
-            input_cols=5,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=2,
-            stride_cols=2,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=4,
+          input_cols=5,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=2,
+          stride_cols=2,
+          padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient2x2PaddingStrideOne(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=5,
-            input_cols=4,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=1,
-            stride_cols=1,
-            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=5,
+          input_cols=4,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=1,
+          stride_cols=1,
+          padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient2x2PaddingStrideOne(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=5,
-            input_cols=4,
-            filter_rows=3,
-            filter_cols=3,
-            in_depth=2,
-            out_depth=3,
-            stride_rows=1,
-            stride_cols=1,
-            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu,
-            max_err=0.003)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=5,
+          input_cols=4,
+          filter_rows=3,
+          filter_cols=3,
+          in_depth=2,
+          out_depth=3,
+          stride_rows=1,
+          stride_cols=1,
+          padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu,
+          max_err=0.003)
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient1_2_3_4PaddingStride3x2(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=8,
-            input_cols=5,
-            filter_rows=4,
-            filter_cols=2,
-            in_depth=3,
-            out_depth=2,
-            stride_rows=3,
-            stride_cols=2,
-            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=8,
+          input_cols=5,
+          filter_rows=4,
+          filter_cols=2,
+          in_depth=3,
+          out_depth=2,
+          stride_rows=3,
+          stride_cols=2,
+          padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient1_2_3_4PaddingStride3x2(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=8,
-            input_cols=5,
-            filter_rows=4,
-            filter_cols=2,
-            in_depth=3,
-            out_depth=2,
-            stride_rows=3,
-            stride_cols=2,
-            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=8,
+          input_cols=5,
+          filter_rows=4,
+          filter_cols=2,
+          in_depth=3,
+          out_depth=2,
+          stride_rows=3,
+          stride_cols=2,
+          padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient4_3_2_1PaddingStride2x1(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=3,
-            input_rows=5,
-            input_cols=7,
-            filter_rows=3,
-            filter_cols=2,
-            in_depth=1,
-            out_depth=2,
-            stride_rows=2,
-            stride_cols=1,
-            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=3,
+          input_rows=5,
+          input_cols=7,
+          filter_rows=3,
+          filter_cols=2,
+          in_depth=1,
+          out_depth=2,
+          stride_rows=2,
+          stride_cols=1,
+          padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient4_3_2_1PaddingStride2x1(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=3,
-            input_rows=5,
-            input_cols=7,
-            filter_rows=3,
-            filter_cols=2,
-            in_depth=1,
-            out_depth=2,
-            stride_rows=2,
-            stride_cols=1,
-            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=3,
+          input_rows=5,
+          input_cols=7,
+          filter_rows=3,
+          filter_cols=2,
+          in_depth=1,
+          out_depth=2,
+          stride_rows=2,
+          stride_cols=1,
+          padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testInputGradient0_0_0_5PaddingStride1x2(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=6,
-            input_cols=7,
-            filter_rows=3,
-            filter_cols=4,
-            in_depth=3,
-            out_depth=2,
-            stride_rows=1,
-            stride_cols=2,
-            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
-            test_input=True,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=6,
+          input_cols=7,
+          filter_rows=3,
+          filter_cols=4,
+          in_depth=3,
+          out_depth=2,
+          stride_rows=1,
+          stride_cols=2,
+          padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+          test_input=True,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testFilterGradient0_0_0_5PaddingStride1x2(self):
-    if not test.is_gpu_available(cuda_only=True):
-      return
     for (data_format, use_gpu) in GetTestConfigs():
-      if use_gpu:
-        self.ConstructAndTestGradient(
-            batch=2,
-            input_rows=6,
-            input_cols=7,
-            filter_rows=3,
-            filter_cols=4,
-            in_depth=3,
-            out_depth=2,
-            stride_rows=1,
-            stride_cols=2,
-            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
-            test_input=False,
-            data_format=data_format,
-            use_gpu=use_gpu)
+      self.ConstructAndTestGradient(
+          batch=2,
+          input_rows=6,
+          input_cols=7,
+          filter_rows=3,
+          filter_cols=4,
+          in_depth=3,
+          out_depth=2,
+          stride_rows=1,
+          stride_cols=2,
+          padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+          test_input=False,
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   @test_util.deprecated_graph_mode_only
   def testShapeFunctionEdgeCases(self):
@@ -2504,31 +2467,29 @@
                 strides=[1, 1, 1, 1],
                 padding=[[0, 0], [2, 2], [2, 2], [0, 0]]))
 
-    if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
-        # Negative padding during backprop.
-        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                     "nonnegative"):
-          sess.run(
-              nn_ops.conv2d_backprop_input([32, 20, 20, 3],
-                                           array_ops.placeholder(
-                                               dtypes.float32,
-                                               shape=[18, 18, 3, 2]),
-                                           array_ops.placeholder(
-                                               dtypes.float32,
-                                               shape=[32, 3, 2, 2]),
-                                           strides=[1, 1, 1, 1],
-                                           padding=[[0, 0], [-1, 0], [0, 0],
-                                                    [0, 0]]))
-        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                     "nonnegative"):
-          sess.run(
-              nn_ops.conv2d_backprop_filter(
-                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
-                  [18, 18, 3, 2],
-                  array_ops.placeholder(dtypes.float32, shape=[32, 3, 2, 2]),
-                  strides=[1, 1, 1, 1],
-                  padding=[[0, 0], [-1, 0], [0, 0], [0, 0]]))
+      # Negative padding during backprop.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "nonnegative"):
+        sess.run(
+            nn_ops.conv2d_backprop_input([32, 20, 20, 3],
+                                         array_ops.placeholder(
+                                             dtypes.float32,
+                                             shape=[18, 18, 3, 2]),
+                                         array_ops.placeholder(
+                                             dtypes.float32,
+                                             shape=[32, 3, 2, 2]),
+                                         strides=[1, 1, 1, 1],
+                                         padding=[[0, 0], [-1, 0], [0, 0],
+                                                  [0, 0]]))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "nonnegative"):
+        sess.run(
+            nn_ops.conv2d_backprop_filter(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                [18, 18, 3, 2],
+                array_ops.placeholder(dtypes.float32, shape=[32, 3, 2, 2]),
+                strides=[1, 1, 1, 1],
+                padding=[[0, 0], [-1, 0], [0, 0], [0, 0]]))
 
 
 class DepthwiseConv2DTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/critical_section_test.py b/tensorflow/python/kernel_tests/critical_section_test.py
index 7b1519c..cc719d6 100644
--- a/tensorflow/python/kernel_tests/critical_section_test.py
+++ b/tensorflow/python/kernel_tests/critical_section_test.py
@@ -56,6 +56,7 @@
                         sorted(r_value))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.xla_allow_fallback("b/128495870")
   def testCriticalSectionWithControlFlow(self):
     for outer_cond in [False, True]:
       for inner_cond in [False, True]:
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index 352dede..edbbb8b 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -20,6 +20,8 @@
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -305,7 +307,7 @@
 
 class CTCLossTestV2(test.TestCase):
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_in_graph_and_eager_modes
   def testCtcLossV2(self):
     random_seed.set_random_seed(5)
 
@@ -326,17 +328,21 @@
     labels *= label_mask
     logit_length = [num_frames] * batch_size
 
-    ref_loss = ctc_ops.ctc_loss_v2(
-        labels=labels,
-        logits=logits,
-        label_length=label_length,
-        logit_length=logit_length)
-    ref_grad = gradients_impl.gradients(ref_loss, [logits])
+    with backprop.GradientTape() as t:
+      t.watch(logits)
+      ref_loss = ctc_ops.ctc_loss_v2(
+          labels=labels,
+          logits=logits,
+          label_length=label_length,
+          logit_length=logit_length)
+    ref_grad = t.gradient(ref_loss, [logits])
 
     sparse_labels = ctc_ops.dense_labels_to_sparse(labels, label_length)
 
     def assert_same_loss_and_grads(loss):
-      with self.cached_session() as sess:
+      if context.executing_eagerly():
+        return
+      with self.cached_session():
         self.assertAllClose(*self.evaluate([loss, ref_loss]))
         grad = gradients_impl.gradients(loss, [logits])
         self.assertAllClose(
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index c73ab7c..1cb9cfa 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -827,6 +827,7 @@
     self._compare_values(x, y=y)
 
   def testTypes(self):
+    self.skipTest("b/131162241")
     for dtype in [np.float16, np.float32, np.float64]:
       self._testDtype(dtype)
 
@@ -1116,7 +1117,7 @@
 
   @test_util.run_deprecated_v1
   def testGradientAtSingularity(self):
-    if not compat.forward_compatible(2019, 5, 14):
+    if not compat.forward_compatible(2019, 6, 14):
       self.skipTest("Skipping test for future functionality.")
 
     ops_and_singularity = [
diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
index 709a20f..f3ae548 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
@@ -22,6 +22,7 @@
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
@@ -29,6 +30,7 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -542,6 +544,24 @@
           for analytical, numerical in grads:
             self.assertAllClose(analytical, numerical, rtol=tol, atol=tol)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testComplexAbsGradGrad(self):
+
+    def f(x):
+      real = math_ops.cos(x)
+      imag = ops.convert_to_tensor(1.)
+      return math_ops.abs(math_ops.complex(real, imag))
+
+    def g(x):
+      with backprop.GradientTape() as t:
+        t.watch(x)
+        y = f(x)
+      return t.gradient(y, x)
+
+    err = gradient_checker_v2.max_error(
+        *gradient_checker_v2.compute_gradient(g, [ops.convert_to_tensor(2.0)]))
+    self.assertLess(err, 1e-3)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index d97fcfa..14f8280 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -295,8 +295,9 @@
       # grad_eval.shape = (N, N), with grad_eval[i, j] the partial derivative of
       # the ith output point w.r.t. the jth grid point.  We only expect the
       # diagonal to be nonzero.
-      # TODO(b/31131137): Replace tf.test.compute_gradient with our own custom
-      # gradient evaluation to ensure we correctly handle small function delta.
+      # TODO(b/31131137): Replace tf.compat.v1.test.compute_gradient with our
+      # own custom gradient evaluation to ensure we correctly handle small
+      # function delta.
       grad_eval, _ = gradient_checker.compute_gradient(grid, grid_spec.shape,
                                                        fn(grid),
                                                        grid_spec.shape)
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
index 7ba2dc6..cfc3ea7 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
@@ -118,7 +118,8 @@
                                               rates=[1, 1, 1, 1],
                                               padding='SAME')
     # Github issue: #20146
-    # tf.extract_image_patches() gradient very slow at graph construction time
+    # tf.image.extract_image_patches() gradient very slow at graph construction
+    # time
     gradients = gradients_impl.gradients(patches, images)
     # Won't time out.
     self.assertIsNotNone(gradients)
diff --git a/tensorflow/python/kernel_tests/fingerprint_op_test.py b/tensorflow/python/kernel_tests/fingerprint_op_test.py
new file mode 100644
index 0000000..0af3f51
--- /dev/null
+++ b/tensorflow/python/kernel_tests/fingerprint_op_test.py
@@ -0,0 +1,42 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.fingerprint_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+# The Fingerprint op has C++ tests. This simple test case only checks that the
+# fingerprint function is accessible via the Python API.
+class FingerprintTest(test.TestCase):
+
+  def test_default_values(self):
+    data = np.arange(10)
+    data = np.expand_dims(data, axis=0)
+    fingerprint0 = self.evaluate(array_ops.fingerprint(data))
+    fingerprint1 = self.evaluate(array_ops.fingerprint(data[:, 1:]))
+    self.assertEqual(fingerprint0.ndim, 2)
+    self.assertTupleEqual(fingerprint0.shape, fingerprint1.shape)
+    self.assertTrue(np.any(fingerprint0 != fingerprint1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 9168304..29e0653 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -23,6 +23,7 @@
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import def_function as eager_def_function
 from tensorflow.python.eager import function as eager_function
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
@@ -944,6 +945,35 @@
 class PartitionedCallTest(test.TestCase):
 
   @test_util.run_deprecated_v1
+  def testRemoteDeviceInPartitionedCallOp(self):
+    workers, _ = test_util.create_local_cluster(2, 0)
+
+    worker0_device = "/job:worker/replica:0/task:0/cpu:0"
+    worker1_device = "/job:worker/replica:0/task:1/cpu:0"
+
+    @eager_def_function.function
+    def f(a, b):
+      return a + b
+
+    with session.Session(workers[0].target) as sess:
+      with ops.device(worker0_device):
+        a = variable_scope.get_variable(
+            "a", initializer=constant_op.constant(1.), use_resource=True)
+      with ops.device(worker1_device):
+        b = variable_scope.get_variable(
+            "b", initializer=constant_op.constant(1.), use_resource=True)
+
+      sess.run(variables.global_variables_initializer())
+
+    config = config_pb2.ConfigProto()
+    config.experimental.share_cluster_devices_in_session = True
+
+    with session.Session(workers[0].target, config=config) as sess:
+      res = sess.run(f(a, b))
+
+    self.assertEqual(res, 2)
+
+  @test_util.run_deprecated_v1
   def testBasicSingleDevice(self):
 
     @function.Defun(*[dtypes.float32] * 2)
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index af91dd0..f4044ed 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -272,7 +272,8 @@
           expected=[[[[8, 9], [9, 8]], [[8, 8], [9, 9]]],
                     [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
 
-      # batch_dims=indices.shape.ndims - 1 (equivalent to tf.batch_gather)
+      # batch_dims=indices.shape.ndims - 1
+      # (equivalent to tf.compat.v1.batch_gather)
       dict(  # 2D indices (1 batch dim)
           batch_dims=1,
           params=[[10, 11, 12, 13], [20, 21, 22, 23]],
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index c1aa99c..de68fbb 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -145,7 +145,7 @@
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/signal",
     ],
-    shard_count = 5,
+    shard_count = 10,
     tags = [
         "noasan",  # times out, b/63678675
         "optonly",  # times out, b/79171797
@@ -177,6 +177,29 @@
 )
 
 cuda_py_test(
+    name = "linear_operator_householder_test",
+    size = "medium",
+    srcs = ["linear_operator_householder_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",
+        "optonly",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
     name = "linear_operator_identity_test",
     size = "medium",
     srcs = ["linear_operator_identity_test.py"],
@@ -257,7 +280,7 @@
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 8,
+    shard_count = 10,
     tags = [
         "noasan",
         "optonly",
@@ -287,7 +310,7 @@
 
 cuda_py_test(
     name = "linear_operator_low_rank_update_test",
-    size = "large",
+    size = "medium",
     srcs = ["linear_operator_low_rank_update_test.py"],
     additional_deps = [
         "//tensorflow/python/ops/linalg",
@@ -298,7 +321,7 @@
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
+    shard_count = 10,
     tags = [
         "noasan",  # times out
         "optonly",
@@ -328,6 +351,31 @@
 )
 
 cuda_py_test(
+    name = "linear_operator_toeplitz_test",
+    size = "medium",
+    srcs = ["linear_operator_toeplitz_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out, b/79171797
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
     name = "linear_operator_zeros_test",
     size = "medium",
     srcs = ["linear_operator_zeros_test.py"],
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
index f70d8c4..d305277 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -17,6 +17,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -37,11 +39,11 @@
     self._atol[dtypes.complex64] = 1e-5
     self._rtol[dtypes.complex64] = 1e-5
 
-  def _operator_and_matrix(self,
-                           build_info,
-                           dtype,
-                           use_placeholder,
-                           ensure_self_adjoint_and_pd=False):
+  def operator_and_matrix(self,
+                          build_info,
+                          dtype,
+                          use_placeholder,
+                          ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
 
     if ensure_self_adjoint_and_pd:
@@ -113,12 +115,116 @@
 
     self.assertEqual("my_operator_adjoint", operator.name)
 
+  def test_matmul_adjoint_operator(self):
+    matrix1 = np.random.randn(4, 4)
+    matrix2 = np.random.randn(4, 4)
+    full_matrix1 = linalg.LinearOperatorFullMatrix(matrix1)
+    full_matrix2 = linalg.LinearOperatorFullMatrix(matrix2)
+
+    self.assertAllClose(
+        np.matmul(matrix1, matrix2.T),
+        self.evaluate(
+            full_matrix1.matmul(full_matrix2, adjoint_arg=True).to_dense()))
+
+    self.assertAllClose(
+        np.matmul(matrix1.T, matrix2),
+        self.evaluate(
+            full_matrix1.matmul(full_matrix2, adjoint=True).to_dense()))
+
+    self.assertAllClose(
+        np.matmul(matrix1.T, matrix2.T),
+        self.evaluate(
+            full_matrix1.matmul(
+                full_matrix2, adjoint=True, adjoint_arg=True).to_dense()))
+
+  def test_matmul_adjoint_complex_operator(self):
+    matrix1 = np.random.randn(4, 4) + 1j * np.random.randn(4, 4)
+    matrix2 = np.random.randn(4, 4) + 1j * np.random.randn(4, 4)
+    full_matrix1 = linalg.LinearOperatorFullMatrix(matrix1)
+    full_matrix2 = linalg.LinearOperatorFullMatrix(matrix2)
+
+    self.assertAllClose(
+        np.matmul(matrix1, matrix2.conj().T),
+        self.evaluate(
+            full_matrix1.matmul(full_matrix2, adjoint_arg=True).to_dense()))
+
+    self.assertAllClose(
+        np.matmul(matrix1.conj().T, matrix2),
+        self.evaluate(
+            full_matrix1.matmul(full_matrix2, adjoint=True).to_dense()))
+
+    self.assertAllClose(
+        np.matmul(matrix1.conj().T, matrix2.conj().T),
+        self.evaluate(
+            full_matrix1.matmul(
+                full_matrix2, adjoint=True, adjoint_arg=True).to_dense()))
+
+  def test_solve_adjoint_operator(self):
+    matrix1 = self.evaluate(
+        linear_operator_test_util.random_tril_matrix(
+            [4, 4], dtype=dtypes.float64, force_well_conditioned=True))
+    matrix2 = np.random.randn(4, 4)
+    full_matrix1 = linalg.LinearOperatorLowerTriangular(
+        matrix1, is_non_singular=True)
+    full_matrix2 = linalg.LinearOperatorFullMatrix(matrix2)
+
+    self.assertAllClose(
+        self.evaluate(linalg.triangular_solve(matrix1, matrix2.T)),
+        self.evaluate(
+            full_matrix1.solve(full_matrix2, adjoint_arg=True).to_dense()))
+
+    self.assertAllClose(
+        self.evaluate(
+            linalg.triangular_solve(
+                matrix1.T, matrix2, lower=False)),
+        self.evaluate(
+            full_matrix1.solve(full_matrix2, adjoint=True).to_dense()))
+
+    self.assertAllClose(
+        self.evaluate(
+            linalg.triangular_solve(matrix1.T, matrix2.T, lower=False)),
+        self.evaluate(
+            full_matrix1.solve(
+                full_matrix2, adjoint=True, adjoint_arg=True).to_dense()))
+
+  def test_solve_adjoint_complex_operator(self):
+    matrix1 = self.evaluate(linear_operator_test_util.random_tril_matrix(
+        [4, 4], dtype=dtypes.complex128, force_well_conditioned=True) +
+                            1j * linear_operator_test_util.random_tril_matrix(
+                                [4, 4], dtype=dtypes.complex128,
+                                force_well_conditioned=True))
+    matrix2 = np.random.randn(4, 4) + 1j * np.random.randn(4, 4)
+
+    full_matrix1 = linalg.LinearOperatorLowerTriangular(
+        matrix1, is_non_singular=True)
+    full_matrix2 = linalg.LinearOperatorFullMatrix(matrix2)
+
+    self.assertAllClose(
+        self.evaluate(linalg.triangular_solve(matrix1, matrix2.conj().T)),
+        self.evaluate(
+            full_matrix1.solve(full_matrix2, adjoint_arg=True).to_dense()))
+
+    self.assertAllClose(
+        self.evaluate(
+            linalg.triangular_solve(
+                matrix1.conj().T, matrix2, lower=False)),
+        self.evaluate(
+            full_matrix1.solve(full_matrix2, adjoint=True).to_dense()))
+
+    self.assertAllClose(
+        self.evaluate(
+            linalg.triangular_solve(
+                matrix1.conj().T, matrix2.conj().T, lower=False)),
+        self.evaluate(
+            full_matrix1.solve(
+                full_matrix2, adjoint=True, adjoint_arg=True).to_dense()))
+
 
 class LinearOperatorAdjointNonSquareTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
   """Tests done in the base class NonSquareLinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape_before_adjoint = list(build_info.shape)
     # We need to swap the last two dimensions because we are taking the adjoint
     # of this operator
@@ -139,4 +245,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorAdjointTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index 12da865..8057d05 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -23,6 +23,7 @@
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_algebra
 from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable=unused-import
+from tensorflow.python.ops.linalg import solve_registrations  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
@@ -34,6 +35,8 @@
 _registered_inverse = linear_operator_algebra._registered_inverse
 _MATMUL = linear_operator_algebra._MATMUL
 _registered_matmul = linear_operator_algebra._registered_matmul
+_SOLVE = linear_operator_algebra._SOLVE
+_registered_solve = linear_operator_algebra._registered_solve
 # pylint: enable=protected-access
 
 
@@ -175,6 +178,55 @@
       self.assertEqual(v, _registered_matmul(k[0], k[1]))
 
 
+class SolveTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _solve(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register a Solve function that ignores its arguments and returns "OK".
+    @linear_operator_algebra.RegisterSolve(CustomLinOp, CustomLinOp)
+    def _solve(a, b):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.solve(custom_linop))
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterSolve(CustomLinOp, CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterSolve(
+        CustomLinOp, CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterSolve(
+          CustomLinOp, CustomLinOp)(lambda a: None)
+
+  def testExactSolveRegistrationsAllMatch(self):
+    for (k, v) in _SOLVE.items():
+      self.assertEqual(v, _registered_solve(k[0], k[1]))
+
+
 class InverseTest(test.TestCase):
 
   def testRegistration(self):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 28f8d20..a00e61c 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -68,23 +68,23 @@
     self._rtol[dtypes.complex64] = 1e-4
 
   @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
+  def operator_shape_infos(self):
+    shape_info = linear_operator_test_util.OperatorShapeInfo
     return [
-        build_info((0, 0)),
-        build_info((1, 1)),
-        build_info((1, 3, 3)),
-        build_info((5, 5), blocks=[(2, 2), (3, 3)]),
-        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
-        build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
+        shape_info((0, 0)),
+        shape_info((1, 1)),
+        shape_info((1, 3, 3)),
+        shape_info((5, 5), blocks=[(2, 2), (3, 3)]),
+        shape_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
+        shape_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
-    shape = list(build_info.shape)
+    shape = list(shape_info.shape)
     expected_blocks = (
-        build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
+        shape_info.__dict__["blocks"] if "blocks" in shape_info.__dict__
         else [shape])
     matrices = [
         linear_operator_test_util.random_positive_definite_matrix(
@@ -111,7 +111,7 @@
     self.assertTrue(operator.is_square)
 
     # Broadcast the shapes.
-    expected_shape = list(build_info.shape)
+    expected_shape = list(shape_info.shape)
 
     matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
 
@@ -258,4 +258,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(SquareLinearOperatorBlockDiagTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
index fbf20c0..bd01406 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -38,6 +38,21 @@
 class LinearOperatorCirculantBaseTest(object):
   """Common class for circulant tests."""
 
+  _atol = {
+      dtypes.float16: 1e-3,
+      dtypes.float32: 1e-6,
+      dtypes.float64: 1e-7,
+      dtypes.complex64: 1e-6,
+      dtypes.complex128: 1e-7
+  }
+  _rtol = {
+      dtypes.float16: 1e-3,
+      dtypes.float32: 1e-6,
+      dtypes.float64: 1e-7,
+      dtypes.complex64: 1e-6,
+      dtypes.complex128: 1e-7
+  }
+
   @contextlib.contextmanager
   def _constrain_devices_and_set_default(self, sess, use_gpu, force_gpu):
     """We overwrite the FFT operation mapping for testing."""
@@ -77,7 +92,7 @@
       x = np.zeros([domain_dimension])
       # x is a basis vector.
       x[m] = 1.0
-      fft_x = fft_ops.fft(x.astype(np.complex64))
+      fft_x = fft_ops.fft(math_ops.cast(x, spectrum.dtype))
       h_convolve_x = fft_ops.ifft(spectrum * fft_x)
       matrix_rows.append(h_convolve_x)
     matrix = array_ops.stack(matrix_rows, axis=-1)
@@ -93,22 +108,24 @@
   Note that when the spectrum is real, the operator may still be complex.
   """
 
-  @property
-  def _dtypes_to_test(self):
+  @staticmethod
+  def dtypes_to_test():
     # This operator will always be complex because, although the spectrum is
     # real, the matrix will not be real.
-    return [dtypes.complex64]
+    return [dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
-    shape = build_info.shape
+    shape = shape_info.shape
     # For this test class, we are creating real spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
     #
     # spectrum is bounded away from zero.
     spectrum = linear_operator_test_util.random_sign_uniform(
-        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+        shape=self._shape_to_spectrum_shape(shape),
+        minval=1.,
+        maxval=2.)
     if ensure_self_adjoint_and_pd:
       spectrum = math_ops.abs(spectrum)
     # If dtype is complex, cast spectrum to complex.  The imaginary part will be
@@ -153,20 +170,20 @@
   zero imaginary part.
   """
 
-  @property
-  def _dtypes_to_test(self):
-    return [dtypes.float32, dtypes.complex64]
-
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
-    shape = build_info.shape
+    shape = shape_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
     #
     # pre_spectrum is bounded away from zero.
     pre_spectrum = linear_operator_test_util.random_uniform(
-        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtype,
+        minval=1.,
+        maxval=2.)
+    pre_spectrum = math_ops.cast(math_ops.abs(pre_spectrum), dtype=dtype)
     pre_spectrum_c = _to_complex(pre_spectrum)
 
     # Real{IFFT[pre_spectrum]}
@@ -220,25 +237,25 @@
   We test only complex dtypes here.
   """
 
-  @property
-  def _dtypes_to_test(self):
-    return [dtypes.complex64]
+  @staticmethod
+  def dtypes_to_test():
+    return [dtypes.complex64, dtypes.complex128]
 
   # Skip Cholesky since we are explicitly testing non-hermitian
   # spectra.
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     return ["cholesky"]
 
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     del ensure_self_adjoint_and_pd
-    shape = build_info.shape
+    shape = shape_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
         shape=self._shape_to_spectrum_shape(shape),
-        dtype=dtypes.complex64,
+        dtype=dtype,
         minval=1.,
         maxval=2.)
 
@@ -375,16 +392,16 @@
       with spectral_ops_test_util.fft_kernel_label_map():
         yield sess
 
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shape_info = linear_operator_test_util.OperatorShapesInfo
     # non-batch operators (n, n) and batch operators.
     return [
-        build_info((0, 0)),
-        build_info((1, 1)),
-        build_info((1, 6, 6)),
-        build_info((3, 4, 4)),
-        build_info((2, 1, 3, 3))
+        shape_info((0, 0)),
+        shape_info((1, 1)),
+        shape_info((1, 6, 6)),
+        shape_info((3, 4, 4)),
+        shape_info((2, 1, 3, 3))
     ]
 
   def _shape_to_spectrum_shape(self, shape):
@@ -435,7 +452,7 @@
         x = np.zeros(block_shape)
         # x is a basis vector.
         x[n0, n1] = 1.0
-        fft_x = fft_ops.fft2d(x.astype(np.complex64))
+        fft_x = fft_ops.fft2d(math_ops.cast(x, spectrum.dtype))
         h_convolve_x = fft_ops.ifft2d(spectrum * fft_x)
         # We want the flat version of the action of the operator on a basis
         # vector, not the block version.
@@ -455,20 +472,19 @@
   zero imaginary part.
   """
 
-  @property
-  def _dtypes_to_test(self):
-    return [dtypes.float32, dtypes.complex64]
-
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
-    shape = build_info.shape
+    shape = shape_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
     #
     # pre_spectrum is bounded away from zero.
     pre_spectrum = linear_operator_test_util.random_uniform(
-        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtype,
+        minval=1.,
+        maxval=2.)
     pre_spectrum_c = _to_complex(pre_spectrum)
 
     # Real{IFFT[pre_spectrum]}
@@ -509,19 +525,19 @@
   We test only complex dtypes here.
   """
 
-  @property
-  def _dtypes_to_test(self):
-    return [dtypes.complex64]
+  @staticmethod
+  def dtypes_to_test():
+    return [dtypes.complex64, dtypes.complex128]
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     return ["cholesky"]
 
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shape_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     del ensure_self_adjoint_and_pd
-    shape = build_info.shape
+    shape = shape_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
         shape=self._shape_to_spectrum_shape(shape),
@@ -549,8 +565,7 @@
       operator = linalg.LinearOperatorCirculant(spectrum)
 
       matrix_tensor = operator.to_dense()
-      self.assertEqual(matrix_tensor.dtype,
-                       linear_operator_circulant._DTYPE_COMPLEX)
+      self.assertEqual(matrix_tensor.dtype, dtypes.complex64)
       matrix_t = array_ops.matrix_transpose(matrix_tensor)
       imag_matrix = math_ops.imag(matrix_tensor)
       matrix, matrix_transpose, imag_matrix = sess.run(
@@ -561,15 +576,14 @@
 
   @test_util.run_v1_only("b/120545219")
   def test_real_spectrum_gives_self_adjoint_operator(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       # This is a real and hermitian spectrum.
       spectrum = linear_operator_test_util.random_normal(
           shape=(3, 3), dtype=dtypes.float32)
       operator = linalg.LinearOperatorCirculant2D(spectrum)
 
       matrix_tensor = operator.to_dense()
-      self.assertEqual(matrix_tensor.dtype,
-                       linear_operator_circulant._DTYPE_COMPLEX)
+      self.assertEqual(matrix_tensor.dtype, dtypes.complex64)
       matrix_h = linalg.adjoint(matrix_tensor)
       matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
       self.assertAllClose(matrix, matrix_h, atol=0)
@@ -614,11 +628,6 @@
     operator = linalg.LinearOperatorCirculant2D(spectrum)
     self.assertTrue(operator.is_self_adjoint)
 
-  def test_invalid_dtype_raises(self):
-    spectrum = array_ops.constant(rng.rand(2, 2, 2))
-    with self.assertRaisesRegexp(TypeError, "must have dtype"):
-      linalg.LinearOperatorCirculant2D(spectrum)
-
   def test_invalid_rank_raises(self):
     spectrum = array_ops.constant(np.float32(rng.rand(2)))
     with self.assertRaisesRegexp(ValueError, "must have at least 2 dimensions"):
@@ -638,7 +647,7 @@
 
   @test_util.run_deprecated_v1
   def test_real_spectrum_gives_self_adjoint_operator(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       # This is a real and hermitian spectrum.
       spectrum = linear_operator_test_util.random_normal(
           shape=(2, 2, 3, 5), dtype=dtypes.float32)
@@ -646,8 +655,7 @@
       self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape)
 
       matrix_tensor = operator.to_dense()
-      self.assertEqual(matrix_tensor.dtype,
-                       linear_operator_circulant._DTYPE_COMPLEX)
+      self.assertEqual(matrix_tensor.dtype, dtypes.complex64)
       matrix_h = linalg.adjoint(matrix_tensor)
 
       matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
@@ -737,4 +745,14 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(
+      LinearOperatorCirculantTestSelfAdjointOperator)
+  linear_operator_test_util.add_tests(
+      LinearOperatorCirculantTestHermitianSpectrum)
+  linear_operator_test_util.add_tests(
+      LinearOperatorCirculantTestNonHermitianSpectrum)
+  linear_operator_test_util.add_tests(
+      LinearOperatorCirculant2DTestHermitianSpectrum)
+  linear_operator_test_util.add_tests(
+      LinearOperatorCirculant2DTestNonHermitianSpectrum)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 214b73a..70f69f4 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -43,12 +43,12 @@
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     # Cholesky not implemented.
     return ["cholesky"]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
 
     # Either 1 or 2 matrices, depending.
@@ -141,7 +141,7 @@
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     sess = ops.get_default_session()
     shape = list(build_info.shape)
 
@@ -218,4 +218,6 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(SquareLinearOperatorCompositionTest)
+  linear_operator_test_util.add_tests(NonSquareLinearOperatorCompositionTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 5c3220e..3d5cb2d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -33,7 +33,7 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
@@ -187,6 +187,35 @@
         linalg_lib.LinearOperatorDiag))
     self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
 
+  def test_diag_solve(self):
+    """Diag/ScaledIdentity solves stay LinearOperatorDiag with exact diag."""
+    operator1 = linalg_lib.LinearOperatorDiag([2., 3.], is_non_singular=True)
+    operator2 = linalg_lib.LinearOperatorDiag([1., 2.], is_non_singular=True)
+    operator3 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3., is_non_singular=True)
+
+    # a.solve(b) is the operator a^{-1} b; for diagonal operands its diagonal
+    # is the elementwise quotient b / a, which the four cases below check.
+    # Case: diag(2, 3)^{-1} diag(1, 2).
+    operator_solve = operator1.solve(operator2)
+    self.assertIsInstance(operator_solve, linalg_lib.LinearOperatorDiag)
+    self.assertAllClose([0.5, 2 / 3.], self.evaluate(operator_solve.diag))
+
+    # Case: diag(1, 2)^{-1} diag(2, 3).
+    operator_solve = operator2.solve(operator1)
+    self.assertIsInstance(operator_solve, linalg_lib.LinearOperatorDiag)
+    self.assertAllClose([2., 3 / 2.], self.evaluate(operator_solve.diag))
+
+    # Case: diag(2, 3)^{-1} (3 * I).
+    operator_solve = operator1.solve(operator3)
+    self.assertIsInstance(operator_solve, linalg_lib.LinearOperatorDiag)
+    self.assertAllClose([3 / 2., 1.], self.evaluate(operator_solve.diag))
+
+    # Case: (3 * I)^{-1} diag(2, 3).
+    operator_solve = operator3.solve(operator1)
+    self.assertIsInstance(operator_solve, linalg_lib.LinearOperatorDiag)
+    self.assertAllClose([2 / 3., 1.], self.evaluate(operator_solve.diag))
+
   def test_diag_adjoint_type(self):
     diag = [1., 3., 5., 8.]
     operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
@@ -208,4 +237,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorDiagTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 0679bda..c8a006b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -35,7 +35,7 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
@@ -129,11 +129,11 @@
     self._atol[dtypes.float64] = 1e-10
     self._rtol[dtypes.float64] = 1e-10
 
-  @property
-  def _dtypes_to_test(self):
+  @staticmethod
+  def dtypes_to_test():
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
 
@@ -205,7 +205,7 @@
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     matrix = linear_operator_test_util.random_normal(shape, dtype=dtype)
 
@@ -234,4 +234,8 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(SquareLinearOperatorFullMatrixTest)
+  linear_operator_test_util.add_tests(NonSquareLinearOperatorFullMatrixTest)
+  linear_operator_test_util.add_tests(
+      SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py
new file mode 100644
index 0000000..5f43576
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py
@@ -0,0 +1,93 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_householder as householder
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+
+class LinearOperatorHouseholderTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  @staticmethod
+  def operator_shapes_infos():
+    shape_info = linear_operator_test_util.OperatorShapesInfo
+    return [
+        shape_info((1, 1)),
+        shape_info((1, 3, 3)),
+        shape_info((3, 4, 4)),
+        shape_info((2, 1, 4, 4))]
+
+  @staticmethod
+  def tests_to_skip():
+    # This linear operator is never positive definite.
+    return ["cholesky"]
+
+  def operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+    reflection_axis = linear_operator_test_util.random_sign_uniform(
+        shape[:-1], minval=1., maxval=2., dtype=dtype)
+    # Make sure unit norm.
+    reflection_axis = reflection_axis / linalg_ops.norm(
+        reflection_axis, axis=-1, keepdims=True)
+
+    lin_op_reflection_axis = reflection_axis
+
+    if use_placeholder:
+      lin_op_reflection_axis = array_ops.placeholder_with_default(
+          reflection_axis, shape=None)
+
+    operator = householder.LinearOperatorHouseholder(lin_op_reflection_axis)
+
+    mat = reflection_axis[..., array_ops.newaxis]
+    matrix = -2 * math_ops.matmul(mat, mat, adjoint_b=True)
+    matrix = array_ops.matrix_set_diag(
+        matrix, 1. + array_ops.matrix_diag_part(matrix))
+
+    return operator, matrix
+
+  def test_scalar_reflection_axis_raises(self):
+    with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
+      householder.LinearOperatorHouseholder(1.)
+
+  def test_householder_adjoint_type(self):
+    reflection_axis = [1., 3., 5., 8.]
+    operator = householder.LinearOperatorHouseholder(reflection_axis)
+    self.assertIsInstance(
+        operator.adjoint(), householder.LinearOperatorHouseholder)
+
+  def test_householder_inverse_type(self):
+    reflection_axis = [1., 3., 5., 8.]
+    operator = householder.LinearOperatorHouseholder(reflection_axis)
+    self.assertIsInstance(
+        operator.inverse(), householder.LinearOperatorHouseholder)
+
+
+if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorHouseholderTest)
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 55eff59..3d29adc 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -37,13 +37,13 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  @property
-  def _dtypes_to_test(self):
-    # TODO(langmore) Test tf.float16 once tf.matrix_solve works in
+  @staticmethod
+  def dtypes_to_test():
+    # TODO(langmore) Test tf.float16 once tf.linalg.solve works in
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     # Identity matrix is already Hermitian Positive Definite.
@@ -80,7 +80,7 @@
       operator.assert_self_adjoint().run()  # Should not fail
 
   def test_float16_matmul(self):
-    # float16 cannot be tested by base test class because tf.matrix_solve does
+    # float16 cannot be tested by base test class because tf.linalg.solve does
     # not work with float16.
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(
@@ -285,13 +285,13 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  @property
-  def _dtypes_to_test(self):
-    # TODO(langmore) Test tf.float16 once tf.matrix_solve works in
+  @staticmethod
+  def dtypes_to_test():
+    # TODO(langmore) Test tf.float16 once tf.linalg.solve works in
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
 
@@ -374,7 +374,7 @@
         operator.assert_self_adjoint().run()
 
   def test_float16_matmul(self):
-    # float16 cannot be tested by base test class because tf.matrix_solve does
+    # float16 cannot be tested by base test class because tf.linalg.solve does
     # not work with float16.
     with self.cached_session():
       multiplier = rng.rand(3).astype(np.float16)
@@ -495,6 +495,20 @@
         linalg_lib.LinearOperatorScaledIdentity))
     self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
 
+  def test_identity_solve(self):
+    """Checks the operator types (and multiplier) returned by Identity.solve."""
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    self.assertIsInstance(
+        operator1.solve(operator1), linalg_lib.LinearOperatorIdentity)
+
+    # Solving the identity against 3 * I must keep the multiplier at 3.
+    operator_solve = operator1.solve(operator2)
+    self.assertIsInstance(
+        operator_solve, linalg_lib.LinearOperatorScaledIdentity)
+    self.assertAllClose(3., self.evaluate(operator_solve.multiplier))
+
   def test_scaled_identity_cholesky_type(self):
     operator = linalg_lib.LinearOperatorScaledIdentity(
         num_rows=2,
@@ -518,4 +532,6 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorIdentityTest)
+  linear_operator_test_util.add_tests(LinearOperatorScaledIdentityTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
index 9344c52..bab2c9b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
@@ -37,11 +37,11 @@
     self._atol[dtypes.complex64] = 1e-5
     self._rtol[dtypes.complex64] = 1e-5
 
-  def _operator_and_matrix(self,
-                           build_info,
-                           dtype,
-                           use_placeholder,
-                           ensure_self_adjoint_and_pd=False):
+  def operator_and_matrix(self,
+                          build_info,
+                          dtype,
+                          use_placeholder,
+                          ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
 
     if ensure_self_adjoint_and_pd:
@@ -127,4 +127,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorInversionTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 166188f..1dc296b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -87,22 +87,18 @@
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shape_info = linear_operator_test_util.OperatorShapesInfo
     return [
-        build_info((1, 1), factors=[(1, 1), (1, 1)]),
-        build_info((8, 8), factors=[(2, 2), (2, 2), (2, 2)]),
-        build_info((12, 12), factors=[(2, 2), (3, 3), (2, 2)]),
-        build_info((1, 3, 3), factors=[(1, 1), (1, 3, 3)]),
-        build_info((3, 6, 6), factors=[(3, 1, 1), (1, 2, 2), (1, 3, 3)]),
+        shape_info((1, 1), factors=[(1, 1), (1, 1)]),
+        shape_info((8, 8), factors=[(2, 2), (2, 2), (2, 2)]),
+        shape_info((12, 12), factors=[(2, 2), (3, 3), (2, 2)]),
+        shape_info((1, 3, 3), factors=[(1, 1), (1, 3, 3)]),
+        shape_info((3, 6, 6), factors=[(3, 1, 1), (1, 2, 2), (1, 3, 3)]),
     ]
 
-  @property
-  def _tests_to_skip(self):
-    return ["det", "inverse", "solve", "solve_with_broadcast"]
-
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     # Kronecker products constructed below will be from symmetric
@@ -258,4 +254,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(SquareLinearOperatorKroneckerTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index 2920f3a..3485741 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -46,19 +47,19 @@
   # If False, A = L + UDU^H or A = L + UU^H, depending on _use_diag_update
   _use_v = None
 
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shape_info = linear_operator_test_util.OperatorShapesInfo
     # Previously we had a (2, 10, 10) shape at the end.  We did this to test the
     # inversion and determinant lemmas on not-tiny matrices, since these are
     # known to have stability issues.  This resulted in test timeouts, so this
     # shape has been removed, but rest assured, the tests did pass.
     return [
-        build_info((0, 0)),
-        build_info((1, 1)),
-        build_info((1, 3, 3)),
-        build_info((3, 4, 4)),
-        build_info((2, 1, 4, 4))]
+        shape_info((0, 0)),
+        shape_info((1, 1)),
+        shape_info((1, 3, 3)),
+        shape_info((3, 4, 4)),
+        shape_info((2, 1, 4, 4))]
 
   def _gen_positive_diag(self, dtype, diag_shape):
     if dtype.is_complex:
@@ -69,10 +70,10 @@
     return linear_operator_test_util.random_uniform(
         diag_shape, minval=1e-4, maxval=1., dtype=dtype)
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder,
-                           ensure_self_adjoint_and_pd=False):
+  def operator_and_matrix(self, shape_info, dtype, use_placeholder,
+                          ensure_self_adjoint_and_pd=False):
     # Recall A = L + UDV^H
-    shape = list(build_info.shape)
+    shape = list(shape_info.shape)
     diag_shape = shape[:-1]
     k = shape[-2] // 2 + 1
     u_perturbation_shape = shape[:-1] + [k]
@@ -179,8 +180,8 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UDU^H, D !> 0, L > 0 ==> A !> 0 and we cannot use a Cholesky."""
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     return ["cholesky"]
 
   _use_diag_update = True
@@ -195,7 +196,7 @@
     self._rtol[dtypes.float32] = 1e-4
     self._atol[dtypes.float64] = 1e-9
     self._rtol[dtypes.float64] = 1e-9
-    self._rtol[dtypes.complex64] = 1e-4
+    self._rtol[dtypes.complex64] = 2e-4
 
 
 class LinearOperatorLowRankUpdatetestNoDiagUseCholesky(
@@ -222,8 +223,8 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UV^H, L > 0 ==> A is not symmetric and we cannot use a Cholesky."""
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     return ["cholesky"]
 
   _use_diag_update = False
@@ -238,7 +239,7 @@
     self._rtol[dtypes.float32] = 1e-4
     self._atol[dtypes.float64] = 1e-9
     self._rtol[dtypes.float64] = 1e-9
-    self._rtol[dtypes.complex64] = 1e-4
+    self._rtol[dtypes.complex64] = 2e-4
 
 
 class LinearOperatorLowRankUpdatetestWithDiagNotSquare(
@@ -263,9 +264,9 @@
 
     # domain_dimension is 3
     self.assertAllEqual([2, 3, 3], operator.shape)
-    with self.cached_session():
-      self.assertAllEqual([2, 3, 3], operator.to_dense().eval().shape)
+    self.assertAllEqual([2, 3, 3], self.evaluate(operator.to_dense()).shape)
 
+  @test_util.run_deprecated_v1
   def test_dynamic_shape_broadcasts_up_from_operator_to_other_args(self):
     num_rows_ph = array_ops.placeholder(dtypes.int32)
 
@@ -323,4 +324,14 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(
+      LinearOperatorLowRankUpdatetestWithDiagUseCholesky)
+  linear_operator_test_util.add_tests(
+      LinearOperatorLowRankUpdatetestWithDiagCannotUseCholesky)
+  linear_operator_test_util.add_tests(
+      LinearOperatorLowRankUpdatetestNoDiagUseCholesky)
+  linear_operator_test_util.add_tests(
+      LinearOperatorLowRankUpdatetestNoDiagCannotUseCholesky)
+  linear_operator_test_util.add_tests(
+      LinearOperatorLowRankUpdatetestWithDiagNotSquare)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index bd41f9e..c86beeb 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -30,12 +30,12 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     # Cholesky does not make sense for triangular matrices.
     return ["cholesky"]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     # Upper triangle will be nonzero, but ignored.
     # Use a diagonal that ensures this matrix is well conditioned.
@@ -103,4 +103,5 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorLowerTriangularTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 8f8b15e..c62f3f0 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -238,7 +238,7 @@
 
     self.assertTrue(operator_matmul.is_square)
     self.assertTrue(operator_matmul.is_non_singular)
-    self.assertTrue(operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
     self.assertEqual(None, operator_matmul.is_positive_definite)
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py
new file mode 100644
index 0000000..22ae26f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py
@@ -0,0 +1,143 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+import numpy as np
+import scipy.linalg
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.linalg import linear_operator_toeplitz
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+_to_complex = linear_operator_toeplitz._to_complex
+
+
+class LinearOperatorToeplitzTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  @contextlib.contextmanager
+  def _constrain_devices_and_set_default(self, sess, use_gpu, force_gpu):
+    """We overwrite the FFT operation mapping for testing."""
+    with test.TestCase._constrain_devices_and_set_default(
+        self, sess, use_gpu, force_gpu) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def setUp(self):
+    # TODO(srvasude): Lower these tolerances once specialized solve and
+    # determinants are implemented.
+    self._atol[dtypes.float32] = 1e-3
+    self._rtol[dtypes.float32] = 1e-3
+    self._atol[dtypes.float64] = 1e-10
+    self._rtol[dtypes.float64] = 1e-10
+    self._atol[dtypes.complex64] = 1e-3
+    self._rtol[dtypes.complex64] = 1e-3
+    self._atol[dtypes.complex128] = 1e-10
+    self._rtol[dtypes.complex128] = 1e-10
+
+  @staticmethod
+  def tests_to_skip():
+    # Skip solve tests, as these could have better stability
+    # (currently exercises the base class).
+    # TODO(srvasude): Enable these when solve is implemented.
+    return ["cholesky", "inverse", "solve", "solve_with_broadcast"]
+
+  @staticmethod
+  def operator_shapes_infos():
+    shape_info = linear_operator_test_util.OperatorShapesInfo
+    # non-batch operators (n, n) and batch operators.
+    return [
+        shape_info((1, 1)),
+        shape_info((1, 6, 6)),
+        shape_info((3, 4, 4)),
+        shape_info((2, 1, 3, 3))
+    ]
+
+  def operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    """Returns a `LinearOperatorToeplitz` and the equivalent dense matrix."""
+    shape = list(build_info.shape)
+    row = np.random.uniform(low=1., high=5., size=shape[:-1])
+    col = np.random.uniform(low=1., high=5., size=shape[:-1])
+
+    # Make sure first entry is the same
+    row[..., 0] = col[..., 0]
+
+    if ensure_self_adjoint_and_pd:
+      # Note that a Toeplitz matrix generated from a linearly decreasing
+      # non-negative sequence is positive definite. See
+      # https://www.math.cinvestav.mx/~grudsky/Papers/118_29062012_Albrecht.pdf
+      # for details. Adding zeros broadcasts the sequence over the batch dims.
+      decreasing = np.linspace(start=10., stop=1., num=shape[-1])
+      col = np.zeros(shape[:-1]) + decreasing
+
+      # The entries for the first row and column should be the same to guarantee
+      # symmetry.
+      row = col
+
+    lin_op_row = math_ops.cast(row, dtype=dtype)
+    lin_op_col = math_ops.cast(col, dtype=dtype)
+
+    if use_placeholder:
+      lin_op_row = array_ops.placeholder_with_default(lin_op_row, shape=None)
+      lin_op_col = array_ops.placeholder_with_default(lin_op_col, shape=None)
+
+    operator = linear_operator_toeplitz.LinearOperatorToeplitz(
+        row=lin_op_row,
+        col=lin_op_col,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
+
+    flattened_row = np.reshape(row, (-1, shape[-1]))
+    flattened_col = np.reshape(col, (-1, shape[-1]))
+    flattened_toeplitz = np.zeros(
+        [flattened_row.shape[0], shape[-1], shape[-1]])
+    for i in range(flattened_row.shape[0]):
+      flattened_toeplitz[i] = scipy.linalg.toeplitz(
+          flattened_col[i],
+          flattened_row[i])
+    matrix = np.reshape(flattened_toeplitz, shape)
+    matrix = math_ops.cast(matrix, dtype=dtype)
+
+    return operator, matrix
+
+  def test_scalar_row_col_raises(self):
+    with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
+      linear_operator_toeplitz.LinearOperatorToeplitz(1., 1.)
+
+    with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
+      linear_operator_toeplitz.LinearOperatorToeplitz([1.], 1.)
+
+    with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
+      linear_operator_toeplitz.LinearOperatorToeplitz(1., [1.])
+
+
+if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorToeplitzTest)
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index d1e6c37..3fedb27 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -241,111 +241,6 @@
       self.assertAllClose(expected, result)
 
 
-class MatmulWithBroadcastTest(test.TestCase):
-
-  @test_util.run_deprecated_v1
-  def test_static_dims_broadcast_x_has_extra_dims(self):
-    # batch_shape = [2]
-    # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
-    x = rng.rand(2, 1, 3)
-    y = rng.rand(3, 7)
-    y_broadcast = y + np.zeros((2, 1, 1))
-
-    with self.cached_session():
-      result = linear_operator_util.matmul_with_broadcast(x, y)
-      self.assertAllEqual((2, 1, 7), result.get_shape())
-      expected = math_ops.matmul(x, y_broadcast)
-      self.assertAllClose(expected.eval(), self.evaluate(result))
-
-  @test_util.run_deprecated_v1
-  def test_static_dims_broadcast_y_has_extra_dims(self):
-    # Since the second arg has extra dims, and the domain dim of the first arg
-    # is larger than the number of linear equations, code will "flip" the extra
-    # dims of the first arg to the far right, making extra linear equations
-    # (then call the matrix function, then flip back).
-    # We have verified that this optimization indeed happens.  How? We stepped
-    # through with a debugger.
-    x = rng.rand(5, 7)
-    y = rng.rand(2, 3, 7, 5)
-    x_broadcast = x + np.zeros((2, 3, 5, 7))
-
-    with self.cached_session():
-      result = linear_operator_util.matmul_with_broadcast(x, y)
-      self.assertAllEqual((2, 3, 5, 5), result.get_shape())
-      expected = math_ops.matmul(x_broadcast, y)
-      self.assertAllClose(expected.eval(), self.evaluate(result))
-
-  @test_util.run_deprecated_v1
-  def test_static_dims_broadcast_y_has_extra_dims_transpose_a_and_b(self):
-    # Since the second arg has extra dims, and the domain dim of the first arg
-    # is larger than the number of linear equations, code will "flip" the extra
-    # dims of the first arg to the far right, making extra linear equations
-    # (then call the matrix function, then flip back).
-    # We have verified that this optimization indeed happens.  How? We stepped
-    # through with a debugger.
-    x = rng.rand(1, 7, 5)
-    y = rng.rand(2, 3, 1, 7)
-    x_broadcast = x + np.zeros((2, 3, 1, 1))
-
-    with self.cached_session():
-      result = linear_operator_util.matmul_with_broadcast(
-          x, y, transpose_a=True, transpose_b=True)
-      self.assertAllEqual((2, 3, 5, 1), result.get_shape())
-      expected = math_ops.matmul(
-          x_broadcast, y, transpose_a=True, transpose_b=True)
-      self.assertAllClose(expected.eval(), self.evaluate(result))
-
-  @test_util.run_deprecated_v1
-  def test_static_dims_broadcast_y_has_extra_dims_transpose_dynamic(self):
-    # Since the second arg has extra dims, and the domain dim of the first arg
-    # is larger than the number of linear equations, code will "flip" the extra
-    # dims of the first arg to the far right, making extra linear equations
-    # (then call the matrix function, then flip back).
-    # We have verified that this optimization indeed happens.  How? We stepped
-    # through with a debugger.
-    x = rng.rand(1, 7, 5)
-    y = rng.rand(2, 3, 1, 7)
-    x_broadcast = x + np.zeros((2, 3, 1, 1))
-
-    x_ph = array_ops.placeholder(dtypes.float64, [None, None, None])
-    y_ph = array_ops.placeholder(dtypes.float64, [None, None, None, None])
-
-    with self.cached_session():
-      result = linear_operator_util.matmul_with_broadcast(
-          x_ph, y_ph, transpose_a=True, transpose_b=True)
-      self.assertAllEqual(4, result.shape.ndims)
-      expected = math_ops.matmul(
-          x_broadcast, y, transpose_a=True, transpose_b=True)
-      self.assertAllClose(expected.eval(),
-                          result.eval(feed_dict={
-                              x_ph: x,
-                              y_ph: y
-                          }))
-
-  @test_util.run_deprecated_v1
-  def test_dynamic_dims_broadcast_64bit(self):
-    # batch_shape = [2]
-    # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
-    x = rng.rand(2, 1, 3)
-    y = rng.rand(3, 7)
-    y_broadcast = y + np.zeros((2, 1, 1))
-
-    x_ph = array_ops.placeholder(dtypes.float64)
-    y_ph = array_ops.placeholder(dtypes.float64)
-
-    with self.cached_session() as sess:
-      result, expected = sess.run(
-          [
-              linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
-              math_ops.matmul(x, y_broadcast)
-          ],
-          feed_dict={
-              x_ph: x,
-              y_ph: y
-          })
-      self.assertAllClose(expected, result)
-
-
 class MatrixSolveWithBroadcastTest(test.TestCase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index 10651d3..60f9c48 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -34,21 +34,21 @@
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     return [
         "cholesky", "log_abs_det", "inverse", "solve", "solve_with_broadcast"]
 
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shapes_info = linear_operator_test_util.OperatorShapesInfo
     return [
-        build_info((1, 1)),
-        build_info((1, 3, 3)),
-        build_info((3, 4, 4)),
-        build_info((2, 1, 4, 4))]
+        shapes_info((1, 1)),
+        shapes_info((1, 3, 3)),
+        shapes_info((3, 4, 4)),
+        shapes_info((2, 1, 4, 4))]
 
-  def _operator_and_matrix(
+  def operator_and_matrix(
       self, build_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     del ensure_self_adjoint_and_pd
@@ -192,7 +192,7 @@
 class LinearOperatorZerosNotSquareTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def operator_and_matrix(self, build_info, dtype, use_placeholder):
     del use_placeholder
     shape = list(build_info.shape)
 
@@ -209,4 +209,6 @@
 
 
 if __name__ == "__main__":
+  linear_operator_test_util.add_tests(LinearOperatorZerosTest)
+  linear_operator_test_util.add_tests(LinearOperatorZerosNotSquareTest)
   test.main()
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index f5fda3c..3c35b97 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -25,6 +25,7 @@
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -839,10 +840,6 @@
       self.assertEqual(self.evaluate(element_shape), -1)
 
   def testSerializeListWithMaxNumElements(self):
-    if test_util.is_gpu_available():
-      # TODO(b/119151861): Enable on GPU.
-      return
-
     worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
@@ -1561,6 +1558,19 @@
     grad = gradients_impl.gradients(t1, t)[0]
     self.assertAllEqual(self.evaluate(grad), [1., 1., 1.])
 
+  def testHandleDataAcrossFunctionCall(self):
+
+    @def_function.function
+    def func():
+      t = constant_op.constant([1., 2., 3.])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      return l
+
+    tensor_list = func()
+    element = list_ops.tensor_list_get_item(
+        tensor_list, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(element.shape.as_list(), [])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 3896b13..2a28aa6 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -169,6 +169,16 @@
       self.assertTrue((expected + "\n") in printed.contents())
 
   @test_util.run_in_graph_and_eager_modes()
+  def testPrintTwoTensorsDifferentSep(self):
+    with self.cached_session():
+      tensor = math_ops.range(10)
+      with self.captureWritesToStream(sys.stderr) as printed:
+        print_op = logging_ops.print_v2(tensor, tensor * 10, sep="<separator>")
+        self.evaluate(print_op)
+      expected = "[0 1 2 ... 7 8 9]<separator>[0 10 20 ... 70 80 90]"
+      self.assertIn(expected + "\n", printed.contents())
+
+  @test_util.run_in_graph_and_eager_modes()
   def testPrintPlaceholderGeneration(self):
     with self.cached_session():
       tensor = math_ops.range(10)
@@ -208,6 +218,16 @@
       self.assertTrue((expected + "\n") in printed.contents())
 
   @test_util.run_in_graph_and_eager_modes()
+  def testPrintStringScalarDifferentEnd(self):
+    with self.cached_session():
+      tensor = ops.convert_to_tensor("scalar")
+      with self.captureWritesToStream(sys.stderr) as printed:
+        print_op = logging_ops.print_v2(tensor, end="<customend>")
+        self.evaluate(print_op)
+      expected = "scalar<customend>"
+      self.assertIn(expected, printed.contents())
+
+  @test_util.run_in_graph_and_eager_modes()
   def testPrintComplexTensorStruct(self):
     with self.cached_session():
       tensor = math_ops.range(10)
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 66125c1..a3dd7db 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -74,7 +74,7 @@
 
     use_gpu = True
     if a_np_.dtype is np.float16 and (
-        not test_util.CudaSupportsHalfMatMulAndConv()):
+        not test_util.GpuSupportsHalfMatMulAndConv()):
       use_gpu = False
       print("Built without fp16 matmul support for Cuda, running test on CPU.")
 
@@ -99,8 +99,8 @@
     self.assertAllCloseAccordingToType(
         tf_val,
         np_val,
-        float_rtol=2e-5,
-        float_atol=2e-5,
+        float_rtol=3e-5,
+        float_atol=3e-5,
         half_rtol=0.2,
         half_atol=0.2)
 
diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py
index 856ba7b..7b5bd82 100644
--- a/tensorflow/python/kernel_tests/one_hot_op_test.py
+++ b/tensorflow/python/kernel_tests/one_hot_op_test.py
@@ -420,6 +420,13 @@
         truth=None,
         raises=TypeError)
 
+  def testOneHotUint8WithLargeArray(self):
+    with self.cached_session(use_gpu=False) as sess:
+      matrix = np.random.rand(256) * 10
+      tensor = constant_op.constant(matrix, dtypes.uint8, shape=matrix.shape)
+      tensor_one_hot = array_ops.one_hot(tensor, depth=10, axis=0)
+      self.assertEqual(sess.run(tensor_one_hot).shape, (10, 256))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
index 43c8fa4..ab270bf 100644
--- a/tensorflow/python/kernel_tests/parse_single_example_op_test.py
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -212,7 +212,7 @@
                 "a": parsing_ops.FixedLenFeature((1, 3), dtypes.float32)
             }
         },
-        # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+        # TODO(mrry): Consider matching the `io.parse_example()` error message.
         expected_err=(errors_impl.OpError, "Key: a."))
 
   def testDenseDefaultNoShapeShouldFail(self):
@@ -774,7 +774,7 @@
                         (2, 1, 1), dtype=dtypes.string, allow_missing=True),
             }
         },
-        # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+        # TODO(mrry): Consider matching the `io.parse_example()` error message.
         expected_err=(errors_impl.OpError, "Key: b."))
 
     self._test(
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index af76e09..672f571 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -1131,7 +1131,7 @@
                 expected_context_values=None,
                 expected_feat_list_values=None,
                 expected_err=None):
-    # Test using tf.parse_single_sequence_example
+    # Test using tf.io.parse_single_sequence_example
     self._test(
         kwargs,
         expected_context_values=expected_context_values,
@@ -1616,6 +1616,76 @@
         batch=True)
 
 
+class DecodeRawTest(test.TestCase):
+
+  def _decode_v1(self, words):
+    with self.cached_session():
+      examples = np.array(words)
+      example_tensor = constant_op.constant(
+          examples, shape=examples.shape, dtype=dtypes.string)
+      byte_tensor = parsing_ops.decode_raw_v1(example_tensor, dtypes.uint8)
+      return self.evaluate(byte_tensor)
+
+  def _decode_v2(self, words, fixed_length=None):
+    with self.cached_session():
+      examples = np.array(words)
+      byte_tensor = parsing_ops.decode_raw(
+          examples, dtypes.uint8, fixed_length=fixed_length)
+      return self.evaluate(byte_tensor)
+
+  def _ordinalize(self, words, fixed_length=None):
+    """Returns per-word ordinals, cut or 0-padded to `fixed_length`."""
+    if fixed_length is None:
+      fixed_length = len(words[0])
+
+    def _padded_ordinals(word):
+      # Positions at or beyond len(word) are filled with 0.
+      return [
+          ord(word[i]) if i < len(word) else 0 for i in range(fixed_length)
+      ]
+
+    # One row of ordinals per input word.
+    encoded = [_padded_ordinals(word) for word in words]
+    return np.array(encoded)
+
+  def testDecodeRawV1EqualLength(self):
+    words = ["string1", "string2"]
+
+    observed = self._decode_v1(words)
+    expected = self._ordinalize(words)
+
+    self.assertAllEqual(expected.shape, observed.shape)
+    self.assertAllEqual(expected, observed)
+
+  def testDecodeRawV2FallbackEqualLength(self):
+    words = ["string1", "string2"]
+
+    observed = self._decode_v2(words)
+    expected = self._ordinalize(words)
+
+    self.assertAllEqual(expected.shape, observed.shape)
+    self.assertAllEqual(expected, observed)
+
+  def testDecodeRawV1VariableLength(self):
+    words = ["string", "longer_string"]
+    with self.assertRaises(errors_impl.InvalidArgumentError):
+      self._decode_v1(words)
+
+  def testDecodeRawV2FallbackVariableLength(self):
+    words = ["string", "longer_string"]
+    with self.assertRaises(errors_impl.InvalidArgumentError):
+      self._decode_v2(words)
+
+  def testDecodeRawV2VariableLength(self):
+    words = ["string", "longer_string"]
+
+    observed = self._decode_v2(words, fixed_length=8)
+    expected = self._ordinalize(words, fixed_length=8)
+
+    self.assertAllEqual(expected.shape, observed.shape)
+    self.assertAllEqual(expected, observed)
+
+
 class DecodeJSONExampleTest(test.TestCase):
 
   def _testRoundTrip(self, examples):
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index aa207eb..68b23a2 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -209,7 +209,7 @@
     self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                         data_format, dtypes.float64, expected, use_gpu, v2)
 
-    if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
+    if not use_gpu or test_util.GpuSupportsHalfMatMulAndConv():
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                           data_format, dtypes.float16, expected, use_gpu, v2)
 
@@ -1412,7 +1412,7 @@
             use_gpu=use_gpu,
             v2=v2)
 
-  @test_util.disable_xla("b/123923733")  # NaNs handled differently
+  @test_util.no_xla_auto_jit("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_1(self):
     input_data = [float("nan")] * 16
     output_backprop = [11.0, 12.0, 13.0, 15.0, 16.0, 17.0, 19.0, 20.0, 21.0]
@@ -1487,7 +1487,7 @@
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
-  @test_util.disable_xla("b/123923733")  # NaNs handled differently
+  @test_util.no_xla_auto_jit("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index a7a3d5c..7247eaf 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -335,8 +335,8 @@
 
   @test_util.run_v1_only("b/120545219")
   def testGradientFunction(self):
-    # Input to tf.py_func is necessary, otherwise get_gradient_function()
-    # returns None per default.
+    # Input to tf.compat.v1.py_func is necessary,
+    # otherwise get_gradient_function() returns None per default.
     a = constant_op.constant(0)
     x, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64])
     y, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64], stateful=False)
@@ -353,7 +353,8 @@
 
   @test_util.run_v1_only("b/120545219")
   def testParallel(self):
-    # Tests that tf.py_func's can run in parallel if they release the GIL.
+    # Tests that tf.compat.v1.py_func's can run in parallel if they release
+    # the GIL.
     with self.cached_session() as session:
       q = queue.Queue(1)
 
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 1b00fe8..7ddf2b7 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -155,7 +155,7 @@
     xla_enable_strict_auto_jit = True,
 )
 
-# TODO(b/130359919)
+# TODO(b/130359919): Reenable test when it becomes stable
 tf_py_test(
     name = "random_binomial_test",
     size = "medium",
@@ -170,6 +170,7 @@
         "//tensorflow/python:platform",
         "//tensorflow/python:stateful_random_ops",
     ],
+    tags = ["no_oss"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 68672a0..1023b8f 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -257,7 +257,8 @@
       self.assertAllEqual(rnd1, rnd2)
 
 
-@test_util.disable_all_xla("This never passed on XLA")
+@test_util.for_all_test_methods(test_util.disable_xla,
+                                "This never passed on XLA")
 class RandomUniformTest(RandomOpTestCommon):
 
   def _Sampler(self, num, minv, maxv, dtype, use_gpu, seed=None):
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index c65e141..82625c9 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -1067,9 +1067,7 @@
     with copy_to_graph.as_default():  # Intentionally testing v1 behavior
       copied = resource_variable_ops.copy_to_graph_uninitialized(v)
       self.assertEqual(v.name, copied.name)
-      with self.session(copy_to_graph) as session:
-        with self.assertRaises(errors.InvalidArgumentError):
-          session.run(copied.initializer)
+      self.assertIsNone(copied.initializer)
 
   def create_variant_shape_and_type_data(self):
     variant_shape_and_type_data = (
@@ -1144,7 +1142,8 @@
           expected=[[[[8, 9], [9, 8]], [[8, 8], [9, 9]]],
                     [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
 
-      # batch_dims=indices.shape.ndims - 1 (equivalent to tf.batch_gather)
+      # batch_dims=indices.shape.ndims - 1 (equivalent to
+      # tf.compat.v1.batch_gather)
       dict(  # 2D indices (1 batch dim)
           batch_dims=1,
           params=[[10, 11, 12, 13], [20, 21, 22, 23]],
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 5bc301b..126cb04 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -250,8 +250,8 @@
   # def testBooleanScatterUpdate(self):
   #   with self.session(use_gpu=False) as session:
   #     var = tf.Variable([True, False])
-  #     update0 = tf.scatter_nd_update(var, [[1]], [True])
-  #     update1 = tf.scatter_nd_update(
+  #     update0 = tf.compat.v1.scatter_nd_update(var, [[1]], [True])
+  #     update1 = tf.compat.v1.scatter_nd_update(
   #         var, tf.constant(
   #             [[0]], dtype=tf.int64), [False])
   #     var.initializer.run()
diff --git a/tensorflow/python/kernel_tests/signal/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py
index 0a8a621..4d1807e 100644
--- a/tensorflow/python/kernel_tests/signal/test_util.py
+++ b/tensorflow/python/kernel_tests/signal/test_util.py
@@ -31,11 +31,11 @@
     fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away).
       Grappler uses the 'train_op' collection to look for fetches, so if not
       provided this collection should be non-empty.
-    config_proto: An optional `tf.ConfigProto` to use when rewriting the
-      graph.
+    config_proto: An optional `tf.compat.v1.ConfigProto` to use when rewriting
+      the graph.
 
   Returns:
-    A `tf.GraphDef` containing the rewritten graph.
+    A `tf.compat.v1.GraphDef` containing the rewritten graph.
   """
   if config_proto is None:
     config_proto = config_pb2.ConfigProto()
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
index fa2bab1..8e2115f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the gradient of `tf.sparse_tensor_dense_matmul()`."""
+"""Tests for the gradient of `tf.sparse.sparse_dense_matmul()`."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/kernel_tests/string_lower_op_test.py b/tensorflow/python/kernel_tests/string_lower_op_test.py
new file mode 100644
index 0000000..ec2f2ea
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_lower_op_test.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_lower_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringLowerOpTest(test.TestCase):
+  """Test cases for tf.strings.lower."""
+
+  def test_string_lower(self):
+    strings = ["Pigs on The Wing", "aNimals"]
+
+    with self.cached_session():
+      output = string_ops.string_lower(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
+
+  def test_string_lower_2d(self):
+    strings = [["pigS on THE wIng", "aniMals"], [" hello ", "\n\tWorld! \r \n"]]
+
+    with self.cached_session():
+      output = string_ops.string_lower(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
+                                   [b" hello ", b"\n\tworld! \r \n"]])
+
+  def test_string_lower_unicode(self):
+    strings = [["ÓÓSSCHLOË"]]
+    with self.cached_session():
+      output = string_ops.string_lower(strings, encoding="utf-8")
+      output = self.evaluate(output)
+      # Expected: the UTF-8 encoded bytes of "óósschloë".
+      self.assertAllEqual(output, [[b"\xc3\xb3\xc3\xb3sschlo\xc3\xab"]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index 8b44fd3..005044d 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -18,15 +18,20 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class StringSplitOpTest(test.TestCase):
@@ -34,7 +39,7 @@
   def testStringSplit(self):
     strings = ["pigs on the wing", "animals"]
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings)
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
@@ -45,7 +50,7 @@
   def testStringSplitEmptyDelimiter(self):
     strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]  # Last string is U+1F60E
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings, delimiter="")
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
@@ -63,7 +68,7 @@
   def testStringSplitEmptyToken(self):
     strings = ["", " a", "b ", " c", " ", " d ", "  e", "f  ", "  g  ", "  "]
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings)
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
@@ -75,7 +80,7 @@
   def testStringSplitOnSetEmptyToken(self):
     strings = ["", " a", "b ", " c", " ", " d ", ". e", "f .", " .g. ", " ."]
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings, delimiter=" .")
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
@@ -88,7 +93,7 @@
   def testStringSplitWithDelimiter(self):
     strings = ["hello|world", "hello world"]
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertRaises(
           ValueError, string_ops.string_split, strings, delimiter=["|", ""])
 
@@ -149,7 +154,7 @@
   def testStringSplitWithNoSkipEmpty(self):
     strings = ["#a", "b#", "#c#"]
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings, "#", skip_empty=False)
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
@@ -158,7 +163,7 @@
       self.assertAllEqual(values, [b"", b"a", b"b", b"", b"", b"c", b""])
       self.assertAllEqual(shape, [3, 3])
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       tokens = string_ops.string_split(strings, "#")
       indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(values, [b"a", b"b", b"c"])
@@ -166,122 +171,143 @@
       self.assertAllEqual(shape, [3, 1])
 
 
-class StringSplitV2OpTest(test.TestCase):
+class StringSplitV2OpTest(ragged_test_util.RaggedTensorTestCase,
+                          parameterized.TestCase):
 
-  def testSplitV2(self):
-    strings = ["pigs on the wing", "animals"]
+  @parameterized.named_parameters([
+      {"testcase_name": "Simple",
+       "input": [b"pigs on the wing", b"animals"],
+       "expected": [[b"pigs", b"on", b"the", b"wing"], [b"animals"]]},
 
-    tokens = string_ops.string_split_v2(strings)
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
-    self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
-    self.assertAllEqual(shape, [2, 4])
+      {"testcase_name": "MultiCharSeparator",
+       "input": [b"1<>2<>3", b"<><>4<>5<><>6<>"],
+       "sep": b"<>",
+       "expected": [[b"1", b"2", b"3"],
+                    [b"", b"", b"4", b"5", b"", b"6", b""]]},
 
-    ragged_tokens = ragged_string_ops.string_split_v2(strings)
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 4, 5])
-    self.assertAllEqual(ragged_tokens.values,
-                        [b"pigs", b"on", b"the", b"wing", b"animals"])
+      {"testcase_name": "SimpleSeparator",
+       "input": [b"1,2,3", b"4,5,,6,"],
+       "sep": b",",
+       "expected": [[b"1", b"2", b"3"], [b"4", b"5", b"", b"6", b""]]},
 
-  def testSplitV2MultiCharSeparator(self):
-    # Match Python behavior:
-    # >>> '1<>2<>3'.split('<>')
-    # ['1', '2', '3']
-    # >>> "<><>4<>5<><>6<>".split("<>")
-    # ['', '', '4', '5', '', '6', '']
-    strings = ["1<>2<>3", "<><>4<>5<><>6<>"]
+      {"testcase_name": "EmptySeparator",
+       "input": [b"1 2 3", b"  4  5    6  "],
+       "expected": [[b"1", b"2", b"3"], [b"4", b"5", b"6"]]},
 
-    tokens = string_ops.string_split_v2(strings, sep="<>")
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(indices,
-                        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3],
-                         [1, 4], [1, 5], [1, 6]])
-    self.assertAllEqual(
-        values, [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
-    self.assertAllEqual(shape, [2, 7])
+      {"testcase_name": "EmptySeparatorEmptyInputString",
+       "input": [b""],
+       "expected": [[]]},
 
-    ragged_tokens = ragged_string_ops.string_split_v2(strings, sep="<>")
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 10])
-    self.assertAllEqual(
-        ragged_tokens.values,
-        [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
+      {"testcase_name": "EmptyInputVector",
+       "input": [],
+       "expected": []},
 
-  def testSplitV2SimpleSeparator(self):
-    # Match Python behavior:
-    # >>> '1,2,3'.split(',')
-    # ['1', '2', '3']
-    # >>> '1,2,,3,'.split(',')
-    # ['1', '2', '', '3', '']
-    strings = ["1,2,3", "4,5,,6,"]
+      {"testcase_name": "SimpleSeparatorMaxSplit",
+       "input": [b"1,2,3", b"4,5,,6,"],
+       "sep": b",",
+       "maxsplit": 1,
+       "expected": [[b"1", b"2,3"], [b"4", b"5,,6,"]]},
 
-    tokens = string_ops.string_split_v2(strings, sep=",")
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(
-        indices,
-        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
-    self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
-    self.assertAllEqual(shape, [2, 5])
+      {"testcase_name": "EmptySeparatorMaxSplit",
+       "input": [b"1 2 3", b"  4  5    6  "],
+       "maxsplit": 1,
+       "expected": [[b"1", b"2 3"], [b"4", b"5    6  "]]},
 
-    ragged_tokens = ragged_string_ops.string_split_v2(strings, sep=",")
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 8])
-    self.assertAllEqual(ragged_tokens.values,
-                        [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
+      {"testcase_name": "ScalarInput",
+       "input": b"1,2,3",
+       "sep": b",",
+       "expected": [b"1", b"2", b"3"]},
 
-  def testSplitV2EmptySeparator(self):
-    # Match Python behavior:
-    # >>> '1 2 3'.split()
-    # ['1', '2', '3']
-    # >>> '   1   2   3   '.split()
-    # ['1', '2', '3']
-    strings = ["1 2 3", "  4  5    6  "]
+      {"testcase_name": "Dense2DInput",
+       "input": [[b"1,2,3", b"4"], [b"5,6", b"7,8,9"]],
+       "sep": b",",
+       "expected": [[[b"1", b"2", b"3"], [b"4"]],
+                    [[b"5", b"6"], [b"7", b"8", b"9"]]]},
 
-    tokens = string_ops.string_split_v2(strings)
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(indices,
-                        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]])
-    self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
-    self.assertAllEqual(shape, [2, 3])
+      {"testcase_name": "Ragged2DInput",
+       "input": [[b"1,2,3", b"4"], [b"5,6"]],
+       "input_is_ragged": True,
+       "sep": b",",
+       "expected": [[[b"1", b"2", b"3"], [b"4"]], [[b"5", b"6"]]]},
 
-    ragged_tokens = ragged_string_ops.string_split_v2(strings)
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 6])
-    self.assertAllEqual(ragged_tokens.values,
-                        [b"1", b"2", b"3", b"4", b"5", b"6"])
+      {"testcase_name": "Ragged3DInput",
+       "input": [[[b"1,2,3", b"4"], [b"5,6"]], [[b"7,8,9"]]],
+       "input_is_ragged": True,
+       "sep": b",",
+       "expected": [[[[b"1", b"2", b"3"], [b"4"]], [[b"5", b"6"]]],
+                    [[[b"7", b"8", b"9"]]]]},
 
-  def testSplitV2SimpleSeparatorMaxSplit(self):
-    # Match Python behavior:
-    # >>> '1,2,3'.split(',', maxsplit=1)
-    # ['1', '2,3']
-    # >>> '4,5,,6,'.split(',', maxsplit=1)
-    # ['4', '5,,6,']
-    strings = ["1,2,3", "4,5,,6,"]
+      {"testcase_name": "Ragged4DInput",
+       "input": [[[[b"1,2,3", b"4"], [b"5,6"]], [[b"7,8,9"]]], [[[b""]]]],
+       "input_is_ragged": True,
+       "sep": b",",
+       "expected": [[[[[b"1", b"2", b"3"], [b"4"]], [[b"5", b"6"]]],
+                     [[[b"7", b"8", b"9"]]]], [[[[b""]]]]]},
 
-    tokens = string_ops.string_split_v2(strings, sep=",", maxsplit=1)
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
-    self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
-    self.assertAllEqual(shape, [2, 2])
+      {"testcase_name": "Ragged4DInputEmptySeparator",
+       "input": [[[[b"1 2 3", b"4"], [b"5 6"]], [[b"7 8 9"]]], [[[b""]]]],
+       "input_is_ragged": True,
+       "expected": [[[[[b"1", b"2", b"3"], [b"4"]], [[b"5", b"6"]]],
+                     [[[b"7", b"8", b"9"]]]], [[[[]]]]]},
 
-    ragged_tokens = ragged_string_ops.string_split_v2(
-        strings, sep=",", maxsplit=1)
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 2, 4])
-    self.assertAllEqual(ragged_tokens.values, [b"1", b"2,3", b"4", b"5,,6,"])
+      ])  # pyformat: disable
+  def testSplitV2(self,
+                  input,
+                  expected,
+                  input_is_ragged=False,
+                  **kwargs):  # pylint: disable=redefined-builtin
+    # Check that we are matching the behavior of Python's str.split:
+    self.assertEqual(expected, self._py_split(input, **kwargs))
 
-  def testSplitV2EmptySeparatorMaxSplit(self):
-    # Match Python behavior:
-    # '1 2 3'.split(maxsplit=1)
-    # ['1', '2 3']
-    # >>> "  4  5    6  ".split(maxsplit=1)
-    # ['4', '5    6  ']
-    strings = ["1 2 3", "  4  5    6  "]
+    # Prepare the input tensor.
+    if input_is_ragged:
+      input = ragged_factory_ops.constant(input, dtype=dtypes.string)
+    else:
+      input = constant_op.constant(input, dtype=dtypes.string)
 
-    tokens = string_ops.string_split_v2(strings, maxsplit=1)
-    indices, values, shape = self.evaluate(tokens)
-    self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
-    self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
-    self.assertAllEqual(shape, [2, 2])
+    # Check that the public version (which returns a RaggedTensor) works
+    # correctly.
+    expected_ragged = ragged_factory_ops.constant(
+        expected, ragged_rank=input.shape.ndims)
+    actual_ragged_v1 = ragged_string_ops.strings_split_v1(
+        input, result_type="RaggedTensor", **kwargs)
+    actual_ragged_v1_input_kwarg = ragged_string_ops.strings_split_v1(
+        input=input, result_type="RaggedTensor", **kwargs)
+    actual_ragged_v1_source_kwarg = ragged_string_ops.strings_split_v1(
+        source=input, result_type="RaggedTensor", **kwargs)
+    actual_ragged_v2 = ragged_string_ops.string_split_v2(input, **kwargs)
+    actual_ragged_v2_input_kwarg = ragged_string_ops.string_split_v2(
+        input=input, **kwargs)
+    self.assertRaggedEqual(expected_ragged, actual_ragged_v1)
+    self.assertRaggedEqual(expected_ragged, actual_ragged_v1_input_kwarg)
+    self.assertRaggedEqual(expected_ragged, actual_ragged_v1_source_kwarg)
+    self.assertRaggedEqual(expected_ragged, actual_ragged_v2)
+    self.assertRaggedEqual(expected_ragged, actual_ragged_v2_input_kwarg)
 
-    ragged_tokens = ragged_string_ops.string_split_v2(strings, maxsplit=1)
-    self.assertAllEqual(ragged_tokens.row_splits, [0, 2, 4])
-    self.assertAllEqual(ragged_tokens.values, [b"1", b"2 3", b"4", b"5    6  "])
+    # Check that the internal version (which returns a SparseTensor) works
+    # correctly.  Note: the internal version only supports vector inputs.
+    if input.shape.ndims == 1:
+      expected_sparse = self.evaluate(expected_ragged.to_sparse())
+      actual_sparse_v1 = ragged_string_ops.strings_split_v1(
+          input, result_type="SparseTensor", **kwargs)
+      actual_sparse_v2 = string_ops.string_split_v2(input, **kwargs)
+      for actual_sparse in [actual_sparse_v1, actual_sparse_v2]:
+        self.assertEqual(expected_sparse.indices.tolist(),
+                         self.evaluate(actual_sparse.indices).tolist())
+        self.assertEqual(expected_sparse.values.tolist(),
+                         self.evaluate(actual_sparse.values).tolist())
+        self.assertEqual(expected_sparse.dense_shape.tolist(),
+                         self.evaluate(actual_sparse.dense_shape).tolist())
+
+  def _py_split(self, strings, **kwargs):
+    if isinstance(strings, compat.bytes_or_text_types):
+      # Note: str.split doesn't accept keyword args.
+      if "maxsplit" in kwargs:
+        return strings.split(kwargs.get("sep", None), kwargs["maxsplit"])
+      else:
+        return strings.split(kwargs.get("sep", None))
+    else:
+      return [self._py_split(s, **kwargs) for s in strings]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/string_upper_op_test.py b/tensorflow/python/kernel_tests/string_upper_op_test.py
new file mode 100644
index 0000000..fa685f5
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_upper_op_test.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_upper_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringUpperOpTest(test.TestCase):
+  """Test cases for tf.strings.upper."""
+
+  def test_string_upper(self):
+    strings = ["Pigs on The Wing", "aNimals"]
+
+    with self.cached_session():
+      output = string_ops.string_upper(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [b"PIGS ON THE WING", b"ANIMALS"])
+
+  def test_string_upper_2d(self):
+    strings = [["pigS on THE wIng", "aniMals"], [" hello ", "\n\tWorld! \r \n"]]
+
+    with self.cached_session():
+      output = string_ops.string_upper(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [[b"PIGS ON THE WING", b"ANIMALS"],
+                                   [b" HELLO ", b"\n\tWORLD! \r \n"]])
+
+  def test_string_upper_unicode(self):
+    strings = [["óósschloë"]]
+    with self.cached_session():
+      output = string_ops.string_upper(strings, encoding="utf-8")
+      output = self.evaluate(output)
+      # output: "ÓÓSSCHLOË"
+      self.assertAllEqual(output, [[b"\xc3\x93\xc3\x93SSCHLO\xc3\x8b"]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
index b9840dd..11047f6 100644
--- a/tensorflow/python/kernel_tests/summary_ops_test.py
+++ b/tensorflow/python/kernel_tests/summary_ops_test.py
@@ -570,6 +570,34 @@
     with summary_ops.summary_scope('with/slash') as (tag, _):
       self.assertEqual('with/slash', tag)
 
+  def testAllV2SummaryOps(self):
+    logdir = self.get_temp_dir()
+    def define_ops():
+      result = []
+      # TF 2.0 summary ops
+      result.append(summary_ops.write('write', 1, step=0))
+      result.append(summary_ops.write_raw_pb(b'', step=0, name='raw_pb'))
+      # TF 1.x tf.contrib.summary ops
+      result.append(summary_ops.generic('tensor', 1, step=1))
+      result.append(summary_ops.scalar('scalar', 2.0, step=1))
+      result.append(summary_ops.histogram('histogram', [1.0], step=1))
+      result.append(summary_ops.image('image', [[[[1.0]]]], step=1))
+      result.append(summary_ops.audio('audio', [[1.0]], 1.0, 1, step=1))
+      return result
+    with context.graph_mode():
+      ops_without_writer = define_ops()
+      with summary_ops.create_file_writer_v2(logdir).as_default():
+        with summary_ops.record_if(True):
+          ops_recording_on = define_ops()
+        with summary_ops.record_if(False):
+          ops_recording_off = define_ops()
+      # We should be collecting all ops defined with a default writer present,
+      # regardless of whether recording was set on or off, but not those defined
+      # without a writer at all.
+      del ops_without_writer
+      expected_ops = ops_recording_on + ops_recording_off
+      self.assertCountEqual(expected_ops, summary_ops.all_v2_summary_ops())
+
 
 class SummaryWriterTest(test_util.TensorFlowTestCase):
 
@@ -737,7 +765,7 @@
       with summary_ops.create_file_writer_v2(
           logdir, max_queue=1, flush_millis=999999).as_default():
         get_total = lambda: len(events_from_logdir(logdir))
-        # Note: First tf.Event is always file_version.
+        # Note: First tf.compat.v1.Event is always file_version.
         self.assertEqual(1, get_total())
         summary_ops.write('tag', 1, step=0)
         self.assertEqual(1, get_total())
@@ -769,7 +797,7 @@
           logdir, max_queue=999999, flush_millis=999999)
       with writer.as_default():
         get_total = lambda: len(events_from_logdir(logdir))
-        # Note: First tf.Event is always file_version.
+        # Note: First tf.compat.v1.Event is always file_version.
         self.assertEqual(1, get_total())
         summary_ops.write('tag', 1, step=0)
         summary_ops.write('tag', 1, step=0)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index d66357b..1b12d23 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -22,6 +22,7 @@
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -1740,19 +1741,18 @@
 
 class TensorArrayBenchmark(test.Benchmark):
 
+  def _tensorArrayWriteInWhile(self):
+    size = 10000
+    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=size)
+    (_, ta) = control_flow_ops.while_loop(
+        lambda i, _: i < size,
+        lambda i, ta: (i + 1, ta.write(i, 0.)), [0, ta],
+        parallel_iterations=1)
+    return ta.stack()
+
   def _benchmarkWriteInWhile(self):
     ops.reset_default_graph()
-
-    def write():
-      size = 10000
-      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=size)
-      ta = control_flow_ops.while_loop(
-          lambda i, _: i < size,
-          lambda i, ta: (i + 1, ta.write(i, 0.)), [0, ta],
-          parallel_iterations=1)[1]
-      return ta.stack()
-
-    op = write()
+    op = self._tensorArrayWriteInWhile()
     self.run_op_benchmark(session_lib.Session(), op)
 
   def benchmarkWriteInWhile(self):
@@ -1762,6 +1762,18 @@
   def benchmarkWriteInWhileWithControlFlowV2(self):
     self._benchmarkWriteInWhile()
 
+  def benchmarkWriteInDatasetMapFn(self):
+    ds = dataset_ops.Dataset.from_tensors(array_ops.zeros([10])).repeat()
+    ds = ds.map(lambda _: self._tensorArrayWriteInWhile())
+    op = ds.make_one_shot_iterator().get_next()
+    self.run_op_benchmark(session_lib.Session(), op)
+
+  def benchmarkWriteInDatasetParallelMapFn(self):
+    ds = dataset_ops.Dataset.from_tensors(array_ops.zeros([10])).repeat()
+    ds = ds.map(lambda _: self._tensorArrayWriteInWhile(), num_parallel_calls=2)
+    op = ds.make_one_shot_iterator().get_next()
+    self.run_op_benchmark(session_lib.Session(), op)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/tridiagonal_matmul_op_test.py b/tensorflow/python/kernel_tests/tridiagonal_matmul_op_test.py
new file mode 100644
index 0000000..648b6fc
--- /dev/null
+++ b/tensorflow/python/kernel_tests/tridiagonal_matmul_op_test.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.linalg.linalg_impl.tridiagonal_matmul."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+
+def _tfconst(array):
+  return constant_op.constant(array, dtypes.float64)
+
+
+class TridiagonalMulOpTest(test.TestCase):
+
+  def _testAllFormats(self,
+                      superdiag,
+                      maindiag,
+                      subdiag,
+                      rhs,
+                      expected,
+                      dtype=dtypes.float64):
+    superdiag_extended = np.pad(superdiag, [0, 1], 'constant')
+    subdiag_extended = np.pad(subdiag, [1, 0], 'constant')
+    diags_compact = np.stack([superdiag_extended, maindiag, subdiag_extended])
+    diags_matrix = np.diag(superdiag, 1) + np.diag(maindiag, 0) + np.diag(
+        subdiag, -1)
+
+    diags_sequence = (constant_op.constant(superdiag_extended, dtype),
+                      constant_op.constant(maindiag, dtype),
+                      constant_op.constant(subdiag_extended, dtype))
+    diags_compact = constant_op.constant(diags_compact, dtype)
+    diags_matrix = constant_op.constant(diags_matrix, dtype)
+    rhs = constant_op.constant(rhs, dtype)
+
+    rhs_batch = array_ops.stack([rhs, 2 * rhs])
+    diags_compact_batch = array_ops.stack([diags_compact, 2 * diags_compact])
+    diags_matrix_batch = array_ops.stack([diags_matrix, 2 * diags_matrix])
+    diags_sequence_batch = [array_ops.stack([x, 2 * x]) for x in diags_sequence]
+
+    results = [
+        linalg_impl.tridiagonal_matmul(
+            diags_sequence, rhs, diagonals_format='sequence'),
+        linalg_impl.tridiagonal_matmul(
+            diags_compact, rhs, diagonals_format='compact'),
+        linalg_impl.tridiagonal_matmul(
+            diags_matrix, rhs, diagonals_format='matrix')
+    ]
+    results_batch = [
+        linalg_impl.tridiagonal_matmul(
+            diags_sequence_batch, rhs_batch, diagonals_format='sequence'),
+        linalg_impl.tridiagonal_matmul(
+            diags_compact_batch, rhs_batch, diagonals_format='compact'),
+        linalg_impl.tridiagonal_matmul(
+            diags_matrix_batch, rhs_batch, diagonals_format='matrix')
+    ]
+
+    with self.cached_session(use_gpu=False):
+      results = self.evaluate(results)
+      results_batch = self.evaluate(results_batch)
+
+    expected = np.array(expected)
+    expected_batch = np.stack([expected, 4 * expected])
+    for result in results:
+      self.assertAllClose(result, expected)
+    for result in results_batch:
+      self.assertAllClose(result, expected_batch)
+
+  def test1x1(self):
+    self._testAllFormats([], [2], [], [[1, 4]], [[2, 8]])
+
+  def test2x2(self):
+    self._testAllFormats([1], [2, 3], [4], [[2, 1], [4, 3]], [[8, 5], [20, 13]])
+
+  def test3x3(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      self._testAllFormats([1, 2], [1, 2, 1], [2, 1], [[1, 1], [2, 2], [3, 3]],
+                           [[3, 3], [12, 12], [5, 5]],
+                           dtype=dtype)
+
+  def testComplex(self):
+    for dtype in [dtypes.complex64, dtypes.complex128]:
+      self._testAllFormats([1j, 1j], [1, -1, 0], [1j, 1j],
+                           np.array([[1, 1j], [1, 1j], [1, 1j]]),
+                           [[1 + 1j, -1 + 1j], [-1 + 2j, -2 - 1j], [1j, -1]],
+                           dtype=dtype)
+
+  # Benchmark
+
+  class TridiagonalMulBenchmark(test.Benchmark):
+    sizes = [(1, 1000000), (1000000, 1), (1000, 1000), (10000, 10000)]
+
+    def baseline(self, upper, diag, lower, vec):
+      diag_part = diag * vec
+      lower_part = array_ops.pad(lower * vec[:, :-1], [[0, 0], [1, 0]])
+      upper_part = array_ops.pad(upper * vec[:, 1:], [[0, 0], [0, 1]])
+      return lower_part + diag_part + upper_part
+
+    def _generateData(self, batch_size, matrix_size, seed=42):
+      np.random.seed(seed)
+      data = np.random.normal(size=(batch_size, matrix_size, 4))
+      upper = data[:, 1:, 0]
+      diag = data[:, :, 1]
+      lower = data[:, 1:, 2]
+      vec = data[:, :, 3]
+
+      return (ops.convert_to_tensor(upper, dtype=dtypes.float64),
+              ops.convert_to_tensor(diag, dtype=dtypes.float64),
+              ops.convert_to_tensor(lower, dtype=dtypes.float64),
+              ops.convert_to_tensor(vec, dtype=dtypes.float64))
+
+    def benchmarkTridiagonalMulOp(self):
+      devices = [('/cpu:0', 'cpu')]
+
+      for device_id, device_name in devices:
+        for batch_size, matrix_size in self.sizes:
+          with ops.Graph().as_default(), \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
+              ops.device(device_id):
+            upper, diag, lower, vec = self._generateData(
+                batch_size, matrix_size)
+            x1 = self.baseline(upper, diag, lower, vec)
+            x2 = linalg_impl.tridiagonal_matmul((upper, diag, lower), vec)
+            variables.global_variables_initializer().run()
+            self.run_op_benchmark(
+                sess,
+                control_flow_ops.group(x1),
+                min_iters=10,
+                store_memory_usage=False,
+                name=('tridiagonal_matmul_baseline_%s'
+                      '_batch_size_%d_matrix_size_%d' %
+                      (device_name, batch_size, matrix_size)))
+
+            self.run_op_benchmark(
+                sess,
+                control_flow_ops.group(x2),
+                min_iters=10,
+                store_memory_usage=False,
+                name=('tridiagonal_matmul_%s_batch_size_%d_matrix_size_%d' %
+                      (device_name, batch_size, matrix_size)))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
index aea924c..a7a638b 100644
--- a/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
@@ -18,16 +18,20 @@
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import numpy as np
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg_impl
 from tensorflow.python.platform import benchmark
@@ -37,6 +41,23 @@
 _sample_rhs = np.array([1, 2, 3, 4])
 _sample_result = np.array([-9, 5, -4, 4])
 
+# Flag indicating that a test should be run only with partial_pivoting=True.
+FLAG_REQUIRES_PIVOTING = "FLAG_REQUIRES_PIVOT"
+
+# Flag indicating that a test shouldn't be parameterized by different values of
+# partial_pivoting, etc.
+FLAG_NO_PARAMETERIZATION = "FLAG_NO_PARAMETERIZATION"
+
+
+def flags(*args):
+
+  def decorator(f):
+    for flag in args:
+      setattr(f, flag, True)
+    return f
+
+  return decorator
+
 
 def _tfconst(array):
   return constant_op.constant(array, dtypes.float64)
@@ -141,6 +162,7 @@
 
   # Other edge cases
 
+  @flags(FLAG_REQUIRES_PIVOTING)
   def testCaseRequiringPivoting(self):
     # Without partial pivoting (e.g. Thomas algorithm) this would fail.
     self._testWithLists(
@@ -148,6 +170,7 @@
         rhs=[1, 2, 3, 4],
         expected=[8, -3.5, 0, -4])
 
+  @flags(FLAG_REQUIRES_PIVOTING)
   def testCaseRequiringPivotingLastRows(self):
     self._testWithLists(
         diags=[[2, 1, -1, 0], [1, -1, 2, 1], [0, 1, -6, 1]],
@@ -156,7 +179,7 @@
 
   def testNotInvertible(self):
     if test.is_gpu_available(cuda_only=True):
-      # CuSparse gtsv routine doesn't raise errors for non-invertible
+      # CuSparse gtsv routines don't raise errors for non-invertible
       # matrices.
       return
     with self.assertRaises(errors_impl.InvalidArgumentError):
@@ -197,6 +220,16 @@
         rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
         expected=np.array([_sample_result, -2 * _sample_result]))
 
+  def testWithTwoBatchingDimensions(self):
+    self._testWithLists(
+        diags=np.array([[_sample_diags, -_sample_diags, _sample_diags],
+                        [-_sample_diags, _sample_diags, -_sample_diags]]),
+        rhs=np.array([[_sample_rhs, 2 * _sample_rhs, 3 * _sample_rhs],
+                      [4 * _sample_rhs, 5 * _sample_rhs, 6 * _sample_rhs]]),
+        expected=np.array(
+            [[_sample_result, -2 * _sample_result, 3 * _sample_result],
+             [-4 * _sample_result, 5 * _sample_result, -6 * _sample_result]]))
+
   def testBatchingAndTwoRightHandSides(self):
     rhs = np.transpose([_sample_rhs, 2 * _sample_rhs])
     expected_result = np.transpose([_sample_result, 2 * _sample_result])
@@ -319,8 +352,131 @@
         expected=np.array([_sample_result, -2 * _sample_result]),
         transpose_rhs=True)
 
+  # Gradient tests
+
+  def _gradientTest(
+      self,
+      diags,
+      rhs,
+      y,  # output = reduce_sum(y * tridiag_solve(diags, rhs))
+      expected_grad_diags,  # expected gradient of output w.r.t. diags
+      expected_grad_rhs,  # expected gradient of output w.r.t. rhs
+      diags_format="compact",
+      transpose_rhs=False,
+      conjugate_rhs=False,
+      feed_dict=None):
+    expected_grad_diags = _tfconst(expected_grad_diags)
+    expected_grad_rhs = _tfconst(expected_grad_rhs)
+    with backprop.GradientTape() as tape_diags:
+      with backprop.GradientTape() as tape_rhs:
+        tape_diags.watch(diags)
+        tape_rhs.watch(rhs)
+        x = linalg_impl.tridiagonal_solve(
+            diags,
+            rhs,
+            diagonals_format=diags_format,
+            transpose_rhs=transpose_rhs,
+            conjugate_rhs=conjugate_rhs)
+        res = math_ops.reduce_sum(x * y)
+    with self.cached_session(use_gpu=True) as sess:
+      actual_grad_diags = sess.run(
+          tape_diags.gradient(res, diags), feed_dict=feed_dict)
+      actual_rhs_diags = sess.run(
+          tape_rhs.gradient(res, rhs), feed_dict=feed_dict)
+    self.assertAllClose(expected_grad_diags, actual_grad_diags)
+    self.assertAllClose(expected_grad_rhs, actual_rhs_diags)
+
+  def _gradientTestWithLists(self,
+                             diags,
+                             rhs,
+                             y,
+                             expected_grad_diags,
+                             expected_grad_rhs,
+                             diags_format="compact",
+                             transpose_rhs=False,
+                             conjugate_rhs=False):
+    self._gradientTest(
+        _tfconst(diags), _tfconst(rhs), _tfconst(y), expected_grad_diags,
+        expected_grad_rhs, diags_format, transpose_rhs, conjugate_rhs)
+
+  def testGradientSimple(self):
+    self._gradientTestWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs,
+        y=[1, 3, 2, 4],
+        expected_grad_diags=[[-5, 0, 4, 0], [9, 0, -4, -16], [0, 0, 5, 16]],
+        expected_grad_rhs=[1, 0, -1, 4])
+
+  def testGradientWithMultipleRhs(self):
+    self._gradientTestWithLists(
+        diags=_sample_diags,
+        rhs=[[1, 2], [2, 4], [3, 6], [4, 8]],
+        y=[[1, 5], [2, 6], [3, 7], [4, 8]],
+        expected_grad_diags=([[-20, 28, -60, 0], [36, -35, 60, 80],
+                              [0, 63, -75, -80]]),
+        expected_grad_rhs=[[0, 2], [1, 3], [1, 7], [0, -10]])
+
+  def _makeDataForGradientWithBatching(self):
+    y = np.array([1, 3, 2, 4])
+    grad_diags = np.array([[-5, 0, 4, 0], [9, 0, -4, -16], [0, 0, 5, 16]])
+    grad_rhs = np.array([1, 0, -1, 4])
+
+    diags_batched = np.array(
+        [[_sample_diags, 2 * _sample_diags, 3 * _sample_diags],
+         [4 * _sample_diags, 5 * _sample_diags, 6 * _sample_diags]])
+    rhs_batched = np.array([[_sample_rhs, -_sample_rhs, _sample_rhs],
+                            [-_sample_rhs, _sample_rhs, -_sample_rhs]])
+    y_batched = np.array([[y, y, y], [y, y, y]])
+    expected_grad_diags_batched = np.array(
+        [[grad_diags, -grad_diags / 4, grad_diags / 9],
+         [-grad_diags / 16, grad_diags / 25, -grad_diags / 36]])
+    expected_grad_rhs_batched = np.array(
+        [[grad_rhs, grad_rhs / 2, grad_rhs / 3],
+         [grad_rhs / 4, grad_rhs / 5, grad_rhs / 6]])
+
+    return (y_batched, diags_batched, rhs_batched, expected_grad_diags_batched,
+            expected_grad_rhs_batched)
+
+  def testGradientWithBatchDims(self):
+    y, diags, rhs, expected_grad_diags, expected_grad_rhs = \
+      self._makeDataForGradientWithBatching()
+
+    self._gradientTestWithLists(
+        diags=diags,
+        rhs=rhs,
+        y=y,
+        expected_grad_diags=expected_grad_diags,
+        expected_grad_rhs=expected_grad_rhs)
+
+  @test_util.run_deprecated_v1
+  def testGradientWithUnknownShapes(self):
+
+    def placeholder(rank):
+      return array_ops.placeholder(
+          dtypes.float64, shape=(None for _ in range(rank)))
+
+    y, diags, rhs, expected_grad_diags, expected_grad_rhs = \
+      self._makeDataForGradientWithBatching()
+
+    diags_placeholder = placeholder(rank=4)
+    rhs_placeholder = placeholder(rank=3)
+    y_placeholder = placeholder(rank=3)
+
+    self._gradientTest(
+        diags=diags_placeholder,
+        rhs=rhs_placeholder,
+        y=y_placeholder,
+        expected_grad_diags=expected_grad_diags,
+        expected_grad_rhs=expected_grad_rhs,
+        feed_dict={
+            diags_placeholder: diags,
+            rhs_placeholder: rhs,
+            y_placeholder: y
+        })
+
   # Invalid input shapes
 
+  @flags(FLAG_NO_PARAMETERIZATION)
   def testInvalidShapesCompactFormat(self):
 
     def test_raises(diags_shape, rhs_shape):
@@ -331,6 +487,7 @@
     test_raises((5, 3, 4), (5))
     test_raises((5), (5, 4))
 
+  @flags(FLAG_NO_PARAMETERIZATION)
   def testInvalidShapesSequenceFormat(self):
 
     def test_raises(diags_tuple_shapes, rhs_shape):
@@ -344,6 +501,7 @@
     test_raises(((5, 4), (7, 4), (5, 4)), (5, 4))
     test_raises(((5, 4), (7, 4), (5, 4)), (3, 4))
 
+  @flags(FLAG_NO_PARAMETERIZATION)
   def testInvalidShapesMatrixFormat(self):
 
     def test_raises(diags_shape, rhs_shape):
@@ -366,11 +524,13 @@
       return
     diags = array_ops.placeholder(dtypes.float64, shape=diags_shape)
     rhs = array_ops.placeholder(dtypes.float64, shape=rhs_shape)
-    x = linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+    x = linalg_impl.tridiagonal_solve(
+        diags, rhs, diags_format, partial_pivoting=self.pivoting)
     with self.cached_session(use_gpu=True) as sess:
       result = sess.run(x, feed_dict={diags: diags_feed, rhs: rhs_feed})
       self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testCompactFormatAllDimsUnknown(self):
     self._testWithPlaceholders(
         diags_shape=[None, None],
@@ -379,6 +539,7 @@
         rhs_feed=_sample_rhs,
         expected=_sample_result)
 
+  @test_util.run_deprecated_v1
   def testCompactFormatUnknownMatrixSize(self):
     self._testWithPlaceholders(
         diags_shape=[3, None],
@@ -387,6 +548,7 @@
         rhs_feed=_sample_rhs,
         expected=_sample_result)
 
+  @test_util.run_deprecated_v1
   def testCompactFormatUnknownRhsCount(self):
     self._testWithPlaceholders(
         diags_shape=[3, 4],
@@ -395,6 +557,7 @@
         rhs_feed=np.transpose([_sample_rhs, 2 * _sample_rhs]),
         expected=np.transpose([_sample_result, 2 * _sample_result]))
 
+  @test_util.run_deprecated_v1
   def testCompactFormatUnknownBatchSize(self):
     self._testWithPlaceholders(
         diags_shape=[None, 3, 4],
@@ -403,6 +566,7 @@
         rhs_feed=np.array([_sample_rhs, 2 * _sample_rhs]),
         expected=np.array([_sample_result, -2 * _sample_result]))
 
+  @test_util.run_deprecated_v1
   def testMatrixFormatWithUnknownDims(self):
     if context.executing_eagerly():
       return
@@ -426,6 +590,7 @@
     with self.assertRaises(ValueError):
       test_with_matrix_shapes(matrix_shape=[None, None])
 
+  @test_util.run_deprecated_v1
   def testSequenceFormatWithUnknownDims(self):
     if context.executing_eagerly():
       return
@@ -436,7 +601,8 @@
 
     x = linalg_impl.tridiagonal_solve((superdiag, diag, subdiag),
                                       rhs,
-                                      diagonals_format="sequence")
+                                      diagonals_format="sequence",
+                                      partial_pivoting=self.pivoting)
     with self.cached_session(use_gpu=True) as sess:
       result = sess.run(
           x,
@@ -455,6 +621,8 @@
              (100000, 100, 1), (10000, 1, 100), (10000, 1, 1000),
              (10000, 1, 10000)]
 
+    pivoting_options = [(True, "pivoting"), (False, "no_pivoting")]
+
     def _generateData(self, matrix_size, batch_size, num_rhs, seed=42):
       np.random.seed(seed)
       data = np.random.normal(size=(batch_size, matrix_size, 3 + num_rhs))
@@ -468,23 +636,53 @@
       if test.is_gpu_available(cuda_only=True):
         devices += [("/gpu:0", "gpu")]
 
-      for device_id, device_name in devices:
-        for matrix_size, batch_size, num_rhs in self.sizes:
-          with ops.Graph().as_default(), \
-              session.Session(config=benchmark.benchmark_config()) as sess, \
-              ops.device(device_id):
-            diags, rhs = self._generateData(matrix_size, batch_size, num_rhs)
-            x = linalg_impl.tridiagonal_solve(diags, rhs)
-            variables.global_variables_initializer().run()
-            self.run_op_benchmark(
-                sess,
-                control_flow_ops.group(x),
-                min_iters=10,
-                store_memory_usage=False,
-                name=("tridiagonal_solve_{}_matrix_size_{}_batch_size_{}_"
-                      "num_rhs_{}").format(device_name, matrix_size, batch_size,
-                                           num_rhs))
+      for device_option, pivoting_option, size_option in \
+          itertools.product(devices, self.pivoting_options, self.sizes):
+
+        device_id, device_name = device_option
+        pivoting, pivoting_name = pivoting_option
+        matrix_size, batch_size, num_rhs = size_option
+
+        with ops.Graph().as_default(), \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
+            ops.device(device_id):
+          diags, rhs = self._generateData(matrix_size, batch_size, num_rhs)
+          x = linalg_impl.tridiagonal_solve(
+              diags, rhs, partial_pivoting=pivoting)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(x),
+              min_iters=10,
+              store_memory_usage=False,
+              name=("tridiagonal_solve_{}_matrix_size_{}_batch_size_{}_"
+                    "num_rhs_{}_{}").format(device_name, matrix_size,
+                                            batch_size, num_rhs, pivoting_name))
 
 
 if __name__ == "__main__":
+  for name, fun in dict(TridiagonalSolveOpTest.__dict__).items():
+    if not name.startswith("test"):
+      continue
+    if hasattr(fun, FLAG_NO_PARAMETERIZATION):
+      continue
+
+    # Replace testFoo with testFoo_pivoting and testFoo_noPivoting, setting
+    # self.pivoting to the corresponding value.
+    delattr(TridiagonalSolveOpTest, name)
+
+    def decor(test_fun, pivoting):
+
+      def wrapped(instance):
+        instance.pivoting = pivoting
+        test_fun(instance)
+
+      return wrapped
+
+    setattr(TridiagonalSolveOpTest, name + "_pivoting",
+            decor(fun, pivoting=True))
+    if not hasattr(fun, FLAG_REQUIRES_PIVOTING):
+      setattr(TridiagonalSolveOpTest, name + "_noPivoting",
+              decor(fun, pivoting=False))
+
   test.main()
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 98a47b4..22c7cb1 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -260,7 +260,7 @@
       op_names = [
           op.op_def.name for op in sess.graph.get_operations() if op.op_def
       ]
-      if compat.forward_compatible(2019, 4, 18):
+      if compat.forward_compatible(2019, 4, 25):
         self.assertIn("BatchMatMulV2", op_names)
       else:
         self.assertIn("BatchMatMul", op_names)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index d4e8a8b..5071fda 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -27,6 +27,7 @@
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
@@ -214,7 +215,6 @@
     else:
       self._keras_style = False
 
-    self._graph = None
     self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
       with vs.variable_scope(scope) as captured_scope:
@@ -223,11 +223,17 @@
       self._scope = None
     self._current_scope = None
 
+  # We no longer track graph in tf.layers layers. This property is only kept to
+  # maintain API backward compatibility.
   @property
+  @deprecation.deprecated(
+      date=None,
+      instructions='Stop using this property because tf.layers layers no '
+      'longer track their graph.')
   def graph(self):
     if context.executing_eagerly():
       raise RuntimeError('Layer.graph not supported when executing eagerly.')
-    return self._graph
+    return None
 
   def _init_set_name(self, name):
     # Determine layer name (non-unique).
@@ -366,7 +372,7 @@
           dtype=dtype,
           initializer=initializer,
           regularizer=regularizer,
-          trainable=trainable,
+          trainable=trainable and self.trainable,
           constraint=constraint,
           use_resource=use_resource,
           synchronization=vs.VariableSynchronization.AUTO,
@@ -433,7 +439,7 @@
             shape,
             dtype=dtypes.as_dtype(dtype),
             initializer=initializer,
-            trainable=trainable,
+            trainable=trainable and self.trainable,
             constraint=constraint,
             partitioner=partitioner,
             use_resource=use_resource,
@@ -498,14 +504,6 @@
 
     self._set_scope(scope)
 
-    if not context.executing_eagerly():
-      try:
-        # Set layer's "graph" at build time
-        self._graph = ops._get_graph_from_inputs(nest.flatten(inputs),  # pylint: disable=protected-access
-                                                 graph=self._graph)
-      except ValueError as e:
-        raise ValueError('Input graph and Layer graph are not the same: %s' % e)
-
     if self.built:
       try:
         # Some classes which inherit from Layer do not use its constructor, so
@@ -563,6 +561,11 @@
     # By-pass the automatic dependency tracking performed by the parent Layer.
     super(trackable.Trackable, self).__setattr__(value, name)
 
+  @property
+  def _is_legacy_layer(self):
+    """Used by keras to check compatibility. This should not be overridden."""
+    return True
+
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index a55751f..3dd09a0 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -210,7 +210,6 @@
     layer_copy = copy.deepcopy(layer)
     self.assertEqual(layer_copy.name, layer.name)
     self.assertEqual(layer_copy._scope.name, layer._scope.name)
-    self.assertEqual(layer_copy._graph, layer._graph)
     self.assertEqual(layer_copy._private_tensor, layer._private_tensor)
 
   @test_util.run_in_graph_and_eager_modes
@@ -639,16 +638,5 @@
     self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
     self.assertEqual(len(layer.get_losses_for([outputs])), 0)
 
-  def testLayerGraphSetInFirstApply(self):
-    with ops.Graph().as_default():
-      # Graph at construction time is ignored
-      layer = core_layers.Dense(1)
-    with ops.Graph().as_default():
-      layer.apply(constant_op.constant([[1.]]))
-      # layer is now bound to second Graph
-    with ops.Graph().as_default(), self.assertRaisesRegexp(
-        ValueError, 'Input graph and Layer graph are not the same'):
-      layer.apply(constant_op.constant([[1.]]))
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index d123afc..2dbdc09 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -59,8 +59,7 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.average_pooling1d instead.')
+    date=None, instructions='Use keras.layers.AveragePooling1D instead.')
 @tf_export(v1=['layers.average_pooling1d'])
 def average_pooling1d(inputs, pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -130,8 +129,7 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.max_pooling1d instead.')
+    date=None, instructions='Use keras.layers.MaxPooling1D instead.')
 @tf_export(v1=['layers.max_pooling1d'])
 def max_pooling1d(inputs, pool_size, strides,
                   padding='valid', data_format='channels_last',
@@ -201,8 +199,7 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.average_pooling2d instead.')
+    date=None, instructions='Use keras.layers.AveragePooling2D instead.')
 @tf_export(v1=['layers.average_pooling2d'])
 def average_pooling2d(inputs,
                       pool_size, strides,
@@ -275,8 +272,7 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.max_pooling2d instead.')
+    date=None, instructions='Use keras.layers.MaxPooling2D instead.')
 @tf_export(v1=['layers.max_pooling2d'])
 def max_pooling2d(inputs,
                   pool_size, strides,
@@ -351,8 +347,7 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.average_pooling3d instead.')
+    date=None, instructions='Use keras.layers.AveragePooling3D instead.')
 @tf_export(v1=['layers.average_pooling3d'])
 def average_pooling3d(inputs,
                       pool_size, strides,
@@ -429,33 +424,30 @@
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.max_pooling3d instead.')
+    date=None, instructions='Use keras.layers.MaxPooling3D instead.')
 @tf_export(v1=['layers.max_pooling3d'])
 def max_pooling3d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
                   name=None):
-  """Max pooling layer for 3D inputs (e.g. volumes).
+  """Max pooling layer for 3D inputs (e.g.
+
+  volumes).
 
   Arguments:
     inputs: The tensor over which to pool. Must have rank 5.
-    pool_size: An integer or tuple/list of 3 integers:
-      (pool_depth, pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
+    pool_size: An integer or tuple/list of 3 integers: (pool_depth, pool_height,
+      pool_width) specifying the size of the pooling window. Can be a single
+      integer to specify the same value for all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers, specifying the strides of
+      the pooling operation. Can be a single integer to specify the same value
+      for all spatial dimensions.
     padding: A string. The padding method, either 'valid' or 'same'.
       Case-insensitive.
     data_format: A string. The ordering of the dimensions in the inputs.
       `channels_last` (default) and `channels_first` are supported.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
+      `channels_last` corresponds to inputs with shape `(batch, depth, height,
+      width, channels)` while `channels_first` corresponds to inputs with shape
       `(batch, channels, depth, height, width)`.
     name: A string, the name of the layer.
 
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index 7e94dda..a0cd66a 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -18,7 +18,10 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -88,5 +91,51 @@
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 
+class ConstantValueTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testConstantValue(self):
+    f1 = lambda: constant_op.constant(5)
+    f2 = lambda: constant_op.constant(32)
+
+    # Boolean pred
+    self.assertEqual(5, utils.constant_value(utils.smart_cond(True, f1, f2)))
+    self.assertEqual(32, utils.constant_value(utils.smart_cond(False, f1, f2)))
+
+    # Integer pred
+    self.assertEqual(5, utils.constant_value(utils.smart_cond(1, f1, f2)))
+    self.assertEqual(32, utils.constant_value(utils.smart_cond(0, f1, f2)))
+
+    # Unknown pred
+    pred = array_ops.placeholder_with_default(True, shape=())
+    self.assertIsNone(utils.constant_value(utils.smart_cond(pred, f1, f2)))
+
+    # Error case
+    with self.assertRaises(TypeError):
+      utils.constant_value(5)
+
+
+class GetReachableFromInputsTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testGetReachableFromInputs(self):
+
+    pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+    pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+    pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+    x_1 = pl_1 + pl_2
+    x_2 = pl_2 * 2
+    x_3 = pl_3 + 1
+    x_4 = x_1 + x_2
+    x_5 = x_3 * pl_1
+
+    self.assertEqual({pl_1, x_1, x_4, x_5},
+                     utils.get_reachable_from_inputs([pl_1]))
+    self.assertEqual({pl_1, pl_2, x_1, x_2, x_4, x_5},
+                     utils.get_reachable_from_inputs([pl_1, pl_2]))
+    self.assertEqual({pl_3, x_3, x_5}, utils.get_reachable_from_inputs([pl_3]))
+    self.assertEqual({x_3, x_5}, utils.get_reachable_from_inputs([x_3]))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 0d58385..bf584bc 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -204,20 +204,21 @@
     return s;
   }
 
-  PyObject* np_array =
-      PyArray_SimpleNewFromData(dim_size, dims, type_num, data);
+  auto* np_array = reinterpret_cast<PyArrayObject*>(
+      PyArray_SimpleNewFromData(dim_size, dims, type_num, data));
+  PyArray_CLEARFLAGS(np_array, NPY_ARRAY_OWNDATA);
   if (PyType_Ready(&TensorReleaserType) == -1) {
     return errors::Unknown("Python type initialization failed.");
   }
-  TensorReleaser* releaser = reinterpret_cast<TensorReleaser*>(
+  auto* releaser = reinterpret_cast<TensorReleaser*>(
       TensorReleaserType.tp_alloc(&TensorReleaserType, 0));
   releaser->destructor = new std::function<void()>(std::move(destructor));
-  if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(np_array),
-                            reinterpret_cast<PyObject*>(releaser)) == -1) {
+  if (PyArray_SetBaseObject(np_array, reinterpret_cast<PyObject*>(releaser)) ==
+      -1) {
     Py_DECREF(releaser);
     return errors::Unknown("Python array refused to use memory.");
   }
-  *result = PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+  *result = PyArray_Return(np_array);
   return Status::OK();
 }
 
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 354949c..8f66a8a 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -316,6 +316,31 @@
 
 DEFINE_HELPER(ConvertInt64, int64, DT_INT64, ConvertOneInt64);
 
+const char* ConvertOneUint64(PyObject* v, uint64* out) {
+#if PY_MAJOR_VERSION < 3
+  if (TF_PREDICT_TRUE(PyInt_Check(v))) {
+    *out = PyInt_AsUnsignedLongLongMask(v);
+    return nullptr;
+  }
+#endif
+  if (TF_PREDICT_TRUE(PyLong_Check(v) || IsPyDimension(v))) {
+    *out = PyLong_AsUnsignedLongLong(v);
+    return nullptr;
+  }
+  if (PyIsInstance(v, &PyIntegerArrType_Type)) {  // NumPy integers
+#if PY_MAJOR_VERSION < 3
+    Safe_PyObjectPtr as_int = make_safe(PyNumber_Int(v));
+#else
+    Safe_PyObjectPtr as_int = make_safe(PyNumber_Long(v));
+#endif
+    return ConvertOneUint64(as_int.get(), out);
+  }
+  if (IsPyFloat(v)) return ErrorFoundFloat;
+  return ErrorMixedTypes;
+}
+
+DEFINE_HELPER(ConvertUint64, uint64, DT_UINT64, ConvertOneUint64);
+
 const char* ConvertOneInt32(PyObject* v, int32* out) {
   int64 i;
 #if PY_MAJOR_VERSION < 3
@@ -522,6 +547,10 @@
       if (ConvertInt32(obj, shape, ret) == nullptr) return Status::OK();
       break;
 
+    case DT_UINT64:
+      if (ConvertUint64(obj, shape, ret) == nullptr) return Status::OK();
+      break;
+
     case DT_COMPLEX128:
       if (ConvertComplex(obj, shape, ret) == nullptr) return Status::OK();
       break;
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 7139277..9ac66af 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -128,8 +128,7 @@
         pywrap_tensorflow.ReadFromStream(self._read_buf, length))
 
   @deprecation.deprecated_args(
-      None,
-      "position is deprecated in favor of the offset argument.",
+      None, "position is deprecated in favor of the offset argument.",
       "position")
   def seek(self, offset=None, whence=0, position=None):
     # TODO(jhseu): Delete later. Used to omit `position` from docs.
@@ -169,8 +168,8 @@
       else:
         raise errors.InvalidArgumentError(
             None, None,
-            "Invalid whence argument: {}. Valid values are 0, 1, or 2."
-            .format(whence))
+            "Invalid whence argument: {}. Valid values are 0, 1, or 2.".format(
+                whence))
       ret_status = self._read_buf.Seek(offset)
       pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)
 
@@ -241,7 +240,6 @@
         pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)
     self._writable_file = None
 
-  @property
   def seekable(self):
     """Returns True as FileIO supports random access ops of seek()/tell()"""
     return True
@@ -319,7 +317,7 @@
   Args:
     filename: string, path to a file
     binary_mode: whether to open the file in binary mode or not. This changes
-        the type of the object returned.
+      the type of the object returned.
 
   Returns:
     contents of the file as a string or bytes.
@@ -401,10 +399,8 @@
 
   Args:
     dirname: string, name of the directory to be created
-
-  Notes:
-    The parent directories need to exist. Use recursive_create_dir instead if
-    there is the possibility that the parent dirs don't exist.
+  Notes: The parent directories need to exist. Use recursive_create_dir instead
+    if there is the possibility that the parent dirs don't exist.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -418,10 +414,8 @@
 
   Args:
     path: string, name of the directory to be created
-
-  Notes:
-    The parent directories need to exist. Use recursive_create_dir instead if
-    there is the possibility that the parent dirs don't exist.
+  Notes: The parent directories need to exist. Use recursive_create_dir instead
+    if there is the possibility that the parent dirs don't exist.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -461,13 +455,13 @@
 
 @tf_export(v1=["gfile.Copy"])
 def copy(oldpath, newpath, overwrite=False):
-  """Copies data from oldpath to newpath.
+  """Copies data from `oldpath` to `newpath`.
 
   Args:
     oldpath: string, name of the file who's contents need to be copied
     newpath: string, name of the file to which to copy to
-    overwrite: boolean, if false its an error for newpath to be occupied by an
-        existing file.
+    overwrite: boolean, if false it's an error for `newpath` to be occupied by
+      an existing file.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -477,13 +471,13 @@
 
 @tf_export("io.gfile.copy")
 def copy_v2(src, dst, overwrite=False):
-  """Copies data from src to dst.
+  """Copies data from `src` to `dst`.
 
   Args:
     src: string, name of the file whose contents need to be copied
     dst: string, name of the file to which to copy to
-    overwrite: boolean, if false its an error for newpath to be occupied by an
-        existing file.
+    overwrite: boolean, if false it's an error for `dst` to be occupied by an
+      existing file.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -500,7 +494,7 @@
     oldname: string, pathname for a file
     newname: string, pathname to which the file needs to be moved
     overwrite: boolean, if false it's an error for `newname` to be occupied by
-        an existing file.
+      an existing file.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -515,8 +509,8 @@
   Args:
     src: string, pathname for a file
     dst: string, pathname to which the file needs to be moved
-    overwrite: boolean, if false it's an error for `dst` to be occupied by
-        an existing file.
+    overwrite: boolean, if false it's an error for `dst` to be occupied by an
+      existing file.
 
   Raises:
     errors.OpError: If the operation fails.
@@ -538,7 +532,7 @@
     filename: string, pathname for a file
     contents: string, contents that need to be written to the file
     overwrite: boolean, if false it's an error for `filename` to be occupied by
-        an existing file.
+      an existing file.
   """
   temp_pathname = filename + ".tmp" + uuid.uuid4().hex
   write_string_to_file(temp_pathname, contents)
@@ -657,9 +651,8 @@
 
   Args:
     top: string, a Directory name
-    in_order: bool, Traverse in order if True, post order if False.
-
-  Errors that happen while listing directories are ignored.
+    in_order: bool, Traverse in order if True, post order if False.  Errors that
+      happen while listing directories are ignored.
 
   Yields:
     Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
@@ -679,8 +672,7 @@
     topdown: bool, Traverse pre order if True, post order if False.
     onerror: optional handler for errors. Should be a function, it will be
       called with the error as argument. Rethrowing the error aborts the walk.
-
-  Errors that happen while listing directories are ignored.
+      Errors that happen while listing directories are ignored.
 
   Yields:
     Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index f53c5fa..5f01258 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -20,9 +20,11 @@
 from __future__ import print_function
 
 import os.path
+import numpy as np
 
 from tensorflow.python.framework import errors
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
@@ -606,6 +608,14 @@
     # Change noread back so that it could be cleaned during tearDown.
     os.chmod(noread_path, 0o777)
 
+  def testFileSeekableWithZip(self):
+    # Note: Test case for GitHub issue 27276, issue only exposed in python 3.7+.
+    filename = os.path.join(self._base_dir, "a.npz")
+    np.savez_compressed(filename, {"a": 1, "b": 2})
+    with gfile.GFile(filename, "rb") as f:
+      info = np.load(f, allow_pickle=True)
+    _ = [i for i in info.items()]
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py
index 0afc5f0..f6ee7f9 100644
--- a/tensorflow/python/module/module_test.py
+++ b/tensorflow/python/module/module_test.py
@@ -501,7 +501,6 @@
                                   layers.Dense(2)])
 
     self.assertEqual(m.submodules, (m.layers[0], m.layers[1]))
-    self.assertEmpty(m.variables)
     m(layers.Input((1,)))
     self.assertLen(m.variables, 4)
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 34fc5dd..d587a46 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -130,9 +130,8 @@
 
   Args:
     input: A `Tensor`.
-    axis: 0-D (scalar). Specifies the dimension index at which to
-      expand the shape of `input`. Must be in the range
-      `[-rank(input) - 1, rank(input)]`.
+    axis: 0-D (scalar). Specifies the dimension index at which to expand the
+      shape of `input`. Must be in the range `[-rank(input) - 1, rank(input)]`.
     name: The name of the output `Tensor` (optional).
     dim: 0-D (scalar). Equivalent to `axis`, to be deprecated.
 
@@ -187,9 +186,8 @@
 
   Args:
     input: A `Tensor`.
-    axis: 0-D (scalar). Specifies the dimension index at which to
-      expand the shape of `input`. Must be in the range
-      `[-rank(input) - 1, rank(input)]`.
+    axis: 0-D (scalar). Specifies the dimension index at which to expand the
+      shape of `input`. Must be in the range `[-rank(input) - 1, rank(input)]`.
     name: The name of the output `Tensor` (optional).
 
   Returns:
@@ -204,10 +202,9 @@
 
 # Aliases for some automatically-generated names.
 # pylint: disable=protected-access
-@deprecation.deprecated(
-    "2016-11-30",
-    "This op will be removed after the deprecation date. "
-    "Please switch to tf.setdiff1d().")
+@deprecation.deprecated("2016-11-30",
+                        "This op will be removed after the deprecation date. "
+                        "Please switch to tf.setdiff1d().")
 def listdiff(x, y, out_idx=None, name=None):
   return gen_array_ops.list_diff(x, y, out_idx, name)
 
@@ -218,10 +215,9 @@
 
 
 # pylint: disable=undefined-variable
-@deprecation.deprecated(
-    "2018-11-30",
-    "This op will be removed after the deprecation date. "
-    "Please switch to tf.sets.difference().")
+@deprecation.deprecated("2018-11-30",
+                        "This op will be removed after the deprecation date. "
+                        "Please switch to tf.sets.difference().")
 @tf_export(v1=["setdiff1d"])
 def setdiff1d(x, y, index_dtype=dtypes.int32, name=None):
   return gen_array_ops.list_diff(x, y, index_dtype, name)
@@ -325,8 +321,8 @@
   Args:
     input: A `Tensor` or `SparseTensor`.
     name: A name for the operation (optional).
-    out_type: (Optional) The specified output type of the operation
-      (`int32` or `int64`). Defaults to `tf.int32`.
+    out_type: (Optional) The specified output type of the operation (`int32` or
+      `int64`). Defaults to `tf.int32`.
 
   Returns:
     A `Tensor` of type `out_type`.
@@ -342,16 +338,16 @@
     input: A `Tensor` or `SparseTensor`.
     name: A name for the operation (optional).
     optimize: if true, encode the shape as a constant when possible.
-    out_type: (Optional) The specified output type of the operation
-      (`int32` or `int64`). Defaults to tf.int32.
+    out_type: (Optional) The specified output type of the operation (`int32` or
+      `int64`). Defaults to `tf.int32`.
 
   Returns:
     A `Tensor` of type `out_type`.
 
   """
   with ops.name_scope(name, "Shape", [input]) as name:
-    if isinstance(input, (sparse_tensor.SparseTensor,
-                          sparse_tensor.SparseTensorValue)):
+    if isinstance(
+        input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
       return gen_math_ops.cast(input.dense_shape, out_type)
     else:
       if not context.executing_eagerly():
@@ -369,8 +365,8 @@
 
   Args:
     input: A list of at least 1 `Tensor` object with the same type.
-    out_type: The specified output type of the operation
-      (`int32` or `int64`). Defaults to `tf.int32`(optional).
+    out_type: The specified output type of the operation (`int32` or `int64`).
+      Defaults to `tf.int32` (optional).
     name: A name for the operation (optional).
 
   Returns:
@@ -407,8 +403,8 @@
   Args:
     input: A `Tensor` or `SparseTensor`.
     name: A name for the operation (optional).
-    out_type: (Optional) The specified non-quantized numeric output type
-      of the operation. Defaults to `tf.int32`.
+    out_type: (Optional) The specified non-quantized numeric output type of the
+      operation. Defaults to `tf.int32`.
 
   Returns:
     A `Tensor` of type `out_type`. Defaults to `tf.int32`.
@@ -428,8 +424,8 @@
     input: A `Tensor` or `SparseTensor`.
     name: A name for the operation (optional).
     optimize: if true, encode the size as a constant when possible.
-    out_type: (Optional) The specified non-quantized numeric output type
-      of the operation. Defaults to `tf.int32`.
+    out_type: (Optional) The specified non-quantized numeric output type of the
+      operation. Defaults to `tf.int32`.
 
   Returns:
     A `Tensor` of type `out_type`. Defaults to `tf.int32`.
@@ -441,8 +437,8 @@
     num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-access
     return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
-    if isinstance(input, (sparse_tensor.SparseTensor,
-                          sparse_tensor.SparseTensorValue)):
+    if isinstance(
+        input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
       return gen_math_ops.prod(
           gen_math_ops.cast(input.dense_shape, out_type), 0, name=name)
     else:
@@ -503,8 +499,8 @@
     A `Tensor` of type `int32`.
   """
   with ops.name_scope(name, "Rank", [input]) as name:
-    if isinstance(input, (sparse_tensor.SparseTensor,
-                          sparse_tensor.SparseTensorValue)):
+    if isinstance(
+        input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
       return gen_array_ops.size(input.dense_shape, name=name)
     else:
       input_tensor = ops.convert_to_tensor(input)
@@ -519,12 +515,8 @@
     "tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid "
     "indices")
 
-_SUPPORTED_SLICE_DTYPES = (
-    dtypes.int32,
-    dtypes.int32_ref,
-    dtypes.int64,
-    dtypes.int64_ref
-)
+_SUPPORTED_SLICE_DTYPES = (dtypes.int32, dtypes.int32_ref, dtypes.int64,
+                           dtypes.int64_ref)
 
 
 def _check_index(idx):
@@ -536,8 +528,7 @@
   # * any object with a dtype is supported
   # * any object with a dtype has a sizeable shape attribute.
   dtype = getattr(idx, "dtype", None)
-  if (dtype is None or
-      dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
+  if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
       idx.shape and len(idx.shape) == 1):
     # TODO(slebedev): IndexError seems more appropriate here, but it
     # will break `_slice_helper` contract.
@@ -592,9 +583,8 @@
   Args:
     tensor: An ops.Tensor object.
     slice_spec: The arguments to Tensor.__getitem__.
-    var: In the case of variable slice assignment, the Variable
-      object to slice (i.e. tensor is the read-only view of this
-      variable).
+    var: In the case of variable slice assignment, the Variable object to slice
+      (i.e. tensor is the read-only view of this variable).
 
   Returns:
     The appropriate slice of "tensor", based on "slice_spec".
@@ -858,6 +848,7 @@
   parent_name = name
 
   if not (var is None and isinstance(op, ops.EagerTensor)):
+
     def assign(val, name=None):
       """Closure that holds all the arguments to create an assignment."""
 
@@ -913,8 +904,8 @@
   ```python
   import tensorflow as tf
   A = tf.Variable([[1,2,3], [4,5,6], [7,8,9]], dtype=tf.float32)
-  with tf.Session() as sess:
-    sess.run(tf.global_variables_initializer())
+  with tf.compat.v1.Session() as sess:
+    sess.run(tf.compat.v1.global_variables_initializer())
     print(sess.run(A[:2, :2]))  # => [[1,2], [4,5]]
 
     op = A[:2,:2].assign(22. * tf.ones((2, 2)))
@@ -1049,8 +1040,8 @@
   if value_shape is not None:
     expanded_num_dims = len(value_shape) + 1
     if axis < -expanded_num_dims or axis >= expanded_num_dims:
-      raise ValueError("axis = %d not in [%d, %d)" % (axis, -expanded_num_dims,
-                                                      expanded_num_dims))
+      raise ValueError("axis = %d not in [%d, %d)" %
+                       (axis, -expanded_num_dims, expanded_num_dims))
 
   return gen_array_ops.pack(values, axis=axis, name=name)
 
@@ -1079,8 +1070,8 @@
       if ops.is_dense_tensor_like(elem):
         if dtype is not None and elem.dtype.base_dtype != dtype:
           raise TypeError("Cannot convert a list containing a tensor of dtype "
-                          "%s to %s (Tensor is: %r)" % (elem.dtype, dtype,
-                                                        elem))
+                          "%s to %s (Tensor is: %r)" %
+                          (elem.dtype, dtype, elem))
         converted_elems.append(elem)
         must_pack = True
       elif isinstance(elem, (list, tuple)):
@@ -1110,8 +1101,8 @@
   """Returns the dtype of any tensor-like object in `list_or_tuple`, if found.
 
   Args:
-    list_or_tuple: A list or tuple representing an object that can be
-      converted to a `tf.Tensor`.
+    list_or_tuple: A list or tuple representing an object that can be converted
+      to a `tf.Tensor`.
 
   Returns:
     The dtype of any tensor-like object in `list_or_tuple`, or `None` if no
@@ -1128,11 +1119,13 @@
 
 
 def _cast_nested_seqs_to_dtype(dtype):
+
   def _maybe_cast(elem):
     if ops.is_dense_tensor_like(elem):
       if dtype != elem.dtype.base_dtype:
         elem = gen_math_ops.cast(elem, dtype)
     return elem
+
   return _maybe_cast
 
 
@@ -1182,10 +1175,10 @@
 
   Args:
     value: A rank `R > 0` `Tensor` to be unstacked.
-    num: An `int`. The length of the dimension `axis`. Automatically inferred
-      if `None` (the default).
-    axis: An `int`. The axis to unstack along. Defaults to the first
-      dimension. Negative values wrap around, so the valid range is `[-R, R)`.
+    num: An `int`. The length of the dimension `axis`. Automatically inferred if
+      `None` (the default).
+    axis: An `int`. The axis to unstack along. Defaults to the first dimension.
+      Negative values wrap around, so the valid range is `[-R, R)`.
     name: A name for the operation (optional).
 
   Returns:
@@ -1280,10 +1273,10 @@
   Args:
     values: A list of `Tensor` objects or a single `Tensor`.
     axis: 0-D `int32` `Tensor`.  Dimension along which to concatenate. Must be
-      in the range `[-rank(values), rank(values))`. As in Python, indexing
-      for axis is 0-based. Positive axis in the rage of
-      `[0, rank(values))` refers to `axis`-th dimension. And negative axis
-      refers to `axis + rank(values)`-th dimension.
+      in the range `[-rank(values), rank(values))`. As in Python, indexing for
+      axis is 0-based. Positive axis in the range of `[0, rank(values))` refers
+      to `axis`-th dimension. And negative axis refers to
+      `axis + rank(values)`-th dimension.
     name: A name for the operation (optional).
 
   Returns:
@@ -1308,7 +1301,9 @@
 
 @tf_export(v1=["boolean_mask"])
 def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
-  """Apply boolean mask to tensor.  Numpy equivalent is `tensor[mask]`.
+  """Apply boolean mask to tensor.
+
+  Numpy equivalent is `tensor[mask]`.
 
   ```python
   # 1-D example
@@ -1329,9 +1324,9 @@
     tensor:  N-D tensor.
     mask:  K-D boolean tensor, K <= N and K must be known statically.
     name:  A name for this operation (optional).
-    axis:  A 0-D int Tensor representing the axis in `tensor` to mask from.
-      By default, axis is 0 which will mask from the first dimension. Otherwise
-      K + axis <= N.
+    axis:  A 0-D int Tensor representing the axis in `tensor` to mask from. By
+      default, axis is 0 which will mask from the first dimension. Otherwise K +
+      axis <= N.
 
   Returns:
     (N-K+1)-dimensional tensor populated by entries in `tensor` corresponding
@@ -1372,15 +1367,16 @@
     shape_tensor[axis:axis + ndims_mask].assert_is_compatible_with(shape_mask)
 
     leading_size = gen_math_ops.prod(shape(tensor)[axis:axis + ndims_mask], [0])
-    tensor = reshape(tensor,
-                     concat([
-                         shape(tensor)[:axis], [leading_size],
-                         shape(tensor)[axis + ndims_mask:]
-                     ], 0))
+    tensor = reshape(
+        tensor,
+        concat([
+            shape(tensor)[:axis], [leading_size],
+            shape(tensor)[axis + ndims_mask:]
+        ], 0))
     first_dim = shape_tensor[axis:axis + ndims_mask].num_elements()
     tensor.set_shape(
-        tensor_shape.as_shape(shape_tensor[:axis]).concatenate([first_dim])
-        .concatenate(shape_tensor[axis + ndims_mask:]))
+        tensor_shape.as_shape(shape_tensor[:axis]).concatenate(
+            [first_dim]).concatenate(shape_tensor[axis + ndims_mask:]))
 
     mask = reshape(mask, [-1])
     return _apply_mask_1d(tensor, mask, axis)
@@ -1532,13 +1528,13 @@
 
   Args:
     value: The `Tensor` to split.
-    num_or_size_splits: Either an integer indicating the number of
-      splits along split_dim or a 1-D integer `Tensor` or Python list containing
-      the sizes of each output tensor along split_dim. If a scalar then it must
-      evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
-      split dimension must match that of the `value`.
+    num_or_size_splits: Either an integer indicating the number of splits along
+      split_dim or a 1-D integer `Tensor` or Python list containing the sizes of
+      each output tensor along split_dim. If a scalar then it must evenly divide
+      `value.shape[axis]`; otherwise the sum of sizes along the split dimension
+      must match that of the `value`.
     axis: An integer or scalar `int32` `Tensor`. The dimension along which to
-    split. Must be in the range `[-rank(value), rank(value))`. Defaults to 0.
+      split. Must be in the range `[-rank(value), rank(value))`. Defaults to 0.
     num: Optional, used to specify the number of outputs when it cannot be
       inferred from the shape of `size_splits`.
     name: A name for the operation (optional).
@@ -1576,7 +1572,9 @@
 
 @tf_export("transpose", v1=[])
 def transpose_v2(a, perm=None, conjugate=False, name="transpose"):
-  """Transposes `a`. Permutes the dimensions according to `perm`.
+  """Transposes `a`.
+
+  Permutes the dimensions according to `perm`.
 
   The returned tensor's dimension i will correspond to the input dimension
   `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
@@ -1633,7 +1631,7 @@
     a: A `Tensor`.
     perm: A permutation of the dimensions of `a`.
     conjugate: Optional bool. Setting it to `True` is mathematically equivalent
-      to tf.conj(tf.transpose(input)).
+      to tf.math.conj(tf.transpose(input)).
     name: A name for the operation (optional).
 
   Returns:
@@ -1644,7 +1642,9 @@
 
 @tf_export(v1=["transpose"])
 def transpose(a, perm=None, name="transpose", conjugate=False):
-  """Transposes `a`. Permutes the dimensions according to `perm`.
+  """Transposes `a`.
+
+  Permutes the dimensions according to `perm`.
 
   The returned tensor's dimension i will correspond to the input dimension
   `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
@@ -1702,15 +1702,15 @@
     perm: A permutation of the dimensions of `a`.
     name: A name for the operation (optional).
     conjugate: Optional bool. Setting it to `True` is mathematically equivalent
-      to tf.conj(tf.transpose(input)).
+      to tf.math.conj(tf.transpose(input)).
 
   Returns:
     A transposed `Tensor`.
   """
   with ops.name_scope(name, "transpose", [a]) as name:
     transpose_fn = (
-        gen_array_ops.conjugate_transpose
-        if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
+        gen_array_ops.conjugate_transpose if
+        (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
       a = ops.convert_to_tensor(a, name="a")
       if not a.get_shape().ndims:
@@ -1781,7 +1781,7 @@
     a: A `Tensor` with `rank >= 2`.
     name: A name for the operation (optional).
     conjugate: Optional bool. Setting it to `True` is mathematically equivalent
-      to tf.conj(tf.linalg.matrix_transpose(input)).
+      to tf.math.conj(tf.linalg.matrix_transpose(input)).
 
   Returns:
     A transposed batch matrix `Tensor`.
@@ -1806,8 +1806,8 @@
       perm = list(range(ndims - 2)) + [ndims - 1] + [ndims - 2]
     else:
       a_rank = rank(a)
-      perm = concat((gen_math_ops._range(0, a_rank - 2, 1),
-                     [a_rank - 1, a_rank - 2]), 0)
+      perm = concat(
+          (gen_math_ops._range(0, a_rank - 2, 1), [a_rank - 1, a_rank - 2]), 0)
 
     return transpose(a, perm=perm, conjugate=conjugate)
 
@@ -1899,8 +1899,8 @@
       `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
       `complex64`, `complex128`, `bool` or `string`.
     name: A name for the operation (optional).
-    optimize: if true, attempt to statically determine the shape of 'tensor'
-    and encode it as a constant.
+    optimize: if true, attempt to statically determine the shape of 'tensor' and
+      encode it as a constant.
 
   Returns:
     A `Tensor` with all elements set to zero.
@@ -1987,11 +1987,11 @@
   Args:
     tensor: A `Tensor`.
     dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
-      `complex64`, `complex128` or `bool`.
+      `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`, `complex64`,
+      `complex128` or `bool`.
     name: A name for the operation (optional).
-    optimize: if true, attempt to statically determine the shape of 'tensor'
-    and encode it as a constant.
+    optimize: if true, attempt to statically determine the shape of 'tensor' and
+      encode it as a constant.
 
   Returns:
     A `Tensor` with all elements set to 1.
@@ -2101,10 +2101,10 @@
   For example:
 
   ```python
-  x = tf.placeholder(tf.float32, shape=(1024, 1024))
+  x = tf.compat.v1.placeholder(tf.float32, shape=(1024, 1024))
   y = tf.matmul(x, x)
 
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
     print(sess.run(y))  # ERROR: will fail because x was not fed.
 
     rand_array = np.random.rand(1024, 1024)
@@ -2141,8 +2141,8 @@
 
   Args:
     input: A `Tensor`. The default value to produce when output is not fed.
-    shape: A `tf.TensorShape` or list of `int`s. The (possibly partial) shape
-      of the tensor.
+    shape: A `tf.TensorShape` or list of `int`s. The (possibly partial) shape of
+      the tensor.
     name: A name for the operation (optional).
 
   Returns:
@@ -2174,17 +2174,18 @@
   For example:
 
   ```python
-  x = tf.sparse.placeholder(tf.float32)
+  x = tf.compat.v1.sparse.placeholder(tf.float32)
   y = tf.sparse.reduce_sum(x)
 
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
     print(sess.run(y))  # ERROR: will fail because x was not fed.
 
     indices = np.array([[3, 2, 0], [4, 5, 1]], dtype=np.int64)
     values = np.array([1.0, 2.0], dtype=np.float32)
     shape = np.array([7, 9, 2], dtype=np.int64)
     print(sess.run(y, feed_dict={
-      x: tf.SparseTensorValue(indices, values, shape)}))  # Will succeed.
+      x: tf.compat.v1.SparseTensorValue(indices, values, shape)
+    }))  # Will succeed.
     print(sess.run(y, feed_dict={
       x: (indices, values, shape)}))  # Will succeed.
 
@@ -2222,7 +2223,8 @@
           shape=[None],
           name=(name + "/values") if name is not None else None),
       indices=placeholder(
-          dtypes.int64, shape=[None, rank],
+          dtypes.int64,
+          shape=[None, rank],
           name=(name + "/indices") if name is not None else None),
       dense_shape=shape)
 
@@ -2369,8 +2371,8 @@
     paddings_constant = tensor_util.constant_value(
         result.op.inputs[1], partial=True)
     input_shape = result.op.inputs[0].shape
-    if (input_shape.ndims is not None and not result.shape.is_fully_defined()
-        and paddings_constant is not None):
+    if (input_shape.ndims is not None and
+        not result.shape.is_fully_defined() and paddings_constant is not None):
       new_shape = []
       for padding, dim in zip(paddings_constant, input_shape.as_list()):
         if padding is None or dim is None or any((x is None for x in padding)):
@@ -2582,11 +2584,12 @@
   Raises:
     TypeError: If either `hypothesis` or `truth` are not a `SparseTensor`.
   """
-  if not isinstance(hypothesis, (sparse_tensor.SparseTensor,
-                                 sparse_tensor.SparseTensorValue)):
+  if not isinstance(
+      hypothesis,
+      (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
     raise TypeError("Hypothesis must be a SparseTensor.")
-  if not isinstance(truth, (sparse_tensor.SparseTensor,
-                            sparse_tensor.SparseTensorValue)):
+  if not isinstance(
+      truth, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
     raise TypeError("Truth must be a SparseTensor.")
 
   return gen_array_ops.edit_distance(
@@ -2710,8 +2713,8 @@
     result_paddings = stack(
         [[pad_start[i], pad_end[i]] for i in range(num_block_dims)],
         name="paddings")
-    result_crops = stack(
-        [[0, pad_end_extra[i]] for i in range(num_block_dims)], name="crops")
+    result_crops = stack([[0, pad_end_extra[i]] for i in range(num_block_dims)],
+                         name="crops")
     return result_paddings, result_crops
 
 
@@ -2719,8 +2722,9 @@
 @deprecation.deprecated_endpoints("space_to_batch")
 def space_to_batch(  # pylint: disable=missing-docstring
     input, paddings, block_size=None, name=None, block_shape=None):  # pylint: disable=redefined-builtin
-  block_size = deprecation.deprecated_argument_lookup(
-      "block_shape", block_shape, "block_size", block_size)
+  block_size = deprecation.deprecated_argument_lookup("block_shape",
+                                                      block_shape, "block_size",
+                                                      block_size)
   result = space_to_batch_nd(
       input,
       paddings=paddings,
@@ -2777,8 +2781,9 @@
 
 @tf_export(v1=["batch_to_space"])
 def batch_to_space(input, crops, block_size, name=None, block_shape=None):  # pylint: disable=redefined-builtin,missing-docstring
-  block_size = deprecation.deprecated_argument_lookup(
-      "block_shape", block_shape, "block_size", block_size)
+  block_size = deprecation.deprecated_argument_lookup("block_shape",
+                                                      block_shape, "block_size",
+                                                      block_size)
   result = batch_to_space_nd(
       input,
       crops=crops,
@@ -2803,125 +2808,59 @@
   is the reverse of SpaceToBatch.  See below for a precise description.
 
   Args:
-    input: A `Tensor`.
-      N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-      where spatial_shape has M dimensions.
-    block_shape: A `Tensor`. Must be one of the following types:
-      `int32`, `int64`. 1-D with shape `[M]`, all values must be >= 1.
-      For backwards compatibility with TF 1.0, this parameter may be an int, in
-      which case it is converted to
-      `numpy.array([block_shape, block_shape], dtype=numpy.int64)`.
-    crops: A `Tensor`. Must be one of the following types: `int32`, `int64`.
-      2-D with shape `[M, 2]`, all values must be >= 0.
-        `crops[i] = [crop_start, crop_end]` specifies the amount to crop from
-        input dimension `i + 1`, which corresponds to spatial dimension `i`.  It
-        is required that
-        `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-
+    input: A `Tensor`. N-D with shape `input_shape = [batch] + spatial_shape +
+      remaining_shape`, where spatial_shape has M dimensions.
+    block_shape: A `Tensor`. Must be one of the following types: `int32`,
+      `int64`. 1-D with shape `[M]`, all values must be >= 1. For backwards
+      compatibility with TF 1.0, this parameter may be an int, in which case it
+      is converted to `numpy.array([block_shape, block_shape],
+      dtype=numpy.int64)`.
+    crops: A `Tensor`. Must be one of the following types: `int32`, `int64`. 2-D
+      with shape `[M, 2]`, all values must be >= 0. `crops[i] = [crop_start,
+      crop_end]` specifies the amount to crop from input dimension `i + 1`,
+      which corresponds to spatial dimension `i`.  It is required that
+      `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
       This operation is equivalent to the following steps:
-
-      1. Reshape `input` to `reshaped` of shape:
-           [block_shape[0], ..., block_shape[M-1],
-            batch / prod(block_shape),
-            input_shape[1], ..., input_shape[N-1]]
-
-      2. Permute dimensions of `reshaped` to produce `permuted` of shape
-           [batch / prod(block_shape),
-
-            input_shape[1], block_shape[0],
-            ...,
-            input_shape[M], block_shape[M-1],
-
-            input_shape[M+1], ..., input_shape[N-1]]
-
-      3. Reshape `permuted` to produce `reshaped_permuted` of shape
-           [batch / prod(block_shape),
-
-            input_shape[1] * block_shape[0],
-            ...,
-            input_shape[M] * block_shape[M-1],
-
-            input_shape[M+1],
-            ...,
-            input_shape[N-1]]
-
-      4. Crop the start and end of dimensions `[1, ..., M]` of
-         `reshaped_permuted` according to `crops` to produce the
-         output of shape:
-           [batch / prod(block_shape),
-
-            input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-            ...,
-            input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-
-            input_shape[M+1], ..., input_shape[N-1]]
-
-      Some examples:
-
-      (1) For the following input of shape `[4, 1, 1, 1]`,
-          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
-
-      ```
-      [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-      ```
-
-      The output tensor has shape `[1, 2, 2, 1]` and value:
-
-      ```
-      x = [[[[1], [2]], [[3], [4]]]]
-      ```
-
-      (2) For the following input of shape `[4, 1, 1, 3]`,
-          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
-
-      ```
-      [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-      ```
-
-      The output tensor has shape `[1, 2, 2, 3]` and value:
-
-      ```
-      x = [[[[1, 2, 3], [4, 5, 6]],
-            [[7, 8, 9], [10, 11, 12]]]]
-      ```
-
-      (3) For the following input of shape `[4, 2, 2, 1]`,
-          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
-
-      ```
-      x = [[[[1], [3]], [[9], [11]]],
-           [[[2], [4]], [[10], [12]]],
-           [[[5], [7]], [[13], [15]]],
-           [[[6], [8]], [[14], [16]]]]
-      ```
-
-      The output tensor has shape `[1, 4, 4, 1]` and value:
-
-      ```
-      x = [[[1],   [2],  [3],  [4]],
-           [[5],   [6],  [7],  [8]],
-           [[9],  [10], [11],  [12]],
-           [[13], [14], [15],  [16]]]
-      ```
-
-      (4) For the following input of shape `[8, 1, 3, 1]`,
-          `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`:
-
-      ```
-      x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-           [[[0], [2], [4]]], [[[0], [10], [12]]],
-           [[[0], [5], [7]]], [[[0], [13], [15]]],
-           [[[0], [6], [8]]], [[[0], [14], [16]]]]
-      ```
-
-      The output tensor has shape `[2, 2, 4, 1]` and value:
-
-      ```
-      x = [[[[1],   [2],  [3],  [4]],
-            [[5],   [6],  [7],  [8]]],
-           [[[9],  [10], [11],  [12]],
-            [[13], [14], [15],  [16]]]]
-      ```
+      1. Reshape `input` to `reshaped` of shape: [block_shape[0], ...,
+        block_shape[M-1], batch / prod(block_shape), input_shape[1], ...,
+        input_shape[N-1]]  2. Permute dimensions of `reshaped` to produce
+        `permuted` of shape [batch / prod(block_shape),  input_shape[1],
+        block_shape[0], ..., input_shape[M], block_shape[M-1],
+        input_shape[M+1], ..., input_shape[N-1]]  3. Reshape `permuted` to
+        produce `reshaped_permuted` of shape [batch / prod(block_shape),
+        input_shape[1] * block_shape[0], ..., input_shape[M] * block_shape[M-1],
+        input_shape[M+1], ..., input_shape[N-1]]  4. Crop the start and end of
+        dimensions `[1, ..., M]` of `reshaped_permuted` according to `crops` to
+        produce the
+         output of shape: [batch / prod(block_shape),  input_shape[1] *
+           block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] *
+           block_shape[M-1] - crops[M-1,0] - crops[M-1,1],  input_shape[M+1],
+           ..., input_shape[N-1]]
+      Some examples:  (1) For the following input of shape `[4, 1, 1, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:  ``` [[[[1]]],
+            [[[2]]], [[[3]]], [[[4]]]] ```
+      The output tensor has shape `[1, 2, 2, 1]` and value:  ``` x = [[[[1],
+        [2]], [[3], [4]]]] ```  (2) For the following input of shape `[4, 1, 1,
+        3]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:  ``` [[[1, 2,
+            3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]] ```
+      The output tensor has shape `[1, 2, 2, 3]` and value:  ``` x = [[[[1, 2,
+        3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]] ```  (3) For the following
+        input of shape `[4, 2, 2, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:  ``` x =
+            [[[[1], [3]], [[9], [11]]], [[[2], [4]], [[10], [12]]], [[[5], [7]],
+            [[13], [15]]], [[[6], [8]], [[14], [16]]]] ```
+      The output tensor has shape `[1, 4, 4, 1]` and value:  ``` x = [[[1],
+        [2],  [3],  [4]], [[5],   [6],  [7],  [8]], [[9],  [10], [11],  [12]],
+        [[13], [14], [15],  [16]]] ```  (4) For the following input of shape
+        `[8, 1, 3, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`:  ``` x =
+            [[[[0], [1], [3]]], [[[0], [9], [11]]], [[[0], [2], [4]]], [[[0],
+            [10], [12]]], [[[0], [5], [7]]], [[[0], [13], [15]]], [[[0], [6],
+            [8]]], [[[0], [14], [16]]]] ```
+      The output tensor has shape `[2, 2, 4, 1]` and value:  ``` x = [[[[1],
+        [2],  [3],  [4]], [[5],   [6],  [7],  [8]]], [[[9],  [10], [11],  [12]],
+        [[13], [14], [15],  [16]]]] ```
     name: A name for the operation (optional).
 
   Returns:
@@ -2930,10 +2869,8 @@
   if isinstance(block_shape, int):
     block_shape = np.array([block_shape, block_shape], dtype=np.int64)
 
-  return batch_to_space_nd(input=input,
-                           block_shape=block_shape,
-                           crops=crops,
-                           name=name)
+  return batch_to_space_nd(
+      input=input, block_shape=block_shape, crops=crops, name=name)
 
 
 @tf_export("one_hot")
@@ -3037,16 +2974,17 @@
     TypeError: If dtype of either `on_value` or `off_value` don't match `dtype`
     TypeError: If dtype of `on_value` and `off_value` don't match one another
   """
-  with ops.name_scope(name, "one_hot",
-                      [indices, depth, on_value, off_value, axis,
-                       dtype]) as name:
+  with ops.name_scope(
+      name, "one_hot",
+      [indices, depth, on_value, off_value, axis, dtype]) as name:
     on_exists = on_value is not None
     off_exists = off_value is not None
 
-    on_dtype = (ops.convert_to_tensor(on_value).dtype.base_dtype if on_exists
-                else None)
-    off_dtype = (ops.convert_to_tensor(off_value).dtype.base_dtype if off_exists
-                 else None)
+    on_dtype = (
+        ops.convert_to_tensor(on_value).dtype.base_dtype if on_exists else None)
+    off_dtype = (
+        ops.convert_to_tensor(off_value).dtype.base_dtype
+        if off_exists else None)
 
     if on_exists or off_exists:
       if dtype is not None:
@@ -3126,6 +3064,7 @@
       Default is the maximum value in `lengths`.
     dtype: output type of the resulting tensor.
     name: name of the op.
+
   Returns:
     A mask tensor of shape `lengths.shape + (maxlen,)`, cast to specified dtype.
   Raises:
@@ -3161,6 +3100,7 @@
 
 
 @tf_export(v1=["squeeze"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
@@ -3186,12 +3126,18 @@
   tf.shape(tf.squeeze(t, [2, 4]))  # [1, 2, 3, 1]
   ```
 
+  Note: Squeezing a ragged tensor takes O(number of elements) time.
+
+  Note: if `input` is a `tf.RaggedTensor`, then this operation takes `O(N)`
+  time, where `N` is the number of elements in the squeezed dimensions.
+
   Args:
     input: A `Tensor`. The `input` to squeeze.
-    axis: An optional list of `ints`. Defaults to `[]`.
-      If specified, only squeezes the dimensions listed. The dimension
-      index starts at 0. It is an error to squeeze a dimension that is not 1.
-      Must be in the range `[-rank(input), rank(input))`.
+    axis: An optional list of `ints`. Defaults to `[]`. If specified, only
+      squeezes the dimensions listed. The dimension index starts at 0. It is an
+      error to squeeze a dimension that is not 1. Must be in the range
+      `[-rank(input), rank(input))`.
+      Must be specified if `input` is a `RaggedTensor`.
     name: A name for the operation (optional).
     squeeze_dims: Deprecated keyword argument that is now axis.
 
@@ -3203,14 +3149,15 @@
   Raises:
     ValueError: When both `squeeze_dims` and `axis` are specified.
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "squeeze_dims", squeeze_dims)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "squeeze_dims",
+                                                squeeze_dims)
   if np.isscalar(axis):
     axis = [axis]
   return gen_array_ops.squeeze(input, axis, name)
 
 
 @tf_export("squeeze", v1=[])
+@dispatch.add_dispatch_support
 def squeeze_v2(input, axis=None, name=None):
   # pylint: disable=redefined-builtin
   return squeeze(input, axis, name)
@@ -3272,10 +3219,12 @@
 
 # pylint: disable=redefined-builtin
 @tf_export(v1=["reverse_sequence"])
-@deprecation.deprecated_args(
-    None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
-@deprecation.deprecated_args(
-    None, "batch_dim is deprecated, use batch_axis instead", "batch_dim")
+@deprecation.deprecated_args(None,
+                             "seq_dim is deprecated, use seq_axis instead",
+                             "seq_dim")
+@deprecation.deprecated_args(None,
+                             "batch_dim is deprecated, use batch_axis instead",
+                             "batch_dim")
 def reverse_sequence(input,
                      seq_lengths,
                      seq_axis=None,
@@ -3302,8 +3251,11 @@
 
 
 @tf_export("reverse_sequence", v1=[])
-def reverse_sequence_v2(
-    input, seq_lengths, seq_axis=None, batch_axis=None, name=None):
+def reverse_sequence_v2(input,
+                        seq_lengths,
+                        seq_axis=None,
+                        batch_axis=None,
+                        name=None):
   return gen_array_ops.reverse_sequence(
       input=input,
       seq_lengths=seq_lengths,
@@ -3414,10 +3366,19 @@
 
 @tf_export("gather", v1=[])
 @dispatch.add_dispatch_support
-def gather_v2(params, indices, validate_indices=None, axis=None,
-              batch_dims=0, name=None):
-  return gather(params, indices, validate_indices=validate_indices, name=name,
-                axis=axis, batch_dims=batch_dims)
+def gather_v2(params,
+              indices,
+              validate_indices=None,
+              axis=None,
+              batch_dims=0,
+              name=None):
+  return gather(
+      params,
+      indices,
+      validate_indices=validate_indices,
+      name=name,
+      axis=axis,
+      batch_dims=batch_dims)
 
 
 gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
@@ -3725,8 +3686,7 @@
     else:
       return gen_array_ops.gather_nd(params, indices, name=name)
   else:
-    return batch_gather_nd(
-        params, indices, batch_dims=batch_dims, name=name)
+    return batch_gather_nd(params, indices, batch_dims=batch_dims, name=name)
 
 
 @tf_export("gather_nd", v1=[])
@@ -3782,7 +3742,8 @@
     batch_dim_list = unstack(batch_shape, axis=0)
     dim_ranges = [
         gen_math_ops.cast(gen_math_ops._range(0, x, 1), indices.dtype)
-        for x in batch_dim_list]
+        for x in batch_dim_list
+    ]
     mesh_list = meshgrid(*dim_ranges, indexing="ij") if dim_ranges else []
     # Then we flatten and stack the tensors to form a (B1.B2) by 2 matrix.
     flat_list = [reshape(x, shape=(-1,)) for x in mesh_list]
@@ -3791,10 +3752,13 @@
     # concat -> index_grid [B1.B2, 2] with indices [i1, ..., iK, C]
     # So we reshape them both to [(B1.B2), i1, ..., iK, *]
     index_grid_shape = shape(index_grid)
-    index_grid = reshape(index_grid,
-                         concat([index_grid_shape[:1],
-                                 ones(index_internal_ndims, dtype=dtypes.int32),
-                                 index_grid_shape[1:]], axis=0))
+    index_grid = reshape(
+        index_grid,
+        concat([
+            index_grid_shape[:1],
+            ones(index_internal_ndims, dtype=dtypes.int32), index_grid_shape[1:]
+        ],
+               axis=0))
     tile_shape = concat(((1,), indices_internal_shape, (1,)), axis=0)
     index_grid = tile(index_grid, multiples=tile_shape)
     # index_grid now has shape [(B1.B2), i1, ..., iK, 2]
@@ -3820,27 +3784,30 @@
     "2017-10-25",
     "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` "
     "instead.")  # pylint: disable=missing-docstring
-def quantize_v2(input,  # pylint: disable=redefined-builtin
-                min_range,
-                max_range,
-                T,
-                mode="MIN_COMBINED",
-                name=None,
-                round_mode="HALF_AWAY_FROM_ZERO"):
-  return gen_array_ops.quantize_v2(input,
-                                   min_range,
-                                   max_range,
-                                   T=T,
-                                   mode=mode,
-                                   name=name,
-                                   round_mode=round_mode)
+def quantize_v2(
+    input,  # pylint: disable=redefined-builtin
+    min_range,
+    max_range,
+    T,
+    mode="MIN_COMBINED",
+    name=None,
+    round_mode="HALF_AWAY_FROM_ZERO"):
+  return gen_array_ops.quantize_v2(
+      input,
+      min_range,
+      max_range,
+      T=T,
+      mode=mode,
+      name=name,
+      round_mode=round_mode)
 
 
 quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead."""
 
 
-# We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
-# tf.quantize_v2 in next version of TensorFlow.
+# We want to expose tf.quantization.quantize instead of
+# tf.quantize_v2; we can deprecate tf.quantize_v2 in the next
+# version of TensorFlow.
 @tf_export("quantization.quantize", v1=["quantization.quantize", "quantize"])
 @deprecation.deprecated_endpoints("quantize")
 def quantize(input,  # pylint: disable=redefined-builtin
@@ -3925,13 +3892,7 @@
 
 
 @tf_export("image.extract_patches")
-def extract_image_patches_v2(
-    images,
-    sizes,
-    strides,
-    rates,
-    padding,
-    name=None):
+def extract_image_patches_v2(images, sizes, strides, rates, padding, name=None):
   # pylint: disable=line-too-long
   r"""Extract `patches` from `images` and put them in the \"depth\" output dimension.
 
@@ -3960,13 +3921,13 @@
     the output patches.
   """
   # pylint: enable=line-too-long
-  return gen_array_ops.extract_image_patches(
-      images, sizes, strides, rates, padding, name)
+  return gen_array_ops.extract_image_patches(images, sizes, strides, rates,
+                                             padding, name)
 
 
 @tf_export(v1=["image.extract_image_patches", "extract_image_patches"])
-@deprecation.deprecated_args(
-    None, "ksizes is deprecated, use sizes instead", "ksizes")
+@deprecation.deprecated_args(None, "ksizes is deprecated, use sizes instead",
+                             "ksizes")
 def extract_image_patches(  # pylint: disable=missing-docstring
     images,
     ksizes=None,
@@ -3975,10 +3936,59 @@
     padding=None,
     name=None,
     sizes=None):
-  ksizes = deprecation.deprecated_argument_lookup(
-      "sizes", sizes, "ksizes", ksizes)
-  return gen_array_ops.extract_image_patches(
-      images, ksizes, strides, rates, padding, name)
+  ksizes = deprecation.deprecated_argument_lookup("sizes", sizes, "ksizes",
+                                                  ksizes)
+  return gen_array_ops.extract_image_patches(images, ksizes, strides, rates,
+                                             padding, name)
 
 
 extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__
+
+
+@tf_export("fingerprint")
+def fingerprint(data, method="farmhash64", name=None):
+  r"""Generates fingerprint values.
+
+  Generates fingerprint values of `data`.
+
+  Fingerprint op considers the first dimension of `data` as the batch dimension,
+  and `output[i]` contains the fingerprint value generated from contents in
+  `data[i, ...]` for all `i`.
+
+  Fingerprint op writes fingerprint values as byte arrays. For example, the
+  default method `farmhash64` generates a 64-bit fingerprint value at a time.
+  This 8-byte value is written out as a `tf.uint8` array of size 8, in
+  little-endian order.
+
+  For example, suppose that `data` has data type `tf.int32` and shape (2, 3, 4),
+  and that the fingerprint method is `farmhash64`. In this case, the output
+  shape is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the
+  size of each fingerprint value in bytes. `output[0, :]` is generated from
+  12 integers in `data[0, :, :]` and similarly `output[1, :]` is generated from
+  the other 12 integers in `data[1, :, :]`.
+
+  Note that this op fingerprints the raw underlying buffer, and it does not
+  fingerprint Tensor's metadata such as data type and/or shape. For example, the
+  fingerprint values are invariant under reshapes and bitcasts as long as the
+  batch dimension remains the same:
+
+  ```python
+  tf.fingerprint(data) == tf.fingerprint(tf.reshape(data, ...))
+  tf.fingerprint(data) == tf.fingerprint(tf.bitcast(data, ...))
+  ```
+
+  For string data, one should expect `tf.fingerprint(data) !=
+  tf.fingerprint(tf.strings.reduce_join(data))` in general.
+
+  Args:
+    data: A `Tensor`. Must have rank 1 or higher.
+    method: A `Tensor` of type `tf.string`. Fingerprint method used by this op.
+      Currently available method is `farmhash64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals
+    `data`'s first dimension, and the second dimension size depends on the
+    fingerprint algorithm.
+  """
+  return gen_array_ops.fingerprint(data, method, name)
diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index d83b819..f57a820 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -48,7 +48,7 @@
 
 
 # Note that the naive implementation is much slower:
-# batch_norm = (tensor - mean) * tf.rsqrt(variance + 0.001)
+# batch_norm = (tensor - mean) * tf.math.rsqrt(variance + 0.001)
 # if scale:
 #   batch_norm *= gamma
 # return batch_norm + beta
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 8c9ca64..488945d 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -159,7 +159,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_negative(x)]):
+  with tf.control_dependencies([tf.compat.v1.assert_negative(x)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -233,7 +233,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_positive(x)]):
+  with tf.control_dependencies([tf.compat.v1.assert_positive(x)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -308,7 +308,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_non_negative(x)]):
+  with tf.control_dependencies([tf.compat.v1.assert_non_negative(x)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -385,7 +385,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_non_positive(x)]):
+  with tf.control_dependencies([tf.compat.v1.assert_non_positive(x)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -461,7 +461,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_equal(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_equal(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -603,7 +603,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_none_equal(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_none_equal(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -709,7 +709,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_near(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_near(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -820,7 +820,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_less(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_less(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -905,7 +905,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_less_equal(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_less_equal(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -989,7 +989,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_greater(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_greater(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1076,7 +1076,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_greater_equal(x, y)]):
+  with tf.control_dependencies([tf.compat.v1.assert_greater_equal(x, y)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1206,7 +1206,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_rank(x, 2)]):
+  with tf.control_dependencies([tf.compat.v1.assert_rank(x, 2)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1303,7 +1303,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_rank_at_least(x, 2)]):
+  with tf.control_dependencies([tf.compat.v1.assert_rank_at_least(x, 2)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1467,7 +1467,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_rank_in(x, (2, 4))]):
+  with tf.control_dependencies([tf.compat.v1.assert_rank_in(x, (2, 4))]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1549,7 +1549,7 @@
   Example of adding a dependency to an operation:
 
   ```python
-  with tf.control_dependencies([tf.assert_integer(x)]):
+  with tf.control_dependencies([tf.compat.v1.assert_integer(x)]):
     output = tf.reduce_sum(x)
   ```
 
@@ -1886,7 +1886,7 @@
 
   For example:
   ```python
-  x = tf.placeholder(tf.int32)
+  x = tf.compat.v1.placeholder(tf.int32)
   print(x.shape)
   ==> TensorShape(None)
   y = x * 2
@@ -1897,7 +1897,7 @@
   print(y.shape)
   ==> TensorShape([Dimension(None), Dimension(3), Dimension(3)])
 
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
     # Raises tf.errors.InvalidArgumentError, because the shape (3,) is not
     # compatible with the shape (None, 3, 3)
     sess.run(y, feed_dict={x: [1, 2, 3]})
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 682bb6f..5d66139 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -115,7 +115,7 @@
     # NOTE(skyewm): if there are any active sessions, this modification to `op`
     # may make them unrunnable!
 
-    if control_flow_util.InXlaContext(ops.get_default_graph()):
+    if control_flow_util.GraphOrParentsInXlaContext(ops.get_default_graph()):
       # XLA does not yet support optionals, so output intermediates directly and
       # make them match via FakeParams, which can be converted to zeros in XLA.
       # TODO(skyewm,jpienaar): can XLA support optionals?
@@ -185,6 +185,7 @@
     A list of Tensors which are the outputs of the If op. Does not include added
     intermediate outputs.
   """
+  _make_indexed_slices_indices_types_match(true_graph, false_graph)
   _check_same_outputs(true_graph, false_graph)
 
   # Add inputs to true_graph and false_graph to make them match. Note that
@@ -479,7 +480,7 @@
 
 
 def _make_output_composite_tensors_match(true_graph, false_graph):
-  """Rewrites {true,false}_graph's outputs to use the same _TensorLike classes.
+  """Modifies true_graph and false_graph so they have the same output signature.
 
   Currently the only transformation implemented is turning a Tensor into an
   equivalent IndexedSlices if the other branch returns an IndexedSlices.
@@ -522,6 +523,63 @@
   false_graph.outputs = func_graph_module.flatten(false_outputs)
 
 
+def _make_indexed_slices_indices_types_match(true_graph, false_graph):
+  """Match dtype of IndexedSlices.indices in outputs of {true|false}_graphs."""
+  indexed_slice_indices = []
+  current_index = 0
+  true_outputs_flat_with_composites = nest.flatten(
+      true_graph.structured_outputs, expand_composites=False)
+  false_outputs_flat_with_composites = nest.flatten(
+      false_graph.structured_outputs, expand_composites=False)
+  # Store indices of IndexedSlices.indices in `indexed_slice_indices`.
+  for idx, (true_out, false_out) in enumerate(
+      zip(true_outputs_flat_with_composites,
+          false_outputs_flat_with_composites)):
+    if isinstance(true_out, ops.IndexedSlices) != isinstance(
+        false_out, ops.IndexedSlices):
+      raise TypeError("Cannot reconcile tf.cond %i-th outputs:\n"
+                      "  true_fn returned:  %s\n"
+                      "  false_fn returned: %s" % (idx, true_out, false_out))
+    if isinstance(true_out, ops.IndexedSlices):
+      # indices is the second component of the composite tensor.
+      indexed_slice_indices.append(current_index + 1)
+    if nest.is_sequence_or_composite(true_out):
+      current_index += len(nest.flatten(true_out, expand_composites=True))
+    else:
+      current_index += 1
+
+  if not indexed_slice_indices:
+    return
+
+  if current_index != len(true_graph.outputs):
+    raise ValueError("Insufficient elements in true_graph.outputs.\n"
+                     "Expected: %i\n"
+                     "Actual: %i" % (current_index, len(true_graph.outputs)))
+
+  # Cast indices with mismatching types to int64.
+  for index in indexed_slice_indices:
+    if true_graph.outputs[index].dtype not in (dtypes.int32, dtypes.int64):
+      raise TypeError("Type of IndexedSlices.indices must be int32 or int64. "
+                      "Found: %s" % str(true_graph.outputs[index].dtype))
+    if false_graph.outputs[index].dtype not in (dtypes.int32, dtypes.int64):
+      raise TypeError("Type of IndexedSlices.indices must be int32 or int64. "
+                      "Found: %s" % str(false_graph.outputs[index].dtype))
+    if true_graph.outputs[index].dtype != false_graph.outputs[index].dtype:
+      if false_graph.outputs[index].dtype == dtypes.int32:
+        with false_graph.as_default():
+          false_graph.outputs[index] = math_ops.cast(false_graph.outputs[index],
+                                                     dtypes.int64)
+      else:
+        with true_graph.as_default():
+          true_graph.outputs[index] = math_ops.cast(true_graph.outputs[index],
+                                                    dtypes.int64)
+
+  true_graph.structured_outputs = func_graph_module.pack_sequence_as(
+      true_graph.structured_outputs, true_graph.outputs)
+  false_graph.structured_outputs = func_graph_module.pack_sequence_as(
+      false_graph.structured_outputs, false_graph.outputs)
+
+
 def _wrap_intermediates(func_graph, intermediates):
   with func_graph.as_default():
     return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
@@ -656,7 +714,7 @@
         tensor in self._forward_graph.outputs):
       return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
 
-    if control_flow_util.InXlaContext(ops.get_default_graph()):
+    if control_flow_util.GraphOrParentsInXlaContext(ops.get_default_graph()):
       # XLA does not yet support optionals, so capture intermediates directly.
       # TODO(skyewm,jpienaar): can XLA support optionals?
       if tensor not in self.captures:
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index f61d681..bdee7b4 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -116,7 +116,7 @@
   For example:
 
   ```python
-    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+    tf.math.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
         [[0 0 0 0 0]
          [0 0 1 0 0]
          [0 0 1 0 0]
@@ -226,7 +226,7 @@
   For example:
 
   ```python
-    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+    tf.math.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
         [[0 0 0 0 0]
          [0 0 1 0 0]
          [0 0 1 0 0]
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e1e35a6..4ad443e 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -247,9 +247,11 @@
       result.set_shape(data.get_shape())
     return result
   elif isinstance(data, composite_tensor.CompositeTensor):
+
     def enter_component(t):
-      return _Enter(t, frame_name, is_constant, parallel_iterations,
-                    use_ref, use_input_shape)
+      return _Enter(t, frame_name, is_constant, parallel_iterations, use_ref,
+                    use_input_shape)
+
     return nest.map_structure(enter_component, data, expand_composites=True)
   else:
     raise TypeError("Type %s not supported" % type(data))
@@ -409,12 +411,14 @@
         nest.assert_same_structure(inputs[0], v, expand_composites=True)
 
       flat_inputs = [nest.flatten(v, expand_composites=True) for v in inputs]
-      merged_results = [gen_control_flow_ops.merge(component)
-                        for component in zip(*flat_inputs)]
+      merged_results = [
+          gen_control_flow_ops.merge(component)
+          for component in zip(*flat_inputs)
+      ]
       flat_merged = [tensor for (tensor, _) in merged_results]
       chosen_index = merged_results[0][1]
-      merged_inputs = nest.pack_sequence_as(inputs[0], flat_merged,
-                                            expand_composites=True)
+      merged_inputs = nest.pack_sequence_as(
+          inputs[0], flat_merged, expand_composites=True)
       return (merged_inputs, chosen_index)
 
 
@@ -566,6 +570,7 @@
     # pylint: disable=protected-access
     def update_component(m_component, v_component):
       m_component.op._update_input(1, v_component)
+
     if isinstance(m, ops.IndexedSlices):
       v = math_ops._as_indexed_slices(v, optimize=False)
     # pylint: enable=protected-access
@@ -1488,8 +1493,10 @@
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
-      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result,
-                         expand_composites=True)
+      nest.map_structure(
+          lambda x: self._outer_context.AddName(x.name),
+          result,
+          expand_composites=True)
 
   def GetWhileContext(self):
     """Return the while context containing this context."""
@@ -1797,8 +1804,8 @@
       # Use pivot as the proxy for this op.
       return with_dependencies([v], self._pivot)
     else:
-      v = nest.map_structure(_convert_tensorarray_to_flow, v,
-                             expand_composites=True)
+      v = nest.map_structure(
+          _convert_tensorarray_to_flow, v, expand_composites=True)
       return self._ProcessOutputTensor(ops.convert_to_tensor(v))
 
   def BuildCondBranch(self, fn):
@@ -1814,14 +1821,13 @@
         if original_result is None:
           return no_op(), None
         else:
-          original_result = nest.map_structure(array_ops.identity,
-                                               original_result,
-                                               expand_composites=True)
+          original_result = nest.map_structure(
+              array_ops.identity, original_result, expand_composites=True)
     if original_result is None:
       return None, None
 
-    result = nest.map_structure(self._BuildCondTensor, original_result,
-                                expand_composites=True)
+    result = nest.map_structure(
+        self._BuildCondTensor, original_result, expand_composites=True)
     if not isinstance(result, (list, _basetuple)):
       result = [result]
     return original_result, result
@@ -1946,8 +1952,12 @@
   with ops.name_scope(name, "cond", [pred]):
     if context.executing_eagerly():
       if pred:
-        return _UnpackIfSingleton(true_fn())
-      return _UnpackIfSingleton(false_fn())
+        result = true_fn()
+      else:
+        result = false_fn()
+      if not strict:
+        result = _UnpackIfSingleton(result)
+      return result
 
     # Add the Switch to the graph.
     if isinstance(pred, bool):
@@ -1988,8 +1998,7 @@
 
     # Check that the return values of the two branches have the same structure.
     try:
-      nest.assert_same_structure(orig_res_t, orig_res_f,
-                                 expand_composites=True)
+      nest.assert_same_structure(orig_res_t, orig_res_f, expand_composites=True)
     except TypeError as e:
       raise TypeError(
           "Incompatible return types of true_fn and false_fn: {}".format(e))
@@ -2024,8 +2033,8 @@
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
-    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges,
-                                   expand_composites=True)
+    merges = nest.pack_sequence_as(
+        structure=orig_res_t, flat_sequence=merges, expand_composites=True)
 
     # Singleton lists and tuples are automatically unpacked if strict == False.
     if not strict:
@@ -2044,12 +2053,12 @@
 
   Args:
     structure: The nested structure that was flattened.
-    flat_a: A flattened list of `Tensors` whose structure matches
-        `structure`.  Will be modified in place to cast `IndexedSlices`
-        indices tensors to int64, where necessary.
-    flat_a: A flattened list of `Tensors` whose structure matches
-        `structure`.  Will be modified in place to cast `IndexedSlices`
-        indices tensors to int64, where necessary.
+    flat_a: A flattened list of `Tensors` whose structure matches `structure`.
+      Will be modified in place to cast `IndexedSlices` indices tensors to
+      int64, where necessary.
+    flat_b: A flattened list of `Tensors` whose structure matches `structure`.
+      Will be modified in place to cast `IndexedSlices` indices tensors to
+      int64, where necessary.
   """
   # Find the locations (in flat_a and flat_b) of the IndexedSlices'
   # indices tensors.
@@ -2080,10 +2089,7 @@
 
 
 @tf_export("cond", v1=[])
-def cond_for_tf_v2(pred,
-                   true_fn=None,
-                   false_fn=None,
-                   name=None):
+def cond_for_tf_v2(pred, true_fn=None, false_fn=None, name=None):
   """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
 
   `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
@@ -2944,15 +2950,15 @@
             return x
           return array_ops.identity(x)
 
-        body_result = nest.map_structure(map_fn, body_result,
-                                         expand_composites=True)
+        body_result = nest.map_structure(
+            map_fn, body_result, expand_composites=True)
 
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
     # outputs of the body are typically tuples.
-    nest.assert_same_structure(list(packed_vars_for_body), list(body_result),
-                               expand_composites=True)
+    nest.assert_same_structure(
+        list(packed_vars_for_body), list(body_result), expand_composites=True)
 
     # Store body_result to keep track of TensorArrays returned by body
     original_body_result = body_result
@@ -3196,9 +3202,10 @@
   n = 10000
   x = tf.constant(list(range(n)))
   c = lambda i, x: i < n
-  b = lambda i, x: (tf.Print(i + 1, [i]), tf.Print(x + 1, [i], "x:"))
+  b = lambda i, x: (tf.compat.v1.Print(i + 1, [i]),
+                    tf.compat.v1.Print(x + 1, [i], "x:"))
   i, out = tf.while_loop(c, b, (0, x))
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       print(sess.run(i))  # prints [0] ... [9999]
 
       # The following line may increment the counter and x in parallel.
@@ -3382,9 +3389,10 @@
   n = 10000
   x = tf.constant(list(range(n)))
   c = lambda i, x: i < n
-  b = lambda i, x: (tf.Print(i + 1, [i]), tf.Print(x + 1, [i], "x:"))
+  b = lambda i, x: (tf.compat.v1.Print(i + 1, [i]),
+                    tf.compat.v1.Print(x + 1, [i], "x:"))
   i, out = tf.while_loop(c, b, (0, x))
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       print(sess.run(i))  # prints [0] ... [9999]
 
       # The following line may increment the counter and x in parallel.
@@ -3461,7 +3469,8 @@
         if isinstance(x, tensor_array_ops.TensorArray):
           return x
         return ops.convert_to_tensor(x)
-      loop_vars = nest.map_structure(convert, loop_vars)
+
+      loop_vars = nest.map_structure(convert, loop_vars, expand_composites=True)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
@@ -3471,10 +3480,12 @@
       if maximum_iterations is not None:
         shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
 
-      nest.assert_same_structure(loop_vars, shape_invariants,
-                                 expand_composites=False)
+      nest.assert_same_structure(
+          loop_vars, shape_invariants, expand_composites=False)
       shape_invariants = nest.map_structure(
-          _get_shape_invariant, loop_vars, shape_invariants,
+          _get_shape_invariant,
+          loop_vars,
+          shape_invariants,
           expand_composites=False)
 
     loop_context = WhileContext(
@@ -4036,8 +4047,8 @@
   def to_control_flow_context_def(self, context_def, export_scope=None):
     # pylint: disable=useless-super-delegation
     # NOTE(slebedev): the method is required by `ControlFlowContext`.
-    super(XLAControlFlowContext, self).to_control_flow_context_def(
-        context_def, export_scope)
+    super(XLAControlFlowContext,
+          self).to_control_flow_context_def(context_def, export_scope)
 
   def IsXLAContext(self):
     return True
diff --git a/tensorflow/python/ops/critical_section_ops.py b/tensorflow/python/ops/critical_section_ops.py
index b5acde0..85d828c 100644
--- a/tensorflow/python/ops/critical_section_ops.py
+++ b/tensorflow/python/ops/critical_section_ops.py
@@ -124,7 +124,7 @@
   will not ensure serial execution:
 
   ```python
-  v = tf.get_variable("v", initializer=0.0, use_resource=True)
+  v = tf.compat.v1.get_variable("v", initializer=0.0, use_resource=True)
   def accumulate(up):
     x = v.read_value()
     with tf.control_dependencies([x]):
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 1fe300b..22a8c95 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """CTC (Connectionist Temporal Classification) Operations."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -27,6 +28,7 @@
 from tensorflow.python.framework import tensor_shape
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops import inplace_ops
@@ -43,10 +45,13 @@
 
 # pylint: disable=protected-access, invalid-name
 @tf_export(v1=["nn.ctc_loss"])
-def ctc_loss(labels, inputs=None, sequence_length=None,
+def ctc_loss(labels,
+             inputs=None,
+             sequence_length=None,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
-             ignore_longer_outputs_than_inputs=False, time_major=True,
+             ignore_longer_outputs_than_inputs=False,
+             time_major=True,
              logits=None):
   """Computes the CTC (Connectionist Temporal Classification) Loss.
 
@@ -119,28 +124,24 @@
 
   Args:
     labels: An `int32` `SparseTensor`.
-      `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores
-      the id for (batch b, time t).
-      `labels.values[i]` must take on values in `[0, num_labels)`.
-      See `core/ops/ctc_ops.cc` for more details.
+      `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores the id
+        for (batch b, time t). `labels.values[i]` must take on values in
+        `[0, num_labels)`. See `core/ops/ctc_ops.cc` for more details.
     inputs: 3-D `float` `Tensor`.
-      If time_major == False, this will be a `Tensor` shaped:
-        `[batch_size, max_time, num_classes]`.
+      If time_major == False, this will be a `Tensor` shaped: `[batch_size,
+        max_time, num_classes]`.
       If time_major == True (default), this will be a `Tensor` shaped:
-        `[max_time, batch_size, num_classes]`.
-      The logits.
-    sequence_length: 1-D `int32` vector, size `[batch_size]`.
-      The sequence lengths.
-    preprocess_collapse_repeated: Boolean.  Default: False.
-      If True, repeated labels are collapsed prior to the CTC calculation.
+        `[max_time, batch_size, num_classes]`. The logits.
+    sequence_length: 1-D `int32` vector, size `[batch_size]`. The sequence
+      lengths.
+    preprocess_collapse_repeated: Boolean.  Default: False. If True, repeated
+      labels are collapsed prior to the CTC calculation.
     ctc_merge_repeated: Boolean.  Default: True.
-    ignore_longer_outputs_than_inputs: Boolean. Default: False.
-      If True, sequences with longer outputs than inputs will be ignored.
-    time_major: The shape format of the `inputs` Tensors.
-      If True, these `Tensors` must be shaped `[max_time, batch_size,
-      num_classes]`.
-      If False, these `Tensors` must be shaped `[batch_size, max_time,
-      num_classes]`.
+    ignore_longer_outputs_than_inputs: Boolean. Default: False. If True,
+      sequences with longer outputs than inputs will be ignored.
+    time_major: The shape format of the `inputs` Tensors. If True, these
+      `Tensors` must be shaped `[max_time, batch_size, num_classes]`. If False,
+      these `Tensors` must be shaped `[batch_size, max_time, num_classes]`.
       Using `time_major = True` (default) is a bit more efficient because it
       avoids transposes at the beginning of the ctc_loss calculation.  However,
       most TensorFlow data is batch-major, so by this function also accepts
@@ -160,8 +161,8 @@
     raise TypeError("Expected labels (first argument) to be a SparseTensor")
 
   # For internal calculations, we transpose to [time, batch, num_classes]
-  inputs = deprecation.deprecated_argument_lookup(
-      "logits", logits, "inputs", inputs)
+  inputs = deprecation.deprecated_argument_lookup("logits", logits, "inputs",
+                                                  inputs)
   if not time_major:
     inputs = array_ops.transpose(inputs, [1, 0, 2])  # (B,T,N) => (T,B,N)
 
@@ -196,7 +197,8 @@
   # so we make sure we prevent silently incorrect results by raising
   # an error if the second derivative is requested via prevent_gradient.
   grad_without_gradient = array_ops.prevent_gradient(
-      op.outputs[1], message="Currently there is no way to take the second "
+      op.outputs[1],
+      message="Currently there is no way to take the second "
       " derivative of ctc_loss due to the fused implementation's interaction "
       " with tf.gradients()")
   # Return gradient for inputs and None for
@@ -221,10 +223,10 @@
     * `A B B B B` if `merge_repeated=False`.
 
   Args:
-    inputs: 3-D `float` `Tensor` sized
-      `[max_time, batch_size, num_classes]`.  The logits.
-    sequence_length: 1-D `int32` vector containing sequence lengths,
-      having size `[batch_size]`.
+    inputs: 3-D `float` `Tensor` sized `[max_time, batch_size, num_classes]`.
+      The logits.
+    sequence_length: 1-D `int32` vector containing sequence lengths, having size
+      `[batch_size]`.
     merge_repeated: Boolean.  Default: True.
 
   Returns:
@@ -249,13 +251,16 @@
   outputs = gen_ctc_ops.ctc_greedy_decoder(
       inputs, sequence_length, merge_repeated=merge_repeated)
   (decoded_ix, decoded_val, decoded_shape, log_probabilities) = outputs
-  return ([sparse_tensor.SparseTensor(decoded_ix, decoded_val, decoded_shape)],
-          log_probabilities)
+  return ([sparse_tensor.SparseTensor(decoded_ix, decoded_val,
+                                      decoded_shape)], log_probabilities)
 
 
 @tf_export(v1=["nn.ctc_beam_search_decoder"])
-def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
-                            top_paths=1, merge_repeated=True):
+def ctc_beam_search_decoder(inputs,
+                            sequence_length,
+                            beam_width=100,
+                            top_paths=1,
+                            merge_repeated=True):
   """Performs beam search decoding on the logits given in input.
 
   **Note** The `ctc_greedy_decoder` is a special case of the
@@ -271,10 +276,10 @@
     * `A B B B` if `merge_repeated = False`.
 
   Args:
-    inputs: 3-D `float` `Tensor`, size
-      `[max_time x batch_size x num_classes]`.  The logits.
-    sequence_length: 1-D `int32` vector containing sequence lengths,
-      having size `[batch_size]`.
+    inputs: 3-D `float` `Tensor`, size `[max_time x batch_size x num_classes]`.
+      The logits.
+    sequence_length: 1-D `int32` vector containing sequence lengths, having size
+      `[batch_size]`.
     beam_width: An int scalar >= 0 (beam search beam width).
     top_paths: An int scalar >= 0, <= beam_width (controls output size).
     merge_repeated: Boolean.  Default: True.
@@ -300,17 +305,22 @@
 
   decoded_ixs, decoded_vals, decoded_shapes, log_probabilities = (
       gen_ctc_ops.ctc_beam_search_decoder(
-          inputs, sequence_length, beam_width=beam_width, top_paths=top_paths,
+          inputs,
+          sequence_length,
+          beam_width=beam_width,
+          top_paths=top_paths,
           merge_repeated=merge_repeated))
 
-  return (
-      [sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape)
-       in zip(decoded_ixs, decoded_vals, decoded_shapes)],
-      log_probabilities)
+  return ([
+      sparse_tensor.SparseTensor(ix, val, shape)
+      for (ix, val, shape) in zip(decoded_ixs, decoded_vals, decoded_shapes)
+  ], log_probabilities)
 
 
 @tf_export("nn.ctc_beam_search_decoder", v1=["nn.ctc_beam_search_decoder_v2"])
-def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100,
+def ctc_beam_search_decoder_v2(inputs,
+                               sequence_length,
+                               beam_width=100,
                                top_paths=1):
   """Performs beam search decoding on the logits given in input.
 
@@ -319,10 +329,10 @@
   that decoder is faster for this special case).
 
   Args:
-    inputs: 3-D `float` `Tensor`, size
-      `[max_time, batch_size, num_classes]`.  The logits.
-    sequence_length: 1-D `int32` vector containing sequence lengths,
-      having size `[batch_size]`.
+    inputs: 3-D `float` `Tensor`, size `[max_time, batch_size, num_classes]`.
+      The logits.
+    sequence_length: 1-D `int32` vector containing sequence lengths, having size
+      `[batch_size]`.
     beam_width: An int scalar >= 0 (beam search beam width).
     top_paths: An int scalar >= 0, <= beam_width (controls output size).
 
@@ -347,9 +357,12 @@
 
   # Note, merge_repeated is an invalid optimization that is removed from the
   # public API: it returns low probability paths.
-  return ctc_beam_search_decoder(inputs, sequence_length=sequence_length,
-                                 beam_width=beam_width, top_paths=top_paths,
-                                 merge_repeated=False)
+  return ctc_beam_search_decoder(
+      inputs,
+      sequence_length=sequence_length,
+      beam_width=beam_width,
+      top_paths=top_paths,
+      merge_repeated=False)
 
 
 ops.NotDifferentiable("CTCGreedyDecoder")
@@ -388,8 +401,8 @@
     label_to_blank = array_ops.stack([blank_states, label_states], 1)
 
     # Scatter transitions that don't depend on sequence.
-    indices = array_ops.concat(
-        [start_to_label, blank_to_label, label_to_blank], 0)
+    indices = array_ops.concat([start_to_label, blank_to_label, label_to_blank],
+                               0)
     values = array_ops.ones([_get_dim(indices, 0)])
     trans = array_ops.scatter_nd(
         indices, values, shape=[num_states, num_states])
@@ -398,8 +411,8 @@
     # Label to label transitions. Disallow transitions between repeated labels
     # with no blank state in between.
     batch_idx = array_ops.zeros_like(label_states[2:])
-    indices = array_ops.stack(
-        [batch_idx, label_states[2:], label_states[1:-1]], 1)
+    indices = array_ops.stack([batch_idx, label_states[2:], label_states[1:-1]],
+                              1)
     indices = array_ops.tile(
         array_ops.expand_dims(indices, 0), [batch_size, 1, 1])
     batch_idx = array_ops.expand_dims(math_ops.range(batch_size), 1) * [1, 0, 0]
@@ -431,14 +444,14 @@
   num_duration_states = 2
   num_states = num_duration_states * num_label_states
   log_0 = math_ops.cast(
-      math_ops.log(math_ops.cast(0, dtypes.float64) + 1e-307),
-      dtypes.float32)
+      math_ops.log(math_ops.cast(0, dtypes.float64) + 1e-307), dtypes.float32)
 
   initial_state_log_probs = array_ops.one_hot(
       indices=array_ops.zeros([batch_size], dtype=dtypes.int32),
       depth=num_states,
       on_value=0.0,
-      off_value=log_0, axis=1)
+      off_value=log_0,
+      axis=1)
 
   label_final_state_mask = array_ops.one_hot(
       seq_lengths, depth=num_label_states, axis=0)
@@ -446,8 +459,8 @@
       [num_duration_states, 1, batch_size])
   final_state_mask = duration_final_state_mask * label_final_state_mask
   final_state_log_probs = (1.0 - final_state_mask) * log_0
-  final_state_log_probs = array_ops.reshape(
-      final_state_log_probs, [num_states, batch_size])
+  final_state_log_probs = array_ops.reshape(final_state_log_probs,
+                                            [num_states, batch_size])
 
   return initial_state_log_probs, array_ops.transpose(final_state_log_probs)
 
@@ -475,13 +488,14 @@
   label_states = states[:, :, 1:num_label_states]
   blank_states = states[:, :, num_label_states:]
   one_hot = array_ops.one_hot(
-      labels - 1, depth=(num_labels - 1),
-      on_value=0.0, off_value=math_ops.log(0.0))
+      labels - 1,
+      depth=(num_labels - 1),
+      on_value=0.0,
+      off_value=math_ops.log(0.0))
   one_hot = array_ops.expand_dims(one_hot, axis=0)
   label_states = array_ops.expand_dims(label_states, axis=3)
   label_olabels = math_ops.reduce_logsumexp(label_states + one_hot, axis=2)
-  blank_olabels = math_ops.reduce_logsumexp(
-      blank_states, axis=2, keepdims=True)
+  blank_olabels = math_ops.reduce_logsumexp(blank_states, axis=2, keepdims=True)
   return array_ops.concat([blank_olabels, label_olabels], axis=-1)
 
 
@@ -500,8 +514,8 @@
   batch_size = states.shape[1]
   num_states = num_label_states - 1
   batch_state_major = array_ops.transpose(mul_reduce, perm=[1, 2, 0])
-  batch_state_major = array_ops.reshape(
-      batch_state_major, [batch_size * num_states, num_frames])
+  batch_state_major = array_ops.reshape(batch_state_major,
+                                        [batch_size * num_states, num_frames])
   batch_offset = math_ops.range(batch_size, dtype=unique_y.dtype) * num_labels
   indices = unique_y + array_ops.expand_dims(batch_offset, axis=-1)
   indices = array_ops.reshape(indices, [-1, 1])
@@ -512,13 +526,11 @@
   scatter = array_ops.reshape(scatter, [batch_size, num_labels, num_frames])
   scatter = array_ops.where(
       math_ops.equal(scatter, 0.0),
-      array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)),
-      scatter)
+      array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)), scatter)
   label_olabels = array_ops.transpose(scatter, [2, 0, 1])
   label_olabels = label_olabels[:, :, 1:]
 
-  blank_olabels = math_ops.reduce_logsumexp(
-      blank_states, axis=2, keepdims=True)
+  blank_olabels = math_ops.reduce_logsumexp(blank_states, axis=2, keepdims=True)
 
   return array_ops.concat([blank_olabels, label_olabels], axis=-1)
 
@@ -534,12 +546,12 @@
   Args:
     logits: tensor of shape [frames, batch_size, num_labels]
     labels: tensor of shape [batch_size, max_label_seq_length]
-    label_length: tensor of shape [batch_size]
-      Length of reference label sequence in labels.
-    logit_length: tensor of shape [batch_size]
-      Length of input sequence in logits.
-    unique: (optional) unique label indices as computed by unique(labels)
-      If supplied, enables an implementation that is faster and more memory
+    label_length: tensor of shape [batch_size]. Length of reference label
+      sequence in labels.
+    logit_length: tensor of shape [batch_size]. Length of input sequence in
+      logits.
+    unique: (optional) unique label indices as computed by unique(labels). If
+      supplied, enables an implementation that is faster and more memory
       efficient on TPU.
 
   Returns:
@@ -563,8 +575,8 @@
       sequence_length=logit_length)
 
   if unique:
-    olabel_log_probs = _state_to_olabel_unique(
-        labels, num_labels, fwd_bwd_log_probs, unique)
+    olabel_log_probs = _state_to_olabel_unique(labels, num_labels,
+                                               fwd_bwd_log_probs, unique)
   else:
     olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)
 
@@ -585,9 +597,14 @@
 
 
 @tf_export("nn.ctc_loss", v1=["nn.ctc_loss_v2"])
-def ctc_loss_v2(labels, logits, label_length, logit_length,
-                logits_time_major=True, unique=None,
-                blank_index=None, name=None):
+def ctc_loss_v2(labels,
+                logits,
+                label_length,
+                logit_length,
+                logits_time_major=True,
+                unique=None,
+                blank_index=None,
+                name=None):
   """Computes CTC (Connectionist Temporal Classification) loss.
 
   This op implements the CTC loss as presented in the article:
@@ -598,7 +615,8 @@
   pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
 
   Notes:
-      - Same as the "Classic CTC" in TensorFlow 1.x's tf.nn.ctc_loss setting of
+      - Same as the "Classic CTC" in TensorFlow 1.x's tf.compat.v1.nn.ctc_loss
+        setting of
         preprocess_collapse_repeated=False, ctc_merge_repeated=True
       - Labels may be supplied as either a dense, zero-padded tensor with a
         vector of label sequence lengths OR as a SparseTensor.
@@ -612,22 +630,22 @@
 
   Args:
     labels: tensor of shape [batch_size, max_label_seq_length] or SparseTensor
-    logits: tensor of shape [frames, batch_size, num_labels],
-      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    logits: tensor of shape [frames, batch_size, num_labels], if
+      logits_time_major == False, shape is [batch_size, frames, num_labels].
     label_length: tensor of shape [batch_size], None if labels is SparseTensor
       Length of reference label sequence in labels.
-    logit_length: tensor of shape [batch_size]
-      Length of input sequence in logits.
-    logits_time_major: (optional) If True (default), logits is shaped
-      [time, batch, logits]. If False, shape is [batch, time, logits]
+    logit_length: tensor of shape [batch_size]. Length of input sequence in
+      logits.
+    logits_time_major: (optional) If True (default), logits is shaped [time,
+      batch, logits]. If False, shape is [batch, time, logits].
     unique: (optional) Unique label indices as computed by
-      ctc_unique_labels(labels).  If supplied, enable a faster, memory
-      efficient implementation on TPU.
+      ctc_unique_labels(labels).  If supplied, enable a faster, memory efficient
+      implementation on TPU.
     blank_index: (optional) Set the class index to use for the blank label.
       Negative values will start from num_classes, ie, -1 will reproduce the
-      ctc_loss behavior of using num_classes - 1 for the blank symbol.
-      There is some memory/performance overhead to switching from the default
-      of 0 as an additional shifted copy of the logits may be created.
+      ctc_loss behavior of using num_classes - 1 for the blank symbol. There is
+      some memory/performance overhead to switching from the default of 0 as an
+      additional shifted copy of the logits may be created.
     name: A name for this `Op`. Defaults to "ctc_loss_dense".
 
   Returns:
@@ -644,37 +662,43 @@
     if blank_index != _get_dim(logits, 2) - 1:
       logits = array_ops.concat([
           logits[:, :, :blank_index],
-          logits[:, :, blank_index+1:],
-          logits[:, :, blank_index:blank_index+1],
-      ], axis=2)
+          logits[:, :, blank_index + 1:],
+          logits[:, :, blank_index:blank_index + 1],
+      ],
+                                axis=2)
       labels = sparse_tensor.SparseTensor(
           labels.indices,
-          array_ops.where(labels.values < blank_index,
-                          labels.values,
-                          labels.values - 1),
-          labels.dense_shape)
+          array_ops.where(labels.values < blank_index, labels.values,
+                          labels.values - 1), labels.dense_shape)
 
-    return ctc_loss(labels=labels,
-                    inputs=logits,
-                    sequence_length=logit_length,
-                    time_major=logits_time_major)
+    return ctc_loss(
+        labels=labels,
+        inputs=logits,
+        sequence_length=logit_length,
+        time_major=logits_time_major)
 
   if blank_index is None:
     blank_index = 0
 
-  return ctc_loss_dense(labels=labels,
-                        logits=logits,
-                        label_length=label_length,
-                        logit_length=logit_length,
-                        logits_time_major=logits_time_major,
-                        unique=unique,
-                        blank_index=blank_index,
-                        name=name)
+  return ctc_loss_dense(
+      labels=labels,
+      logits=logits,
+      label_length=label_length,
+      logit_length=logit_length,
+      logits_time_major=logits_time_major,
+      unique=unique,
+      blank_index=blank_index,
+      name=name)
 
 
-def ctc_loss_dense(labels, logits, label_length, logit_length,
-                   logits_time_major=True, unique=None,
-                   blank_index=0, name=None):
+def ctc_loss_dense(labels,
+                   logits,
+                   label_length,
+                   logit_length,
+                   logits_time_major=True,
+                   unique=None,
+                   blank_index=0,
+                   name=None):
   """Computes CTC (Connectionist Temporal Classification) loss.
 
   This op implements the CTC loss as presented in the article:
@@ -694,8 +718,8 @@
   ](https://ieeexplore.ieee.org/iel7/8260578/8268903/08268944.pdf)
 
   Notes:
-    Significant differences from tf.nn.ctc_loss:
-      Supports GPU and TPU (tf.nn.ctc_loss supports CPU only):
+    Significant differences from tf.compat.v1.nn.ctc_loss:
+      Supports GPU and TPU (tf.compat.v1.nn.ctc_loss supports CPU only):
         For batched operations, GPU and TPU are significantly faster than using
         ctc_loss on CPU.
         This implementation runs on CPU, but significantly slower than ctc_loss.
@@ -714,21 +738,21 @@
 
   Args:
     labels: tensor of shape [batch_size, max_label_seq_length]
-    logits: tensor of shape [frames, batch_size, num_labels],
-      if logits_time_major == False, shape is [batch_size, frames, num_labels].
-    label_length: tensor of shape [batch_size]
-      Length of reference label sequence in labels.
-    logit_length: tensor of shape [batch_size]
-      Length of input sequence in logits.
-    logits_time_major: (optional) If True (default), logits is shaped
-      [time, batch, logits]. If False, shape is [batch, time, logits]
-    unique: (optional) Unique label indices as computed by unique(labels).
-      If supplied, enable a faster, memory efficient implementation on TPU.
+    logits: tensor of shape [frames, batch_size, num_labels], if
+      logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size]. Length of reference label
+      sequence in labels.
+    logit_length: tensor of shape [batch_size]. Length of input sequence in
+      logits.
+    logits_time_major: (optional) If True (default), logits is shaped [time,
+      batch, logits]. If False, shape is [batch, time, logits].
+    unique: (optional) Unique label indices as computed by unique(labels). If
+      supplied, enable a faster, memory efficient implementation on TPU.
     blank_index: (optional) Set the class index to use for the blank label.
       Negative values will start from num_classes, ie, -1 will reproduce the
-      ctc_loss behavior of using num_classes - 1 for the blank symbol.
-      There is some memory/performance overhead to switching from the default
-      of 0 as an additional shifted copy of the logits may be created.
+      ctc_loss behavior of using num_classes - 1 for the blank symbol. There is
+      some memory/performance overhead to switching from the default of 0 as an
+      additional shifted copy of the logits may be created.
     name: A name for this `Op`. Defaults to "ctc_loss_dense".
 
   Returns:
@@ -749,10 +773,11 @@
       if blank_index < 0:
         blank_index += _get_dim(logits, 2)
       logits = array_ops.concat([
-          logits[:, :, blank_index:blank_index+1],
+          logits[:, :, blank_index:blank_index + 1],
           logits[:, :, :blank_index],
-          logits[:, :, blank_index+1:],
-      ], axis=2)
+          logits[:, :, blank_index + 1:],
+      ],
+                                axis=2)
       labels = array_ops.where(labels < blank_index, labels + 1, labels)
 
     args = [logits, labels, label_length, logit_length]
@@ -761,10 +786,7 @@
       unique_y, unique_idx = unique
       args.extend([unique_y, unique_idx])
 
-    # TODO(tombagby): Update to tfe.defun
-    @function.Defun(*[x.dtype for x in args],
-                    python_grad_func=_ctc_loss_grad,
-                    shape_func=_ctc_loss_shape)
+    @custom_gradient.custom_gradient
     def compute_ctc_loss(logits_t, labels_t, label_length_t, logit_length_t,
                          *unique_t):
       """Compute CTC loss."""
@@ -779,9 +801,15 @@
           logit_length=logit_length_t)
       if unique_t:
         kwargs["unique"] = unique_t
-      return ctc_loss_and_grad(**kwargs)
+      result = ctc_loss_and_grad(**kwargs)
+      def grad(grad_loss):
+        grad = [array_ops.reshape(grad_loss, [1, -1, 1]) * result[1]]
+        grad += [None] * (len(args) - len(grad))
+        return grad
 
-    return compute_ctc_loss(*args)[0]
+      return result[0], grad
+
+    return compute_ctc_loss(*args)
 
 
 @tf_export("nn.collapse_repeated")
@@ -789,29 +817,30 @@
   """Merge repeated labels into single labels.
 
   Args:
-    labels: Tensor of shape (batch, max value in seq_length)
-    seq_length: Tensor of shape (batch), sequence length of each batch element.
+    labels: Tensor of shape [batch, max value in seq_length]
+    seq_length: Tensor of shape [batch], sequence length of each batch element.
     name: A name for this `Op`. Defaults to "collapse_repeated_labels".
 
   Returns:
-    tuple of Tensor of shape (batch, max_seq_length) with repeated labels
-    collapsed and padded to max_seq_length, eg:
-        [[A, A, B, B, A],
-         [A, B, C, D, E]] => [[A, B, A, 0, 0],
-                              [A, B, C, D, E]]
-    and int tensor of shape [batch] with new sequence lengths.
+    A tuple `(collapsed_labels, new_seq_length)` where
+
+    collapsed_labels: Tensor of shape [batch, max_seq_length] with repeated
+    labels collapsed and padded to max_seq_length, eg:
+    `[[A, A, B, B, A], [A, B, C, D, E]] => [[A, B, A, 0, 0], [A, B, C, D, E]]`
+
+    new_seq_length: int tensor of shape [batch] with new sequence lengths.
   """
 
-  with ops.name_scope(name, "collapse_repeated_labels",
-                      [labels, seq_length]):
+  with ops.name_scope(name, "collapse_repeated_labels", [labels, seq_length]):
     labels = ops.convert_to_tensor(labels, name="labels")
     seq_length = ops.convert_to_tensor(seq_length, name="seq_length")
 
     # Mask labels that don't equal previous label.
-    label_mask = array_ops.concat(
-        [array_ops.ones_like(labels[:, :1], dtypes.bool),
-         math_ops.not_equal(labels[:, 1:], labels[:, :-1])],
-        axis=1)
+    label_mask = array_ops.concat([
+        array_ops.ones_like(labels[:, :1], dtypes.bool),
+        math_ops.not_equal(labels[:, 1:], labels[:, :-1])
+    ],
+                                  axis=1)
 
     # Filter labels that aren't in the original sequence.
     maxlen = _get_dim(labels, 1)
@@ -851,8 +880,7 @@
 
   Args:
     dense: tensor of shape [batch, max_length]
-    length: int tensor of shape [batch]
-      The length of each sequence in dense.
+    length: int tensor of shape [batch]. The length of each sequence in dense.
 
   Returns:
     tf.SparseTensor with values only for the valid elements of sequences.
@@ -867,7 +895,8 @@
       array_ops.boolean_mask(flat_indices, flat_mask), 1)
   values = array_ops.boolean_mask(flat_values, flat_mask)
   sparse = sparse_tensor.SparseTensor(
-      indices=indices, values=math_ops.cast(values, dtypes.int32),
+      indices=indices,
+      values=math_ops.cast(values, dtypes.int32),
       dense_shape=array_ops.shape(flat_values, out_type=dtypes.int64))
   reshaped = sparse_ops.sparse_reshape(sparse, array_ops.shape(dense))
   max_length = math_ops.reduce_max(length)
@@ -876,14 +905,15 @@
       values=reshaped.values,
       dense_shape=[
           math_ops.cast(reshaped.dense_shape[0], dtypes.int64),
-          math_ops.cast(max_length, dtypes.int64)])
+          math_ops.cast(max_length, dtypes.int64)
+      ])
 
 
 @tf_export("nn.ctc_unique_labels")
 def ctc_unique_labels(labels, name=None):
   """Get unique labels and indices for batched labels for `tf.nn.ctc_loss`.
 
-  For use with `tf.nn.ctc_loss_v2` optional argument `unique`: This op can be
+  For use with `tf.nn.ctc_loss` optional argument `unique`: This op can be
   used to preprocess labels in input pipeline to for better speed/memory use
   computing the ctc loss on TPU.
 
@@ -904,25 +934,24 @@
 
   with ops.name_scope(name, "ctc_unique_labels", [labels]):
     labels = ops.convert_to_tensor(labels, name="labels")
+
     def _unique(x):
       u = array_ops.unique(x)
-      y = array_ops.pad(
-          u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
+      y = array_ops.pad(u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
       y = math_ops.cast(y, dtypes.int64)
       return [y, u.idx]
-    return map_fn.map_fn(
-        _unique, labels, dtype=[dtypes.int64, dtypes.int32])
+
+    return map_fn.map_fn(_unique, labels, dtype=[dtypes.int64, dtypes.int32])
 
 
 def _sum_states(idx, states):
   """Take logsumexp for each unique state out of all label states.
 
   Args:
-    idx: tensor of shape [batch, label_length]
-      For each sequence, indices into a set of unique labels as computed by
-      calling unique.
-    states: tensor of shape [frames, batch, label_length]
-      Log probabilities for each label state.
+    idx: tensor of shape [batch, label_length]. For each sequence, indices into
+      a set of unique labels as computed by calling unique.
+    states: tensor of shape [frames, batch, label_length]. Log probabilities
+      for each label state.
 
   Returns:
     tensor of shape [frames, batch_size, label_length], log probabilites summed
@@ -934,7 +963,10 @@
     num_states = _get_dim(states, 2)
     states = array_ops.expand_dims(states, axis=2)
     one_hot = array_ops.one_hot(
-        idx, depth=num_states, on_value=0.0, off_value=math_ops.log(0.0),
+        idx,
+        depth=num_states,
+        on_value=0.0,
+        off_value=math_ops.log(0.0),
         axis=1)
     return math_ops.reduce_logsumexp(states + one_hot, axis=-1)
 
@@ -945,8 +977,8 @@
   """Forward-backward algorithm computed in log domain.
 
   Args:
-    state_trans_log_probs: tensor of shape [states, states] or
-      if different transition matrix per batch [batch_size, states, states]
+    state_trans_log_probs: tensor of shape [states, states] or if different
+      transition matrix per batch [batch_size, states, states]
     initial_state_log_probs: tensor of shape [batch_size, states]
     final_state_log_probs: tensor of shape [batch_size, states]
     observed_log_probs: tensor of shape [frames, batch_size, states]
@@ -982,8 +1014,8 @@
     state_log_prob -= log_prob_sum
     return state_log_prob
 
-  fwd = _scan(_forward, observed_log_probs, initial_state_log_probs,
-              inclusive=True)
+  fwd = _scan(
+      _forward, observed_log_probs, initial_state_log_probs, inclusive=True)
 
   def _backward(accs, elems):
     """Calculate log probs and cumulative sum masked for sequence length."""
@@ -1009,9 +1041,11 @@
   mask = array_ops.sequence_mask(sequence_length, maxlen, dtypes.float32)
   mask = array_ops.transpose(mask, perm=[1, 0])
 
-  bwd, cum_log_sum = _scan(_backward, (observed_log_probs, mask),
-                           (final_state_log_probs, zero_log_sum),
-                           reverse=True, inclusive=True)
+  bwd, cum_log_sum = _scan(
+      _backward, (observed_log_probs, mask),
+      (final_state_log_probs, zero_log_sum),
+      reverse=True,
+      inclusive=True)
 
   fwd_bwd_log_probs = fwd[1:] + bwd[1:]
   fwd_bwd_log_probs_sum = math_ops.reduce_logsumexp(
@@ -1045,9 +1079,9 @@
       scan(lambda a, e: a + (e[0] * e[1]), (elems1, elems2), 0.0)
 
   Args:
-    fn: callable, fn(accumulators, element) return new accumulator values.
-      The (possibly nested) sequence of accumulators is the same as `initial`
-      and the return value must have the same structure.
+    fn: callable, fn(accumulators, element) return new accumulator values. The
+      (possibly nested) sequence of accumulators is the same as `initial` and
+      the return value must have the same structure.
     elems: A (possibly nested) tensor which will be unpacked along the first
       dimension. The resulting slices will be the second argument to fn. The
       first dimension of all nested input tensors must be the same.
@@ -1055,8 +1089,8 @@
       values for the accumulators.
     reverse: (optional) True enables scan and output elems in reverse order.
     inclusive: (optional) True includes the initial accumulator values in the
-      output. Length of output will be len(elem sequence) + 1. Not meaningful
-      if final_only is True.
+      output. Length of output will be len(elem sequence) + 1. Not meaningful if
+      final_only is True.
     final_only: (optional) When True, return only the final accumulated values,
       not the concatenation of accumulated values for each input.
 
@@ -1080,14 +1114,12 @@
     loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes + accum_dtypes
 
   # TODO(tombagby): Update to tfe.defun
-  @function.Defun(*loop_dtypes)
   def cond(i, num_elems, *args):
     del args
     return i >= 0 if reverse else i < num_elems
 
   # The loop *args are [output tensors] + [accumulator tensors] which must
   # be paired. Each output corresponds to one accumulator.
-  @function.Defun(*loop_dtypes)
   def body(i, num_elems, *args):
     """Loop body."""
     i.set_shape([])
@@ -1102,13 +1134,16 @@
       new_out = []
     else:
       update_i = i + 1 if inclusive and not reverse else i
-      new_out = [inplace_ops.alias_inplace_update(x, update_i, y)
-                 for x, y in zip(out, flat_accum)]
+      new_out = [
+          inplace_ops.alias_inplace_update(x, update_i, y)
+          for x, y in zip(out, flat_accum)
+      ]
     i = i - 1 if reverse else i + 1
     return [i, num_elems] + new_out + flat_accum
 
-  init_i = (array_ops.shape(flat_elems[0])[0] - 1 if reverse
-            else constant_op.constant(0, dtype=dtypes.int32))
+  init_i = (
+      array_ops.shape(flat_elems[0])[0] -
+      1 if reverse else constant_op.constant(0, dtype=dtypes.int32))
   outputs = []
   if not final_only:
     num_outputs = array_ops.shape(flat_elems[0])[0] + (1 if inclusive else 0)
@@ -1117,8 +1152,8 @@
           [[num_outputs], array_ops.shape(initial_accum)], 0)
       out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True)
       if inclusive:
-        out = inplace_ops.alias_inplace_add(
-            out, init_i + (1 if reverse else 0), initial_accum)
+        out = inplace_ops.alias_inplace_add(out, init_i + (1 if reverse else 0),
+                                            initial_accum)
       outputs.append(out)
   loop_in = [init_i, num_elems] + outputs + flat_initial
   hostmem = [
@@ -1126,8 +1161,15 @@
       if x.dtype.base_dtype in (dtypes.int32, dtypes.int64)
   ]
 
-  # TODO(tombagby): Update to while_v2.
-  loop_results = functional_ops.While(loop_in, cond, body, hostmem=hostmem)
+  if context.executing_eagerly():
+    loop_results = loop_in
+    while cond(*loop_results):
+      loop_results = body(*loop_results)
+  else:
+    # TODO(tombagby): Update to while_v2.
+    cond = function.Defun(*loop_dtypes)(cond)
+    body = function.Defun(*loop_dtypes)(body)
+    loop_results = functional_ops.While(loop_in, cond, body, hostmem=hostmem)
   out = loop_results[2:num_accums + 2]
   return pack(out)
 
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 33b1651..0ef72e1 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -89,7 +89,7 @@
 
   ```python
   def log1pexp(x):
-    return tf.log(1 + tf.exp(x))
+    return tf.math.log(1 + tf.exp(x))
   ```
 
   Due to numerical instability, the gradient this function evaluated at x=100 is
@@ -110,7 +110,7 @@
     e = tf.exp(x)
     def grad(dy):
       return dy * (1 - 1 / (1 + e))
-    return tf.log(1 + e), grad
+    return tf.math.log(1 + e), grad
   ```
 
   With this definition, the gradient at x=100 will be correctly evaluated as
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 41d8726..eac7eda 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -129,8 +129,8 @@
   handle single elements, versions that support enqueuing and
   dequeuing a batch of elements at once.
 
-  See `tf.FIFOQueue` and
-  `tf.RandomShuffleQueue` for concrete
+  See `tf.queue.FIFOQueue` and
+  `tf.queue.RandomShuffleQueue` for concrete
   implementations of this class, and instructions on how to create
   them.
   """
@@ -625,7 +625,7 @@
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
-  See `tf.QueueBase` for a description of the methods on
+  See `tf.queue.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -674,7 +674,7 @@
         with the same length as `dtypes`, or `None`.  If specified the dequeue
         methods return a dictionary with the names as keys.
       seed: A Python integer. Used to create a random seed. See
-        `tf.set_random_seed`
+        `tf.compat.v1.set_random_seed`
         for behavior.
       shared_name: (Optional.) If non-empty, this queue will be shared under
         the given name across multiple sessions.
@@ -711,7 +711,7 @@
 class FIFOQueue(QueueBase):
   """A queue implementation that dequeues elements in first-in first-out order.
 
-  See `tf.QueueBase` for a description of the methods on
+  See `tf.queue.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -774,7 +774,7 @@
   A `PaddingFIFOQueue` may contain components with dynamic shape, while also
   supporting `dequeue_many`.  See the constructor for more details.
 
-  See `tf.QueueBase` for a description of the methods on
+  See `tf.queue.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -847,7 +847,7 @@
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
-  See `tf.QueueBase` for a description of the methods on
+  See `tf.queue.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -1324,9 +1324,7 @@
 
 
 @tf_export(
-    "sparse.SparseConditionalAccumulator",
     v1=["sparse.SparseConditionalAccumulator", "SparseConditionalAccumulator"])
-@deprecation.deprecated_endpoints("SparseConditionalAccumulator")
 class SparseConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating sparse gradients.
 
diff --git a/tensorflow/python/ops/distributions/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
index 784bfd5..96a2ffa 100644
--- a/tensorflow/python/ops/distributions/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -74,7 +74,7 @@
       huge `n`.
     n:  Number of samples to draw for the checks.
     rtol:  Positive number.  Used for the Jacobian check.
-    sess:  `tf.Session`.  Defaults to the default session.
+    sess:  `tf.compat.v1.Session`.  Defaults to the default session.
 
   Raises:
     AssertionError:  If tests fail.
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 57505d1..8b95699 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -90,7 +90,7 @@
   the samples that are smaller than `np.finfo(dtype).tiny` are rounded
   to this value, so it appears more often than it should.
   This should only be noticeable when the `concentration` is very small, or the
-  `rate` is very large. See note in `tf.random_gamma` docstring.
+  `rate` is very large. See note in `tf.random.gamma` docstring.
 
   Samples of this distribution are reparameterized (pathwise differentiable).
   The derivatives are computed using the approach described in the paper
@@ -213,7 +213,7 @@
     return tensor_shape.scalar()
 
   @distribution_util.AppendDocstring(
-      """Note: See `tf.random_gamma` docstring for sampling details and
+      """Note: See `tf.random.gamma` docstring for sampling details and
       caveats.""")
   def _sample_n(self, n, seed=None):
     return random_ops.random_gamma(
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 3c647686..eada3cc 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -182,9 +182,9 @@
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Inline(
       forward_fn=tf.exp,
-      inverse_fn=tf.log,
+      inverse_fn=tf.math.log,
       inverse_log_det_jacobian_fn=(
-        lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
+        lambda y: -tf.reduce_sum(tf.math.log(y), axis=-1)),
     name="LogNormalTransformedDistribution")
   ```
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 24314e8..71d8477 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -36,9 +36,12 @@
 from tensorflow.python.util import tf_inspect
 
 
-def assert_integer_form(
-    x, data=None, summarize=None, message=None,
-    int_dtype=None, name="assert_integer_form"):
+def assert_integer_form(x,
+                        data=None,
+                        summarize=None,
+                        message=None,
+                        int_dtype=None,
+                        name="assert_integer_form"):
   """Assert that x has integer components (or floats equal to integers).
 
   Args:
@@ -69,8 +72,12 @@
       except KeyError:
         raise TypeError("Unrecognized type {}".format(x.dtype.name))
     return check_ops.assert_equal(
-        x, math_ops.cast(math_ops.cast(x, int_dtype), x.dtype),
-        data=data, summarize=summarize, message=message, name=name)
+        x,
+        math_ops.cast(math_ops.cast(x, int_dtype), x.dtype),
+        data=data,
+        summarize=summarize,
+        message=message,
+        name=name)
 
 
 def assert_symmetric(matrix):
@@ -91,8 +98,8 @@
     if not x.dtype.is_integer:
       assertions += [
           assert_integer_form(
-              x, message="'{}' cannot contain fractional components.".format(
-                  x)),
+              x,
+              message="'{}' cannot contain fractional components.".format(x)),
       ]
     return control_flow_ops.with_dependencies(assertions, x)
 
@@ -114,16 +121,18 @@
   # static shape inference may break the equality comparison between
   # shape(a) and shape(b) in math_ops.equal.
   def all_shapes_equal():
-    return math_ops.reduce_all(math_ops.equal(
-        array_ops.concat([array_ops.shape(a), array_ops.shape(b)], 0),
-        array_ops.concat([array_ops.shape(b), array_ops.shape(a)], 0)))
+    return math_ops.reduce_all(
+        math_ops.equal(
+            array_ops.concat(
+                [array_ops.shape(a), array_ops.shape(b)], 0),
+            array_ops.concat(
+                [array_ops.shape(b), array_ops.shape(a)], 0)))
 
   # One of the shapes isn't fully defined, so we need to use the dynamic
   # shape.
   return control_flow_ops.cond(
       math_ops.equal(array_ops.rank(a), array_ops.rank(b)),
-      all_shapes_equal,
-      lambda: constant_op.constant(False))
+      all_shapes_equal, lambda: constant_op.constant(False))
 
 
 def maybe_get_static_value(x, dtype=None):
@@ -162,13 +171,13 @@
   Args:
     logits: Floating-point `Tensor` representing log-odds.
     probs: Floating-point `Tensor` representing probabilities.
-    multidimensional: Python `bool`, default `False`.
-      If `True`, represents whether the last dimension of `logits` or `probs`,
-      a `[N1, N2, ...  k]` dimensional tensor, representing the
-      logit or probability of `shape[-1]` classes.
-    validate_args: Python `bool`, default `False`. When `True`, either assert
-      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
-      of `probs` sums to one.
+    multidimensional: Python `bool`, default `False`. If `True`, represents
+      whether the last dimension of `logits` or `probs`, a `[N1, N2, ...  k]`
+      dimensional tensor, representing the logit or probability of `shape[-1]`
+      classes.
+    validate_args: Python `bool`, default `False`. When `True`, either assert
+      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
+      of `probs` sums to one.
     name: A name for this operation (optional).
     dtype: `tf.DType` to prefer when converting args to `Tensor`s.
 
@@ -213,8 +222,10 @@
                   message="probs does not sum to 1.")
           ]
         else:
-          dependencies += [check_ops.assert_less_equal(
-              probs, one, message="probs has components greater than 1.")]
+          dependencies += [
+              check_ops.assert_less_equal(
+                  probs, one, message="probs has components greater than 1.")
+          ]
         probs = control_flow_ops.with_dependencies(dependencies, probs)
 
     with ops.name_scope("logits"):
@@ -288,8 +299,7 @@
 
 
 def embed_check_categorical_event_shape(
-    categorical_param,
-    name="embed_check_categorical_event_shape"):
+    categorical_param, name="embed_check_categorical_event_shape"):
   """Embeds checks that categorical distributions don't have too many classes.
 
   A categorical-type distribution is one which, e.g., returns the class label
@@ -341,8 +351,8 @@
     # For more details, see:
     # https://en.wikipedia.org/wiki/Floating-point_arithmetic#Internal_representation
     x_dtype = x.dtype.base_dtype
-    max_event_size = (_largest_integer_by_dtype(x_dtype)
-                      if x_dtype.is_floating else 0)
+    max_event_size = (
+        _largest_integer_by_dtype(x_dtype) if x_dtype.is_floating else 0)
     if max_event_size == 0:
       raise TypeError("Unable to validate size of unrecognized dtype "
                       "({}).".format(x_dtype.name))
@@ -357,34 +367,36 @@
         raise ValueError("A categorical-distribution parameter must have at "
                          "least 2 events.")
       if event_size > max_event_size:
-        raise ValueError(
-            "Number of classes exceeds `dtype` precision, i.e., "
-            "{} implies shape ({}) cannot exceed {}.".format(
-                x_dtype.name, event_size, max_event_size))
+        raise ValueError("Number of classes exceeds `dtype` precision, i.e., "
+                         "{} implies shape ({}) cannot exceed {}.".format(
+                             x_dtype.name, event_size, max_event_size))
       return x
     else:
       event_size = array_ops.shape(x, name="x_shape")[-1]
       return control_flow_ops.with_dependencies([
           check_ops.assert_rank_at_least(
-              x, 1, message=("A categorical-distribution parameter must have "
-                             "at least 1 dimension.")),
+              x,
+              1,
+              message=("A categorical-distribution parameter must have "
+                       "at least 1 dimension.")),
           check_ops.assert_greater_equal(
-              array_ops.shape(x)[-1], 2,
+              array_ops.shape(x)[-1],
+              2,
               message=("A categorical-distribution parameter must have at "
                        "least 2 events.")),
           check_ops.assert_less_equal(
-              event_size, max_event_size,
+              event_size,
+              max_event_size,
               message="Number of classes exceeds `dtype` precision, "
-                      "i.e., {} dtype cannot exceed {} shape.".format(
-                          x_dtype.name, max_event_size)),
+              "i.e., {} dtype cannot exceed {} shape.".format(
+                  x_dtype.name, max_event_size)),
       ], x)
 
 
-def embed_check_integer_casting_closed(
-    x,
-    target_dtype,
-    assert_nonnegative=True,
-    name="embed_check_casting_closed"):
+def embed_check_integer_casting_closed(x,
+                                       target_dtype,
+                                       assert_nonnegative=True,
+                                       name="embed_check_casting_closed"):
   """Ensures integers remain unaffected despite casting to/from int/float types.
 
   Example integer-types: `uint8`, `int32`, `bool`.
@@ -416,19 +428,18 @@
 
   with ops.name_scope(name, values=[x]):
     x = ops.convert_to_tensor(x, name="x")
-    if (not _is_integer_like_by_dtype(x.dtype)
-        and not x.dtype.is_floating):
+    if (not _is_integer_like_by_dtype(x.dtype) and not x.dtype.is_floating):
       raise TypeError("{}.dtype must be floating- or "
                       "integer-type.".format(x.dtype.name))
-    if (not _is_integer_like_by_dtype(target_dtype)
-        and not target_dtype.is_floating):
+    if (not _is_integer_like_by_dtype(target_dtype) and
+        not target_dtype.is_floating):
       raise TypeError("target_dtype ({}) must be floating- or "
                       "integer-type.".format(target_dtype.name))
-    if (not _is_integer_like_by_dtype(x.dtype)
-        and not _is_integer_like_by_dtype(target_dtype)):
+    if (not _is_integer_like_by_dtype(x.dtype) and
+        not _is_integer_like_by_dtype(target_dtype)):
       raise TypeError("At least one of {}.dtype ({}) and target_dtype ({}) "
-                      "must be integer-type.".format(
-                          x, x.dtype.name, target_dtype.name))
+                      "must be integer-type.".format(x, x.dtype.name,
+                                                     target_dtype.name))
 
     assertions = []
     if assert_nonnegative:
@@ -442,26 +453,28 @@
       # Since this check implies the magnitude check below, we need only it.
       assertions += [
           assert_integer_form(
-              x, int_dtype=target_dtype,
+              x,
+              int_dtype=target_dtype,
               message="Elements must be {}-equivalent.".format(
                   target_dtype.name)),
       ]
     else:
-      if (_largest_integer_by_dtype(x.dtype)
-          > _largest_integer_by_dtype(target_dtype)):
+      if (_largest_integer_by_dtype(x.dtype) >
+          _largest_integer_by_dtype(target_dtype)):
         # Cast may lose integer precision.
         assertions += [
             check_ops.assert_less_equal(
-                x, _largest_integer_by_dtype(target_dtype),
+                x,
+                _largest_integer_by_dtype(target_dtype),
                 message=("Elements cannot exceed {}.".format(
                     _largest_integer_by_dtype(target_dtype)))),
         ]
-      if (not assert_nonnegative and
-          (_smallest_integer_by_dtype(x.dtype)
-           < _smallest_integer_by_dtype(target_dtype))):
+      if (not assert_nonnegative and (_smallest_integer_by_dtype(
+          x.dtype) < _smallest_integer_by_dtype(target_dtype))):
         assertions += [
             check_ops.assert_greater_equal(
-                x, _smallest_integer_by_dtype(target_dtype),
+                x,
+                _smallest_integer_by_dtype(target_dtype),
                 message=("Elements cannot be smaller than {}.".format(
                     _smallest_integer_by_dtype(target_dtype)))),
         ]
@@ -547,11 +560,10 @@
   Args:
     matrix:  Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are
       equal.
-    transform:  Element-wise function mapping `Tensors` to `Tensors`. To
-      be applied to the diagonal of `matrix`. If `None`, `matrix` is returned
+    transform:  Element-wise function mapping `Tensors` to `Tensors`. To be
+      applied to the diagonal of `matrix`. If `None`, `matrix` is returned
       unchanged. Defaults to `None`.
-    name:  A name to give created ops.
-      Defaults to "matrix_diag_transform".
+    name:  A name to give created ops. Defaults to "matrix_diag_transform".
 
   Returns:
     A `Tensor` with same shape and `dtype` as `matrix`.
@@ -583,7 +595,7 @@
   Example:
 
   ```python
-  x = tf.random_normal([1, 2, 3, 4])  # Tensor of shape [1, 2, 3, 4].
+  x = tf.random.normal([1, 2, 3, 4])  # Tensor of shape [1, 2, 3, 4].
   rotate_transpose(x, -1).shape == [2, 3, 4, 1]
   rotate_transpose(x, -2).shape == [3, 4, 1, 2]
   rotate_transpose(x,  1).shape == [4, 1, 2, 3]
@@ -612,10 +624,12 @@
     shift_value_static = tensor_util.constant_value(shift)
     ndims = x.get_shape().ndims
     if ndims is not None and shift_value_static is not None:
-      if ndims < 2: return x
+      if ndims < 2:
+        return x
       shift_value_static = np.sign(shift_value_static) * (
           abs(shift_value_static) % ndims)
-      if shift_value_static == 0: return x
+      if shift_value_static == 0:
+        return x
       perm = np.roll(np.arange(ndims), shift_value_static)
       return array_ops.transpose(x, perm=perm)
     else:
@@ -633,19 +647,16 @@
       # Finally, we transform shift by modulo length so it can be specified
       # independently from the array upon which it operates (like python).
       ndims = array_ops.rank(x)
-      shift = array_ops.where(math_ops.less(shift, 0),
-                              math_ops.mod(-shift, ndims),
-                              ndims - math_ops.mod(shift, ndims))
+      shift = array_ops.where(
+          math_ops.less(shift, 0), math_ops.mod(-shift, ndims),
+          ndims - math_ops.mod(shift, ndims))
       first = math_ops.range(0, shift)
       last = math_ops.range(shift, ndims)
       perm = array_ops.concat([last, first], 0)
       return array_ops.transpose(x, perm=perm)
 
 
-def pick_vector(cond,
-                true_vector,
-                false_vector,
-                name="pick_vector"):
+def pick_vector(cond, true_vector, false_vector, name="pick_vector"):
   """Picks possibly different length row `Tensor`s based on condition.
 
   Value `Tensor`s should have exactly one dimension.
@@ -659,13 +670,9 @@
     true_vector: `Tensor` of one dimension. Returned when cond is `True`.
     false_vector: `Tensor` of one dimension. Returned when cond is `False`.
     name: Python `str`. The name to give this op.
-
-  Example:
-
-  ```python
-  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))  # [10, 11]
-  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))  # [15, 16, 17]
-  ```
+
+  Example: `pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))`
+  returns `[10, 11]`; with `tf.less(5, 0)` it returns `[15, 16, 17]`.
 
   Returns:
     true_or_false_vector: `Tensor`.
@@ -687,17 +694,17 @@
     false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
     if true_vector.dtype != false_vector.dtype:
       raise TypeError(
-          "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector, true_vector.dtype,
-             false_vector, false_vector.dtype))
+          "%s.dtype=%s does not match %s.dtype=%s" %
+          (true_vector, true_vector.dtype, false_vector, false_vector.dtype))
     n = array_ops.shape(true_vector)[0]
     return array_ops.slice(
         array_ops.concat([true_vector, false_vector], 0),
         [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
 
 
-def prefer_static_broadcast_shape(
-    shape1, shape2, name="prefer_static_broadcast_shape"):
+def prefer_static_broadcast_shape(shape1,
+                                  shape2,
+                                  name="prefer_static_broadcast_shape"):
   """Convenience function which statically broadcasts shape when possible.
 
   Args:
@@ -710,6 +717,7 @@
       statically), or as a `Tensor`.
   """
   with ops.name_scope(name, values=[shape1, shape2]):
+
     def make_shape_tensor(x):
       return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
 
@@ -892,14 +900,11 @@
     else:
       x_list = [x[..., n:], array_ops.reverse(x, axis=[ndims - 1])]
     new_shape = (
-        static_final_shape.as_list()
-        if static_final_shape.is_fully_defined()
+        static_final_shape.as_list() if static_final_shape.is_fully_defined()
         else array_ops.concat([array_ops.shape(x)[:-1], [n, n]], axis=0))
     x = array_ops.reshape(array_ops.concat(x_list, axis=-1), new_shape)
     x = array_ops.matrix_band_part(
-        x,
-        num_lower=(0 if upper else -1),
-        num_upper=(-1 if upper else 0))
+        x, num_lower=(0 if upper else -1), num_upper=(-1 if upper else 0))
     x.set_shape(static_final_shape)
     return x
 
@@ -1041,17 +1046,17 @@
     return _add(below, diag, above)
 
 
-def reduce_weighted_logsumexp(
-    logx,
-    w=None,
-    axis=None,
-    keep_dims=False,
-    return_sign=False,
-    name=None):
+def reduce_weighted_logsumexp(logx,
+                              w=None,
+                              axis=None,
+                              keep_dims=False,
+                              return_sign=False,
+                              name=None):
   """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.
 
   If all weights `w` are known to be positive, it is more efficient to directly
-  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.log(w))` is more
+  use `reduce_logsumexp`, i.e.,
+  `tf.reduce_logsumexp(logx + tf.math.log(w))` is more
   efficient than `du.reduce_weighted_logsumexp(logx, w)`.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1094,9 +1099,9 @@
   Args:
     logx: The tensor to reduce. Should have numeric type.
     w: The weight tensor. Should have numeric type identical to `logx`.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     return_sign: If `True`, returns the sign of the result.
     name: A name for the operation (optional).
@@ -1121,8 +1126,7 @@
     # this is ok follows from the fact that we're actually free to subtract any
     # value we like, so long as we add it back after taking the `log(sum(...))`.
     max_log_absw_x = array_ops.where(
-        math_ops.is_inf(max_log_absw_x),
-        array_ops.zeros_like(max_log_absw_x),
+        math_ops.is_inf(max_log_absw_x), array_ops.zeros_like(max_log_absw_x),
         max_log_absw_x)
     wx_over_max_absw_x = (
         math_ops.sign(w) * math_ops.exp(log_absw_x - max_log_absw_x))
@@ -1187,8 +1191,9 @@
     too_large_value = x
     # This `where` will ultimately be a NOP because we won't select this
     # codepath whenever we used the surrogate `ones_like`.
-    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
-                        array_ops.ones_like(x), x)
+    x = array_ops.where(
+        math_ops.logical_or(is_too_small, is_too_large), array_ops.ones_like(x),
+        x)
     y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
     return array_ops.where(is_too_small, too_small_value,
                            array_ops.where(is_too_large, too_large_value, y))
@@ -1206,15 +1211,17 @@
   return array_ops.shape(x)[axis]
 
 
-def process_quadrature_grid_and_probs(
-    quadrature_grid_and_probs, dtype, validate_args, name=None):
+def process_quadrature_grid_and_probs(quadrature_grid_and_probs,
+                                      dtype,
+                                      validate_args,
+                                      name=None):
   """Validates quadrature grid, probs or computes them as necessary.
 
   Args:
     quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
       representing the sample points and the corresponding (possibly
       normalized) weight.  When `None`, defaults to:
-      `np.polynomial.hermite.hermgauss(deg=8)`.
+      `np.polynomial.hermite.hermgauss(deg=8)`.
     dtype: The expected `dtype` of `grid` and `probs`.
     validate_args: Python `bool`, default `False`. When `True` distribution
       parameters are checked for validity despite possibly degrading runtime
@@ -1244,8 +1251,7 @@
 
     grid, probs = tuple(quadrature_grid_and_probs)
     grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
-    probs = ops.convert_to_tensor(probs, name="unnormalized_probs",
-                                  dtype=dtype)
+    probs = ops.convert_to_tensor(probs, name="unnormalized_probs", dtype=dtype)
     probs /= linalg_ops.norm(probs, ord=1, axis=-1, keepdims=True, name="probs")
 
     def _static_event_size(x):
@@ -1281,13 +1287,13 @@
       (Negative indexing is supported.)
     front: Python `bool`; if `True` the beginning of the `axis` dimension is
       padded with `value`, `count` times. If `False` no front padding is made.
-    back: Python `bool`; if `True` the end of the `axis` dimension is
-      padded with `value`, `count` times. If `False` no end padding is made.
+    back: Python `bool`; if `True` the end of the `axis` dimension is padded
+      with `value`, `count` times. If `False` no end padding is made.
     value: Scalar `int`-like `Tensor` representing the actual value added to the
       front and/or back of the `axis` dimension of `x`.
     count: Scalar `int`-like `Tensor` representing number of elements added to
-      the front and/or back of the `axis` dimension of `x`. E.g., if
-      `front = back = True` then `2 * count` elements are added.
+      the front and/or back of the `axis` dimension of `x`. E.g., if
+      `front = back = True` then `2 * count` elements are added.
     name: Python `str` name prefixed to Ops created by this function.
 
   Returns:
@@ -1306,8 +1312,9 @@
           count.dtype.name))
     if not front and not back:
       raise ValueError("At least one of `front`, `back` must be `True`.")
-    ndims = (x.shape.ndims if x.shape.ndims is not None
-             else array_ops.rank(x, name="ndims"))
+    ndims = (
+        x.shape.ndims if x.shape.ndims is not None else array_ops.rank(
+            x, name="ndims"))
     axis = ops.convert_to_tensor(axis, name="axis")
     axis_ = tensor_util.constant_value(axis)
     if axis_ is not None:
@@ -1317,11 +1324,10 @@
       count_ = tensor_util.constant_value(count)
       if axis_ >= 0 or x.shape.ndims is not None:
         head = x.shape[:axis]
-        middle = tensor_shape.TensorShape(
-            None if count_ is None
-            else (tensor_shape.dimension_at_index(
-                x.shape, axis) + count_ * (front + back)))
-        tail = x.shape[axis+1:]
+        middle = tensor_shape.TensorShape(None if count_ is None else (
+            tensor_shape.dimension_at_index(x.shape, axis) + count_ *
+            (front + back)))
+        tail = x.shape[axis + 1:]
         final_shape = head.concatenate(middle.concatenate(tail))
       else:
         final_shape = None
@@ -1331,8 +1337,8 @@
     x = array_ops.pad(
         x,
         paddings=array_ops.one_hot(
-            indices=array_ops.stack([axis if front else -1,
-                                     axis if back else -1]),
+            indices=array_ops.stack(
+                [axis if front else -1, axis if back else -1]),
             depth=ndims,
             axis=0,
             on_value=count,
@@ -1407,8 +1413,8 @@
     Args:
       additional_note: Python string added as additional docstring to public
         version of function.
-      kwargs_dict: Python string/string dictionary representing
-        specific kwargs expanded from the **kwargs input.
+      kwargs_dict: Python string/string dictionary representing specific kwargs
+        expanded from the **kwargs input.
 
     Raises:
       ValueError: if kwargs_dict.key contains whitespace.
@@ -1420,20 +1426,20 @@
       for key in sorted(kwargs_dict.keys()):
         value = kwargs_dict[key]
         if any(x.isspace() for x in key):
-          raise ValueError(
-              "Parameter name \"%s\" contains whitespace." % key)
+          raise ValueError("Parameter name \"%s\" contains whitespace." % key)
         value = value.lstrip()
         if "\n" in value:
           raise ValueError(
               "Parameter description for \"%s\" contains newlines." % key)
         bullets.append("*  `%s`: %s" % (key, value))
-      self._additional_note += ("\n\n##### `kwargs`:\n\n" +
-                                "\n".join(bullets))
+      self._additional_note += ("\n\n##### `kwargs`:\n\n" + "\n".join(bullets))
 
   def __call__(self, fn):
+
     @functools.wraps(fn)
     def _fn(*args, **kwargs):
       return fn(*args, **kwargs)
+
     if _fn.__doc__ is None:
       _fn.__doc__ = self._additional_note
     else:
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f533a0e..e4c7087 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -45,8 +45,8 @@
   Args:
     params: A `Tensor` of embeddings retrieved by `gather`.
     ids: The `ids` argument that was passed to `gather`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value.
 
   Returns:
     A `Tensor` with the same type as `params`.
@@ -76,8 +76,7 @@
   return clip_ops.clip_by_norm(
       params,
       max_norm,
-      axes=(list(range(ids_rank, params_rank))
-            if ids_static and params_static
+      axes=(list(range(ids_rank, params_rank)) if ids_static and params_static
             else math_ops.range(ids_rank, params_rank)))
 
 
@@ -105,8 +104,8 @@
     partition_strategy: See embedding_lookup.
     name: See embedding_lookup.
     max_norm: See embedding_lookup.
-    transform_fn: An optional function to apply to each retrieved embedding.
-      If max_norm is provided, transform_fn is applied to the norm-limited
+    transform_fn: An optional function to apply to each retrieved embedding. If
+      max_norm is provided, transform_fn is applied to the norm-limited
       embeddings.
 
   Returns:
@@ -130,8 +129,8 @@
     ids = ops.convert_to_tensor(ids, name="ids")
     if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
       with ops.colocate_with(params[0]):
-        result = _clip(array_ops.gather(params[0], ids, name=name),
-                       ids, max_norm)
+        result = _clip(
+            array_ops.gather(params[0], ids, name=name), ids, max_norm)
         if transform_fn:
           result = transform_fn(result)
       # Make sure the final result does not have colocation contraints on the
@@ -155,11 +154,11 @@
         # Compute num_total_ids as the sum of dim-0 of params, then assign to
         # partitions based on a constant number of ids per partition. Optimize
         # if we already know the full shape statically.
-        dim_0_size = tensor_shape.Dimension(tensor_shape.dimension_value(
-            params[0].get_shape()[0]))
+        dim_0_size = tensor_shape.Dimension(
+            tensor_shape.dimension_value(params[0].get_shape()[0]))
         for p in xrange(1, np):
-          dim_0_size += tensor_shape.Dimension(tensor_shape.dimension_value(
-              params[p].get_shape()[0]))
+          dim_0_size += tensor_shape.Dimension(
+              tensor_shape.dimension_value(params[p].get_shape()[0]))
         if dim_0_size.value:
           num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
         else:
@@ -176,9 +175,9 @@
         ids_per_partition = num_total_ids // np
         extras = num_total_ids % np
 
-        p_assignments = math_ops.maximum(
-            flat_ids // (ids_per_partition + 1),
-            (flat_ids - extras) // ids_per_partition)
+        p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1),
+                                         (flat_ids - extras) //
+                                         ids_per_partition)
 
         # Emulate a conditional using a boolean indicator tensor
         new_ids = array_ops.where(p_assignments < extras,
@@ -233,9 +232,8 @@
         element_shape_d = array_ops.shape(ret)[1:]
 
       # Reshape to reverse the flattening of ids.
-      ret = array_ops.reshape(ret,
-                              array_ops.concat(
-                                  [array_ops.shape(ids), element_shape_d], 0))
+      ret = array_ops.reshape(
+          ret, array_ops.concat([array_ops.shape(ids), element_shape_d], 0))
 
       # Normally the reshape is sufficient, but setting shape explicitly
       # teaches shape inference that params[1:].get_shape() matters
@@ -261,7 +259,8 @@
   tensors in `params`.  It is a generalization of
   `tf.gather`, where `params` is
   interpreted as a partitioning of a large embedding tensor.  `params` may be
-  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  a `PartitionedVariable` as returned by using `tf.compat.v1.get_variable()`
+  with a
   partitioner.
 
   If `len(params) > 1`, each element `id` of `ids` is partitioned between
@@ -283,8 +282,8 @@
   tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
 
   Args:
-    params: A single tensor representing the complete embedding tensor,
-      or a list of P tensors all of same shape except for the first dimension,
+    params: A single tensor representing the complete embedding tensor, or a
+      list of P tensors all of same shape except for the first dimension,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
@@ -298,8 +297,8 @@
       in `indices` are always validated to be within range.  If assigned to GPU,
       out-of-bound indices result in safe but unspecified behavior, which may
       include raising an error.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value.
 
   Returns:
     A `Tensor` with the same type as the tensors in `params`.
@@ -317,18 +316,15 @@
 
 
 @tf_export("nn.embedding_lookup", v1=[])
-def embedding_lookup_v2(
-    params,
-    ids,
-    max_norm=None,
-    name=None):
+def embedding_lookup_v2(params, ids, max_norm=None, name=None):
   """Looks up `ids` in a list of embedding tensors.
 
   This function is used to perform parallel lookups on the list of
   tensors in `params`.  It is a generalization of
   `tf.gather`, where `params` is
   interpreted as a partitioning of a large embedding tensor.  `params` may be
-  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  a `PartitionedVariable` as returned by using `tf.compat.v1.get_variable()`
+  with a
   partitioner.
 
   If `len(params) > 1`, each element `id` of `ids` is partitioned between
@@ -346,15 +342,15 @@
   tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
 
   Args:
-    params: A single tensor representing the complete embedding tensor,
-      or a list of P tensors all of same shape except for the first dimension,
+    params: A single tensor representing the complete embedding tensor, or a
+      list of P tensors all of same shape except for the first dimension,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the 'div' `partition_strategy`.
     ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
       up in `params`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value.
     name: A name for the operation (optional).
 
   Returns:
@@ -363,8 +359,7 @@
   Raises:
     ValueError: If `params` is empty.
   """
-  return embedding_lookup(params, ids, "div", name,
-                          max_norm=max_norm)
+  return embedding_lookup(params, ids, "div", name, max_norm=max_norm)
 
 
 @tf_export(v1=["nn.embedding_lookup_sparse"])
@@ -385,8 +380,8 @@
   is the sum of the size of params along dimension 0.
 
   Args:
-    params: A single tensor representing the complete embedding tensor,
-      or a list of P tensors all of same shape except for the first dimension,
+    params: A single tensor representing the complete embedding tensor, or a
+      list of P tensors all of same shape except for the first dimension,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
@@ -400,13 +395,12 @@
       is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: Optional name for the op.
     combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
-      and "sum" are supported.
-      "sum" computes the weighted sum of the embedding results for each row.
-      "mean" is the weighted sum divided by the total weight.
-      "sqrtn" is the weighted sum divided by the square root of the sum of the
-      squares of the weights.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
+      and "sum" are supported. "sum" computes the weighted sum of the embedding
+      results for each row. "mean" is the weighted sum divided by the total
+      weight. "sqrtn" is the weighted sum divided by the square root of the sum
+      of the squares of the weights.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value, before combining.
 
   Returns:
     A dense tensor representing the combined embeddings for the
@@ -559,8 +553,8 @@
   is the sum of the size of params along dimension 0.
 
   Args:
-    params: A single tensor representing the complete embedding tensor,
-      or a list of P tensors all of same shape except for the first dimension,
+    params: A single tensor representing the complete embedding tensor, or a
+      list of P tensors all of same shape except for the first dimension,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for ``"div"`` `partition_strategy`.
@@ -570,13 +564,12 @@
       indicate all weights should be taken to be 1. If specified, `sp_weights`
       must have exactly the same shape and indices as `sp_ids`.
     combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
-      and "sum" are supported.
-      "sum" computes the weighted sum of the embedding results for each row.
-      "mean" is the weighted sum divided by the total weight.
-      "sqrtn" is the weighted sum divided by the square root of the sum of the
-      squares of the weights.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
+      and "sum" are supported. "sum" computes the weighted sum of the embedding
+      results for each row. "mean" is the weighted sum divided by the total
+      weight. "sqrtn" is the weighted sum divided by the square root of the sum
+      of the squares of the weights.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value, before combining.
     name: Optional name for the op.
 
   Returns:
@@ -619,8 +612,8 @@
       neither `None` nor `SparseTensor`.
     ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
   """
-  return embedding_lookup_sparse(
-      params, sp_ids, sp_weights, "div", name, combiner, max_norm)
+  return embedding_lookup_sparse(params, sp_ids, sp_weights, "div", name,
+                                 combiner, max_norm)
 
 
 @tf_export("nn.safe_embedding_lookup_sparse", v1=[])
@@ -636,7 +629,8 @@
   The partitioned embedding in `embedding_weights` must all be the same shape
   except for the first dimension. The first dimension is allowed to vary as the
   vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
-  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  may be a `PartitionedVariable` as returned by using
+  `tf.compat.v1.get_variable()` with a
   partitioner.
 
   Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
@@ -690,17 +684,18 @@
 def safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
-                                 combiner='mean',
+                                 combiner="mean",
                                  default_id=None,
                                  name=None,
-                                 partition_strategy='div',
+                                 partition_strategy="div",
                                  max_norm=None):
   """Lookup embedding results, accounting for invalid IDs and empty features.
 
   The partitioned embedding in `embedding_weights` must all be the same shape
   except for the first dimension. The first dimension is allowed to vary as the
   vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
-  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  may be a `PartitionedVariable` as returned by using
+  `tf.compat.v1.get_variable()` with a
   partitioner.
 
   Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
@@ -712,25 +707,24 @@
 
   Args:
     embedding_weights:  A list of `P` float `Tensor`s or values representing
-        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
-        created by partitioning along dimension 0.  The total unpartitioned
-        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
-        vocab size and `e_1, ..., e_m` are the embedding dimensions.
+      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+      created by partitioning along dimension 0.  The total unpartitioned shape
+      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
+      and `e_1, ..., e_m` are the embedding dimensions.
     sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
-        ids. `d_0` is typically batch size.
+      ids. `d_0` is typically batch size.
     sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
-        float weights corresponding to `sparse_ids`, or `None` if all weights
-        are be assumed to be 1.0.
+      float weights corresponding to `sparse_ids`, or `None` if all weights are
+      assumed to be 1.0.
     combiner: A string specifying how to combine embedding results for each
-        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
-        the default.
+      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
+      default.
     default_id: The id to use for an entry with no features.
     name: A name for this operation (optional).
-    partition_strategy: A string specifying the partitioning strategy.
-        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
+    partition_strategy: A string specifying the partitioning strategy. Currently
+      `"div"` and `"mod"` are supported. Default is `"div"`.
     max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
-        combining.
-
+      combining.
 
   Returns:
     Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
@@ -739,13 +733,13 @@
     ValueError: if `embedding_weights` is empty.
   """
   if embedding_weights is None:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+    raise ValueError("Missing embedding_weights %s." % embedding_weights)
   if isinstance(embedding_weights, variables.PartitionedVariable):
     embedding_weights = list(embedding_weights)  # get underlying Variables.
   if not isinstance(embedding_weights, list):
     embedding_weights = [embedding_weights]
   if len(embedding_weights) < 1:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+    raise ValueError("Missing embedding_weights %s." % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
   embedding_weights = [
@@ -755,36 +749,34 @@
       for w in embedding_weights
   ]
 
-  with ops.name_scope(name, 'embedding_lookup',
-                      embedding_weights + [sparse_ids,
-                                           sparse_weights]) as scope:
+  with ops.name_scope(name, "embedding_lookup", embedding_weights +
+                      [sparse_ids, sparse_weights]) as scope:
     # Reshape higher-rank sparse ids and weights to linear segment ids.
     original_shape = sparse_ids.dense_shape
     original_rank_dim = tensor_shape.dimension_value(
         sparse_ids.dense_shape.get_shape()[0])
     original_rank = (
         array_ops.size(original_shape)
-        if original_rank_dim is None
-        else original_rank_dim)
+        if original_rank_dim is None else original_rank_dim)
     sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
         math_ops.reduce_prod(
             array_ops.slice(original_shape, [0], [original_rank - 1])),
-        array_ops.gather(original_shape, original_rank - 1)])
+        array_ops.gather(original_shape, original_rank - 1)
+    ])
     if sparse_weights is not None:
-      sparse_weights = sparse_tensor.SparseTensor(
-          sparse_ids.indices,
-          sparse_weights.values, sparse_ids.dense_shape)
+      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
+                                                  sparse_weights.values,
+                                                  sparse_ids.dense_shape)
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
-    if combiner != 'sum':
+    if combiner != "sum":
       sparse_ids, sparse_weights = _prune_invalid_weights(
           sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
-    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
-                                                                 default_id or
-                                                                 0)
+    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
+        sparse_ids, default_id or 0)
     if sparse_weights is not None:
       sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
 
@@ -804,10 +796,8 @@
           array_ops.reshape(is_row_empty, [-1, 1]),
           array_ops.stack([1, array_ops.shape(result)[1]]))
 
-      result = array_ops.where(is_row_empty,
-                               array_ops.zeros_like(result),
-                               result,
-                               name=scope)
+      result = array_ops.where(
+          is_row_empty, array_ops.zeros_like(result), result, name=scope)
 
     # Reshape back from linear ids back into higher-dimensional dense result.
     final_result = array_ops.reshape(
@@ -818,9 +808,10 @@
                 [original_rank - 1]),
             array_ops.slice(array_ops.shape(result), [1], [-1])
         ], 0))
-    final_result.set_shape(tensor_shape.unknown_shape(
-        (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
-            result.get_shape()[1:]))
+    final_result.set_shape(
+        tensor_shape.unknown_shape(
+            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
+                result.get_shape()[1:]))
     return final_result
 
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index addd02f..133dbc5 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
 """Functional operations."""
 
 from __future__ import absolute_import
@@ -44,8 +43,13 @@
 
 # TODO(yuanbyu, mrry): Handle stride to support sliding windows.
 @tf_export("foldl")
-def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
-          swap_memory=False, name=None):
+def foldl(fn,
+          elems,
+          initializer=None,
+          parallel_iterations=10,
+          back_prop=True,
+          swap_memory=False,
+          name=None):
   """foldl on the list of tensors unpacked from `elems` on dimension 0.
 
   This foldl operator repeatedly applies the callable `fn` to a sequence
@@ -67,13 +71,13 @@
 
   Args:
     fn: The callable to be performed.
-    elems: A tensor or (possibly nested) sequence of tensors, each of which
-      will be unpacked along their first dimension.  The nested sequence
-      of the resulting slices will be the first argument to `fn`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension.  The nested sequence of the
+      resulting slices will be the first argument to `fn`.
     initializer: (optional) A tensor or (possibly nested) sequence of tensors,
       as the initial value for the accumulator.
-    parallel_iterations: (optional) The number of iterations allowed to run
-      in parallel.
+    parallel_iterations: (optional) The number of iterations allowed to run in
+      parallel.
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
     name: (optional) Name prefix for the returned tensors.
@@ -120,8 +124,9 @@
     elems_flat = [
         ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
     ]
-    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
-         or array_ops.shape(elems_flat[0])[0])
+    n = (
+        tensor_shape.dimension_value(elems_flat[0].shape[0]) or
+        array_ops.shape(elems_flat[0])[0])
 
     elems_ta = nest.map_structure(create_ta, elems)
 
@@ -138,7 +143,8 @@
       return [i + 1, a]
 
     _, r_a = control_flow_ops.while_loop(
-        lambda i, a: i < n, compute, [i, a],
+        lambda i, a: i < n,
+        compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
         swap_memory=swap_memory,
@@ -153,8 +159,13 @@
 
 
 @tf_export("foldr")
-def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
-          swap_memory=False, name=None):
+def foldr(fn,
+          elems,
+          initializer=None,
+          parallel_iterations=10,
+          back_prop=True,
+          swap_memory=False,
+          name=None):
   """foldr on the list of tensors unpacked from `elems` on dimension 0.
 
   This foldr operator repeatedly applies the callable `fn` to a sequence
@@ -176,13 +187,13 @@
 
   Args:
     fn: The callable to be performed.
-    elems: A tensor or (possibly nested) sequence of tensors, each of which
-      will be unpacked along their first dimension.  The nested sequence
-      of the resulting slices will be the first argument to `fn`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension.  The nested sequence of the
+      resulting slices will be the first argument to `fn`.
     initializer: (optional) A tensor or (possibly nested) sequence of tensors,
       as the initial value for the accumulator.
-    parallel_iterations: (optional) The number of iterations allowed to run
-      in parallel.
+    parallel_iterations: (optional) The number of iterations allowed to run in
+      parallel.
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
     name: (optional) Name prefix for the returned tensors.
@@ -229,8 +240,9 @@
     elems_flat = [
         ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
     ]
-    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
-         or array_ops.shape(elems_flat[0])[0])
+    n = (
+        tensor_shape.dimension_value(elems_flat[0].shape[0]) or
+        array_ops.shape(elems_flat[0])[0])
 
     elems_ta = nest.map_structure(create_ta, elems)
 
@@ -264,8 +276,15 @@
 
 
 @tf_export("scan")
-def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
-         swap_memory=False, infer_shape=True, reverse=False, name=None):
+def scan(fn,
+         elems,
+         initializer=None,
+         parallel_iterations=10,
+         back_prop=True,
+         swap_memory=False,
+         infer_shape=True,
+         reverse=False,
+         name=None):
   """scan on the list of tensors unpacked from `elems` on dimension 0.
 
   The simplest version of `scan` repeatedly applies the callable `fn` to a
@@ -300,24 +319,24 @@
   `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.
 
   Args:
-    fn: The callable to be performed.  It accepts two arguments.  The first
-      will have the same structure as `initializer` if one is provided,
-      otherwise it will have the same structure as `elems`.  The second
-      will have the same (possibly nested) structure as `elems`.  Its output
-      must have the same structure as `initializer` if one is provided,
-      otherwise it must have the same structure as `elems`.
-    elems: A tensor or (possibly nested) sequence of tensors, each of which
-      will be unpacked along their first dimension.  The nested sequence
-      of the resulting slices will be the first argument to `fn`.
+    fn: The callable to be performed.  It accepts two arguments.  The first will
+      have the same structure as `initializer` if one is provided, otherwise it
+      will have the same structure as `elems`.  The second will have the same
+      (possibly nested) structure as `elems`.  Its output must have the same
+      structure as `initializer` if one is provided, otherwise it must have the
+      same structure as `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension.  The nested sequence of the
+      resulting slices will be the first argument to `fn`.
     initializer: (optional) A tensor or (possibly nested) sequence of tensors,
       initial value for the accumulator, and the expected output type of `fn`.
-    parallel_iterations: (optional) The number of iterations allowed to run
-      in parallel.
+    parallel_iterations: (optional) The number of iterations allowed to run in
+      parallel.
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
     infer_shape: (optional) False disables tests for consistent output shapes.
-    reverse: (optional) True scans the tensor last to first (instead of first
-      to last).
+    reverse: (optional) True scans the tensor last to first (instead of first to
+      last).
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
@@ -361,6 +380,7 @@
 
   input_is_sequence = nest.is_sequence(elems)
   input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
+
   def input_pack(x):
     return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
 
@@ -371,6 +391,7 @@
   else:
     output_is_sequence = nest.is_sequence(initializer)
     output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
+
     def output_pack(x):
       return (nest.pack_sequence_as(initializer, x)
               if output_is_sequence else x[0])
@@ -394,7 +415,8 @@
 
     # Convert elems to tensor array.
     elems_flat = [
-        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
+        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat
+    ]
 
     # Convert elems to tensor array. n may be known statically.
     n = tensor_shape.dimension_value(elems_flat[0].shape[0])
@@ -403,14 +425,17 @@
 
     # TensorArrays are always flat
     elems_ta = [
-        tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
-                                     dynamic_size=False,
-                                     element_shape=elem.shape[1:],
-                                     infer_shape=True)
-        for elem in elems_flat]
+        tensor_array_ops.TensorArray(
+            dtype=elem.dtype,
+            size=n,
+            dynamic_size=False,
+            element_shape=elem.shape[1:],
+            infer_shape=True) for elem in elems_flat
+    ]
     # Unpack elements
     elems_ta = [
-        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
+        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)
+    ]
 
     if initializer is None:
       a_flat = [elem.read(n - 1 if reverse else 0) for elem in elems_ta]
@@ -423,15 +448,18 @@
     # Create a tensor array to store the intermediate values.
     accs_ta = [
         tensor_array_ops.TensorArray(
-            dtype=init.dtype, size=n,
+            dtype=init.dtype,
+            size=n,
             element_shape=init.shape if infer_shape else None,
             dynamic_size=False,
-            infer_shape=infer_shape)
-        for init in a_flat]
+            infer_shape=infer_shape) for init in a_flat
+    ]
 
     if initializer is None:
-      accs_ta = [acc_ta.write(n - 1 if reverse else 0, a)
-                 for (acc_ta, a) in zip(accs_ta, a_flat)]
+      accs_ta = [
+          acc_ta.write(n - 1 if reverse else 0, a)
+          for (acc_ta, a) in zip(accs_ta, a_flat)
+      ]
 
     def compute(i, a_flat, tas):
       """The loop body of scan.
@@ -452,8 +480,8 @@
       packed_elems = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
       packed_a = output_pack(a_flat)
       a_out = fn(packed_a, packed_elems)
-      nest.assert_same_structure(
-          elems if initializer is None else initializer, a_out)
+      nest.assert_same_structure(elems if initializer is None else initializer,
+                                 a_out)
       flat_a_out = output_flatten(a_out)
       tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_a_out)]
       if reverse:
@@ -469,21 +497,26 @@
       initial_i = i
       condition = lambda i, _1, _2: i < n
     _, _, r_a = control_flow_ops.while_loop(
-        condition, compute, (initial_i, a_flat, accs_ta),
+        condition,
+        compute, (initial_i, a_flat, accs_ta),
         parallel_iterations=parallel_iterations,
-        back_prop=back_prop, swap_memory=swap_memory,
+        back_prop=back_prop,
+        swap_memory=swap_memory,
         maximum_iterations=n)
 
     results_flat = [r.stack() for r in r_a]
 
-    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
-        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
+    n_static = tensor_shape.Dimension(
+        tensor_shape.dimension_value(
+            elems_flat[0].get_shape().with_rank_at_least(1)[0]))
     for elem in elems_flat[1:]:
-      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
-          elem.get_shape().with_rank_at_least(1)[0])))
+      n_static.merge_with(
+          tensor_shape.Dimension(
+              tensor_shape.dimension_value(
+                  elem.get_shape().with_rank_at_least(1)[0])))
     for r in results_flat:
-      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
-          r.get_shape()[1:]))
+      r.set_shape(
+          tensor_shape.TensorShape(n_static).concatenate(r.get_shape()[1:]))
 
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -495,19 +528,20 @@
 
 # pylint: disable=invalid-name
 def If(cond, inputs, then_branch, else_branch, name=None):
-  r"""output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
+  r"""Runs `then_branch(inputs)` or `else_branch(inputs)` depending on `cond`.
+
+  output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
 
   Args:
     cond: A `Tensor`. A scalar. If the scalar is not a boolean, the scalar is
-      converted to a boolean according to the following rule: if the
-      scalar is a numerical value, non-zero means True and zero means
-      False; if the scalar is a string, non-empty means True and empty
-      means False.
+      converted to a boolean according to the following rule: if the scalar is a
+      numerical value, non-zero means True and zero means False; if the scalar
+      is a string, non-empty means True and empty means False.
     inputs: A list of input tensors.
-    then_branch: A function takes 'inputs' and returns a list of tensors,
-        whose types are the same as what else_branch returns.
-    else_branch: A function takes 'inputs' and returns a list of tensors.
-        whose types are the same as what then_branch returns.
+    then_branch: A function that takes 'inputs' and returns a list of tensors,
+      whose types are the same as what else_branch returns.
+    else_branch: A function that takes 'inputs' and returns a list of tensors,
+      whose types are the same as what then_branch returns.
     name: A name for the operation (optional).
 
   Returns:
@@ -528,22 +562,14 @@
 
   Args:
     inputs: A list of tensors of size N + M.
-    f: The function we want to compute the gradient for.
-
-      The function 'f' must be a numerical function which takes N inputs and
-      produces M outputs. Its gradient function 'g', which is  a function
-      taking N + M inputs and produces N outputs.
-
-      I.e. if we have
-         (y1, y2, ..., yM) = f(x1, x2, ..., xN),
-      then, g is
-         (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN,
-                                           dL/dy1, dL/dy2, ..., dL/dyM),
-
-      where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
-      loss function). dL/dxi is the partial derivative of L with respect
-      to xi.
-
+    f: The function we want to compute the gradient for. The function 'f' must
+      be a numerical function which takes N inputs and produces M outputs. Its
+      gradient function 'g' is a function taking N + M inputs and producing N
+      outputs. I.e., if we have (y1, y2, ..., yM) = f(x1, x2, ..., xN), then g
+      is (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN, dL/dy1, dL/dy2,
+      ..., dL/dyM), where L is a scalar-valued function of (x1, x2, ..., xN)
+      (e.g., the loss function). dL/dxi is the partial derivative of L with
+      respect to xi.
     name: A name for the operation (optional).
 
   Returns:
@@ -583,21 +609,19 @@
   r"""output = input; While (Cond(output)) { output = Body(output) }.
 
   Args:
-    input_: A list of `Tensor` objects.
-      A list of input tensors whose types are T.
-    cond: . A function takes 'input' and returns a tensor.  If the tensor is
-      a scalar of non-boolean, the scalar is converted to a boolean
-      according to the following rule: if the scalar is a numerical
-      value, non-zero means True and zero means False; if the scalar is
-      a string, non-empty means True and empty means False. If the
-      tensor is not a scalar, non-emptiness means True and False
-      otherwise.
-    body: . A function takes a list of tensors and returns another
-      list tensors. Both lists have the same types as specified
-      by T.
+    input_: A list of `Tensor` objects. A list of input tensors whose types are
+      T.
+    cond: A function that takes 'input' and returns a tensor. If the tensor is
+      a non-boolean scalar, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical value,
+      non-zero means True and zero means False; if the scalar is a string,
+      non-empty means True and empty means False. If the tensor is not a
+      scalar, non-emptiness means True and False otherwise.
+    body: A function that takes a list of tensors and returns another list of
+      tensors. Both lists have the same types as specified by T.
     name: A name for the operation (optional).
-    hostmem: A list of integer. If i is in the list, input[i] is a
-      host memory tensor.
+    hostmem: A list of integer. If i is in the list, input[i] is a host memory
+      tensor.
 
   Raises:
     ValueError: if `cond` has implicitly captured inputs or if `cond` and `body`
@@ -734,14 +758,14 @@
     start: A `Tensor` of type `int32`.
     limit: A `Tensor` of type `int32`.
     delta: A `Tensor` of type `int32`.
-    inputs: A list of `Tensor` objects.
-      A list of input tensors whose types are T.
-    body: A function takes a list of tensors and returns another
-      list of tensors. Both lists have the same types as (int32, T...).
+    inputs: A list of `Tensor` objects. A list of input tensors whose types are
+      T.
+    body: A function that takes a list of tensors and returns another list of
+      tensors. Both lists have the same types as (int32, T...).
     name: A name for the operation (optional).
-    hostmem: A list of integer. If i is in the list, inputs[i] is a
-      host memory tensor. In other words, (i+1)-th argument of the body
-      function is expecting a host memory.
+    hostmem: A list of integer. If i is in the list, inputs[i] is a host memory
+      tensor. In other words, (i+1)-th argument of the body function is
+      expecting a host memory.
     rewrite_with_while: If True, using While op to implement the For.
 
   Returns:
@@ -773,10 +797,16 @@
     output_attr.list.i.extend(hostmem)
     ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
   return ret
+
+
 # pylint: enable=invalid-name,protected-access
 
 
-def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
+def partitioned_call(args,
+                     f,
+                     tout=None,
+                     executing_eagerly=None,
+                     config=None,
                      executor_type=None):
   """Executes a function while respecting device annotations.
 
@@ -791,9 +821,9 @@
       the signature of `f`.
     executing_eagerly: (Optional) A boolean indicating whether the context is
       executing eagerly. If `None`, fetched from the global context.
-    config: (Optional) A `tensorflow::ConfigProto` proto, serialized. If
-      `None`, all optimizations are disabled. Currently only handled for eager
-      defined functions.
+    config: (Optional) A `tensorflow::ConfigProto` proto, serialized. If `None`,
+      all optimizations are disabled. Currently only handled for eager defined
+      functions.
     executor_type: (Optional) A string for the name of the executor to be used
       in the function call. If not set, or set to an empty string, the default
       tensorflow executor will be used.
@@ -819,11 +849,17 @@
   if executing_eagerly or len(tout):
     if f.stateful_ops:
       outputs = gen_functional_ops.stateful_partitioned_call(
-          args=args, Tout=tout, f=f, config_proto=config,
+          args=args,
+          Tout=tout,
+          f=f,
+          config_proto=config,
           executor_type=executor_type)
     else:
       outputs = gen_functional_ops.partitioned_call(
-          args=args, Tout=tout, f=f, config_proto=config,
+          args=args,
+          Tout=tout,
+          f=f,
+          config_proto=config,
           executor_type=executor_type)
     return outputs if outputs else None
 
@@ -853,7 +889,6 @@
       op_name,
       args,
       tout,
-      compute_shapes=False,
       name="PartitionedFunctionCall",
       attrs={
           "Tin": tin_attr,
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 683f78c..2ce28e8 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -284,10 +284,10 @@
   numbers.  For example, if `x` is complex with shape `[m]` and `y` is complex
   with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with
 
-      J[::2, ::2] = d(Re y)/d(Re x)
-      J[::2, 1::2] = d(Im y)/d(Re x)
-      J[1::2, ::2] = d(Re y)/d(Im x)
-      J[1::2, 1::2] = d(Im y)/d(Im x)
+      J[:m, :n] = d(Re y)/d(Re x)
+      J[:m, n:] = d(Im y)/d(Re x)
+      J[m:, :n] = d(Re y)/d(Im x)
+      J[m:, n:] = d(Im y)/d(Im x)
 
   Args:
     x: a tensor or list of tensors
diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py
index 4d2b5ef..a1e1b7a 100644
--- a/tensorflow/python/ops/gradient_checker_test.py
+++ b/tensorflow/python/ops/gradient_checker_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.test.compute_gradient and tf.compute_gradient_error."""
+"""Tests for tf.compat.v1.test.compute_gradient and tf.compute_gradient_error."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index 7291e05..61f470e 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -63,7 +63,7 @@
   value_range = [0.0, 5.0]
   new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 
-  with tf.get_default_session() as sess:
+  with tf.compat.v1.get_default_session() as sess:
     indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5)
     variables.global_variables_initializer().run()
     sess.run(indices) => [0, 0, 1, 2, 4]
@@ -127,7 +127,7 @@
   value_range = [0.0, 5.0]
   new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 
-  with tf.get_default_session() as sess:
+  with tf.compat.v1.get_default_session() as sess:
     hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
     variables.global_variables_initializer().run()
     sess.run(hist) => [2, 1, 1, 0, 2]
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index ea41ea3..43d9699 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -29,7 +29,8 @@
 from tensorflow.python.platform import test
 
 
-@test_util.disable_all_xla('align_corners=False not supported by XLA')
+@test_util.for_all_test_methods(test_util.disable_xla,
+                                'align_corners=False not supported by XLA')
 class ResizeNearestNeighborOpTest(test.TestCase):
 
   TYPES = [np.float32, np.float64]
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index db305c9..e8a2cc9 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -124,8 +124,8 @@
 
   Args:
     image: 3-D Tensor of shape [height, width, channels]
-    require_static: If `True`, requires that all dimensions of `image` are
-      known and non-zero.
+    require_static: If `True`, requires that all dimensions of `image` are known
+      and non-zero.
 
   Raises:
     ValueError: if `image.shape` is not a 3-vector.
@@ -137,8 +137,8 @@
   try:
     image_shape = image.get_shape().with_rank(3)
   except ValueError:
-    raise ValueError(
-        "'image' (shape %s) must be three-dimensional." % image.shape)
+    raise ValueError("'image' (shape %s) must be three-dimensional." %
+                     image.shape)
   if require_static and not image_shape.is_fully_defined():
     raise ValueError("'image' (shape %s) must be fully defined." % image_shape)
   if any(x == 0 for x in image_shape):
@@ -203,8 +203,8 @@
 
   Args:
     image: >= 3-D Tensor of size [*, height, width, depth]
-    require_static: If `True`, requires that all dimensions of `image` are
-      known and non-zero.
+    require_static: If `True`, requires that all dimensions of `image` are known
+      and non-zero.
 
   Raises:
     ValueError: if image.shape is not a [>= 3] vector.
@@ -223,14 +223,86 @@
   if require_static and not image_shape.is_fully_defined():
     raise ValueError('\'image\' must be fully defined.')
   if any(x == 0 for x in image_shape):
-    raise ValueError(
-        'all dims of \'image.shape\' must be > 0: %s' % image_shape)
+    raise ValueError('all dims of \'image.shape\' must be > 0: %s' %
+                     image_shape)
   if not image_shape.is_fully_defined():
     return [
         check_ops.assert_positive(
             array_ops.shape(image),
             ["all dims of 'image.shape' "
-             'must be > 0.'])
+             'must be > 0.']),
+        check_ops.assert_greater_equal(
+            array_ops.rank(image),
+            3,
+            message="'image' must be at least three-dimensional.")
+    ]
+  else:
+    return []
+
+
+def _AssertGrayscaleImage(image):
+  """Assert that we are working with a properly shaped grayscale image.
+
+  Performs the check statically if possible (i.e. if the shape is statically
+  known). Otherwise adds a control dependency to an assert op that checks the
+  dynamic shape.
+
+  The last dimension must have size 1 and the rank must be at least 3.
+
+  Args:
+    image: >= 3-D Tensor of size [*, height, width, depth]
+
+  Raises:
+    ValueError: if image.shape is not a [>= 3] vector or if last dimension is
+      not size 1.
+
+  Returns:
+    If the shape of `image` could be verified statically, `image` is returned
+    unchanged, otherwise there will be a control dependency added that asserts
+    the correct dynamic shape.
+  """
+  return control_flow_ops.with_dependencies(
+      _CheckGrayscaleImage(image, require_static=False), image)
+
+
+def _CheckGrayscaleImage(image, require_static=True):
+  """Assert that we are working with a properly shaped grayscale image.
+
+  Args:
+    image: >= 3-D Tensor of size [*, height, width, depth]
+    require_static: If `True`, raises if `image.shape` is not fully defined.
+
+  Raises:
+    ValueError: if image.shape is not a [>= 3] vector, if last dimension is
+      not size 1, or if `require_static` is True and the shape is not fully
+      defined.
+
+  Returns:
+    An empty list, if `image` has fully defined dimensions. Otherwise, a list
+    of assert ops checking the last dimension size and the rank.
+  """
+  try:
+    if image.get_shape().ndims is None:
+      image_shape = image.get_shape().with_rank(3)
+    else:
+      image_shape = image.get_shape().with_rank_at_least(3)
+  except ValueError:
+    raise ValueError('A grayscale image must be at least three-dimensional.')
+  if require_static and not image_shape.is_fully_defined():
+    raise ValueError('\'image\' must be fully defined.')
+  if image_shape.is_fully_defined():
+    if image_shape[-1] != 1:
+      raise ValueError('Last dimension of a grayscale image should be size 1.')
+  if not image_shape.is_fully_defined():
+    return [
+        check_ops.assert_equal(
+            array_ops.shape(image)[-1],
+            1,
+            message='Last dimension of a grayscale image should be size 1.'),
+        check_ops.assert_greater_equal(
+            array_ops.rank(image),
+            3,
+            message='A grayscale image must be at least three-dimensional.')
     ]
   else:
     return []
@@ -263,11 +335,10 @@
   dimension, which is `height`.  Otherwise output the image as-is.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A tensor of the same type and shape as `image`.
@@ -285,11 +356,10 @@
   second dimension, which is `width`.  Otherwise output the image as-is.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     A tensor of the same type and shape as `image`.
@@ -304,12 +374,11 @@
   """Randomly (50% chance) flip an image along axis `flip_index`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     flip_index: Dimension along which to flip image. Vertical: 0, Horizontal: 1
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     scope_name: Name of the scope in which the ops are added.
 
   Returns:
@@ -329,17 +398,16 @@
           mirror_cond,
           lambda: array_ops.reverse(image, [flip_index]),
           lambda: image,
-          name=scope
-      )
+          name=scope)
       return fix_image_flip_shape(image, result)
     elif shape.ndims == 4:
       batch_size = array_ops.shape(image)[0]
-      uniform_random = random_ops.random_uniform(
-          [batch_size], 0, 1.0, seed=seed
-      )
+      uniform_random = random_ops.random_uniform([batch_size],
+                                                 0,
+                                                 1.0,
+                                                 seed=seed)
       flips = math_ops.round(
-          array_ops.reshape(uniform_random, [batch_size, 1, 1, 1])
-      )
+          array_ops.reshape(uniform_random, [batch_size, 1, 1, 1]))
       flips = math_ops.cast(flips, image.dtype)
       flipped_input = array_ops.reverse(image, [flip_index + 1])
       return flips * flipped_input + (1 - flips) * image
@@ -356,8 +424,8 @@
   See also `reverse()`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
 
   Returns:
     A tensor of the same type and shape as `image`.
@@ -377,8 +445,8 @@
   See also `reverse()`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
 
   Returns:
     A tensor of the same type and shape as `image`.
@@ -397,8 +465,8 @@
   See also `reverse()`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     flip_index: 0 For vertical, 1 for horizontal.
 
   Returns:
@@ -414,7 +482,7 @@
     if shape.ndims == 3 or shape.ndims is None:
       return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [flip_index+1])
+      return array_ops.reverse(image, [flip_index + 1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
@@ -424,8 +492,8 @@
   """Rotate image(s) counter-clockwise by 90 degrees.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     k: A scalar integer. The number of times the image is rotated by 90 degrees.
     name: A name for this operation (optional).
 
@@ -501,6 +569,7 @@
 
   def _rot180():
     return array_ops.reverse_v2(images, [1, 2])
+
   def _rot270():
     return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]), [2])
 
@@ -519,8 +588,8 @@
   """Transpose image(s) by swapping the height and width dimension.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     name: A name for this operation (optional).
 
   Returns:
@@ -611,16 +680,16 @@
     # dimensions are statically defined.
     if dynamic_h:
       img_hd = math_ops.cast(img_h, dtypes.float64)
-      bbox_h_start = math_ops.cast(
-          (img_hd - img_hd * central_fraction) / 2, dtypes.int32)
+      bbox_h_start = math_ops.cast((img_hd - img_hd * central_fraction) / 2,
+                                   dtypes.int32)
     else:
       img_hd = float(img_h)
       bbox_h_start = int((img_hd - img_hd * central_fraction) / 2)
 
     if dynamic_w:
       img_wd = math_ops.cast(img_w, dtypes.float64)
-      bbox_w_start = math_ops.cast(
-          (img_wd - img_wd * central_fraction) / 2, dtypes.int32)
+      bbox_w_start = math_ops.cast((img_wd - img_wd * central_fraction) / 2,
+                                   dtypes.int32)
     else:
       img_wd = float(img_w)
       bbox_w_start = int((img_wd - img_wd * central_fraction) / 2)
@@ -641,15 +710,12 @@
     if rank == 3:
       image.set_shape([
           None if dynamic_h else bbox_h_size,
-          None if dynamic_w else bbox_w_size,
-          img_d
+          None if dynamic_w else bbox_w_size, img_d
       ])
     else:
       image.set_shape([
-          img_bs,
-          None if dynamic_h else bbox_h_size,
-          None if dynamic_w else bbox_w_size,
-          img_d
+          img_bs, None if dynamic_h else bbox_h_size,
+          None if dynamic_w else bbox_w_size, img_d
       ])
     return image
 
@@ -667,8 +733,8 @@
   `target_height` by `target_width`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     offset_height: Number of rows of zeros to add on top.
     offset_width: Number of columns of zeros to add on the left.
     target_height: Height of output image.
@@ -748,12 +814,12 @@
   `offset_height + target_height, offset_width + target_width`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     offset_height: Vertical coordinate of the top-left corner of the result in
-                   the input.
+      the input.
     offset_width: Horizontal coordinate of the top-left corner of the result in
-                  the input.
+      the input.
     target_height: Height of the result.
     target_width: Width of the result.
 
@@ -833,8 +899,8 @@
   dimension.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     target_height: Target height.
     target_width: Target width.
 
@@ -997,17 +1063,18 @@
           math_ops.cast(current_width, dtypes.float32))
       scale_factor = math_ops.minimum(scale_factor_height, scale_factor_width)
       scaled_height_const = math_ops.cast(
-          math_ops.round(
-              scale_factor * math_ops.cast(current_height, dtypes.float32)),
+          math_ops.round(scale_factor *
+                         math_ops.cast(current_height, dtypes.float32)),
           dtypes.int32)
       scaled_width_const = math_ops.cast(
-          math_ops.round(
-              scale_factor * math_ops.cast(current_width, dtypes.float32)),
+          math_ops.round(scale_factor *
+                         math_ops.cast(current_width, dtypes.float32)),
           dtypes.int32)
 
       # NOTE: Reset the size and other constants used later.
       size = ops.convert_to_tensor([scaled_height_const, scaled_width_const],
-                                   dtypes.int32, name='size')
+                                   dtypes.int32,
+                                   name='size')
       size_const_as_shape = tensor_util.constant_value_as_shape(size)
       new_height_const = size_const_as_shape.dims[0].value
       new_width_const = size_const_as_shape.dims[1].value
@@ -1044,7 +1111,7 @@
 
   Resized images will be distorted if their original aspect ratio is not
   the same as `size`.  To avoid distortions see
-  `tf.image.resize_image_with_pad`.
+  `tf.compat.v1.image.resize_image_with_pad`.
 
   `method` can be one of:
 
@@ -1461,8 +1528,7 @@
     image: An image or images to adjust.
     max_delta: float, must be non-negative.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     The brightness-adjusted image(s).
@@ -1489,7 +1555,7 @@
     lower: float.  Lower bound for the random contrast factor.
     upper: float.  Upper bound for the random contrast factor.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   Returns:
     The contrast-adjusted image(s).
@@ -1592,50 +1658,46 @@
 @tf_export('image.adjust_gamma')
 def adjust_gamma(image, gamma=1, gain=1):
   """Performs Gamma Correction on the input image.
-
-  Also known as Power Law Transform. This function transforms the
-  input image pixelwise according to the equation `Out = In**gamma`
-  after scaling each pixel to the range 0 to 1.
-
+  Also known as Power Law Transform. This function converts the
+  input images at first to float representation, then transforms them
+  pixelwise according to the equation `Out = gain * In**gamma`,
+  and then converts them back to the original data type.
   Args:
-    image : A Tensor.
+    image : RGB image or images to adjust.
     gamma : A scalar or tensor. Non negative real number.
     gain  : A scalar or tensor. The constant multiplier.
-
   Returns:
-    A Tensor. Gamma corrected output image.
-
+    A Tensor. A Gamma-adjusted tensor of the same shape and type as `image`.
   Raises:
     ValueError: If gamma is negative.
-
   Notes:
     For gamma greater than 1, the histogram will shift towards left and
     the output image will be darker than the input image.
     For gamma less than 1, the histogram will shift towards right and
     the output image will be brighter than the input image.
-
   References:
     [1] http://en.wikipedia.org/wiki/Gamma_correction
   """
 
   with ops.name_scope(None, 'adjust_gamma', [image, gamma, gain]) as name:
-    # Convert pixel value to DT_FLOAT for computing adjusted image.
-    img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
-    # Keep image dtype for computing the scale of corresponding dtype.
     image = ops.convert_to_tensor(image, name='image')
+    # Remember the original dtype so we can convert back if needed
+    orig_dtype = image.dtype
+
+    if orig_dtype in [dtypes.float16, dtypes.float32]:
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
 
     assert_op = _assert(gamma >= 0, ValueError,
                         'Gamma should be a non-negative real number.')
     if assert_op:
       gamma = control_flow_ops.with_dependencies(assert_op, gamma)
 
-    # scale = max(dtype) - min(dtype).
-    scale = constant_op.constant(
-        image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
     # According to the definition of gamma correction.
-    adjusted_img = (img / scale)**gamma * scale * gain
+    adjusted_img = gain * flt_image**gamma
 
-    return adjusted_img
+    return convert_image_dtype(adjusted_img, orig_dtype, saturate=True)
 
 
 @tf_export('image.convert_image_dtype')
@@ -1753,6 +1815,7 @@
 
   Outputs a tensor of the same `DType` and rank as `images`.  The size of the
   last dimension of the output is 3, containing the RGB value of the pixels.
+  The input images' last dimension must be size 1.
 
   Args:
     images: The Grayscale tensor to convert. Last dimension must be size 1.
@@ -1762,6 +1825,8 @@
     The converted grayscale image(s).
   """
   with ops.name_scope(name, 'grayscale_to_rgb', [images]) as name:
+    images = _AssertGrayscaleImage(images)
+
     images = ops.convert_to_tensor(images, name='images')
     rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
     shape_list = ([array_ops.ones(rank_1, dtype=dtypes.int32)] +
@@ -1856,10 +1921,10 @@
     image: RGB image or images. Size of the last dimension must be 3.
     min_jpeg_quality: Minimum jpeg encoding quality to use.
     max_jpeg_quality: Maximum jpeg encoding quality to use.
-    seed: An operation-specific seed. It will be used in conjunction
-      with the graph-level seed to determine the real seeds that will be
-      used in this operation. Please see the documentation of
-      set_random_seed for its interaction with the graph-level random seed.
+    seed: An operation-specific seed. It will be used in conjunction with the
+      graph-level seed to determine the real seeds that will be used in this
+      operation. Please see the documentation of set_random_seed for its
+      interaction with the graph-level random seed.
 
   Returns:
     Adjusted image(s), same shape and DType as `image`.
@@ -1867,8 +1932,8 @@
   Raises:
     ValueError: if `min_jpeg_quality` or `max_jpeg_quality` is invalid.
   """
-  if (min_jpeg_quality < 0 or max_jpeg_quality < 0 or
-      min_jpeg_quality > 100 or max_jpeg_quality > 100):
+  if (min_jpeg_quality < 0 or max_jpeg_quality < 0 or min_jpeg_quality > 100 or
+      max_jpeg_quality > 100):
     raise ValueError('jpeg encoding range must be between 0 and 100.')
 
   if min_jpeg_quality >= max_jpeg_quality:
@@ -2030,29 +2095,55 @@
     substr = string_ops.substr(contents, 0, 3)
     return math_ops.equal(substr, b'\211PN', name=name)
 
-tf_export('io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg',
-          v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])(
-              gen_image_ops.decode_and_crop_jpeg)
 
-tf_export('io.decode_bmp', 'image.decode_bmp',
-          v1=['io.decode_bmp', 'image.decode_bmp'])(gen_image_ops.decode_bmp)
-tf_export('io.decode_gif', 'image.decode_gif',
-          v1=['io.decode_gif', 'image.decode_gif'])(gen_image_ops.decode_gif)
-tf_export('io.decode_jpeg', 'image.decode_jpeg',
-          v1=['io.decode_jpeg', 'image.decode_jpeg'])(gen_image_ops.decode_jpeg)
-tf_export('io.decode_png', 'image.decode_png',
-          v1=['io.decode_png', 'image.decode_png'])(gen_image_ops.decode_png)
+tf_export(
+    'io.decode_and_crop_jpeg',
+    'image.decode_and_crop_jpeg',
+    v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])(
+        gen_image_ops.decode_and_crop_jpeg)
 
-tf_export('io.encode_jpeg', 'image.encode_jpeg',
-          v1=['io.encode_jpeg', 'image.encode_jpeg'])(gen_image_ops.encode_jpeg)
-tf_export('io.extract_jpeg_shape', 'image.extract_jpeg_shape',
-          v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])(
-              gen_image_ops.extract_jpeg_shape)
+tf_export(
+    'io.decode_bmp',
+    'image.decode_bmp',
+    v1=['io.decode_bmp', 'image.decode_bmp'])(
+        gen_image_ops.decode_bmp)
+tf_export(
+    'io.decode_gif',
+    'image.decode_gif',
+    v1=['io.decode_gif', 'image.decode_gif'])(
+        gen_image_ops.decode_gif)
+tf_export(
+    'io.decode_jpeg',
+    'image.decode_jpeg',
+    v1=['io.decode_jpeg', 'image.decode_jpeg'])(
+        gen_image_ops.decode_jpeg)
+tf_export(
+    'io.decode_png',
+    'image.decode_png',
+    v1=['io.decode_png', 'image.decode_png'])(
+        gen_image_ops.decode_png)
+
+tf_export(
+    'io.encode_jpeg',
+    'image.encode_jpeg',
+    v1=['io.encode_jpeg', 'image.encode_jpeg'])(
+        gen_image_ops.encode_jpeg)
+tf_export(
+    'io.extract_jpeg_shape',
+    'image.extract_jpeg_shape',
+    v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])(
+        gen_image_ops.extract_jpeg_shape)
 
 
-@tf_export('io.decode_image', 'image.decode_image',
-           v1=['io.decode_image', 'image.decode_image'])
-def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
+@tf_export(
+    'io.decode_image',
+    'image.decode_image',
+    v1=['io.decode_image', 'image.decode_image'])
+def decode_image(contents,
+                 channels=None,
+                 dtype=dtypes.uint8,
+                 name=None,
+                 expand_animations=True):
   """Function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`.
 
   Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the
@@ -2063,7 +2154,9 @@
   opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D
   arrays `[height, width, num_channels]`. Make sure to take this into account
   when constructing your graph if you are intermixing GIF files with BMP, JPEG,
-  and/or PNG files.
+  and/or PNG files. Alternately, set the `expand_animations` argument of this
+  function to `False`, in which case the op will return 3-dimensional tensors
+  and will truncate animated GIF files to the first frame.
 
   Args:
     contents: 0-D `string`. The encoded image bytes.
@@ -2071,11 +2164,15 @@
       the decoded image.
     dtype: The desired DType of the returned `Tensor`.
     name: A name for the operation (optional)
+    expand_animations: Controls the shape of the returned op's output.
+      If `True`, the returned op will produce a 3-D tensor for PNG, JPEG, and
+      BMP files; and a 4-D tensor for all GIFs, whether animated or not.
+      If `False`, the returned op will produce a 3-D tensor for all file
+      types and will truncate animated GIFs to the first frame.
 
   Returns:
-    `Tensor` with type `dtype` and shape `[height, width, num_channels]` for
-      BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for
-      GIF images.
+    `Tensor` with type `dtype` and a 3- or 4-dimensional shape, depending on
+    the file type and the value of the `expand_animations` parameter.
 
   Raises:
     ValueError: On incorrect number of channels.
@@ -2086,7 +2183,7 @@
     substr = string_ops.substr(contents, 0, 3)
 
     def _bmp():
-      """Decodes a GIF image."""
+      """Decodes a BMP image."""
       signature = string_ops.substr(contents, 0, 2)
       # Create assert op to check that bytes are BMP decodable
       is_bmp = math_ops.equal(signature, 'BM', name='is_bmp')
@@ -2100,9 +2197,9 @@
         return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype)
 
     def _gif():
+      """Decodes a GIF image."""
       # Create assert to make sure that channels is not set to 1
       # Already checked above that channels is in (None, 0, 1, 3)
-
       gif_channels = 0 if channels is None else channels
       good_channels = math_ops.logical_and(
           math_ops.not_equal(gif_channels, 1, name='check_gif_channels'),
@@ -2110,7 +2207,12 @@
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
-        return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype)
+        result = convert_image_dtype(gen_image_ops.decode_gif(contents), dtype)
+        if not expand_animations:
+          # For now we decode animated GIFs fully and toss out all but the
+          # first frame when expand_animations is False
+          result = array_ops.gather(result, 0)
+        return result
 
     def check_gif():
       # Create assert op to check that bytes are GIF decodable
@@ -2120,10 +2222,11 @@
     def _png():
       """Decodes a PNG image."""
       return convert_image_dtype(
-          gen_image_ops.decode_png(contents, channels,
-                                   dtype=dtypes.uint8
-                                   if dtype == dtypes.uint8
-                                   else dtypes.uint16), dtype)
+          gen_image_ops.decode_png(
+              contents,
+              channels,
+              dtype=dtypes.uint8 if dtype == dtypes.uint8 else dtypes.uint16),
+          dtype)
 
     def check_png():
       """Checks if an image is PNG."""
@@ -2166,9 +2269,8 @@
   https://en.wikipedia.org/wiki/Total_variation_denoising
 
   Args:
-    images: 4-D Tensor of shape `[batch, height, width, channels]` or
-            3-D Tensor of shape `[height, width, channels]`.
-
+    images: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
     name: A name for the operation (optional).
 
   Raises:
@@ -2261,7 +2363,7 @@
       # Draw the bounding box in an image summary.
       image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                     bbox_for_draw)
-      tf.summary.image('images_with_box', image_with_box)
+      tf.compat.v1.summary.image('images_with_box', image_with_box)
 
       # Employ the bounding box to distort the image.
       distorted_image = tf.slice(image, begin, size)
@@ -2274,34 +2376,29 @@
 
   Args:
     image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
-      `int16`, `int32`, `int64`.
-      1-D, containing `[height, width, channels]`.
-    bounding_boxes: A `Tensor` of type `float32`.
-      3-D with shape `[batch, N, 4]` describing the N bounding boxes
-      associated with the image.
-    seed: An optional `int`. Defaults to `0`.
-      If `seed` is set to non-zero, the random number generator is seeded by
-      the given `seed`.  Otherwise, it is seeded by a random seed.
-    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
-      The cropped area of the image must contain at least this
-      fraction of any bounding box supplied. The value of this parameter should
-      be non-negative. In the case of 0, the cropped area does not need to
-      overlap any of the bounding boxes supplied.
+      `int16`, `int32`, `int64`. 1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`. 3-D with shape `[batch, N, 4]`
+      describing the N bounding boxes associated with the image.
+    seed: An optional `int`. Defaults to `0`. If `seed` is set to non-zero, the
+      random number generator is seeded by the given `seed`.  Otherwise, it is
+      seeded by a random seed.
+    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`. The
+      cropped area of the image must contain at least this fraction of any
+      bounding box supplied. The value of this parameter should be non-negative.
+      In the case of 0, the cropped area does not need to overlap any of the
+      bounding boxes supplied.
     aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
-      1.33]`.
-      The cropped area of the image must have an aspect `ratio =
-      width / height` within this range.
-    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
-      The cropped area of the image must contain a fraction of the
-      supplied image within this range.
-    max_attempts: An optional `int`. Defaults to `100`.
-      Number of attempts at generating a cropped region of the image
-      of the specified constraints. After `max_attempts` failures, return the
-      entire image.
+      1.33]`. The cropped area of the image must have an aspect `ratio = width /
+      height` within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The
+      cropped area of the image must contain a fraction of the supplied image
+      within this range.
+    max_attempts: An optional `int`. Defaults to `100`. Number of attempts at
+      generating a cropped region of the image of the specified constraints.
+      After `max_attempts` failures, return the entire image.
     use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
-      Controls behavior if no bounding boxes supplied.
-      If true, assume an implicit bounding box covering the whole input. If
-      false, raise an error.
+      Controls behavior if no bounding boxes supplied. If true, assume an
+      implicit bounding box covering the whole input. If false, raise an error.
     name: A name for the operation (optional).
 
   Returns:
@@ -2318,15 +2415,17 @@
     Provide as input to `tf.image.draw_bounding_boxes`.
   """
   seed1, seed2 = random_seed.get_seed(seed) if seed else (0, 0)
-  return sample_distorted_bounding_box(
-      image_size, bounding_boxes, seed1, seed2, min_object_covered,
-      aspect_ratio_range, area_range, max_attempts,
-      use_image_if_no_bounding_boxes, name)
+  return sample_distorted_bounding_box(image_size, bounding_boxes, seed1, seed2,
+                                       min_object_covered, aspect_ratio_range,
+                                       area_range, max_attempts,
+                                       use_image_if_no_bounding_boxes, name)
 
 
 @tf_export(v1=['image.sample_distorted_bounding_box'])
-@deprecation.deprecated(date=None, instructions='`seed2` arg is deprecated.'
-                        'Use sample_distorted_bounding_box_v2 instead.')
+@deprecation.deprecated(
+    date=None,
+    instructions='`seed2` arg is deprecated.'
+    'Use sample_distorted_bounding_box_v2 instead.')
 def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
@@ -2370,7 +2469,7 @@
       # Draw the bounding box in an image summary.
       image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                     bbox_for_draw)
-      tf.summary.image('images_with_box', image_with_box)
+      tf.compat.v1.summary.image('images_with_box', image_with_box)
 
       # Employ the bounding box to distort the image.
       distorted_image = tf.slice(image, begin, size)
@@ -2383,41 +2482,31 @@
 
   Args:
     image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
-      `int16`, `int32`, `int64`.
-      1-D, containing `[height, width, channels]`.
-    bounding_boxes: A `Tensor` of type `float32`.
-      3-D with shape `[batch, N, 4]` describing the N bounding boxes
-      associated with the image.
-    seed: An optional `int`. Defaults to `0`.
-      If either `seed` or `seed2` are set to non-zero, the random number
-      generator is seeded by the given `seed`.  Otherwise, it is seeded by a
-        random
-      seed.
-    seed2: An optional `int`. Defaults to `0`.
-      A second seed to avoid seed collision.
-    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
-      The cropped area of the image must contain at least this
-      fraction of any bounding box supplied. The value of this parameter should
-        be
-      non-negative. In the case of 0, the cropped area does not need to overlap
-      any of the bounding boxes supplied.
+      `int16`, `int32`, `int64`. 1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`. 3-D with shape `[batch, N, 4]`
+      describing the N bounding boxes associated with the image.
+    seed: An optional `int`. Defaults to `0`. If either `seed` or `seed2` are
+      set to non-zero, the random number generator is seeded by the given
+      `seed`.  Otherwise, it is seeded by a random seed.
+    seed2: An optional `int`. Defaults to `0`. A second seed to avoid seed
+      collision.
+    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`. The
+      cropped area of the image must contain at least this fraction of any
+      bounding box supplied. The value of this parameter should be non-negative.
+      In the case of 0, the cropped area does not need to overlap any of the
+      bounding boxes supplied.
     aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
-      1.33]`.
-      The cropped area of the image must have an aspect ratio =
-      width / height within this range.
-    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
-      The cropped area of the image must contain a fraction of the
-      supplied image within this range.
-    max_attempts: An optional `int`. Defaults to `100`.
-      Number of attempts at generating a cropped region of the image
-      of the specified constraints. After `max_attempts` failures, return the
-        entire
-      image.
+      1.33]`. The cropped area of the image must have an aspect ratio = width /
+      height within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The
+      cropped area of the image must contain a fraction of the supplied image
+      within this range.
+    max_attempts: An optional `int`. Defaults to `100`. Number of attempts at
+      generating a cropped region of the image of the specified constraints.
+      After `max_attempts` failures, return the entire image.
     use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
-      Controls behavior if no bounding boxes supplied.
-      If true, assume an implicit bounding box covering the whole input. If
-        false,
-      raise an error.
+      Controls behavior if no bounding boxes supplied. If true, assume an
+      implicit bounding box covering the whole input. If false, raise an error.
     name: A name for the operation (optional).
 
   Returns:
@@ -2536,8 +2625,8 @@
       overlap too much with respect to IOU.
     score_threshold: A float representing the threshold for deciding when to
       remove boxes based on score.
-    pad_to_max_output_size: bool.  If True, size of `selected_indices` output
-      is padded to `max_output_size`.
+    pad_to_max_output_size: bool.  If True, size of `selected_indices` output is
+      padded to `max_output_size`.
     name: A name for the operation (optional).
 
   Returns:
@@ -2551,12 +2640,16 @@
     score_threshold = ops.convert_to_tensor(
         score_threshold, name='score_threshold')
     if compat.forward_compatible(2018, 8, 7) or pad_to_max_output_size:
-      return gen_image_ops.non_max_suppression_v4(
-          boxes, scores, max_output_size, iou_threshold, score_threshold,
-          pad_to_max_output_size)
+      return gen_image_ops.non_max_suppression_v4(boxes, scores,
+                                                  max_output_size,
+                                                  iou_threshold,
+                                                  score_threshold,
+                                                  pad_to_max_output_size)
     else:
-      return gen_image_ops.non_max_suppression_v3(
-          boxes, scores, max_output_size, iou_threshold, score_threshold)
+      return gen_image_ops.non_max_suppression_v3(boxes, scores,
+                                                  max_output_size,
+                                                  iou_threshold,
+                                                  score_threshold)
 
 
 @tf_export('image.non_max_suppression_overlaps')
@@ -2605,8 +2698,8 @@
     # pylint: enable=protected-access
 
 
-_rgb_to_yiq_kernel = [[0.299, 0.59590059,
-                       0.2115], [0.587, -0.27455667, -0.52273617],
+_rgb_to_yiq_kernel = [[0.299, 0.59590059, 0.2115],
+                      [0.587, -0.27455667, -0.52273617],
                       [0.114, -0.32134392, 0.31119955]]
 
 
@@ -2620,7 +2713,7 @@
 
   Args:
     images: 2-D or higher rank. Image data to convert. Last dimension must be
-    size 3.
+      size 3.
 
   Returns:
     images: tensor with the same shape as `images`.
@@ -2647,7 +2740,7 @@
 
   Args:
     images: 2-D or higher rank. Image data to convert. Last dimension must be
-    size 3.
+      size 3.
 
   Returns:
     images: tensor with the same shape as `images`.
@@ -2659,8 +2752,8 @@
   return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
 
 
-_rgb_to_yuv_kernel = [[0.299, -0.14714119,
-                       0.61497538], [0.587, -0.28886916, -0.51496512],
+_rgb_to_yuv_kernel = [[0.299, -0.14714119, 0.61497538],
+                      [0.587, -0.28886916, -0.51496512],
                       [0.114, 0.43601035, -0.10001026]]
 
 
@@ -2674,7 +2767,7 @@
 
   Args:
     images: 2-D or higher rank. Image data to convert. Last dimension must be
-    size 3.
+      size 3.
 
   Returns:
     images: tensor with the same shape as `images`.
@@ -2701,7 +2794,7 @@
 
   Args:
     images: 2-D or higher rank. Image data to convert. Last dimension must be
-    size 3.
+      size 3.
 
   Returns:
     images: tensor with the same shape as `images`.
@@ -2735,23 +2828,26 @@
   shape1[-3:].assert_is_compatible_with(shape2[-3:])
 
   if shape1.ndims is not None and shape2.ndims is not None:
-    for dim1, dim2 in zip(reversed(shape1.dims[:-3]),
-                          reversed(shape2.dims[:-3])):
+    for dim1, dim2 in zip(
+        reversed(shape1.dims[:-3]), reversed(shape2.dims[:-3])):
       if not (dim1 == 1 or dim2 == 1 or dim1.is_compatible_with(dim2)):
-        raise ValueError(
-            'Two images are not compatible: %s and %s' % (shape1, shape2))
+        raise ValueError('Two images are not compatible: %s and %s' %
+                         (shape1, shape2))
 
   # Now assign shape tensors.
   shape1, shape2 = array_ops.shape_n([img1, img2])
 
   # TODO(sjhwang): Check if shape1[:-3] and shape2[:-3] are broadcastable.
   checks = []
-  checks.append(control_flow_ops.Assert(
-      math_ops.greater_equal(array_ops.size(shape1), 3),
-      [shape1, shape2], summarize=10))
-  checks.append(control_flow_ops.Assert(
-      math_ops.reduce_all(math_ops.equal(shape1[-3:], shape2[-3:])),
-      [shape1, shape2], summarize=10))
+  checks.append(
+      control_flow_ops.Assert(
+          math_ops.greater_equal(array_ops.size(shape1), 3), [shape1, shape2],
+          summarize=10))
+  checks.append(
+      control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(shape1[-3:], shape2[-3:])),
+          [shape1, shape2],
+          summarize=10))
   return shape1, shape2, checks
 
 
@@ -2808,11 +2904,8 @@
     with ops.control_dependencies(checks):
       return array_ops.identity(psnr_val)
 
-_SSIM_K1 = 0.01
-_SSIM_K2 = 0.03
 
-
-def _ssim_helper(x, y, reducer, max_val, compensation=1.0):
+def _ssim_helper(x, y, reducer, max_val, compensation=1.0, k1=0.01, k2=0.03):
   r"""Helper function for computing SSIM.
 
   SSIM estimates covariances with weighted sums.  The default parameters
@@ -2830,19 +2923,23 @@
   Arguments:
     x: First set of images.
     y: Second set of images.
-    reducer: Function that computes 'local' averages from set of images.
-      For non-covolutional version, this is usually tf.reduce_mean(x, [1, 2]),
-      and for convolutional version, this is usually tf.nn.avg_pool or
+    reducer: Function that computes 'local' averages from set of images. For
+      non-convolutional version, this is usually tf.reduce_mean(x, [1, 2]), and
+      for convolutional version, this is usually tf.nn.avg_pool2d or
       tf.nn.conv2d with weighted-sum kernel.
     max_val: The dynamic range (i.e., the difference between the maximum
       possible allowed value and the minimum allowed value).
     compensation: Compensation factor. See above.
+    k1: Default value 0.01
+    k2: Default value 0.03 (SSIM is less sensitive to K2 for lower values, so
+      it would be better if we took values in the range 0 < K2 < 0.4).
 
   Returns:
     A pair containing the luminance measure, and the contrast-structure measure.
   """
-  c1 = (_SSIM_K1 * max_val) ** 2
-  c2 = (_SSIM_K2 * max_val) ** 2
+
+  c1 = (k1 * max_val)**2
+  c2 = (k2 * max_val)**2
 
   # SSIM luminance measure is
   # (2 * mu_x * mu_y + c1) / (mu_x ** 2 + mu_y ** 2 + c1).
@@ -2883,7 +2980,13 @@
   return array_ops.reshape(g, shape=[size, size, 1, 1])
 
 
-def _ssim_per_channel(img1, img2, max_val=1.0):
+def _ssim_per_channel(img1,
+                      img2,
+                      max_val=1.0,
+                      filter_size=11,
+                      filter_sigma=1.5,
+                      k1=0.01,
+                      k2=0.03):
   """Computes SSIM index between img1 and img2 per color channel.
 
   This function matches the standard SSIM implementation from:
@@ -2900,20 +3003,32 @@
     img2: Second image batch.
     max_val: The dynamic range of the images (i.e., the difference between the
       maximum the and minimum allowed values).
+    filter_size: Default value 11 (size of gaussian filter).
+    filter_sigma: Default value 1.5 (width of gaussian filter).
+    k1: Default value 0.01
+    k2: Default value 0.03 (SSIM is less sensitive to K2 for lower values, so
+      it would be better if we took values in the range 0 < K2 < 0.4).
 
   Returns:
     A pair of tensors containing and channel-wise SSIM and contrast-structure
     values. The shape is [..., channels].
   """
-  filter_size = constant_op.constant(11, dtype=dtypes.int32)
-  filter_sigma = constant_op.constant(1.5, dtype=img1.dtype)
+  filter_size = constant_op.constant(filter_size, dtype=dtypes.int32)
+  filter_sigma = constant_op.constant(filter_sigma, dtype=img1.dtype)
 
   shape1, shape2 = array_ops.shape_n([img1, img2])
   checks = [
-      control_flow_ops.Assert(math_ops.reduce_all(math_ops.greater_equal(
-          shape1[-3:-1], filter_size)), [shape1, filter_size], summarize=8),
-      control_flow_ops.Assert(math_ops.reduce_all(math_ops.greater_equal(
-          shape2[-3:-1], filter_size)), [shape2, filter_size], summarize=8)]
+      control_flow_ops.Assert(
+          math_ops.reduce_all(
+              math_ops.greater_equal(shape1[-3:-1], filter_size)),
+          [shape1, filter_size],
+          summarize=8),
+      control_flow_ops.Assert(
+          math_ops.reduce_all(
+              math_ops.greater_equal(shape2[-3:-1], filter_size)),
+          [shape2, filter_size],
+          summarize=8)
+  ]
 
   # Enforce the check to run before computation.
   with ops.control_dependencies(checks):
@@ -2934,10 +3049,11 @@
     shape = array_ops.shape(x)
     x = array_ops.reshape(x, shape=array_ops.concat([[-1], shape[-3:]], 0))
     y = nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1], padding='VALID')
-    return array_ops.reshape(y, array_ops.concat([shape[:-3],
-                                                  array_ops.shape(y)[1:]], 0))
+    return array_ops.reshape(
+        y, array_ops.concat([shape[:-3], array_ops.shape(y)[1:]], 0))
 
-  luminance, cs = _ssim_helper(img1, img2, reducer, max_val, compensation)
+  luminance, cs = _ssim_helper(img1, img2, reducer, max_val, compensation, k1,
+                               k2)
 
   # Average over the second and the third from the last: height, width.
   axes = constant_op.constant([-3, -2], dtype=dtypes.int32)
@@ -2947,7 +3063,13 @@
 
 
 @tf_export('image.ssim')
-def ssim(img1, img2, max_val):
+def ssim(img1,
+         img2,
+         max_val,
+         filter_size=11,
+         filter_sigma=1.5,
+         k1=0.01,
+         k2=0.03):
   """Computes SSIM index between img1 and img2.
 
   This function is based on the standard SSIM implementation from:
@@ -2972,12 +3094,14 @@
       im1 = tf.decode_png('path/to/im1.png')
       im2 = tf.decode_png('path/to/im2.png')
       # Compute SSIM over tf.uint8 Tensors.
-      ssim1 = tf.image.ssim(im1, im2, max_val=255)
+      ssim1 = tf.image.ssim(im1, im2, max_val=255, filter_size=11,
+                            filter_sigma=1.5, k1=0.01, k2=0.03)
 
       # Compute SSIM over tf.float32 Tensors.
       im1 = tf.image.convert_image_dtype(im1, tf.float32)
       im2 = tf.image.convert_image_dtype(im2, tf.float32)
-      ssim2 = tf.image.ssim(im1, im2, max_val=1.0)
+      ssim2 = tf.image.ssim(im1, im2, max_val=1.0, filter_size=11,
+                            filter_sigma=1.5, k1=0.01, k2=0.03)
       # ssim1 and ssim2 both have type tf.float32 and are almost equal.
   ```
 
@@ -2986,6 +3110,11 @@
     img2: Second image batch.
     max_val: The dynamic range of the images (i.e., the difference between the
       maximum the and minimum allowed values).
+    filter_size: Default value 11 (size of gaussian filter).
+    filter_sigma: Default value 1.5 (width of gaussian filter).
+    k1: Default value 0.01
+    k2: Default value 0.03 (SSIM is less sensitive to K2 for lower values, so
+      it would be better if we took values in the range 0 < K2 < 0.4).
 
   Returns:
     A tensor containing an SSIM value for each image in batch.  Returned SSIM
@@ -3002,7 +3131,8 @@
   max_val = convert_image_dtype(max_val, dtypes.float32)
   img1 = convert_image_dtype(img1, dtypes.float32)
   img2 = convert_image_dtype(img2, dtypes.float32)
-  ssim_per_channel, _ = _ssim_per_channel(img1, img2, max_val)
+  ssim_per_channel, _ = _ssim_per_channel(img1, img2, max_val, filter_size,
+                                          filter_sigma, k1, k2)
   # Compute average over color channels.
   return math_ops.reduce_mean(ssim_per_channel, [-1])
 
@@ -3012,7 +3142,14 @@
 
 
 @tf_export('image.ssim_multiscale')
-def ssim_multiscale(img1, img2, max_val, power_factors=_MSSSIM_WEIGHTS):
+def ssim_multiscale(img1,
+                    img2,
+                    max_val,
+                    power_factors=_MSSSIM_WEIGHTS,
+                    filter_size=11,
+                    filter_sigma=1.5,
+                    k1=0.01,
+                    k2=0.03):
   """Computes the MS-SSIM between img1 and img2.
 
   This function assumes that `img1` and `img2` are image batches, i.e. the last
@@ -3036,18 +3173,22 @@
       resolution's weight and each increasing scale corresponds to the image
       being downsampled by 2.  Defaults to (0.0448, 0.2856, 0.3001, 0.2363,
       0.1333), which are the values obtained in the original paper.
+    filter_size: Default value 11 (size of gaussian filter).
+    filter_sigma: Default value 1.5 (width of gaussian filter).
+    k1: Default value 0.01
+    k2: Default value 0.03 (SSIM is less sensitive to K2 for lower values, so
+      it would be better if we took values in the range 0 < K2 < 0.4).
 
   Returns:
     A tensor containing an MS-SSIM value for each image in batch.  The values
     are in range [0, 1].  Returns a tensor with shape:
     broadcast(img1.shape[:-3], img2.shape[:-3]).
   """
-  # Shape checking.
-  shape1 = img1.get_shape().with_rank_at_least(3)
-  shape2 = img2.get_shape().with_rank_at_least(3)
-  shape1[-3:].merge_with(shape2[-3:])
-
   with ops.name_scope(None, 'MS-SSIM', [img1, img2]):
+    # Convert to tensor if needed.
+    img1 = ops.convert_to_tensor(img1, name='img1')
+    img2 = ops.convert_to_tensor(img2, name='img2')
+    # Shape checking.
     shape1, shape2, checks = _verify_compatible_image_shapes(img1, img2)
     with ops.control_dependencies(checks):
       img1 = array_ops.identity(img1)
@@ -3094,9 +3235,11 @@
                                          lambda: flat_imgs)
           # pylint: enable=cell-var-from-loop
 
-          downscaled = [nn_ops.avg_pool(x, ksize=divisor, strides=divisor,
-                                        padding='VALID')
-                        for x in padded]
+          downscaled = [
+              nn_ops.avg_pool(
+                  x, ksize=divisor, strides=divisor, padding='VALID')
+              for x in padded
+          ]
           tails = [x[1:] for x in array_ops.shape_n(downscaled)]
           imgs = [
               array_ops.reshape(x, array_ops.concat([h, t], 0))
@@ -3104,17 +3247,23 @@
           ]
 
         # Overwrite previous ssim value since we only need the last one.
-        ssim_per_channel, cs = _ssim_per_channel(*imgs, max_val=max_val)
+        ssim_per_channel, cs = _ssim_per_channel(
+            *imgs,
+            max_val=max_val,
+            filter_size=filter_size,
+            filter_sigma=filter_sigma,
+            k1=k1,
+            k2=k2)
         mcs.append(nn_ops.relu(cs))
 
     # Remove the cs score for the last scale. In the MS-SSIM calculation,
     # we use the l(p) at the highest scale. l(p) * cs(p) is ssim(p).
     mcs.pop()  # Remove the cs score for the last scale.
-    mcs_and_ssim = array_ops.stack(mcs + [nn_ops.relu(ssim_per_channel)],
-                                   axis=-1)
+    mcs_and_ssim = array_ops.stack(
+        mcs + [nn_ops.relu(ssim_per_channel)], axis=-1)
     # Take weighted geometric mean across the scale axis.
-    ms_ssim = math_ops.reduce_prod(math_ops.pow(mcs_and_ssim, power_factors),
-                                   [-1])
+    ms_ssim = math_ops.reduce_prod(
+        math_ops.pow(mcs_and_ssim, power_factors), [-1])
 
     return math_ops.reduce_mean(ms_ssim, [-1])  # Avg over color channels.
 
@@ -3165,7 +3314,7 @@
 
   Arguments:
     image: Image tensor with shape [batch_size, h, w, d] and type float32 or
-    float64.  The image(s) must be 2x2 or larger.
+      float64.  The image(s) must be 2x2 or larger.
 
   Returns:
     Tensor holding edge maps for each channel. Returns a tensor with shape
@@ -3182,8 +3331,8 @@
   kernels = np.expand_dims(kernels, -2)
   kernels_tf = constant_op.constant(kernels, dtype=image.dtype)
 
-  kernels_tf = array_ops.tile(kernels_tf, [1, 1, image_shape[-1], 1],
-                              name='sobel_filters')
+  kernels_tf = array_ops.tile(
+      kernels_tf, [1, 1, image_shape[-1], 1], name='sobel_filters')
 
   # Use depth-wise convolution to calculate edge maps per channel.
   pad_sizes = [[0, 0], [1, 1], [1, 1], [0, 0]]
@@ -3270,14 +3419,13 @@
 
 
 @tf_export('image.crop_and_resize', v1=[])
-def crop_and_resize_v2(
-    image,
-    boxes,
-    box_indices,
-    crop_size,
-    method='bilinear',
-    extrapolation_value=0,
-    name=None):
+def crop_and_resize_v2(image,
+                       boxes,
+                       box_indices,
+                       crop_size,
+                       method='bilinear',
+                       extrapolation_value=0,
+                       name=None):
   """Extracts crops from the input image tensor and resizes them.
 
   Extracts crops from the input image tensor and resizes them using bilinear
@@ -3292,8 +3440,9 @@
   `size = [crop_height, crop_width]`. The result is a 4-D tensor
   `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
   In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
-  results to using `tf.image.resize_bilinear()` or
-  `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+  results to using `tf.compat.v1.image.resize_bilinear()` or
+  `tf.compat.v1.image.resize_nearest_neighbor()` (depends on the `method`
+  argument) with
   `align_corners=True`.
 
   Args:
@@ -3320,7 +3469,7 @@
     method: An optional string specifying the sampling method for resizing. It
       can be either `"bilinear"` or `"nearest"` and default to `"bilinear"`.
       Currently two sampling methods are supported: Bilinear and Nearest
-      Neighbor.
+        Neighbor.
     extrapolation_value: An optional `float`. Defaults to `0`. Value used for
       extrapolation, when applicable.
     name: A name for the operation (optional).
@@ -3328,14 +3477,15 @@
   Returns:
     A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
   """
-  return gen_image_ops.crop_and_resize(
-      image, boxes, box_indices, crop_size, method, extrapolation_value, name)
+  return gen_image_ops.crop_and_resize(image, boxes, box_indices, crop_size,
+                                       method, extrapolation_value, name)
 
 
 @tf_export(v1=['image.crop_and_resize'])
-@deprecation.deprecated_args(
-    None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
-def crop_and_resize_v1(   # pylint: disable=missing-docstring
+@deprecation.deprecated_args(None,
+                             'box_ind is deprecated, use box_indices instead',
+                             'box_ind')
+def crop_and_resize_v1(  # pylint: disable=missing-docstring
     image,
     boxes,
     box_ind=None,
@@ -3344,13 +3494,15 @@
     extrapolation_value=0,
     name=None,
     box_indices=None):
-  box_ind = deprecation.deprecated_argument_lookup(
-      "box_indices", box_indices, "box_ind", box_ind)
-  return gen_image_ops.crop_and_resize(
-      image, boxes, box_ind, crop_size, method, extrapolation_value, name)
+  box_ind = deprecation.deprecated_argument_lookup('box_indices', box_indices,
+                                                   'box_ind', box_ind)
+  return gen_image_ops.crop_and_resize(image, boxes, box_ind, crop_size, method,
+                                       extrapolation_value, name)
+
 
 crop_and_resize_v1.__doc__ = gen_image_ops.crop_and_resize.__doc__
 
+
 @tf_export(v1=['image.extract_glimpse'])
 def extract_glimpse(
     input,  # pylint: disable=redefined-builtin
@@ -3584,7 +3736,7 @@
   Returns:
     A `Tensor`. Has the same type as `images`.
   """
-  if colors is None and not compat.forward_compatible(2019, 5, 1):
+  if colors is None:
     return gen_image_ops.draw_bounding_boxes(images, boxes, name)
   return gen_image_ops.draw_bounding_boxes_v2(images, boxes, colors, name)
 
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 784df0f..3b7b699 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -212,6 +212,38 @@
       y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  def testGrayscaleToRGBInputValidation(self):
+    # tests whether the grayscale_to_rgb function raises
+    # an exception if the input images' last dimension is
+    # not of size 1, i.e. the images have shape
+    # [batch size, height, width] or [height, width]
+
+    # tests if an exception is raised if a three dimensional
+    # input is used, i.e. the images have shape [batch size, height, width]
+    with self.cached_session(use_gpu=True):
+      # 3-D input with batch dimension.
+      x_np = np.array([[1, 2]], dtype=np.uint8).reshape([1, 1, 2])
+
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+
+      # this is the error message we expect the function to raise
+      err_msg = "Last dimension of a grayscale image should be size 1"
+      with self.assertRaisesRegexp(ValueError, err_msg):
+        image_ops.grayscale_to_rgb(x_tf)
+
+    # tests if an exception is raised if a two dimensional
+    # input is used, i.e. the images have shape [height, width]
+    with self.cached_session(use_gpu=True):
+      # 2-D input without batch dimension.
+      x_np = np.array([[1, 2]], dtype=np.uint8).reshape([1, 2])
+
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+
+      # this is the error message we expect the function to raise
+      err_msg = "A grayscale image must be at least three-dimensional"
+      with self.assertRaisesRegexp(ValueError, err_msg):
+        image_ops.grayscale_to_rgb(x_tf)
+
   @test_util.run_deprecated_v1
   def testShapeInference(self):
     # Shape inference works and produces expected output where possible
@@ -241,43 +273,37 @@
 
 class AdjustGamma(test_util.TensorFlowTestCase):
 
-  def test_adjust_gamma_one(self):
-    """Same image should be returned for gamma equal to one"""
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_less_zero_float32(self):
+    """ValueError should be raised for negative gamma (float32 image)"""
     with self.cached_session():
-      x_data = np.random.uniform(0, 255, (8, 8))
+      x_data = np.random.uniform(0, 1.0, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
       x = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.adjust_gamma(x, gamma=1)
 
-      y_tf = self.evaluate(y)
-      y_np = x_np
+      err_msg = "Gamma should be a non-negative real number"
+      with self.assertRaisesRegexp(ValueError, err_msg):
+        image_ops.adjust_gamma(x, gamma=-1)
 
-      self.assertAllClose(y_tf, y_np, 1e-6)
-
-  def test_adjust_gamma_less_zero(self):
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_less_zero_uint8(self):
     """White image should be returned for gamma equal to zero"""
     with self.cached_session():
       x_data = np.random.uniform(0, 255, (8, 8))
-      x_np = np.array(x_data, dtype=np.float32)
+      x_np = np.array(x_data, dtype=np.uint8)
 
       x = constant_op.constant(x_np, shape=x_np.shape)
 
-      err_msg = "Gamma should be a non-negative real number."
-
-      try:
+      err_msg = "Gamma should be a non-negative real number"
+      with self.assertRaisesRegexp(ValueError, err_msg):
         image_ops.adjust_gamma(x, gamma=-1)
-      except Exception as e:
-        if err_msg not in str(e):
-          raise
-      else:
-        raise AssertionError("Exception not raised: %s" % err_msg)
 
   @test_util.run_deprecated_v1
   def test_adjust_gamma_less_zero_tensor(self):
     """White image should be returned for gamma equal to zero"""
     with self.cached_session():
-      x_data = np.random.uniform(0, 255, (8, 8))
+      x_data = np.random.uniform(0, 1.0, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
       x = constant_op.constant(x_np, shape=x_np.shape)
@@ -285,73 +311,101 @@
 
       image = image_ops.adjust_gamma(x, gamma=y)
 
-      err_msg = "Gamma should be a non-negative real number."
-      try:
+      err_msg = "Gamma should be a non-negative real number"
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, err_msg):
         self.evaluate(image)
-      except Exception as e:
-        if err_msg not in str(e):
-          raise
-      else:
-        raise AssertionError("Exception not raised: %s" % err_msg)
 
-  def test_adjust_gamma_zero(self):
-    """White image should be returned for gamma equal to zero"""
+  def _test_adjust_gamma_uint8(self, gamma):
+    """Verifies gamma correction of uint8 images against a NumPy reference.
+
+    The expected result is computed with NumPy and compared to the TF output.
+    """
     with self.cached_session():
-      x_data = np.random.uniform(0, 255, (8, 8))
-      x_np = np.array(x_data, dtype=np.float32)
-
+      x_np = np.random.uniform(0, 255, (8, 8)).astype(np.uint8)
       x = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.adjust_gamma(x, gamma=0)
+      y = image_ops.adjust_gamma(x, gamma=gamma)
+      y_tf = np.trunc(y.eval())
 
-      y_tf = self.evaluate(y)
+      # calculate gamma correction using numpy
+      # firstly, transform uint8 to float representation
+      # then perform correction
+      y_np = np.power(x_np / 255.0, gamma)
+      # convert correct numpy image back to uint8 type
+      y_np = np.trunc(np.clip(y_np * 255.5, 0, 255.0))
 
-      dtype = x.dtype.as_numpy_dtype
-      y_np = np.array([dtypes.dtype_range[dtype][1]] * x_np.size)
-      y_np = y_np.reshape((8, 8))
+      self.assertAllClose(y_tf, y_np, 1e-6)
+
+  def _test_adjust_gamma_float32(self, gamma):
+    """Verifies gamma correction of float32 images against a NumPy reference.
+
+    The expected result is computed with NumPy and compared to the TF output.
+    """
+    with self.cached_session():
+      x_np = np.random.uniform(0, 1.0, (8, 8))
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.adjust_gamma(x, gamma=gamma)
+      y_tf = y.eval()
+
+      y_np = np.clip(np.power(x_np, gamma), 0, 1.0)
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   @test_util.run_deprecated_v1
-  def test_adjust_gamma_less_one(self):
-    """Verifying the output with expected results for gamma
-    correction with gamma equal to half"""
-    with self.cached_session():
-      x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
-      y = image_ops.adjust_gamma(x_np, gamma=0.5)
-      y_tf = np.trunc(y.eval())
-
-      y_np = np.array(
-          [[0, 31, 45, 55, 63, 71, 78, 84], [
-              90, 95, 100, 105, 110, 115, 119, 123
-          ], [127, 131, 135, 139, 142, 146, 149, 153], [
-              156, 159, 162, 165, 168, 171, 174, 177
-          ], [180, 183, 186, 188, 191, 194, 196, 199], [
-              201, 204, 206, 209, 211, 214, 216, 218
-          ], [221, 223, 225, 228, 230, 232, 234, 236],
-           [238, 241, 243, 245, 247, 249, 251, 253]],
-          dtype=np.float32)
-
-      self.assertAllClose(y_tf, y_np, 1e-6)
+  def test_adjust_gamma_one_float32(self):
+    """Same image should be returned for gamma equal to one"""
+    self._test_adjust_gamma_float32(1.0)
 
   @test_util.run_deprecated_v1
-  def test_adjust_gamma_greater_one(self):
+  def test_adjust_gamma_one_uint8(self):
+    self._test_adjust_gamma_uint8(1.0)
+
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_zero_uint8(self):
+    """White image should be returned for gamma equal to zero.
+
+    Checked for uint8 images.
+    """
+    self._test_adjust_gamma_uint8(gamma=0.0)
+
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_less_one_uint8(self):
     """Verifying the output with expected results for gamma
-    correction with gamma equal to two"""
-    with self.cached_session():
-      x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
-      y = image_ops.adjust_gamma(x_np, gamma=2)
-      y_tf = np.trunc(y.eval())
 
-      y_np = np.array(
-          [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [
-              16, 18, 20, 22, 25, 27, 30, 33
-          ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95],
-           [100, 105, 110, 116, 121, 127, 132, 138], [
-               144, 150, 156, 163, 169, 176, 182, 189
-           ], [196, 203, 211, 218, 225, 233, 241, 249]],
-          dtype=np.float32)
+    correction with gamma equal to half for uint8 images
+    """
+    self._test_adjust_gamma_uint8(gamma=0.5)
 
-      self.assertAllClose(y_tf, y_np, 1e-6)
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_greater_one_uint8(self):
+    """Verifying the output with expected results for gamma
+
+    correction for uint8 images
+    """
+    self._test_adjust_gamma_uint8(gamma=1.0)
+
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_less_one_float32(self):
+    """Verifies gamma correction with gamma equal to half.
+
+    Checked for float32 images.
+    """
+    self._test_adjust_gamma_float32(0.5)
+
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_greater_one_float32(self):
+    """Verifying the output with expected results for gamma
+
+    correction for float32 images (NOTE: currently passes gamma=1.0, not 2.0)
+    """
+    self._test_adjust_gamma_float32(1.0)
+
+  @test_util.run_deprecated_v1
+  def test_adjust_gamma_zero_float32(self):
+    """White image should be returned for gamma equal to zero.
+
+    Checked for float32 images.
+    """
+    self._test_adjust_gamma_float32(0.0)
 
 
 class AdjustHueTest(test_util.TensorFlowTestCase):
@@ -3349,7 +3403,7 @@
 
 
 # half_pixel_centers not supported by XLA
-@test_util.disable_all_xla("b/127616992")
+@test_util.for_all_test_methods(test_util.disable_xla, "b/127616992")
 class ResizeImageWithPadV2Test(test_util.TensorFlowTestCase):
 
   def _ResizeImageWithPad(self, x, target_height, target_width,
@@ -4679,7 +4733,8 @@
     expected = self._ssim[np.triu_indices(3)]
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
-    ssim = image_ops.ssim(*ph, max_val=1.0)
+    ssim = image_ops.ssim(
+        *ph, max_val=1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       scores = [ssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
@@ -4693,8 +4748,14 @@
     img1 = np.concatenate(img1)
     img2 = np.concatenate(img2)
 
-    ssim = image_ops.ssim(constant_op.constant(img1),
-                          constant_op.constant(img2), 1.0)
+    ssim = image_ops.ssim(
+        constant_op.constant(img1),
+        constant_op.constant(img2),
+        1.0,
+        filter_size=11,
+        filter_sigma=1.5,
+        k1=0.01,
+        k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
@@ -4706,7 +4767,8 @@
     img1 = array_ops.expand_dims(img, axis=0)  # batch dims: 1, 2.
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
-    ssim = image_ops.ssim(img1, img2, 1.0)
+    ssim = image_ops.ssim(
+        img1, img2, 1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
@@ -4720,8 +4782,14 @@
     img1 = img1.reshape((1, 16, 16, 1))
     img2 = img2.reshape((1, 16, 16, 1))
 
-    ssim = image_ops.ssim(constant_op.constant(img1),
-                          constant_op.constant(img2), 255)
+    ssim = image_ops.ssim(
+        constant_op.constant(img1),
+        constant_op.constant(img2),
+        255,
+        filter_size=11,
+        filter_sigma=1.5,
+        k1=0.01,
+        k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertLess(ssim.eval(), 0)
 
@@ -4731,10 +4799,12 @@
     img2 = self._RandomImage((1, 16, 16, 3), 255)
     img1 = constant_op.constant(img1, dtypes.uint8)
     img2 = constant_op.constant(img2, dtypes.uint8)
-    ssim_uint8 = image_ops.ssim(img1, img2, 255)
+    ssim_uint8 = image_ops.ssim(
+        img1, img2, 255, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
-    ssim_float32 = image_ops.ssim(img1, img2, 1.0)
+    ssim_float32 = image_ops.ssim(
+        img1, img2, 1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
@@ -4777,7 +4847,8 @@
     expected = self._msssim[np.triu_indices(3)]
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
-    msssim = image_ops.ssim_multiscale(*ph, max_val=1.0)
+    msssim = image_ops.ssim_multiscale(
+        *ph, max_val=1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       scores = [msssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
@@ -4790,8 +4861,14 @@
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
     scalar = constant_op.constant(1.0, dtype=dtypes.float32)
     scaled_ph = [x * scalar for x in ph]
-    msssim = image_ops.ssim_multiscale(*scaled_ph, max_val=1.0,
-                                       power_factors=(1, 1, 1, 1, 1))
+    msssim = image_ops.ssim_multiscale(
+        *scaled_ph,
+        max_val=1.0,
+        power_factors=(1, 1, 1, 1, 1),
+        filter_size=11,
+        filter_sigma=1.5,
+        k1=0.01,
+        k2=0.03)
     grads = gradients.gradients(msssim, scalar)
     with self.cached_session(use_gpu=True) as sess:
       np_grads = sess.run(grads, feed_dict={ph[0]: img[0], ph[1]: img[1]})
@@ -4806,8 +4883,14 @@
     img1 = np.concatenate(img1)
     img2 = np.concatenate(img2)
 
-    msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
-                                       constant_op.constant(img2), 1.0)
+    msssim = image_ops.ssim_multiscale(
+        constant_op.constant(img1),
+        constant_op.constant(img2),
+        1.0,
+        filter_size=11,
+        filter_sigma=1.5,
+        k1=0.01,
+        k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
@@ -4820,7 +4903,8 @@
     img1 = array_ops.expand_dims(img, axis=0)  # batch dims: 1, 2.
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
-    score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
+    score_tensor = image_ops.ssim_multiscale(
+        img1, img2, 1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
@@ -4838,8 +4922,11 @@
                 np.full_like(img1, fill_value=255)]
 
       images = [ops.convert_to_tensor(x, dtype=dtypes.float32) for x in images]
-      msssim_ops = [image_ops.ssim_multiscale(x, y, 1.0)
-                    for x, y in itertools.combinations(images, 2)]
+      msssim_ops = [
+          image_ops.ssim_multiscale(
+              x, y, 1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
+          for x, y in itertools.combinations(images, 2)
+      ]
       msssim = self.evaluate(msssim_ops)
       msssim = np.squeeze(msssim)
 
@@ -4852,14 +4939,23 @@
     img2 = self._RandomImage((1, 180, 240, 3), 255)
     img1 = constant_op.constant(img1, dtypes.uint8)
     img2 = constant_op.constant(img2, dtypes.uint8)
-    ssim_uint8 = image_ops.ssim_multiscale(img1, img2, 255)
+    ssim_uint8 = image_ops.ssim_multiscale(
+        img1, img2, 255, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
-    ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
+    ssim_float32 = image_ops.ssim_multiscale(
+        img1, img2, 1.0, filter_size=11, filter_sigma=1.5, k1=0.01, k2=0.03)
     with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
+  def testNumpyInput(self):
+    """Test case for GitHub issue 28241."""
+    image = np.random.random([512, 512, 1])
+    score_tensor = image_ops.ssim_multiscale(image, image, max_val=1.0)
+    with self.cached_session(use_gpu=True):
+      _ = self.evaluate(score_tensor)
+
 
 class ImageGradientsTest(test_util.TensorFlowTestCase):
 
@@ -5027,6 +5123,21 @@
       image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
+  def testExpandAnimations(self):
+    with self.cached_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/gif/testdata"
+      gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
+      image0 = image_ops.decode_image(
+          gif0, dtype=dtypes.float32, expand_animations=False)
+      # decode_gif() returns every frame; compare against the first frame only
+      animation = image_ops.decode_gif(gif0)
+      first_frame = array_ops.gather(animation, 0)
+      image1 = image_ops.convert_image_dtype(first_frame, dtypes.float32)
+      image0, image1 = self.evaluate([image0, image1])
+      self.assertEqual(len(image0.shape), 3)
+      self.assertAllEqual(list(image0.shape), [40, 20, 3])
+      self.assertAllEqual(image0, image1)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 035534e..79dc3bb 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -25,6 +25,7 @@
     partition_info: (Optional) variable_scope._PartitionInfo object holding
       additional information about how the variable is partitioned. May be
       `None` if the variable is not partitioned.
+
   Returns:
     A `Tensor` of type `dtype` and `shape`.
 """
@@ -46,14 +47,13 @@
 from tensorflow.python.ops import random_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.deprecation import deprecated
-from tensorflow.python.util.deprecation import  deprecated_arg_values
-from tensorflow.python.util.deprecation import  deprecated_args
+from tensorflow.python.util.deprecation import deprecated_arg_values
+from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
 class Initializer(object):
-  """Initializer base class: all initializers inherit from this class.
-  """
+  """Initializer base class: all initializers inherit from this class."""
 
   def __call__(self, shape, dtype=None, partition_info=None):
     """Returns a tensor object initialized as specified by the initializer.
@@ -88,8 +88,8 @@
     ```
 
     Args:
-      config: A Python dictionary.
-        It will typically be the output of `get_config`.
+      config: A Python dictionary. It will typically be the output of
+        `get_config`.
 
     Returns:
       An Initializer instance.
@@ -104,8 +104,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -125,8 +124,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -182,11 +180,11 @@
     >>> value = [0, 1, 2, 3, 4, 5, 6, 7]
     >>> # value = np.array(value)
     >>> # value = value.reshape([2, 4])
-    >>> init = tf.constant_initializer(value)
+    >>> init = tf.compat.v1.constant_initializer(value)
 
     >>> print('fitting shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[2, 4], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[2, 4], initializer=init)
     >>>   x.initializer.run()
     >>>   print(x.eval())
 
@@ -195,8 +193,8 @@
      [ 4.  5.  6.  7.]]
 
     >>> print('larger shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[3, 4], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[3, 4], initializer=init)
     >>>   x.initializer.run()
     >>>   print(x.eval())
 
@@ -206,15 +204,17 @@
      [ 7.  7.  7.  7.]]
 
     >>> print('smaller shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[2, 3], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[2, 3], initializer=init)
 
     ValueError: Too many elements provided. Needed at most 6, but received 8
 
     >>> print('shape verification:')
-    >>> init_verify = tf.constant_initializer(value, verify_shape=True)
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[3, 4], initializer=init_verify)
+    >>> init_verify = tf.compat.v1.constant_initializer(
+    ...     value, verify_shape=True)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable(
+    ...       'x', shape=[3, 4], initializer=init_verify)
 
     TypeError: Expected Tensor's shape: (3, 4), got (8,).
   ```
@@ -222,12 +222,9 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
-  @deprecated_args(None,
-                   "Objects must now be the required shape or no shape "
-                   "can be specified",
-                   "verify_shape")
+                   "of passing it to the constructor", "dtype")
+  @deprecated_args(None, "Objects must now be the required shape or no shape "
+                   "can be specified", "verify_shape")
   def __init__(self, value=0, dtype=dtypes.float32, verify_shape=False):
     if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
       raise TypeError(
@@ -260,21 +257,19 @@
   """Initializer that generates tensors with a uniform distribution.
 
   Args:
-    minval: A python scalar or a scalar tensor. Lower bound of the range
-      of random values to generate.
-    maxval: A python scalar or a scalar tensor. Upper bound of the range
-      of random values to generate.  Defaults to 1 for float types.
+    minval: A python scalar or a scalar tensor. Lower bound of the range of
+      random values to generate.
+    maxval: A python scalar or a scalar tensor. Upper bound of the range of
+      random values to generate.  Defaults to 1 for float types.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer.
   """
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
     self.minval = minval
     self.maxval = maxval
@@ -302,21 +297,19 @@
   """Initializer that generates tensors with a normal distribution.
 
   Args:
-    mean: a python scalar or a scalar tensor. Mean of the random values
-      to generate.
-    stddev: a python scalar or a scalar tensor. Standard deviation of the
-      random values to generate.
+    mean: a python scalar or a scalar tensor. Mean of the random values to
+      generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the random
+      values to generate.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
   """
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -350,21 +343,19 @@
   neural network weights and filters.
 
   Args:
-    mean: a python scalar or a scalar tensor. Mean of the random values
-      to generate.
-    stddev: a python scalar or a scalar tensor. Standard deviation of the
-      random values to generate.
+    mean: a python scalar or a scalar tensor. Mean of the random values to
+      generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the random
+      values to generate.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
   """
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -386,8 +377,9 @@
     }
 
 
-@tf_export(v1=["initializers.uniform_unit_scaling",
-               "uniform_unit_scaling_initializer"])
+@tf_export(v1=[
+    "initializers.uniform_unit_scaling", "uniform_unit_scaling_initializer"
+])
 @deprecation.deprecated_endpoints("uniform_unit_scaling_initializer",
                                   "initializers.uniform_unit_scaling")
 class UniformUnitScaling(Initializer):
@@ -411,11 +403,9 @@
   Args:
     factor: Float.  A multiplicative factor by which the values will be scaled.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
       ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
@@ -423,8 +413,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   @deprecated(None,
               "Use tf.initializers.variance_scaling instead with distribution="
               "uniform to get equivalent behavior.")
@@ -479,8 +468,7 @@
     mode: One of "fan_in", "fan_out", "fan_avg".
     distribution: Random distribution to use. One of "normal", "uniform".
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
 
@@ -491,8 +479,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   @deprecated_arg_values(
       None,
       "`normal` is a deprecated alias for `truncated_normal`",
@@ -508,8 +495,9 @@
     if mode not in {"fan_in", "fan_out", "fan_avg"}:
       raise ValueError("Invalid `mode` argument:", mode)
     distribution = distribution.lower()
-    if distribution not in {"normal", "uniform",
-                            "truncated_normal", "untruncated_normal"}:
+    if distribution not in {
+        "normal", "uniform", "truncated_normal", "untruncated_normal"
+    }:
       raise ValueError("Invalid `distribution` argument:", distribution)
     self.scale = scale
     self.mode = mode
@@ -538,8 +526,7 @@
           shape, 0.0, stddev, dtype, seed=self.seed)
     elif self.distribution == "untruncated_normal":
       stddev = math.sqrt(scale)
-      return random_ops.random_normal(
-          shape, 0.0, stddev, dtype, seed=self.seed)
+      return random_ops.random_normal(shape, 0.0, stddev, dtype, seed=self.seed)
     else:
       limit = math.sqrt(3.0 * scale)
       return random_ops.random_uniform(
@@ -575,11 +562,9 @@
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
       ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
@@ -587,8 +572,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -640,14 +624,13 @@
 
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal
-      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
-      `gain` after applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of `gain` after applying
+      this convolution.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -671,7 +654,8 @@
 
     # Generate a random matrix
     a = random_ops.random_normal([shape[-1], shape[-1]],
-                                 dtype=dtype, seed=self.seed)
+                                 dtype=dtype,
+                                 seed=self.seed)
     # Compute the qr factorization
     q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
@@ -680,14 +664,15 @@
     q = q[:shape[-2], :]
     q *= math_ops.cast(self.gain, dtype=dtype)
     if len(shape) == 3:
-      weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
+      weight = array_ops.scatter_nd([[(shape[0] - 1) // 2]],
                                     array_ops.expand_dims(q, 0), shape)
     elif len(shape) == 4:
-      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2]],
+      weight = array_ops.scatter_nd([[(shape[0] - 1) // 2,
+                                      (shape[1] - 1) // 2]],
                                     array_ops.expand_dims(q, 0), shape)
     else:
-      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2,
-                                      (shape[2]-1)//2]],
+      weight = array_ops.scatter_nd([[(shape[0] - 1) // 2, (shape[1] - 1) // 2,
+                                      (shape[2] - 1) // 2]],
                                     array_ops.expand_dims(q, 0), shape)
     return weight
 
@@ -701,14 +686,13 @@
   Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
 
   Args:
-    gain: multiplicative factor to apply to the orthogonal
-      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
-      `gain` after applying this convolution.
+    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of `gain` after applying
+      this convolution.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -731,6 +715,7 @@
 
     Args:
       n: Dimension.
+
     Returns:
       A n x n orthogonal matrix.
     """
@@ -748,13 +733,14 @@
 
     Args:
       n: Dimension.
+
     Returns:
       A n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
     """
     q = self._orthogonal_matrix(n)
     # randomly zeroing out some columns
-    mask = math_ops.cast(random_ops.random_normal([n], seed=self.seed) > 0,
-                         self.dtype)
+    mask = math_ops.cast(
+        random_ops.random_normal([n], seed=self.seed) > 0, self.dtype)
     if self.seed:
       self.seed += 1
     c = math_ops.multiply(q, mask)
@@ -771,14 +757,12 @@
   See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal
-      matrix. Default is 1. This has the effect of scaling the output 2-norm by
-      a factor of `gain`.
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      This has the effect of scaling the output 2-norm by a factor of `gain`.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -807,6 +791,7 @@
       x: A k1 * k2 dictionary.
       k1: First dimension of x.
       k2: Second dimension of x.
+
     Returns:
       A k1 * k2 tensor.
     """
@@ -815,11 +800,14 @@
                             for i in range(k1)])
 
   def _block_orth(self, p1, p2):
-    """Construct a 2 x 2 kernel. Used to construct orthgonal kernel.
+    """Construct a 2 x 2 kernel.
+
+    Used to construct orthogonal kernel.
 
     Args:
       p1: A symmetric projection matrix.
       p2: A symmetric projection matrix.
+
     Returns:
       A 2 x 2 kernel [[p1p2,         p1(1-p2)],
                       [(1-p1)p2, (1-p1)(1-p2)]].
@@ -877,6 +865,7 @@
       ksize: Kernel size.
       cin: Number of input channels.
       cout: Number of output channels.
+
     Returns:
       An [ksize, ksize, cin, cout] orthogonal kernel.
     Raises:
@@ -889,11 +878,11 @@
     if ksize == 1:
       return array_ops.expand_dims(array_ops.expand_dims(orth, 0), 0)
 
-    p = self._block_orth(self._symmetric_projection(cout),
-                         self._symmetric_projection(cout))
+    p = self._block_orth(
+        self._symmetric_projection(cout), self._symmetric_projection(cout))
     for _ in range(ksize - 2):
-      temp = self._block_orth(self._symmetric_projection(cout),
-                              self._symmetric_projection(cout))
+      temp = self._block_orth(
+          self._symmetric_projection(cout), self._symmetric_projection(cout))
       p = self._matrix_conv(p, temp)
     for i in range(ksize):
       for j in range(ksize):
@@ -912,15 +901,13 @@
   See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal
-      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
-      `gain` after applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of `gain` after applying
+      this convolution.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -945,6 +932,7 @@
     Args:
       x: A dictionary of length k.
       k: Dimension of x.
+
     Returns:
       A tensor with the same dimension.
     """
@@ -952,10 +940,13 @@
     return array_ops.stack([x[i] for i in range(k)])
 
   def _block_orth(self, projection_matrix):
-    """Construct a kernel. Used to construct orthgonal kernel.
+    """Construct a kernel.
+
+    Used to construct orthogonal kernel.
 
     Args:
       projection_matrix: A symmetric projection matrix of size n x n.
+
     Returns:
       [projection_matrix, (1 - projection_matrix)].
     """
@@ -1002,6 +993,7 @@
       ksize: Kernel size.
       cin: Number of input channels.
       cout: Number of output channels.
+
     Returns:
       An [ksize, ksize, cin, cout] orthogonal kernel.
     Raises:
@@ -1034,14 +1026,13 @@
   See algorithm 1 (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal
-      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
-      `gain` after applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of `gain` after applying
+      this convolution.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -1071,6 +1062,7 @@
       k1: First dimension of x.
       k2: Second dimension of x.
       k3: Third dimension of x.
+
     Returns:
       A k1 * k2 * k3 tensor.
     """
@@ -1080,12 +1072,15 @@
          for j in range(k2)]) for i in range(k1)])
 
   def _block_orth(self, p1, p2, p3):
-    """Construct a 3 x 3 kernel. Used to construct orthgonal kernel.
+    """Construct a 2 x 2 x 2 kernel.
+
+    Used to construct orthogonal kernel.
 
     Args:
       p1: A symmetric projection matrix.
       p2: A symmetric projection matrix.
       p3: A symmetric projection matrix.
+
     Returns:
       A 2 x 2 x 2 kernel.
     Raises:
@@ -1097,11 +1092,14 @@
     n = p1_shape[0]
     eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel2x2x2 = {}
+
     def matmul(p1, p2, p3):
       return math_ops.matmul(math_ops.matmul(p1, p2), p3)
+
     def cast(i, p):
       """Return p or (1-p)."""
-      return i * p + (1-i) * (eye - p)
+      return i * p + (1 - i) * (eye - p)
+
     for i in [0, 1]:
       for j in [0, 1]:
         for k in [0, 1]:
@@ -1139,9 +1137,9 @@
             for index2 in range(min(k, j + 1)):
               for index3 in range(min(k, r + 1)):
                 if (i - index1) < l and (j - index2) < l and (r - index3) < l:
-                  result[i, j, r] += math_ops.matmul(m1[index1, index2, index3],
-                                                     m2[i - index1, j - index2,
-                                                        r - index3])
+                  result[i, j, r] += math_ops.matmul(
+                      m1[index1, index2, index3],
+                      m2[i - index1, j - index2, r - index3])
     return result
 
   def _orthogonal_kernel(self, ksize, cin, cout):
@@ -1151,6 +1149,7 @@
       ksize: Kernel size.
       cin: Number of input channels.
       cout: Number of output channels.
+
     Returns:
       An [ksize, ksize, ksize, cin, cout] orthogonal kernel.
     Raises:
@@ -1162,16 +1161,15 @@
     orth = self._orthogonal_matrix(cout)[0:cin, :]
     if ksize == 1:
       return array_ops.expand_dims(
-          array_ops.expand_dims(
-              array_ops.expand_dims(orth, 0), 0), 0)
+          array_ops.expand_dims(array_ops.expand_dims(orth, 0), 0), 0)
 
-    p = self._block_orth(self._symmetric_projection(cout),
-                         self._symmetric_projection(cout),
-                         self._symmetric_projection(cout))
+    p = self._block_orth(
+        self._symmetric_projection(cout), self._symmetric_projection(cout),
+        self._symmetric_projection(cout))
     for _ in range(ksize - 2):
-      temp = self._block_orth(self._symmetric_projection(cout),
-                              self._symmetric_projection(cout),
-                              self._symmetric_projection(cout))
+      temp = self._block_orth(
+          self._symmetric_projection(cout), self._symmetric_projection(cout),
+          self._symmetric_projection(cout))
       p = self._matrix_conv(p, temp)
     for i in range(ksize):
       for j in range(ksize):
@@ -1196,8 +1194,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, gain=1.0, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -1234,11 +1231,9 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
@@ -1246,8 +1241,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
@@ -1274,10 +1268,9 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
       calling the initializer. Only floating point types are supported.
-
   References:
       [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
@@ -1285,8 +1278,7 @@
 
   @deprecated_args(None,
                    "Call initializer instance with the dtype argument instead "
-                   "of passing it to the constructor",
-                   "dtype")
+                   "of passing it to the constructor", "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
@@ -1338,7 +1330,9 @@
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
+      [Klambauer et al.,
+      2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1363,7 +1357,9 @@
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
+      [Klambauer et al.,
+      2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1389,7 +1385,8 @@
 
   References:
       [He et al., 2015]
-      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
@@ -1412,7 +1409,8 @@
 
   References:
       [He et al., 2015]
-      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py
index 8b36761..28738e5 100644
--- a/tensorflow/python/ops/init_ops_v2.py
+++ b/tensorflow/python/ops/init_ops_v2.py
@@ -156,11 +156,11 @@
     >>> value = [0, 1, 2, 3, 4, 5, 6, 7]
     >>> # value = np.array(value)
     >>> # value = value.reshape([2, 4])
-    >>> init = tf.constant_initializer(value)
+    >>> init = tf.compat.v1.constant_initializer(value)
 
     >>> print('fitting shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[2, 4], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[2, 4], initializer=init)
     >>>   x.initializer.run()
     >>>   print(x.eval())
 
@@ -169,8 +169,8 @@
      [ 4.  5.  6.  7.]]
 
     >>> print('larger shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[3, 4], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[3, 4], initializer=init)
     >>>   x.initializer.run()
     >>>   print(x.eval())
 
@@ -180,8 +180,8 @@
      [ 7.  7.  7.  7.]]
 
     >>> print('smaller shape:')
-    >>> with tf.Session():
-    >>>   x = tf.get_variable('x', shape=[2, 3], initializer=init)
+    >>> with tf.compat.v1.Session():
+    >>>   x = tf.compat.v1.get_variable('x', shape=[2, 3], initializer=init)
 
     ValueError: Too many elements provided. Needed at most 6, but received 8
   ```
@@ -225,7 +225,7 @@
     maxval: A python scalar or a scalar tensor. Upper bound of the range
       of random values to generate.  Defaults to 1 for float types.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
   """
 
@@ -270,7 +270,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the
       random values to generate.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
   """
 
@@ -317,7 +317,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the
       random values to generate.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
   """
 
@@ -371,7 +371,7 @@
     distribution: Random distribution to use. One of "truncated_normal",
       "untruncated_normal" and  "uniform".
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
 
   Raises:
@@ -389,6 +389,9 @@
     if mode not in {"fan_in", "fan_out", "fan_avg"}:
       raise ValueError("Invalid `mode` argument:", mode)
     distribution = distribution.lower()
+    # Compatibility with keras-team/keras.
+    if distribution == "normal":
+      distribution = "truncated_normal"
     if distribution not in {"uniform", "truncated_normal",
                             "untruncated_normal"}:
       raise ValueError("Invalid `distribution` argument:", distribution)
@@ -459,7 +462,7 @@
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
     for behavior.
 
   References:
@@ -561,7 +564,7 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
 
   References:
@@ -590,7 +593,7 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
 
   References:
       [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
diff --git a/tensorflow/python/ops/linalg/adjoint_registrations.py b/tensorflow/python/ops/linalg/adjoint_registrations.py
index 59ec97d..862449c 100644
--- a/tensorflow/python/ops/linalg/adjoint_registrations.py
+++ b/tensorflow/python/ops/linalg/adjoint_registrations.py
@@ -25,6 +25,7 @@
 from tensorflow.python.ops.linalg import linear_operator_block_diag
 from tensorflow.python.ops.linalg import linear_operator_circulant
 from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_householder
 from tensorflow.python.ops.linalg import linear_operator_identity
 from tensorflow.python.ops.linalg import linear_operator_kronecker
 
@@ -125,3 +126,9 @@
       is_self_adjoint=circulant_operator.is_self_adjoint,
       is_positive_definite=circulant_operator.is_positive_definite,
       is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_householder.LinearOperatorHouseholder)
+def _adjoint_householder(householder_operator):
+  return householder_operator
diff --git a/tensorflow/python/ops/linalg/inverse_registrations.py b/tensorflow/python/ops/linalg/inverse_registrations.py
index 12d1e75..48efded 100644
--- a/tensorflow/python/ops/linalg/inverse_registrations.py
+++ b/tensorflow/python/ops/linalg/inverse_registrations.py
@@ -23,6 +23,7 @@
 from tensorflow.python.ops.linalg import linear_operator_block_diag
 from tensorflow.python.ops.linalg import linear_operator_circulant
 from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_householder
 from tensorflow.python.ops.linalg import linear_operator_identity
 from tensorflow.python.ops.linalg import linear_operator_inversion
 from tensorflow.python.ops.linalg import linear_operator_kronecker
@@ -112,3 +113,10 @@
       is_self_adjoint=circulant_operator.is_self_adjoint,
       is_positive_definite=circulant_operator.is_positive_definite,
       is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_householder.LinearOperatorHouseholder)
+def _inverse_householder(householder_operator):
+  return householder_operator
+
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index b9f8411..7a815a0 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -25,6 +25,7 @@
 from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
 from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
+from tensorflow.python.ops.linalg import solve_registrations as _solve_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
 from tensorflow.python.ops.linalg.linear_operator_block_diag import *
@@ -36,6 +37,7 @@
 from tensorflow.python.ops.linalg.linear_operator_kronecker import *
 from tensorflow.python.ops.linalg.linear_operator_low_rank_update import *
 from tensorflow.python.ops.linalg.linear_operator_lower_triangular import *
+from tensorflow.python.ops.linalg.linear_operator_toeplitz import *
 from tensorflow.python.ops.linalg.linear_operator_zeros import *
 # pylint: enable=wildcard-import
 
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 8eeb3f0..31b0bef 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -127,9 +127,10 @@
   """3rd-order Pade approximant for matrix exponential."""
   b = [120.0, 60.0, 12.0]
   b = [constant_op.constant(x, matrix.dtype) for x in b]
-  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
-                         batch_shape=array_ops.shape(matrix)[:-2],
-                         dtype=matrix.dtype)
+  ident = linalg_ops.eye(
+      array_ops.shape(matrix)[-2],
+      batch_shape=array_ops.shape(matrix)[:-2],
+      dtype=matrix.dtype)
   matrix_2 = math_ops.matmul(matrix, matrix)
   tmp = matrix_2 + b[1] * ident
   matrix_u = math_ops.matmul(matrix, tmp)
@@ -141,9 +142,10 @@
   """5th-order Pade approximant for matrix exponential."""
   b = [30240.0, 15120.0, 3360.0, 420.0, 30.0]
   b = [constant_op.constant(x, matrix.dtype) for x in b]
-  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
-                         batch_shape=array_ops.shape(matrix)[:-2],
-                         dtype=matrix.dtype)
+  ident = linalg_ops.eye(
+      array_ops.shape(matrix)[-2],
+      batch_shape=array_ops.shape(matrix)[:-2],
+      dtype=matrix.dtype)
   matrix_2 = math_ops.matmul(matrix, matrix)
   matrix_4 = math_ops.matmul(matrix_2, matrix_2)
   tmp = matrix_4 + b[3] * matrix_2 + b[1] * ident
@@ -156,9 +158,10 @@
   """7th-order Pade approximant for matrix exponential."""
   b = [17297280.0, 8648640.0, 1995840.0, 277200.0, 25200.0, 1512.0, 56.0]
   b = [constant_op.constant(x, matrix.dtype) for x in b]
-  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
-                         batch_shape=array_ops.shape(matrix)[:-2],
-                         dtype=matrix.dtype)
+  ident = linalg_ops.eye(
+      array_ops.shape(matrix)[-2],
+      batch_shape=array_ops.shape(matrix)[:-2],
+      dtype=matrix.dtype)
   matrix_2 = math_ops.matmul(matrix, matrix)
   matrix_4 = math_ops.matmul(matrix_2, matrix_2)
   matrix_6 = math_ops.matmul(matrix_4, matrix_2)
@@ -175,9 +178,10 @@
       2162160.0, 110880.0, 3960.0, 90.0
   ]
   b = [constant_op.constant(x, matrix.dtype) for x in b]
-  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
-                         batch_shape=array_ops.shape(matrix)[:-2],
-                         dtype=matrix.dtype)
+  ident = linalg_ops.eye(
+      array_ops.shape(matrix)[-2],
+      batch_shape=array_ops.shape(matrix)[:-2],
+      dtype=matrix.dtype)
   matrix_2 = math_ops.matmul(matrix, matrix)
   matrix_4 = math_ops.matmul(matrix_2, matrix_2)
   matrix_6 = math_ops.matmul(matrix_4, matrix_2)
@@ -200,15 +204,15 @@
       33522128640.0, 1323241920.0, 40840800.0, 960960.0, 16380.0, 182.0
   ]
   b = [constant_op.constant(x, matrix.dtype) for x in b]
-  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
-                         batch_shape=array_ops.shape(matrix)[:-2],
-                         dtype=matrix.dtype)
+  ident = linalg_ops.eye(
+      array_ops.shape(matrix)[-2],
+      batch_shape=array_ops.shape(matrix)[:-2],
+      dtype=matrix.dtype)
   matrix_2 = math_ops.matmul(matrix, matrix)
   matrix_4 = math_ops.matmul(matrix_2, matrix_2)
   matrix_6 = math_ops.matmul(matrix_4, matrix_2)
   tmp_u = (
-      math_ops.matmul(matrix_6,
-                      matrix_6 + b[11] * matrix_4 + b[9] * matrix_2) +
+      math_ops.matmul(matrix_6, matrix_6 + b[11] * matrix_4 + b[9] * matrix_2) +
       b[7] * matrix_6 + b[5] * matrix_4 + b[3] * matrix_2 + b[1] * ident)
   matrix_u = math_ops.matmul(matrix, tmp_u)
   tmp_v = b[12] * matrix_6 + b[10] * matrix_4 + b[8] * matrix_2
@@ -234,8 +238,8 @@
   containing the exponential for all input submatrices `[..., :, :]`.
 
   Args:
-    input: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
-      or `complex128` with shape `[..., M, M]`.
+    input: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, or
+      `complex128` with shape `[..., M, M]`.
     name:  A name to give this `Op` (optional).
 
   Returns:
@@ -260,10 +264,12 @@
     matrix = array_ops.reshape(
         matrix, array_ops.concat(([-1], array_ops.shape(matrix)[-2:]), axis=0))
     l1_norm = math_ops.reduce_max(
-        math_ops.reduce_sum(math_ops.abs(matrix),
-                            axis=array_ops.size(array_ops.shape(matrix)) - 2),
+        math_ops.reduce_sum(
+            math_ops.abs(matrix),
+            axis=array_ops.size(array_ops.shape(matrix)) - 2),
         axis=-1)
     const = lambda x: constant_op.constant(x, l1_norm.dtype)
+
     def _nest_where(vals, cases):
       assert len(vals) == len(cases) - 1
       if len(vals) == 1:
@@ -281,12 +287,11 @@
               math_ops.log(l1_norm / maxnorm) / math_ops.log(const(2.0))), 0)
       u3, v3 = _matrix_exp_pade3(matrix)
       u5, v5 = _matrix_exp_pade5(matrix)
-      u7, v7 = _matrix_exp_pade7(
-          matrix / math_ops.pow(
-              constant_op.constant(2.0, dtype=matrix.dtype),
-              math_ops.cast(squarings, matrix.dtype))[...,
-                                                      array_ops.newaxis,
-                                                      array_ops.newaxis])
+      u7, v7 = _matrix_exp_pade7(matrix / math_ops.pow(
+          constant_op.constant(2.0, dtype=matrix.dtype),
+          math_ops.cast(
+              squarings,
+              matrix.dtype))[..., array_ops.newaxis, array_ops.newaxis])
       conds = (4.258730016922831e-001, 1.880152677804762e+000)
       u = _nest_where(conds, (u3, u5, u7))
       v = _nest_where(conds, (v3, v5, v7))
@@ -299,21 +304,18 @@
       u5, v5 = _matrix_exp_pade5(matrix)
       u7, v7 = _matrix_exp_pade7(matrix)
       u9, v9 = _matrix_exp_pade9(matrix)
-      u13, v13 = _matrix_exp_pade13(
-          matrix / math_ops.pow(
-              constant_op.constant(2.0, dtype=matrix.dtype),
-              math_ops.cast(squarings, matrix.dtype))[...,
-                                                      array_ops.newaxis,
-                                                      array_ops.newaxis])
-      conds = (1.495585217958292e-002,
-               2.539398330063230e-001,
-               9.504178996162932e-001,
-               2.097847961257068e+000)
+      u13, v13 = _matrix_exp_pade13(matrix / math_ops.pow(
+          constant_op.constant(2.0, dtype=matrix.dtype),
+          math_ops.cast(
+              squarings,
+              matrix.dtype))[..., array_ops.newaxis, array_ops.newaxis])
+      conds = (1.495585217958292e-002, 2.539398330063230e-001,
+               9.504178996162932e-001, 2.097847961257068e+000)
       u = _nest_where(conds, (u3, u5, u7, u9, u13))
       v = _nest_where(conds, (v3, v5, v7, v9, v13))
     else:
-      raise ValueError(
-          'tf.linalg.expm does not support matrices of type %s' % matrix.dtype)
+      raise ValueError('tf.linalg.expm does not support matrices of type %s' %
+                       matrix.dtype)
     numer = u + v
     denom = -u + v
     result = linalg_ops.matrix_solve(denom, numer)
@@ -321,9 +323,11 @@
 
     i = const(0.0)
     c = lambda i, r: math_ops.less(i, max_squarings)
+
     def b(i, r):
-      return i+1, array_ops.where(math_ops.less(i, squarings),
-                                  math_ops.matmul(r, r), r)
+      return i + 1, array_ops.where(
+          math_ops.less(i, squarings), math_ops.matmul(r, r), r)
+
     _, result = control_flow_ops.while_loop(c, b, [i, result])
     if not matrix.shape.is_fully_defined():
       return array_ops.reshape(
@@ -338,13 +342,12 @@
                       diagonals_format='compact',
                       transpose_rhs=False,
                       conjugate_rhs=False,
-                      name=None):
+                      name=None,
+                      partial_pivoting=True):
   r"""Solves tridiagonal systems of equations.
 
-  Solution is computed via Gaussian elemination with partial pivoting.
-
-  The input can be supplied in various formats: `matrix`, `tuple` and `compact`,
-  specified by the `diagonals_format` arg.
+  The input can be supplied in various formats: `matrix`, `sequence` and
+  `compact`, specified by the `diagonals_format` arg.
 
   In `matrix` format, `diagonals` must be a tensor of shape `[..., M, M]`, with
   two inner-most dimensions representing the square tridiagonal matrices.
@@ -392,6 +395,15 @@
   invertible. `tf.debugging.check_numerics` can be applied to the output to
   detect invertibility problems.
 
+  **Note**: with large batch sizes, the computation on the GPU may be slow, if
+  either `partial_pivoting=True` or there are multiple right-hand sides
+  (`K > 1`). If this issue arises, consider if it's possible to disable pivoting
+  and have `K = 1`, or, alternatively, consider using CPU.
+
+  On CPU, solution is computed via Gaussian elimination with or without partial
+  pivoting, depending on `partial_pivoting` parameter. On GPU, Nvidia's cuSPARSE
+  library is used: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv
+
   Args:
     diagonals: A `Tensor` or tuple of `Tensor`s describing left-hand sides. The
       shape depends of `diagonals_format`, see description above. Must be
@@ -404,6 +416,10 @@
       if the shape of rhs is [..., M]).
     conjugate_rhs: If `True`, `rhs` is conjugated before solving.
     name:  A name to give this `Op` (optional).
+    partial_pivoting: whether to perform partial pivoting. `True` by default.
+      Partial pivoting makes the procedure more stable, but slower. Partial
+      pivoting is unnecessary in some cases, including diagonally dominant and
+      symmetric positive definite matrices (see e.g. theorem 9.12 in [1]).
 
   Returns:
     A `Tensor` of shape [..., M] or [..., M, K] containing the solutions.
@@ -412,10 +428,14 @@
     ValueError: An unsupported type is provided as input, or when the input
     tensors have incorrect shapes.
 
+  [1] Nicholas J. Higham (2002). Accuracy and Stability of Numerical Algorithms:
+  Second Edition. SIAM. p. 175. ISBN 978-0-89871-802-7.
+
   """
   if diagonals_format == 'compact':
     return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
-                                             conjugate_rhs, name)
+                                             conjugate_rhs, partial_pivoting,
+                                             name)
 
   if diagonals_format == 'sequence':
     if not isinstance(diagonals, (tuple, list)) or len(diagonals) != 3:
@@ -436,8 +456,8 @@
       if not n or n == m:
         return t
       if n == m - 1:
-        paddings = (
-            [[0, 0] for _ in range(len(t.shape) - 1)] + [last_dim_padding])
+        paddings = ([[0, 0] for _ in range(len(t.shape) - 1)] +
+                    [last_dim_padding])
         return array_ops.pad(t, paddings)
       raise ValueError('Expected {} to be have length {} or {}, got {}.'.format(
           name, m, m - 1, n))
@@ -447,7 +467,8 @@
 
     diagonals = array_ops.stack((superdiag, maindiag, subdiag), axis=-2)
     return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
-                                             conjugate_rhs, name)
+                                             conjugate_rhs, partial_pivoting,
+                                             name)
 
   if diagonals_format == 'matrix':
     m1 = tensor_shape.dimension_value(diagonals.shape[-1])
@@ -466,22 +487,20 @@
     # gather_nd slices into first indices, whereas we need to slice into the
     # last two, so transposing back and forth is necessary.
     dummy_idx = [0, 0]
-    indices = ([[[1, 0], [0, 0], dummy_idx]] + [
-        [[i + 1, i], [i, i], [i - 1, i]] for i in range(1, m - 1)
-    ] + [[dummy_idx, [m - 1, m - 1], [m - 2, m - 1]]])
+    indices = ([[[1, 0], [0, 0], dummy_idx]] +
+               [[[i + 1, i], [i, i], [i - 1, i]] for i in range(1, m - 1)] +
+               [[dummy_idx, [m - 1, m - 1], [m - 2, m - 1]]])
     diagonals = array_ops.transpose(
         array_ops.gather_nd(array_ops.transpose(diagonals), indices))
     return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
-                                             conjugate_rhs, name)
+                                             conjugate_rhs, partial_pivoting,
+                                             name)
 
   raise ValueError('Unrecognized diagonals_format: {}'.format(diagonals_format))
 
 
-def _tridiagonal_solve_compact_format(diagonals,
-                                      rhs,
-                                      transpose_rhs=False,
-                                      conjugate_rhs=False,
-                                      name=None):
+def _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                      conjugate_rhs, partial_pivoting, name):
   """Helper function used after the input has been cast to compact form."""
   diags_rank, rhs_rank = len(diagonals.shape), len(rhs.shape)
 
@@ -491,14 +510,15 @@
   if rhs_rank != diags_rank and rhs_rank != diags_rank - 1:
     raise ValueError('Expected the rank of rhs to be {} or {}, got {}'.format(
         diags_rank - 1, diags_rank, rhs_rank))
-  if diagonals.shape[-2] != 3:
+  if diagonals.shape[-2] and diagonals.shape[-2] != 3:
     raise ValueError('Expected 3 diagonals got {}'.format(diagonals.shape[-2]))
   if not diagonals.shape[:-2].is_compatible_with(rhs.shape[:diags_rank - 2]):
     raise ValueError('Batch shapes {} and {} are incompatible'.format(
         diagonals.shape[:-2], rhs.shape[:diags_rank - 2]))
 
   def check_num_lhs_matches_num_rhs():
-    if diagonals.shape[-1] != rhs.shape[-2]:
+    if (diagonals.shape[-1] and rhs.shape[-2] and
+        diagonals.shape[-1] != rhs.shape[-2]):
       raise ValueError('Expected number of left-hand sided and right-hand '
                        'sides to be equal, got {} and {}'.format(
                            diagonals.shape[-1], rhs.shape[-2]))
@@ -510,7 +530,8 @@
     rhs = array_ops.expand_dims(rhs, -1)
     check_num_lhs_matches_num_rhs()
     return array_ops.squeeze(
-        linalg_ops.tridiagonal_solve(diagonals, rhs, name), -1)
+        linalg_ops.tridiagonal_solve(diagonals, rhs, partial_pivoting, name),
+        -1)
 
   if transpose_rhs:
     rhs = array_ops.matrix_transpose(rhs, conjugate=conjugate_rhs)
@@ -518,5 +539,95 @@
     rhs = math_ops.conj(rhs)
 
   check_num_lhs_matches_num_rhs()
-  result = linalg_ops.tridiagonal_solve(diagonals, rhs, name)
+  result = linalg_ops.tridiagonal_solve(diagonals, rhs, partial_pivoting, name)
   return array_ops.matrix_transpose(result) if transpose_rhs else result
+
+
+@tf_export('linalg.tridiagonal_matmul')
+def tridiagonal_matmul(diagonals, rhs, diagonals_format='compact', name=None):
+  r"""Multiplies tridiagonal matrix by matrix.
+
+  `diagonals` is a representation of a tridiagonal MxM matrix; the layout
+  depends on `diagonals_format`.
+
+  In `matrix` format, `diagonals` must be a tensor of shape `[..., M, M]`, with
+  two inner-most dimensions representing the square tridiagonal matrices.
+  Elements outside of the three diagonals will be ignored.
+
+  In `sequence` format, `diagonals` is a list or tuple of three tensors:
+  `[superdiag, maindiag, subdiag]`, each having shape [..., M]. Last element
+  of `superdiag` and first element of `subdiag` are ignored.
+
+  In `compact` format the three diagonals are brought together into one tensor
+  of shape `[..., 3, M]`, with last two dimensions containing superdiagonals,
+  diagonals, and subdiagonals, in order. Similarly to `sequence` format,
+  elements `diagonals[..., 0, M-1]` and `diagonals[..., 2, 0]` are ignored.
+
+  The `sequence` format is recommended as the one with the best performance.
+
+  `rhs` is the matrix on the right of the product. It has shape `[..., M, N]`.
+
+  Example:
+
+  ```python
+  superdiag = tf.constant([-1, -1, 0], dtype=tf.float64)
+  maindiag = tf.constant([2, 2, 2], dtype=tf.float64)
+  subdiag = tf.constant([0, -1, -1], dtype=tf.float64)
+  diagonals = [superdiag, maindiag, subdiag]
+  rhs = tf.constant([[1, 1], [1, 1], [1, 1]], dtype=tf.float64)
+  x = tf.linalg.tridiagonal_matmul(diagonals, rhs, diagonals_format='sequence')
+  ```
+
+  Args:
+    diagonals: A `Tensor` or tuple of `Tensor`s describing left-hand sides. The
+      shape depends on `diagonals_format`, see description above. Must be
+      `float32`, `float64`, `complex64`, or `complex128`.
+    rhs: A `Tensor` of shape [..., M, N] and with the same dtype as `diagonals`.
+    diagonals_format: one of `matrix`, `sequence`, or `compact`. Default is
+      `compact`.
+    name:  A name to give this `Op` (optional).
+
+  Returns:
+    A `Tensor` of shape [..., M, N] containing the result of multiplication.
+
+  Raises:
+    ValueError: An unsupported type is provided as input, or when the input
+    tensors have incorrect shapes.
+  """
+  if diagonals_format == 'compact':
+    superdiag = diagonals[..., 0, :]
+    maindiag = diagonals[..., 1, :]
+    subdiag = diagonals[..., 2, :]
+  elif diagonals_format == 'sequence':
+    superdiag, maindiag, subdiag = diagonals
+  elif diagonals_format == 'matrix':
+    m1 = tensor_shape.dimension_value(diagonals.shape[-1])
+    m2 = tensor_shape.dimension_value(diagonals.shape[-2])
+    if not m1 or not m2:
+      raise ValueError('The size of the matrix needs to be known for '
+                       'diagonals_format="matrix"')
+    if m1 != m2:
+      raise ValueError(
+          'Expected last two dimensions of diagonals to be same, got {} and {}'
+          .format(m1, m2))
+
+    # TODO(b/131695260): use matrix_diag_part when it supports extracting
+    # arbitrary diagonals.
+    maindiag = array_ops.matrix_diag_part(diagonals)
+    diagonals = array_ops.transpose(diagonals)
+    dummy_index = [0, 0]
+    superdiag_indices = [[i + 1, i] for i in range(0, m1 - 1)] + [dummy_index]
+    subdiag_indices = [dummy_index] + [[i - 1, i] for i in range(1, m1)]
+    superdiag = array_ops.transpose(
+        array_ops.gather_nd(diagonals, superdiag_indices))
+    subdiag = array_ops.transpose(
+        array_ops.gather_nd(diagonals, subdiag_indices))
+  else:
+    raise ValueError('Unrecognized diagonals_format: %s' % diagonals_format)
+
+  # C++ backend requires matrices.
+  # Converting 1-dimensional vectors to matrices with 1 row.
+  superdiag = array_ops.expand_dims(superdiag, -2)
+  maindiag = array_ops.expand_dims(maindiag, -2)
+  subdiag = array_ops.expand_dims(subdiag, -2)
+
+  return linalg_ops.tridiagonal_mat_mul(superdiag, maindiag, subdiag, rhs, name)
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 8fa9f63..80c9169 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -597,16 +597,18 @@
         as `self`.
     """
     if isinstance(x, LinearOperator):
-      if adjoint or adjoint_arg:
-        raise ValueError(".matmul not supported with adjoints.")
-      if (x.range_dimension is not None and
-          self.domain_dimension is not None and
-          x.range_dimension != self.domain_dimension):
+      left_operator = self.adjoint() if adjoint else self
+      right_operator = x.adjoint() if adjoint_arg else x
+
+      if (right_operator.range_dimension is not None and
+          left_operator.domain_dimension is not None and
+          right_operator.range_dimension != left_operator.domain_dimension):
         raise ValueError(
             "Operators are incompatible. Expected `x` to have dimension"
-            " {} but got {}.".format(self.domain_dimension, x.range_dimension))
+            " {} but got {}.".format(
+                left_operator.domain_dimension, right_operator.range_dimension))
       with self._name_scope(name):
-        return linear_operator_algebra.matmul(self, x)
+        return linear_operator_algebra.matmul(left_operator, right_operator)
 
     with self._name_scope(name, values=[x]):
       x = ops.convert_to_tensor(x, name="x")
@@ -780,6 +782,20 @@
       raise NotImplementedError(
           "Exact solve not implemented for an operator that is expected to "
           "not be square.")
+    if isinstance(rhs, LinearOperator):
+      left_operator = self.adjoint() if adjoint else self
+      right_operator = rhs.adjoint() if adjoint_arg else rhs
+
+      if (right_operator.range_dimension is not None and
+          left_operator.domain_dimension is not None and
+          right_operator.range_dimension != left_operator.domain_dimension):
+        raise ValueError(
+            "Operators are incompatible. Expected `rhs` to have dimension"
+            " {} but got {}.".format(
+                left_operator.domain_dimension, right_operator.range_dimension))
+      with self._name_scope(name):
+        return linear_operator_algebra.solve(left_operator, right_operator)
+
     with self._name_scope(name, values=[rhs]):
       rhs = ops.convert_to_tensor(rhs, name="rhs")
       self._check_input_dtype(rhs)
@@ -959,7 +975,7 @@
     ==> [1., 2.]
 
     # Equivalent, but inefficient method
-    tf.matrix_diag_part(my_operator.to_dense())
+    tf.linalg.diag_part(my_operator.to_dense())
     ==> [1., 2.]
     ```
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index 0d1eab4..cd4acea 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -28,6 +28,7 @@
 _ADJOINTS = {}
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
+_SOLVE = {}
 _INVERSES = {}
 
 
@@ -62,6 +63,11 @@
   return _registered_function([type_a, type_b], _MATMUL)
 
 
+def _registered_solve(type_a, type_b):
+  """Get the Solve function registered for classes a and b."""
+  return _registered_function([type_a, type_b], _SOLVE)
+
+
 def _registered_inverse(type_a):
   """Get the Cholesky function registered for class a."""
   return _registered_function([type_a], _INVERSES)
@@ -138,6 +144,31 @@
     return matmul_fn(lin_op_a, lin_op_b)
 
 
+def solve(lin_op_a, lin_op_b, name=None):
+  """Compute lin_op_a.solve(lin_op_b).
+
+  Args:
+    lin_op_a: The LinearOperator on the left.
+    lin_op_b: The LinearOperator on the right.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the solve between `lin_op_a` and
+      `lin_op_b`.
+
+  Raises:
+    ValueError: If no solve method is defined between types of
+      `lin_op_a` and `lin_op_b`.
+  """
+  solve_fn = _registered_solve(type(lin_op_a), type(lin_op_b))
+  if solve_fn is None:
+    raise ValueError("No solve registered for {}.solve({})".format(
+        type(lin_op_a), type(lin_op_b)))
+
+  with ops.name_scope(name, "Solve"):
+    return solve_fn(lin_op_a, lin_op_b)
+
+
 def inverse(lin_op_a, name=None):
   """Get the Inverse associated to lin_op_a.
 
@@ -291,6 +322,52 @@
     return matmul_fn
 
 
+class RegisterSolve(object):
+  """Decorator to register a Solve implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterSolve(
+    lin_op.LinearOperatorIdentity,
+    lin_op.LinearOperatorIdentity)
+  def _solve_identity(a, b):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a, lin_op_cls_b):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator that is computing solve.
+      lin_op_cls_b: the class of the second LinearOperator to solve.
+    """
+    self._key = (lin_op_cls_a, lin_op_cls_b)
+
+  def __call__(self, solve_fn):
+    """Perform the Solve registration.
+
+    Args:
+      solve_fn: The function to use for the Solve.
+
+    Returns:
+      solve_fn
+
+    Raises:
+      TypeError: if solve_fn is not a callable.
+      ValueError: if a Solve function has already been registered for
+        the given argument classes.
+    """
+    if not callable(solve_fn):
+      raise TypeError(
+          "solve_fn must be callable, received: {}".format(solve_fn))
+    if self._key in _SOLVE:
+      raise ValueError("Solve({}, {}) has already been registered.".format(
+          self._key[0].__name__,
+          self._key[1].__name__))
+    _SOLVE[self._key] = solve_fn
+    return solve_fn
+
+
 class RegisterInverse(object):
   """Decorator to register an Inverse implementation function.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_block_diag.py b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
index b0b418c..6a3c0de 100644
--- a/tensorflow/python/ops/linalg/linear_operator_block_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
@@ -83,18 +83,18 @@
   ==> tf.concat([operator_1.matmul(x1), operator_2.matmul(x2)])
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
-  matrix_44 = tf.random_normal(shape=[2, 3, 4, 4])
+  matrix_44 = tf.random.normal(shape=[2, 3, 4, 4])
   operator_44 = LinearOperatorFullMatrix(matrix)
 
   # Create a [1, 3] batch of 5 x 5 linear operators.
-  matrix_55 = tf.random_normal(shape=[1, 3, 5, 5])
+  matrix_55 = tf.random.normal(shape=[1, 3, 5, 5])
   operator_55 = LinearOperatorFullMatrix(matrix_55)
 
   # Combine to create a [2, 3] batch of 9 x 9 operators.
   operator_99 = LinearOperatorBlockDiag([operator_44, operator_55])
 
   # Create a shape [2, 3, 9] vector.
-  x = tf.random_normal(shape=[2, 3, 9])
+  x = tf.random.normal(shape=[2, 3, 9])
   operator_99.matmul(x)
   ==> Shape [2, 3, 9] Tensor
   ```
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
index b74baa5..ca58347 100644
--- a/tensorflow/python/ops/linalg/linear_operator_circulant.py
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -43,10 +43,6 @@
 _FFT_OP = {1: fft_ops.fft, 2: fft_ops.fft2d, 3: fft_ops.fft3d}
 _IFFT_OP = {1: fft_ops.ifft, 2: fft_ops.ifft2d, 3: fft_ops.ifft3d}
 
-# This is the only dtype allowed with fft ops.
-# TODO(langmore) Add other types once available.
-_DTYPE_COMPLEX = dtypes.complex64
-
 
 # TODO(langmore) Add transformations that create common spectrums, e.g.
 #   starting with the convolution kernel
@@ -62,7 +58,7 @@
   def __init__(self,
                spectrum,
                block_depth,
-               input_output_dtype=_DTYPE_COMPLEX,
+               input_output_dtype=dtypes.complex64,
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
@@ -71,12 +67,12 @@
     r"""Initialize an `_BaseLinearOperatorCirculant`.
 
     Args:
-      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
-        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes: `float16`,
+        `float32`, `float64`, `complex64`, `complex128`.  Type can be different
+        than `input_output_dtype`
       block_depth:  Python integer, either 1, 2, or 3.  Will be 1 for circulant,
         2 for block circulant, and 3 for nested block circulant.
-      input_output_dtype: `dtype` for input/output.  Must be either
-        `float32` or `complex64`.
+      input_output_dtype: `dtype` for input/output.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `spectrum` is real, this will always be true.
@@ -141,10 +137,6 @@
     """Static check of spectrum.  Then return `Tensor` version."""
     spectrum = ops.convert_to_tensor(spectrum, name="spectrum")
 
-    allowed_dtypes = [dtypes.float32, dtypes.complex64]
-    if spectrum.dtype not in allowed_dtypes:
-      raise TypeError("Argument spectrum must have dtype in %s.  Found: %s" %
-                      (allowed_dtypes, spectrum.dtype))
     if spectrum.get_shape().ndims is not None:
       if spectrum.get_shape().ndims < self.block_depth:
         raise ValueError(
@@ -408,6 +400,8 @@
     # matmul(x, adjoint=True) = F^{H} diag(conj(spectrum)) F x.
     spectrum = self._conj_spectrum if adjoint else self._spectrum_complex
 
+    x = math_ops.cast(x, spectrum.dtype)
+
     x, spectrum = self._broadcast_batch_dims(x, spectrum)
 
     x_vb = self._vectorize_then_blockify(x)
@@ -495,7 +489,7 @@
 
     # Imaginary part, "im_d".
     if self.is_self_adjoint:
-      im_d_value = 0.
+      im_d_value = array_ops.zeros_like(re_d_value)
     else:
       im_d_value = math_ops.reduce_sum(math_ops.imag(self.spectrum), axis=axis)
 
@@ -602,7 +596,7 @@
   ```python
   # convolution_kernel is real ==> spectrum is Hermitian.
   convolution_kernel = [1., 2., 1.]]
-  spectrum = tf.fft(tf.cast(convolution_kernel, tf.complex64))
+  spectrum = tf.signal.fft(tf.cast(convolution_kernel, tf.complex64))
 
   # spectrum is Hermitian ==> operator is real.
   # spectrum is shape [3] ==> operator is shape [3, 3]
@@ -654,7 +648,7 @@
        [0, 1, 4, 1],
        [1, 0, 1, 4]]
 
-  # convolution_kernel = tf.ifft(spectrum)
+  # convolution_kernel = tf.signal.ifft(spectrum)
   operator.convolution_kernel()
   ==> [4, 1, 0, 1]
   ```
@@ -688,7 +682,7 @@
 
   def __init__(self,
                spectrum,
-               input_output_dtype=_DTYPE_COMPLEX,
+               input_output_dtype=dtypes.complex64,
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
@@ -715,10 +709,10 @@
     a real type is fine.
 
     Args:
-      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
-        `float32`, `complex64`.  Type can be different than `input_output_dtype`
-      input_output_dtype: `dtype` for input/output.  Must be either
-        `float32` or `complex64`.
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes: `float16`,
+        `float32`, `float64`, `complex64`, `complex128`.  Type can be different
+        than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `spectrum` is real, this will always be true.
@@ -830,7 +824,7 @@
   ```python
   # convolution_kernel is real ==> spectrum is Hermitian.
   convolution_kernel = [[1., 2., 1.], [5., -1., 1.]]
-  spectrum = tf.fft2d(tf.cast(convolution_kernel, tf.complex64))
+  spectrum = tf.signal.fft2d(tf.cast(convolution_kernel, tf.complex64))
 
   # spectrum is shape [2, 3] ==> operator is shape [6, 6]
   # spectrum is Hermitian ==> operator is real.
@@ -865,7 +859,7 @@
 
   def __init__(self,
                spectrum,
-               input_output_dtype=_DTYPE_COMPLEX,
+               input_output_dtype=dtypes.complex64,
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
@@ -892,10 +886,10 @@
     a real type is fine.
 
     Args:
-      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
-        `float32`, `complex64`.  Type can be different than `input_output_dtype`
-      input_output_dtype: `dtype` for input/output.  Must be either
-        `float32` or `complex64`.
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes: `float16`,
+        `float32`, `float64`, `complex64`, `complex128`.  Type can be different
+        than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `spectrum` is real, this will always be true.
@@ -1015,7 +1009,7 @@
 
   def __init__(self,
                spectrum,
-               input_output_dtype=_DTYPE_COMPLEX,
+               input_output_dtype=dtypes.complex64,
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
@@ -1043,10 +1037,10 @@
     a real type is fine.
 
     Args:
-      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
-        `float32`, `complex64`.  Type can be different than `input_output_dtype`
-      input_output_dtype: `dtype` for input/output.  Must be either
-        `float32` or `complex64`.
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes: `float16`,
+        `float32`, `float64`, `complex64`, `complex128`.  Type can be different
+        than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `spectrum` is real, this will always be true.
@@ -1070,4 +1064,10 @@
 
 
 def _to_complex(x):
-  return math_ops.cast(x, _DTYPE_COMPLEX)
+  if x.dtype.is_complex:
+    return x
+  dtype = dtypes.complex64
+
+  if x.dtype == dtypes.float64:
+    dtype = dtypes.complex128
+  return math_ops.cast(x, dtype)
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index f499b306..7e6f79e 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -73,18 +73,18 @@
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 5 linear operators.
-  matrix_45 = tf.random_normal(shape=[2, 3, 4, 5])
+  matrix_45 = tf.random.normal(shape=[2, 3, 4, 5])
   operator_45 = LinearOperatorFullMatrix(matrix)
 
   # Create a [2, 3] batch of 5 x 6 linear operators.
-  matrix_56 = tf.random_normal(shape=[2, 3, 5, 6])
+  matrix_56 = tf.random.normal(shape=[2, 3, 5, 6])
   operator_56 = LinearOperatorFullMatrix(matrix_56)
 
   # Compose to create a [2, 3] batch of 4 x 6 operators.
   operator_46 = LinearOperatorComposition([operator_45, operator_56])
 
   # Create a shape [2, 3, 6, 2] vector.
-  x = tf.random_normal(shape=[2, 3, 6, 2])
+  x = tf.random.normal(shape=[2, 3, 6, 2])
   operator.matmul(x)
   ==> Shape [2, 3, 4, 2] Tensor
   ```
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index be893c7..be190ae 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -63,13 +63,13 @@
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
-  diag = tf.random_normal(shape=[2, 3, 4])
+  diag = tf.random.normal(shape=[2, 3, 4])
   operator = LinearOperatorDiag(diag)
 
   # Create a shape [2, 1, 4, 2] vector.  Note that this shape is compatible
   # since the batch dimensions, [2, 1], are broadcast to
   # operator.batch_shape = [2, 3].
-  y = tf.random_normal(shape=[2, 1, 4, 2])
+  y = tf.random.normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)
   ==> operator.matmul(x) = y
   ```
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index 746da8d..f6f36d6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -21,8 +21,8 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
-from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -57,7 +57,7 @@
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
-  matrix = tf.random_normal(shape=[2, 3, 4, 4])
+  matrix = tf.random.normal(shape=[2, 3, 4, 4])
   operator = LinearOperatorFullMatrix(matrix)
   ```
 
@@ -176,7 +176,7 @@
     return array_ops.shape(self._matrix)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return linear_operator_util.matmul_with_broadcast(
+    return math_ops.matmul(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_householder.py b/tensorflow/python/ops/linalg/linear_operator_householder.py
new file mode 100644
index 0000000..be8f05b
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_householder.py
@@ -0,0 +1,239 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` acting like a Householder transformation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = ["LinearOperatorHouseholder",]
+
+
+@tf_export("linalg.LinearOperatorHouseholder")
+class LinearOperatorHouseholder(linear_operator.LinearOperator):
+  """`LinearOperator` acting like a [batch] of Householder transformations.
+
+  This operator acts like a [batch] of householder reflections with shape
+  `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  `LinearOperatorHouseholder` is initialized with a (batch) vector.
+
+  A Householder reflection, defined via a vector `v`, reflects points
+  in `R^n` about the hyperplane orthogonal to `v` and through the origin.
+
+  ```python
+  # Create a 2 x 2 householder transform.
+  vec = [1 / np.sqrt(2), 1. / np.sqrt(2)]
+  operator = LinearOperatorHouseholder(vec)
+
+  operator.to_dense()
+  ==> [[0.,  -1.]
+       [-1., -0.]]
+
+  operator.shape
+  ==> [2, 2]
+
+  operator.log_abs_determinant()
+  ==> scalar Tensor
+
+  x = ... Shape [2, 4] Tensor
+  operator.matmul(x)
+  ==> Shape [2, 4] Tensor
+  ```
+  #### Shape compatibility
+
+  This operator acts on [batch] matrix with compatible shape.
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
+
+  ```
+  operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
+  x.shape =   [C1,...,Cc] + [N, R],
+  and [C1,...,Cc] broadcasts with [B1,...,Bb] to [D1,...,Dd]
+  ```
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               reflection_axis,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name="LinearOperatorHouseholder"):
+    r"""Initialize a `LinearOperatorHouseholder`.
+
+    Args:
+      reflection_axis:  Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`.
+        The vector defining the hyperplane to reflect about.
+        Allowed dtypes: `float16`, `float32`, `float64`, `complex64`,
+        `complex128`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  This is autoset to true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+        This is autoset to false.
+      is_square:  Expect that this operator acts like square [batch] matrices.
+        This is autoset to true.
+      name: A name for this `LinearOperator`.
+
+    Raises:
+      ValueError:  `is_self_adjoint` is not `True`, `is_positive_definite` is
+        not `False` or `is_square` is not `True`.
+    """
+
+    with ops.name_scope(name, values=[reflection_axis]):
+      self._reflection_axis = ops.convert_to_tensor(
+          reflection_axis, name="reflection_axis")
+      self._check_reflection_axis(self._reflection_axis)
+
+      # Check and auto-set hints.
+      if is_self_adjoint is False:  # pylint:disable=g-bool-id-comparison
+        raise ValueError("A Householder operator is always self adjoint.")
+      else:
+        is_self_adjoint = True
+
+      if is_positive_definite is True:  # pylint:disable=g-bool-id-comparison
+        raise ValueError(
+            "A Householder operator is always non-positive definite.")
+      else:
+        is_positive_definite = False
+
+      if is_square is False:  # pylint:disable=g-bool-id-comparison
+        raise ValueError("A Householder operator is always square.")
+      is_square = True
+
+      super(LinearOperatorHouseholder, self).__init__(
+          dtype=self._reflection_axis.dtype,
+          graph_parents=[self._reflection_axis],
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  def _check_reflection_axis(self, reflection_axis):
+    """Static check of reflection_axis."""
+    if (reflection_axis.get_shape().ndims is not None and
+        reflection_axis.get_shape().ndims < 1):
+      raise ValueError(
+          "Argument reflection_axis must have at least 1 dimension.  "
+          "Found: %s" % reflection_axis)
+
+  def _shape(self):
+    # If d_shape = [5, 3], we return [5, 3, 3].
+    d_shape = self._reflection_axis.get_shape()
+    return d_shape.concatenate(d_shape[-1:])
+
+  def _shape_tensor(self):
+    d_shape = array_ops.shape(self._reflection_axis)
+    k = d_shape[-1]
+    return array_ops.concat((d_shape, [k]), 0)
+
+  def _assert_non_singular(self):
+    return control_flow_ops.no_op("assert_non_singular")
+
+  def _assert_positive_definite(self):
+    raise errors.InvalidArgumentError(
+        node_def=None, op=None, message="Householder operators are always "
+        "non-positive definite.")
+
+  def _assert_self_adjoint(self):
+    return control_flow_ops.no_op("assert_self_adjoint")
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    # Given a vector `v`, we would like to reflect `x` about the hyperplane
+    # orthogonal to `v` going through the origin.  We first project `x` to `v`
+    # to get v * dot(v, x) / dot(v, v).  After we project, we can reflect the
+    # projection about the hyperplane by flipping sign to get
+    # -v * dot(v, x) / dot(v, v).  Finally, we can add back the component
+    # that is orthogonal to v. This is invariant under reflection, since the
+    # whole hyperplane is invariant. This component is equal to x - v * dot(v,
+    # x) / dot(v, v), giving the formula x - 2 * v * dot(v, x) / dot(v, v)
+    # for the reflection.
+
+    # Note that because this is a reflection, it lies in O(n) (for real vector
+    # spaces) or U(n) (for complex vector spaces), and thus is its own adjoint.
+    x = linalg.adjoint(x) if adjoint_arg else x
+    normalized_axis = self.reflection_axis / linalg.norm(
+        self.reflection_axis, axis=-1, keepdims=True)
+    mat = normalized_axis[..., array_ops.newaxis]
+    x_dot_normalized_v = math_ops.matmul(mat, x, adjoint_a=True)
+
+    return x - 2 * mat * x_dot_normalized_v
+
+  def _trace(self):
+    # We have (n - 1) +1 eigenvalues and a single -1 eigenvalue.
+    return math_ops.cast(
+        self.domain_dimension_tensor() - 2, self.dtype) * array_ops.ones(
+            shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+  def _determinant(self):
+    # For householder transformations, the determinant is -1.
+    return -array_ops.ones(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+  def _log_abs_determinant(self):
+    # Orthogonal matrix -> log|Q| = 0.
+    return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    # A Householder reflection is an involution (it is its own inverse), so
+    # solving reduces to applying the same matmul.
+    return self._matmul(rhs, adjoint, adjoint_arg)
+
+  def _to_dense(self):
+    normalized_axis = self.reflection_axis / linalg.norm(
+        self.reflection_axis, axis=-1, keepdims=True)
+    mat = normalized_axis[..., array_ops.newaxis]
+    matrix = -2 * math_ops.matmul(mat, mat, adjoint_b=True)
+    return array_ops.matrix_set_diag(
+        matrix, 1. + array_ops.matrix_diag_part(matrix))
+
+  def _diag_part(self):
+    normalized_axis = self.reflection_axis / linalg.norm(
+        self.reflection_axis, axis=-1, keepdims=True)
+    return 1. - 2 * normalized_axis * math_ops.conj(normalized_axis)
+
+  @property
+  def reflection_axis(self):
+    return self._reflection_axis
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 694557e..5fc3d82 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -131,7 +131,7 @@
   operator.matmul(x)
   ==> Shape [2, 4] Tensor, same as x.
 
-  y = tf.random_normal(shape=[3, 2, 4])
+  y = tf.random.normal(shape=[3, 2, 4])
   # Note that y.shape is compatible with operator.shape because operator.shape
   # is broadcast to [3, 2, 2].
   # This broadcast does NOT require copying data, since we can infer that y
@@ -492,7 +492,7 @@
   operator.matmul(x)
   ==> 3 * x
 
-  y = tf.random_normal(shape=[3, 2, 4])
+  y = tf.random.normal(shape=[3, 2, 4])
   # Note that y.shape is compatible with operator.shape because operator.shape
   # is broadcast to [3, 2, 2].
   x = operator.solve(y)
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index 005b9b4..6de64ac 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -98,18 +98,18 @@
   ==> Shape [4, 2] Tensor
 
   # Create a [2, 3] batch of 4 x 5 linear operators.
-  matrix_45 = tf.random_normal(shape=[2, 3, 4, 5])
+  matrix_45 = tf.random.normal(shape=[2, 3, 4, 5])
   operator_45 = LinearOperatorFullMatrix(matrix)
 
   # Create a [2, 3] batch of 5 x 6 linear operators.
-  matrix_56 = tf.random_normal(shape=[2, 3, 5, 6])
+  matrix_56 = tf.random.normal(shape=[2, 3, 5, 6])
   operator_56 = LinearOperatorFullMatrix(matrix_56)
 
   # Compose to create a [2, 3] batch of 20 x 30 operators.
   operator_large = LinearOperatorKronecker([operator_45, operator_56])
 
   # Create a shape [2, 3, 20, 2] vector.
-  x = tf.random_normal(shape=[2, 3, 6, 2])
+  x = tf.random.normal(shape=[2, 3, 6, 2])
   operator_large.matmul(x)
   ==> Shape [2, 3, 30, 2] Tensor
   ```
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index aa0500a..3d67154 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -354,17 +354,14 @@
     leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
-      uh_x = linear_operator_util.matmul_with_broadcast(
-          u, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_uh_x = d.matmul(uh_x, adjoint=adjoint)
-      v_d_uh_x = linear_operator_util.matmul_with_broadcast(
-          v, d_uh_x)
+      v_d_uh_x = math_ops.matmul(v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
-      vh_x = linear_operator_util.matmul_with_broadcast(
-          v, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_vh_x = d.matmul(vh_x, adjoint=adjoint)
-      u_d_vh_x = linear_operator_util.matmul_with_broadcast(u, d_vh_x)
+      u_d_vh_x = math_ops.matmul(u, d_vh_x)
       return leading_term + u_d_vh_x
 
   def _determinant(self):
@@ -425,8 +422,7 @@
     # L^{-1} rhs
     linv_rhs = l.solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
     # V^H L^{-1} rhs
-    vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
-        v, linv_rhs, adjoint_a=True)
+    vh_linv_rhs = math_ops.matmul(v, linv_rhs, adjoint_a=True)
     # C^{-1} V^H L^{-1} rhs
     if self._use_cholesky:
       capinv_vh_linv_rhs = linear_operator_util.cholesky_solve_with_broadcast(
@@ -435,8 +431,7 @@
       capinv_vh_linv_rhs = linear_operator_util.matrix_solve_with_broadcast(
           self._capacitance, vh_linv_rhs, adjoint=adjoint)
     # U C^{-1} V^H M^{-1} rhs
-    u_capinv_vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
-        u, capinv_vh_linv_rhs)
+    u_capinv_vh_linv_rhs = math_ops.matmul(u, capinv_vh_linv_rhs)
     # L^{-1} U C^{-1} V^H L^{-1} rhs
     linv_u_capinv_vh_linv_rhs = l.solve(u_capinv_vh_linv_rhs, adjoint=adjoint)
 
@@ -450,8 +445,7 @@
     # L^{-1} U
     linv_u = self.base_operator.solve(self.u)
     # V^H L^{-1} U
-    vh_linv_u = linear_operator_util.matmul_with_broadcast(
-        self.v, linv_u, adjoint_a=True)
+    vh_linv_u = math_ops.matmul(self.v, linv_u, adjoint_a=True)
 
     # D^{-1} + V^H L^{-1} V
     capacitance = self._diag_inv_operator.add_to_tensor(vh_linv_u)
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index d33fe17..cc2e1ba 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -66,7 +66,7 @@
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
-  tril = tf.random_normal(shape=[2, 3, 4, 4])
+  tril = tf.random.normal(shape=[2, 3, 4, 4])
   operator = LinearOperatorLowerTriangular(tril)
   ```
 
@@ -191,7 +191,7 @@
         message="Singular operator:  Diagonal contained zero values.")
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return linear_operator_util.matmul_with_broadcast(
+    return math_ops.matmul(
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 854c4de..7207423 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 import abc
+import itertools
 import numpy as np
 import six
 
@@ -37,7 +38,7 @@
 from tensorflow.python.platform import test
 
 
-class OperatorBuildInfo(object):
+class OperatorShapesInfo(object):
   """Object encoding expected shape for a test.
 
   Encodes the expected shape of a matrix for a test. Also
@@ -65,6 +66,7 @@
       dtypes.complex64: 1e-6,
       dtypes.complex128: 1e-12
   }
+
   _rtol = {
       dtypes.float16: 1e-3,
       dtypes.float32: 1e-6,
@@ -80,31 +82,31 @@
     rtol = self._rtol[dtype]
     self.assertAllClose(x, y, atol=atol, rtol=rtol)
 
-  @property
-  def _adjoint_options(self):
+  @staticmethod
+  def adjoint_options():
     return [False, True]
 
-  @property
-  def _adjoint_arg_options(self):
+  @staticmethod
+  def adjoint_arg_options():
     return [False, True]
 
-  @property
-  def _dtypes_to_test(self):
-    # TODO(langmore) Test tf.float16 once tf.matrix_solve works in 16bit.
+  @staticmethod
+  def dtypes_to_test():
+    # TODO(langmore) Test tf.float16 once tf.linalg.solve works in 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  @property
-  def _use_placeholder_options(self):
+  @staticmethod
+  def use_placeholder_options():
     return [False, True]
 
-  @abc.abstractproperty
-  def _operator_build_infos(self):
-    """Returns list of OperatorBuildInfo, encapsulating the shape to test."""
-    raise NotImplementedError("operator_build_infos has not been implemented.")
+  @staticmethod
+  def operator_shapes_infos():
+    """Returns list of OperatorShapesInfo, encapsulating the shape to test."""
+    raise NotImplementedError("operator_shapes_infos has not been implemented.")
 
   @abc.abstractmethod
-  def _operator_and_matrix(
-      self, build_info, dtype, use_placeholder,
+  def operator_and_matrix(
+      self, shapes_info, dtype, use_placeholder,
       ensure_self_adjoint_and_pd=False):
     """Build a batch matrix and an Operator that should have similar behavior.
 
@@ -112,7 +114,7 @@
     together, and is used by tests.
 
     Args:
-      build_info: `OperatorBuildInfo`, encoding shape information about the
+      shapes_info: `OperatorShapesInfo`, encoding shape information about the
         operator.
       dtype:  Numpy dtype.  Data type of returned array/operator.
       use_placeholder:  Python bool.  If True, initialize the operator with a
@@ -132,7 +134,7 @@
     raise NotImplementedError("Not implemented yet.")
 
   @abc.abstractmethod
-  def _make_rhs(self, operator, adjoint, with_batch=True):
+  def make_rhs(self, operator, adjoint, with_batch=True):
     """Make a rhs appropriate for calling operator.solve(rhs).
 
     Args:
@@ -146,10 +148,10 @@
     Returns:
       A `Tensor`
     """
-    raise NotImplementedError("_make_rhs is not defined.")
+    raise NotImplementedError("make_rhs is not defined.")
 
   @abc.abstractmethod
-  def _make_x(self, operator, adjoint, with_batch=True):
+  def make_x(self, operator, adjoint, with_batch=True):
     """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
@@ -162,282 +164,358 @@
     Returns:
       A `Tensor`
     """
-    raise NotImplementedError("_make_x is not defined.")
+    raise NotImplementedError("make_x is not defined.")
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     """List of test names to skip."""
     # Subclasses should over-ride if they want to skip some tests.
     # To skip "test_foo", add "foo" to this list.
     return []
 
-  def _skip_if_tests_to_skip_contains(self, test_name):
-    """If self._tests_to_skip contains test_name, raise SkipTest exception.
 
-    See tests below for usage.
+# pylint:disable=missing-docstring
 
-    Args:
-      test_name:  String name corresponding to a test.
 
-    Raises:
-      SkipTest Exception, if test_name is in self._tests_to_skip.
-    """
-    if test_name in self._tests_to_skip:
-      self.skipTest(
-          "{} skipped because it was added to self._tests_to_skip.".format(
-              test_name))
-
-  @test_util.run_deprecated_v1
+def _test_to_dense(use_placeholder, shapes_info, dtype):
   def test_to_dense(self):
-    self._skip_if_tests_to_skip_contains("to_dense")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_dense = operator.to_dense()
-            if not use_placeholder:
-              self.assertAllEqual(build_info.shape, op_dense.get_shape())
-            op_dense_v, mat_v = sess.run([op_dense, mat])
-            self.assertAC(op_dense_v, mat_v)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_dense = operator.to_dense()
+      if not use_placeholder:
+        self.assertAllEqual(shapes_info.shape, op_dense.get_shape())
+      op_dense_v, mat_v = sess.run([op_dense, mat])
+      self.assertAC(op_dense_v, mat_v)
+  return test_to_dense
 
-  @test_util.run_deprecated_v1
+
+def _test_det(use_placeholder, shapes_info, dtype):
   def test_det(self):
-    self._skip_if_tests_to_skip_contains("det")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_det = operator.determinant()
-            if not use_placeholder:
-              self.assertAllEqual(build_info.shape[:-2], op_det.get_shape())
-            op_det_v, mat_det_v = sess.run(
-                [op_det, linalg_ops.matrix_determinant(mat)])
-            self.assertAC(op_det_v, mat_det_v)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_det = operator.determinant()
+      if not use_placeholder:
+        self.assertAllEqual(shapes_info.shape[:-2], op_det.get_shape())
+      op_det_v, mat_det_v = sess.run(
+          [op_det, linalg_ops.matrix_determinant(mat)])
+      self.assertAC(op_det_v, mat_det_v)
+  return test_det
 
-  @test_util.run_deprecated_v1
+
+def _test_log_abs_det(use_placeholder, shapes_info, dtype):
   def test_log_abs_det(self):
-    self._skip_if_tests_to_skip_contains("log_abs_det")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_log_abs_det = operator.log_abs_determinant()
-            _, mat_log_abs_det = linalg.slogdet(mat)
-            if not use_placeholder:
-              self.assertAllEqual(
-                  build_info.shape[:-2], op_log_abs_det.get_shape())
-            op_log_abs_det_v, mat_log_abs_det_v = sess.run(
-                [op_log_abs_det, mat_log_abs_det])
-            self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_log_abs_det = operator.log_abs_determinant()
+      _, mat_log_abs_det = linalg.slogdet(mat)
+      if not use_placeholder:
+        self.assertAllEqual(
+            shapes_info.shape[:-2], op_log_abs_det.get_shape())
+      op_log_abs_det_v, mat_log_abs_det_v = sess.run(
+          [op_log_abs_det, mat_log_abs_det])
+      self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
+  return test_log_abs_det
 
-  def _test_matmul(self, with_batch):
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        # If batch dimensions are omitted, but there are
-        # no batch dimensions for the linear operator, then
-        # skip the test case. This is already checked with
-        # with_batch=True.
-        if not with_batch and len(build_info.shape) <= 2:
-          continue
-        for dtype in self._dtypes_to_test:
-          for adjoint in self._adjoint_options:
-            for adjoint_arg in self._adjoint_arg_options:
-              with self.session(graph=ops.Graph()) as sess:
-                sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-                operator, mat = self._operator_and_matrix(
-                    build_info, dtype, use_placeholder=use_placeholder)
-                x = self._make_x(
-                    operator, adjoint=adjoint, with_batch=with_batch)
-                # If adjoint_arg, compute A X^H^H = A X.
-                if adjoint_arg:
-                  op_matmul = operator.matmul(
-                      linalg.adjoint(x),
-                      adjoint=adjoint,
-                      adjoint_arg=adjoint_arg)
-                else:
-                  op_matmul = operator.matmul(x, adjoint=adjoint)
-                mat_matmul = linear_operator_util.matmul_with_broadcast(
-                    mat, x, adjoint_a=adjoint)
-                if not use_placeholder:
-                  self.assertAllEqual(op_matmul.get_shape(),
-                                      mat_matmul.get_shape())
-                op_matmul_v, mat_matmul_v = sess.run(
-                    [op_matmul, mat_matmul])
-                self.assertAC(op_matmul_v, mat_matmul_v)
 
-  @test_util.run_deprecated_v1
+def _test_matmul_base(
+    self,
+    use_placeholder,
+    shapes_info,
+    dtype,
+    adjoint,
+    adjoint_arg,
+    with_batch):
+  # If batch dimensions are omitted, but there are
+  # no batch dimensions for the linear operator, then
+  # skip the test case. This is already checked with
+  # with_batch=True.
+  if not with_batch and len(shapes_info.shape) <= 2:
+    return
+  with self.session(graph=ops.Graph()) as sess:
+    sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+    operator, mat = self.operator_and_matrix(
+        shapes_info, dtype, use_placeholder=use_placeholder)
+    x = self.make_x(
+        operator, adjoint=adjoint, with_batch=with_batch)
+    # If adjoint_arg, compute A X^H^H = A X.
+    if adjoint_arg:
+      op_matmul = operator.matmul(
+          linalg.adjoint(x),
+          adjoint=adjoint,
+          adjoint_arg=adjoint_arg)
+    else:
+      op_matmul = operator.matmul(x, adjoint=adjoint)
+    mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
+    if not use_placeholder:
+      self.assertAllEqual(op_matmul.get_shape(),
+                          mat_matmul.get_shape())
+    op_matmul_v, mat_matmul_v = sess.run(
+        [op_matmul, mat_matmul])
+    self.assertAC(op_matmul_v, mat_matmul_v)
+
+
+def _test_matmul(
+    use_placeholder,
+    shapes_info,
+    dtype,
+    adjoint,
+    adjoint_arg):
   def test_matmul(self):
-    self._skip_if_tests_to_skip_contains("matmul")
-    self._test_matmul(with_batch=True)
+    _test_matmul_base(
+        self,
+        use_placeholder,
+        shapes_info,
+        dtype,
+        adjoint,
+        adjoint_arg,
+        with_batch=True)
+  return test_matmul
 
-  @test_util.run_deprecated_v1
+
+def _test_matmul_with_broadcast(
+    use_placeholder,
+    shapes_info,
+    dtype,
+    adjoint,
+    adjoint_arg):
   def test_matmul_with_broadcast(self):
-    self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
-    self._test_matmul(with_batch=False)
+    _test_matmul_base(
+        self,
+        use_placeholder,
+        shapes_info,
+        dtype,
+        adjoint,
+        adjoint_arg,
+        with_batch=False)  # Omit batch dims on x to exercise broadcasting.
+  return test_matmul_with_broadcast
 
-  @test_util.run_deprecated_v1
+
+def _test_adjoint(use_placeholder, shapes_info, dtype):
   def test_adjoint(self):
-    self._skip_if_tests_to_skip_contains("adjoint")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.test_session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_adjoint = operator.adjoint().to_dense()
-            op_adjoint_h = operator.H.to_dense()
-            mat_adjoint = linalg.adjoint(mat)
-            op_adjoint_v, op_adjoint_h_v, mat_adjoint_v = sess.run(
-                [op_adjoint, op_adjoint_h, mat_adjoint])
-            self.assertAC(mat_adjoint_v, op_adjoint_v)
-            self.assertAC(mat_adjoint_v, op_adjoint_h_v)
+    with self.test_session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_adjoint = operator.adjoint().to_dense()
+      op_adjoint_h = operator.H.to_dense()
+      mat_adjoint = linalg.adjoint(mat)
+      op_adjoint_v, op_adjoint_h_v, mat_adjoint_v = sess.run(
+          [op_adjoint, op_adjoint_h, mat_adjoint])
+      self.assertAC(mat_adjoint_v, op_adjoint_v)
+      self.assertAC(mat_adjoint_v, op_adjoint_h_v)
+  return test_adjoint
 
-  @test_util.run_deprecated_v1
+
+def _test_cholesky(use_placeholder, shapes_info, dtype):
   def test_cholesky(self):
-    self._skip_if_tests_to_skip_contains("cholesky")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.test_session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder,
-                ensure_self_adjoint_and_pd=True)
-            op_chol = operator.cholesky().to_dense()
-            mat_chol = linalg_ops.cholesky(mat)
-            op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
-            self.assertAC(mat_chol_v, op_chol_v)
+    with self.test_session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder,
+          ensure_self_adjoint_and_pd=True)
+      op_chol = operator.cholesky().to_dense()
+      mat_chol = linalg_ops.cholesky(mat)
+      op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
+      self.assertAC(mat_chol_v, op_chol_v)
+  return test_cholesky
 
-  @test_util.run_deprecated_v1
-  def _test_solve(self, with_batch):
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        # If batch dimensions are omitted, but there are
-        # no batch dimensions for the linear operator, then
-        # skip the test case. This is already checked with
-        # with_batch=True.
-        if not with_batch and len(build_info.shape) <= 2:
-          continue
-        for dtype in self._dtypes_to_test:
-          for adjoint in self._adjoint_options:
-            for adjoint_arg in self._adjoint_arg_options:
-              with self.session(graph=ops.Graph()) as sess:
-                sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-                operator, mat = self._operator_and_matrix(
-                    build_info, dtype, use_placeholder=use_placeholder)
-                rhs = self._make_rhs(
-                    operator, adjoint=adjoint, with_batch=with_batch)
-                # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
-                if adjoint_arg:
-                  op_solve = operator.solve(
-                      linalg.adjoint(rhs),
-                      adjoint=adjoint,
-                      adjoint_arg=adjoint_arg)
-                else:
-                  op_solve = operator.solve(
-                      rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
-                mat_solve = linear_operator_util.matrix_solve_with_broadcast(
-                    mat, rhs, adjoint=adjoint)
-                if not use_placeholder:
-                  self.assertAllEqual(op_solve.get_shape(),
-                                      mat_solve.get_shape())
-                op_solve_v, mat_solve_v = sess.run([op_solve, mat_solve])
-                self.assertAC(op_solve_v, mat_solve_v)
 
-  @test_util.run_deprecated_v1
+def _test_solve_base(
+    self,
+    use_placeholder,
+    shapes_info,
+    dtype,
+    adjoint,
+    adjoint_arg,
+    with_batch):
+  # If batch dimensions are omitted, but there are
+  # no batch dimensions for the linear operator, then
+  # skip the test case. This is already checked with
+  # with_batch=True.
+  if not with_batch and len(shapes_info.shape) <= 2:
+    return
+  with self.session(graph=ops.Graph()) as sess:
+    sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+    operator, mat = self.operator_and_matrix(
+        shapes_info, dtype, use_placeholder=use_placeholder)
+    rhs = self.make_rhs(
+        operator, adjoint=adjoint, with_batch=with_batch)
+    # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
+    if adjoint_arg:
+      op_solve = operator.solve(
+          linalg.adjoint(rhs),
+          adjoint=adjoint,
+          adjoint_arg=adjoint_arg)
+    else:
+      op_solve = operator.solve(
+          rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
+    mat_solve = linear_operator_util.matrix_solve_with_broadcast(
+        mat, rhs, adjoint=adjoint)
+    if not use_placeholder:
+      self.assertAllEqual(op_solve.get_shape(),
+                          mat_solve.get_shape())
+    op_solve_v, mat_solve_v = sess.run([op_solve, mat_solve])
+    self.assertAC(op_solve_v, mat_solve_v)
+
+
+def _test_solve(
+    use_placeholder, shapes_info, dtype, adjoint, adjoint_arg):
   def test_solve(self):
-    self._skip_if_tests_to_skip_contains("solve")
-    self._test_solve(with_batch=True)
+    _test_solve_base(
+        self,
+        use_placeholder,
+        shapes_info,
+        dtype,
+        adjoint,
+        adjoint_arg,
+        with_batch=True)
+  return test_solve
 
-  @test_util.run_deprecated_v1
+
+def _test_solve_with_broadcast(
+    use_placeholder, shapes_info, dtype, adjoint, adjoint_arg):
   def test_solve_with_broadcast(self):
-    self._skip_if_tests_to_skip_contains("solve_with_broadcast")
-    self._test_solve(with_batch=False)
+    _test_solve_base(
+        self,
+        use_placeholder,
+        shapes_info,
+        dtype,
+        adjoint,
+        adjoint_arg,
+        with_batch=False)
+  return test_solve_with_broadcast
 
-  def _test_inverse(self):
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_inverse_v, mat_inverse_v = sess.run([
-                operator.inverse().to_dense(), linalg.inv(mat)])
-            self.assertAC(op_inverse_v, mat_inverse_v)
 
-  @test_util.run_deprecated_v1
+def _test_inverse(use_placeholder, shapes_info, dtype):
   def test_inverse(self):
-    self._skip_if_tests_to_skip_contains("inverse")
-    self._test_inverse()
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_inverse_v, mat_inverse_v = sess.run([
+          operator.inverse().to_dense(), linalg.inv(mat)])
+      self.assertAC(op_inverse_v, mat_inverse_v)
+  return test_inverse
 
-  @test_util.run_deprecated_v1
+
+def _test_trace(use_placeholder, shapes_info, dtype):
   def test_trace(self):
-    self._skip_if_tests_to_skip_contains("trace")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_trace = operator.trace()
-            mat_trace = math_ops.trace(mat)
-            if not use_placeholder:
-              self.assertAllEqual(op_trace.get_shape(), mat_trace.get_shape())
-            op_trace_v, mat_trace_v = sess.run([op_trace, mat_trace])
-            self.assertAC(op_trace_v, mat_trace_v)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_trace = operator.trace()
+      mat_trace = math_ops.trace(mat)
+      if not use_placeholder:
+        self.assertAllEqual(op_trace.get_shape(), mat_trace.get_shape())
+      op_trace_v, mat_trace_v = sess.run([op_trace, mat_trace])
+      self.assertAC(op_trace_v, mat_trace_v)
+  return test_trace
 
-  @test_util.run_deprecated_v1
+
+def _test_add_to_tensor(use_placeholder, shapes_info, dtype):
   def test_add_to_tensor(self):
-    self._skip_if_tests_to_skip_contains("add_to_tensor")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_plus_2mat = operator.add_to_tensor(2 * mat)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_plus_2mat = operator.add_to_tensor(2 * mat)
 
-            if not use_placeholder:
-              self.assertAllEqual(build_info.shape, op_plus_2mat.get_shape())
+      if not use_placeholder:
+        self.assertAllEqual(shapes_info.shape, op_plus_2mat.get_shape())
 
-            op_plus_2mat_v, mat_v = sess.run([op_plus_2mat, mat])
+      op_plus_2mat_v, mat_v = sess.run([op_plus_2mat, mat])
 
-            self.assertAC(op_plus_2mat_v, 3 * mat_v)
+      self.assertAC(op_plus_2mat_v, 3 * mat_v)
+  return test_add_to_tensor
 
-  @test_util.run_deprecated_v1
+
+def _test_diag_part(use_placeholder, shapes_info, dtype):
   def test_diag_part(self):
-    self._skip_if_tests_to_skip_contains("diag_part")
-    for use_placeholder in self._use_placeholder_options:
-      for build_info in self._operator_build_infos:
-        for dtype in self._dtypes_to_test:
-          with self.session(graph=ops.Graph()) as sess:
-            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-            operator, mat = self._operator_and_matrix(
-                build_info, dtype, use_placeholder=use_placeholder)
-            op_diag_part = operator.diag_part()
-            mat_diag_part = array_ops.matrix_diag_part(mat)
+    with self.session(graph=ops.Graph()) as sess:
+      sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+      operator, mat = self.operator_and_matrix(
+          shapes_info, dtype, use_placeholder=use_placeholder)
+      op_diag_part = operator.diag_part()
+      mat_diag_part = array_ops.matrix_diag_part(mat)
 
-            if not use_placeholder:
-              self.assertAllEqual(mat_diag_part.get_shape(),
-                                  op_diag_part.get_shape())
+      if not use_placeholder:
+        self.assertAllEqual(mat_diag_part.get_shape(),
+                            op_diag_part.get_shape())
 
-            op_diag_part_, mat_diag_part_ = sess.run(
-                [op_diag_part, mat_diag_part])
+      op_diag_part_, mat_diag_part_ = sess.run(
+          [op_diag_part, mat_diag_part])
 
-            self.assertAC(op_diag_part_, mat_diag_part_)
+      self.assertAC(op_diag_part_, mat_diag_part_)
+  return test_diag_part
+
+# pylint:enable=missing-docstring
+
+
+def add_tests(test_cls):
+  """Add tests for LinearOperator methods."""
+  test_name_dict = {
+      "add_to_tensor": _test_add_to_tensor,
+      "cholesky": _test_cholesky,
+      "det": _test_det,
+      "diag_part": _test_diag_part,
+      "inverse": _test_inverse,
+      "log_abs_det": _test_log_abs_det,
+      "matmul": _test_matmul,
+      "matmul_with_broadcast": _test_matmul_with_broadcast,
+      "solve": _test_solve,
+      "solve_with_broadcast": _test_solve_with_broadcast,
+      "to_dense": _test_to_dense,
+      "trace": _test_trace,
+  }
+  tests_with_adjoint_args = [
+      "matmul",
+      "matmul_with_broadcast",
+      "solve",
+      "solve_with_broadcast",
+  ]
+
+  for name, test_template_fn in test_name_dict.items():
+    if name in test_cls.tests_to_skip():
+      continue
+
+    for dtype, use_placeholder, shape_info in itertools.product(
+        test_cls.dtypes_to_test(),
+        test_cls.use_placeholder_options(),
+        test_cls.operator_shapes_infos()):
+      base_test_name = "_".join([
+          "test", name, "_shape={},dtype={},use_placeholder={}".format(
+              shape_info.shape, dtype, use_placeholder)])
+      if name in tests_with_adjoint_args:
+        for adjoint in test_cls.adjoint_options():
+          for adjoint_arg in test_cls.adjoint_arg_options():
+            test_name = base_test_name + ",adjoint={},adjoint_arg={}".format(
+                adjoint, adjoint_arg)
+            if hasattr(test_cls, test_name):
+              raise RuntimeError("Test %s defined more than once" % test_name)
+            setattr(
+                test_cls,
+                test_name,
+                test_util.run_deprecated_v1(test_template_fn(
+                    use_placeholder,
+                    shape_info,
+                    dtype,
+                    adjoint,
+                    adjoint_arg)))
+      else:
+        if hasattr(test_cls, base_test_name):
+          raise RuntimeError("Test %s defined more than once" % base_test_name)
+        setattr(
+            test_cls,
+            base_test_name,
+            test_util.run_deprecated_v1(test_template_fn(
+                use_placeholder, shape_info, dtype)))
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -448,24 +526,24 @@
   LinearOperatorDerivedClassTest that are not defined here.
   """
 
-  @property
-  def _operator_build_infos(self):
-    build_info = OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shapes_info = OperatorShapesInfo
     # non-batch operators (n, n) and batch operators.
     return [
-        build_info((0, 0)),
-        build_info((1, 1)),
-        build_info((1, 3, 3)),
-        build_info((3, 4, 4)),
-        build_info((2, 1, 4, 4))]
+        shapes_info((0, 0)),
+        shapes_info((1, 1)),
+        shapes_info((1, 3, 3)),
+        shapes_info((3, 4, 4)),
+        shapes_info((2, 1, 4, 4))]
 
-  def _make_rhs(self, operator, adjoint, with_batch=True):
+  def make_rhs(self, operator, adjoint, with_batch=True):
     # This operator is square, so rhs and x will have same shape.
     # adjoint value makes no difference because the operator shape doesn't
     # change since it is square, but be pedantic.
-    return self._make_x(operator, adjoint=not adjoint, with_batch=with_batch)
+    return self.make_x(operator, adjoint=not adjoint, with_batch=with_batch)
 
-  def _make_x(self, operator, adjoint, with_batch=True):
+  def make_x(self, operator, adjoint, with_batch=True):
     # Value of adjoint makes no difference because the operator is square.
     # Return the number of systems to solve, R, equal to 1 or 2.
     r = self._get_num_systems(operator)
@@ -508,8 +586,8 @@
   LinearOperatorDerivedClassTest that are not defined here.
   """
 
-  @property
-  def _tests_to_skip(self):
+  @staticmethod
+  def tests_to_skip():
     """List of test names to skip."""
     return [
         "cholesky",
@@ -520,23 +598,23 @@
         "log_abs_det"
     ]
 
-  @property
-  def _operator_build_infos(self):
-    build_info = OperatorBuildInfo
+  @staticmethod
+  def operator_shapes_infos():
+    shapes_info = OperatorShapesInfo
     # non-batch operators (n, n) and batch operators.
     return [
-        build_info((2, 1)),
-        build_info((1, 2)),
-        build_info((1, 3, 2)),
-        build_info((3, 3, 4)),
-        build_info((2, 1, 2, 4))]
+        shapes_info((2, 1)),
+        shapes_info((1, 2)),
+        shapes_info((1, 3, 2)),
+        shapes_info((3, 3, 4)),
+        shapes_info((2, 1, 2, 4))]
 
-  def _make_rhs(self, operator, adjoint, with_batch=True):
+  def make_rhs(self, operator, adjoint, with_batch=True):
     # TODO(langmore) Add once we're testing solve_ls.
     raise NotImplementedError(
-        "_make_rhs not implemented because we don't test solve")
+        "make_rhs not implemented because we don't test solve")
 
-  def _make_x(self, operator, adjoint, with_batch=True):
+  def make_x(self, operator, adjoint, with_batch=True):
     # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
diff --git a/tensorflow/python/ops/linalg/linear_operator_toeplitz.py b/tensorflow/python/ops/linalg/linear_operator_toeplitz.py
new file mode 100644
index 0000000..3921689
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_toeplitz.py
@@ -0,0 +1,247 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` acting like a Toeplitz matrix."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = ["LinearOperatorToeplitz",]
+
+
+@tf_export("linalg.LinearOperatorToeplitz")
+class LinearOperatorToeplitz(linear_operator.LinearOperator):
+  """`LinearOperator` acting like a [batch] of toeplitz matrices.
+
+  This operator acts like a [batch] Toeplitz matrix `A` with shape
+  `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of toeplitz matrices
+
+  Toeplitz means that `A` has constant diagonals. Hence, `A` can be generated
+  with two vectors. One represents the first column of the matrix, and the
+  other represents the first row.
+
+  Below is a 4 x 4 example:
+
+  ```
+  A = |a b c d|
+      |e a b c|
+      |f e a b|
+      |g f e a|
+  ```
+
+  #### Example of a Toeplitz operator.
+
+  ```python
+  # Create a 3 x 3 Toeplitz operator.
+  col = [1., 2., 3.]
+  row = [1., 4., -9.]
+  operator = LinearOperatorToeplitz(col, row)
+
+  operator.to_dense()
+  ==> [[1., 4., -9.],
+       [2., 1., 4.],
+       [3., 2., 1.]]
+
+  operator.shape
+  ==> [3, 3]
+
+  operator.log_abs_determinant()
+  ==> scalar Tensor
+
+  x = ... Shape [3, 4] Tensor
+  operator.matmul(x)
+  ==> Shape [3, 4] Tensor
+  ```
+  #### Shape compatibility
+
+  This operator acts on [batch] matrix with compatible shape.
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
+
+  ```
+  operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
+  x.shape =   [C1,...,Cc] + [N, R],
+  and [C1,...,Cc] broadcasts with [B1,...,Bb] to [D1,...,Dd]
+  ```
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               col,
+               row,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name="LinearOperatorToeplitz"):
+    r"""Initialize a `LinearOperatorToeplitz`.
+
+    Args:
+      col: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`.
+        The first column of the operator. Allowed dtypes: `float16`, `float32`,
+          `float64`, `complex64`, `complex128`. Note that the first entry of
+          `col` is assumed to be the same as the first entry of `row`.
+      row: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`.
+        The first row of the operator. Allowed dtypes: `float16`, `float32`,
+          `float64`, `complex64`, `complex128`. Note that the first entry of
+          `row` is assumed to be the same as the first entry of `col`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  This hint is passed through as given; it is not auto-set.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`.
+    """
+
+    with ops.name_scope(name, values=[row, col]):
+      self._row = ops.convert_to_tensor(row, name="row")
+      self._col = ops.convert_to_tensor(col, name="col")
+      self._check_row_col(self._row, self._col)
+
+      circulant_col = array_ops.concat(
+          [self._col,
+           array_ops.zeros_like(self._col[..., 0:1]),
+           array_ops.reverse(self._row[..., 1:], axis=[-1])], axis=-1)
+
+      # To be used for matmul.
+      self._circulant = linear_operator_circulant.LinearOperatorCirculant(
+          fft_ops.fft(_to_complex(circulant_col)),
+          input_output_dtype=self._row.dtype)
+
+      if is_square is False:  # pylint:disable=g-bool-id-comparison
+        raise ValueError("Only square Toeplitz operators currently supported.")
+      is_square = True
+
+      super(LinearOperatorToeplitz, self).__init__(
+          dtype=self._row.dtype,
+          graph_parents=[self._row, self._col],
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  def _check_row_col(self, row, col):
+    """Static check of row and column."""
+    for name, tensor in [["row", row], ["col", col]]:
+      if tensor.get_shape().ndims is not None and tensor.get_shape().ndims < 1:
+        raise ValueError("Argument {} must have at least 1 dimension.  "
+                         "Found: {}".format(name, tensor))
+
+    if row.get_shape()[-1] is not None and col.get_shape()[-1] is not None:
+      if row.get_shape()[-1] != col.get_shape()[-1]:
+        raise ValueError(
+            "Expected square matrix, got row and col with mismatched "
+            "dimensions.")
+
+  def _shape(self):
+    # If the broadcast row/col shape is [5, 3], we return [5, 3, 3].
+    v_shape = array_ops.broadcast_static_shape(
+        self.row.shape, self.col.shape)
+    return v_shape.concatenate(v_shape[-1:])
+
+  def _shape_tensor(self):
+    """Dynamic shape: broadcast row/col shape with its last dim repeated."""
+    vec_shape = array_ops.broadcast_dynamic_shape(
+        array_ops.shape(self.row), array_ops.shape(self.col))
+    last_dim = vec_shape[-1]
+    return array_ops.concat((vec_shape, [last_dim]), 0)
+
+  def _assert_self_adjoint(self):
+    """Assert `row == conj(col)`, i.e. the operator equals its adjoint."""
+    return check_ops.assert_equal(
+        self.row, math_ops.conj(self.col),
+        message=("row and col are not the same, and "
+                 "so this operator is not self-adjoint."))
+
+  # TODO(srvasude): Add efficient solver and determinant calculations to this
+  # class (based on Levinson recursion.)
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    # Given a Toeplitz matrix, we can embed it in a Circulant matrix to perform
+    # efficient matrix multiplications. Given a Toeplitz matrix with first row
+    # [t_0, t_1, ... t_{n-1}] and first column [t0, t_{-1}, ..., t_{-(n-1)}],
+    # let C be the circulant matrix with first column [t0, t_{-1}, ...,
+    # t_{-(n-1)}, 0, t_{n-1}, ..., t_1]. Also adjoin to our input vector `x`
+    # `n` zeros, to make it a vector of length `2n` (call it y). It can be shown
+    # that if we take the first n entries of `Cy`, this is equal to the Toeplitz
+    # multiplication. See:
+    # http://math.mit.edu/icg/resources/teaching/18.085-spring2015/toeplitz.pdf
+    # for more details.
+    x = linalg.adjoint(x) if adjoint_arg else x
+    expanded_x = array_ops.concat([x, array_ops.zeros_like(x)], axis=-2)
+    result = self._circulant.matmul(
+        expanded_x, adjoint=adjoint, adjoint_arg=False)
+    # Keep only the first n rows of `Cy`; the rest comes from the zero padding.
+    return math_ops.cast(
+        result[..., :self.domain_dimension_tensor(), :],
+        self.dtype)
+
+  def _trace(self):
+    # Every diagonal entry equals col[..., 0], so trace = N * col[..., 0].
+    n = math_ops.cast(self.domain_dimension_tensor(), dtype=self.dtype)
+    return n * self.col[..., 0]
+
+  def _diag_part(self):
+    # Broadcast the single diagonal value col[..., 0] across N entries.
+    first = self.col[..., 0:1]
+    return first * array_ops.ones([self.domain_dimension_tensor()], self.dtype)
+
+  @property
+  def col(self):
+    return self._col
+
+  @property
+  def row(self):
+    return self._row
+
+
+def _to_complex(x):
+  """Cast `x` to complex128 if it is float64/complex128, else complex64."""
+  wants_double = x.dtype in (dtypes.float64, dtypes.complex128)
+  target = dtypes.complex128 if wants_double else dtypes.complex64
+  return math_ops.cast(x, target)
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 6c18943..52fef7b 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -146,8 +146,8 @@
   Example broadcasting many batch dims
 
   ```python
-  x = tf.random_normal(shape=(2, 3, 1, 4, 4))
-  y = tf.random_normal(shape=(1, 3, 2, 5, 5))
+  x = tf.random.normal(shape=(2, 3, 1, 4, 4))
+  y = tf.random.normal(shape=(1, 3, 2, 5, 5))
   x_bc, y_bc = broadcast_matrix_batch_dims([x, y])
 
   x_bc.shape
@@ -193,11 +193,11 @@
           bcast_batch_shape,
           mat.get_shape()[:-2])
     if bcast_batch_shape.is_fully_defined():
-      # The [1, 1] at the end will broadcast with anything.
-      bcast_shape = bcast_batch_shape.concatenate([1, 1])
       for i, mat in enumerate(batch_matrices):
         if mat.get_shape()[:-2] != bcast_batch_shape:
-          batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
+          bcast_shape = array_ops.concat(
+              [bcast_batch_shape.as_list(), array_ops.shape(mat)[-2:]], axis=0)
+          batch_matrices[i] = array_ops.broadcast_to(mat, bcast_shape)
       return batch_matrices
 
     # Since static didn't work, do dynamic, which always copies data.
@@ -206,17 +206,15 @@
       bcast_batch_shape = array_ops.broadcast_dynamic_shape(
           bcast_batch_shape,
           array_ops.shape(mat)[:-2])
-    bcast_shape = array_ops.concat([bcast_batch_shape, [1, 1]], axis=0)
     for i, mat in enumerate(batch_matrices):
-      batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
+      batch_matrices[i] = array_ops.broadcast_to(
+          mat,
+          array_ops.concat(
+              [bcast_batch_shape, array_ops.shape(mat)[-2:]], axis=0))
 
     return batch_matrices
 
 
-def _broadcast_to_shape(x, shape):
-  return x + array_ops.zeros(shape=shape, dtype=x.dtype)
-
-
 def cholesky_solve_with_broadcast(chol, rhs, name=None):
   """Solve systems of linear equations."""
   with ops.name_scope(name, "CholeskySolveWithBroadcast", [chol, rhs]):
@@ -224,121 +222,6 @@
     return linalg_ops.cholesky_solve(chol, rhs)
 
 
-def matmul_with_broadcast(a,
-                          b,
-                          transpose_a=False,
-                          transpose_b=False,
-                          adjoint_a=False,
-                          adjoint_b=False,
-                          a_is_sparse=False,
-                          b_is_sparse=False,
-                          name=None):
-  """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
-
-  Works identically to `tf.matmul`, but broadcasts batch dims
-  of `a` and `b` if they are determined statically to be different, or if static
-  shapes are not fully defined. Attempts are made to avoid unnecessary
-  replication of data, but this is not always possible.
-
-  The inputs must be matrices (or tensors of rank > 2, representing batches of
-  matrices).
-
-  Both matrices must be of the same type. The supported types are:
-  `float16`, `float32`, `float64`, `int32`, `complex64`, `complex128`.
-
-  Either matrix can be transposed or adjointed (conjugated and transposed) on
-  the fly by setting one of the corresponding flag to `True`. These are `False`
-  by default.
-
-  If one or both of the matrices contain a lot of zeros, a more efficient
-  multiplication algorithm can be used by setting the corresponding
-  `a_is_sparse` or `b_is_sparse` flag to `True`. These are `False` by default.
-  This optimization is only available for plain matrices (rank-2 tensors) with
-  datatypes `bfloat16` or `float32`.
-
-  For example:
-
-  ```python
-  # A 2-batch of 3x4 matrices
-  a = tf.random_normal(shape=(2, 3, 4))
-
-  # A single 4x5 matrix
-  b = tf.random_normal(shape=(4, 5))
-
-  result = matmul_with_broadcast(a, b)
-
-  result.shape
-  ==> (2, 3, 5)
-
-  result[0,...]
-  ==> tf.matmul(a[0,...], b)
-
-  result[1,...]
-  ==> tf.matmul(a[1,...], b)
-  ```
-
-  Args:
-    a: `Tensor` of type `float16`, `float32`, `float64`, `int32`, `complex64`,
-      `complex128` and `rank > 1`.
-    b: `Tensor` with same type as `a` having compatible matrix dimensions and
-      broadcastable batch dimensions.
-    transpose_a: If `True`, `a` is transposed before multiplication.
-    transpose_b: If `True`, `b` is transposed before multiplication.
-    adjoint_a: If `True`, `a` is conjugated and transposed before
-      multiplication.
-    adjoint_b: If `True`, `b` is conjugated and transposed before
-      multiplication.
-    a_is_sparse: If `True`, `a` is treated as a sparse matrix.
-    b_is_sparse: If `True`, `b` is treated as a sparse matrix.
-    name: Name for the operation (optional).
-
-  Returns:
-    A `Tensor` of the same type as `a` and `b` where each inner-most matrix is
-    the product of the corresponding matrices in `a` and `b`, e.g. if all
-    transpose or adjoint attributes are `False`:
-
-    The leading shape of `output` is the result of broadcasting the leading
-    dimensions of `a` and `b`.
-
-    `output`[..., i, j] = sum_k (`a`[..., i, k] * `b`[..., k, j]),
-    for all indices i, j.
-
-    Note: This is matrix product, not element-wise product.
-
-
-  Raises:
-    ValueError: If transpose_a and adjoint_a, or transpose_b and adjoint_b
-      are both set to True.
-  """
-  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
-    a = ops.convert_to_tensor(a, name="a")
-    b = ops.convert_to_tensor(b, name="b", dtype=a.dtype)
-
-    # If either a or b has extra dims, we can reshape to get rid of them.
-    a, b, reshape_inv, still_need_to_transpose = _reshape_for_efficiency(
-        a,
-        b,
-        transpose_a=transpose_a,
-        transpose_b=transpose_b,
-        adjoint_a=adjoint_a,
-        adjoint_b=adjoint_b)
-
-    # This will broadcast by brute force if we still need to.
-    a, b = broadcast_matrix_batch_dims([a, b])
-
-    a_times_b = math_ops.matmul(
-        a,
-        b,
-        transpose_a=transpose_a and still_need_to_transpose,
-        transpose_b=transpose_b and still_need_to_transpose,
-        adjoint_a=adjoint_a and still_need_to_transpose,
-        adjoint_b=adjoint_b and still_need_to_transpose,
-        a_is_sparse=a_is_sparse,
-        b_is_sparse=b_is_sparse)
-
-    return reshape_inv(a_times_b)
-
-
 def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
   """Solve systems of linear equations."""
   with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
@@ -365,7 +248,7 @@
                                            name=None):
   """Solves triangular systems of linear equations with by backsubstitution.
 
-  Works identically to `tf.matrix_triangular_solve`, but broadcasts batch dims
+  Works identically to `tf.linalg.triangular_solve`, but broadcasts batch dims
   of `matrix` and `rhs` (by replicating) if they are determined statically to be
   different, or if static shapes are not fully defined.  Thus, this may result
   in an inefficient replication of data.
diff --git a/tensorflow/python/ops/linalg/matmul_registrations.py b/tensorflow/python/ops/linalg/matmul_registrations.py
index e0ac988..f624351 100644
--- a/tensorflow/python/ops/linalg/matmul_registrations.py
+++ b/tensorflow/python/ops/linalg/matmul_registrations.py
@@ -26,66 +26,7 @@
 from tensorflow.python.ops.linalg import linear_operator_identity
 from tensorflow.python.ops.linalg import linear_operator_lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_zeros
-
-
-def _combined_self_adjoint_hint(operator_a, operator_b):
-  """Get combined hint for self-adjoint-ness."""
-  # Note: only use this method in the commuting case.
-  # The property is preserved under composition when the operators commute.
-  if operator_a.is_self_adjoint and operator_b.is_self_adjoint:
-    return True
-
-  # The property is not preserved when an operator with the property is composed
-  # with an operator without the property.
-  if ((operator_a.is_self_adjoint is True and
-       operator_b.is_self_adjoint is False) or
-      (operator_a.is_self_adjoint is False and
-       operator_b.is_self_adjoint is True)):
-    return False
-
-  # The property is not known when operators are not known to have the property
-  # or both operators don't have the property (the property for the complement
-  # class is not closed under composition).
-  return None
-
-
-def _is_square(operator_a, operator_b):
-  """Return a hint to whether the composition is square."""
-  if operator_a.is_square and operator_b.is_square:
-    return True
-  if operator_a.is_square is False and operator_b.is_square is False:
-    # Let A have shape [B, M, N], B have shape [B, N, L].
-    m = operator_a.range_dimension
-    l = operator_b.domain_dimension
-    if m is not None and l is not None:
-      return m == l
-
-    return None
-
-
-def _combined_positive_definite_hint(operator_a, operator_b):
-  """Get combined PD hint for compositions."""
-  # Note: Positive definiteness is only guaranteed to be preserved
-  # when the operators commute and are symmetric. Only use this method in
-  # commuting cases.
-
-  if (operator_a.is_positive_definite is True and
-      operator_a.is_self_adjoint is True and
-      operator_b.is_positive_definite is True and
-      operator_b.is_self_adjoint is True):
-    return True
-
-  return None
-
-
-def _combined_non_singular_hint(operator_a, operator_b):
-  """Get combined hint for when ."""
-  # If either operator is not-invertible the composition isn't.
-  if (operator_a.is_non_singular is False or
-      operator_b.is_non_singular is False):
-    return False
-
-  return operator_a.is_non_singular and operator_b.is_non_singular
+from tensorflow.python.ops.linalg import registrations_util
 
 
 # By default, use a LinearOperatorComposition to delay the computation.
@@ -93,15 +34,15 @@
     linear_operator.LinearOperator, linear_operator.LinearOperator)
 def _matmul_linear_operator(linop_a, linop_b):
   """Generic matmul of two `LinearOperator`s."""
-  is_square = _is_square(linop_a, linop_b)
+  is_square = registrations_util.is_square(linop_a, linop_b)
   is_non_singular = None
   is_self_adjoint = None
   is_positive_definite = None
 
   if is_square:
-    is_non_singular = _combined_non_singular_hint(linop_a, linop_b)
-    is_self_adjoint = _combined_self_adjoint_hint(linop_a, linop_b)
-  elif is_square is False:
+    is_non_singular = registrations_util.combined_non_singular_hint(
+        linop_a, linop_b)
+  elif is_square is False:  # pylint:disable=g-bool-id-comparison
     is_non_singular = False
     is_self_adjoint = False
     is_positive_definite = False
@@ -165,11 +106,13 @@
 def _matmul_linear_operator_diag(linop_a, linop_b):
   return linear_operator_diag.LinearOperatorDiag(
       diag=linop_a.diag * linop_b.diag,
-      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
-      is_self_adjoint=_combined_self_adjoint_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_a, linop_b),
-      is_positive_definite=_combined_positive_definite_hint(
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
           linop_a, linop_b),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_a, linop_b)),
       is_square=True)
 
 
@@ -180,12 +123,13 @@
     linop_diag, linop_scaled_identity):
   return linear_operator_diag.LinearOperatorDiag(
       diag=linop_diag.diag * linop_scaled_identity.multiplier,
-      is_non_singular=_combined_non_singular_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_diag, linop_scaled_identity),
-      is_self_adjoint=_combined_self_adjoint_hint(
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
           linop_diag, linop_scaled_identity),
-      is_positive_definite=_combined_positive_definite_hint(
-          linop_diag, linop_scaled_identity),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_diag, linop_scaled_identity)),
       is_square=True)
 
 
@@ -196,12 +140,13 @@
     linop_scaled_identity, linop_diag):
   return linear_operator_diag.LinearOperatorDiag(
       diag=linop_diag.diag * linop_scaled_identity.multiplier,
-      is_non_singular=_combined_non_singular_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_diag, linop_scaled_identity),
-      is_self_adjoint=_combined_self_adjoint_hint(
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
           linop_diag, linop_scaled_identity),
-      is_positive_definite=_combined_positive_definite_hint(
-          linop_diag, linop_scaled_identity),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_diag, linop_scaled_identity)),
       is_square=True)
 
 
@@ -211,11 +156,11 @@
 def _matmul_linear_operator_diag_tril(linop_diag, linop_triangular):
   return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
       tril=linop_diag.diag[..., None] * linop_triangular.to_dense(),
-      is_non_singular=_combined_non_singular_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_diag, linop_triangular),
       # This is safe to do since the Triangular matrix is only self-adjoint
       # when it is a diagonal matrix, and hence commutes.
-      is_self_adjoint=_combined_self_adjoint_hint(
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
           linop_diag, linop_triangular),
       is_positive_definite=None,
       is_square=True)
@@ -227,11 +172,11 @@
 def _matmul_linear_operator_tril_diag(linop_triangular, linop_diag):
   return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
       tril=linop_triangular.to_dense() * linop_diag.diag,
-      is_non_singular=_combined_non_singular_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_diag, linop_triangular),
       # This is safe to do since the Triangular matrix is only self-adjoint
       # when it is a diagonal matrix, and hence commutes.
-      is_self_adjoint=_combined_self_adjoint_hint(
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
           linop_diag, linop_triangular),
       is_positive_definite=None,
       is_square=True)
@@ -245,8 +190,11 @@
 def _matmul_linear_operator_circulant_circulant(linop_a, linop_b):
   return linear_operator_circulant.LinearOperatorCirculant(
       spectrum=linop_a.spectrum * linop_b.spectrum,
-      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
-      is_self_adjoint=_combined_self_adjoint_hint(linop_a, linop_b),
-      is_positive_definite=_combined_positive_definite_hint(
+      is_non_singular=registrations_util.combined_non_singular_hint(
           linop_a, linop_b),
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_a, linop_b)),
       is_square=True)
diff --git a/tensorflow/python/ops/linalg/registrations_util.py b/tensorflow/python/ops/linalg/registrations_util.py
new file mode 100644
index 0000000..c707a67
--- /dev/null
+++ b/tensorflow/python/ops/linalg/registrations_util.py
@@ -0,0 +1,91 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common utilities for registering LinearOperator methods."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# Note: only use this method in the commuting case.
+def combined_commuting_self_adjoint_hint(operator_a, operator_b):
+  """Return the self-adjoint hint for a product of two commuting operators."""
+
+  # The property is preserved under composition when the operators commute.
+  if operator_a.is_self_adjoint and operator_b.is_self_adjoint:
+    return True
+
+  # The property is not preserved when an operator with the property is composed
+  # with an operator without the property.
+
+  # pylint:disable=g-bool-id-comparison
+  if ((operator_a.is_self_adjoint is True and
+       operator_b.is_self_adjoint is False) or
+      (operator_a.is_self_adjoint is False and
+       operator_b.is_self_adjoint is True)):
+    return False
+  # pylint:enable=g-bool-id-comparison
+
+  # Otherwise the result is unknown: either some hint is None, or both hints
+  # are False (the product of two non-self-adjoint operators may or may not be
+  # self-adjoint).
+  return None
+
+
+def is_square(operator_a, operator_b):
+  """Return a hint (True/False/None) for whether the composition is square."""
+  if operator_a.is_square and operator_b.is_square:
+    return True
+  if operator_a.is_square is False and operator_b.is_square is False:  # pylint:disable=g-bool-id-comparison
+    # Let A have shape [B, M, N], B have shape [B, N, L].
+    m = operator_a.range_dimension
+    l = operator_b.domain_dimension
+    if m is not None and l is not None:
+      return m == l
+  # Exactly one operator is known square and the other known non-square.
+  if (operator_a.is_square != operator_b.is_square) and (
+      operator_a.is_square is not None and operator_b.is_square is not None):
+    return False
+
+  return None
+
+
+# Note: Positive definiteness is only guaranteed to be preserved
+# when the operators commute and are self-adjoint. Only use this method in
+# commuting cases.
+def combined_commuting_positive_definite_hint(operator_a, operator_b):
+  """Get combined positive-definite hint for commuting compositions."""
+  # pylint:disable=g-bool-id-comparison
+  if (operator_a.is_positive_definite is True and
+      operator_a.is_self_adjoint is True and
+      operator_b.is_positive_definite is True and
+      operator_b.is_self_adjoint is True):
+    return True
+  # pylint:enable=g-bool-id-comparison
+
+  return None
+
+
+def combined_non_singular_hint(operator_a, operator_b):
+  """Get combined hint for non-singularity of the composition."""
+  # If either operator is not-invertible the composition isn't.
+
+  # pylint:disable=g-bool-id-comparison
+  if (operator_a.is_non_singular is False or
+      operator_b.is_non_singular is False):
+    return False
+  # pylint:enable=g-bool-id-comparison
+
+  return operator_a.is_non_singular and operator_b.is_non_singular
diff --git a/tensorflow/python/ops/linalg/solve_registrations.py b/tensorflow/python/ops/linalg/solve_registrations.py
new file mode 100644
index 0000000..cfdce44
--- /dev/null
+++ b/tensorflow/python/ops/linalg/solve_registrations.py
@@ -0,0 +1,164 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.solve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_composition
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+from tensorflow.python.ops.linalg import registrations_util
+
+
+# By default, use a LinearOperatorComposition to delay the computation.
+@linear_operator_algebra.RegisterSolve(
+    linear_operator.LinearOperator, linear_operator.LinearOperator)
+def _solve_linear_operator(linop_a, linop_b):
+  """Generic solve of two `LinearOperator`s."""
+  is_square = registrations_util.is_square(linop_a, linop_b)
+  is_non_singular = None
+  is_self_adjoint = None
+  is_positive_definite = None
+
+  if is_square:
+    is_non_singular = registrations_util.combined_non_singular_hint(
+        linop_a, linop_b)
+  elif is_square is False:  # pylint:disable=g-bool-id-comparison
+    is_non_singular = False
+    is_self_adjoint = False
+    is_positive_definite = False
+
+  return linear_operator_composition.LinearOperatorComposition(
+      operators=[
+          linear_operator_inversion.LinearOperatorInversion(linop_a),
+          linop_b
+      ],
+      is_non_singular=is_non_singular,
+      is_self_adjoint=is_self_adjoint,
+      is_positive_definite=is_positive_definite,
+      is_square=is_square,
+  )
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_inversion.LinearOperatorInversion,
+    linear_operator.LinearOperator)
+def _solve_inverse_linear_operator(linop_a, linop_b):
+  """Solve against a `LinearOperatorInversion` by a matmul with `.operator`."""
+  return linop_a.operator.matmul(linop_b)
+
+
+# Identity
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_identity.LinearOperatorIdentity,
+    linear_operator.LinearOperator)
+def _solve_linear_operator_identity_left(identity, linop):
+  del identity
+  return linop
+
+
+# Diag.
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_diag.LinearOperatorDiag)
+def _solve_linear_operator_diag(linop_a, linop_b):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_b.diag / linop_a.diag,
+      is_non_singular=registrations_util.combined_non_singular_hint(
+          linop_a, linop_b),
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_a, linop_b)),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _solve_linear_operator_diag_scaled_identity_right(
+    linop_diag, linop_scaled_identity):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_scaled_identity.multiplier / linop_diag.diag,
+      is_non_singular=registrations_util.combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_diag, linop_scaled_identity)),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_identity.LinearOperatorScaledIdentity,
+    linear_operator_diag.LinearOperatorDiag)
+def _solve_linear_operator_diag_scaled_identity_left(
+    linop_scaled_identity, linop_diag):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag / linop_scaled_identity.multiplier,
+      is_non_singular=registrations_util.combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_diag, linop_scaled_identity)),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular)
+def _solve_linear_operator_diag_tril(linop_diag, linop_triangular):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_triangular.to_dense() / linop_diag.diag[..., None],
+      is_non_singular=registrations_util.combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+
+# Circulant.
+
+
+@linear_operator_algebra.RegisterSolve(
+    linear_operator_circulant.LinearOperatorCirculant,
+    linear_operator_circulant.LinearOperatorCirculant)
+def _solve_linear_operator_circulant_circulant(linop_a, linop_b):
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=linop_b.spectrum / linop_a.spectrum,
+      is_non_singular=registrations_util.combined_non_singular_hint(
+          linop_a, linop_b),
+      is_self_adjoint=registrations_util.combined_commuting_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=(
+          registrations_util.combined_commuting_positive_definite_hint(
+              linop_a, linop_b)),
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index f6c9d2c..bceca2d 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -27,6 +27,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -465,3 +466,90 @@
 
     grad_a.set_shape(a_shape)
     return grad_a
+
+
+@ops.RegisterGradient("TridiagonalSolve")
+def _TridiagonalSolveGrad(op, grad):
+  """Gradient for TridiagonalSolve: returns (grad_diags, grad_rhs)."""
+  diags = op.inputs[0]
+  x = op.outputs[0]
+
+  # Transposing the matrix within tridiagonal_solve kernel by interchanging
+  # superdiagonal and subdiagonal wouldn't work on GPU due to mismatch with
+  # paddings required by cusparse*gtsv routines.
+  # So constructing the transposed matrix in Python.
+  diags_transposed = _TransposeTridiagonalMatrix(diags)
+
+  grad_rhs = linalg_ops.tridiagonal_solve(diags_transposed, grad)
+  grad_diags = -_MatmulExtractingThreeDiagonals(grad_rhs, x)
+  return grad_diags, grad_rhs
+
+
+def _TransposeTridiagonalMatrix(diags):
+  """Transposes a tridiagonal matrix.
+
+  Args:
+    diags: the diagonals of the input matrix in the compact form (see
+      linalg_ops.tridiagonal_solve).
+
+  Returns:
+    Diagonals of the transposed matrix in the compact form.
+  """
+
+  diag = diags[..., 1, :]
+
+  if diags.shape.is_fully_defined():
+    # For fully defined tensor we can concat with a tensor of zeros, which is
+    # faster than using array_ops.pad().
+    zeros = array_ops.zeros(list(diags.shape[:-2]) + [1], dtype=diags.dtype)
+    superdiag = array_ops.concat((diags[..., 2, 1:], zeros), axis=-1)
+    subdiag = array_ops.concat((zeros, diags[..., 0, :-1]), axis=-1)
+  else:
+    rank = array_ops.rank(diags)
+    zeros = array_ops.zeros((rank - 2, 2), dtype=dtypes.int32)
+    superdiag_pad = array_ops.concat((zeros, array_ops.constant([[0, 1]])),
+                                     axis=0)
+    superdiag = array_ops.pad(diags[..., 2, 1:], superdiag_pad)
+    subdiag_pad = array_ops.concat((zeros, array_ops.constant([[1, 0]])),
+                                   axis=0)
+    subdiag = array_ops.pad(diags[..., 0, :-1], subdiag_pad)
+  return array_ops.stack([superdiag, diag, subdiag], axis=-2)
+
+
+def _MatmulExtractingThreeDiagonals(x, y_tr):
+  """Multiplies matrices and extracts three diagonals from the product.
+
+  With sizes M x K and K x M, this function takes O(MK) time and O(M) space,
+  while using math_ops.matmul, and then extracting the diagonals would take
+  O(M^2 K) time and O(M^2) space.
+
+  Args:
+    x: first matrix
+    y_tr: second matrix transposed
+
+  Returns:
+    Diagonals of the product in compact format (see
+    linalg_ops.tridiagonal_solve)
+
+  """
+  diag = math_ops.reduce_sum(x * y_tr, axis=-1)
+
+  if y_tr.shape.is_fully_defined():
+    zeros = array_ops.zeros(
+        list(x.shape[:-2]) + [1, x.shape[-1]], dtype=x.dtype)
+    superdiag = math_ops.reduce_sum(
+        x * array_ops.concat((y_tr[..., 1:, :], zeros), axis=-2), axis=-1)
+    subdiag = math_ops.reduce_sum(
+        x * array_ops.concat((zeros, y_tr[..., :-1, :]), axis=-2), axis=-1)
+  else:
+    rank = array_ops.rank(y_tr)
+    zeros = array_ops.zeros((rank - 2, 2), dtype=dtypes.int32)
+    superdiag_pad = array_ops.concat(
+        (zeros, array_ops.constant([[0, 1], [0, 0]])), axis=0)
+    superdiag = math_ops.reduce_sum(
+        x * array_ops.pad(y_tr[..., 1:, :], superdiag_pad), axis=-1)
+    subdiag_pad = array_ops.concat(
+        (zeros, array_ops.constant([[1, 0], [0, 0]])), axis=0)
+    subdiag = math_ops.reduce_sum(
+        x * array_ops.pad(y_tr[..., :-1, :], subdiag_pad), axis=-1)
+  return array_ops.stack([superdiag, diag, subdiag], axis=-2)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index b67e1897..b822519 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -89,8 +89,8 @@
   # Solve 10 separate 2x2 linear systems:
   A = ... # shape 10 x 2 x 2
   RHS = ... # shape 10 x 2 x 1
-  chol = tf.cholesky(A)  # shape 10 x 2 x 2
-  X = tf.cholesky_solve(chol, RHS)  # shape 10 x 2 x 1
+  chol = tf.linalg.cholesky(A)  # shape 10 x 2 x 2
+  X = tf.linalg.cholesky_solve(chol, RHS)  # shape 10 x 2 x 1
   # tf.matmul(A, X) ~ RHS
   X[3, :, 0]  # Solution to the linear system A[3, :, :] x = RHS[3, :, 0]
 
@@ -103,7 +103,7 @@
 
   Args:
     chol:  A `Tensor`.  Must be `float32` or `float64`, shape is `[..., M, M]`.
-      Cholesky factorization of `A`, e.g. `chol = tf.cholesky(A)`.
+      Cholesky factorization of `A`, e.g. `chol = tf.linalg.cholesky(A)`.
       For that reason, only the lower triangular parts (including the diagonal)
       of the last two dimensions of `chol` are used.  The strictly upper part is
       assumed to be zero and not accessed.
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index f05fbf4..0980fa3 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Logging and Summary Operations."""
 # pylint: disable=protected-access
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import pprint
 import random
 import sys
@@ -26,6 +26,7 @@
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.compat import compat
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -48,12 +49,10 @@
 except NameError:
   pass
 
-
 # The python wrapper for Assert is in control_flow_ops, as the Assert
 # call relies on certain conditionals for its dependencies.  Use
 # control_flow_ops.Assert.
 
-
 # Assert and Print are special symbols in python, so we must
 # have an upper-case version of them.
 #
@@ -64,15 +63,15 @@
 
 # pylint: disable=invalid-name
 @deprecated("2018-08-20", "Use tf.print instead of tf.Print. Note that "
-                          "tf.print returns a no-output operator that directly "
-                          "prints the output. Outside of defuns or eager mode, "
-                          "this operator will not be executed unless it is "
-                          "directly specified in session.run or used as a "
-                          "control dependency for other operators. This is "
-                          "only a concern in graph mode. Below is an example "
-                          "of how to ensure tf.print executes in graph mode:\n"
-                          """```python
-    sess = tf.Session()
+            "tf.print returns a no-output operator that directly "
+            "prints the output. Outside of defuns or eager mode, "
+            "this operator will not be executed unless it is "
+            "directly specified in session.run or used as a "
+            "control dependency for other operators. This is "
+            "only a concern in graph mode. Below is an example "
+            "of how to ensure tf.print executes in graph mode:\n"
+            """```python
+    sess = tf.compat.v1.Session()
     with sess.as_default():
         tensor = tf.range(10)
         print_op = tf.print(tensor)
@@ -86,8 +85,7 @@
   `from __future__ import print_function`
 """)
 @tf_export(v1=["Print"])
-def Print(input_, data, message=None, first_n=None, summarize=None,
-          name=None):
+def Print(input_, data, message=None, first_n=None, summarize=None, name=None):
   """Prints a list of tensors.
 
   This is an identity op (behaves like `tf.identity`) with the side effect
@@ -102,15 +100,17 @@
     data: A list of tensors to print out when op is evaluated.
     message: A string, prefix of the error message.
     first_n: Only log `first_n` number of times. Negative numbers log always;
-             this is the default.
+      this is the default.
     summarize: Only print this many entries of each tensor. If None, then a
-               maximum of 3 elements are printed per input tensor.
+      maximum of 3 elements are printed per input tensor.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor`. Has the same type and contents as `input_`.
   """
   return gen_logging_ops._print(input_, data, message, first_n, summarize, name)
+
+
 # pylint: enable=invalid-name
 
 
@@ -159,7 +159,7 @@
   Example:
     Single-input usage:
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
     tensor = tf.range(10)
     tf.print(tensor, output_stream=sys.stderr)
     ```
@@ -167,7 +167,7 @@
 
     Multi-input usage:
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
     tensor = tf.range(10)
     tf.print("tensors:", tensor, {2: tensor * 2}, output_stream=sys.stdout)
     ```
@@ -176,7 +176,7 @@
 
     Usage in a defun:
     ```python
-    tf.enable_eager_execution()
+    tf.compat.v1.enable_eager_execution()
 
     @tf.contrib.eager.defun
     def f():
@@ -190,7 +190,7 @@
 
     Usage when constructing graphs:
     ```python
-    sess = tf.Session()
+    sess = tf.compat.v1.Session()
     with sess.as_default():
         tensor = tf.range(10)
         print_op = tf.print("tensors:", tensor, {2: tensor * 2},
@@ -208,18 +208,22 @@
   Args:
     *inputs: Positional arguments that are the inputs to print. Inputs in the
       printed output will be separated by spaces. Inputs may be python
-      primitives, tensors, data structures such as dicts and lists that
-      may contain tensors (with the data structures possibly nested in
-      arbitrary ways), and printable python objects.
+      primitives, tensors, data structures such as dicts and lists that may
+      contain tensors (with the data structures possibly nested in arbitrary
+      ways), and printable python objects.
     output_stream: The output stream, logging level, or file to print to.
-      Defaults to sys.stderr, but sys.stdout, tf.logging.info,
-      tf.logging.warning, and tf.logging.error are also supported. To print to
+      Defaults to sys.stderr, but sys.stdout, tf.compat.v1.logging.info,
+      tf.compat.v1.logging.warning, and tf.compat.v1.logging.error are also
+      supported. To print to
       a file, pass a string started with "file://" followed by the file path,
       e.g., "file:///tmp/foo.out".
     summarize: The first and last `summarize` elements within each dimension are
       recursively printed per Tensor. If None, then the first 3 and last 3
       elements of each dimension are printed for each tensor. If set to -1, it
       will print all elements of every tensor.
+    sep: The string to use to separate the inputs. Defaults to " ".
+    end: End character that is appended at the end of the printed string.
+      Defaults to the newline character.
     name: A name for the operation (optional).
 
   Returns:
@@ -236,6 +240,8 @@
   output_stream = kwargs.pop("output_stream", sys.stderr)
   name = kwargs.pop("name", None)
   summarize = kwargs.pop("summarize", 3)
+  sep = kwargs.pop("sep", " ")
+  end = kwargs.pop("end", os.linesep)
   if kwargs:
     raise ValueError("Unrecognized keyword arguments for tf.print: %s" % kwargs)
   format_name = None
@@ -261,17 +267,17 @@
   else:
     output_stream_string = output_stream_to_constant.get(output_stream)
     if not output_stream_string:
-      raise ValueError(
-          "Unsupported output stream, logging level, or file." +
-          str(output_stream) + ". Supported streams are sys.stdout, "
-          "sys.stderr, tf.logging.info, "
-          "tf.logging.warning, tf.logging.error. " +
-          "File needs to be in the form of 'file://<filepath>'.")
+      raise ValueError("Unsupported output stream, logging level, or file." +
+                       str(output_stream) +
+                       ". Supported streams are sys.stdout, "
+                       "sys.stderr, tf.logging.info, "
+                       "tf.logging.warning, tf.logging.error. " +
+                       "File needs to be in the form of 'file://<filepath>'.")
 
   # If we are only printing a single string scalar, there is no need to format
-  if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0])
-      and (not isinstance(inputs[0], sparse_tensor.SparseTensor))
-      and (inputs[0].shape.ndims == 0)and (inputs[0].dtype == dtypes.string)):
+  if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0]) and
+      (not isinstance(inputs[0], sparse_tensor.SparseTensor)) and
+      (inputs[0].shape.ndims == 0) and (inputs[0].dtype == dtypes.string)):
     formatted_string = inputs[0]
   # Otherwise, we construct an appropriate template for the tensors we are
   # printing, and format the template using those tensors.
@@ -282,10 +288,9 @@
     templates = []
     tensors = []
     tensor_free_structure = nest.map_structure(
-        lambda x: "" if tensor_util.is_tensor(x) else x,
-        inputs)
-    tensor_free_template = " ".join(pprint.pformat(x)
-                                    for x in tensor_free_structure)
+        lambda x: "" if tensor_util.is_tensor(x) else x, inputs)
+    tensor_free_template = " ".join(
+        pprint.pformat(x) for x in tensor_free_structure)
     placeholder = _generate_placeholder_string(tensor_free_template)
 
     for input_ in inputs:
@@ -300,8 +305,7 @@
           tensors.extend([x.indices, x.values, x.dense_shape])
           placeholders.append(
               "SparseTensor(indices={}, values={}, shape={})".format(
-                  placeholder, placeholder, placeholder)
-          )
+                  placeholder, placeholder, placeholder))
         elif tensor_util.is_tensor(x):
           tensors.append(x)
           placeholders.append(placeholder)
@@ -332,16 +336,25 @@
     # the formatted/printed output will not contain quotes around tensors.
     # (example of where these quotes might appear: if we have added a
     # placeholder string into a list, then pretty-formatted that list)
-    template = " ".join(templates)
+    template = sep.join(templates)
     template = template.replace("'" + placeholder + "'", placeholder)
     formatted_string = string_ops.string_format(
-        inputs=tensors, template=template, placeholder=placeholder,
+        inputs=tensors,
+        template=template,
+        placeholder=placeholder,
         summarize=summarize,
         name=format_name)
 
-  return gen_logging_ops.print_v2(formatted_string,
-                                  output_stream=output_stream_string,
-                                  name=name)
+  if compat.forward_compatible(2019, 5, 27):
+    return gen_logging_ops.print_v2(
+        formatted_string, output_stream=output_stream_string, name=name,
+        end=end)
+  else:
+    if end == os.linesep:
+      end = ""
+    return gen_logging_ops.print_v2(
+        formatted_string + end, output_stream=output_stream_string, name=name)
+
 # pylint: enable=g-doc-args
 
 
@@ -369,7 +382,8 @@
   This ops is deprecated. Please switch to tf.summary.histogram.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look
+  ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The generated
   [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
@@ -379,8 +393,8 @@
 
   Args:
     tag: A `string` `Tensor`. 0-D.  Tag to use for the summary value.
-    values: A real numeric `Tensor`. Any shape. Values to use to
-      build the histogram.
+    values: A real numeric `Tensor`. Any shape. Values to use to build the
+      histogram.
     collections: Optional list of graph collections keys. The new summary op is
       added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
     name: A name for the operation (optional).
@@ -390,8 +404,7 @@
     buffer.
   """
   with ops.name_scope(name, "HistogramSummary", [tag, values]) as scope:
-    val = gen_logging_ops.histogram_summary(
-        tag=tag, values=values, name=scope)
+    val = gen_logging_ops.histogram_summary(tag=tag, values=values, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
@@ -407,7 +420,8 @@
   """Outputs a `Summary` protocol buffer with images.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look
+  ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_images` summary values containing images. The
   images are built from `tensor` which must be 4-D with shape `[batch_size,
@@ -437,8 +451,8 @@
      generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 
   Args:
-    tag: A scalar `Tensor` of type `string`. Used to build the `tag`
-      of the summary values.
+    tag: A scalar `Tensor` of type `string`. Used to build the `tag` of the
+      summary values.
     tensor: A 4-D `uint8` or `float32` `Tensor` of shape `[batch_size, height,
       width, channels]` where `channels` is 1, 3, or 4.
     max_images: Max number of batch elements to generate images for.
@@ -473,7 +487,8 @@
 
   This op is deprecated. Please switch to tf.summary.audio.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look
+  ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_outputs` summary values containing audio. The
   audio is built from `tensor` which must be 3-D with shape `[batch_size,
@@ -489,8 +504,8 @@
      generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 
   Args:
-    tag: A scalar `Tensor` of type `string`. Used to build the `tag`
-      of the summary values.
+    tag: A scalar `Tensor` of type `string`. Used to build the `tag` of the
+      summary values.
     tensor: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]`
       or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`.
     sample_rate: A Scalar `float32` `Tensor` indicating the sample rate of the
@@ -505,8 +520,8 @@
     buffer.
   """
   with ops.name_scope(name, "AudioSummary", [tag, tensor]) as scope:
-    sample_rate = ops.convert_to_tensor(sample_rate, dtype=dtypes.float32,
-                                        name="sample_rate")
+    sample_rate = ops.convert_to_tensor(
+        sample_rate, dtype=dtypes.float32, name="sample_rate")
     val = gen_logging_ops.audio_summary_v2(
         tag=tag,
         tensor=tensor,
@@ -522,7 +537,8 @@
   # pylint: disable=line-too-long
   """Merges summaries.
 
-  This op is deprecated. Please switch to tf.summary.merge, which has identical
+  This op is deprecated. Please switch to tf.compat.v1.summary.merge, which has
+  identical
   behavior.
 
   This op creates a
@@ -554,7 +570,8 @@
 def merge_all_summaries(key=ops.GraphKeys.SUMMARIES):
   """Merges all summaries collected in the default graph.
 
-  This op is deprecated. Please switch to tf.summary.merge_all, which has
+  This op is deprecated. Please switch to tf.compat.v1.summary.merge_all, which
+  has
   identical behavior.
 
   Args:
@@ -610,7 +627,8 @@
 
   This ops is deprecated. Please switch to tf.summary.scalar.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look
+  ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The input `tags` and `values` must have the same shape.  The generated
   summary has a summary value for each tag-value pair in `tags` and `values`.
@@ -631,6 +649,7 @@
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
+
 ops.NotDifferentiable("HistogramSummary")
 ops.NotDifferentiable("ImageSummary")
 ops.NotDifferentiable("AudioSummary")
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 4b0de84..4b56b65 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -66,7 +66,8 @@
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
-  See the [Low Level Intro](https://www.tensorflow.org/guide/low_level_intro#feature_columns)
+  See the [Low Level
+  Intro](https://www.tensorflow.org/guide/low_level_intro#feature_columns)
   guide, for an example of usage.
 
   Args:
@@ -166,8 +167,7 @@
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
     if isinstance(initializer, trackable_base.Trackable):
-      self._initializer = self._track_trackable(
-          initializer, "_initializer")
+      self._initializer = self._track_trackable(initializer, "_initializer")
     with ops.init_scope():
       self._resource_handle = self._create_resource()
       self._init_op = self._initialize()
@@ -219,8 +219,9 @@
     with ops.name_scope(
         name, "%s_Lookup" % self.name,
         (self.resource_handle, key_tensor, self._default_value)):
-      values = gen_lookup_ops.lookup_table_find_v2(
-          self.resource_handle, key_tensor, self._default_value)
+      values = gen_lookup_ops.lookup_table_find_v2(self.resource_handle,
+                                                   key_tensor,
+                                                   self._default_value)
 
     values.set_shape(key_tensor.get_shape())
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -411,8 +412,9 @@
     with ops.name_scope(
         self._name, values=(table.resource_handle, self._keys, self._values)):
       if fwd_compat.forward_compatible(2018, 9, 19):
-        init_op = gen_lookup_ops.lookup_table_import_v2(
-            table.resource_handle, self._keys, self._values)
+        init_op = gen_lookup_ops.lookup_table_import_v2(table.resource_handle,
+                                                        self._keys,
+                                                        self._values)
       else:
         # To maintain forward compatibiltiy, use the old implementation.
         init_op = gen_lookup_ops.initialize_table_v2(table.resource_handle,
@@ -521,9 +523,9 @@
       on `delimiter`.
 
     Args:
-      filename: The filename of the text file to be used for initialization.
-        The path must be accessible from wherever the graph is initialized
-        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      filename: The filename of the text file to be used for initialization. The
+        path must be accessible from wherever the graph is initialized (eg.
+        trainer or eval workers). The filename may be a scalar `Tensor`.
       key_dtype: The `key` data type.
       key_index: the index that represents information of a line to get the
         table 'key' values from.
@@ -575,8 +577,7 @@
     self._delimiter = delimiter
     self._name = name
     self._filename = self._track_trackable(
-        trackable.TrackableAsset(filename),
-        "_filename")
+        trackable.TrackableAsset(filename), "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
 
@@ -649,13 +650,13 @@
       on `delimiter`.
 
     Args:
-      filename: The filename of the text file to be used for initialization.
-        The path must be accessible from wherever the graph is initialized
-        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      filename: The filename of the text file to be used for initialization. The
+        path must be accessible from wherever the graph is initialized (eg.
+        trainer or eval workers). The filename may be a scalar `Tensor`.
       key_column_index: The column index from the text file to get the keys
         from. The default is to use the line number, starting from zero.
-      value_column_index: The column index from the text file to get the
-        values from. The default is to use the whole line content.
+      value_column_index: The column index from the text file to get the values
+        from. The default is to use the whole line content.
       vocab_size: The number of elements in the file, if known.
       delimiter: The delimiter to separate fields in a line.
       name: Optional name for the op.
@@ -701,9 +702,9 @@
       on `delimiter`.
 
     Args:
-      filename: The filename of the text file to be used for initialization.
-        The path must be accessible from wherever the graph is initialized
-        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      filename: The filename of the text file to be used for initialization. The
+        path must be accessible from wherever the graph is initialized (eg.
+        trainer or eval workers). The filename may be a scalar `Tensor`.
       key_column_index: The column index from the text file to get the `key`
         values from. The default is to use the whole line content.
       value_column_index: The column index from the text file to get the `value`
@@ -832,8 +833,8 @@
         assignation of out-of-vocabulary buckets  (optional).
       name: A name for the operation (optional).
       key_dtype: Data type of keys passed to `lookup`. Defaults to
-        `table.key_dtype` if `table` is specified, otherwise `tf.string`.
-        Must be string or integer, and must be castable to `table.key_dtype`.
+        `table.key_dtype` if `table` is specified, otherwise `tf.string`. Must
+        be string or integer, and must be castable to `table.key_dtype`.
 
     Raises:
       ValueError: when `table` in None and `num_oov_buckets` is not positive.
@@ -866,13 +867,13 @@
       self._table = None
       name = name or "hash_bucket"
     if (not key_dtype.is_integer) and (dtypes.string != key_dtype):
-      raise TypeError(
-          "Invalid key_dtype, expected integer or string, got %s." % key_dtype)
+      raise TypeError("Invalid key_dtype, expected integer or string, got %s." %
+                      key_dtype)
     self._num_oov_buckets = num_oov_buckets
 
     if not isinstance(hasher_spec, HasherSpec):
-      raise TypeError(
-          "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
+      raise TypeError("hasher_spec must be of type HasherSpec, got %s" %
+                      hasher_spec)
     self._hasher_spec = hasher_spec
     if name:
       self._table_name = name.split("/")[-1]
@@ -1191,7 +1192,8 @@
   `[vocabulary size, vocabulary size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
+  `session.run(tf.compat.v1.tables_initializer)` or `session.run(table.init)`
+  once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1219,7 +1221,7 @@
       vocabulary_file="test.txt", num_oov_buckets=1)
   ids = table.lookup(features)
   ...
-  tf.tables_initializer().run()
+  tf.compat.v1.tables_initializer().run()
 
   ids.eval()  ==> [0, 1, 3, 2]  # where 3 is the out-of-vocabulary bucket
   ```
@@ -1248,12 +1250,13 @@
     ValueError: If `num_oov_buckets` is negative or `vocab_size` is not greater
       than zero.
   """
-  if vocabulary_file is None or (
-      isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
+  if vocabulary_file is None or (isinstance(vocabulary_file, six.string_types)
+                                 and not vocabulary_file):
     raise ValueError("vocabulary_file must be specified and must not be empty.")
   if num_oov_buckets < 0:
-    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
-                     % num_oov_buckets)
+    raise ValueError(
+        "num_oov_buckets must be greater or equal than 0, got %d." %
+        num_oov_buckets)
   if vocab_size is not None and vocab_size < 1:
     vocab_file_value = vocabulary_file
     if isinstance(vocabulary_file, ops.Tensor):
@@ -1305,7 +1308,8 @@
   `[vocabulary list size, vocabulary list size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
+  `session.run(tf.compat.v1.tables_initializer)` or `session.run(table.init)`
+  once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
@@ -1319,7 +1323,7 @@
   features = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = table.lookup(features)
   ...
-  tf.tables_initializer().run()
+  tf.compat.v1.tables_initializer().run()
 
   ids.eval()  ==> [0, 1, 4, 2]
   ```
@@ -1347,8 +1351,9 @@
     raise ValueError("vocabulary_list must be specified.")
 
   if num_oov_buckets < 0:
-    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
-                     % num_oov_buckets)
+    raise ValueError(
+        "num_oov_buckets must be greater or equal than 0, got %d." %
+        num_oov_buckets)
 
   if (not dtype.is_integer) and (dtypes.string != dtype.base_dtype):
     raise TypeError("Only integer and string keys are supported.")
@@ -1356,9 +1361,9 @@
   with ops.name_scope(name, "string_to_index"):
     keys = ops.convert_to_tensor(vocabulary_list)
     if keys.dtype.is_integer != dtype.is_integer:
-      raise ValueError("Expected %s, got %s." %
-                       ("integer"
-                        if dtype.is_integer else "non-integer", keys.dtype))
+      raise ValueError(
+          "Expected %s, got %s." %
+          ("integer" if dtype.is_integer else "non-integer", keys.dtype))
     if (not dtype.is_integer) and (keys.dtype.base_dtype != dtype):
       raise ValueError("Expected %s, got %s." % (dtype, keys.dtype))
     num_elements = array_ops.size(keys)
@@ -1401,7 +1406,8 @@
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
+  `session.run(tf.compat.v1.tables_initializer)` or `session.run(table.init)`
+  once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1429,7 +1435,7 @@
       vocabulary_file="test.txt", default_value="UNKNOWN")
   values = table.lookup(indices)
   ...
-  tf.tables_initializer().run()
+  tf.compat.v1.tables_initializer().run()
 
   values.eval() ==> ["lake", "UNKNOWN"]
   ```
@@ -1453,8 +1459,8 @@
     ValueError: when `vocabulary_file` is empty.
     ValueError: when `vocab_size` is invalid.
   """
-  if vocabulary_file is None or (
-      isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
+  if vocabulary_file is None or (isinstance(vocabulary_file, six.string_types)
+                                 and not vocabulary_file):
     raise ValueError("vocabulary_file must be specified and must not be empty.")
 
   if vocab_size is not None and vocab_size < 1:
@@ -1487,7 +1493,8 @@
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
+  `session.run(tf.compat.v1.tables_initializer)` or `session.run(table.init)`
+  once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
@@ -1501,7 +1508,7 @@
       vocabulary_list, default_value="UNKNOWN")
   values = table.lookup(indices)
   ...
-  tf.tables_initializer().run()
+  tf.compat.v1.tables_initializer().run()
 
   values.eval() ==> ["lake", "UNKNOWN"]
   ```
@@ -1761,8 +1768,9 @@
       # pylint: disable=protected-access
       with ops.name_scope(name, "%s_table_restore" % self.name):
         with ops.colocate_with(self.op.resource_handle):
-          return gen_lookup_ops.lookup_table_import_v2(
-              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+          return gen_lookup_ops.lookup_table_import_v2(self.op.resource_handle,
+                                                       restored_tensors[0],
+                                                       restored_tensors[1])
 
 
 @tf_export("lookup.experimental.DenseHashTable")
@@ -2052,8 +2060,9 @@
       # pylint: disable=protected-access
       with ops.name_scope(name, "%s_table_restore" % self.name):
         with ops.colocate_with(self.op.resource_handle):
-          return gen_lookup_ops.lookup_table_import_v2(
-              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+          return gen_lookup_ops.lookup_table_import_v2(self.op.resource_handle,
+                                                       restored_tensors[0],
+                                                       restored_tensors[1])
 
 
 ops.NotDifferentiable("LookupTableFind")
diff --git a/tensorflow/python/ops/losses/loss_reduction.py b/tensorflow/python/ops/losses/loss_reduction.py
index 991fb87..483a325 100644
--- a/tensorflow/python/ops/losses/loss_reduction.py
+++ b/tensorflow/python/ops/losses/loss_reduction.py
@@ -23,20 +23,44 @@
 
   Contains the following values:
 
-  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `AUTO`: Indicates that the reduction option will be determined by the usage
+     context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+     used with `tf.distribute.Strategy`, outside of built-in training loops such
+     as `tf.keras` `compile` and `fit`, we expect reduction value to be
+     `SUM` or `NONE`. Using `AUTO` in that case will raise an error.
+  * `NONE`: Un-reduced weighted losses with the same shape as input. When this
+    reduction type is used with built-in Keras training loops like
+    `fit`/`evaluate`, the unreduced vector loss is passed to the optimizer but
+    the reported loss will be a scalar value.
   * `SUM`: Scalar sum of weighted losses.
   * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-     Note that when using `tf.distribute.Strategy`, this is just the
-     replica-local batch size.
+     This reduction type is not supported when used with
+     `tf.distribute.Strategy` outside of built-in training loops like `tf.keras`
+     `compile`/`fit`.
+
+     You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like:
+     ```
+     with strategy.scope():
+       loss_obj = tf.keras.losses.CategoricalCrossentropy(
+           reduction=tf.keras.losses.Reduction.NONE)
+       ....
+       loss = tf.reduce_sum(loss_obj(labels, predictions)) * (
+           1. / global_batch_size)
+     ```
+
+     Please see
+     https://www.tensorflow.org/alpha/tutorials/distribute/training_loops for
+     more details on this.
   """
 
+  AUTO = 'auto'
   NONE = 'none'
   SUM = 'sum'
   SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
 
   @classmethod
   def all(cls):
-    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
+    return (cls.AUTO, cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
 
   @classmethod
   def validate(cls, key):
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index f65304c..d848fe4 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -470,7 +470,7 @@
   a = op.inputs[0]
   y = op.outputs[0]  # y = 0.5 * b / conj(a)
   with ops.control_dependencies([grad]):
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       ga = gen_math_ops.xdivy(grad, a)
       return -gen_math_ops.mul_no_nan(y, math_ops.conj(ga)), 0.5 * ga
     else:
@@ -504,7 +504,7 @@
   y = op.outputs[0]  # y = e^x
   with ops.control_dependencies([grad]):
     y = math_ops.conj(y)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(y, grad)
     else:
       return grad * y
@@ -517,7 +517,7 @@
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
     y = math_ops.exp(x)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(y, grad)
     else:
       return grad * y
@@ -529,7 +529,7 @@
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return gen_math_ops.xdivy(grad, x)
     else:
       return grad * math_ops.reciprocal(x)
@@ -541,7 +541,7 @@
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return gen_math_ops.xdivy(grad, 1 + x)
     else:
       return grad * math_ops.reciprocal(1 + x)
@@ -623,7 +623,7 @@
   y = op.outputs[0]
   with ops.control_dependencies([grad]):
     y = math_ops.conj(y)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.xdivy(grad, math_ops.sinh(y))
     else:
       return grad / math_ops.sinh(y)
@@ -676,7 +676,7 @@
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(math_ops.digamma(x), grad)
     else:
       return grad * math_ops.digamma(x)
@@ -689,7 +689,7 @@
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
     partial_x = math_ops.polygamma(array_ops.constant(1, dtype=x.dtype), x)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(partial_x, grad)
     else:
       return grad * partial_x
@@ -702,7 +702,7 @@
   y = op.outputs[0]
   with ops.control_dependencies([grad]):
     partial_x = (math_ops.bessel_i1e(x) - math_ops.sign(x) * y)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(partial_x, grad)
     else:
       return grad * partial_x
@@ -726,7 +726,7 @@
     dy_dx = math_ops.bessel_i0e(safe_x) - y * (
         math_ops.sign(safe_x) + math_ops.reciprocal(safe_x))
     dy_dx = array_ops.where(x_is_not_tiny, dy_dx, 0.5 + zeros)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(dy_dx, grad)
     else:
       return grad * dy_dx
@@ -747,7 +747,7 @@
     # and Gamma'(a) can grow large.
     partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) -
                              math_ops.lgamma(a))
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return (array_ops.reshape(
           math_ops.reduce_sum(math_ops.mul_no_nan(partial_a, grad), ra), sa),
               array_ops.reshape(
@@ -786,7 +786,7 @@
                            (a - 1) * math_ops.log(x) - log_beta)
 
   # TODO(b/36815900): Mark None return values as NotImplemented
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     return (
         None,  # da
         None,  # db
@@ -815,7 +815,7 @@
     q = math_ops.conj(q)
     partial_q = -x * math_ops.zeta(x + 1, q)
     # TODO(b/36815900): Mark None return values as NotImplemented
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return (None,
               array_ops.reshape(
                   math_ops.reduce_sum(math_ops.mul_no_nan(partial_q, grad), rq),
@@ -841,7 +841,7 @@
     x = math_ops.conj(x)
     partial_x = math_ops.polygamma(n + 1, x)
     # TODO(b/36815900): Mark None return values as NotImplemented
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return (None,
               array_ops.reshape(
                   math_ops.reduce_sum(math_ops.mul_no_nan(partial_x, grad), rx),
@@ -902,7 +902,7 @@
     x = math_ops.conj(x)
     secx = math_ops.reciprocal(math_ops.cos(x))
     secx2 = math_ops.square(secx)
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.mul_no_nan(secx2, grad)
     else:
       return secx2 * grad
@@ -917,7 +917,7 @@
     x2 = math_ops.square(x)
     one = constant_op.constant(1, dtype=grad.dtype)
     den = math_ops.sqrt(math_ops.subtract(one, x2))
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return math_ops.xdivy(grad, den)
     else:
       inv = math_ops.reciprocal(den)
@@ -933,7 +933,7 @@
     x2 = math_ops.square(x)
     one = constant_op.constant(1, dtype=grad.dtype)
     den = math_ops.sqrt(math_ops.subtract(one, x2))
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       return -math_ops.xdivy(grad, den)
     else:
       inv = math_ops.reciprocal(den)
@@ -958,7 +958,7 @@
   y = op.inputs[0]
   x = op.inputs[1]
   with ops.control_dependencies([grad]):
-    if compat.forward_compatible(2019, 5, 14):
+    if compat.forward_compatible(2019, 6, 14):
       grad_inv = math_ops.xdivy(grad, (math_ops.square(x) + math_ops.square(y)))
     else:
       grad_inv = grad / (math_ops.square(x) + math_ops.square(y))
@@ -1078,7 +1078,7 @@
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     return (array_ops.reshape(
         math_ops.reduce_sum(math_ops.xdivy(grad, y), rx), sx),
             array_ops.reshape(
@@ -1131,7 +1131,7 @@
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     return (array_ops.reshape(
         math_ops.reduce_sum(math_ops.xdivy(grad, y), rx), sx),
             array_ops.reshape(
@@ -1158,7 +1158,7 @@
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     return (array_ops.reshape(
         math_ops.reduce_sum(math_ops.div_no_nan(grad, y), rx), sx),
             array_ops.reshape(
@@ -1188,7 +1188,7 @@
   y = math_ops.conj(y)
   z = math_ops.conj(z)
 
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     gx = array_ops.reshape(
         math_ops.reduce_sum(
             gen_math_ops.mul_no_nan(y * math_ops.pow(x, y - 1), grad), rx), sx)
@@ -1204,7 +1204,7 @@
     mask = x > 0
   safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
   log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
-  if compat.forward_compatible(2019, 5, 14):
+  if compat.forward_compatible(2019, 6, 14):
     gy = array_ops.reshape(
         math_ops.reduce_sum(gen_math_ops.mul_no_nan(z * log_x, grad), ry), sy)
   else:
@@ -1563,10 +1563,11 @@
 @ops.RegisterGradient("ComplexAbs")
 def _ComplexAbsGrad(op, grad):
   """Returns the gradient of ComplexAbs."""
-  # TODO(b/27786104): The cast to complex could be removed once arithmetic
-  # supports mixtures of complex64 and real values.
-  return (math_ops.complex(grad, array_ops.zeros_like(grad)) *
-          math_ops.sign(op.inputs[0]))
+  return math_ops.div_no_nan(
+      math_ops.complex(
+          grad, array_ops.zeros_like(grad)) * op.inputs[0],
+      math_ops.complex(
+          op.outputs[0], array_ops.zeros_like(op.outputs[0])))
 
 
 @ops.RegisterGradient("Cast")
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 077afd3..b83c5ff 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -48,7 +48,7 @@
 
 ```python
 c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-tf.segment_sum(c, tf.constant([0, 0, 1]))
+tf.math.segment_sum(c, tf.constant([0, 0, 1]))
 #  ==>  [[0 0 0 0]
 #        [5 6 7 8]]
 ```
@@ -60,7 +60,7 @@
 
 ``` python
 c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+tf.math.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
 # ==> [[ 6,  8, 10, 12],
 #       [-1, -2, -3, -4]]
 ```
@@ -126,37 +126,35 @@
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
-    gen_math_ops.arg_max.__doc__.replace("dimensions", "axes").replace(
-        "dimension", "axis"))
+    gen_math_ops.arg_max.__doc__.replace("dimensions",
+                                         "axes").replace("dimension", "axis"))
 def argmax(input,
            axis=None,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "dimension", dimension)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dimension",
+                                                dimension)
   return argmax_v2(input, axis, output_type, name)
 
 
 @tf_export("math.argmax", "argmax", v1=[])
-def argmax_v2(input,
-              axis=None,
-              output_type=dtypes.int64,
-              name=None):
+def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None):
   """Returns the index with the largest value across axes of a tensor.
 
   Note that in case of ties the identity of the return value is not guaranteed.
 
   Args:
     input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
-    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+      `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`,
+      `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`,
+      `uint64`.
     axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
       int32 or int64, must be in the range `-rank(input), rank(input))`.
       Describes which axis of the input Tensor to reduce across. For vectors,
       use axis = 0.
-    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
-      Defaults to `tf.int64`.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to
+      `tf.int64`.
     name: A name for the operation (optional).
 
   Returns:
@@ -181,37 +179,35 @@
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
-    gen_math_ops.arg_min.__doc__.replace("dimensions", "axes").replace(
-        "dimension", "axis"))
+    gen_math_ops.arg_min.__doc__.replace("dimensions",
+                                         "axes").replace("dimension", "axis"))
 def argmin(input,
            axis=None,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "dimension", dimension)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dimension",
+                                                dimension)
   return argmin_v2(input, axis, output_type, name)
 
 
 @tf_export("math.argmin", "argmin", v1=[])
-def argmin_v2(input,
-              axis=None,
-              output_type=dtypes.int64,
-              name=None):
+def argmin_v2(input, axis=None, output_type=dtypes.int64, name=None):
   """Returns the index with the smallest value across axes of a tensor.
 
   Note that in case of ties the identity of the return value is not guaranteed.
 
   Args:
     input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
-    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+      `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`,
+      `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`,
+      `uint64`.
     axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
       int32 or int64, must be in the range `-rank(input), rank(input))`.
       Describes which axis of the input Tensor to reduce across. For vectors,
       use axis = 0.
-    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
-      Defaults to `tf.int64`.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to
+      `tf.int64`.
     name: A name for the operation (optional).
 
   Returns:
@@ -242,8 +238,8 @@
 def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
-  Given a tensor of integer or floating-point values, this operation returns a 
-  tensor of the same type, where each element contains the absolute value of the 
+  Given a tensor of integer or floating-point values, this operation returns a
+  tensor of the same type, where each element contains the absolute value of the
   corresponding element in the input.
 
   Given a tensor `x` of complex numbers, this operation returns a tensor of type
@@ -261,7 +257,7 @@
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` the same size, type, and sparsity as `x` with 
+    A `Tensor` or `SparseTensor` the same size, type, and sparsity as `x` with
       absolute values.
     Note, for `complex64` or `complex128` input, the returned `Tensor` will be
       of type `float32` or `float64`, respectively.
@@ -271,6 +267,8 @@
     if x.dtype.is_complex:
       return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
     return gen_math_ops._abs(x, name=name)
+
+
 # pylint: enable=g-docstring-has-escape
 
 
@@ -359,7 +357,6 @@
 _sub.__doc__ = (
     gen_math_ops.sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
-
 negative = gen_math_ops.neg
 
 
@@ -410,8 +407,8 @@
   shape = scalar.get_shape()
   if shape.ndims == 0:
     if isinstance(x, ops.IndexedSlices):
-      return ops.IndexedSlices(gen_math_ops.mul(scalar, x.values, name),
-                               x.indices, x.dense_shape)
+      return ops.IndexedSlices(
+          gen_math_ops.mul(scalar, x.values, name), x.indices, x.dense_shape)
     else:
       return gen_math_ops.mul(scalar, x, name)
   else:
@@ -441,9 +438,9 @@
 
   Args:
     x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
-     `complex64`, or `complex128`.
+      `complex64`, or `complex128`.
     y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
-     `complex64`, or `complex128`.
+      `complex64`, or `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -475,8 +472,7 @@
   ```
 
   Args:
-    real: A `Tensor`. Must be one of the following types: `float32`,
-      `float64`.
+    real: A `Tensor`. Must be one of the following types: `float32`, `float64`.
     imag: A `Tensor`. Must have the same type as `real`.
     name: A name for the operation (optional).
 
@@ -510,7 +506,7 @@
 
   ```python
   x = tf.constant([-2.25 + 4.75j, 3.25 + 5.75j])
-  tf.real(x)  # [-2.25, 3.25]
+  tf.math.real(x)  # [-2.25, 3.25]
   ```
 
   If `input` is already real, it is returned unchanged.
@@ -544,7 +540,7 @@
 
   ```python
   x = tf.constant([-2.25 + 4.75j, 3.25 + 5.75j])
-  tf.imag(x)  # [4.75, 5.75]
+  tf.math.imag(x)  # [4.75, 5.75]
   ```
 
   Args:
@@ -582,7 +578,7 @@
 
   ```
   input = tf.constant([-2.25 + 4.75j, 3.25 + 5.75j], dtype=tf.complex64)
-  tf.angle(input).numpy()
+  tf.math.angle(input).numpy()
   # ==> array([2.0131705, 1.056345 ], dtype=float32)
   ```
 
@@ -719,17 +715,16 @@
     value = ops.convert_to_tensor(value, name="value")
     dtype = dtypes.as_dtype(dtype).base_dtype
     if value.dtype.min < dtype.min:
-      value = gen_math_ops.maximum(value,
-                                   ops.convert_to_tensor(
-                                       dtype.min, dtype=value.dtype,
-                                       name="min"))
+      value = gen_math_ops.maximum(
+          value,
+          ops.convert_to_tensor(dtype.min, dtype=value.dtype, name="min"))
     if value.dtype.max > dtype.max:
-      value = gen_math_ops.minimum(value,
-                                   ops.convert_to_tensor(
-                                       dtype.max, dtype=value.dtype,
-                                       name="max"))
+      value = gen_math_ops.minimum(
+          value,
+          ops.convert_to_tensor(dtype.max, dtype=value.dtype, name="max"))
     return cast(value, dtype, name=name)
 
+
 @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_float"])
 def to_float(x, name="ToFloat"):
@@ -889,8 +884,8 @@
         return func(x, y, name=name)
       elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
-          y = ops.convert_to_tensor_v2(y, dtype_hint=x.dtype.base_dtype,
-                                       name="y")
+          y = ops.convert_to_tensor_v2(
+              y, dtype_hint=x.dtype.base_dtype, name="y")
         except TypeError:
           # If the RHS is not a tensor, it might be a tensor aware object
           # that can implement the operator with knowledge of itself
@@ -904,13 +899,10 @@
   def binary_op_wrapper_sparse(sp_x, y):
     with ops.name_scope(None, op_name, [sp_x, y]) as name:
       y = ops.convert_to_tensor(y, dtype=sp_x.dtype.base_dtype, name="y")
-      return sparse_tensor.SparseTensor(sp_x.indices,
-                                        func(
-                                            sp_x.indices,
-                                            sp_x.values,
-                                            sp_x.dense_shape,
-                                            y,
-                                            name=name), sp_x.dense_shape)
+      return sparse_tensor.SparseTensor(
+          sp_x.indices,
+          func(sp_x.indices, sp_x.values, sp_x.dense_shape, y, name=name),
+          sp_x.dense_shape)
 
   def r_binary_op_wrapper(y, x):
     with ops.name_scope(None, op_name, [x, y]) as name:
@@ -999,12 +991,15 @@
 
 
 def _div_python2(x, y, name=None):
-  """Divide two values using Python 2 semantics. Used for Tensor.__div__.
+  """Divide two values using Python 2 semantics.
+
+  Used for Tensor.__div__.
 
   Args:
     x: `Tensor` numerator of real numeric type.
     y: `Tensor` denominator of real numeric type.
     name: A name for the operation (optional).
+
   Returns:
     `x / y` returns the quotient of x and y.
   """
@@ -1064,17 +1059,18 @@
   """Divides x / y elementwise (using Python 2 division operator semantics).
 
   NOTE: Prefer using the Tensor division operator or tf.divide which obey Python
-  division operator semantics.
+  3 division operator semantics.
 
-  This function divides `x` and `y`, forcing Python 2.7 semantics. That is,
-  if one of `x` or `y` is a float, then the result will be a float.
-  Otherwise, the output will be an integer type. Flooring semantics are used
-  for integer division.
+  This function divides `x` and `y`, forcing Python 2 semantics. That is, if `x`
+  and `y` are both integers then the result will be an integer. This is in
+  contrast to Python 3, where `/` always performs true division (yielding a
+  float for integer inputs) while `//` always performs floor division.
 
   Args:
     x: `Tensor` numerator of real numeric type.
     y: `Tensor` denominator of real numeric type.
     name: A name for the operation (optional).
+
   Returns:
     `x / y` returns the quotient of x and y.
   """
@@ -1091,6 +1087,7 @@
     x: A `Tensor`. Must be one of the following types: `float32`, `float64`.
     y: A `Tensor` whose dtype is compatible with `x`.
     name: A name for the operation (optional).
+
   Returns:
     The element-wise value of the x divided by y.
   """
@@ -1126,8 +1123,8 @@
     x_dtype = x.dtype.base_dtype
     y_dtype = y.dtype.base_dtype
     if x_dtype != y_dtype:
-      raise TypeError(
-          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
+      raise TypeError("x and y must have the same dtype, got %r != %r" %
+                      (x_dtype, y_dtype))
     return gen_math_ops.mul_no_nan(x, y, name=name)
 
 
@@ -1143,7 +1140,8 @@
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
 
-  The same as `tf.div(x,y)` for integers, but uses `tf.floor(tf.div(x,y))` for
+  The same as `tf.compat.v1.div(x,y)` for integers, but uses
+  `tf.floor(tf.compat.v1.div(x,y))` for
   floating point arguments so that the result is always an integer (though
   possibly an integer represented as floating point).  This op is generated by
   `x // y` floor division in Python 3 and in Python 2.7 with
@@ -1211,7 +1209,29 @@
 @dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
-  """x ^ y = (x | y) & ~(x & y)."""
+  """Logical XOR function.
+
+  x ^ y = (x | y) & ~(x & y)
+
+  Inputs are tensors, and if the tensors contain more than one element, an
+  element-wise logical XOR is computed.
+
+  Usage:
+
+  ```python
+  x = tf.constant([False, False, True, True], dtype=tf.bool)
+  y = tf.constant([False, True, False, True], dtype=tf.bool)
+  z = tf.math.logical_xor(x, y, name="LogicalXor")
+  #  here z = [False  True  True False]
+  ```
+
+  Args:
+    x: A `Tensor` of type bool.
+    y: A `Tensor` of type bool.
+
+  Returns:
+    A `Tensor` of type bool with the same size as that of x or y.
+  """
   # TODO(alemi) Make this a cwise op if people end up relying on it.
   return gen_math_ops.logical_and(
       gen_math_ops.logical_or(x, y),
@@ -1260,14 +1280,13 @@
   ```
 
   Args:
-    start: A 0-D `Tensor` (scalar). Acts as first entry in the range if
-      `limit` is not None; otherwise, acts as range limit and first entry
+    start: A 0-D `Tensor` (scalar). Acts as first entry in the range if `limit`
+      is not None; otherwise, acts as range limit and first entry defaults to 0.
+    limit: A 0-D `Tensor` (scalar). Upper limit of sequence, exclusive. If None,
+      defaults to the value of `start` while the first entry of the range
       defaults to 0.
-    limit: A 0-D `Tensor` (scalar). Upper limit of sequence,
-      exclusive. If None, defaults to the value of `start` while the first
-      entry of the range defaults to 0.
-    delta: A 0-D `Tensor` (scalar). Number that increments
-      `start`. Defaults to 1.
+    delta: A 0-D `Tensor` (scalar). Number that increments `start`. Defaults to
+      1.
     dtype: The type of the elements of the resulting tensor.
     name: A name for the operation. Defaults to "range".
 
@@ -1292,9 +1311,8 @@
           dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64
       ]
       assert all(arg.dtype in dtype_hierarchy for arg in [start, limit, delta])
-      inferred_dtype = max(
-          [arg.dtype for arg in [start, limit, delta]],
-          key=dtype_hierarchy.index)
+      inferred_dtype = max([arg.dtype for arg in [start, limit, delta]],
+                           key=dtype_hierarchy.index)
 
       start = cast(start, inferred_dtype)
       limit = cast(limit, inferred_dtype)
@@ -1336,8 +1354,9 @@
 
 
 @tf_export(v1=["math.reduce_sum", "reduce_sum"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_sum_v1(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -1367,9 +1386,9 @@
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1383,8 +1402,9 @@
   int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_sum(input_tensor, axis, keepdims, name)
@@ -1480,8 +1500,9 @@
 
 
 @tf_export(v1=["math.count_nonzero", "count_nonzero"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 @deprecation.deprecated_args(
     None, "reduction_indices is deprecated, use axis instead", "axis")
 def count_nonzero(input_tensor=None,
@@ -1510,11 +1531,11 @@
 
   ```python
   x = tf.constant([[0, 1, 0], [1, 1, 0]])
-  tf.count_nonzero(x)  # 3
-  tf.count_nonzero(x, 0)  # [1, 2, 0]
-  tf.count_nonzero(x, 1)  # [1, 2]
-  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
-  tf.count_nonzero(x, [0, 1])  # 3
+  tf.math.count_nonzero(x)  # 3
+  tf.math.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.math.count_nonzero(x, 1)  # [1, 2]
+  tf.math.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
+  tf.math.count_nonzero(x, [0, 1])  # 3
   ```
 
   **NOTE** Strings are compared against zero-length empty string `""`. Any
@@ -1523,15 +1544,15 @@
   For example:
   ```python
   x = tf.constant(["", "a", "  ", "b", ""])
-  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  tf.math.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
   ```
 
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, `bool`,
-      or `string`.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `bool`, or
+      `string`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     dtype: The output dtype; defaults to `tf.int64`.
     name: A name for the operation (optional).
@@ -1544,22 +1565,23 @@
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  input_tensor = deprecation.deprecated_argument_lookup(
-      "input", input, "input_tensor", input_tensor)
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis,
-      "reduction_indices", reduction_indices
-      )
+  input_tensor = deprecation.deprecated_argument_lookup("input", input,
+                                                        "input_tensor",
+                                                        input_tensor)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
 
   return count_nonzero_v2(input_tensor, axis, keepdims, dtype, name)
 
 
 @tf_export("math.count_nonzero", v1=[])
-def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
-                     axis=None,
-                     keepdims=None,
-                     dtype=dtypes.int64,
-                     name=None):
+def count_nonzero_v2(
+    input,  # pylint: disable=redefined-builtin
+    axis=None,
+    keepdims=None,
+    dtype=dtypes.int64,
+    name=None):
   """Computes number of nonzero elements across dimensions of a tensor.
 
   Reduces `input` along the dimensions given in `axis`.
@@ -1578,11 +1600,11 @@
 
   ```python
   x = tf.constant([[0, 1, 0], [1, 1, 0]])
-  tf.count_nonzero(x)  # 3
-  tf.count_nonzero(x, 0)  # [1, 2, 0]
-  tf.count_nonzero(x, 1)  # [1, 2]
-  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
-  tf.count_nonzero(x, [0, 1])  # 3
+  tf.math.count_nonzero(x)  # 3
+  tf.math.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.math.count_nonzero(x, 1)  # [1, 2]
+  tf.math.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
+  tf.math.count_nonzero(x, [0, 1])  # 3
   ```
 
   **NOTE** Strings are compared against zero-length empty string `""`. Any
@@ -1591,15 +1613,13 @@
   For example:
   ```python
   x = tf.constant(["", "a", "  ", "b", ""])
-  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  tf.math.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
   ```
 
   Args:
-    input: The tensor to reduce. Should be of numeric type, `bool`,
-      or `string`.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input), rank(input))`.
+    input: The tensor to reduce. Should be of numeric type, `bool`, or `string`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input), rank(input))`.
     keepdims: If true, retains reduced dimensions with length 1.
     dtype: The output dtype; defaults to `tf.int64`.
     name: A name for the operation (optional).
@@ -1650,9 +1670,9 @@
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1678,8 +1698,9 @@
 
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_mean(input_tensor, axis, keepdims, name)
@@ -1852,9 +1873,9 @@
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
 
@@ -1874,8 +1895,9 @@
 
 
 @tf_export(v1=["math.reduce_prod", "reduce_prod"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_prod_v1(input_tensor,
                    axis=None,
                    keepdims=None,
@@ -1909,16 +1931,18 @@
   Equivalent to np.prod
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_prod(input_tensor, axis, keepdims, name)
 
 
 @tf_export(v1=["math.reduce_min", "reduce_min"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_min_v1(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -1952,8 +1976,9 @@
   Equivalent to np.min
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_min(input_tensor, axis, keepdims, name)
@@ -1996,8 +2021,9 @@
 
 
 @tf_export(v1=["math.reduce_max", "reduce_max"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_max_v1(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -2016,9 +2042,9 @@
 
   Args:
     input_tensor: The tensor to reduce. Should have real numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -2031,8 +2057,9 @@
   Equivalent to np.max
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_max(input_tensor, axis, keepdims, name)
@@ -2075,8 +2102,9 @@
 
 
 @tf_export(v1=["math.reduce_all", "reduce_all"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_all_v1(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -2119,8 +2147,9 @@
   Equivalent to np.all
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_all(input_tensor, axis, keepdims, name)
@@ -2172,8 +2201,9 @@
 
 
 @tf_export(v1=["math.reduce_any", "reduce_any"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_any_v1(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -2216,8 +2246,9 @@
   Equivalent to np.any
   @end_compatibility
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_any(input_tensor, axis, keepdims, name)
@@ -2269,8 +2300,9 @@
 
 
 @tf_export(v1=["math.reduce_logsumexp", "reduce_logsumexp"])
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(None,
+                             "keep_dims is deprecated, use keepdims instead",
+                             "keep_dims")
 def reduce_logsumexp_v1(input_tensor,
                         axis=None,
                         keepdims=None,
@@ -2315,8 +2347,9 @@
   Returns:
     The reduced tensor.
   """
-  axis = deprecation.deprecated_argument_lookup(
-      "axis", axis, "reduction_indices", reduction_indices)
+  axis = deprecation.deprecated_argument_lookup("axis", axis,
+                                                "reduction_indices",
+                                                reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
   return reduce_logsumexp(input_tensor, axis, keepdims, name)
@@ -2363,10 +2396,7 @@
   keepdims = False if keepdims is None else keepdims
   input_tensor = ops.convert_to_tensor(input_tensor)
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
-    raw_max = reduce_max(
-        input_tensor,
-        axis=axis,
-        keepdims=True)
+    raw_max = reduce_max(input_tensor, axis=axis, keepdims=True)
     my_max = array_ops.stop_gradient(
         array_ops.where(
             gen_math_ops.is_finite(raw_max), raw_max,
@@ -2554,7 +2584,7 @@
     a_shape = a._shape_tuple()  # pylint: disable=protected-access
     b_shape = b._shape_tuple()  # pylint: disable=protected-access
 
-    if fwd_compat.forward_compatible(2019, 4, 18):
+    if fwd_compat.forward_compatible(2019, 4, 25):
       output_may_have_non_empty_batch_shape = (
           (a_shape is None or len(a_shape) > 2) or
           (b_shape is None or len(b_shape) > 2))
@@ -2805,10 +2835,10 @@
 
   `tf.math.add_n` performs the same operation as `tf.math.accumulate_n`, but it
   waits for all of its inputs to be ready before beginning to sum.
-  This buffering can result in higher memory consumption when inputs are ready 
+  This buffering can result in higher memory consumption when inputs are ready
   at different times, since the minimum temporary storage required is
   proportional to the input size rather than the output size.
-  
+
   This op does not [broadcast](
   https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html)
   its inputs. If you need broadcasting, use `tf.math.add` (or the `+` operator)
@@ -2861,10 +2891,10 @@
   Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
   otherwise, these are inferred.
 
-  `accumulate_n` performs the same operation as `tf.math.add_n`, but 
-  does not wait for all of its inputs to be ready before beginning to sum. 
-  This approach can save memory if inputs are ready at different times, since 
-  minimum temporary storage is proportional to the output size rather than the 
+  `accumulate_n` performs the same operation as `tf.math.add_n`, but
+  does not wait for all of its inputs to be ready before beginning to sum.
+  This approach can save memory if inputs are ready at different times, since
+  minimum temporary storage is proportional to the output size rather than the
   inputs' size.
 
   `accumulate_n` is differentiable (but wasn't previous to TensorFlow 1.7).
@@ -2884,13 +2914,11 @@
 
   Args:
     inputs: A list of `Tensor` objects, each with same shape and type.
-    shape: Expected shape of elements of `inputs` (optional). 
-      Also controls the output shape of this op, which may affect type 
-      inference in other ops.
-      A value of `None` means "infer the input shape from the shapes in 
-      `inputs`".
-    tensor_dtype: Expected data type of `inputs` (optional).
-      A value of `None` means "infer the input dtype from `inputs[0]`".
+    shape: Expected shape of elements of `inputs` (optional). Also controls the
+      output shape of this op, which may affect type inference in other ops. A
+      value of `None` means "infer the input shape from the shapes in `inputs`".
+    tensor_dtype: Expected data type of `inputs` (optional). A value of `None`
+      means "infer the input dtype from `inputs[0]`".
     name: A name for the operation (optional).
 
   Returns:
@@ -2953,8 +2981,8 @@
   Specifically, `y = 1 / (1 + exp(-x))`.
 
   Args:
-    x: A Tensor with type `float16`, `float32`, `float64`, `complex64`,
-      or `complex128`.
+    x: A Tensor with type `float16`, `float32`, `float64`, `complex64`, or
+      `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -3109,8 +3137,8 @@
 
   Args:
     x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-       `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-       `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+      `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+      `complex128`, `qint8`, `quint8`, `qint32`, `half`.
     axis: A `Tensor` of type `int32` (default: 0). Must be in the range
       `[-rank(x), rank(x))`.
     exclusive: If `True`, perform exclusive cumsum.
@@ -3162,8 +3190,8 @@
 
   Args:
     x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-       `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-       `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+      `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+      `complex128`, `qint8`, `quint8`, `qint32`, `half`.
     axis: A `Tensor` of type `int32` (default: 0). Must be in the range
       `[-rank(x), rank(x))`.
     exclusive: If `True`, perform exclusive cumprod.
@@ -3220,8 +3248,8 @@
     elif x.dtype.is_floating or x.dtype.is_integer:
       return x
     else:
-      raise TypeError(
-          "Expected numeric or variant tensor, got dtype %r" % x.dtype)
+      raise TypeError("Expected numeric or variant tensor, got dtype %r" %
+                      x.dtype)
 
 
 def _BroadcastShape(op):
@@ -3238,6 +3266,7 @@
   Args:
     input_shape: 1-D Tensor, the shape of the Tensor being reduced.
     axes: 1-D Tensor, the reduction axes.
+
   Returns:
     A 1-D Tensor, the output shape as if keepdims were set to True.
   """
@@ -3310,8 +3339,8 @@
   Args:
     data: A `Tensor` with floating point or complex dtype.
     segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
-    num_segments: An integer scalar `Tensor`.  The number of distinct
-      segment IDs.
+    num_segments: An integer scalar `Tensor`.  The number of distinct segment
+      IDs.
     name: A name for the operation (optional).
 
   Returns:
@@ -3359,8 +3388,8 @@
   Args:
     data: A `Tensor` with floating point or complex dtype.
     segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
-    num_segments: An integer scalar `Tensor`.  The number of distinct
-      segment IDs.
+    num_segments: An integer scalar `Tensor`.  The number of distinct segment
+      IDs.
     name: A name for the operation (optional).
 
   Returns:
@@ -3420,15 +3449,15 @@
   #     [5 6 7 8]]
 
   # Which is equivalent to:
-  tf.segment_sum(c, tf.constant([0, 0, 1]))
+  tf.math.segment_sum(c, tf.constant([0, 0, 1]))
   ```
 
   Args:
     data: A `Tensor` with data that will be assembled in the output.
     indices: A 1-D `Tensor` with indices into `data`. Has same rank as
       `segment_ids`.
-    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
-      Values should be sorted and can be repeated.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
     name: A name for the operation (optional).
     num_segments: An optional int32 scalar. Indicates the size of the output
       `Tensor`.
@@ -3483,8 +3512,8 @@
     data: A `Tensor` with data that will be assembled in the output.
     indices: A 1-D `Tensor` with indices into `data`. Has same rank as
       `segment_ids`.
-    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
-      Values should be sorted and can be repeated.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
     name: A name for the operation (optional).
     num_segments: An optional int32 scalar. Indicates the size of the output
       `Tensor`.
@@ -3558,8 +3587,8 @@
     data: A `Tensor` with data that will be assembled in the output.
     indices: A 1-D `Tensor` with indices into `data`. Has same rank as
       `segment_ids`.
-    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
-      Values should be sorted and can be repeated.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
     name: A name for the operation (optional).
     num_segments: An optional int32 scalar. Indicates the size of the output
       `Tensor`.
@@ -3671,7 +3700,7 @@
     Args:
       a: `Tensor`.
       axes: List or `int32` `Tensor` of unique indices specifying valid axes of
-       `a`.
+        `a`.
       flipped: An optional `bool`. Defaults to `False`. If `True`, the method
         assumes that `a` is the second argument in the contraction operation.
 
@@ -3735,12 +3764,12 @@
         if axes > a_shape.ndims:
           raise ValueError("'axes' must not be larger than the number of "
                            "dimensions of tensor %s." % a)
-        return (list(xrange(a_shape.ndims - axes, a_shape.ndims)),
-                list(xrange(axes)))
+        return (list(xrange(a_shape.ndims - axes,
+                            a_shape.ndims)), list(xrange(axes)))
       else:
         rank = array_ops.rank(a)
-        return (range(rank - axes, rank, dtype=dtypes.int32),
-                range(axes, dtype=dtypes.int32))
+        return (range(rank - axes, rank,
+                      dtype=dtypes.int32), range(axes, dtype=dtypes.int32))
     elif isinstance(axes, (list, tuple)):
       if len(axes) != 2:
         raise ValueError("'axes' must be an integer or have length 2.")
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 69e753a..2aff9f0 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -136,7 +136,7 @@
     self.assertAllClose(mean_ref, mean_val, atol=1e-3)
     # This is for Bessel's correction. tf.nn.moments uses n, instead of n-1, as
     # the denominator in the formula to calculate variance, while
-    # tf.nn.fused_batch_norm has Bessel's correction built in.
+    # tf.compat.v1.nn.fused_batch_norm has Bessel's correction built in.
     sample_size = x_val.size / scale_val.size
     var_ref = var_ref * sample_size / (max(sample_size - 1.0, 1.0))
     self.assertAllClose(var_ref, var_val, atol=1e-3)
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 6ac333b..7666ba2 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -410,20 +410,17 @@
 @ops.RegisterGradient("EluGrad")
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
-  return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
+  return (gen_nn_ops.elu_grad(grad, elu_x),
           array_ops.where(
-              elu_x < 0, grad * op.inputs[0],
-              array_ops.zeros(shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
+              elu_x < 0, grad * op.inputs[0], array_ops.zeros_like(elu_x)))
 
 
 @ops.RegisterGradient("SeluGrad")
 def _SeluGradGrad(op, grad):
-  x = op.inputs[1]
-  scale_alpha = 1.7580993408473768599402175208123
-  return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
+  selu_x = op.inputs[1]
+  return (gen_nn_ops.selu_grad(grad, selu_x),
           array_ops.where(
-              x < 0., gen_nn_ops.elu_grad(grad, op.outputs[0] + scale_alpha),
-              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
+              selu_x < 0., grad * op.inputs[0], array_ops.zeros_like(selu_x)))
 
 
 @ops.RegisterGradient("Relu6")
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 783656a..9da56cb 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
@@ -157,5 +158,79 @@
     self.run_test(x, grad_wrt_filter)
 
 
+class EluGradOpTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testEluGradGradWRTgrad_ys(self):
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
+    dummy = constant_op.constant(
+        [[3, 1, -1, -2], [9, 8, 7, 6]], dtype=dtypes.float32)
+
+    elu = gen_nn_ops.elu(inputs)
+    elu_grad = gradients_impl.gradients(elu, inputs, grad_ys=dummy)[0]
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          dummy,
+          dummy.shape,
+          elu_grad,
+          elu_grad.shape)
+      self.assertLess(error, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testEluGradGradWRTinputs(self):
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
+    dummy = constant_op.constant(
+        [[3, 1, -1, -2], [9, 8, 7, 6]], dtype=dtypes.float32)
+
+    elu = gen_nn_ops.elu(inputs)
+    elu_grad = gradients_impl.gradients(elu, inputs, grad_ys=dummy)[0]
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.shape,
+          elu_grad,
+          elu_grad.shape)
+      self.assertLess(error, 1e-4)
+
+
+class SeluGradOpTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testSeluGradGradWRTgrad_ys(self):
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
+    dummy = constant_op.constant(
+        [[3, 1, -1, -2], [9, 8, 7, 6]], dtype=dtypes.float32)
+
+    selu = gen_nn_ops.selu(inputs)
+    selu_grad = gradients_impl.gradients(selu, inputs, grad_ys=dummy)[0]
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          dummy,
+          dummy.shape,
+          selu_grad,
+          selu_grad.shape)
+      self.assertLess(error, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testSeluGradGradWRTinputs(self):
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
+    dummy = constant_op.constant(
+        [[3, 1, -1, -2], [9, 8, 7, 6]], dtype=dtypes.float32)
+
+    selu = gen_nn_ops.selu(inputs)
+    selu_grad = gradients_impl.gradients(selu, inputs, grad_ys=dummy)[0]
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.shape,
+          selu_grad,
+          selu_grad.shape)
+      self.assertLess(error, 1e-4)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 7a2ad9b..cd46130 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -522,7 +522,7 @@
 
   ```python
       z = tf.nn.relu(...)
-      summ = tf.summary.scalar('sparsity', tf.nn.zero_fraction(z))
+      summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
   ```
 
   Args:
@@ -1192,7 +1192,6 @@
                         name=None):
   r"""Batch normalization.
 
-  As described in [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](http://arxiv.org/abs/1502.03167).
   Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
   `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):
 
@@ -1218,6 +1217,10 @@
       `tf.nn.moments(..., keep_dims=False)` during training, or running averages
       thereof during inference.
 
+  See Source: [Batch Normalization: Accelerating Deep Network Training by
+  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy](
+  http://arxiv.org/abs/1502.03167).
+
   Args:
     x: Input `Tensor` of arbitrary dimensionality.
     mean: A mean `Tensor`.
@@ -1255,7 +1258,9 @@
     name=None):
   r"""Batch normalization.
 
-  As described in http://arxiv.org/abs/1502.03167.
+  See Source: [Batch Normalization: Accelerating Deep Network Training by
+  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy](
+  http://arxiv.org/abs/1502.03167).
 
   Args:
     x: Input `Tensor` of 4 dimensions.
@@ -1451,7 +1456,7 @@
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1476,7 +1481,7 @@
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
-        `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
+        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
@@ -1652,7 +1657,7 @@
   Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
   so your labels must be sorted in order of decreasing frequency to achieve
   good results.  For more details, see
-  `tf.nn.log_uniform_candidate_sampler`.
+  `tf.random.log_uniform_candidate_sampler`.
 
   Note: In the case where `num_true` > 1, we assign to each target class
   the target probability 1 / `num_true` so that the target probabilities
@@ -1756,7 +1761,7 @@
   Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
   so your labels must be sorted in order of decreasing frequency to achieve
   good results.  For more details, see
-  `tf.nn.log_uniform_candidate_sampler`.
+  `tf.random.log_uniform_candidate_sampler`.
 
   Note: In the case where `num_true` > 1, we assign to each target class
   the target probability 1 / `num_true` so that the target probabilities
@@ -1855,7 +1860,7 @@
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+    loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1877,7 +1882,7 @@
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
       target classes.  Note that this format differs from the `labels` argument
-      of `nn.softmax_cross_entropy_with_logits_v2`.
+      of `nn.softmax_cross_entropy_with_logits`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
       the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1950,7 +1955,7 @@
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+    loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1968,7 +1973,7 @@
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a23ea37..05beacf 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1904,7 +1904,7 @@
       value is given it is replicated in the `H` and `W` dimension. By default
       the `N` and `C` dimensions are set to 1. The dimension order is determined
       by the value of `data_format`, see below for details.
-    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+    padding: Either the `string` `"SAME"` or `"VALID"` indicating the type of
       padding algorithm to use, or a list indicating the explicit paddings at
       the start and end of each dimension. When explicit padding is used and
       data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
@@ -3036,7 +3036,8 @@
       probability distribution e.g. for the case in which labels are of shape
       `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
       probability distribution.
-    logits: Unscaled log probabilities.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
     axis: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
@@ -3221,7 +3222,8 @@
       probability distribution e.g. for the case in which labels are of shape
       `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
       probability distribution.
-    logits: Unscaled log probabilities.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
     axis: Alias for dim.
@@ -3284,9 +3286,10 @@
       must be an index in `[0, num_classes)`. Other values will raise an
       exception when this op is run on CPU, and return `NaN` for corresponding
       loss and gradient rows on GPU.
-    logits: Unscaled log probabilities of shape
+    logits: Per-label activations (typically a linear output) of shape
       `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32`, or
-      `float64`.
+      `float64`. These activation energies are interpreted as unnormalized log
+      probabilities.
     name: A name for the operation (optional).
 
   Returns:
@@ -3978,6 +3981,25 @@
 max_pool_with_argmax_v1.__doc__ = gen_nn_ops.max_pool_with_argmax.__doc__
 
 
+@ops.RegisterStatistics("Conv3D", "flops")
+def _calc_conv3d_flops(graph, node):
+  """Calculates the compute resources needed for Conv3D."""
+  input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
+  input_shape.assert_is_fully_defined()
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
+  filter_shape.assert_is_fully_defined()
+  output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
+  output_shape.assert_is_fully_defined()
+  filter_time = int(filter_shape[0])
+  filter_height = int(filter_shape[1])
+  filter_width = int(filter_shape[2])
+  filter_in_depth = int(filter_shape[3])
+  output_count = np.prod(output_shape.as_list(), dtype=np.int64)
+  return ops.OpStats("flops", (output_count * filter_in_depth * filter_time *
+                               filter_height * filter_width * 2))
+
+
 @ops.RegisterStatistics("Conv2D", "flops")
 def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
@@ -4120,7 +4142,7 @@
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     name: A name for this operation (optional).
     rate: A scalar `Tensor` with the same type as `x`. The probability that each
       element of `x` is discarded.
@@ -4176,7 +4198,7 @@
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed` for behavior.
+      `tf.compat.v1.set_random_seed` for behavior.
     name: A name for this operation (optional).
 
   Returns:
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 001ae33..82ab32a 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -81,8 +81,12 @@
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
 
diff --git a/tensorflow/python/ops/parallel_for/array_test.py b/tensorflow/python/ops/parallel_for/array_test.py
index 883f28c..9568a07 100644
--- a/tensorflow/python/ops/parallel_for/array_test.py
+++ b/tensorflow/python/ops/parallel_for/array_test.py
@@ -23,6 +23,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
@@ -101,6 +102,16 @@
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
+  def test_broadcast_to(self):
+    x = random_ops.random_uniform([3, 2, 1, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return (array_ops.broadcast_to(x1, [2, 2, 3]),
+              array_ops.broadcast_to(x1, [1, 2, 1, 3]))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
   def test_expand_dims(self):
     x = random_ops.random_uniform([3, 2, 3])
 
@@ -112,6 +123,31 @@
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
+  def test_one_hot(self):
+    indices = random_ops.random_uniform(
+        [3, 2, 3], minval=0, maxval=4, dtype=dtypes.int32)
+
+    def loop_fn(i):
+      indices_i = array_ops.gather(indices, i)
+      return (array_ops.one_hot(indices_i, depth=4, on_value=2., off_value=-2.),
+              array_ops.one_hot(indices_i, depth=4, axis=1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_searchsorted(self):
+    sorted_inputs = math_ops.cumsum(random_ops.random_uniform([3, 2, 4]),
+                                    axis=-1)
+    values = random_ops.random_uniform([2, 3], minval=-1, maxval=4.5)
+
+    def loop_fn(i):
+      inputs_i = array_ops.gather(sorted_inputs, i)
+      return [array_ops.searchsorted(inputs_i, values, out_type=dtypes.int32,
+                                     side="left"),  # creates LowerBound op.
+              array_ops.searchsorted(inputs_i, values, out_type=dtypes.int64,
+                                     side="right")]  # creates UpperBound op.
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
   def test_slice(self):
     x = random_ops.random_uniform([3, 2, 3])
 
@@ -258,6 +294,20 @@
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
+  def test_matrix_band_part(self):
+    x = random_ops.random_uniform([3, 4, 2, 2])
+
+    # Each (num_lower, num_upper) pair must be exercised, so _test_loop_fn is
+    # called inside the loop; calling it afterwards only checks the last pair.
+    for num_lower, num_upper in ((0, -1), (-1, 0), (1, 1)):
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return array_ops.matrix_band_part(
+            array_ops.gather(x, i), num_lower=num_lower, num_upper=num_upper)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3)
+
   def test_matrix_diag_part(self):
     x = random_ops.random_uniform([3, 4, 2])
 
@@ -266,6 +316,19 @@
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
 
+  def test_matrix_set_diag(self):
+    matrices = random_ops.random_uniform([3, 4, 4])
+    diags = random_ops.random_uniform([3, 4])
+
+    def loop_fn(i):
+      matrix_i = array_ops.gather(matrices, i)
+      diag_i = array_ops.gather(diags, i)
+      return (array_ops.matrix_set_diag(matrix_i, diag_i),
+              array_ops.matrix_set_diag(matrices[0, ...], diag_i),
+              array_ops.matrix_set_diag(matrix_i, diags[0, ...]))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
   def test_strided_slice(self):
     with backprop.GradientTape(persistent=True) as g:
       x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index 5258d6a..89df51a 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -35,6 +35,7 @@
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None):
@@ -259,3 +260,79 @@
       else:
         outputs = tiled_outputs
       return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs))
+
+
+@tf_export("vectorized_map")
+def vectorized_map(fn, elems):
+  """Parallel map on the list of tensors unpacked from `elems` on dimension 0.
+
+
+  This method works similarly to tf.map_fn but is optimized to run much faster,
+  possibly with a much larger memory footprint. The speedups are obtained by
+  vectorization (see https://arxiv.org/pdf/1903.04243.pdf). The idea behind
+  vectorization is to semantically launch all the invocations of `fn` in
+  parallel and fuse corresponding operations across all these invocations. This
+  fusion is done statically at graph generation time and the generated code is
+  often similar in performance to a manually fused version.
+
+
+  For example, let's look at a method that calculates the outer product of a
+  matrix.
+
+  ```python
+  def outer_product(a):
+    return tf.tensordot(a, a, 0)
+
+  # outer_product was designed to not support batching.
+  c = outer_product(tf.ones((2, 3)))
+  # The shape is consistent
+  assert c.shape == (2, 3, 2, 3)
+  ```
+
+  Now suppose we want an efficient batched version of outer_product. We can
+  simply write:
+
+  ```python
+  batch_size = 100
+  a = tf.ones((batch_size, 32, 32))
+  c = tf.vectorized_map(outer_product, a)
+  assert c.shape == (batch_size, 32, 32, 32, 32)
+  ```
+
+  Because `tf.vectorized_map` fully parallelizes the batch, this method will
+  generally be significantly faster than using `tf.map_fn`, especially in eager
+  mode.
+
+  This is an experimental feature and currently has a lot of limitations:
+    - There should be no data dependency between the different semantic
+      invocations of `fn`, i.e. it should be safe to map the elements of the
+      inputs in any order.
+    - Stateful kernels are mostly not supported since these often imply a
+      data dependency. We do support a limited set of such stateful kernels
+      though (like RandomFoo, Variable operations like reads, etc).
+    - `fn` has limited support for control flow operations. `tf.cond` in
+      particular is not supported.
+    - `fn` should return nested structure of Tensors or Operations. However
+      if an Operation is returned, it should have zero outputs.
+    - The shape and dtype of `fn` outputs should not depend on the input
+      to `fn`.
+
+  Args:
+    fn: The callable to be performed. It accepts one argument, which will have
+      the same (possibly nested) structure as `elems`, and returns a possibly
+      nested structure of Tensors and Operations, which may be different than
+      the structure of `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension. The nested sequence of the
+      resulting slices will be mapped over by `fn`.
+
+  Returns:
+    A tensor or (possibly nested) sequence of tensors. Each tensor packs the
+    results of applying fn to tensors unpacked from elems along the first
+    dimension, from first to last.
+  """
+  def loop_fn(i):
+    gathered_elems = nest.map_structure(lambda x: array_ops.gather(x, i), elems)
+    return fn(gathered_elems)
+  batch_size = array_ops.shape(nest.flatten(elems)[0])[0]
+  return pfor(loop_fn, batch_size)
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 46e6541..ab98ef0 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -101,6 +101,12 @@
     with self.assertRaisesRegexp(ValueError, "Use for_loop instead"):
       pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1)
 
+  def test_vectorized_map(self):
+    def compute(x):
+      return math_ops.reduce_mean(x, axis=0, keepdims=True)
+    result = pfor_control_flow_ops.vectorized_map(
+        compute, array_ops.ones((10, 5, 3)))
+    self.run_and_assert_equal(result, array_ops.ones((10, 1, 3)))
 
 @test_util.run_all_in_graph_and_eager_modes
 class ReductionTest(PForTestCase):
@@ -387,6 +393,28 @@
 
         self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 6)
 
+  def test_log_softmax(self):
+    logits = random_ops.random_uniform([3, 2, 4])
+
+    def loop_fn(i):
+      logits_i = array_ops.gather(logits, i)
+      return (nn.log_softmax(logits_i),
+              nn.log_softmax(logits_i, axis=0),
+              nn.log_softmax(logits_i, axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_softmax(self):
+    logits = random_ops.random_uniform([3, 2, 4])
+
+    def loop_fn(i):
+      logits_i = array_ops.gather(logits, i)
+      return (nn.softmax(logits_i),
+              nn.softmax(logits_i, axis=0),
+              nn.softmax(logits_i, axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
   def test_softmax_cross_entropy_with_logits(self):
     with backprop.GradientTape(persistent=True) as g:
       logits = random_ops.random_uniform([3, 2, 4])
@@ -444,20 +472,62 @@
 
     self._test_loop_fn(loop_fn, 5)
 
-  def test_random_gamma(self):
+  def test_random_gamma_invariant_alpha(self):
 
     def loop_fn(_):
       return random_ops.random_gamma([3], alpha=[0.5])
 
     self._test_loop_fn(loop_fn, 5)
 
-  def test_random_poisson_v2(self):
+  def test_random_gamma_varying_alpha(self):
+    alphas = math_ops.exp(random_ops.random_normal([5, 3, 2]))
+
+    def loop_fn(i):
+      alphas_i = array_ops.gather(alphas, i)
+      # Test both scalar and non-scalar params and shapes.
+      return (random_ops.random_gamma(alpha=alphas_i[0, 0], shape=[]),
+              random_ops.random_gamma(alpha=alphas_i, shape=[]),
+              random_ops.random_gamma(alpha=alphas_i[0, 0], shape=[3]),
+              random_ops.random_gamma(alpha=alphas_i, shape=[3]))
+
+    self._test_loop_fn(loop_fn, 5, loop_fn_dtypes=[dtypes.float32] * 4)
+
+  def test_random_poisson_v2_invariant_rate(self):
 
     def loop_fn(_):
       return random_ops.random_poisson(lam=[1.3], shape=[3])
 
     self._test_loop_fn(loop_fn, 5)
 
+  def test_random_poisson_v2_varying_rate(self):
+    rates = math_ops.exp(random_ops.random_normal([5, 3, 2]))
+
+    def loop_fn(i):
+      rates_i = array_ops.gather(rates, i)
+      # Test both scalar and non-scalar params and shapes.
+      return (random_ops.random_poisson(lam=rates_i[0, 0], shape=[]),
+              random_ops.random_poisson(lam=rates_i, shape=[]),
+              random_ops.random_poisson(lam=rates_i[0, 0], shape=[3]),
+              random_ops.random_poisson(lam=rates_i, shape=[3]))
+
+    self._test_loop_fn(loop_fn, 5, loop_fn_dtypes=[dtypes.float32] * 4)
+
+  def test_random_multinomial_invariant_logits(self):
+
+    def loop_fn(_):
+      return random_ops.categorical(logits=[[1., -1.]], num_samples=3)
+
+    self._test_loop_fn(loop_fn, 5, loop_fn_dtypes=[dtypes.int64])
+
+  def test_random_multinomial_varying_logits(self):
+    logits = random_ops.random_normal([5, 3, 2])
+
+    def loop_fn(i):
+      logits_i = array_ops.gather(logits, i)
+      return random_ops.categorical(logits_i, num_samples=3)
+
+    self._test_loop_fn(loop_fn, 5, loop_fn_dtypes=[dtypes.int64])
+
 
 class LoggingTest(PForTestCase):
 
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
index 07fc73b..af6a019 100644
--- a/tensorflow/python/ops/parallel_for/math_test.py
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -25,6 +25,7 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -290,7 +291,7 @@
             self._test_loop_fn(loop_fn, 2)
 
   def test_batch_matmul_broadcast(self):
-    if not compat.forward_compatible(2019, 4, 18):
+    if not compat.forward_compatible(2019, 4, 25):
       self.skipTest("Skipping test for future functionality.")
     for broadcast_a in (True, False):
       for broadcast_b in (True, False):
@@ -330,6 +331,21 @@
 
           self._test_loop_fn(loop_fn, 2)
 
+  def test_boolean_reduction(self):
+    x = random_ops.random_uniform([2, 3, 4, 5]) > 0.5
+    for op in [math_ops.reduce_any, math_ops.reduce_all]:
+      for axis in ([1], None, [0, 2]):
+        for keepdims in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return op(a, axis=axis, keepdims=keepdims)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2, loop_fn_dtypes=[dtypes.bool])
+
   def test_cum_sum(self):
     x = random_ops.random_uniform([2, 3, 4, 5])
     for axis in (1, -2):
@@ -459,5 +475,52 @@
       self._test_loop_fn(loop_fn, 2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class LinalgTest(PForTestCase):
+
+  def test_cholesky(self):
+    z = random_ops.random_normal([2, 3, 3])
+    x = (math_ops.matmul(z, array_ops.matrix_transpose(z))  # Ensure pos. def.
+         + linalg_ops.eye(3))  # Ensure well-conditioned.
+
+    def loop_fn(i):
+      return linalg_ops.cholesky(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 2)
+
+  def test_log_matrix_determinant(self):
+    x = random_ops.random_normal([3, 4, 2, 2])
+
+    def loop_fn(i):
+      return linalg_ops.log_matrix_determinant(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_matrix_triangular_solve(self):
+    for lower in (True, False):
+      for adjoint in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (2, 4, 3, 3) if stack_a else (4, 3, 3)
+            shape_b = (2, 4, 3, 5) if stack_b else (4, 3, 5)
+            x = array_ops.matrix_band_part(
+                random_ops.random_uniform(shape_a)
+                + linalg_ops.eye(3),  # Ensure well-conditioned.
+                *((-1, 0) if lower else (0, -1)))  # Ensure triangular.
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return linalg_ops.matrix_triangular_solve(a, b,
+                                                        lower=lower,
+                                                        adjoint=adjoint)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 36ab1d73..1799e7f 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -34,8 +34,12 @@
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_parsing_ops
+from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import gen_sparse_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -1647,6 +1651,13 @@
     return wrap(output, True)
 
 
+@RegisterPForWithArgs("LogSoftmax", gen_nn_ops.log_softmax)
+@RegisterPForWithArgs("Softmax", gen_nn_ops.softmax)
+def _convert_softmax(pfor_input, op_type, op_func):
+  del op_type
+  return wrap(op_func(pfor_input.stacked_input(0)), True)
+
+
 # array_ops
 
 
@@ -1669,11 +1680,29 @@
 def _convert_reshape(pfor_input):
   t = pfor_input.stacked_input(0)
   shape = pfor_input.unstacked_input(1)
-  new_dim = array_ops.shape(t)[:1]
-  new_shape = array_ops.concat([new_dim, shape], axis=0)
+  new_shape = array_ops.concat([pfor_input.pfor.loop_len_vector, shape], axis=0)
   return wrap(array_ops.reshape(t, new_shape), True)
 
 
+@RegisterPFor("BroadcastTo")
+def _convert_broadcast_to(pfor_input):
+  t = pfor_input.stacked_input(0)
+  shape = pfor_input.unstacked_input(1)
+  new_shape = array_ops.concat([pfor_input.pfor.loop_len_vector, shape], axis=0)
+
+  # Expand dims of stacked t to broadcast against the new shape.
+  # TODO(davmre): consider factoring out common code with
+  # `expanddim_inputs_for_broadcast`, which has similar logic but with
+  # implicit shapes (of input Tensors) rather than explicit shapes.
+  rank_diff = array_ops.shape(new_shape)[0] - array_ops.rank(t)
+  ones = array_ops.tile([1], array_ops.reshape(rank_diff, [1]))
+  t_shape = array_ops.shape(t)
+  t_expanded_shape = array_ops.concat([t_shape[:1], ones, t_shape[1:]], axis=0)
+
+  return wrap(array_ops.broadcast_to(array_ops.reshape(t, t_expanded_shape),
+                                     new_shape), True)
+
+
 @RegisterPFor("ExpandDims")
 def _convert_expanddims(pfor_input):
   t = pfor_input.stacked_input(0)
@@ -1682,6 +1711,48 @@
   return wrap(array_ops.expand_dims(t, axis=dim), True)
 
 
+@RegisterPForWithArgs("LowerBound", gen_array_ops.lower_bound)
+@RegisterPForWithArgs("UpperBound", gen_array_ops.upper_bound)
+def _convert_searchsorted(pfor_input, _, op_func):
+  pfor_input.stack_inputs()
+  sorted_inputs = _flatten_first_two_dims(pfor_input.stacked_input(0))
+  values = _flatten_first_two_dims(pfor_input.stacked_input(1))
+  out_type = pfor_input.get_attr("out_type")
+  output = op_func(sorted_inputs, values, out_type)
+  return wrap(_unflatten_first_dim(
+      output, pfor_input.pfor.loop_len_vector), True)
+
+
+@RegisterPFor("MatrixBandPart")
+def _convert_matrix_band_part(pfor_input):
+  t = pfor_input.stacked_input(0)
+  num_lower = pfor_input.unstacked_input(1)
+  num_upper = pfor_input.unstacked_input(2)
+  return wrap(array_ops.matrix_band_part(
+      t, num_lower=num_lower, num_upper=num_upper), True)
+
+
+@RegisterPFor("MatrixSetDiag")
+def _convert_matrix_set_diag(pfor_input):
+  pfor_input.stack_inputs()
+  t = pfor_input.stacked_input(0)
+  diag = pfor_input.stacked_input(1)
+  return wrap(array_ops.matrix_set_diag(t, diag), True)
+
+
+@RegisterPFor("OneHot")
+def _convert_one_hot(pfor_input):
+  indices = pfor_input.stacked_input(0)
+  depth = pfor_input.unstacked_input(1)
+  on_value = pfor_input.unstacked_input(2)
+  off_value = pfor_input.unstacked_input(3)
+  axis = pfor_input.get_attr("axis")
+  if axis >= 0:
+    axis += 1
+  return wrap(
+      array_ops.one_hot(indices, depth, on_value, off_value, axis), True)
+
+
 @RegisterPFor("Slice")
 def _convert_slice(pfor_input):
   t = pfor_input.stacked_input(0)
@@ -2016,6 +2087,8 @@
 @RegisterPForWithArgs("Max", math_ops.reduce_max)
 @RegisterPForWithArgs("Min", math_ops.reduce_min)
 @RegisterPForWithArgs("Mean", math_ops.reduce_mean)
+@RegisterPForWithArgs("All", math_ops.reduce_all)
+@RegisterPForWithArgs("Any", math_ops.reduce_any)
 def _convert_reduction(pfor_input, _, op_func):
   t = pfor_input.stacked_input(0)
   indices = pfor_input.unstacked_input(1)
@@ -2141,6 +2214,7 @@
 @RegisterPForWithArgs("Invert", bitwise_ops.invert)
 @RegisterPForWithArgs("IsFinite", math_ops.is_finite)
 @RegisterPForWithArgs("IsInf", math_ops.is_inf)
+@RegisterPForWithArgs("IsNan", math_ops.is_nan)
 @RegisterPForWithArgs("LeftShift", bitwise_ops.left_shift)
 @RegisterPForWithArgs("Less", math_ops.less)
 @RegisterPForWithArgs("LessEqual", math_ops.less_equal)
@@ -2308,12 +2382,20 @@
 # random_ops
 
 
+def _transpose_dim_to_front(x, dim):
+  rank = array_ops.rank(x)
+  return array_ops.transpose(
+      x,
+      perm=array_ops.concat([
+          [dim],
+          math_ops.range(0, dim),
+          math_ops.range(dim + 1, rank)], axis=0))
+
+
 @RegisterPForWithArgs("RandomUniform")
 @RegisterPForWithArgs("RandomUniformInt")
 @RegisterPForWithArgs("RandomStandardNormal")
 @RegisterPForWithArgs("TruncatedNormal")
-@RegisterPForWithArgs("RandomGamma")
-@RegisterPForWithArgs("RandomPoissonV2")
 def _convert_random(pfor_input, op_type, *args, **kw_args):
   del args
   del kw_args
@@ -2331,6 +2413,94 @@
   return [wrap(x, True) for x in outputs]
 
 
+@RegisterPFor("RandomGamma")
+@RegisterPFor("RandomPoissonV2")
+def _convert_random_with_param(pfor_input):
+  shape = pfor_input.unstacked_input(0)
+  # param is lam (Poisson rate) or alpha (Gamma shape).
+  param, param_stacked, _ = pfor_input.input(1)
+  logging.warning(
+      "Note that %s inside pfor op may not give same output as "
+      "inside a sequential loop.", pfor_input.op_type)
+
+  if param_stacked:
+    samples = _create_op(
+        pfor_input.op_type,
+        inputs=[shape, param],
+        op_dtypes=[x.dtype for x in pfor_input.outputs],
+        attrs=pfor_input.op.node_def.attr).outputs[0]
+    loop_dim = array_ops.shape(shape)[0]
+    stacked_samples = _transpose_dim_to_front(samples, loop_dim)
+  else:
+    shape = array_ops.concat([pfor_input.pfor.loop_len_vector, shape], axis=0)
+    stacked_samples = _create_op(
+        pfor_input.op_type,
+        inputs=[shape, param],
+        op_dtypes=[x.dtype for x in pfor_input.outputs],
+        attrs=pfor_input.op.node_def.attr).outputs[0]
+
+  return wrap(stacked_samples, True)
+
+
+@RegisterPFor("Multinomial")
+def _convert_multinomial(pfor_input):
+  logits, logits_stacked, _ = pfor_input.input(0)
+  num_samples = pfor_input.unstacked_input(1)
+  seed = pfor_input.get_attr("seed")
+  seed2 = pfor_input.get_attr("seed2")
+  output_dtype = pfor_input.get_attr("output_dtype")
+  logging.warning(
+      "Note that Multinomial inside pfor op may not give same output as "
+      "inside a sequential loop.")
+
+  n = pfor_input.pfor.loop_len_vector[0]
+  if logits_stacked:
+    flattened_logits = _flatten_first_two_dims(logits)
+    samples = gen_random_ops.multinomial(
+        flattened_logits,
+        num_samples,
+        seed=seed, seed2=seed2, output_dtype=output_dtype)
+    stacked_samples = _unflatten_first_dim(samples, [n])
+  else:
+    samples = gen_random_ops.multinomial(
+        logits, num_samples * n,
+        seed=seed, seed2=seed2, output_dtype=output_dtype)
+    stacked_samples = array_ops.transpose(
+        array_ops.reshape(samples, [-1, n, num_samples]), [1, 0, 2])
+
+  return wrap(stacked_samples, True)
+
+
+# linalg_ops
+
+
+@RegisterPFor("Cholesky")
+def _convert_cholesky(pfor_input):
+  t = pfor_input.stacked_input(0)
+  return wrap(linalg_ops.cholesky(t), True)
+
+
+@RegisterPFor("LogMatrixDeterminant")
+def _convert_log_matrix_determinant(pfor_input):
+  # Input must have shape [N, M, M], so we need to flatten.
+  t = _flatten_first_two_dims(pfor_input.stacked_input(0))
+  sign, log_abs_det = linalg_ops.log_matrix_determinant(t)
+  return [wrap(_unflatten_first_dim(x, pfor_input.pfor.loop_len_vector), True)
+          for x in (sign, log_abs_det)]
+
+
+@RegisterPFor("MatrixTriangularSolve")
+def _convert_matrix_triangular_solve(pfor_input):
+  pfor_input.stack_inputs()
+  matrix = pfor_input.stacked_input(0)
+  rhs = pfor_input.stacked_input(1)
+  lower = pfor_input.get_attr("lower")
+  adjoint = pfor_input.get_attr("adjoint")
+  output = linalg_ops.matrix_triangular_solve(
+      matrix, rhs, lower=lower, adjoint=adjoint)
+  return wrap(output, True)
+
+
 # logging_ops
 
 
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index a84af6c..7887451 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -41,6 +41,7 @@
 
 
 ops.NotDifferentiable("DecodeRaw")
+ops.NotDifferentiable("DecodePaddedRaw")
 ops.NotDifferentiable("ParseTensor")
 ops.NotDifferentiable("SerializeTensor")
 ops.NotDifferentiable("StringToNumber")
@@ -1827,6 +1828,91 @@
     return (context_output, feature_list_output)
 
 
+@tf_export("io.decode_raw", v1=[])
+def decode_raw(input_bytes,
+               out_type,
+               little_endian=True,
+               fixed_length=None,
+               name=None):
+  """Convert raw byte strings into tensors.
+
+  Args:
+    input_bytes:
+      Each element of the input Tensor is converted to an array of bytes.
+    out_type:
+      `DType` of the output. Acceptable types are `half`, `float`, `double`,
+      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`.
+    little_endian:
+      Whether the `input_bytes` data is in little-endian format. Data will be
+      converted into host byte order if necessary.
+    fixed_length:
+      If set, the first `fixed_length` bytes of each element will be converted.
+      Data will be zero-padded or truncated to the specified length.
+
+      `fixed_length` must be a multiple of the size of `out_type`.
+      `fixed_length` must be specified if the elements of `input_bytes` are of
+      variable length.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` object storing the decoded bytes.
+
+  """
+  if fixed_length is not None:
+    return gen_parsing_ops.decode_padded_raw(
+        input_bytes,
+        fixed_length=fixed_length,
+        out_type=out_type,
+        little_endian=little_endian,
+        name=name)
+  else:
+    return gen_parsing_ops.decode_raw(
+        input_bytes, out_type, little_endian=little_endian, name=name)
+
+
+@tf_export(v1=["decode_raw", "io.decode_raw"])
+@deprecation.deprecated_args(None,
+                             "bytes is deprecated, use input_bytes instead",
+                             "bytes")
+def decode_raw_v1(
+    input_bytes=None,
+    out_type=None,
+    little_endian=True,
+    name=None,
+    bytes=None  # pylint: disable=redefined-builtin
+):
+  """Convert raw byte strings into tensors.
+
+  Args:
+    input_bytes:
+      Each element of the input Tensor is converted to an array of bytes.
+    out_type:
+      `DType` of the output. Acceptable types are `half`, `float`, `double`,
+      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`.
+    little_endian:
+      Whether the `input_bytes` data is in little-endian format. Data will be
+      converted into host byte order if necessary.
+    name: A name for the operation (optional).
+    bytes: Deprecated parameter. Use `input_bytes` instead.
+
+  Returns:
+    A `Tensor` object storing the decoded bytes.
+  """
+  input_bytes = deprecation.deprecated_argument_lookup("input_bytes",
+                                                       input_bytes, "bytes",
+                                                       bytes)
+
+  # out_type is a required positional argument in the original API, and had to
+  # be changed to a keyword argument in order to facilitate the transition from
+  # the reserved name `bytes` to `input_bytes`. Ensure it's still set.
+  if out_type is None:
+    raise ValueError(
+        "decode_raw_v1() missing 1 positional argument: 'out_type'")
+
+  return gen_parsing_ops.decode_raw(
+      input_bytes, out_type, little_endian=little_endian, name=name)
+
+
 # Swap `name` and `na_value` for backward compatibility.
 @tf_export(v1=["io.decode_csv", "decode_csv"])
 @deprecation.deprecated_endpoints("decode_csv")
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 30c660b..d8fb74f 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -27,6 +27,7 @@
         ":ragged_batch_gather_ops",
         ":ragged_batch_gather_with_default_op",
         ":ragged_concat_ops",
+        ":ragged_config",
         ":ragged_conversion_ops",
         ":ragged_dispatch",
         ":ragged_factory_ops",
@@ -39,6 +40,7 @@
         ":ragged_string_ops",
         ":ragged_tensor",
         ":ragged_tensor_shape",
+        ":ragged_tensor_spec",
         ":ragged_tensor_value",
         ":ragged_util",
         ":ragged_where_op",
@@ -270,10 +272,33 @@
 )
 
 py_library(
+    name = "ragged_squeeze_op",
+    srcs = ["ragged_squeeze_op.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:ops",
+    ],
+)
+
+py_library(
+    name = "ragged_config",
+    srcs = ["ragged_config.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+py_library(
     name = "ragged_tensor",
     srcs = ["ragged_tensor.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_config",
         ":ragged_tensor_value",
         ":ragged_util",
         ":segment_id_ops",
@@ -350,6 +375,7 @@
     srcs = ["segment_id_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_config",
         ":ragged_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -391,6 +417,7 @@
         ":ragged_array_ops",
         ":ragged_batch_gather_ops",
         ":ragged_math_ops",
+        ":ragged_squeeze_op",
         ":ragged_tensor",
         ":ragged_tensor_shape",
         ":ragged_util",
@@ -408,6 +435,18 @@
     ],
 )
 
+py_library(
+    name = "ragged_tensor_spec",
+    srcs = ["ragged_tensor_spec.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_spec",
+    ],
+)
+
 #-------------------------------------------------------------------------------
 # RaggedTensor Tests
 #-------------------------------------------------------------------------------
@@ -1012,3 +1051,19 @@
         "//tensorflow/python:platform_test",
     ],
 )
+
+py_test(
+    name = "ragged_squeeze_op_test",
+    srcs = ["ragged_squeeze_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_squeeze_op",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
index be1ccd9..7e67401 100644
--- a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -41,6 +41,17 @@
       dict(pylist=[[1, 2], [3]]),
       dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.float32),
       dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.string),
+      # Note: Conversion of a single np.array is tested below. These tests
+      # check nestings consisting of multiple or irregularly-shaped np.arrays.
+      dict(
+          pylist=[np.array([1, 2]), np.array([3])],
+          preferred_dtype=dtypes.string),
+      dict(pylist=np.array([[1, 2], [3]]), preferred_dtype=dtypes.float32),
+      dict(pylist=np.array([[1, 2], [3]]), preferred_dtype=dtypes.string),
+      dict(
+          pylist=[np.array([[1], np.array([2])]), [np.array([3])]],
+          preferred_dtype=dtypes.float32),
+      dict(pylist=[np.array(1)], preferred_dtype=dtypes.string),
   ])
   def testConvertRaggedTensor(self, pylist, dtype=None, preferred_dtype=None):
     rt = ragged_factory_ops.constant(pylist)
@@ -55,6 +66,11 @@
           message=('Tensor conversion requested dtype float32 for '
                    'RaggedTensor with dtype int32')),
       dict(
+          pylist=np.array([[1, 2], [3, 4]]),
+          dtype=dtypes.float32,
+          message=('Tensor conversion requested dtype float32 for '
+                   'RaggedTensor with dtype int32')),
+      dict(
           pylist=[[1, 2], [3, 4]],
           dtype=dtypes.string,
           message=('Tensor conversion requested dtype string for '
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index 8c62cc4..5b36388 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -23,7 +23,6 @@
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
@@ -122,6 +121,8 @@
     data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
     mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         mask, dtypes.bool, name='mask')
+    row_splits_dtype, (data, mask) = ragged_tensor.match_row_splits_dtypes(
+        data, mask, return_dtype=True)
 
     # Get static rank of mask.
     if mask.shape.ndims is None:
@@ -132,8 +133,9 @@
     # If mask is ragged, then recurse with a non-ragged mask.
     if ragged_tensor.is_ragged(mask):
       if not ragged_tensor.is_ragged(data):
-        data = ragged_conversion_ops.from_tensor(
-            data, ragged_rank=mask.ragged_rank)
+        data = ragged_tensor.RaggedTensor.from_tensor(
+            data, ragged_rank=mask.ragged_rank,
+            row_splits_dtype=mask.row_splits.dtype)
       # Check that mask.nested_row_splits is a prefix of
       # data.nested_row_splits.
       splits_list = [
@@ -152,7 +154,7 @@
             # Count the number of True mask values in each row to find the
             # lengths of the filtered rows; then convert to splits.
             int_mask = ragged_functional_ops.map_flat_values(
-                math_ops.cast, mask, dtype=dtypes.int64)
+                math_ops.cast, mask, dtype=row_splits_dtype)
             masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
             splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
           mask = mask.values
@@ -164,7 +166,7 @@
         # Add the ragged `splits` back to the result.
         if keepdims:
           masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
-              masked_values, splits)
+              masked_values, splits, validate=False)
 
         return masked_values
 
@@ -187,13 +189,15 @@
       masked_values = boolean_mask(data.values, segment_mask, keepdims=False)
 
       return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
-                                                        masked_splits)
+                                                        masked_splits,
+                                                        validate=False)
 
     # If mask is non-ragged and has rank>1, then convert it to be ragged,
     # with a ragged rank matching data.
     if ragged_tensor.is_ragged(data):
-      mask = ragged_conversion_ops.from_tensor(
-          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1))
+      mask = ragged_tensor.RaggedTensor.from_tensor(
+          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1),
+          row_splits_dtype=data.row_splits.dtype)
       return boolean_mask(data, mask, keepdims)
 
     # Otherwise, data and mask are both `Tensor`s.
@@ -206,20 +210,21 @@
         # number of values it contains.  Then flatten that to get a list of
         # cell lengths, and convert it to splits.  Finally, combine the splits
         # and values to get the innermost ragged tensor.
-        masked_lengths = math_ops.count_nonzero(mask, axis=-1)
+        masked_lengths = math_ops.count_nonzero(mask, axis=-1,
+                                                dtype=row_splits_dtype)
         flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
         masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
-            masked_values, flattened_masked_lengths)
+            masked_values, flattened_masked_lengths, validate=False)
 
         # Wrap remaining ragged dimensions.
         if mask.shape.ndims > 2 and keepdims:
-          mask_shape = array_ops.shape(mask, out_type=dtypes.int64)
+          mask_shape = array_ops.shape(mask, out_type=row_splits_dtype)
           split_size = math_ops.cumprod(mask_shape) + 1
           for dim in range(mask.shape.ndims - 3, -1, -1):
             elt_size = mask_shape[dim + 1]
             masked_splits = math_ops.range(split_size[dim]) * elt_size
             masked_values = ragged_tensor.RaggedTensor.from_row_splits(
-                masked_values, masked_splits)
+                masked_values, masked_splits, validate=False)
 
       return masked_values
 
@@ -254,11 +259,11 @@
   with ops.name_scope(name, 'RaggedTile', [input, multiples]):
     input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         input, name='input')
-    multiples = ragged_util.convert_to_int_tensor(
-        multiples, name='multiples', dtype=dtypes.int64)
-    multiples.shape.assert_has_rank(1)
     if not ragged_tensor.is_ragged(input):
       return array_ops.tile(input, multiples, name)
+    multiples = ragged_util.convert_to_int_tensor(
+        multiples, name='multiples', dtype=input.row_splits.dtype)
+    multiples.shape.assert_has_rank(1)
 
     # If the constant value of `multiples` is available, then we can use it
     # to skip tiling dimensions where `multiples=1`.
@@ -266,7 +271,8 @@
 
     return ragged_tensor.RaggedTensor.from_nested_row_splits(
         _tile_ragged_values(input, multiples, const_multiples),
-        _tile_ragged_splits(input, multiples, const_multiples))
+        _tile_ragged_splits(input, multiples, const_multiples),
+        validate=False)
 
 
 def _tile_ragged_values(rt_input, multiples, const_multiples=None):
@@ -343,7 +349,7 @@
       dimensions where `multiples=1`.
 
   Returns:
-    A list of 1-D `int64` `Tensor`s (one for each ragged dimension in
+    A list of 1-D integer `Tensor`s (one for each ragged dimension in
     `rt_input`).
 
   #### Example:
@@ -481,7 +487,8 @@
       values = expand_dims(input.values, axis - 1)
       splits = input.row_splits
 
-    return ragged_tensor.RaggedTensor.from_row_splits(values, splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(values, splits,
+                                                      validate=False)
 
 
 #===============================================================================
@@ -515,40 +522,6 @@
 
 
 #===============================================================================
-# Internal Helper Functions
-#===============================================================================
-
-
-def _increase_ragged_rank_to(rt_input, ragged_rank):
-  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
-  if ragged_rank > 0:
-    if not ragged_tensor.is_ragged(rt_input):
-      rt_input = ragged_conversion_ops.from_tensor(rt_input)
-    if rt_input.ragged_rank < ragged_rank:
-      rt_input = rt_input.with_values(
-          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
-  return rt_input
-
-
-def _concat_ragged_splits(splits_list):
-  """Concatenates a list of RaggedTensor splits to form a single splits."""
-  pieces = [splits_list[0]]
-  splits_offset = splits_list[0][-1]
-  for splits in splits_list[1:]:
-    pieces.append(splits[1:] + splits_offset)
-    splits_offset += splits[-1]
-  return array_ops.concat(pieces, axis=0)
-
-
-def _nrows(rt_input, out_type=dtypes.int64, name=None):
-  if isinstance(rt_input, ragged_tensor.RaggedTensor):
-    return rt_input.nrows(out_type=out_type, name=name)
-  else:
-    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
-      return array_ops.shape(rt_input, out_type=out_type)[0]
-
-
-#===============================================================================
 # ragged.rank
 #===============================================================================
 def rank(input, name=None):  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
index d4f6d37..cc8bebb 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
@@ -18,12 +18,10 @@
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
@@ -60,7 +58,7 @@
     ```python
     >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
     >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
-    >>> tf.batch_gather(params, indices)
+    >>> tf.compat.v1.batch_gather(params, indices)
     [['b', 'c', 'a'], [], [], ['e', 'e']]
     ```
   """
@@ -72,6 +70,7 @@
         params, name='params')
     indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
+    params, indices = ragged_tensor.match_row_splits_dtypes(params, indices)
     indices_ndims = indices.shape.ndims
     if indices_ndims is None:
       raise ValueError(
@@ -88,7 +87,8 @@
         checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
         with ops.control_dependencies(checks):
           return ragged_tensor.RaggedTensor.from_row_splits(
-              batch_gather(params.values, indices.values), indices.row_splits)
+              batch_gather(params.values, indices.values), indices.row_splits,
+              validate=False)
 
       # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
       else:
@@ -97,18 +97,20 @@
           if params.shape.ndims is not None and params.shape.ndims < 2:
             raise ValueError('batch shape from indices does '
                              'not match params shape')
-          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+          params = ragged_tensor.RaggedTensor.from_tensor(
+              params, ragged_rank=1,
+              row_splits_dtype=indices.row_splits.dtype)
 
         # Adjust indices from within-batch to global (in params.values), and
         # then use ragged.gather to gather them.
         num_indices = indices.row_lengths()
         params_starts = params.row_starts()
         adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
-        adjusted_index_values = math_ops.cast(
-            indices.values, dtypes.int64) + adjustments
+        adjusted_index_values = (
+            math_ops.cast(indices.values, adjustments.dtype) + adjustments)
         return ragged_tensor.RaggedTensor.from_row_splits(
             ragged_gather_ops.gather(params.values, adjusted_index_values),
-            indices.row_splits)
+            indices.row_splits, validate=False)
 
     else:  # params is a RaggedTensor and indices is a Tensor.
       if indices_ndims == 1:
@@ -116,7 +118,8 @@
       elif indices_ndims == 2:
         # Adjust indices from batch-local to global (in params.values)
         adjustments = array_ops.expand_dims(params.row_starts(), 1)
-        adjusted_indices = math_ops.cast(indices, dtypes.int64) + adjustments
+        adjusted_indices = (
+            math_ops.cast(indices, adjustments.dtype) + adjustments)
         return ragged_gather_ops.gather(params.values, adjusted_indices)
       else:
         raise ValueError('batch shape from indices does not match params shape')
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
index 049829d..b10524c 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
@@ -20,7 +20,6 @@
 
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -81,6 +80,9 @@
     default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         default_value, name='default_value',
     )
+    row_splits_dtype, (params, indices, default_value) = (
+        ragged_tensor.match_row_splits_dtypes(params, indices, default_value,
+                                              return_dtype=True))
     # TODO(hterry): lift this restriction and support default_values of
     #               of rank > 1
     if (default_value.shape.ndims is not 0
@@ -113,7 +115,7 @@
             axis=-1)
         upper_bounds = math_ops.cast(row_lengths, indices.dtype)
 
-        pad_shape = _get_pad_shape(params, indices)
+        pad_shape = _get_pad_shape(params, indices, row_splits_dtype)
 
         pad = ragged_tensor_shape.broadcast_to(
             default_value, pad_shape)
@@ -144,11 +146,11 @@
           params=padded_params, indices=adjusted_indices, name=name)
 
 
-def _get_pad_shape(params, indices):
+def _get_pad_shape(params, indices, row_splits_dtype):
   """Gets the RaggedTensorDynamicShape for the pad tensor."""
   num_batch_dimensions = indices.shape.ndims - 1
   params_shape = ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(
-      params)
+      params, dim_size_dtype=row_splits_dtype)
 
   # We want to create a pad tensor that can be concatenated with the params.
   if params.shape.ndims == indices.shape.ndims:
@@ -169,8 +171,8 @@
     # has size 1.
     pad_dims = None
     if num_batch_dimensions == 0:
-      pad_dims = (constant_op.constant(1, dtype=dtypes.int64),) + (
-          constant_op.constant([1], dtype=dtypes.int64),) * (
+      pad_dims = (constant_op.constant(1, dtype=row_splits_dtype),) + (
+          constant_op.constant([1], dtype=row_splits_dtype),) * (
               params_shape.num_partitioned_dimensions -
               num_batch_dimensions - 1)
     else:
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py
index 0f86b05..30fe753 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py
@@ -24,7 +24,6 @@
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
@@ -135,6 +134,9 @@
       ragged_tensor.convert_to_tensor_or_ragged_tensor(
           rt_input, name='rt_input') for rt_input in rt_inputs
   ]
+  row_splits_dtype, rt_inputs = ragged_tensor.match_row_splits_dtypes(
+      *rt_inputs, return_dtype=True)
+  rt_inputs = list(rt_inputs)
 
   # Special case: if there's only one input, then return it as-is.
   if len(rt_inputs) == 1:
@@ -168,12 +170,13 @@
   # possible to concatenate Tensors and RaggedTensors together.
   for i in range(len(rt_inputs)):
     if not ragged_tensor.is_ragged(rt_inputs[i]):
-      rt_inputs[i] = ragged_conversion_ops.from_tensor(
-          rt_inputs[i], ragged_rank=1)
+      rt_inputs[i] = ragged_tensor.RaggedTensor.from_tensor(
+          rt_inputs[i], ragged_rank=1, row_splits_dtype=row_splits_dtype)
 
   # Convert the input tensors to all have the same ragged_rank.
   ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
-  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank, row_splits_dtype)
+               for rt in rt_inputs]
 
   if axis == 0:
     return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
@@ -185,7 +188,7 @@
     with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
       return ragged_tensor.RaggedTensor.from_row_splits(
           _ragged_stack_concat_helper(values, axis - 1, stack_values),
-          splits[0][0])
+          splits[0][0], validate=False)
 
 
 def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
@@ -220,7 +223,7 @@
     concatenated_nested_splits.insert(0, stack_splits)
 
   return ragged_tensor.RaggedTensor.from_nested_row_splits(
-      concatenated_flat_values, concatenated_nested_splits)
+      concatenated_flat_values, concatenated_nested_splits, validate=False)
 
 
 def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
@@ -263,15 +266,15 @@
       # Add a new splits tensor to group together the values.
       stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
       _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
-                                                        stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          permuted_rt, stack_splits, validate=False)
     else:
       # Merge together adjacent rows by dropping the row-split indices that
       # separate them.
       concat_splits = permuted_rt.row_splits[::num_inputs]
       _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
-                                                        concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          permuted_rt.values, concat_splits, validate=False)
 
 
 def _copy_row_shape(rt_inputs, splits):
@@ -281,14 +284,16 @@
       splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
 
 
-def _increase_ragged_rank_to(rt_input, ragged_rank):
+def _increase_ragged_rank_to(rt_input, ragged_rank, row_splits_dtype):
   """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
   if ragged_rank > 0:
     if not ragged_tensor.is_ragged(rt_input):
-      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+      rt_input = ragged_tensor.RaggedTensor.from_tensor(
+          rt_input, row_splits_dtype=row_splits_dtype)
     if rt_input.ragged_rank < ragged_rank:
       rt_input = rt_input.with_values(
-          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1,
+                                   row_splits_dtype))
   return rt_input
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_config.py b/tensorflow/python/ops/ragged/ragged_config.py
new file mode 100644
index 0000000..9105668
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_config.py
@@ -0,0 +1,33 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Configuration parameters for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def auto_cast_partition_dtype():
+  """Whether incompatible row-partitioning dtypes should be auto-converted.
+
+  If true, then operations that combine RaggedTensors but have different
+  row-partitioning tensor dtypes will be automatically cast to a
+  compatible dtype (`tf.int64`).  If false, then such operations will result
+  in an error.
+
+  Returns:
+    `bool`
+  """
+  return False
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
index 29a9bdf..895c269 100644
--- a/tensorflow/python/ops/ragged/ragged_const_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 from absl.testing import parameterized
+import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -65,6 +66,27 @@
           ragged_rank=1,
           inner_shape=(2,),
           expected_shape=(3, None, 2)),
+      # 3-dimensional tensors with numpy arrays
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
       #=========================================================================
       # 4-dimensional tensors.
       dict(
@@ -86,14 +108,23 @@
                   [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
           inner_shape=(2, 2),
           expected_shape=(2, None, 2, 2)),
+      # 4-dimensional tensors with numpy arrays
+      dict(
+          pylist=np.array([[[np.array([1, 2]), [3, 4]], [[5, 6], [7, 8]]],
+                           np.array([[[2, 4], [6, 8]], [[1, 5], [7, 9]]])]),
+          expected_shape=(2, None, None, None)),
 
       #=========================================================================
       # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
       dict(pylist=[], expected_shape=(0,)),
-      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(pylist=[[], [], np.array([])], expected_shape=(3, None)),
       dict(
           pylist=[[[], []], [], [[], [[]]]],
           expected_shape=(3, None, None, None)),
+      dict(
+          pylist=np.array([np.array([[], []]),
+                           np.array([]), [[], [[]]]]),
+          expected_shape=(3, None, None, None)),
 
       #=========================================================================
       # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
@@ -113,6 +144,11 @@
       dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
       dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
       dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+      dict(
+          pylist=np.array([]),
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
 
       #=========================================================================
       # default/inferred dtypes
@@ -123,6 +159,9 @@
       dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=dtypes.float32),
       dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=dtypes.string),
       dict(pylist=[[True]], expected_dtype=dtypes.bool),
+      dict(
+          pylist=[np.array([1, 2]), np.array([3.]), [4, 5, 6]],
+          expected_dtype=dtypes.float32),
 
       #=========================================================================
       # explicit dtypes
@@ -133,8 +172,9 @@
       dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.float32),
       dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float16),
       dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float32),
-      dict(pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
-           dtype=dtypes.string),
+      dict(
+          pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
+          dtype=dtypes.string),
   )
   def testRaggedConst(self,
                       pylist,
@@ -161,6 +201,9 @@
     """
     rt = ragged_factory_ops.constant(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
+    # Normalize the pylist, i.e., convert all np.arrays to list.
+    # E.g., [np.array((1,2))] --> [[1,2]]
+    pylist = self._normalize_pylist(pylist)
 
     # If dtype was explicitly specified, check it.
     if dtype is not None:
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
index 7f47459..150c917 100644
--- a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -64,6 +64,27 @@
           ragged_rank=1,
           inner_shape=(2,),
           expected_shape=(3, None, 2)),
+      # 3-dimensional tensors with numpy arrays
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], np.array([3, np.array(4)])],
+                  np.array([]), [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
       #=========================================================================
       # 4-dimensional tensors.
       dict(
@@ -85,14 +106,23 @@
                   [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
           inner_shape=(2, 2),
           expected_shape=(2, None, 2, 2)),
+      # 4-dimensional tensors with numpy arrays
+      dict(
+          pylist=np.array([[[np.array([1, 2]), [3, 4]], [[5, 6], [7, 8]]],
+                           np.array([[[2, 4], [6, 8]], [[1, 5], [7, 9]]])]),
+          expected_shape=(2, None, None, None)),
 
       #=========================================================================
       # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
       dict(pylist=[], expected_shape=(0,)),
-      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(pylist=[[], [], np.array([])], expected_shape=(3, None)),
       dict(
           pylist=[[[], []], [], [[], [[]]]],
           expected_shape=(3, None, None, None)),
+      dict(
+          pylist=np.array([np.array([[], []]),
+                           np.array([]), [[], [[]]]]),
+          expected_shape=(3, None, None, None)),
 
       #=========================================================================
       # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
@@ -112,6 +142,11 @@
       dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
       dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
       dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+      dict(
+          pylist=np.array([]),
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
 
       #=========================================================================
       # default/inferred dtypes.
@@ -126,6 +161,9 @@
       dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=np.float64),
       dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=np.dtype('S1')),
       dict(pylist=[[True]], expected_dtype=np.bool),
+      dict(
+          pylist=[np.array([1, 2]), np.array([3.]), [4, 5, 6]],
+          expected_dtype=np.float64),
 
       #=========================================================================
       # explicit dtypes
@@ -150,7 +188,9 @@
     """Tests that `ragged_value(pylist).to_list() == pylist`."""
     rt = ragged_factory_ops.constant_value(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
-
+    # Normalize the pylist, i.e., convert all np.arrays to list.
+    # E.g., [np.array((1,2))] --> [[1,2]]
+    pylist = self._normalize_pylist(pylist)
     # If dtype was explicitly specified, check it.
     if dtype is not None:
       self.assertEqual(rt.dtype, dtype)
@@ -193,6 +233,12 @@
           exception=ValueError,
           message='Invalid pylist=12: incompatible with ragged_rank=1'),
       dict(
+          pylist=np.array(12),
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=array\\(12\\): incompatible with '
+          'ragged_rank=1'),
+      dict(
           pylist=12,
           inner_shape=(1,),
           exception=ValueError,
diff --git a/tensorflow/python/ops/ragged/ragged_conversion_ops.py b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
index 854c5b3..8e06a2d 100644
--- a/tensorflow/python/ops/ragged/ragged_conversion_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
@@ -18,15 +18,22 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.ragged import ragged_tensor
 
 
-def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1, name=None):
+def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1,
+                row_splits_dtype=dtypes.int64, name=None):
   if ragged_tensor.is_ragged(tensor):
     return tensor
   else:
-    return ragged_tensor.RaggedTensor.from_tensor(tensor, lengths, padding,
-                                                  ragged_rank, name)
+    return ragged_tensor.RaggedTensor.from_tensor(
+        tensor,
+        lengths=lengths,
+        padding=padding,
+        ragged_rank=ragged_rank,
+        row_splits_dtype=row_splits_dtype,
+        name=name)
 
 
 def to_tensor(rt_input, default_value=None, name=None):
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index 1115eee..50d9079 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -36,10 +36,12 @@
 from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_squeeze_op
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import ragged_where_op
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import dispatch
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
@@ -126,6 +128,7 @@
         elif not _is_convertible_to_tensor(elt):
           return self.NOT_SUPPORTED
       if found_ragged:
+        x = ragged_tensor.match_row_splits_dtypes(*x)
         nested_splits_lists = [
             elt.nested_row_splits for elt in x if ragged_tensor.is_ragged(elt)
         ]
@@ -137,7 +140,7 @@
             ragged_util.assert_splits_match(nested_splits_lists)):
           return ragged_tensor.RaggedTensor.from_nested_row_splits(
               self._original_op(flat_values, *args, **kwargs),
-              nested_splits_lists[0])
+              nested_splits_lists[0], validate=False)
       else:
         return self.NOT_SUPPORTED
     else:
@@ -197,6 +200,9 @@
     except (TypeError, ValueError):
       return self.NOT_SUPPORTED
 
+    if x_is_ragged and y_is_ragged:
+      x, y = ragged_tensor.match_row_splits_dtypes(x, y)
+
     if ((x_is_ragged and y_is_ragged) or
         (x_is_ragged and x.flat_values.shape.ndims <= y.shape.ndims) or
         (y_is_ragged and y.flat_values.shape.ndims <= x.shape.ndims)):
@@ -270,16 +276,6 @@
     return found_ragged
 
 
-def ragged_dispatch(original_op, tensor_args):
-
-  def decorator(ragged_op):
-    dispatch.RaggedDispatcher(original_op, ragged_op,
-                              tensor_args).register(original_op)
-    return ragged_op
-
-  return decorator
-
-
 _UNARY_ELEMENTWISE_OPS = [
     array_ops.check_numerics,
     array_ops.identity,
@@ -432,6 +428,11 @@
   return ragged_array_ops.size(input=input, out_type=out_type, name=name)
 
 
+def _ragged_squeeze_v1(input, axis=None, name=None, squeeze_dims=None):  # pylint: disable=redefined-builtin
+  axis = deprecation.deprecated_argument_lookup('axis', axis, 'squeeze_dims',
+                                                squeeze_dims)
+  return ragged_squeeze_op.squeeze(input, axis, name)
+
 # (original_op, ragged_op, ragged_args)
 _RAGGED_DISPATCH_OPS = [
     (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather,
@@ -442,11 +443,13 @@
     (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
     (array_ops.gather_v2, ragged_gather_ops.gather, ['params', 'indices']),
     (array_ops.gather_nd, _ragged_gather_nd_v1, ['params', 'indices']),
-    (array_ops.gather_nd_v2, ragged_gather_ops.gather_nd,
-     ['params', 'indices']),
+    (array_ops.gather_nd_v2, ragged_gather_ops.gather_nd, ['params',
+                                                           'indices']),
     (array_ops.rank, ragged_array_ops.rank, ['input']),
     (array_ops.size, _ragged_size_v1, ['input']),
     (array_ops.size_v2, ragged_array_ops.size, ['input']),
+    (array_ops.squeeze, _ragged_squeeze_v1, ['input']),
+    (array_ops.squeeze_v2, ragged_squeeze_op.squeeze, ['input']),
     (array_ops.stack, ragged_concat_ops.stack, ['[values]']),
     (array_ops.tile, ragged_array_ops.tile, ['input']),
     (array_ops.where, ragged_where_op.where, ['condition', 'x', 'y']),
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 04ef0d7..b695fc2 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -695,6 +695,20 @@
           op=array_ops.size_v2,
           kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
           expected=3),
+      dict(
+          op=array_ops.squeeze,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[[1, 2, 3], [4, 5]]]),
+              'axis': [0]
+          },
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [4, 5]])),
+      dict(
+          op=array_ops.squeeze_v2,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[[1, 2, 3], [4, 5]]]),
+              'axis': [0]
+          },
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [4, 5]])),
   ])
   def testRaggedDispatch(self, op, expected, args=(), kwargs=None):
     if kwargs is None: kwargs = {}
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
index ab72146..5c654c6 100644
--- a/tensorflow/python/ops/ragged/ragged_factory_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -34,7 +34,8 @@
 # Op to construct a constant RaggedTensor from a nested Python list.
 #===============================================================================
 @tf_export("ragged.constant")
-def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
+def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None,
+             name=None, row_splits_dtype=dtypes.int64):
   """Constructs a constant RaggedTensor from a nested Python list.
 
   Example:
@@ -50,8 +51,9 @@
   `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
 
   Args:
-    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
-      or `tuple` must be a scalar value compatible with `dtype`.
+    pylist: A nested `list`, `tuple` or `np.ndarray`.  Any nested element that
+      is not a `list`, `tuple` or `np.ndarray` must be a scalar value
+      compatible with `dtype`.
     dtype: The type of elements for the returned `RaggedTensor`.  If not
       specified, then a default is chosen based on the scalar values in
       `pylist`.
@@ -64,6 +66,8 @@
       is not specified.  If `ragged_rank` is specified, then a default is chosen
       based on the contents of `pylist`.
     name: A name prefix for the returned tensor (optional).
+    row_splits_dtype: data type for the constructed `RaggedTensor`'s row_splits.
+      One of `tf.int32` or `tf.int64`.
 
   Returns:
     A potentially ragged tensor with rank `K` and the specified `ragged_rank`,
@@ -73,14 +77,19 @@
     ValueError: If the scalar values in `pylist` have inconsistent nesting
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
+  def ragged_factory(values, row_splits):
+    row_splits = constant_op.constant(row_splits, dtype=row_splits_dtype)
+    return ragged_tensor.RaggedTensor.from_row_splits(values, row_splits,
+                                                      validate=False)
+
   with ops.name_scope(name, "RaggedConstant"):
-    return _constant_value(ragged_tensor.RaggedTensor.from_row_splits,
-                           constant_op.constant, pylist, dtype, ragged_rank,
-                           inner_shape)
+    return _constant_value(ragged_factory, constant_op.constant, pylist, dtype,
+                           ragged_rank, inner_shape)
 
 
 @tf_export(v1=["ragged.constant_value"])
-def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
+def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None,
+                   row_splits_dtype="int64"):
   """Constructs a RaggedTensorValue from a nested Python list.
 
   Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
@@ -100,8 +109,8 @@
   in `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
 
   Args:
-    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
-      or `tuple` must be a scalar value compatible with `dtype`.
+    pylist: A nested `list`, `tuple` or `np.ndarray`.  Any other nested element
+      must be a scalar value compatible with `dtype`.
     dtype: `numpy.dtype`.  The type of elements for the returned `RaggedTensor`.
       If not specified, then a default is chosen based on the scalar values in
       `pylist`.
@@ -113,18 +122,20 @@
       values in the returned `RaggedTensorValue`.  Defaults to `()` if
       `ragged_rank` is not specified.  If `ragged_rank` is specified, then a
       default is chosen based on the contents of `pylist`.
+    row_splits_dtype: data type for the constructed `RaggedTensorValue`'s
+      row_splits.  One of `numpy.int32` or `numpy.int64`.
 
   Returns:
-    A `RaggedTensorValue` or `numpy.array` with rank `K` and the specified
+    A `tf.RaggedTensorValue` or `numpy.array` with rank `K` and the specified
     `ragged_rank`, containing the values from `pylist`.
 
   Raises:
     ValueError: If the scalar values in `pylist` have inconsistent nesting
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
-
+  row_splits_dtype = dtypes.as_dtype(row_splits_dtype).as_numpy_dtype
   def _ragged_factory(values, row_splits):
-    row_splits = np.array(row_splits, dtype=np.int64)
+    row_splits = np.array(row_splits, dtype=row_splits_dtype)
     return ragged_tensor_value.RaggedTensorValue(values, row_splits)
 
   def _inner_factory(pylist, dtype, shape, name=None):  # pylint: disable=unused-argument
@@ -143,7 +154,7 @@
       `ragged_factory(values, row_splits)`
     inner_factory: A factory function with the signature: `inner_factory(pylist,
       dtype, shape, name)`
-    pylist: A nested `list` or `tuple`.
+    pylist: A nested `list`, `tuple` or `np.ndarray`.
     dtype: Data type for returned value.
     ragged_rank: Ragged rank for returned value.
     inner_shape: Inner value shape for returned value.
@@ -157,8 +168,8 @@
   """
   if ragged_tensor.is_ragged(pylist):
     raise TypeError("pylist may not be a RaggedTensor or RaggedTensorValue.")
-
-  if not isinstance(pylist, (list, tuple)):
+  # np.ndim builds an array, so we short-circuit lists and tuples.
+  if not isinstance(pylist, (list, tuple)) and np.ndim(pylist) == 0:
     # Scalar value
     if ragged_rank is not None and ragged_rank != 0:
       raise ValueError("Invalid pylist=%r: incompatible with ragged_rank=%d" %
@@ -245,7 +256,9 @@
   Raises:
     ValueError: If pylist has inconsistent nesting depths for scalars.
   """
-  if isinstance(pylist, (list, tuple)):
+  # Check if pylist is not scalar. np.ndim builds an array, so we
+  # short-circuit lists and tuples.
+  if isinstance(pylist, (list, tuple)) or np.ndim(pylist) != 0:
     scalar_depth = None
     max_depth = 1
     for child in pylist:
@@ -256,8 +269,7 @@
         scalar_depth = child_scalar_depth + 1
       max_depth = max(max_depth, child_max_depth + 1)
     return (scalar_depth, max_depth)
-  else:
-    return (0, 0)
+  return (0, 0)
 
 
 def _default_inner_shape_for_pylist(pylist, ragged_rank):
@@ -265,16 +277,15 @@
 
   def get_inner_shape(item):
     """Returns the inner shape for a python list `item`."""
-    if not isinstance(item, (list, tuple)):
+    if not isinstance(item, (list, tuple)) and np.ndim(item) == 0:
       return ()
     elif item:
       return (len(item),) + get_inner_shape(item[0])
-    else:
-      return (0,)
+    return (0,)
 
   def check_inner_shape(item, shape):
     """Checks that `item` has a consistent shape matching `shape`."""
-    is_nested = isinstance(item, (list, tuple))
+    is_nested = isinstance(item, (list, tuple)) or np.ndim(item) != 0
     if is_nested != bool(shape):
       raise ValueError("inner values have inconsistent shape")
     if is_nested:
@@ -286,7 +297,8 @@
   # Collapse the ragged layers to get the list of inner values.
   flat_values = pylist
   for dim in range(ragged_rank):
-    if not all(isinstance(v, (list, tuple)) for v in flat_values):
+    if not all(
+        isinstance(v, (list, tuple)) or np.ndim(v) != 0 for v in flat_values):
       raise ValueError("pylist has scalar values depth %d, but ragged_rank=%d "
                        "requires scalar value depth greater than %d" %
                        (dim + 1, ragged_rank, ragged_rank))
@@ -333,4 +345,3 @@
                                          "row_splits_%d" % i)
       result = ragged_tensor.RaggedTensor(result, row_splits, internal=True)
     return result
-
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
index b6937a1..c63f11e 100644
--- a/tensorflow/python/ops/ragged/ragged_functional_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -18,7 +18,10 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_config
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.util.tf_export import tf_export
@@ -72,12 +75,23 @@
   if not nested_splits_lists:
     return op(*args, **kwargs)
 
+  split_dtypes = set(splits[0].dtype for splits in nested_splits_lists)
+  if len(split_dtypes) > 1:
+    if not ragged_config.auto_cast_partition_dtype():
+      raise ValueError("Input RaggedTensors have mismatched row_splits dtypes; "
+                       "use RaggedTensor.with_row_splits_dtype() to convert "
+                       "them to compatible dtypes.")
+
+    nested_splits_lists = [
+        [math_ops.cast(s, dtypes.int64) for s in nested_splits]  # pylint: disable=g-complex-comprehension
+        for nested_splits in nested_splits_lists]
+
   with ops.control_dependencies(
       ragged_util.assert_splits_match(nested_splits_lists)):
     # Delegate to op, and then compose the result from the transformed values
     # and the splits.
     return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        op(*inner_args, **inner_kwargs), nested_splits_lists[0])
+        op(*inner_args, **inner_kwargs), nested_splits_lists[0], validate=False)
 
 
 def _replace_ragged_with_flat_values(value, nested_splits_lists):
diff --git a/tensorflow/python/ops/ragged/ragged_gather_ops.py b/tensorflow/python/ops/ragged/ragged_gather_ops.py
index ff04997..ba3beef 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_ops.py
@@ -25,7 +25,6 @@
 from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
 
@@ -96,6 +95,7 @@
         params, name='params')
     indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
+    params, indices = ragged_tensor.match_row_splits_dtypes(params, indices)
 
     if ragged_tensor.is_ragged(indices):
       return indices.with_values(gather(params, indices.values))
@@ -116,7 +116,7 @@
 
     # Compose the RaggedTensor from splits & values.
     return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        result.output_dense_values, result.output_nested_splits)
+        result.output_dense_values, result.output_nested_splits, validate=False)
 
 
 #===============================================================================
@@ -147,7 +147,7 @@
 
   #### Examples:
     ```python
-    >>> params = tf.ragged.constant_value(
+    >>> params = tf.compat.v1.ragged.constant_value(
     ...     [ [ ['000', '001'], ['010'              ]          ],
     ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
     ...       [ [            ], ['210'              ]          ] ])
@@ -177,6 +177,7 @@
         params, name='params')
     indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
+    params, indices = ragged_tensor.match_row_splits_dtypes(params, indices)
     indices_shape = indices.shape
     indices_ndims = indices_shape.ndims
     if indices_ndims is None:
@@ -199,12 +200,13 @@
     if indices_ndims > 2:
       indices_is_dense = not ragged_tensor.is_ragged(indices)
       if indices_is_dense:
-        indices = ragged_conversion_ops.from_tensor(
-            indices, ragged_rank=indices_ndims - 2)
+        indices = ragged_tensor.RaggedTensor.from_tensor(
+            indices, ragged_rank=indices_ndims - 2,
+            row_splits_dtype=params.row_splits.dtype)
       result = indices.with_flat_values(gather_nd(params, indices.flat_values))
       if (indices_is_dense and ragged_tensor.is_ragged(result) and
           result.ragged_rank == indices_ndims - 2):
-        result = ragged_conversion_ops.to_tensor(result)
+        result = ragged_tensor.RaggedTensor.to_tensor(result)
       return result
 
     # indices_ndims <= 2, and the innermost dimension of indices may not be
@@ -235,7 +237,7 @@
     # index tuples point to the correct values in the flattened params; and
     # then use ragged.gather on the flattened index tuples & params.
     else:
-      indices = math_ops.cast(indices, dtypes.int64)
+      indices = math_ops.cast(indices, params.row_splits.dtype)
 
       # Flatten the outermost 2 dimensions of the index tuples & params.
       flattened_index_tuples = array_ops.gather(params.row_splits,
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index d01cf67..27d0dae 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -19,7 +19,6 @@
 from __future__ import print_function
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -136,9 +135,10 @@
   # that puts all values in a single row.
   if row_key is array_ops.newaxis:
     inner_rt = _ragged_getitem(rt_input, inner_keys)
-    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    nsplits = array_ops.shape(inner_rt.row_splits,
+                              out_type=inner_rt.row_splits.dtype)[0]
     return ragged_tensor.RaggedTensor.from_row_splits(
-        inner_rt, array_ops.stack([0, nsplits - 1]))
+        inner_rt, array_ops.stack([0, nsplits - 1]), validate=False)
 
   # Slicing a range of rows: first slice the outer dimension, and then
   # call `_ragged_getitem_inner_dimensions` to handle the inner keys.
@@ -192,7 +192,7 @@
   # Use row_key to slice the starts & limits.
   new_starts = rt_input.row_splits[:-1][row_key]
   new_limits = rt_input.row_splits[1:][row_key]
-  zero_pad = array_ops.zeros([1], dtypes.int64)
+  zero_pad = array_ops.zeros([1], rt_input.row_splits.dtype)
 
   # If there's no slice step, then we can just select a single continuous
   # span of `ragged.values(rt_input)`.
@@ -206,7 +206,8 @@
     values_start = new_splits[0]
     values_limit = new_splits[-1]
     return ragged_tensor.RaggedTensor.from_row_splits(
-        rt_input.values[values_start:values_limit], new_splits - values_start)
+        rt_input.values[values_start:values_limit], new_splits - values_start,
+        validate=False)
 
   # If there is a slice step (aka a strided slice), then use ragged_gather to
   # collect the necessary elements of `ragged.values(rt_input)`.
@@ -245,9 +246,11 @@
   # RaggedTensor that puts each value in its own row.
   if column_key is array_ops.newaxis:
     inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
-    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    nsplits = array_ops.shape(inner_rt.row_splits,
+                              out_type=inner_rt.row_splits.dtype)[0]
     return ragged_tensor.RaggedTensor.from_row_splits(inner_rt,
-                                                      math_ops.range(nsplits))
+                                                      math_ops.range(nsplits),
+                                                      validate=False)
 
   # Slicing a range of columns in a ragged inner dimension.  We use a
   # recursive call to process the values, and then assemble a RaggedTensor
@@ -359,10 +362,11 @@
     step = 1
   step = ops.convert_to_tensor(step, name="step")
   if step.dtype.is_integer:
-    step = math_ops.cast(step, dtypes.int64)
+    step = math_ops.cast(step, starts.dtype)
   else:
     raise TypeError("slice strides must be integers or None")
-  value_indices = ragged_math_ops.range(starts, limits, step)
+  value_indices = ragged_math_ops.range(starts, limits, step,
+                                        row_splits_dtype=starts.dtype)
 
   # Use `ragged_gather` or `array_ops.gather` to collect the values.
   if isinstance(values, ragged_tensor.RaggedTensor):
@@ -384,11 +388,11 @@
 
   Args:
     offset: The offset to add.  None, or an int, or a scalar Tensor.
-    starts: 1-D int64 tensor containing start indices.
-    limits: 1-D int64 tensor containing limit indices.
+    starts: 1-D integer tensor containing start indices.
+    limits: 1-D integer tensor containing limit indices.
 
   Returns:
-    A 1-D int64 tensor.
+    A 1-D integer tensor.
   """
 
   def map_positive_offset(offset):
@@ -398,7 +402,7 @@
     return math_ops.maximum(limits + offset, starts)
 
   if isinstance(offset, ops.Tensor):
-    offset = math_ops.cast(offset, dtypes.int64)
+    offset = math_ops.cast(offset, starts.dtype)
     return control_flow_ops.cond(offset >= 0,
                                  lambda: map_positive_offset(offset),
                                  lambda: map_negative_offset(offset))
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
index 1520640..77e41c0 100644
--- a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -222,7 +222,7 @@
   def testZip(self):
     x = ragged_factory_ops.constant(
         [[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]], dtypes.int64)
-    y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)
+    y = array_ops.expand_dims(mo.range(x.nrows(out_type=dtypes.int64)), axis=1)
 
     def _zip(foo):
       y_val, x_val = foo
@@ -273,7 +273,7 @@
     elems = ragged_factory_ops.constant([[1, 2, 3], [4, 5], [6, 7]])
     fn = lambda x: ragged_tensor.RaggedTensor.from_row_starts(x, [0])
     with self.assertRaisesWithLiteralMatch(
-        ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
+        ValueError, r'The declared ragged rank (10) mismatches the result (2)'):
       _ = ragged_map_ops.map_fn(
           fn,
           elems,
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index 1d34251..e647d47 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -29,6 +29,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.ragged import ragged_config
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -196,6 +197,7 @@
     return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
 
   elems_flat = input_flatten(elems)
+  elems_flat = ragged_tensor.match_row_splits_dtypes(*elems_flat)
 
   with ops.name_scope(name, "map", elems_flat):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
@@ -397,8 +399,9 @@
   nested_row_lengths = tuple(t.nested_row_lengths)
   for nested_row_length in reversed(nested_row_lengths):
     values = ragged_tensor.RaggedTensor.from_row_lengths(
-        values, nested_row_length)
-  return ragged_tensor.RaggedTensor.from_row_lengths(values, t.outer_row_length)
+        values, nested_row_length, validate=False)
+  return ragged_tensor.RaggedTensor.from_row_lengths(values, t.outer_row_length,
+                                                     validate=False)
 
 
 def _maybe_decompose_dtype(d):
@@ -408,8 +411,9 @@
 
   result = _RaggedTensorComponents(
       flat_values=d.dtype,
-      nested_row_lengths=tuple(dtypes.int64 for i in range(d.ragged_rank - 1)),
-      outer_row_length=dtypes.int64,
+      nested_row_lengths=tuple(
+          d.row_splits_dtype for i in range(d.ragged_rank - 1)),
+      outer_row_length=d.row_splits_dtype,
   )
   return result
 
@@ -418,31 +422,42 @@
   """Convert outputs which are `Tensor`s into `_RaggedTensorComponents`."""
   for current, declared in zip(fn_output_flat, output_declared):
     if isinstance(declared, ragged_tensor.RaggedTensorType):
-      if isinstance(current, ragged_tensor.RaggedTensor):
-        # Check that the ragged ranks match up.
-        # + 1 to account for the rank of the outermost dimension.
-        if declared.ragged_rank != current.ragged_rank + 1:
-          raise ValueError(
-              "The declared ragged rank (%d) mismatches the result (%d)" %
-              (declared.ragged_rank, current.ragged_rank))
-        yield current
-      else:
-        # We the output is a Tensor, but the caller has declared that we are
-        # expecting an RaggedTensor output.
-        if declared.ragged_rank != 1:
-          raise ValueError(
-              "The declared ragged rank (%d) mismatches the result (1)" %
-              declared.ragged_rank)
-
-        if isinstance(current, ragged_tensor.RaggedTensor):
-          nrows = current.nrows()
-        else:
-          nrows = array_ops.shape(current, out_type=dtypes.int64)[0]
-        row_length = array_ops.expand_dims(nrows, axis=0)
-        rt = _RaggedTensorComponents(
-            flat_values=current,
-            nested_row_lengths=(),
-            outer_row_length=row_length)
-        yield rt
+      yield _convert_declared_ragged(current, declared)
     else:
       yield current
+
+
+def _convert_declared_ragged(current, declared):
+  """Checks `current` against `declared`; wraps a dense Tensor as components."""
+  # Check that the ragged ranks match up; a value with no `ragged_rank`
+  # attribute counts as 0.  + 1 accounts for the outermost dimension.
+  current_ragged_rank = getattr(current, "ragged_rank", 0)
+  if declared.ragged_rank != current_ragged_rank + 1:
+    raise ValueError(
+        "The declared ragged rank (%d) mismatches the result (%d)" %
+        (declared.ragged_rank, current_ragged_rank + 1))
+
+  # Check dtypes.  A row_splits dtype mismatch is cast when auto-cast is on.
+  if declared.dtype != current.dtype:
+    raise ValueError(
+        "The declared dtype (%s) mismatches the result (%s)" %
+        (declared.dtype, current.dtype))
+  if (isinstance(current, ragged_tensor.RaggedTensor) and
+      declared.row_splits_dtype != current.row_splits.dtype):
+    if not ragged_config.auto_cast_partition_dtype():
+      raise ValueError(
+          "The declared row_splits dtype (%s) mismatches the result (%s)."
+          "  Use RaggedTensor.with_row_splits_dtype to convert it."
+          % (declared.row_splits_dtype, current.row_splits.dtype))
+    current = current.with_row_splits_dtype(declared.row_splits_dtype)
+
+  if isinstance(current, ragged_tensor.RaggedTensor):
+    return current
+  else:  # Dense value: one row whose length is the size of its first axis.
+    nrows = array_ops.shape(current, out_type=declared.row_splits_dtype)[0]
+    row_length = array_ops.expand_dims(nrows, axis=0)
+    return _RaggedTensorComponents(
+        flat_values=current,
+        nested_row_lengths=(),
+        outer_row_length=row_length)
+
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
index 02e927b..bd84ccc 100644
--- a/tensorflow/python/ops/ragged/ragged_math_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -39,7 +39,8 @@
 #===============================================================================
 # pylint: disable=redefined-builtin
 @tf_export('ragged.range')
-def range(starts, limits=None, deltas=1, dtype=None, name=None):
+def range(starts, limits=None, deltas=1, dtype=None,
+          name=None, row_splits_dtype=dtypes.int64):
   """Returns a `RaggedTensor` containing the specified sequences of numbers.
 
   Each row of the returned `RaggedTensor` contains a single sequence:
@@ -81,10 +82,13 @@
     dtype: The type of the elements of the resulting tensor.  If not specified,
       then a value is chosen based on the other args.
     name: A name for the operation.
+    row_splits_dtype: `dtype` for the returned `RaggedTensor`'s `row_splits`
+      tensor.  One of `tf.int32` or `tf.int64`.
 
   Returns:
     A `RaggedTensor` of type `dtype` with `ragged_rank=1`.
   """
+  row_splits_dtype = dtypes.as_dtype(row_splits_dtype)
   if limits is None:
     starts, limits = 0, starts
 
@@ -99,9 +103,11 @@
           [starts, limits, deltas],
           [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
 
-    result = gen_ragged_math_ops.ragged_range(starts, limits, deltas, name=name)
+    result = gen_ragged_math_ops.ragged_range(
+        starts, limits, deltas, Tsplits=row_splits_dtype, name=name)
     return ragged_tensor.RaggedTensor.from_row_splits(result.rt_dense_values,
-                                                      result.rt_nested_splits)
+                                                      result.rt_nested_splits,
+                                                      validate=False)
 
 
 def _infer_matching_dtype(tensors, dtype_hierarchy):
@@ -190,6 +196,9 @@
     data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
     segment_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         segment_ids, name='segment_ids')
+    data, segment_ids = ragged_tensor.match_row_splits_dtypes(data, segment_ids)
+    if segment_ids.dtype not in (dtypes.int32, dtypes.int64):
+      raise ValueError('segment_ids must have dtype int32 or int64.')
 
     if ragged_tensor.is_ragged(segment_ids):
       if not ragged_tensor.is_ragged(data):
@@ -203,22 +212,19 @@
         return _ragged_segment_aggregate(unsorted_segment_op, data.values,
                                          segment_ids.values, num_segments, name)
 
-    segment_ids = math_ops.cast(segment_ids, dtypes.int64)
-
-    # Find the length of each row in data.  (dtype=int64, shape=[data_nrows])
+    # Find the length of each row in data.  (shape=[data_nrows])
     data_row_lengths = data.row_splits[1:] - data.row_splits[:-1]
 
     # Find the length that each output row will have.  The length of the row
     # corresponding to segment `id` is `max(data_row_lengths[i])` where
-    # `segment_ids[i]=id`.  (dtype=int64, shape=[output_nrows])
+    # `segment_ids[i]=id`.  (shape=[output_nrows])
     output_row_lengths = math_ops.maximum(
         math_ops.unsorted_segment_max(data_row_lengths, segment_ids,
                                       num_segments), 0)
-    assert output_row_lengths.dtype == dtypes.int64
 
     # Build the splits tensor for the output RaggedTensor.
     output_splits = array_ops.concat([
-        array_ops.zeros([1], dtypes.int64),
+        array_ops.zeros([1], output_row_lengths.dtype),
         math_ops.cumsum(output_row_lengths)
     ],
                                      axis=0)
@@ -238,8 +244,8 @@
     output_values = _ragged_segment_aggregate(unsorted_segment_op, data.values,
                                               data_val_to_out_val_index,
                                               output_splits[-1])
-    return ragged_tensor.RaggedTensor.from_row_splits(output_values,
-                                                      output_splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(
+        output_values, output_splits, validate=False)
 
 
 def segment_sum(data, segment_ids, num_segments, name=None):
@@ -276,7 +282,8 @@
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
     ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
-        array_ops.ones_like(data.flat_values), data.nested_row_splits)
+        array_ops.ones_like(data.flat_values), data.nested_row_splits,
+        validate=False)
     count = segment_sum(ones, segment_ids, num_segments)
     if ragged_tensor.is_ragged(total):
       return total.with_flat_values(total.flat_values / count.flat_values)
@@ -290,7 +297,8 @@
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
     ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
-        array_ops.ones_like(data.flat_values), data.nested_row_splits)
+        array_ops.ones_like(data.flat_values), data.nested_row_splits,
+        validate=False)
     count = segment_sum(ones, segment_ids, num_segments)
     if ragged_tensor.is_ragged(total):
       return total.with_flat_values(
@@ -461,6 +469,13 @@
       elif len(axis) == 1:
         axis = axis[0]
       else:
+        # When reducing multiple axes, as we reduce one at a time (see below),
+        # the negative axis has to be converted to positive at the first run
+        # as the sort with negative axis will have different orders.
+        # See GitHub issue 27497.
+        axis = [
+            ragged_util.get_positive_axis(a, rt_input.shape.ndims) for a in axis
+        ]
         # When reducing multiple axes, just reduce one at a time.  This is less
         # efficient, and only works for associative ops.  (In particular, it
         # does not work for reduce_mean.)  However, reducing multiple axes at
@@ -533,13 +548,14 @@
     if ragged_tensor.is_ragged(input_tensor):
       ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
           array_ops.ones_like(input_tensor.flat_values),
-          input_tensor.nested_row_splits)
+          input_tensor.nested_row_splits, validate=False)
     else:
       ones = array_ops.ones_like(input_tensor)
     count = reduce_sum(ones, axis, keepdims)
     if ragged_tensor.is_ragged(total):
       return ragged_tensor.RaggedTensor.from_nested_row_splits(
-          total.flat_values / count.flat_values, total.nested_row_splits)
+          total.flat_values / count.flat_values, total.nested_row_splits,
+          validate=False)
     else:
       return total / count
 
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
index a9fa378..cac2d2d 100644
--- a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -304,6 +304,18 @@
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=2,
           expected=[[mean(1, 2), mean(3, 4, 5)], [mean(6, 7), 8], [9]]),
+
+      # Test case for GitHub issue 27497, multiple negative axes.
+      dict(
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[-2, -1],
+          expected=[1 + 2 + 3 + 4 + 5, 6 + 7 + 8, 0, 9]),
+      dict(
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[-3, -2, -1],
+          expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
   )
   def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
     rt_input = ragged_factory_ops.constant(rt_input)
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
index 5384f3a..be331a0 100644
--- a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -43,8 +43,8 @@
     self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
                             segment_id_ops.row_splits_to_segment_ids, [])
     self.assertRaisesRegexp(
-        ValueError, r'Tensor conversion requested dtype int64 for '
-        'Tensor with dtype float32', segment_id_ops.row_splits_to_segment_ids,
+        ValueError, r'splits must have dtype int32 or int64',
+        segment_id_ops.row_splits_to_segment_ids,
         constant_op.constant([0.5]))
     self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
                             segment_id_ops.row_splits_to_segment_ids, 0)
diff --git a/tensorflow/python/ops/ragged/ragged_squeeze_op.py b/tensorflow/python/ops/ragged/ragged_squeeze_op.py
new file mode 100644
index 0000000..d3070b6
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_squeeze_op.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator Squeeze for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
+
+
+def squeeze(input, axis=None, name=None):  # pylint: disable=redefined-builtin
+  """Ragged compatible squeeze.
+
+  If `input` is a `tf.Tensor`, then this calls `tf.squeeze`.
+
+  If `input` is a `tf.RaggedTensor`, then this operation takes `O(N)` time,
+  where `N` is the number of elements in the squeezed dimensions.
+
+  Args:
+    input: A potentially ragged tensor. The input to squeeze.
+    axis: An optional list of ints. Defaults to `None`. If the `input` is
+      ragged, it only squeezes the dimensions listed. It fails if `input` is
+      ragged and axis is `None`. If `input` is not ragged it calls tf.squeeze.
+      Note that it is an error to squeeze a dimension that is not 1. It must
+      be in the range of [-rank(input), rank(input)).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor. Contains the same data as input,
+    but has one or more dimensions of size 1 removed.
+  """
+  with ops.name_scope(name, 'RaggedSqueeze', [input]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
+    if isinstance(input, ops.Tensor):
+      return array_ops.squeeze(input, axis, name)
+
+    if axis is None:
+      raise ValueError('Ragged.squeeze must have an axis argument.')
+    if isinstance(axis, int):
+      axis = [axis]
+    elif ((not isinstance(axis, (list, tuple))) or
+          (not all(isinstance(d, int) for d in axis))):
+      raise TypeError('Axis must be a list or tuple of integers.')
+
+    dense_dims = []
+    ragged_dims = []
+    # Normalize all the dims in axis to be positive, then split them into
+    # dims of `flat_values` (dense) and dims described by row_splits (ragged).
+    axis = [ragged_util.get_positive_axis(d, input.shape.ndims) for d in axis]
+    for dim in axis:
+      if dim > input.ragged_rank:
+        dense_dims.append(dim - input.ragged_rank)
+      else:
+        ragged_dims.append(dim)
+
+    # Make sure the specified ragged dimensions are squeezable.
+    assertion_list = []
+    scalar_tensor_one = constant_op.constant(1, dtype=input.row_splits.dtype)
+    for i, r in enumerate(input.nested_row_lengths()):
+      if i + 1 in ragged_dims:
+        assertion_list.append(
+            control_flow_ops.Assert(
+                math_ops.reduce_all(math_ops.equal(r, scalar_tensor_one)),
+                ['the given axis (axis = %d) is not squeezable!' % (i + 1)]))
+    if 0 in ragged_dims:
+      scalar_tensor_two = constant_op.constant(2, dtype=dtypes.int32)
+      assertion_list.append(
+          control_flow_ops.Assert(
+              math_ops.equal(
+                  array_ops.size(input.row_splits), scalar_tensor_two),
+              ['the given axis (axis = 0) is not squeezable!']))
+
+    # Tie the assertions to the flat values with control dependencies, so the
+    # squeezability checks run whenever the result is evaluated.
+    squeezed_rt = control_flow_ops.with_dependencies(assertion_list,
+                                                     input.flat_values)
+
+    if dense_dims:
+      # Gives error if the dense dimension is not squeezable.
+      squeezed_rt = array_ops.squeeze(squeezed_rt, dense_dims)
+
+    # Collect the row_splits of the ragged dimensions that are kept.
+    remaining_row_splits = []
+    for i, row_split in enumerate(input.nested_row_splits):
+      # Each row_splits tensor is for dimension (i + 1).
+      if (i + 1) not in ragged_dims:
+        remaining_row_splits.append(row_split)
+    # If dimension 0 is squeezed, drop the outermost remaining row_splits.
+    if remaining_row_splits and 0 in ragged_dims:
+      remaining_row_splits.pop(0)
+
+    squeezed_rt = RaggedTensor.from_nested_row_splits(squeezed_rt,
+                                                      remaining_row_splits)
+
+    # Corner case: when removing all the ragged dimensions and the output is
+    # a scalar tensor e.g. ragged.squeeze(ragged.constant([[[1]]])).
+    if set(range(0, input.ragged_rank + 1)).issubset(set(ragged_dims)):
+      squeezed_rt = array_ops.squeeze(squeezed_rt, [0], name)
+
+    return squeezed_rt
diff --git a/tensorflow/python/ops/ragged/ragged_squeeze_op_test.py b/tensorflow/python/ops/ragged/ragged_squeeze_op_test.py
new file mode 100644
index 0000000..35abb50
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_squeeze_op_test.py
@@ -0,0 +1,292 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.squeeze."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_squeeze_op
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSqueezeTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  @parameterized.parameters([
+      {
+          'input_list': []
+      },
+      {
+          'input_list': [[]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[[[], []], [[], []]]],
+          'squeeze_ranks': [0]
+      },
+  ])
+  def test_passing_empty(self, input_list, squeeze_ranks=None):
+    rt = ragged_squeeze_op.squeeze(
+        ragged_factory_ops.constant(input_list), squeeze_ranks)
+    dt = array_ops.squeeze(constant_op.constant(input_list), squeeze_ranks)
+    self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt), dt)
+
+  @parameterized.parameters([
+      {
+          'input_list': [[1]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[1]],
+          'squeeze_ranks': [0, 1]
+      },
+      {
+          'input_list': [[1, 2]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[1], [2]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [1, 3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 1, 3]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]],
+          'squeeze_ranks': [2]
+      },
+      {
+          'input_list': [[1], [2]],
+          'squeeze_ranks': [-1]
+      },
+  ])
+  def test_passing_simple(self, input_list, squeeze_ranks=None):
+    rt = ragged_squeeze_op.squeeze(
+        ragged_factory_ops.constant(input_list), squeeze_ranks)
+    dt = array_ops.squeeze(constant_op.constant(input_list), squeeze_ranks)
+    self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt), dt)
+
+  @parameterized.parameters([
+      # ragged_conversion_ops.from_tensor does not work for this
+      # {'input_list': [1]},
+      {
+          'input_list': [[1]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[1, 2]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[1], [2]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 1]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [1, 3]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 1, 3]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]],
+          'squeeze_ranks': [2]
+      },
+  ])
+  def test_passing_simple_from_dense(self, input_list, squeeze_ranks=None):
+    dt = constant_op.constant(input_list)
+    rt = ragged_conversion_ops.from_tensor(dt)
+    rt_s = ragged_squeeze_op.squeeze(rt, squeeze_ranks)
+    dt_s = array_ops.squeeze(dt, squeeze_ranks)
+    self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt_s), dt_s)
+
+  @parameterized.parameters([
+      {
+          'input_list': [[[[[[1]], [[1, 2]]]], [[[[]], [[]]]]]],
+          'output_list': [[[1], [1, 2]], [[], []]],
+          'squeeze_ranks': [0, 2, 4]
+      },
+      {
+          'input_list': [[[[[[1]], [[1, 2]]]], [[[[]], [[]]]]]],
+          'output_list': [[[[[1]], [[1, 2]]]], [[[[]], [[]]]]],
+          'squeeze_ranks': [0]
+      },
+  ])
+  def test_passing_ragged(self, input_list, output_list, squeeze_ranks=None):
+    rt = ragged_factory_ops.constant(input_list)
+    rt_s = ragged_squeeze_op.squeeze(rt, squeeze_ranks)
+    ref = ragged_factory_ops.constant(output_list)
+    self.assertRaggedEqual(rt_s, ref)
+
+  def test_passing_text(self):
+    rt = ragged_factory_ops.constant([[[[[[[['H']], [['e']], [['l']], [['l']],
+                                           [['o']]],
+                                          [[['W']], [['o']], [['r']], [['l']],
+                                           [['d']], [['!']]]]],
+                                        [[[[['T']], [['h']], [['i']], [['s']]],
+                                          [[['i']], [['s']]],
+                                          [[['M']], [['e']], [['h']], [['r']],
+                                           [['d']], [['a']], [['d']]],
+                                          [[['.']]]]]]]])
+    output_list = [[['H', 'e', 'l', 'l', 'o'], ['W', 'o', 'r', 'l', 'd', '!']],
+                   [['T', 'h', 'i', 's'], ['i', 's'],
+                    ['M', 'e', 'h', 'r', 'd', 'a', 'd'], ['.']]]
+    ref = ragged_factory_ops.constant(output_list)
+    rt_s = ragged_squeeze_op.squeeze(rt, [0, 1, 3, 6, 7])
+    self.assertRaggedEqual(rt_s, ref)
+
+  @parameterized.parameters([
+      {
+          'input_list': [[]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[1, 2]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[1], [2]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 2]
+      },
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [2]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]],
+          'squeeze_ranks': [0]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[], []],
+          'squeeze_ranks': [1]
+      },
+      {
+          'input_list': [[[], []], [[], []]],
+          'squeeze_ranks': [1]
+      },
+  ])
+  def test_failing_InvalidArgumentError(self, input_list, squeeze_ranks):
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(
+          ragged_squeeze_op.squeeze(
+              ragged_factory_ops.constant(input_list), squeeze_ranks))
+
+  @parameterized.parameters([
+      {
+          'input_list': [[]]
+      },
+      {
+          'input_list': [[1]]
+      },
+      {
+          'input_list': [[1, 2]]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]]
+      },
+      {
+          'input_list': [[1]]
+      },
+      {
+          'input_list': [[[1], [2]], [[3], [4]]]
+      },
+      {
+          'input_list': [[[[12], [11]]]]
+      },
+  ])
+  def test_failing_no_squeeze_dim_specified(self, input_list):
+    with self.assertRaises(ValueError):
+      ragged_squeeze_op.squeeze(ragged_factory_ops.constant(input_list))
+
+  @parameterized.parameters([
+      {
+          'input_list': [[[[12], [11]]]],
+          'squeeze_ranks': [0, 1, 3]
+      },
+  ])
+  def test_failing_axis_is_not_a_list(self, input_list, squeeze_ranks):
+    with self.assertRaises(TypeError):
+      tensor_ranks = constant_op.constant(squeeze_ranks)
+      ragged_squeeze_op.squeeze(
+          ragged_factory_ops.constant(input_list), tensor_ranks)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py
index 68d0dab..e37b345 100644
--- a/tensorflow/python/ops/ragged/ragged_string_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_string_ops.py
@@ -24,7 +24,6 @@
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
@@ -72,7 +71,8 @@
       indices, values, shape = gen_string_ops.string_split(
           input, delimiter="", skip_empty=False)
       return ragged_tensor.RaggedTensor.from_value_rowids(
-          values=values, value_rowids=indices[:, 0], nrows=shape[0])
+          values=values, value_rowids=indices[:, 0], nrows=shape[0],
+          validate=False)
     else:
       return string_bytes_split(ragged_tensor.RaggedTensor.from_tensor(input))
 
@@ -146,8 +146,8 @@
       if input_tensor.shape.ndims == 2:
         # The input tensor is of the correct 2-D shape, it's just not ragged.
         return unicode_encode(
-            ragged_conversion_ops.from_tensor(input_tensor), output_encoding,
-            errors, replacement_char)
+            ragged_tensor.RaggedTensor.from_tensor(input_tensor),
+            output_encoding, errors, replacement_char)
       elif input_tensor.shape.ndims > 2:
         # We need to initially flatten the input tensor to 2-D, and then can
         # reshape the output of our processed flattened tensor.
@@ -166,7 +166,8 @@
         ragged_input_tensor = ragged_tensor.RaggedTensor.from_row_splits(
             input_tensor,
             array_ops.stack(
-                [0, array_ops.shape(input_tensor, out_type=dtypes.int64)[0]]))
+                [0, array_ops.shape(input_tensor, out_type=dtypes.int32)[0]]),
+            validate=False)
         output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                        errors, replacement_char)
         return array_ops.reshape(output_tensor, [])
@@ -404,11 +405,11 @@
   if input_ndims > 1:
     # Convert to a ragged tensor with ragged_rank = input_ndims - 1.
     if not ragged_tensor.is_ragged(input):
-      input = ragged_conversion_ops.from_tensor(
+      input = ragged_tensor.RaggedTensor.from_tensor(
           input, ragged_rank=input_ndims - 1)
     elif input.ragged_rank < input_ndims - 1:
       input = input.with_flat_values(
-          ragged_conversion_ops.from_tensor(
+          ragged_tensor.RaggedTensor.from_tensor(
               input.flat_values,
               ragged_rank=input_ndims - input.ragged_rank + 1))
 
@@ -435,12 +436,13 @@
       offsets = flat_result.char_to_byte_starts
   else:
     codepoints = ragged_tensor.RaggedTensor.from_row_splits(
-        flat_result.char_values, flat_result.row_splits)
+        flat_result.char_values, flat_result.row_splits, validate=False)
     if input_ndims > 1:
       codepoints = input.with_flat_values(codepoints)
     if with_offsets:
       offsets = ragged_tensor.RaggedTensor.from_row_splits(
-          flat_result.char_to_byte_starts, flat_result.row_splits)
+          flat_result.char_to_byte_starts, flat_result.row_splits,
+          validate=False)
       if input_ndims > 1:
         offsets = input.with_flat_values(offsets)
 
@@ -461,6 +463,8 @@
   Example:
 
   ```python
+  >>> tf.strings.split('hello world')
+  <Tensor ['hello', 'world']>
   >>> tf.strings.split(['hello world', 'a b c'])
   <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
   ```
@@ -475,7 +479,8 @@
   Note that the above mentioned behavior matches python's str.split.
 
   Args:
-    input: `1-D` string `Tensor`, the strings to split.
+    input: A string `Tensor` of rank `N`, the strings to split.  If
+      `rank(input)` is not known statically, then it is assumed to be `1`.
     sep: `0-D` string `Tensor`, the delimiter string.
     maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
     name: A name for the operation (optional).
@@ -484,15 +489,30 @@
     ValueError: If sep is not a string.
 
   Returns:
-    A `RaggedTensor` of rank `2`: the strings split according to the delimiter.
+    A `RaggedTensor` of rank `N+1`, the strings split according to the
+    delimiter.
   """
   with ops.name_scope(name, "StringSplit", [input]):
-    sparse_result = string_ops.string_split_v2(input, sep=sep,
-                                               maxsplit=maxsplit)
-    return ragged_tensor.RaggedTensor.from_value_rowids(
-        values=sparse_result.values,
-        value_rowids=sparse_result.indices[:, 0],
-        nrows=sparse_result.dense_shape[0])
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, dtype=dtypes.string, name="input")
+    if isinstance(input, ragged_tensor.RaggedTensor):
+      return input.with_flat_values(
+          string_split_v2(input.flat_values, sep, maxsplit))
+
+    rank = input.shape.ndims
+    if rank == 0:
+      return string_split_v2(array_ops.stack([input]), sep, maxsplit)[0]
+    elif rank == 1 or rank is None:
+      sparse_result = string_ops.string_split_v2(
+          input, sep=sep, maxsplit=maxsplit)
+      return ragged_tensor.RaggedTensor.from_value_rowids(
+          values=sparse_result.values,
+          value_rowids=sparse_result.indices[:, 0],
+          nrows=sparse_result.dense_shape[0],
+          validate=False)
+    else:
+      return string_split_v2(
+          ragged_tensor.RaggedTensor.from_tensor(input), sep, maxsplit)
 
 
 @tf_export(v1=["string_split"])
@@ -552,7 +572,8 @@
       return ragged_tensor.RaggedTensor.from_value_rowids(
           values=sparse_result.values,
           value_rowids=sparse_result.indices[:, 0],
-          nrows=sparse_result.dense_shape[0])
+          nrows=sparse_result.dense_shape[0],
+          validate=False)
     else:
       raise ValueError("result_type must be 'RaggedTensor' or 'SparseTensor'.")
 
@@ -590,7 +611,8 @@
   Note that the above mentioned behavior matches python's str.split.
 
   Args:
-    input: `1-D` string `Tensor`, the strings to split.
+    input: A string `Tensor` of rank `N`, the strings to split.  If
+      `rank(input)` is not known statically, then it is assumed to be `1`.
     sep: `0-D` string `Tensor`, the delimiter character.
     maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
     result_type: The tensor type for the result: one of `"RaggedTensor"` or
@@ -602,21 +624,21 @@
     ValueError: If sep is not a string.
 
   Returns:
-    A `SparseTensor` of rank `2`, the strings split according to the delimiter.
-    The first column of the indices corresponds to the row in `source` and the
-    second column corresponds to the index of the split component in this row.
+    A `SparseTensor` or `RaggedTensor` of rank `N+1`, the strings split
+    according to the delimiter.
   """
-  source = deprecation.deprecated_argument_lookup(
+  input = deprecation.deprecated_argument_lookup(
       "input", input, "source", source)
-  with ops.name_scope(name, "StringSplit", [source]):
-    sparse_result = string_ops.string_split_v2(
-        source, sep=sep, maxsplit=maxsplit)
+  with ops.name_scope(name, "StringSplit", [input]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, dtype=dtypes.string, name="input")
+    if result_type == "SparseTensor" and input.shape.rank == 1:
+      return string_ops.string_split_v2(input, sep=sep, maxsplit=maxsplit)
+
+    ragged_result = string_split_v2(input, sep=sep, maxsplit=maxsplit)
     if result_type == "SparseTensor":
-      return sparse_result
+      return ragged_result.to_sparse()
     elif result_type == "RaggedTensor":
-      return ragged_tensor.RaggedTensor.from_value_rowids(
-          values=sparse_result.values,
-          value_rowids=sparse_result.indices[:, 0],
-          nrows=sparse_result.dense_shape[0])
+      return ragged_result
     else:
       raise ValueError("result_type must be 'RaggedTensor' or 'SparseTensor'.")
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index 27438ff..e6cea0e 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -25,11 +25,14 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_ragged_conversion_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_config
 from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
@@ -37,7 +40,6 @@
 
 # pylint: disable=protected-access
 _eval_using_default_session = ops._eval_using_default_session
-
 # pylint: enable=protected-access
 
 #===============================================================================
@@ -115,8 +117,8 @@
       `[nvals]`, corresponding one-to-one with `values`, which specifies
       each value's row index.  In particular, the row `rt[row]` consists of the
       values `rt.values[j]` where `value_rowids[j]==row`.  `nrows` is an
-      int64 scalar that specifies the number of rows in the `RaggedTensor`.
-      (`nrows` is used to indicate trailing empty rows.)
+      integer scalar that specifies the number of rows in the
+      `RaggedTensor`. (`nrows` is used to indicate trailing empty rows.)
 
     * `row_starts`: a vector with shape `[nrows]`, which specifies the start
       offset of each row.  Equivalent to `row_splits[:-1]`.
@@ -220,10 +222,10 @@
 
     Args:
       values: A potentially ragged tensor of any dtype and shape `[nvals, ...]`.
-      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.
-      cached_row_lengths: A 1-D int64 tensor with shape `[nrows]`
-      cached_value_rowids: A 1-D int64 tensor with shape `[nvals]`.
-      cached_nrows: A 1-D int64 scalar tensor.
+      row_splits: A 1-D integer tensor with shape `[nrows+1]`.
+      cached_row_lengths: A 1-D integer tensor with shape `[nrows]`.
+      cached_value_rowids: A 1-D integer tensor with shape `[nvals]`.
+      cached_nrows: A scalar integer tensor.
       internal: True if the constructor is being called by one of the factory
         methods.  If false, an exception will be raised.
 
@@ -239,14 +241,30 @@
                        "of the factory methods instead (e.g., "
                        "RaggedTensor.from_row_lengths())")
 
-    # Validate the arguments.
-    if not isinstance(values, (RaggedTensor, ops.Tensor)):
-      raise TypeError("values must be a Tensor or RaggedTensor.")
-    if not isinstance(row_splits, ops.Tensor):
-      raise TypeError("Row-partitioning argument must be a Tensor.")
-    values.shape.with_rank_at_least(1)
+    is_tensor_spec = isinstance(row_splits, tensor_spec.TensorSpec)
+    if is_tensor_spec:
+      if not (isinstance(values, tensor_spec.TensorSpec) or
+              (isinstance(values, RaggedTensor) and
+               isinstance(values.row_splits, tensor_spec.TensorSpec))):
+        raise TypeError("Expected values to be a TensorSpec, got %r" % values)
+    else:
+      # Validate the arguments.
+      if not isinstance(row_splits, ops.Tensor):
+        raise TypeError("Row-partitioning argument must be a Tensor, got %r" %
+                        row_splits)
+      if not isinstance(values, (RaggedTensor, ops.Tensor)):
+        raise TypeError("values must be a Tensor or RaggedTensor, got %r" %
+                        values)
+      if row_splits.dtype not in (dtypes.int32, dtypes.int64):
+        raise ValueError("Row-partitioning argument must be int32 or int64")
+
+    # Validate shapes & dtypes.
     row_splits.shape.assert_has_rank(1)
-    row_splits.set_shape([None])
+    values.shape.with_rank_at_least(1)
+    if not is_tensor_spec:
+      row_splits.set_shape([None])
+    if isinstance(values, RaggedTensor):
+      assert row_splits.dtype == values.row_splits.dtype
 
     self._values = values
     self._row_splits = row_splits
@@ -255,8 +273,11 @@
     # round-trip conversions when a RaggedTensor is constructed from
     # lengths or rowids, and we later want those lengths/rowids back.
     for tensor in [cached_row_lengths, cached_value_rowids, cached_nrows]:
-      if tensor is not None and not isinstance(tensor, ops.Tensor):
-        raise TypeError("Cached value must be a Tensor or None.")
+      if tensor is not None:
+        if not isinstance(tensor, ops.Tensor):
+          raise TypeError("Cached value must be a Tensor or None.")
+        elif tensor.dtype not in (dtypes.int32, dtypes.int64):
+          raise TypeError("Cached value must be int32 or int64.")
     self._cached_row_lengths = cached_row_lengths
     self._cached_value_rowids = cached_value_rowids
     self._cached_nrows = cached_nrows
@@ -266,7 +287,12 @@
   #=============================================================================
 
   @classmethod
-  def from_value_rowids(cls, values, value_rowids, nrows=None, name=None):
+  def from_value_rowids(cls,
+                        values,
+                        value_rowids,
+                        nrows=None,
+                        name=None,
+                        validate=True):
     """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
 
     The returned `RaggedTensor` corresponds with the python list defined by:
@@ -276,19 +302,18 @@
               for row in range(nrows)]
     ```
 
-    Warning: currently, this needs to cast value_rowids to int64 before
-    converting, since `tf.bincount` only supports `int32`.
-
     Args:
       values: A potentially ragged tensor with shape `[nvals, ...]`.
-      value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
+      value_rowids: A 1-D integer tensor with shape `[nvals]`, which corresponds
         one-to-one with `values`, and specifies each value's row index.  Must be
         nonnegative, and must be sorted in ascending order.
-      nrows: An int64 scalar specifying the number of rows.  This should be
+      nrows: An integer scalar specifying the number of rows.  This should be
         specified if the `RaggedTensor` may containing empty training rows. Must
         be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
         Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor`.  `result.rank = values.rank + 1`.
@@ -306,11 +331,12 @@
       <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
       ```
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     with ops.name_scope(name, "RaggedFromValueRowIds",
                         [values, value_rowids, nrows]):
-      values = convert_to_tensor_or_ragged_tensor(values, name="values")
-      value_rowids = ops.convert_to_tensor(
-          value_rowids, dtypes.int64, name="value_rowids")
+      values, value_rowids = cls._convert_values_and_row_partition(
+          values, value_rowids, "value_rowids")
       if nrows is None:
         const_rowids = tensor_util.constant_value(value_rowids)
         if const_rowids is None:
@@ -318,9 +344,10 @@
           const_nrows = None
         else:
           const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
-          nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name="nrows")
+          nrows = ops.convert_to_tensor(const_nrows, value_rowids.dtype,
+                                        name="nrows")
       else:
-        nrows = ops.convert_to_tensor(nrows, dtypes.int64, "nrows")
+        nrows = ops.convert_to_tensor(nrows, value_rowids.dtype, "nrows")
         const_nrows = tensor_util.constant_value(nrows)
         if const_nrows is not None:
           if const_nrows < 0:
@@ -336,18 +363,34 @@
       nrows.shape.assert_has_rank(0)
       values.shape[:1].assert_is_compatible_with(value_rowids.shape)
 
+      if validate:
+        msg = "Arguments to from_value_rowids do not form a valid RaggedTensor"
+        nvals1 = _nrows(values)
+        nvals2 = _nrows(value_rowids)
+        checks = [
+            check_ops.assert_rank(value_rowids, 1, message=msg),
+            check_ops.assert_rank(nrows, 0, message=msg),
+            check_ops.assert_equal(nvals1, nvals2, message=msg),
+            check_ops.assert_non_negative(value_rowids[:1], message=msg),
+            _assert_monotonic_increasing(value_rowids, message=msg),
+            check_ops.assert_less(value_rowids[-1:], nrows, message=msg),
+        ]
+        if not isinstance(values, RaggedTensor):
+          checks.append(check_ops.assert_rank_at_least(values, 1))
+        value_rowids = control_flow_ops.with_dependencies(checks, value_rowids)
+
       # Convert value_rowids & nrows to row_splits.
       # Note: we don't use segment_ids_to_row_splits() here because we want
       # to save the intermediate value `row_lengths`, so we can cache it.
       # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
-      # cast (Remove the warning in the docstring when we do.)
+      # cast.
       value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
       nrows_int32 = math_ops.cast(nrows, dtypes.int32)
       row_lengths = math_ops.bincount(
           value_rowids_int32,
           minlength=nrows_int32,
           maxlength=nrows_int32,
-          dtype=dtypes.int64)
+          dtype=value_rowids.dtype)
       row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
       if const_nrows is not None:
         row_lengths.set_shape([const_nrows])
@@ -362,7 +405,7 @@
           internal=True)
 
   @classmethod
-  def from_row_splits(cls, values, row_splits, name=None):
+  def from_row_splits(cls, values, row_splits, name=None, validate=True):
     """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
 
     The returned `RaggedTensor` corresponds with the python list defined by:
@@ -374,10 +417,12 @@
 
     Args:
       values: A potentially ragged tensor with shape `[nvals, ...]`.
-      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
-        and must be sorted in ascending order.  `row_splits[0]` must be zero and
-        `row_splits[-1]` must be `nvals`.
+      row_splits: A 1-D integer tensor with shape `[nrows+1]`.  Must not be
+        empty, and must be sorted in ascending order.  `row_splits[0]` must be
+        zero and `row_splits[-1]` must be `nvals`.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor`.  `result.rank = values.rank + 1`.
@@ -394,16 +439,35 @@
       <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
       ```
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     if isinstance(row_splits, (list, tuple)) and not row_splits:
       raise ValueError("row_splits tensor may not be empty.")
+    if isinstance(row_splits, tensor_spec.TensorSpec):
+      return cls(values=values, row_splits=row_splits, internal=True)
+
     with ops.name_scope(name, "RaggedFromRowSplits", [values, row_splits]):
-      values = convert_to_tensor_or_ragged_tensor(values, name="values")
-      row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, "row_splits")
+      values, row_splits = cls._convert_values_and_row_partition(
+          values, row_splits, "row_splits")
       row_splits.shape.assert_has_rank(1)
+
+      if validate:
+        msg = "Arguments to from_row_splits do not form a valid RaggedTensor"
+        nvals = _nrows(values, row_splits.dtype)
+        checks = [
+            check_ops.assert_rank(row_splits, 1, message=msg),
+            _assert_zero(row_splits[0], message=msg),
+            _assert_monotonic_increasing(row_splits, message=msg),
+            check_ops.assert_equal(row_splits[-1], nvals, message=msg),
+        ]
+        if not isinstance(values, RaggedTensor):
+          checks.append(check_ops.assert_rank_at_least(values, 1))
+        row_splits = control_flow_ops.with_dependencies(checks, row_splits)
+
       return cls(values=values, row_splits=row_splits, internal=True)
 
   @classmethod
-  def from_row_lengths(cls, values, row_lengths, name=None):
+  def from_row_lengths(cls, values, row_lengths, name=None, validate=True):
     """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
 
     The returned `RaggedTensor` corresponds with the python list defined by:
@@ -415,9 +479,11 @@
 
     Args:
       values: A potentially ragged tensor with shape `[nvals, ...]`.
-      row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be
+      row_lengths: A 1-D integer tensor with shape `[nrows]`.  Must be
         nonnegative.  `sum(row_lengths)` must be `nvals`.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor`.  `result.rank = values.rank + 1`.
@@ -431,11 +497,26 @@
       <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []])>
       ```
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     with ops.name_scope(name, "RaggedFromRowLengths", [values, row_lengths]):
-      values = convert_to_tensor_or_ragged_tensor(values, name="values")
-      row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
-                                          "row_lengths")
+      values, row_lengths = cls._convert_values_and_row_partition(
+          values, row_lengths, "row_lengths")
       row_lengths.shape.assert_has_rank(1)
+
+      if validate:
+        msg = "Arguments to from_row_lengths do not form a valid RaggedTensor"
+        nvals1 = math_ops.reduce_sum(row_lengths)
+        nvals2 = _nrows(values, row_lengths.dtype)
+        checks = [
+            check_ops.assert_rank(row_lengths, 1, message=msg),
+            check_ops.assert_non_negative(row_lengths, message=msg),
+            check_ops.assert_equal(nvals1, nvals2, message=msg)
+        ]
+        if not isinstance(values, RaggedTensor):
+          checks.append(check_ops.assert_rank_at_least(values, 1))
+        row_lengths = control_flow_ops.with_dependencies(checks, row_lengths)
+
       row_limits = math_ops.cumsum(row_lengths)
       row_splits = array_ops.concat([[0], row_limits], axis=0)
       return cls(
@@ -445,17 +526,19 @@
           internal=True)
 
   @classmethod
-  def from_row_starts(cls, values, row_starts, name=None):
+  def from_row_starts(cls, values, row_starts, name=None, validate=True):
     """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
 
     Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
 
     Args:
       values: A potentially ragged tensor with shape `[nvals, ...]`.
-      row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
-        and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must
-        be zero.
+      row_starts: A 1-D integer tensor with shape `[nrows]`.  Must be
+        nonnegative and sorted in ascending order.  If `nrows>0`, then
+        `row_starts[0]` must be zero.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor`.  `result.rank = values.rank + 1`.
@@ -469,25 +552,42 @@
       <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
       ```
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     with ops.name_scope(name, "RaggedFromRowStarts", [values, row_starts]):
-      values = convert_to_tensor_or_ragged_tensor(values, name="values")
-      row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, "row_starts")
+      values, row_starts = cls._convert_values_and_row_partition(
+          values, row_starts, "row_starts")
       row_starts.shape.assert_has_rank(1)
-      nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
-      row_splits = array_ops.concat([row_starts, nvals], axis=0)
+      nvals = _nrows(values, row_starts.dtype)
+
+      if validate:
+        msg = "Arguments to from_row_starts do not form a valid RaggedTensor"
+        checks = [
+            check_ops.assert_rank(row_starts, 1, message=msg),
+            _assert_zero(row_starts[:1], message=msg),
+            _assert_monotonic_increasing(row_starts, message=msg),
+            check_ops.assert_less_equal(row_starts[-1:], nvals, message=msg),
+        ]
+        if not isinstance(values, RaggedTensor):
+          checks.append(check_ops.assert_rank_at_least(values, 1))
+        row_starts = control_flow_ops.with_dependencies(checks, row_starts)
+
+      row_splits = array_ops.concat([row_starts, [nvals]], axis=0)
       return cls(values=values, row_splits=row_splits, internal=True)
 
   @classmethod
-  def from_row_limits(cls, values, row_limits, name=None):
+  def from_row_limits(cls, values, row_limits, name=None, validate=True):
     """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
 
     Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
 
     Args:
       values: A potentially ragged tensor with shape `[nvals, ...]`.
-      row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
+      row_limits: A 1-D integer tensor with shape `[nrows]`.  Must be sorted in
         ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor`.  `result.rank = values.rank + 1`.
@@ -501,11 +601,27 @@
       <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
       ```
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     with ops.name_scope(name, "RaggedFromRowLimits", [values, row_limits]):
-      values = convert_to_tensor_or_ragged_tensor(values, name="values")
-      row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, "row_limits")
+      values, row_limits = cls._convert_values_and_row_partition(
+          values, row_limits, "row_limits")
       row_limits.shape.assert_has_rank(1)
-      zero = array_ops.zeros([1], dtypes.int64)
+
+      if validate:
+        msg = "Arguments to from_row_limits do not form a valid RaggedTensor"
+        nvals = _nrows(values, row_limits.dtype)
+        checks = [
+            check_ops.assert_rank(row_limits, 1, message=msg),
+            check_ops.assert_non_negative(row_limits[:1], message=msg),
+            _assert_monotonic_increasing(row_limits, message=msg),
+            check_ops.assert_equal(row_limits[-1:], nvals, message=msg)
+        ]
+        if not isinstance(values, RaggedTensor):
+          checks.append(check_ops.assert_rank_at_least(values, 1))
+        row_limits = control_flow_ops.with_dependencies(checks, row_limits)
+
+      zero = array_ops.zeros([1], row_limits.dtype)
       row_splits = array_ops.concat([zero, row_limits], axis=0)
       return cls(values=values, row_splits=row_splits, internal=True)
 
@@ -514,7 +630,8 @@
                                flat_values,
                                nested_value_rowids,
                                nested_nrows=None,
-                               name=None):
+                               name=None,
+                               validate=True):
     """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
 
     Equivalent to:
@@ -527,11 +644,13 @@
 
     Args:
       flat_values: A potentially ragged tensor.
-      nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is
+      nested_value_rowids: A list of 1-D integer tensors.  The `i`th tensor is
         used as the `value_rowids` for the `i`th ragged dimension.
-      nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
+      nested_nrows: A list of integer scalars.  The `i`th scalar is used as the
         `nrows` for the `i`th ragged dimension.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor` (or `flat_values` if `nested_value_rowids` is empty).
@@ -539,6 +658,8 @@
     Raises:
       ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     if isinstance(nested_value_rowids, ops.Tensor):
       raise TypeError("nested_value_rowids must be a list of Tensors")
     if nested_nrows is None:
@@ -556,11 +677,16 @@
       result = flat_values
       for value_rowids, nrows in reversed(
           list(zip(nested_value_rowids, nested_nrows))):
-        result = cls.from_value_rowids(result, value_rowids, nrows)
+        result = cls.from_value_rowids(result, value_rowids, nrows,
+                                       validate=validate)
       return result
 
   @classmethod
-  def from_nested_row_splits(cls, flat_values, nested_row_splits, name=None):
+  def from_nested_row_splits(cls,
+                             flat_values,
+                             nested_row_splits,
+                             name=None,
+                             validate=True):
     """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
 
     Equivalent to:
@@ -573,24 +699,32 @@
 
     Args:
       flat_values: A potentially ragged tensor.
-      nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used
-        as the `row_splits` for the `i`th ragged dimension.
+      nested_row_splits: A list of 1-D integer tensors.  The `i`th tensor is
+        used as the `row_splits` for the `i`th ragged dimension.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form a
+        valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor` (or `flat_values` if `nested_row_splits` is empty).
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     if isinstance(nested_row_splits, ops.Tensor):
       raise TypeError("nested_row_splits must be a list of Tensors")
     with ops.name_scope(name, "RaggedFromNestedRowSplits",
                         [flat_values] + list(nested_row_splits)):
       result = flat_values
       for splits in reversed(nested_row_splits):
-        result = cls.from_row_splits(result, splits)
+        result = cls.from_row_splits(result, splits, validate=validate)
       return result
 
   @classmethod
-  def from_nested_row_lengths(cls, flat_values, nested_row_lengths, name=None):
+  def from_nested_row_lengths(cls,
+                              flat_values,
+                              nested_row_lengths,
+                              name=None,
+                              validate=True):
     """Creates a `RaggedTensor` from a nested list of `row_lengths` tensors.
 
     Equivalent to:
@@ -603,22 +737,70 @@
 
     Args:
       flat_values: A potentially ragged tensor.
-      nested_row_lengths: A list of 1-D int64 tensors.  The `i`th tensor is used
-        as the `row_lengths` for the `i`th ragged dimension.
+      nested_row_lengths: A list of 1-D integer tensors.  The `i`th tensor is
+        used as the `row_lengths` for the `i`th ragged dimension.
       name: A name prefix for the RaggedTensor (optional).
+      validate: If true, then use assertions to check that the arguments form
+        a valid `RaggedTensor`.
 
     Returns:
       A `RaggedTensor` (or `flat_values` if `nested_row_lengths` is empty).
     """
+    if not isinstance(validate, bool):
+      raise TypeError("validate must have type bool")
     if isinstance(nested_row_lengths, ops.Tensor):
       raise TypeError("nested_row_lengths must be a list of Tensors")
     with ops.name_scope(name, "RaggedFromNestedRowlengths",
                         [flat_values] + list(nested_row_lengths)):
       result = flat_values
       for lengths in reversed(nested_row_lengths):
-        result = cls.from_row_lengths(result, lengths)
+        result = cls.from_row_lengths(result, lengths, validate=validate)
       return result
 
+  @classmethod
+  def _convert_values_and_row_partition(cls, values, partition, name):
+    """Converts `values` and `partition` to Tensors.
+
+    If `values` is a `RaggedTensor` whose `row_splits` dtype differs from the
+    dtype of `partition`, then both are cast to `int64` (if
+    auto_cast_partition_dtype() is true) or an error is raised (otherwise).
+
+    Args:
+      values: The `values` for the `RaggedTensor` being constructed.
+      partition: A row-partitioning tensor for the `RaggedTensor` being
+        constructed.  I.e., one of: row_splits, row_lengths, row_starts,
+        row_limits, value_rowids.
+      name: The name of the row-partitioning tensor.
+
+    Returns:
+      A tuple (values, partition).
+
+    Raises:
+      ValueError: If `partition` has an unsupported or mismatched dtype.
+    """
+    if isinstance(values, RaggedTensor):
+      if isinstance(partition, ops.Tensor):
+        if partition.dtype not in (dtypes.int32, dtypes.int64):
+          raise ValueError("%s must have dtype int32 or int64" % name)
+        if values.row_splits.dtype != partition.dtype:
+          if not ragged_config.auto_cast_partition_dtype():
+            raise ValueError("dtype mismatch: %s (%s) vs values.row_splits (%s)"
+                             % (name, partition.dtype, values.row_splits.dtype))
+          partition = math_ops.cast(partition, dtypes.int64)
+          values = values.with_row_splits_dtype(dtypes.int64)
+      else:  # `partition` is not a Tensor yet; adopt `values`' splits dtype.
+        partition = ops.convert_to_tensor(partition, values.row_splits.dtype,
+                                          name=name)
+    else:  # `values` is not ragged; `partition` prefers int64.
+      values = ops.convert_to_tensor(values, name="values")
+      partition = ops.convert_to_tensor(
+          partition, preferred_dtype=dtypes.int64,
+          name=name)
+      if partition.dtype not in (dtypes.int32, dtypes.int64):
+        raise ValueError("%s must have dtype int32 or int64" % name)
+
+    return (values, partition)
+
   #=============================================================================
   # Accessors
   #=============================================================================
@@ -696,7 +878,7 @@
     the slice `rt.values[rt.row_splits[i]:rt.row_splits[i+1]]`.
 
     Returns:
-      A 1-D `int64` `Tensor` with shape `[self.nrows+1]`.
+      A 1-D integer `Tensor` with shape `[self.nrows+1]`.
       The returned tensor is non-empty, and is sorted in ascending order.
       `self.row_splits[0]` is zero, and `self.row_splits[-1]` is equal to
       `self.values.shape[0]`.
@@ -752,7 +934,7 @@
         * `value_splits = rt.values.nested_row_splits` otherwise.
 
     Returns:
-      A `tuple` of 1-D `int64` `Tensor`s.
+      A `tuple` of 1-D integer `Tensor`s.
 
     #### Example:
 
@@ -785,7 +967,7 @@
       name: A name prefix for the returned tensor (optional).
 
     Returns:
-      A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
+      A 1-D integer `Tensor` with shape `self.values.shape[:1]`.
       The returned tensor is nonnegative, and is sorted in ascending order.
 
     #### Example:
@@ -803,13 +985,14 @@
     with ops.name_scope(name, "RaggedValueRowIds", [self]):
       return segment_id_ops.row_splits_to_segment_ids(self.row_splits)
 
-  def nrows(self, out_type=dtypes.int64, name=None):
+  def nrows(self, out_type=None, name=None):
     """Returns the number of rows in this ragged tensor.
 
     I.e., the size of the outermost dimension of the tensor.
 
     Args:
-      out_type: `dtype` for the returned tensor.
+      out_type: `dtype` for the returned tensor.  Defaults to
+        `self.row_splits.dtype`.
       name: A name prefix for the returned tensor (optional).
 
     Returns:
@@ -822,9 +1005,12 @@
       5
       ```
     """
+    if out_type is None:
+      out_type = self._row_splits.dtype
+    else:
+      out_type = dtypes.as_dtype(out_type)
     if self._cached_nrows is not None:
-      return self._cached_nrows
-
+      return math_ops.cast(self._cached_nrows, out_type)
     with ops.name_scope(name, "RaggedNRows", [self]):
       return array_ops.shape(self.row_splits, out_type=out_type)[0] - 1
 
@@ -838,7 +1024,7 @@
       name: A name prefix for the returned tensor (optional).
 
     Returns:
-      A 1-D Tensor of int64 with shape `[nrows]`.
+      A 1-D integer Tensor with shape `[nrows]`.
       The returned tensor is nonnegative, and is sorted in ascending order.
 
     #### Example:
@@ -863,7 +1049,7 @@
       name: A name prefix for the returned tensor (optional).
 
     Returns:
-      A 1-D Tensor of int64 with shape `[nrows]`.
+      A 1-D integer Tensor with shape `[nrows]`.
       The returned tensor is nonnegative, and is sorted in ascending order.
 
     #### Example:
@@ -890,7 +1076,7 @@
       name: A name prefix for the returned tensor (optional).
 
     Returns:
-      A potentially ragged Tensor of int64 with shape `self.shape[:axis]`.
+      A potentially ragged integer Tensor with shape `self.shape[:axis]`.
 
     Raises:
       ValueError: If `axis` is out of bounds.
@@ -917,9 +1103,10 @@
       elif isinstance(self.values, RaggedTensor):
         return self.with_values(self.values.row_lengths(axis - 1))
       else:
-        shape = array_ops.shape(self.values, out_type=dtypes.int64)
+        shape = array_ops.shape(self.values, out_type=self._row_splits.dtype)
         return self.with_values(
-            array_ops.ones(shape[:axis - 1], dtypes.int64) * shape[axis - 1])
+            array_ops.ones(shape[:axis - 1], self._row_splits.dtype) *
+            shape[axis - 1])
 
   def nested_row_lengths(self, name=None):
     """Returns a tuple containing the row_lengths for all ragged dimensions.
@@ -931,7 +1118,7 @@
       name: A name prefix for the returned tensors (optional).
 
     Returns:
-      A `tuple` of 1-D `int64` `Tensors`.  The length of the tuple is equal to
+      A `tuple` of 1-D integer `Tensors`.  The length of the tuple is equal to
       `self.ragged_rank`.
     """
     with ops.name_scope(name, "RaggedNestedRowLengths", [self]):
@@ -942,7 +1129,7 @@
         rt = rt.values
       return tuple(rt_nested_row_lengths)
 
-  def bounding_shape(self, axis=None, name=None):
+  def bounding_shape(self, axis=None, name=None, out_type=None):
     """Returns the tight bounding box shape for this `RaggedTensor`.
 
     Args:
@@ -950,13 +1137,15 @@
         bounding box for.  If not specified, then the full bounding box is
         returned.
       name: A name prefix for the returned tensor (optional).
+      out_type: `dtype` for the returned tensor.  Defaults to
+        `self.row_splits.dtype`.
 
     Returns:
-      An int64 `Tensor`.  If `axis` is not specified, then `output`
-      is a vector with `output.shape=[self.shape.ndims]`.  If `axis` is a
-      scalar, then the `output` is a scalar.  If `axis` is a vector, then
-      `output` is a vector, where `output[i]` is the bounding size for
-      dimension `axis[i]`.
+      An integer `Tensor` (`dtype=self.row_splits.dtype`).  If `axis` is not
+      specified, then `output` is a vector with
+      `output.shape=[self.shape.ndims]`.  If `axis` is a scalar, then the
+      `output` is a scalar.  If `axis` is a vector, then `output` is a vector,
+      where `output[i]` is the bounding size for dimension `axis[i]`.
 
     #### Example:
       ```python
@@ -965,6 +1154,10 @@
       [5, 4]
       ```
     """
+    if out_type is None:
+      out_type = self._row_splits.dtype
+    else:
+      out_type = dtypes.as_dtype(out_type)
     with ops.name_scope(name, "RaggedBoundingBox", [self, axis]):
       nested_splits = self.nested_row_splits
       rt_flat_values = self.flat_values
@@ -972,12 +1165,12 @@
       # Optimized special cases for when axis=0 or axis=1:
       if isinstance(axis, int):
         if axis == 0:
-          return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
+          return array_ops.shape(nested_splits[0], out_type=out_type)[0] - 1
         elif axis == 1:
           return math_ops.maximum(math_ops.reduce_max(self.row_lengths()), 0)
 
-      splits_shape = array_ops.shape(self.row_splits, out_type=dtypes.int64)
-      flat_values_shape = array_ops.shape(rt_flat_values, out_type=dtypes.int64)
+      splits_shape = array_ops.shape(self.row_splits, out_type=out_type)
+      flat_values_shape = array_ops.shape(rt_flat_values, out_type=out_type)
 
       ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
           math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
@@ -1009,6 +1202,14 @@
     """
     new_values.shape.with_rank_at_least(1)
     self.values.shape[:1].assert_is_compatible_with(new_values.shape[:1])
+    if (isinstance(new_values, RaggedTensor) and
+        self._row_splits.dtype != new_values.row_splits.dtype):
+      if not ragged_config.auto_cast_partition_dtype():
+        raise ValueError("self and new_values have mismatched row_splits "
+                         "dtypes; use RaggedTensor.with_row_splits_dtype() to "
+                         "convert them to compatible dtypes.")
+      new_values = new_values.with_row_splits_dtype(dtypes.int64)
+      return self.with_row_splits_dtype(dtypes.int64).with_values(new_values)
     return RaggedTensor(
         new_values,
         self._row_splits,
@@ -1038,6 +1239,43 @@
     else:
       return self.with_values(self.values.with_flat_values(new_values))
 
+  def with_row_splits_dtype(self, dtype):
+    """Returns a copy of this RaggedTensor with the given `row_splits` dtype.
+
+    For RaggedTensors with multiple ragged dimensions, the `row_splits` for all
+    nested `RaggedTensor` objects are cast to the given dtype.
+
+    Args:
+      dtype: The dtype for `row_splits`.  One of `tf.int32` or `tf.int64`; any
+        other dtype raises `ValueError`.
+
+    Returns:
+      `self` if `row_splits` already has `dtype`; otherwise a re-typed copy.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    if dtype not in (dtypes.int32, dtypes.int64):
+      raise ValueError("dtype must be int32 or int64")
+    if self._row_splits.dtype == dtype:
+      return self
+
+    row_splits = math_ops.cast(self._row_splits, dtype)
+    # Re-type nested ragged values and the cached partition tensors as well.
+    values = self._values
+    if isinstance(values, RaggedTensor):
+      values = values.with_row_splits_dtype(dtype)
+    cached_row_lengths = self._cached_row_lengths
+    if cached_row_lengths is not None:
+      cached_row_lengths = math_ops.cast(cached_row_lengths, dtype)
+    cached_value_rowids = self._cached_value_rowids
+    if cached_value_rowids is not None:
+      cached_value_rowids = math_ops.cast(cached_value_rowids, dtype)
+    cached_nrows = self._cached_nrows
+    if cached_nrows is not None:
+      cached_nrows = math_ops.cast(cached_nrows, dtype)
+
+    return RaggedTensor(values, row_splits, cached_row_lengths,
+                        cached_value_rowids, cached_nrows, internal=True)
+
   #=============================================================================
   # Tensor Type Conversions
   #=============================================================================
@@ -1048,7 +1286,8 @@
                   lengths=None,
                   padding=None,
                   ragged_rank=1,
-                  name=None):
+                  name=None,
+                  row_splits_dtype=dtypes.int64):
     """Converts a `tf.Tensor` into a `RaggedTensor`.
 
     The set of absent/default values may be specified using a vector of lengths
@@ -1096,6 +1335,8 @@
       ragged_rank: Integer specifying the ragged rank for the returned
         `RaggedTensor`.  Must be greater than zero.
       name: A name prefix for the returned tensors (optional).
+      row_splits_dtype: `dtype` for the returned `RaggedTensor`'s `row_splits`
+        tensor.  One of `tf.int32` or `tf.int64`.
 
     Returns:
       A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
@@ -1103,6 +1344,7 @@
     Raises:
       ValueError: If both `lengths` and `padding` are specified.
     """
+    row_splits_dtype = dtypes.as_dtype(row_splits_dtype)
     if lengths is not None and padding is not None:
       raise ValueError("Specify lengths or padding, but not both")
     if not isinstance(ragged_rank, int):
@@ -1114,7 +1356,7 @@
     with ops.name_scope(name, "RaggedFromTensor", [tensor, lengths, padding]):
       tensor = ops.convert_to_tensor(tensor, name="tensor")
       tensor.shape.with_rank_at_least(ragged_rank + 1)
-      input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
+      input_shape = array_ops.shape(tensor, out_type=row_splits_dtype)
       ncols = input_shape[1]
 
       # Handle ragged_rank>1 via recursion:
@@ -1125,12 +1367,14 @@
       if ragged_rank > 1:
         # Flatten `tensor` to eliminate all but the last ragged dimension.
         new_shape = array_ops.concat([
-            constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]
+            constant_op.constant([-1], row_splits_dtype),
+            input_shape[ragged_rank:]
         ],
                                      axis=0)
         flattened = array_ops.reshape(tensor, new_shape)
         # Recursively convert the flattened tensor.
-        values = cls.from_tensor(flattened, lengths, padding)
+        values = cls.from_tensor(flattened, lengths, padding,
+                                 row_splits_dtype=row_splits_dtype)
         # The total number of elements in each  dimension.  E.g., if
         # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
         dim_size = math_ops.cumprod(input_shape)
@@ -1139,7 +1383,7 @@
             math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
             for dim in range(1, ragged_rank)
         ]
-        return cls.from_nested_row_splits(values, new_splits)
+        return cls.from_nested_row_splits(values, new_splits, validate=False)
 
       # If padding was specified, then use it to find row lengths.
       if padding is not None:
@@ -1167,12 +1411,12 @@
         has_default.set_shape(tensor_shape.TensorShape([None, None]))
         has_default.set_shape(tensor.shape[:2])
 
-        # Use has_default it to find the length of each row: for each
+        # Use has_default to find the length of each row: for each
         # non-default item in a row, calculate the length that the row needs to
         # have to include that item; and then take the max of those values
         # (across each row).
         has_nondefault = math_ops.logical_not(has_default)
-        has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
+        has_nondefault = math_ops.cast(has_nondefault, row_splits_dtype)
         length_for_nondefault_value = (
             has_nondefault * array_ops.expand_dims(
                 math_ops.range(1, ncols + 1), 0))
@@ -1189,25 +1433,27 @@
           tensor.shape.with_rank_at_least(len(lengths) + 1)
           num_tokens = math_ops.reduce_sum(lengths[-1])
           ones_mask = array_ops.ones([num_tokens], dtype=dtypes.bool)
-          ragged_mask = cls.from_nested_row_lengths(ones_mask, lengths)
+          ragged_mask = cls.from_nested_row_lengths(
+              ones_mask, lengths, validate=False)
           dense_ragged_mask = ragged_mask.to_tensor(default_value=False)
           masked_data = array_ops.boolean_mask(tensor, dense_ragged_mask)
-          return cls.from_nested_row_lengths(masked_data, lengths)
+          return cls.from_nested_row_lengths(
+              masked_data, lengths, validate=False)
         else:
           # If we have lengths (either directly supplied, or computed from
           # paddings), then use those to construct splits; and then use masking
           # to get the corresponding values.
           lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
-                                                      dtypes.int64)
+                                                      row_splits_dtype)
           lengths.shape.assert_has_rank(1)
           lengths = math_ops.minimum(lengths, ncols)
           lengths = math_ops.maximum(lengths, 0)
           limits = math_ops.cumsum(lengths)
           splits = array_ops.concat(
-              [array_ops.zeros([1], dtypes.int64), limits], axis=0)
+              [array_ops.zeros([1], row_splits_dtype), limits], axis=0)
           mask = array_ops.sequence_mask(lengths, maxlen=ncols)
           values = array_ops.boolean_mask(tensor, mask)
-          return cls.from_row_splits(values, splits)
+          return cls.from_row_splits(values, splits, validate=False)
 
       # If neither padding nor lengths were specified, then create a splits
       # vector that contains no default values, and reshape the input tensor
@@ -1217,7 +1463,7 @@
       splits = math_ops.range(nrows + 1) * ncols
       values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
       values = array_ops.reshape(tensor, values_shape)
-      return cls.from_row_splits(values, splits)
+      return cls.from_row_splits(values, splits, validate=False)
 
   def to_tensor(self, default_value=None, name=None):
     """Converts this `RaggedTensor` into a `tf.Tensor`.
@@ -1267,9 +1513,10 @@
 
       # Get the expected dense shape ([nrows, ncols] + value_shape).
       rt_row_lengths = [self.row_splits[1:] - self.row_splits[:-1]]
-      nrows = array_ops.shape(self.row_splits, out_type=dtypes.int64)[0] - 1
+      nrows = array_ops.shape(self.row_splits,
+                              out_type=self._row_splits.dtype)[0] - 1
       ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
-      values_shape = array_ops.shape(values, out_type=dtypes.int64)
+      values_shape = array_ops.shape(values, out_type=self._row_splits.dtype)
       value_shape = values_shape[1:]
       nvals = values_shape[0]
 
@@ -1305,7 +1552,7 @@
       return array_ops.gather(values_and_default, indices)
 
   @classmethod
-  def from_sparse(cls, st_input, name=None):
+  def from_sparse(cls, st_input, name=None, row_splits_dtype=dtypes.int64):
     """Converts a 2D `tf.SparseTensor` to a `RaggedTensor`.
 
     Each row of the `output` `RaggedTensor` will contain the explicit values
@@ -1327,6 +1574,8 @@
     Args:
       st_input: The sparse tensor to convert.  Must have rank 2.
       name: A name prefix for the returned tensors (optional).
+      row_splits_dtype: `dtype` for the returned `RaggedTensor`'s `row_splits`
+        tensor.  One of `tf.int32` or `tf.int64`.
 
     Returns:
       A `RaggedTensor` with the same values as `st_input`.
@@ -1336,6 +1585,7 @@
       ValueError: If the number of dimensions in `st_input` is not known
         statically, or is not two.
     """
+    row_splits_dtype = dtypes.as_dtype(row_splits_dtype)
     if not sparse_tensor.is_sparse(st_input):
       raise TypeError("Expected SparseTensor, got %s" % type(st_input).__name__)
     with ops.name_scope(name, "RaggedFromSparse", [st_input]):
@@ -1360,9 +1610,10 @@
         # Treat sparse row indices as segment ids to generate a splits tensor
         # thta we can pair with the sparse tensor values.  (Ignore sparse column
         # indices.)
-        segment_ids = st_input.indices[:, 0]
-        num_segments = st_input.dense_shape[0]
-        return cls.from_value_rowids(st_input.values, segment_ids, num_segments)
+        segment_ids = math_ops.cast(st_input.indices[:, 0], row_splits_dtype)
+        num_segments = math_ops.cast(st_input.dense_shape[0], row_splits_dtype)
+        return cls.from_value_rowids(
+            st_input.values, segment_ids, num_segments, validate=False)
 
   def to_sparse(self, name=None):
     """Converts this `RaggedTensor` into a `tf.SparseTensor`.
@@ -1390,6 +1641,105 @@
                                         result.sparse_values,
                                         result.sparse_dense_shape)
 
+  @classmethod
+  def _from_variant(cls,
+                    variant,
+                    dtype,
+                    output_ragged_rank,
+                    input_ragged_rank=None,
+                    name=None):
+    """Converts a `variant` Tensor into a `RaggedTensor`.
+
+    The input `variant` could be a scalar, meaning it encodes a single
+    `RaggedTensor` with ragged_rank `output_ragged_rank`. Alternatively it could
+    have an arbitrary rank, in which case each element is decoded into a
+    `RaggedTensor` with ragged_rank `input_ragged_rank` and these are then
+    stacked according to the input shape to output a single `RaggedTensor`
+    with ragged_rank `output_ragged_rank`. If `input_ragged_rank` is not
+    provided, it is inferred dynamically as `output_ragged_rank` -
+    `rank(variant)`. If `input_ragged_rank` is provided, the following must be
+    true: `output_ragged_rank` = `input_ragged_rank` + `rank(variant)`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[0], [1, 2]])
+    >>> et = rt._to_variant()
+    >>> stacked_et = ragged.stack([et, et])
+    >>> ragged.RaggedTensor._from_variant(  # scalar input.
+          et, dtype=tf.int32, output_ragged_rank=1).eval().tolist()
+    [[0], [1, 2]]
+    >>> ragged.RaggedTensor._from_variant(  # batched input.
+          stacked_et, dtype=tf.int32, output_ragged_rank=2).eval().tolist()
+    [[[0], [1, 2]], [[0], [1, 2]]]
+    ```
+
+    Args:
+      variant: A `variant` Tensor representing an encoded (possibly
+        nested-batched) `RaggedTensor`.
+      dtype: The dtype of the encoded `RaggedTensor`.
+      output_ragged_rank: The expected ragged rank of the output `RaggedTensor`.
+      input_ragged_rank: The ragged rank of each encoded `RaggedTensor`. This
+        is optional and inferred dynamically if not provided.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` of dtype `dtype` and ragged rank `output_ragged_rank`.
+
+    Raises:
+      ValueError: If the input rank is known, `input_ragged_rank` is provided
+          and `output_ragged_rank` = `input_ragged_rank` + `rank(variant)` does
+          not hold.
+    """
+    variant = ops.convert_to_tensor(
+        variant, name="variant", dtype=dtypes.variant)
+    if (variant.shape.ndims is not None and input_ragged_rank is not None and
+        output_ragged_rank != input_ragged_rank + variant.shape.ndims):
+      raise ValueError(
+          "output_ragged_rank must be equal to input_ragged_rank + "
+          "variant.shape.ndims, found variant.shape.ndims: %d, "
+          "input_ragged_rank: %d, output_ragged_rank: %d" %
+          (variant.shape.ndims, input_ragged_rank, output_ragged_rank))
+    input_ragged_rank = -1 if input_ragged_rank is None else input_ragged_rank
+    with ops.name_scope(
+        name, "RaggedFromVariant",
+        [variant, dtype, input_ragged_rank, output_ragged_rank]):
+      result = gen_ragged_conversion_ops.ragged_tensor_from_variant(
+          variant, input_ragged_rank, output_ragged_rank, dtype, dtypes.int64,
+          name)
+      return cls.from_nested_row_splits(
+          result.output_dense_values,
+          result.output_nested_splits,
+          validate=False)
+
+  def _to_variant(self, batched_input=False, name=None):
+    """Encodes this `RaggedTensor` as a `variant` Tensor.
+
+    With `batched_input=False` (the default) the whole `RaggedTensor` is
+    encoded as is, yielding a scalar `variant` Tensor.  With
+    `batched_input=True` it is first unbatched along dimension zero; every
+    component is encoded into its own scalar `variant`, and those scalars are
+    stacked into a 1-D `variant` Tensor.
+
+    Example:
+    >>> rt = ragged.constant([[[0]], [[1]], [[2]]])
+    >>> rt._to_variant().shape.as_list()
+    []
+    >>> rt._to_variant(batched_input=True).shape.as_list()
+    [3]
+
+    Args:
+      batched_input: Whether to unbatch the `RaggedTensor` and emit a
+        `variant` vector instead of a scalar.  Defaults to `False`.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      The `variant` Tensor holding the encoded `RaggedTensor`.
+    """
+    with ops.name_scope(name, "RaggedToVariant", [self, batched_input]):
+      op_args = (self.nested_row_splits, self.flat_values, batched_input, name)
+      return gen_ragged_conversion_ops.ragged_tensor_to_variant(*op_args)
+
   #=============================================================================
   # String Encoding
   #=============================================================================
@@ -1472,8 +1822,9 @@
     return (self.flat_values,) + self.nested_row_splits
 
   @classmethod
-  def _from_components(cls, components):
-    return cls.from_nested_row_splits(components[0], components[1:])
+  def _from_components(cls, components, metadata):
+    return cls.from_nested_row_splits(
+        components[0], components[1:], validate=False)
 
   def _shape_invariant_to_components(self, shape=None):
     ragged_rank = self.ragged_rank
@@ -1509,7 +1860,10 @@
 
   @property
   def _is_graph_tensor(self):
-    return hasattr(self._values, 'graph')
+    return hasattr(self._values, "graph")
+
+  def consumers(self):
+    return self._consumers()  # Public alias; all work happens in `_consumers`.
 
 
 def is_ragged(value):
@@ -1518,6 +1872,50 @@
                     (RaggedTensor, ragged_tensor_value.RaggedTensorValue))
 
 
+def match_row_splits_dtypes(*tensors, **kwargs):
+  """Makes the `row_splits` of every `RaggedTensor` argument share one dtype.
+
+  Inputs whose `row_splits` dtypes already agree are returned unchanged.  When
+  int32 and int64 are mixed, all `RaggedTensor`s are converted to int64, but
+  only if `ragged_config.auto_cast_partition_dtype()` allows it.
+
+  Args:
+    *tensors: Any mix of `Tensor`s and `RaggedTensor`s.
+    **kwargs: Only `return_dtype` is accepted.  With `return_dtype=True` the
+      result is a `(dtype, tensors)` tuple, where `dtype` is the common
+      row-splits dtype (int64 when no `RaggedTensor` is given).
+
+  Returns:
+    The (possibly converted) `tensors`, or `(dtype, tensors)`; see `**kwargs`.
+
+  Raises:
+    ValueError: On unexpected keyword arguments, or on mismatched row-splits
+      dtypes while automatic casting is disabled.
+  """
+  return_dtype = kwargs.pop("return_dtype", False)
+  if kwargs:
+    raise ValueError("Unexpected keyword args %r" % kwargs)
+
+  # One flag per RaggedTensor: True for int32 row_splits, False otherwise.
+  is_int32 = [t.row_splits.dtype == dtypes.int32
+              for t in tensors if isinstance(t, RaggedTensor)]
+  has_int32, has_int64 = any(is_int32), not all(is_int32)
+
+  if not has_int32:
+    dtype = dtypes.int64
+  elif not has_int64:
+    dtype = dtypes.int32
+  elif ragged_config.auto_cast_partition_dtype():
+    dtype = dtypes.int64
+    tensors = tuple(t.with_row_splits_dtype(dtypes.int64)
+                    if isinstance(t, RaggedTensor) else t for t in tensors)
+  else:
+    raise ValueError("Input RaggedTensors have mismatched row_splits dtypes; "
+                     "use RaggedTensor.with_row_splits_dtype() to convert "
+                     "them to compatible dtypes.")
+  return (dtype, tensors) if return_dtype else tensors
+
+
 #===============================================================================
 # Convert value -> tensor
 #===============================================================================
@@ -1558,8 +1956,8 @@
           dtype=dtype,
           preferred_dtype=preferred_dtype,
           name="flat_values")
-      return RaggedTensor.from_nested_row_splits(flat_values,
-                                                 value.nested_row_splits)
+      return RaggedTensor.from_nested_row_splits(
+          flat_values, value.nested_row_splits, validate=False)
   else:
     return ops.convert_to_tensor(
         value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
@@ -1606,18 +2004,23 @@
   `RaggedTensor`.
   """
 
-  def __init__(self, dtype, ragged_rank):
+  def __init__(self, dtype, ragged_rank, row_splits_dtype=dtypes.int64):
     """Initializes a RaggedTensorType object.
 
     Args:
       dtype: data type of the `RaggedTensor`'s inner values.
       ragged_rank: ragged_rank of the declared `RaggedTensor`.
+      row_splits_dtype: data type for the `RaggedTensor`'s row splits.
+        One of: `tf.int32` or `tf.int64`.
     """
+    row_splits_dtype = dtypes.as_dtype(row_splits_dtype)
     self._dtype = dtype
     self._ragged_rank = ragged_rank
+    self._row_splits_dtype = row_splits_dtype
 
   dtype = property(lambda self: self._dtype)
   ragged_rank = property(lambda self: self._ragged_rank)
+  row_splits_dtype = property(lambda self: self._row_splits_dtype)
 
 
 #===============================================================================
@@ -1682,3 +2085,20 @@
                                            flat_values_shape)
 
   return nested_row_splits_gradient + [flat_values_gradient]
+
+
+def _assert_monotonic_increasing(tensor, message=None):
+  deltas = tensor[1:] - tensor[:-1]  # All >= 0 iff `tensor` never decreases.
+  return check_ops.assert_non_negative(deltas, message=message)
+
+
+def _assert_zero(tensor, message=None):
+  zero = constant_op.constant(0, dtype=tensor.dtype)  # Match `tensor`'s dtype.
+  return check_ops.assert_equal(tensor, zero, message=message)
+
+
+def _nrows(tensor, out_type=dtypes.int32):
+  """Returns the number of rows (size of dimension 0) of `tensor`."""
+  if not isinstance(tensor, RaggedTensor):
+    return array_ops.shape(tensor, out_type=out_type)[0]
+  return tensor.nrows(out_type=out_type)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape.py b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
index 706881d..111e2c8 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_shape.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
@@ -27,7 +27,7 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_config
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 
@@ -82,7 +82,8 @@
   `[[[1, 2], [3]], [[4, 5]]]`    |      2 | `2, (2, 1), (2, 1, 2)` |
   """
 
-  def __init__(self, partitioned_dim_sizes, inner_dim_sizes):
+  def __init__(self, partitioned_dim_sizes, inner_dim_sizes,
+               dim_size_dtype=None):
     """Creates a RaggedTensorDynamicShape.
 
     Args:
@@ -96,16 +97,19 @@
         number of inner dimensions.  `inner_dim_sizes[n]` is the size of all
         slices across the `n`th inner dimension (which is the
         `(len(partitioned_dim_sizes)+n)`th dimension in the overall tensor.
+      dim_size_dtype: dtype for dimension sizes.  If not specified, then it
+        is chosen based on the dtypes of `partitioned_dim_sizes` and
+        `inner_dim_sizes`.
     """
     assert isinstance(partitioned_dim_sizes, (list, tuple))
+
     with ops.name_scope(None, 'RaggedTensorDynamicShape',
                         (partitioned_dim_sizes, inner_dim_sizes)):
       partitioned_dim_sizes = tuple(
-          ragged_util.convert_to_int_tensor(
-              size, dtype=dtypes.int64, name='partitioned_dimension_size')
-          for size in partitioned_dim_sizes)
-      inner_dim_sizes = ragged_util.convert_to_int_tensor(
-          inner_dim_sizes, dtype=dtypes.int64, name='inner_dim_sizes')
+          ops.convert_to_tensor(size, name='partitioned_dimension_size_%d' % i)
+          for (i, size) in enumerate(partitioned_dim_sizes))
+      inner_dim_sizes = ops.convert_to_tensor(
+          inner_dim_sizes, name='inner_dim_sizes')
 
       # Validate shapes.
       if partitioned_dim_sizes:
@@ -120,6 +124,22 @@
           raise ValueError('innermost partitioned dimension must be ragged')
       inner_dim_sizes.shape.assert_has_rank(1)
 
+      # Convert dimension size tensors to a single dtype.
+      if dim_size_dtype is None:
+        dim_size_dtypes = set([p.dtype for p in partitioned_dim_sizes
+                               if p.shape.ndims == 1])
+        if not dim_size_dtypes:
+          dim_size_dtype = dtypes.int64
+        elif len(dim_size_dtypes) == 1:
+          dim_size_dtype = dim_size_dtypes.pop()
+        else:
+          if not ragged_config.auto_cast_partition_dtype():
+            raise ValueError('partitioned_dim_sizes must have matching dtypes')
+          dim_size_dtype = dtypes.int64
+      partitioned_dim_sizes = tuple(math_ops.cast(p, dim_size_dtype)
+                                    for p in partitioned_dim_sizes)
+      inner_dim_sizes = math_ops.cast(inner_dim_sizes, dim_size_dtype)
+
       self._partitioned_dim_sizes = partitioned_dim_sizes
       self._inner_dim_sizes = inner_dim_sizes
 
@@ -137,7 +157,7 @@
     ragged.
 
     Args:
-      dim_sizes: List of int64 scalars or vectors.
+      dim_sizes: List of int32 or int64 scalars or vectors.
 
     Returns:
       A RaggedTensorDynamicShape.
@@ -145,8 +165,8 @@
     with ops.name_scope(None, 'RaggedTensorDynamicShapeFromDimensionSizes',
                         [dim_sizes]):
       dim_sizes = tuple(
-          ragged_util.convert_to_int_tensor(
-              size, dtype=dtypes.int64, name='dim_sizes') for size in dim_sizes)
+          ops.convert_to_tensor(size, preferred_dtype=dtypes.int64,
+                                name='dim_sizes') for size in dim_sizes)
       # Split the dimensions into partitioned & inner dimensions.
       inner_split = 0
       for dim, dim_size in enumerate(dim_sizes):
@@ -158,7 +178,7 @@
                                       dim_sizes[inner_split:])
 
   @classmethod
-  def from_tensor(cls, rt_input):
+  def from_tensor(cls, rt_input, dim_size_dtype=None):
     """Constructs a ragged shape for a potentially ragged tensor."""
     with ops.name_scope(None, 'RaggedTensorDynamicShapeFromTensor', [rt_input]):
       rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
@@ -169,7 +189,8 @@
             (rt_input.nrows(),) + rt_input.nested_row_lengths())
         return RaggedTensorDynamicShape(
             partitioned_dim_sizes,
-            array_ops.shape(rt_input.flat_values)[1:])
+            array_ops.shape(rt_input.flat_values)[1:],
+            dim_size_dtype=dim_size_dtype)
 
   def dimension_size(self, axis):
     """Returns the size of slices across the specified dimension."""
@@ -231,6 +252,11 @@
     """The number of inner dimensions, or `None` if not statically known."""
     return tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
 
+  @property
+  def dim_size_dtype(self):
+    """DType of this shape's dimension sizes (read from `inner_dim_sizes`)."""
+    return self._inner_dim_sizes.dtype
+
   def broadcast_to_rank(self, rank):
     """Adds leading size-1 dimensions to broadcast `self` to the given rank.
 
@@ -260,7 +286,8 @@
       return RaggedTensorDynamicShape(partitioned_dims, self._inner_dim_sizes)
     else:
       inner_dims = array_ops.concat(
-          [array_ops.ones([dims_to_add], dtypes.int64), self.inner_dim_sizes],
+          [array_ops.ones([dims_to_add], self.dim_size_dtype),
+           self.inner_dim_sizes],
           axis=0)
       return RaggedTensorDynamicShape([], inner_dims)
 
@@ -290,7 +317,7 @@
       A `RaggedTensorDynamicShape`.
     """
     lengths = ragged_util.convert_to_int_tensor(
-        lengths, name='lengths', dtype=dtypes.int64)
+        lengths, name='lengths', dtype=self.dim_size_dtype)
     # Check whether lengths is a scalar (for uniform dimensions) or
     # vector (for ragged dimensions).
     if lengths.shape.ndims is None:
@@ -347,7 +374,7 @@
   def num_slices_in_dimension(self, axis):
     """Returns the total number of slices across the indicated dimension."""
     if axis < 0:
-      return constant_op.constant(1, dtype=dtypes.int64)
+      return constant_op.constant(1, dtype=self.dim_size_dtype)
     elif self.is_ragged(axis):
       return math_ops.reduce_sum(self._partitioned_dim_sizes[axis])
     else:
@@ -365,7 +392,7 @@
       splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
     else:
       splits = math_ops.range(
-          array_ops.size(lengths, out_type=dtypes.int64) + 1)
+          array_ops.size(lengths, out_type=self.dim_size_dtype) + 1)
       repeats = lengths
 
     partitioned_sizes.append(lengths)
@@ -404,6 +431,15 @@
     inner_sizes = self._inner_dim_sizes[axis_in_inner_dims + 1:]
     return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
 
+  def with_dim_size_dtype(self, dtype):
+    """Returns this shape with dimension sizes cast to `dtype` (int32/int64)."""
+    if dtype not in (dtypes.int32, dtypes.int64):
+      raise ValueError('dtype must be int32 or int64')
+    if self.dim_size_dtype == dtype:
+      return self
+    return RaggedTensorDynamicShape(  # Pass dtype so __init__ won't infer it.
+        self._partitioned_dim_sizes, self._inner_dim_sizes, dtype)
+
 
 def broadcast_dynamic_shape(shape_x, shape_y):
   """Returns the shape formed by broadcasting two shapes to be compatible.
@@ -479,6 +515,17 @@
 
 def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
   """Broadcasts rt_input to the ragged shape `dst_shape`."""
+  # Check that rt_input and dst_shape have the same row_splits dtype.
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.row_splits.dtype != dst_shape.dim_size_dtype):
+    if not ragged_config.auto_cast_partition_dtype():
+      raise ValueError('rt_input and dst_shape have different row_split '
+                       'dtypes; use RaggedTensor.with_row_splits_dtype() or '
+                       'RaggedTensorDynamicShape.with_dim_size_dtype() to '
+                       'convert to a compatible dtype.')
+    rt_input = rt_input.with_row_splits_dtype(dtypes.int64)
+    dst_shape = dst_shape.with_dim_size_dtype(dtypes.int64)
+
   # dst_shape's rank and ragged_rank must be greater than or equal to rt_input's
   if rt_input.shape.ndims is None or dst_shape.rank is None:
     raise ValueError('Unable to broadcast: unknown rank')
@@ -500,8 +547,10 @@
       if ragged_tensor.is_ragged(rt_input):
         nrows = rt_input.nrows()
       else:
-        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
-      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])
+        nrows = array_ops.shape(rt_input,
+                                out_type=dst_shape.dim_size_dtype)[0]
+      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows],
+                                                             validate=False)
 
   # Add ragged dimensions to match dst_shape.
   if ragged_tensor.is_ragged(rt_input):
@@ -509,11 +558,13 @@
         rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
     if inner_rank_diff > 0:
       rt_input = rt_input.with_flat_values(
-          ragged_conversion_ops.from_tensor(
-              rt_input.flat_values, ragged_rank=inner_rank_diff))
+          ragged_tensor.RaggedTensor.from_tensor(
+              rt_input.flat_values, ragged_rank=inner_rank_diff,
+              row_splits_dtype=dst_shape.dim_size_dtype))
   else:
-    rt_input = ragged_conversion_ops.from_tensor(
-        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)
+    rt_input = ragged_tensor.RaggedTensor.from_tensor(
+        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1,
+        row_splits_dtype=dst_shape.dim_size_dtype)
 
   # Do broadcasting for any dimensions that will remain uniform.  We can do
   # these all at once, since they're independent of one another.
@@ -541,21 +592,24 @@
   for axis in range(dst_shape.num_partitioned_dimensions):
     if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
       dst_size = dst_shape.dimension_size(axis)
-      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)
+      rt_input = _ragged_tile_axis(rt_input, axis, dst_size,
+                                   dst_shape.dim_size_dtype)
 
   return rt_input
 
 
-def _ragged_tile_axis(rt_input, axis, repeats):
+def _ragged_tile_axis(rt_input, axis, repeats, row_splits_dtype):
   """Tile a dimension of a RaggedTensor to match a ragged shape."""
   assert axis > 0  # Outermost dimension may not be ragged.
 
   if not ragged_tensor.is_ragged(rt_input):
-    rt_input = ragged_conversion_ops.from_tensor(rt_input, ragged_rank=1)
+    rt_input = ragged_tensor.RaggedTensor.from_tensor(
+        rt_input, ragged_rank=1, row_splits_dtype=row_splits_dtype)
 
   if axis > 1:
     return rt_input.with_values(
-        _ragged_tile_axis(rt_input.values, axis - 1, repeats))
+        _ragged_tile_axis(rt_input.values, axis - 1, repeats,
+                          row_splits_dtype))
   else:
     src_row_splits = rt_input.nested_row_splits
     src_row_lengths = rt_input.nested_row_lengths()
@@ -569,4 +623,4 @@
     dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
                                            repeats)
     return ragged_tensor.RaggedTensor.from_nested_row_lengths(
-        dst_values, dst_row_lengths)
+        dst_values, dst_row_lengths, validate=False)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
index bc0139c..b83c2e0 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
@@ -32,8 +32,8 @@
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
-                                  parameterized.TestCase):
+class RaggedTensorShapeTest(ragged_test_util.RaggedTensorTestCase,
+                            parameterized.TestCase):
 
   def assertShapeEq(self, x, y):
     assert isinstance(x, RaggedTensorDynamicShape)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_spec.py b/tensorflow/python/ops/ragged/ragged_tensor_spec.py
new file mode 100644
index 0000000..9da282c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_spec.py
@@ -0,0 +1,77 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorSpec factory for ragged tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def ragged_tensor_spec(shape=None, dtype=dtypes.float32,
+                       ragged_rank=None, row_splits_dtype=dtypes.int64,
+                       name=None):
+  """Builds `TensorSpec`s describing a (potentially) ragged tensor.
+
+  The result can be given to `tf.function` (or to any other function that
+  expects `TensorSpec`s) to constrain the shape of a `RaggedTensor` argument.
+
+  Args:
+    shape: Shape of the RaggedTensor; `None` permits any shape.
+    dtype: Data type of the RaggedTensor's values.
+    ragged_rank: Python integer ragged rank of the described RaggedTensor.
+      If omitted, `shape.ndims - 1` is used (so `shape` needs a known rank).
+    row_splits_dtype: `dtype` of the RaggedTensor's `row_splits` tensors.
+      One of `tf.int32` or `tf.int64`.
+    name: Optional name prefix for the `TensorSpec`s.
+
+  Returns:
+    A plain `TensorSpec` if `ragged_rank` is 0; otherwise an object describing
+    the `flat_values` and `nested_row_splits` of the `RaggedTensor`.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  shape = tensor_shape.TensorShape(shape)
+  if ragged_rank is not None:
+    if not isinstance(ragged_rank, int):
+      raise TypeError("ragged_rank must be an int")
+  elif shape.ndims is None:
+    raise ValueError("Must specify ragged_rank or a shape with known rank.")
+  else:
+    ragged_rank = shape.ndims - 1
+  if ragged_rank == 0:
+    return tensor_spec.TensorSpec(shape=shape, dtype=dtype, name=name)
+
+  def splits_spec(index, splits_shape):
+    splits_name = "%s.row_splits_%d" % (name, index) if name else None
+    return tensor_spec.TensorSpec(splits_shape, row_splits_dtype, splits_name)
+
+  # Spec for flat_values: unknown leading size, then the uniform inner dims.
+  flat_shape = tensor_shape.TensorShape([None]).concatenate(
+      shape[ragged_rank + 1:])
+  result = tensor_spec.TensorSpec(flat_shape, dtype, name)
+
+  # Wrap the inner ragged dimensions, innermost first.
+  for i in reversed(range(1, ragged_rank)):
+    result = ragged_tensor.RaggedTensor.from_row_splits(
+        result, splits_spec(i, [None]))
+
+  outer_dim = tensor_shape.dimension_at_index(shape, 0)
+  outer_splits_shape = [None if outer_dim is None else outer_dim + 1]
+  return ragged_tensor.RaggedTensor.from_row_splits(
+      result, splits_spec(0, outer_splits_shape))
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index 5d55afc..1af535b 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -28,6 +28,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
@@ -181,7 +182,7 @@
     rt_value = ragged_tensor_value.RaggedTensorValue(values, splits)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (5, None))
-    self.assertEqual(len(rt_value.nested_row_splits), 1)
+    self.assertLen(rt_value.nested_row_splits, 1)
     self.assertAllEqual(splits, rt_value.row_splits)
     self.assertAllEqual(values, rt_value.values)
     self.assertAllEqual(splits, rt_value.nested_row_splits[0])
@@ -193,7 +194,7 @@
         row_splits=splits2)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (2, None, None))
-    self.assertEqual(len(rt_value.nested_row_splits), 2)
+    self.assertLen(rt_value.nested_row_splits, 2)
     self.assertAllEqual(splits2, rt_value.row_splits)
     self.assertAllEqual(splits, rt_value.values.row_splits)
     self.assertAllEqual(splits2, rt_value.nested_row_splits[0])
@@ -245,17 +246,16 @@
           cached_row_lengths=[2, 3, 4],
           internal=True)
 
-
-#=============================================================================
-# RaggedTensor Factory Ops
-#=============================================================================
+  #=============================================================================
+  # RaggedTensor Factory Ops
+  #=============================================================================
 
   def testFromValueRowIdsWithDerivedNRows(self):
     # nrows is known at graph creation time.
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
 
-    rt = RaggedTensor.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -278,7 +278,7 @@
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     value_rowids = array_ops.placeholder_with_default(value_rowids, shape=None)
 
-    rt = RaggedTensor.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     if context.executing_eagerly():
       self.assertEqual(rt.shape.as_list(), [5, None])
@@ -303,7 +303,8 @@
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(7, dtypes.int64)
 
-    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows,
+                                        validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [7, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -324,7 +325,8 @@
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(5, dtypes.int64)
 
-    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows,
+                                        validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -357,7 +359,7 @@
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = RaggedTensor.from_row_splits(values, row_splits)
+    rt = RaggedTensor.from_row_splits(values, row_splits, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -382,7 +384,7 @@
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
 
-    rt = RaggedTensor.from_row_starts(values, row_starts)
+    rt = RaggedTensor.from_row_starts(values, row_starts, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -402,7 +404,7 @@
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_limits = constant_op.constant([2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = RaggedTensor.from_row_limits(values, row_limits)
+    rt = RaggedTensor.from_row_limits(values, row_limits, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -422,7 +424,7 @@
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_lengths = constant_op.constant([2, 0, 3, 1, 1], dtypes.int64)
 
-    rt = RaggedTensor.from_row_lengths(values, row_lengths)
+    rt = RaggedTensor.from_row_lengths(values, row_lengths, validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
@@ -525,7 +527,8 @@
         constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     ]
 
-    rt = RaggedTensor.from_nested_row_splits(flat_values, nested_row_splits)
+    rt = RaggedTensor.from_nested_row_splits(flat_values, nested_row_splits,
+                                             validate=False)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [4, None, None])
     self.assertEqual(rt.ragged_rank, 2)
@@ -728,7 +731,8 @@
       * Call __getitem__ with int values in the slice spec wrapped in
         `tf.constant()`.
       * Call __getitem__ with int values in the slice spec wrapped in
-        `tf.placeholder()` (so value is not known at graph construction time).
+        `tf.compat.v1.placeholder()` (so value is not known at graph
+        construction time).
 
     Args:
       rt: The RaggedTensor to test.
@@ -1076,16 +1080,18 @@
   def testRaggedTensorStr(self):
     values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
     row_splits = [0, 2, 5, 6, 6, 7]
-    rt = RaggedTensor.from_row_splits(values, row_splits)
+    rt = RaggedTensor.from_row_splits(values, row_splits, validate=False)
+    splits_type = 'int64'
     if context.executing_eagerly():
       expected_str = '<tf.RaggedTensor {}>'.format([[b'a', b'b'],
                                                     [b'c', b'd', b'e'], [b'f'],
                                                     [], [b'g']])
       expected_repr = (
           'tf.RaggedTensor(values=tf.Tensor([{}], shape=(7,), dtype=string), '
-          'row_splits=tf.Tensor([{}], shape=(6,), dtype=int64))'.format(
-              ' '.join(repr(x) for x in values), ' '.join(
-                  repr(x) for x in row_splits)))
+          'row_splits=tf.Tensor([{}], shape=(6,), dtype={}))'.format(
+              ' '.join(repr(x) for x in values),
+              ' '.join(repr(x) for x in row_splits),
+              splits_type))
       self.assertEqual(str(rt), expected_str)
       self.assertEqual(repr(rt), expected_repr)
     else:
@@ -1093,7 +1099,7 @@
           'tf.RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
           'shape=(7,), dtype=string), row_splits='
           'Tensor("RaggedFromRowSplits/row_splits:0", '
-          'shape=(6,), dtype=int64))')
+          'shape=(6,), dtype={}))').format(splits_type)
       self.assertEqual(repr(rt), expected_repr)
       self.assertEqual(str(rt), expected_repr)
 
@@ -1144,7 +1150,7 @@
     rt2 = ragged_factory_ops.constant([[[], [1, 2]], [[3]]])
     with self.test_session() as session:
       result = session.run({'rt1': rt1, 'rt2': rt2})
-      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      self.assertCountEqual(result.keys(), ['rt1', 'rt2'])
       self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
       self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
@@ -1165,15 +1171,10 @@
     rt2_feed_val = ragged_factory_ops.constant_value([[[], [1, 2]], [[3]]])
 
     with self.test_session() as session:
-      result = session.run({
-          'rt1': rt1,
-          'rt2': rt2
-      },
-                           feed_dict={
-                               rt1: rt1_feed_val,
-                               rt2: rt2_feed_val
-                           })
-      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      fetches = {'rt1': rt1, 'rt2': rt2}
+      feeds = {rt1: rt1_feed_val, rt2: rt2_feed_val}
+      result = session.run(fetches, feed_dict=feeds)
+      self.assertCountEqual(result.keys(), ['rt1', 'rt2'])
       self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
       self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
@@ -1226,10 +1227,327 @@
 
     a = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
-        array_ops.placeholder(dtypes.int64, name='a.row_splits'))
+        array_ops.placeholder(dtypes.int64, name='a.row_splits'),
+        validate=False)
     ragged_math_ops.reduce_sum(a)
     self.assertLen(a.consumers(), 1)
 
+  @parameterized.parameters([
+      # from_value_rowids
+      {'descr': 'bad rank for value_rowids',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [[1, 2], [3, 4]],
+       'value_rowids': [[1, 2], [3, 4]],
+       'nrows': 10},
+      {'descr': 'bad rank for nrows',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [1, 2, 3, 4],
+       'value_rowids': [1, 2, 3, 4],
+       'nrows': [10]},
+      {'descr': 'len(values) != len(value_rowids)',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [1, 2, 3, 4],
+       'value_rowids': [1, 2, 3, 4, 5],
+       'nrows': 10},
+      {'descr': 'negative value_rowid',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [1, 2, 3, 4],
+       'value_rowids': [-5, 2, 3, 4],
+       'nrows': 10},
+      {'descr': 'non-monotonic-increasing value_rowid',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [1, 2, 3, 4],
+       'value_rowids': [4, 3, 2, 1],
+       'nrows': 10},
+      {'descr': 'value_rowid > nrows',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': [1, 2, 3, 4],
+       'value_rowids': [1, 2, 3, 4],
+       'nrows': 2},
+      {'descr': 'bad rank for values',
+       'factory': RaggedTensor.from_value_rowids,
+       'values': 10,
+       'value_rowids': [1, 2, 3, 4],
+       'nrows': 10},
+
+      # from_row_splits
+      {'descr': 'bad rank for row_splits',
+       'factory': RaggedTensor.from_row_splits,
+       'values': [[1, 2], [3, 4]],
+       'row_splits': [[1, 2], [3, 4]]},
+      {'descr': 'row_splits[0] != 0',
+       'factory': RaggedTensor.from_row_splits,
+       'values': [1, 2, 3, 4],
+       'row_splits': [2, 3, 4]},
+      {'descr': 'non-monotonic-increasing row_splits',
+       'factory': RaggedTensor.from_row_splits,
+       'values': [1, 2, 3, 4],
+       'row_splits': [0, 3, 2, 4]},
+      {'descr': 'row_splits[0] != nvals',
+       'factory': RaggedTensor.from_row_splits,
+       'values': [1, 2, 3, 4],
+       'row_splits': [0, 2, 3, 5]},
+      {'descr': 'bad rank for values',
+       'factory': RaggedTensor.from_row_splits,
+       'values': 10,
+       'row_splits': [0, 1]},
+
+      # from_row_lengths
+      {'descr': 'bad rank for row_lengths',
+       'factory': RaggedTensor.from_row_lengths,
+       'values': [1, 2, 3, 4],
+       'row_lengths': [[1, 2], [1, 0]]},
+      {'descr': 'negatve row_lengths',
+       'factory': RaggedTensor.from_row_lengths,
+       'values': [1, 2, 3, 4],
+       'row_lengths': [3, -1, 2]},
+      {'descr': 'sum(row_lengths) != nvals',
+       'factory': RaggedTensor.from_row_lengths,
+       'values': [1, 2, 3, 4],
+       'row_lengths': [2, 4, 2, 8]},
+      {'descr': 'bad rank for values',
+       'factory': RaggedTensor.from_row_lengths,
+       'values': 10,
+       'row_lengths': [0, 1]},
+
+      # from_row_starts
+      {'descr': 'bad rank for row_starts',
+       'factory': RaggedTensor.from_row_starts,
+       'values': [[1, 2], [3, 4]],
+       'row_starts': [[1, 2], [3, 4]]},
+      {'descr': 'row_starts[0] != 0',
+       'factory': RaggedTensor.from_row_starts,
+       'values': [1, 2, 3, 4],
+       'row_starts': [2, 3, 4]},
+      {'descr': 'non-monotonic-increasing row_starts',
+       'factory': RaggedTensor.from_row_starts,
+       'values': [1, 2, 3, 4],
+       'row_starts': [0, 3, 2, 4]},
+      {'descr': 'row_starts[0] > nvals',
+       'factory': RaggedTensor.from_row_starts,
+       'values': [1, 2, 3, 4],
+       'row_starts': [0, 2, 3, 5]},
+      {'descr': 'bad rank for values',
+       'factory': RaggedTensor.from_row_starts,
+       'values': 10,
+       'row_starts': [0, 1]},
+
+      # from_row_limits
+      {'descr': 'bad rank for row_limits',
+       'factory': RaggedTensor.from_row_limits,
+       'values': [[1, 2], [3, 4]],
+       'row_limits': [[1, 2], [3, 4]]},
+      {'descr': 'row_limits[0] < 0',
+       'factory': RaggedTensor.from_row_limits,
+       'values': [1, 2, 3, 4],
+       'row_limits': [-1, 3, 4]},
+      {'descr': 'non-monotonic-increasing row_limits',
+       'factory': RaggedTensor.from_row_limits,
+       'values': [1, 2, 3, 4],
+       'row_limits': [0, 3, 2, 4]},
+      {'descr': 'row_limits[0] != nvals',
+       'factory': RaggedTensor.from_row_limits,
+       'values': [1, 2, 3, 4],
+       'row_limits': [0, 2, 3, 5]},
+      {'descr': 'bad rank for values',
+       'factory': RaggedTensor.from_row_limits,
+       'values': 10,
+       'row_limits': [0, 1]},
+  ])
+  def testFactoryValidation(self, descr, factory, **kwargs):
+    # When input tensors have shape information, some of these errors will be
+    # detected statically.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(factory(**kwargs))
+
+    # Remove shape information (by wrapping tensors in placeholders), and check
+    # that we detect the errors when the graph is run.
+    if not context.executing_eagerly():
+      def wrap_arg(v):
+        return array_ops.placeholder_with_default(
+            constant_op.constant(v, dtype=dtypes.int64),
+            tensor_shape.TensorShape(None))
+      kwargs = dict((k, wrap_arg(v)) for (k, v) in kwargs.items())
+
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(factory(**kwargs))
+
+#=============================================================================
+# RaggedTensor Variant conversion
+#=============================================================================
+
+  @parameterized.parameters(
+      {
+          'ragged_constant': [[1, 2], [3, 4, 5], [6], [], [7]],
+          'ragged_rank': 1
+      }, {
+          'ragged_constant': [[[1, 2]], [], [[3, 4]], []],
+          'ragged_rank': 1
+      }, {
+          'ragged_constant': [[[1], [2, 3, 4, 5, 6, 7]], [[]]],
+          'ragged_rank': 2
+      })
+  def testRaggedToVariant(self, ragged_constant, ragged_rank):
+    rt = ragged_factory_ops.constant(ragged_constant, ragged_rank=ragged_rank)
+    et = rt._to_variant()
+    self.assertEqual(et.shape.as_list(), [])
+    self.assertEqual(et.dtype, dtypes.variant)
+
+  @parameterized.parameters(
+      {
+          'ragged_constant': [[1, 2], [3, 4, 5], [6], [], [7]],
+          'ragged_rank': 1,
+          'num_batched_elems': 5
+      }, {
+          'ragged_constant': [[[1, 2]], [], [[3, 4]], []],
+          'ragged_rank': 1,
+          'num_batched_elems': 4
+      }, {
+          'ragged_constant': [[[1], [2, 3, 4, 5, 6, 7]], [[]]],
+          'ragged_rank': 2,
+          'num_batched_elems': 2
+      })
+  def testRaggedToBatchedVariant(self, ragged_constant, ragged_rank,
+                                 num_batched_elems):
+    rt = ragged_factory_ops.constant(ragged_constant, ragged_rank=ragged_rank)
+    et = rt._to_variant(batched_input=True)
+    self.assertEqual(et.shape.as_list(), [num_batched_elems])
+    self.assertEqual(et.dtype, dtypes.variant)
+
+  @parameterized.parameters(
+      # 2D test cases.
+      {
+          'ragged_constant': [[]],
+          'ragged_rank': 1,
+      },
+      {
+          'ragged_constant': [[1]],
+          'ragged_rank': 1,
+      },
+      {
+          'ragged_constant': [[1, 2]],
+          'ragged_rank': 1,
+      },
+      {
+          'ragged_constant': [[1], [2], [3]],
+          'ragged_rank': 1,
+      },
+      {
+          'ragged_constant': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'ragged_rank': 1,
+      },
+      {
+          'ragged_constant': [[1, 2], [3, 4, 5], [6], [], [7]],
+          'ragged_rank': 1,
+      },
+      # 3D test cases.
+      {
+          'ragged_constant': [[[]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1, 2]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1, 2], [3, 4]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1], [2]], [[3], [4]], [[5], [6]], [[7], [8]]],
+          'ragged_rank': 2,
+      },
+      {
+          'ragged_constant': [[[1, 2]], [], [[3, 4]], []],
+          'ragged_rank': 2,
+      },
+      # 4D test cases.
+      {
+          'ragged_constant': [[[[1, 2], [3, 4]]],
+                              [[[0, 0], [0, 0]], [[5, 6], [7, 8]]], []],
+          'ragged_rank': 3,
+      },
+      # dtype `string`.
+      {
+          'ragged_constant': [['a'], ['b'], ['c']],
+          'ragged_rank': 1,
+          'dtype': dtypes.string,
+      },
+      {
+          'ragged_constant': [[['a', 'b'], ['c', 'd']]],
+          'ragged_rank': 2,
+          'dtype': dtypes.string,
+      },
+      {
+          'ragged_constant': [[[['a', 'b'], ['c', 'd']]],
+                              [[['e', 'f'], ['g', 'h']], [['i', 'j'],
+                                                          ['k', 'l']]], []],
+          'ragged_rank': 3,
+          'dtype': dtypes.string,
+      })
+  def testVariantRoundTrip(self,
+                           ragged_constant,
+                           ragged_rank,
+                           dtype=dtypes.int32):
+    rt = ragged_factory_ops.constant(
+        ragged_constant, ragged_rank=ragged_rank, dtype=dtype)
+    et = rt._to_variant()
+    round_trip_rt = RaggedTensor._from_variant(
+        et, dtype, output_ragged_rank=ragged_rank)
+    self.assertRaggedEqual(rt, round_trip_rt)
+
+  def testBatchedVariantRoundTripInputRaggedRankInferred(self):
+    ragged_rank = 1
+    rt = ragged_factory_ops.constant(
+        [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]],
+        ragged_rank=ragged_rank)
+    batched_variant = rt._to_variant(batched_input=True)
+    nested_batched_variant = array_ops.reshape(batched_variant, [5, 2])
+    decoded_rt = RaggedTensor._from_variant(
+        nested_batched_variant,
+        dtype=dtypes.int32,
+        output_ragged_rank=ragged_rank + 1)
+    expected_rt = ragged_factory_ops.constant([[[0], [1]], [[2], [3]], [[4],
+                                                                        [5]],
+                                               [[6], [7]], [[8], [9]]])
+    self.assertRaggedEqual(decoded_rt, expected_rt)
+
+  def testBatchedVariantRoundTripWithInputRaggedRank(self):
+    ragged_rank = 1
+    rt = ragged_factory_ops.constant(
+        [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]],
+        ragged_rank=ragged_rank)
+    batched_variant = rt._to_variant(batched_input=True)
+    nested_batched_variant = array_ops.reshape(batched_variant, [5, 2])
+    decoded_rt = RaggedTensor._from_variant(
+        nested_batched_variant,
+        dtype=dtypes.int32,
+        output_ragged_rank=ragged_rank + 1,
+        input_ragged_rank=ragged_rank - 1)
+    expected_rt = ragged_factory_ops.constant([[[0], [1]], [[2], [3]], [[4],
+                                                                        [5]],
+                                               [[6], [7]], [[8], [9]]])
+    self.assertRaggedEqual(decoded_rt, expected_rt)
+
+  def testFromVariantInvalidParams(self):
+    rt = ragged_factory_ops.constant([[0], [1], [2], [3]])
+    batched_variant = rt._to_variant(batched_input=True)
+    nested_batched_variant = array_ops.reshape(batched_variant, [2, 2])
+    with self.assertRaisesRegexp(ValueError,
+                                 'output_ragged_rank must be equal to'):
+      RaggedTensor._from_variant(
+          nested_batched_variant,
+          dtype=dtypes.int32,
+          output_ragged_rank=1,
+          input_ragged_rank=1)
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
index c5e498e..4c1d5ff 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_value.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -38,13 +38,17 @@
 
     Args:
       values: A numpy array of any type and shape; or a RaggedTensorValue.
-      row_splits: A 1-D int64 numpy array.
+      row_splits: A 1-D int32 or int64 numpy array.
     """
     if not (isinstance(row_splits, (np.ndarray, np.generic)) and
-            row_splits.dtype == np.int64 and row_splits.ndim == 1):
-      raise TypeError("row_splits must be a 1D int64 numpy array")
+            row_splits.dtype in (np.int64, np.int32) and row_splits.ndim == 1):
+      raise TypeError("row_splits must be a 1D int32 or int64 numpy array")
     if not isinstance(values, (np.ndarray, np.generic, RaggedTensorValue)):
       raise TypeError("values must be a numpy array or a RaggedTensorValue")
+    if (isinstance(values, RaggedTensorValue) and
+        row_splits.dtype != values.row_splits.dtype):
+      raise ValueError("row_splits and values.row_splits must have "
+                       "the same dtype")
     self._values = values
     self._row_splits = row_splits
 
diff --git a/tensorflow/python/ops/ragged/ragged_test_util.py b/tensorflow/python/ops/ragged/ragged_test_util.py
index dcbab30..2c6b2e6 100644
--- a/tensorflow/python/ops/ragged/ragged_test_util.py
+++ b/tensorflow/python/ops/ragged/ragged_test_util.py
@@ -94,3 +94,13 @@
           self._eval_tensor(tensor.row_splits))
     else:
       return test_util.TensorFlowTestCase._eval_tensor(self, tensor)
+
+  @staticmethod
+  def _normalize_pylist(item):
+    """Convert all (possibly nested) np.arrays contained in item to list."""
+    # convert np.arrays in current level to list
+    if np.ndim(item) == 0:
+      return item
+    level = (x.tolist() if isinstance(x, np.ndarray) else x for x in item)
+    _normalize = RaggedTensorTestCase._normalize_pylist
+    return [_normalize(el) if np.ndim(el) != 0 else el for el in level]
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
index 92959a9..41da2a4 100644
--- a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -145,17 +145,18 @@
         array_ops.zeros([0], dtypes.int64), shape=None)
 
     bad_rt1 = ragged_tensor.RaggedTensor.from_row_splits(
-        row_splits=[2, 3], values=[1, 2, 3])
+        row_splits=[2, 3], values=[1, 2, 3], validate=False)
     bad_split0 = r'First value of ragged splits must be 0.*'
     with self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0):
       self.evaluate(bad_rt1.to_sparse())
 
     bad_rt2 = ragged_tensor.RaggedTensor.from_row_splits(
-        row_splits=[0, 5], values=empty_vector)
+        row_splits=[0, 5], values=empty_vector, validate=False)
     bad_rt3 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 1],
         values=ragged_tensor.RaggedTensor.from_row_splits(
-            row_splits=[0, 5], values=empty_vector))
+            row_splits=[0, 5], values=empty_vector, validate=False),
+        validate=False)
     split_mismatch1_error = r'Final value of ragged splits must match.*'
     for rt in [bad_rt2, bad_rt3]:
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
@@ -165,14 +166,15 @@
     bad_rt4 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 5],
         values=ragged_tensor.RaggedTensor.from_row_splits(
-            row_splits=[0], values=empty_vector))
+            row_splits=[0], values=empty_vector, validate=False),
+        validate=False)
     split_mismatch2_error = r'Final value of ragged splits must match.*'
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  split_mismatch2_error):
       self.evaluate(bad_rt4.to_sparse())
 
     bad_rt5 = ragged_tensor.RaggedTensor.from_row_splits(
-        row_splits=empty_vector, values=[])
+        row_splits=empty_vector, values=[], validate=False)
     empty_splits_error = (r'ragged splits may not be empty.*')
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  empty_splits_error):
@@ -191,7 +193,6 @@
 
     g1, g2 = gradients_impl.gradients(st.values,
                                       [rt1.flat_values, rt2.flat_values])
-    print(g1, g2)
     self.assertRaggedEqual(g1, [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
     self.assertRaggedEqual(g2, [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
 
diff --git a/tensorflow/python/ops/ragged/ragged_util.py b/tensorflow/python/ops/ragged/ragged_util.py
index a832f93..2c738e7 100644
--- a/tensorflow/python/ops/ragged/ragged_util.py
+++ b/tensorflow/python/ops/ragged/ragged_util.py
@@ -268,7 +268,7 @@
   else:
     # Optimization: we can just call repeat once, and then slice the result.
     repeated_splits = repeat(splits, repeats, axis=0)
-    n_splits = array_ops.shape(repeated_splits, out_type=dtypes.int64)[0]
+    n_splits = array_ops.shape(repeated_splits, out_type=repeats.dtype)[0]
     repeated_starts = repeated_splits[:n_splits - repeats]
     repeated_limits = repeated_splits[repeats:]
 
diff --git a/tensorflow/python/ops/ragged/ragged_where_op.py b/tensorflow/python/ops/ragged/ragged_where_op.py
index d60ee49..542f53a 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op.py
@@ -18,7 +18,6 @@
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -77,23 +76,23 @@
   #### Examples:
     ```python
     >>> # Coordinates where condition is true.
-    >>> condition = tf.ragged.constant_value(
+    >>> condition = tf.compat.v1.ragged.constant_value(
     ...     [[True, False, True], [False, True]])
     >>> ragged.where(condition)
     [[0, 0], [0, 2], [1, 1]]
 
     >>> # Elementwise selection between x and y, based on condition.
-    >>> condition = tf.ragged.constant_value(
+    >>> condition = tf.compat.v1.ragged.constant_value(
     ...     [[True, False, True], [False, True]])
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> x = tf.compat.v1.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.compat.v1.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
     >>> ragged.where(condition, x, y)
     [['A', 'b', 'C'], ['d', 'E']]
 
     >>> # Row selection between x and y, based on condition.
     >>> condition = [True, False]
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> x = tf.compat.v1.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.compat.v1.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
     >>> ragged.where(condition, x, y)
     [['A', 'B', 'C'], ['d', 'e']]
     ```
@@ -108,6 +107,7 @@
     else:
       x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
       y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
+      condition, x, y = ragged_tensor.match_row_splits_dtypes(condition, x, y)
       return _elementwise_where(condition, x, y)
 
 
@@ -126,10 +126,11 @@
   elif not condition_is_ragged:
     # Concatenate x and y, and then use `gather` to assemble the selected rows.
     condition.shape.assert_has_rank(1)
-    x_nrows = _nrows(x)
     x_and_y = ragged_concat_ops.concat([x, y], axis=0)
+    x_nrows = _nrows(x, out_type=x_and_y.row_splits.dtype)
+    y_nrows = _nrows(y, out_type=x_and_y.row_splits.dtype)
     indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(_nrows(y)))
+                              x_nrows + math_ops.range(y_nrows))
     return ragged_gather_ops.gather(x_and_y, indices)
 
   else:
@@ -145,6 +146,7 @@
   selected_coords = _coordinate_where(condition.values)
 
   # Convert the first index in each coordinate to a row index and column index.
+  condition = condition.with_row_splits_dtype(selected_coords.dtype)
   first_index = selected_coords[:, 0]
   selected_rows = array_ops.gather(condition.value_rowids(), first_index)
   selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
@@ -158,9 +160,8 @@
                           axis=1)
 
 
-def _nrows(rt_input, out_type=dtypes.int64, name=None):
+def _nrows(rt_input, out_type):
   if isinstance(rt_input, ragged_tensor.RaggedTensor):
-    return rt_input.nrows(out_type=out_type, name=name)
+    return rt_input.nrows(out_type=out_type)
   else:
-    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
-      return array_ops.shape(rt_input, out_type=out_type)[0]
+    return array_ops.shape(rt_input, out_type=out_type)[0]
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index e76a040..d54e2c7 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -155,12 +155,19 @@
       #=========================================================================
       # Elementwise row-selection mode
       #=========================================================================
-      dict(  # shape=[D1, D2]
+      dict(  # x.shape=[D1, D2], y.shape=[D1, D2]
           condition=[True, False, True],
           x=[['A', 'B'], ['C', 'D'], ['E', 'F']],
           y=[['a', 'b'], ['c', 'd'], ['e', 'f']],
           expected=[[b'A', b'B'], [b'c', b'd'], [b'E', b'F']]),
-      dict(  # shape=[D1, (D2)]
+      dict(  # x.shape=[D1, D2], y.shape=[D1, (D2)]
+          condition=[True, False, True],
+          x=[['A', 'B'], ['C', 'D'], ['E', 'F']],
+          y=ragged_factory_ops.constant_value(
+              [['a', 'b'], ['c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'B'], [b'c'], [b'E', b'F']])),
+      dict(  # x.shape=[D1, (D2)], y.shape=[D1, (D2)]
           condition=[True, False, True],
           x=ragged_factory_ops.constant_value(
               [['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
index 31e26e7..69791fb 100644
--- a/tensorflow/python/ops/ragged/segment_id_ops.py
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -31,7 +31,7 @@
 # For background on "segments" and "segment ids", see:
 # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.row_splits_to_segment_ids")
-def row_splits_to_segment_ids(splits, name=None):
+def row_splits_to_segment_ids(splits, name=None, out_type=None):
   """Generates the segmentation corresponding to a RaggedTensor `row_splits`.
 
   Returns an integer vector `segment_ids`, where `segment_ids[i] == j` if
@@ -43,22 +43,32 @@
   ```
 
   Args:
-    splits: A sorted 1-D int64 Tensor.  `splits[0]` must be zero.
+    splits: A sorted 1-D integer Tensor.  `splits[0]` must be zero.
     name: A name prefix for the returned tensor (optional).
+    out_type: The dtype for the return value.  Defaults to `splits.dtype`,
+      or `tf.int64` if `splits` does not have a dtype.
 
   Returns:
-    A sorted 1-D int64 Tensor, with `shape=[splits[-1]]`
+    A sorted 1-D integer Tensor, with `shape=[splits[-1]]`
 
   Raises:
     ValueError: If `splits` is invalid.
   """
   with ops.name_scope(name, "RaggedSplitsToSegmentIds", [splits]) as name:
-    splits = ops.convert_to_tensor(splits, dtype=dtypes.int64, name="splits")
+    splits = ops.convert_to_tensor(
+        splits, name="splits",
+        preferred_dtype=dtypes.int64)
+    if splits.dtype not in (dtypes.int32, dtypes.int64):
+      raise ValueError("splits must have dtype int32 or int64")
     splits.shape.assert_has_rank(1)
     if tensor_shape.dimension_value(splits.shape[0]) == 0:
       raise ValueError("Invalid row_splits: []")
+    if out_type is None:
+      out_type = splits.dtype
+    else:
+      out_type = dtypes.as_dtype(out_type)
     row_lengths = splits[1:] - splits[:-1]
-    nrows = array_ops.shape(splits, out_type=dtypes.int64)[-1] - 1
+    nrows = array_ops.shape(splits, out_type=out_type)[-1] - 1
     indices = math_ops.range(nrows)
     return ragged_util.repeat(indices, repeats=row_lengths, axis=0)
 
@@ -66,7 +76,8 @@
 # For background on "segments" and "segment ids", see:
 # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.segment_ids_to_row_splits")
-def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
+def segment_ids_to_row_splits(segment_ids, num_segments=None,
+                              out_type=None, name=None):
   """Generates the RaggedTensor `row_splits` corresponding to a segmentation.
 
   Returns an integer vector `splits`, where `splits[0] = 0` and
@@ -81,24 +92,39 @@
     segment_ids: A 1-D integer Tensor.
     num_segments: A scalar integer indicating the number of segments.  Defaults
       to `max(segment_ids) + 1` (or zero if `segment_ids` is empty).
+    out_type: The dtype for the return value.  Defaults to `segment_ids.dtype`,
+      else `num_segments.dtype`, or `tf.int64` if neither is a `Tensor`.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
-    A sorted 1-D int64 Tensor, with `shape=[num_segments + 1]`.
+    A sorted 1-D integer Tensor, with `shape=[num_segments + 1]`.
   """
+  if out_type is None:
+    if isinstance(segment_ids, ops.Tensor):
+      out_type = segment_ids.dtype
+    elif isinstance(num_segments, ops.Tensor):
+      out_type = num_segments.dtype
+    else:
+      out_type = dtypes.int64
+  else:
+    out_type = dtypes.as_dtype(out_type)
   with ops.name_scope(name, "SegmentIdsToRaggedSplits", [segment_ids]) as name:
-    segment_ids = ragged_util.convert_to_int_tensor(segment_ids, "segment_ids")
+    # Note: we cast int64 tensors to int32, since bincount currently only
+    # supports int32 inputs.
+    segment_ids = ragged_util.convert_to_int_tensor(segment_ids, "segment_ids",
+                                                    dtype=dtypes.int32)
     segment_ids.shape.assert_has_rank(1)
     if num_segments is not None:
       num_segments = ragged_util.convert_to_int_tensor(num_segments,
-                                                       "num_segments")
+                                                       "num_segments",
+                                                       dtype=dtypes.int32)
       num_segments.shape.assert_has_rank(0)
 
     row_lengths = math_ops.bincount(
         segment_ids,
         minlength=num_segments,
         maxlength=num_segments,
-        dtype=dtypes.int64)
+        dtype=out_type)
     splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
 
     # Update shape information, if possible.
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index cd332ed..711bb8e 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -63,7 +63,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -112,7 +112,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -162,7 +162,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -217,7 +217,7 @@
     dtype: The type of the output: `float16`, `float32`, `float64`, `int32`,
       or `int64`.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See `tf.set_random_seed`
+      See `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -270,7 +270,7 @@
     value: A Tensor to be shuffled.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -299,7 +299,7 @@
     value: Input tensor to crop.
     size: 1-D tensor with size the rank of `value`.
     seed: Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: A name for this operation (optional).
 
@@ -338,7 +338,7 @@
   ```python
   # samples has shape [1, 5], where each value is either 0 or 1 with equal
   # probability.
-  samples = tf.multinomial(tf.log([[10., 10.]]), 5)
+  samples = tf.random.categorical(tf.math.log([[10., 10.]]), 5)
   ```
 
   Args:
@@ -346,7 +346,7 @@
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See `tf.set_random_seed` for behavior.
+      See `tf.compat.v1.set_random_seed` for behavior.
     name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
 
@@ -366,7 +366,7 @@
   ```python
   # samples has shape [1, 5], where each value is either 0 or 1 with equal
   # probability.
-  samples = tf.random.categorical(tf.log([[10., 10.]]), 5)
+  samples = tf.random.categorical(tf.math.log([[10., 10.]]), 5)
   ```
 
   Args:
@@ -375,7 +375,7 @@
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     dtype: integer type to use for the output. Defaults to int64.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See `tf.set_random_seed` for behavior.
+      See `tf.compat.v1.set_random_seed` for behavior.
     name: Optional name for the operation.
 
   Returns:
@@ -386,7 +386,7 @@
 
 
 def multinomial_categorical_impl(logits, num_samples, dtype, seed):
-  """Implementation for random.multinomial (v1) and random.categorical (v2)."""
+  """Implements random.multinomial (v1) and random.categorical (v2)."""
   logits = ops.convert_to_tensor(logits, name="logits")
   seed1, seed2 = random_seed.get_seed(seed)
   return gen_random_ops.multinomial(
@@ -425,17 +425,17 @@
   Example:
 
   ```python
-  samples = tf.random_gamma([10], [0.5, 1.5])
+  samples = tf.random.gamma([10], [0.5, 1.5])
   # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
   # the samples drawn from each distribution
 
-  samples = tf.random_gamma([7, 5], [0.5, 1.5])
+  samples = tf.random.gamma([7, 5], [0.5, 1.5])
   # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
   # represents the 7x5 samples drawn from each of the two distributions
 
   alpha = tf.constant([[1.],[3.],[5.]])
   beta = tf.constant([[3., 4.]])
-  samples = tf.random_gamma([30], alpha=alpha, beta=beta)
+  samples = tf.random.gamma([30], alpha=alpha, beta=beta)
   # samples has shape [30, 3, 2], with 30 samples each of 3x2 distributions.
 
   loss = tf.reduce_mean(tf.square(samples))
@@ -458,7 +458,7 @@
       `float64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: Optional name for the operation.
 
@@ -490,11 +490,11 @@
   Example:
 
   ```python
-  samples = tf.random_poisson([0.5, 1.5], [10])
+  samples = tf.random.poisson([0.5, 1.5], [10])
   # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
   # the samples drawn from each distribution
 
-  samples = tf.random_poisson([12.2, 3.3], [7, 5])
+  samples = tf.random.poisson([12.2, 3.3], [7, 5])
   # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
   # represents the 7x5 samples drawn from each of the two distributions
   ```
@@ -509,7 +509,7 @@
       `int64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: Optional name for the operation.
 
@@ -529,11 +529,11 @@
   Example:
 
   ```python
-  samples = tf.random_poisson([10], [0.5, 1.5])
+  samples = tf.random.poisson([10], [0.5, 1.5])
   # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
   # the samples drawn from each distribution
 
-  samples = tf.random_poisson([7, 5], [12.2, 3.3])
+  samples = tf.random.poisson([7, 5], [12.2, 3.3])
   # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
   # represents the 7x5 samples drawn from each of the two distributions
   ```
@@ -548,7 +548,7 @@
       `int64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
-      `tf.set_random_seed`
+      `tf.compat.v1.set_random_seed`
       for behavior.
     name: Optional name for the operation.
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index fbff49c..fdbc5a6 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -128,6 +128,51 @@
   return variable_handle_data
 
 
+def variable_handle_from_shape_and_dtype(
+    shape, dtype, shared_name, name, graph_mode, extra_handle_data=None):
+  """Create a new variable handle, optionally copying in `extra_handle_data`."""
+  container = ops.get_default_graph()._container  # pylint: disable=protected-access
+  if container is None:
+    container = ""
+  handle = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
+                                                   shared_name=shared_name,
+                                                   name=name,
+                                                   container=container)
+  if extra_handle_data is None:
+    extra_handle_data = handle
+  if graph_mode:
+    full_handle_data = _combine_handle_data(handle, extra_handle_data)
+    _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+    return handle
+  else:
+    # We do not want two distinct ResourceVariable objects for the same
+    # underlying resource in the runtime.
+    # When in eager mode, explicitly ensure so here. When in graph mode, it's
+    # ensured by always generating different variable names.
+    exists = gen_resource_variable_ops.var_is_initialized_op(handle)
+    if exists:
+      raise ValueError("variable object with name '%s' already created. Use "
+                       "get_variable() if reuse is desired." %
+                       shared_name)
+    with context.graph_mode(), ops.Graph().as_default() as graph:
+      h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
+                                                  shared_name=shared_name,
+                                                  name=name,
+                                                  container=container)
+
+      # Tensor._handle_data contains information for the shape-inference code to
+      # know the shape and dtype of the variable pointed to by a handle. Since
+      # shape inference doesn't run in eager mode we copy this data here for
+      # when the handle is captured by an eager mode function.
+      # pylint: disable=protected-access
+      full_handle_data = _combine_handle_data(h, extra_handle_data)
+      _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+      # pylint: enable=protected-access
+    # Clean up op->graph->op reference cycles.
+    ops.dismantle_graph(graph)
+    return handle
+
+
 def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode):
   """Creates a variable handle with information to do shape inference.
 
@@ -170,45 +215,8 @@
   """
   shape = initial_value.get_shape()
   dtype = initial_value.dtype.base_dtype
-  container = ops.get_default_graph()._container  # pylint: disable=protected-access
-  if container is None:
-    container = ""
-  handle = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
-                                                   shared_name=shared_name,
-                                                   name=name,
-                                                   container=container)
-
-  if graph_mode:
-    full_handle_data = _combine_handle_data(handle, initial_value)
-    _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
-    return handle
-  else:
-    # We do not want two distinct ResourceVariable objects for the same
-    # underlying resource in the runtime.
-    # When in eager mode, explicitly ensure so here. When in graph mode, it's
-    # ensured by always generating different variable names.
-    exists = gen_resource_variable_ops.var_is_initialized_op(handle)
-    if exists:
-      raise ValueError("variable object with name '%s' already created. Use "
-                       "get_variable() if reuse is desired." %
-                       shared_name)
-    with context.graph_mode(), ops.Graph().as_default() as graph:
-      h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
-                                                  shared_name=shared_name,
-                                                  name=name,
-                                                  container=container)
-
-      # Tensor._handle_data contains information for the shape-inference code to
-      # know the shape and dtype of the variable pointed to by a handle. Since
-      # shape inference doesn't run in eager mode we copy this data here for
-      # when the handle is captured by an eager mode function.
-      # pylint: disable=protected-access
-      full_handle_data = _combine_handle_data(h, initial_value)
-      _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
-      # pylint: enable=protected-access
-    # Clean up op->graph->op reference cycles.
-    ops.dismantle_graph(graph)
-    return handle
+  return variable_handle_from_shape_and_dtype(
+      shape, dtype, shared_name, name, graph_mode, initial_value)
 
 
 @contextlib.contextmanager
@@ -344,7 +352,7 @@
   with tf.control_dependencies([other_assign]):
     # Will print 2.0 because the value was read before other_assign ran. If
     # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
-    tf.Print(b, [b]).eval()
+    tf.compat.v1.Print(b, [b]).eval()
   ```
   """
 
@@ -1268,7 +1276,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = ref.scatter_nd_sub(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -1321,7 +1329,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         add = ref.scatter_nd_add(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(add)
     ```
 
@@ -1374,7 +1382,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = ref.scatter_nd_update(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -1635,21 +1643,119 @@
       var, "_should_act_as_resource_variable")
 
 
+# TODO(allenl): Rather than UninitializedVariable inheriting from
+# ResourceVariable, ResourceVariable should inherit from UninitializedVariable
+# and add its initialization logic.
+class UninitializedVariable(ResourceVariable):
+  """A variable with no initializer."""
+
+  def __init__(self,  # pylint: disable=super-init-not-called
+               trainable=None,
+               caching_device=None,
+               name=None,
+               shape=None,
+               dtype=None,
+               constraint=None,
+               synchronization=None,
+               aggregation=None,
+               extra_handle_data=None,
+               **unused_kwargs):
+    """Creates the variable handle.
+
+    Args:
+      trainable: If `True`, GradientTapes automatically watch uses of this
+        Variable.
+      caching_device: Optional device string or function describing where the
+        Variable should be cached for reading.  Defaults to the Variable's
+        device.  If not `None`, caches on another device.  Typical use is to
+        cache on the device where the Ops using the Variable reside, to
+        deduplicate copying through `Switch` and other conditional statements.
+      name: Optional name for the variable. Defaults to `'Variable'` and gets
+        uniquified automatically.
+      shape: The variable's shape.
+      dtype: The variable's dtype.
+      constraint: An optional projection function to be applied to the variable
+        after being updated by an `Optimizer` (e.g. used to implement norm
+        constraints or value constraints for layer weights). The function must
+        take as input the unprojected Tensor representing the value of the
+        variable and return the Tensor for the projected value
+        (which must have the same shape). Constraints are not safe to
+        use when doing asynchronous distributed training.
+      synchronization: Indicates when a distributed variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
+      extra_handle_data: Optional, another resource handle or Tensor with handle
+        data to merge with `shape` and `dtype`.
+    """
+    with ops.init_scope():
+      self._in_graph_mode = not context.executing_eagerly()
+    synchronization, aggregation, trainable = (
+        variables.validate_synchronization_aggregation_trainable(
+            synchronization, aggregation, trainable, name))
+    self._trainable = trainable
+    self._synchronization = synchronization
+    self._aggregation = aggregation
+    self._save_slice_info = None
+    self._initial_value = None
+    self._initializer_op = None
+    self._is_initialized_op = None
+    self._graph_element = None
+    self._cached_value = None
+    # Store the graph key so optimizers know how to only retrieve variables from
+    # this graph. Guaranteed to be the same as the eager graph_key.
+    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    self._shape = shape
+    self._dtype = dtype
+    with ops.init_scope():
+      handle_name = ops.name_from_scope_name(name)
+      unique_id = "%s_%d" % (handle_name, ops.uid())
+      shared_name = context.shared_name(unique_id)
+      self._handle = variable_handle_from_shape_and_dtype(
+          shape=shape, dtype=dtype, shared_name=shared_name,
+          name=name, graph_mode=self._in_graph_mode,
+          extra_handle_data=extra_handle_data)
+      if self._in_graph_mode:
+        with ops.name_scope("Read"), ops.colocate_with(self._handle):
+          # Manually assign reads to the handle's device to avoid log
+          # messages.
+          with ops.device(self._handle.device):
+            value = self._read_variable_op()
+          self._graph_element = value
+        ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self)
+    self._unique_id = unique_id
+    self._handle_name = handle_name + ":0"
+    self._constraint = constraint
+    # After the handle has been created, set up a way to clean it up when
+    # executing eagerly. We'll hold the only reference to the deleter, so that
+    # when this object is garbage collected the deleter will be too. This
+    # means ResourceVariables can be part of reference cycles without those
+    # cycles being uncollectable.
+    if not self._in_graph_mode:
+      self._handle_deleter = EagerResourceDeleter(
+          handle=self._handle, handle_device=self._handle.device)
+    self._cached_shape_as_list = None
+
+
 def copy_to_graph_uninitialized(var):
   """Copies an existing variable to a new graph, with no initializer."""
   # Like ResourceVariable.__deepcopy__, but does not set an initializer on the
   # new variable.
   # pylint: disable=protected-access
-  new_variable = ResourceVariable(
-      initial_value=array_ops.placeholder(
-          shape=var.shape, dtype=var.dtype,
-          name="unused_initial_variable_value"),
+  new_variable = UninitializedVariable(
       trainable=var.trainable,
       constraint=var._constraint,
+      shape=var.shape,
       dtype=var.dtype,
       name=var._shared_name,
       synchronization=var.synchronization,
-      aggregation=var.aggregation)
+      aggregation=var.aggregation,
+      extra_handle_data=var.handle)
   new_variable._maybe_initialize_trackable()
   # pylint: enable=protected-access
   return new_variable
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index d882d6d..adda1f5 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """RNN helpers for TensorFlow models."""
 from __future__ import absolute_import
 from __future__ import division
@@ -36,7 +35,6 @@
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
-
 # pylint: disable=protected-access
 _concat = rnn_cell_impl._concat
 # pylint: enable=protected-access
@@ -60,12 +58,11 @@
 
   x_rank = array_ops.rank(x)
   x_t = array_ops.transpose(
-      x, array_ops.concat(
-          ([1, 0], math_ops.range(2, x_rank)), axis=0))
+      x, array_ops.concat(([1, 0], math_ops.range(2, x_rank)), axis=0))
   x_t.set_shape(
-      tensor_shape.TensorShape([
-          x_static_shape.dims[1].value, x_static_shape.dims[0].value
-      ]).concatenate(x_static_shape[2:]))
+      tensor_shape.TensorShape(
+          [x_static_shape.dims[1].value,
+           x_static_shape.dims[0].value]).concatenate(x_static_shape[2:]))
   return x_t
 
 
@@ -73,9 +70,8 @@
   """Get static input batch size if available, with fallback to the dynamic one.
 
   Args:
-    flat_input: An iterable of time major input Tensors of shape
-      `[max_time, batch_size, ...]`.
-    All inputs should have compatible batch sizes.
+    flat_input: An iterable of time major input Tensors of shape `[max_time,
+      batch_size, ...]`. All inputs should have compatible batch sizes.
 
   Returns:
     The batch size in Python integer if available, or a scalar Tensor otherwise.
@@ -88,8 +84,8 @@
     if shape.rank is None:
       continue
     if shape.rank < 2:
-      raise ValueError(
-          "Expected input tensor %s to have rank at least 2" % input_)
+      raise ValueError("Expected input tensor %s to have rank at least 2" %
+                       input_)
     batch_size = shape.dims[1].value
     if batch_size is not None:
       return batch_size
@@ -156,6 +152,7 @@
   Args:
     rnn_cell: An RNN cell instance that either follow the Keras interface or TF
       RNN interface.
+
   Returns:
     Boolean, whether the cell is an Keras RNN cell.
   """
@@ -163,15 +160,21 @@
   # library like Deepmind that didn't inherit tf.nn.rnn_cell.RNNCell.
   # Keras cells never had zero_state method, which was from the original
   # interface from TF RNN cell.
-  return (not isinstance(rnn_cell, rnn_cell_impl.RNNCell)
-          and isinstance(rnn_cell, base_layer.Layer)
-          and getattr(rnn_cell, "zero_state", None) is None)
+  return (not isinstance(rnn_cell, rnn_cell_impl.RNNCell) and
+          isinstance(rnn_cell, base_layer.Layer) and
+          getattr(rnn_cell, "zero_state", None) is None)
 
 
 # pylint: disable=unused-argument
-def _rnn_step(
-    time, sequence_length, min_sequence_length, max_sequence_length,
-    zero_output, state, call_cell, state_size, skip_conditionals=False):
+def _rnn_step(time,
+              sequence_length,
+              min_sequence_length,
+              max_sequence_length,
+              zero_output,
+              state,
+              call_cell,
+              state_size,
+              skip_conditionals=False):
   """Calculate one step of a dynamic RNN minibatch.
 
   Returns an (output, state) pair conditioned on `sequence_length`.
@@ -246,10 +249,12 @@
     # a calculated state & output.
     flat_new_output = [
         _copy_one_through(zero_output, new_output)
-        for zero_output, new_output in zip(flat_zero_output, flat_new_output)]
+        for zero_output, new_output in zip(flat_zero_output, flat_new_output)
+    ]
     flat_new_state = [
         _copy_one_through(state, new_state)
-        for state, new_state in zip(flat_state, flat_new_state)]
+        for state, new_state in zip(flat_state, flat_new_state)
+    ]
     return flat_new_output + flat_new_state
 
   def _maybe_copy_some_through():
@@ -263,7 +268,8 @@
     flat_new_output = nest.flatten(new_output)
     return control_flow_ops.cond(
         # if t < min_seq_len: calculate and return everything
-        time < min_sequence_length, lambda: flat_new_output + flat_new_state,
+        time < min_sequence_length,
+        lambda: flat_new_output + flat_new_state,
         # else copy some of it through
         lambda: _copy_some_through(flat_new_output, flat_new_state))
 
@@ -284,7 +290,8 @@
     empty_update = lambda: flat_zero_output + flat_state
     final_output_and_state = control_flow_ops.cond(
         # if t >= max_seq_len: copy all state through, output zeros
-        time >= max_sequence_length, empty_update,
+        time >= max_sequence_length,
+        empty_update,
         # otherwise calculation is required: copy some or all of it through
         _maybe_copy_some_through)
 
@@ -313,10 +320,9 @@
 
   Args:
     input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features)
-               or nested tuples of tensors.
+      or nested tuples of tensors.
     lengths:   A `Tensor` of dimension batch_size, containing lengths for each
-               sequence in the batch. If "None" is specified, simply reverses
-               the list.
+      sequence in the batch. If "None" is specified, simply reverses the list.
 
   Returns:
     time-reversed sequence
@@ -328,8 +334,7 @@
 
   flat_results = [[] for _ in range(len(input_seq))]
   for sequence in zip(*flat_input_seq):
-    input_shape = tensor_shape.unknown_shape(
-        rank=sequence[0].get_shape().rank)
+    input_shape = tensor_shape.unknown_shape(rank=sequence[0].get_shape().rank)
     for input_ in sequence:
       input_shape.merge_with(input_.get_shape())
       input_.set_shape(input_shape)
@@ -345,8 +350,10 @@
       r.set_shape(input_shape)
       flat_result.append(r)
 
-  results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result)
-             for input_, flat_result in zip(input_seq, flat_results)]
+  results = [
+      nest.pack_sequence_as(structure=input_, flat_sequence=flat_result)
+      for input_, flat_result in zip(input_seq, flat_results)
+  ]
   return results
 
 
@@ -354,10 +361,17 @@
                         "keras.layers.RNN(cell))`, which is equivalent to "
                         "this API")
 @tf_export(v1=["nn.bidirectional_dynamic_rnn"])
-def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
-                              initial_state_fw=None, initial_state_bw=None,
-                              dtype=None, parallel_iterations=None,
-                              swap_memory=False, time_major=False, scope=None):
+def bidirectional_dynamic_rnn(cell_fw,
+                              cell_bw,
+                              inputs,
+                              sequence_length=None,
+                              initial_state_fw=None,
+                              initial_state_bw=None,
+                              dtype=None,
+                              parallel_iterations=None,
+                              swap_memory=False,
+                              time_major=False,
+                              scope=None):
   """Creates a dynamic version of bidirectional recurrent neural network.
 
   Takes input and builds independent forward and backward RNNs. The input_size
@@ -373,38 +387,38 @@
     inputs: The RNN inputs.
       If time_major == False (default), this must be a tensor of shape:
         `[batch_size, max_time, ...]`, or a nested tuple of such elements.
-      If time_major == True, this must be a tensor of shape:
-        `[max_time, batch_size, ...]`, or a nested tuple of such elements.
+      If time_major == True, this must be a tensor of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements.
     sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
-      containing the actual lengths for each of the sequences in the batch.
-      If not provided, all batch entries are assumed to be full sequences; and
-      time reversal is applied from time `0` to `max_time` for each sequence.
-    initial_state_fw: (optional) An initial state for the forward RNN.
-      This must be a tensor of appropriate type and shape
-      `[batch_size, cell_fw.state_size]`.
-      If `cell_fw.state_size` is a tuple, this should be a tuple of
-      tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
-    initial_state_bw: (optional) Same as for `initial_state_fw`, but using
-      the corresponding properties of `cell_bw`.
+      containing the actual lengths for each of the sequences in the batch. If
+      not provided, all batch entries are assumed to be full sequences; and time
+      reversal is applied from time `0` to `max_time` for each sequence.
+    initial_state_fw: (optional) An initial state for the forward RNN. This must
+      be a tensor of appropriate type and shape `[batch_size,
+      cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a
+      tuple of tensors having shapes `[batch_size, s] for s in
+      cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using the
+      corresponding properties of `cell_bw`.
     dtype: (optional) The data type for the initial states and expected output.
       Required if initial_states are not provided or RNN states have a
       heterogeneous dtype.
     parallel_iterations: (Default: 32).  The number of iterations to run in
-      parallel.  Those operations which do not have any temporal dependency
-      and can be run in parallel, will be.  This parameter trades off
-      time for space.  Values >> 1 use more memory but take less time,
-      while smaller values use less memory but computations take longer.
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
     swap_memory: Transparently swap the tensors produced in forward inference
-      but needed for back prop from GPU to CPU.  This allows training RNNs
-      which would typically not fit on a single GPU, with very minimal (or no)
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
       performance penalty.
-    time_major: The shape format of the `inputs` and `outputs` Tensors.
-      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
-      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
-      Using `time_major = True` is a bit more efficient because it avoids
-      transposes at the beginning and end of the RNN calculation.  However,
-      most TensorFlow data is batch-major, so by default this function
-      accepts input and emits output in batch-major form.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
     scope: VariableScope for the created subgraph; defaults to
       "bidirectional_rnn"
 
@@ -439,10 +453,15 @@
     # Forward direction
     with vs.variable_scope("fw") as fw_scope:
       output_fw, output_state_fw = dynamic_rnn(
-          cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
-          initial_state=initial_state_fw, dtype=dtype,
-          parallel_iterations=parallel_iterations, swap_memory=swap_memory,
-          time_major=time_major, scope=fw_scope)
+          cell=cell_fw,
+          inputs=inputs,
+          sequence_length=sequence_length,
+          initial_state=initial_state_fw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=fw_scope)
 
     # Backward direction
     if not time_major:
@@ -455,8 +474,10 @@
     def _reverse(input_, seq_lengths, seq_axis, batch_axis):
       if seq_lengths is not None:
         return array_ops.reverse_sequence(
-            input=input_, seq_lengths=seq_lengths,
-            seq_axis=seq_axis, batch_axis=batch_axis)
+            input=input_,
+            seq_lengths=seq_lengths,
+            seq_axis=seq_axis,
+            batch_axis=batch_axis)
       else:
         return array_ops.reverse(input_, axis=[seq_axis])
 
@@ -471,14 +492,21 @@
 
       inputs_reverse = nest.map_structure(_map_reverse, inputs)
       tmp, output_state_bw = dynamic_rnn(
-          cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
-          initial_state=initial_state_bw, dtype=dtype,
-          parallel_iterations=parallel_iterations, swap_memory=swap_memory,
-          time_major=time_major, scope=bw_scope)
+          cell=cell_bw,
+          inputs=inputs_reverse,
+          sequence_length=sequence_length,
+          initial_state=initial_state_bw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=bw_scope)
 
   output_bw = _reverse(
-      tmp, seq_lengths=sequence_length,
-      seq_axis=time_axis, batch_axis=batch_axis)
+      tmp,
+      seq_lengths=sequence_length,
+      seq_axis=time_axis,
+      batch_axis=batch_axis)
 
   outputs = (output_fw, output_bw)
   output_states = (output_state_fw, output_state_bw)
@@ -490,9 +518,15 @@
     None,
     "Please use `keras.layers.RNN(cell)`, which is equivalent to this API")
 @tf_export(v1=["nn.dynamic_rnn"])
-def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
-                dtype=None, parallel_iterations=None, swap_memory=False,
-                time_major=False, scope=None):
+def dynamic_rnn(cell,
+                inputs,
+                sequence_length=None,
+                initial_state=None,
+                dtype=None,
+                parallel_iterations=None,
+                swap_memory=False,
+                time_major=False,
+                scope=None):
   """Creates a recurrent neural network specified by RNNCell `cell`.
 
   Performs fully dynamic unrolling of `inputs`.
@@ -501,7 +535,7 @@
 
   ```python
   # create a BasicRNNCell
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  rnn_cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(hidden_size)
 
   # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
 
@@ -509,22 +543,22 @@
   initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
 
   # 'state' is a tensor of shape [batch_size, cell_state_size]
-  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
+  outputs, state = tf.compat.v1.nn.dynamic_rnn(rnn_cell, input_data,
                                      initial_state=initial_state,
                                      dtype=tf.float32)
   ```
 
   ```python
   # create 2 LSTMCells
-  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
+  rnn_layers = [tf.compat.v1.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
 
   # create a RNN cell composed sequentially of a number of RNNCells
-  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
+  multi_rnn_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(rnn_layers)
 
   # 'outputs' is a tensor of shape [batch_size, max_time, 256]
   # 'state' is a N-tuple where N is the number of LSTMCells containing a
-  # tf.contrib.rnn.LSTMStateTuple for each cell
-  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
+  # tf.compat.v1.nn.rnn_cell.LSTMStateTuple for each cell
+  outputs, state = tf.compat.v1.nn.dynamic_rnn(cell=multi_rnn_cell,
                                      inputs=data,
                                      dtype=tf.float32)
   ```
@@ -534,46 +568,44 @@
     cell: An instance of RNNCell.
     inputs: The RNN inputs.
       If `time_major == False` (default), this must be a `Tensor` of shape:
-        `[batch_size, max_time, ...]`, or a nested tuple of such
-        elements.
-      If `time_major == True`, this must be a `Tensor` of shape:
-        `[max_time, batch_size, ...]`, or a nested tuple of such
-        elements.
-      This may also be a (possibly nested) tuple of Tensors satisfying
-      this property.  The first two dimensions must match across all the inputs,
-      but otherwise the ranks and other shape components may differ.
-      In this case, input to `cell` at each time-step will replicate the
-      structure of these tuples, except for the time dimension (from which the
-      time is taken).
-      The input to `cell` at each time step will be a `Tensor` or (possibly
-      nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
-    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
-      Used to copy-through state and zero-out outputs when past a batch
-      element's sequence length.  So it's more for performance than correctness.
-    initial_state: (optional) An initial state for the RNN.
-      If `cell.state_size` is an integer, this must be
-      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
-      If `cell.state_size` is a tuple, this should be a tuple of
-      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If `time_major == True`, this must be a `Tensor` of shape:
+        `[max_time, batch_size, ...]`, or a nested tuple of such elements.
+      This may also be a (possibly nested) tuple of Tensors satisfying this
+      property.  The first two dimensions must match across all the inputs, but
+      otherwise the ranks and other shape components may differ. In this case,
+      input to `cell` at each time-step will replicate the structure of these
+      tuples, except for the time dimension (from which the time is taken). The
+      input to `cell` at each time step will be a `Tensor` or (possibly nested)
+      tuple of Tensors each with dimensions `[batch_size, ...]`.
+    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. Used
+      to copy-through state and zero-out outputs when past a batch element's
+      sequence length.  This parameter enables users to extract the last valid
+      state and properly padded outputs, so it is provided for correctness.
+    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
+      is an integer, this must be a `Tensor` of appropriate type and shape
+      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
+      should be a tuple of tensors having shapes `[batch_size, s] for s in
+      cell.state_size`.
     dtype: (optional) The data type for the initial state and expected output.
       Required if initial_state is not provided or RNN state has a heterogeneous
       dtype.
     parallel_iterations: (Default: 32).  The number of iterations to run in
-      parallel.  Those operations which do not have any temporal dependency
-      and can be run in parallel, will be.  This parameter trades off
-      time for space.  Values >> 1 use more memory but take less time,
-      while smaller values use less memory but computations take longer.
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
     swap_memory: Transparently swap the tensors produced in forward inference
-      but needed for back prop from GPU to CPU.  This allows training RNNs
-      which would typically not fit on a single GPU, with very minimal (or no)
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
       performance penalty.
-    time_major: The shape format of the `inputs` and `outputs` Tensors.
-      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
-      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
-      Using `time_major = True` is a bit more efficient because it avoids
-      transposes at the beginning and end of the RNN calculation.  However,
-      most TensorFlow data is batch-major, so by default this function
-      accepts input and emits output in batch-major form.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
     scope: VariableScope for the created subgraph; defaults to "rnn".
 
   Returns:
@@ -631,7 +663,8 @@
             "sequence_length must be a vector of length batch_size, "
             "but saw shape: %s" % sequence_length.get_shape())
       sequence_length = array_ops.identity(  # Just to find it in the graph.
-          sequence_length, name="sequence_length")
+          sequence_length,
+          name="sequence_length")
 
     batch_size = _best_effort_input_batch_size(flat_input)
 
@@ -650,9 +683,10 @@
       x_shape = array_ops.shape(x)
       packed_shape = array_ops.stack(shape)
       return control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)),
-          ["Expected shape for Tensor %s is " % x.name,
-           packed_shape, " but saw shape: ", x_shape])
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
+              "Expected shape for Tensor %s is " % x.name, packed_shape,
+              " but saw shape: ", x_shape
+          ])
 
     if not context.executing_eagerly() and sequence_length is not None:
       # Perform some shape validation
@@ -696,8 +730,8 @@
     inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
       tuple of such elements.
     initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
-      `cell.state_size` is a tuple, then this should be a tuple of
-      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+      `cell.state_size` is a tuple, then this should be a tuple of tensors
+      having shapes `[batch_size, s] for s in cell.state_size`.
     parallel_iterations: Positive Python int.
     swap_memory: A Python boolean
     sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
@@ -736,8 +770,8 @@
   time_steps = input_shape[0]
   batch_size = _best_effort_input_batch_size(flat_input)
 
-  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
-                           for input_ in flat_input)
+  inputs_got_shape = tuple(
+      input_.get_shape().with_rank_at_least(3) for input_ in flat_input)
 
   const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
 
@@ -762,10 +796,10 @@
     return array_ops.zeros(
         array_ops.stack(size), _infer_state_dtype(dtype, state))
 
-  flat_zero_output = tuple(_create_zero_arrays(output)
-                           for output in flat_output_size)
-  zero_output = nest.pack_sequence_as(structure=cell.output_size,
-                                      flat_sequence=flat_zero_output)
+  flat_zero_output = tuple(
+      _create_zero_arrays(output) for output in flat_output_size)
+  zero_output = nest.pack_sequence_as(
+      structure=cell.output_size, flat_sequence=flat_zero_output)
 
   if sequence_length is not None:
     min_sequence_length = math_ops.reduce_min(sequence_length)
@@ -779,19 +813,20 @@
     base_name = scope
 
   def _create_ta(name, element_shape, dtype):
-    return tensor_array_ops.TensorArray(dtype=dtype,
-                                        size=time_steps,
-                                        element_shape=element_shape,
-                                        tensor_array_name=base_name + name)
+    return tensor_array_ops.TensorArray(
+        dtype=dtype,
+        size=time_steps,
+        element_shape=element_shape,
+        tensor_array_name=base_name + name)
 
   in_graph_mode = not context.executing_eagerly()
   if in_graph_mode:
     output_ta = tuple(
         _create_ta(
             "output_%d" % i,
-            element_shape=(tensor_shape.TensorShape([const_batch_size])
-                           .concatenate(
-                               _maybe_tensor_shape_from_tensor(out_size))),
+            element_shape=(
+                tensor_shape.TensorShape([const_batch_size]).concatenate(
+                    _maybe_tensor_shape_from_tensor(out_size))),
             dtype=_infer_state_dtype(dtype, state))
         for i, out_size in enumerate(flat_output_size))
     input_ta = tuple(
@@ -800,8 +835,8 @@
             element_shape=flat_input_i.shape[1:],
             dtype=flat_input_i.dtype)
         for i, flat_input_i in enumerate(flat_input))
-    input_ta = tuple(ta.unstack(input_)
-                     for ta, input_ in zip(input_ta, flat_input))
+    input_ta = tuple(
+        ta.unstack(input_) for ta, input_ in zip(input_ta, flat_input))
   else:
     output_ta = tuple([0 for _ in range(time_steps.numpy())]
                       for i in range(len(flat_output_size)))
@@ -866,8 +901,8 @@
   if in_graph_mode:
     # Make sure that we run at least 1 step, if necessary, to ensure
     # the TensorArrays pick up the dynamic shape.
-    loop_bound = math_ops.minimum(
-        time_steps, math_ops.maximum(1, max_sequence_length))
+    loop_bound = math_ops.minimum(time_steps,
+                                  math_ops.maximum(1, max_sequence_length))
   else:
     # Using max_sequence_length isn't currently supported in the Eager branch.
     loop_bound = time_steps
@@ -885,8 +920,9 @@
     final_outputs = tuple(ta.stack() for ta in output_final_ta)
     # Restore some shape information
     for output, output_size in zip(final_outputs, flat_output_size):
-      shape = _concat(
-          [const_time_steps, const_batch_size], output_size, static=True)
+      shape = _concat([const_time_steps, const_batch_size],
+                      output_size,
+                      static=True)
       output.set_shape(shape)
   else:
     final_outputs = output_final_ta
@@ -901,8 +937,11 @@
 
 
 @tf_export(v1=["nn.raw_rnn"])
-def raw_rnn(cell, loop_fn,
-            parallel_iterations=None, swap_memory=False, scope=None):
+def raw_rnn(cell,
+            loop_fn,
+            parallel_iterations=None,
+            swap_memory=False,
+            scope=None):
   """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.
 
   **NOTE: This method is still in testing, and the API may change.**
@@ -948,13 +987,14 @@
   A simple implementation of `dynamic_rnn` via `raw_rnn` looks like this:
 
   ```python
-  inputs = tf.placeholder(shape=(max_time, batch_size, input_depth),
+  inputs = tf.compat.v1.placeholder(shape=(max_time, batch_size, input_depth),
                           dtype=tf.float32)
-  sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32)
+  sequence_length = tf.compat.v1.placeholder(
+      shape=(batch_size,), dtype=tf.int32)
   inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
   inputs_ta = inputs_ta.unstack(inputs)
 
-  cell = tf.contrib.rnn.LSTMCell(num_units)
+  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units)
 
   def loop_fn(time, cell_output, cell_state, loop_state):
     emit_output = cell_output  # == None for time == 0
@@ -978,68 +1018,60 @@
 
   Args:
     cell: An instance of RNNCell.
-    loop_fn: A callable that takes inputs
-      `(time, cell_output, cell_state, loop_state)`
-      and returns the tuple
-      `(finished, next_input, next_cell_state, emit_output, next_loop_state)`.
-      Here `time` is an int32 scalar `Tensor`, `cell_output` is a
-      `Tensor` or (possibly nested) tuple of tensors as determined by
-      `cell.output_size`, and `cell_state` is a `Tensor`
-      or (possibly nested) tuple of tensors, as determined by the `loop_fn`
-      on its first call (and should match `cell.state_size`).
+    loop_fn: A callable that takes inputs `(time, cell_output, cell_state,
+      loop_state)` and returns the tuple `(finished, next_input,
+      next_cell_state, emit_output, next_loop_state)`. Here `time` is an int32
+      scalar `Tensor`, `cell_output` is a `Tensor` or (possibly nested) tuple of
+      tensors as determined by `cell.output_size`, and `cell_state` is a
+      `Tensor` or (possibly nested) tuple of tensors, as determined by the
+      `loop_fn` on its first call (and should match `cell.state_size`).
       The outputs are: `finished`, a boolean `Tensor` of
       shape `[batch_size]`, `next_input`: the next input to feed to `cell`,
       `next_cell_state`: the next state to feed to `cell`,
-      and `emit_output`: the output to store for this iteration.
-
-      Note that `emit_output` should be a `Tensor` or (possibly nested)
-      tuple of tensors which is aggregated in the `emit_ta` inside the
-      `while_loop`. For the first call to `loop_fn`, the `emit_output`
-      corresponds to the `emit_structure` which is then used to determine the
-      size of the `zero_tensor` for the `emit_ta` (defaults to
-      `cell.output_size`). For the subsequent calls to the `loop_fn`, the
-      `emit_output` corresponds to the actual output tensor
-      that is to be aggregated in the `emit_ta`. The parameter `cell_state`
-      and output `next_cell_state` may be either a single or (possibly nested)
-      tuple of tensors.  The parameter `loop_state` and
-      output `next_loop_state` may be either a single or (possibly nested) tuple
-      of `Tensor` and `TensorArray` objects.  This last parameter
-      may be ignored by `loop_fn` and the return value may be `None`.  If it
-      is not `None`, then the `loop_state` will be propagated through the RNN
-      loop, for use purely by `loop_fn` to keep track of its own state.
-      The `next_loop_state` parameter returned may be `None`.
-
-      The first call to `loop_fn` will be `time = 0`, `cell_output = None`,
-      `cell_state = None`, and `loop_state = None`.  For this call:
-      The `next_cell_state` value should be the value with which to initialize
-      the cell's state.  It may be a final state from a previous RNN or it
-      may be the output of `cell.zero_state()`.  It should be a
-      (possibly nested) tuple structure of tensors.
-      If `cell.state_size` is an integer, this must be
-      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
-      If `cell.state_size` is a `TensorShape`, this must be a `Tensor` of
-      appropriate type and shape `[batch_size] + cell.state_size`.
-      If `cell.state_size` is a (possibly nested) tuple of ints or
-      `TensorShape`, this will be a tuple having the corresponding shapes.
-      The `emit_output` value may be either `None` or a (possibly nested)
-      tuple structure of tensors, e.g.,
-      `(tf.zeros(shape_0, dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`.
-      If this first `emit_output` return value is `None`,
-      then the `emit_ta` result of `raw_rnn` will have the same structure and
-      dtypes as `cell.output_size`.  Otherwise `emit_ta` will have the same
-      structure, shapes (prepended with a `batch_size` dimension), and dtypes
-      as `emit_output`.  The actual values returned for `emit_output` at this
-      initializing call are ignored.  Note, this emit structure must be
-      consistent across all time steps.
-
+      and `emit_output`: the output to store for this iteration.  Note that
+      `emit_output` should be a `Tensor` or (possibly nested) tuple of tensors
+      which is aggregated in the `emit_ta` inside the `while_loop`. For the
+      first call to `loop_fn`, the `emit_output` corresponds to the
+      `emit_structure` which is then used to determine the size of the
+      `zero_tensor` for the `emit_ta` (defaults to `cell.output_size`). For
+      the subsequent calls to the `loop_fn`, the `emit_output` corresponds to
+      the actual output tensor that is to be aggregated in the `emit_ta`. The
+      parameter `cell_state` and output `next_cell_state` may be either a
+      single or (possibly nested) tuple of tensors.  The parameter
+      `loop_state` and output `next_loop_state` may be either a single or
+      (possibly nested) tuple of `Tensor` and `TensorArray` objects.  This
+      last parameter may be ignored by `loop_fn` and the return value may be
+      `None`.  If it is not `None`, then the `loop_state` will be propagated
+      through the RNN loop, for use purely by `loop_fn` to keep track of its
+      own state. The `next_loop_state` parameter returned may be `None`.  The
+      first call to `loop_fn` will be `time = 0`, `cell_output = None`,
+      `cell_state = None`, and `loop_state = None`.  For this call: The
+      `next_cell_state` value should be the value with which to initialize the
+      cell's state.  It may be a final state from a previous RNN or it may be
+      the output of `cell.zero_state()`.  It should be a (possibly nested)
+      tuple structure of tensors. If `cell.state_size` is an integer, this
+      must be a `Tensor` of appropriate type and shape `[batch_size,
+      cell.state_size]`. If `cell.state_size` is a `TensorShape`, this must be
+      a `Tensor` of appropriate type and shape `[batch_size] +
+      cell.state_size`. If `cell.state_size` is a (possibly nested) tuple of
+      ints or `TensorShape`, this will be a tuple having the corresponding
+      shapes. The `emit_output` value may be either `None` or a (possibly
+      nested) tuple structure of tensors, e.g., `(tf.zeros(shape_0,
+      dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`. If this first
+      `emit_output` return value is `None`, then the `emit_ta` result of
+      `raw_rnn` will have the same structure and dtypes as `cell.output_size`.
+      Otherwise `emit_ta` will have the same structure, shapes (prepended with
+      a `batch_size` dimension), and dtypes as `emit_output`.  The actual
+      values returned for `emit_output` at this initializing call are ignored.
+      Note, this emit structure must be consistent across all time steps.
     parallel_iterations: (Default: 32).  The number of iterations to run in
-      parallel.  Those operations which do not have any temporal dependency
-      and can be run in parallel, will be.  This parameter trades off
-      time for space.  Values >> 1 use more memory but take less time,
-      while smaller values use less memory but computations take longer.
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
     swap_memory: Transparently swap the tensors produced in forward inference
-      but needed for back prop from GPU to CPU.  This allows training RNNs
-      which would typically not fit on a single GPU, with very minimal (or no)
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
       performance penalty.
     scope: VariableScope for the created subgraph; defaults to "rnn".
 
@@ -1086,14 +1118,15 @@
         varscope.set_caching_device(lambda op: op.device)
 
     time = constant_op.constant(0, dtype=dtypes.int32)
-    (elements_finished, next_input, initial_state, emit_structure,
-     init_loop_state) = loop_fn(
+    (elements_finished, next_input,
+     initial_state, emit_structure, init_loop_state) = loop_fn(
          time, None, None, None)  # time, cell_output, cell_state, loop_state
     flat_input = nest.flatten(next_input)
 
     # Need a surrogate loop state for the while_loop if none is available.
-    loop_state = (init_loop_state if init_loop_state is not None
-                  else constant_op.constant(0, dtype=dtypes.int32))
+    loop_state = (
+        init_loop_state if init_loop_state is not None else
+        constant_op.constant(0, dtype=dtypes.int32))
 
     input_shape = [input_.get_shape() for input_ in flat_input]
     static_batch_size = tensor_shape.dimension_at_index(input_shape[0], 0)
@@ -1112,13 +1145,14 @@
     state = initial_state
     flat_state = nest.flatten(state)
     flat_state = [ops.convert_to_tensor(s) for s in flat_state]
-    state = nest.pack_sequence_as(structure=state,
-                                  flat_sequence=flat_state)
+    state = nest.pack_sequence_as(structure=state, flat_sequence=flat_state)
 
     if emit_structure is not None:
       flat_emit_structure = nest.flatten(emit_structure)
-      flat_emit_size = [emit.shape if emit.shape.is_fully_defined() else
-                        array_ops.shape(emit) for emit in flat_emit_structure]
+      flat_emit_size = [
+          emit.shape if emit.shape.is_fully_defined() else array_ops.shape(emit)
+          for emit in flat_emit_structure
+      ]
       flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure]
     else:
       emit_structure = cell.output_size
@@ -1129,26 +1163,28 @@
         tensor_array_ops.TensorArray(
             dtype=dtype_i,
             dynamic_size=True,
-            element_shape=(tensor_shape.TensorShape([const_batch_size])
-                           .concatenate(
-                               _maybe_tensor_shape_from_tensor(size_i))),
+            element_shape=(tensor_shape.TensorShape([
+                const_batch_size
+            ]).concatenate(_maybe_tensor_shape_from_tensor(size_i))),
             size=0,
             name="rnn_output_%d" % i)
-        for i, (dtype_i, size_i)
-        in enumerate(zip(flat_emit_dtypes, flat_emit_size))]
-    emit_ta = nest.pack_sequence_as(structure=emit_structure,
-                                    flat_sequence=flat_emit_ta)
+        for i, (dtype_i,
+                size_i) in enumerate(zip(flat_emit_dtypes, flat_emit_size))
+    ]
+    emit_ta = nest.pack_sequence_as(
+        structure=emit_structure, flat_sequence=flat_emit_ta)
     flat_zero_emit = [
         array_ops.zeros(_concat(batch_size, size_i), dtype_i)
-        for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)]
-    zero_emit = nest.pack_sequence_as(structure=emit_structure,
-                                      flat_sequence=flat_zero_emit)
+        for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)
+    ]
+    zero_emit = nest.pack_sequence_as(
+        structure=emit_structure, flat_sequence=flat_zero_emit)
 
     def condition(unused_time, elements_finished, *_):
       return math_ops.logical_not(math_ops.reduce_all(elements_finished))
 
-    def body(time, elements_finished, current_input,
-             emit_ta, state, loop_state):
+    def body(time, elements_finished, current_input, emit_ta, state,
+             loop_state):
       """Internal while loop body for raw_rnn.
 
       Args:
@@ -1169,8 +1205,8 @@
 
       next_time = time + 1
       (next_finished, next_input, next_state, emit_output,
-       next_loop_state) = loop_fn(
-           next_time, next_output, cell_state, loop_state)
+       next_loop_state) = loop_fn(next_time, next_output, cell_state,
+                                  loop_state)
 
       nest.assert_same_structure(state, next_state)
       nest.assert_same_structure(current_input, next_input)
@@ -1182,6 +1218,7 @@
 
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
+
         def copy_fn(cur_i, cand_i):
           # TensorArray and scalar get passed through.
           if isinstance(cur_i, tensor_array_ops.TensorArray):
@@ -1191,23 +1228,26 @@
           # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
             return array_ops.where(elements_finished, cur_i, cand_i)
+
         return nest.map_structure(copy_fn, current, candidate)
 
       emit_output = _copy_some_through(zero_emit, emit_output)
       next_state = _copy_some_through(state, next_state)
 
-      emit_ta = nest.map_structure(
-          lambda ta, emit: ta.write(time, emit), emit_ta, emit_output)
+      emit_ta = nest.map_structure(lambda ta, emit: ta.write(time, emit),
+                                   emit_ta, emit_output)
 
       elements_finished = math_ops.logical_or(elements_finished, next_finished)
 
-      return (next_time, elements_finished, next_input,
-              emit_ta, next_state, loop_state)
+      return (next_time, elements_finished, next_input, emit_ta, next_state,
+              loop_state)
 
     returned = control_flow_ops.while_loop(
-        condition, body, loop_vars=[
-            time, elements_finished, next_input,
-            emit_ta, state, loop_state],
+        condition,
+        body,
+        loop_vars=[
+            time, elements_finished, next_input, emit_ta, state, loop_state
+        ],
         parallel_iterations=parallel_iterations,
         swap_memory=swap_memory)
 
@@ -1219,9 +1259,9 @@
     return (emit_ta, final_state, final_loop_state)
 
 
-@deprecation.deprecated(
-    None, "Please use `keras.layers.RNN(cell, unroll=True)`, "
-    "which is equivalent to this API")
+@deprecation.deprecated(None,
+                        "Please use `keras.layers.RNN(cell, unroll=True)`, "
+                        "which is equivalent to this API")
 @tf_export(v1=["nn.static_rnn"])
 def static_rnn(cell,
                inputs,
@@ -1261,18 +1301,18 @@
 
   Args:
     cell: An instance of RNNCell.
-    inputs: A length T list of inputs, each a `Tensor` of shape
-      `[batch_size, input_size]`, or a nested tuple of such elements.
-    initial_state: (optional) An initial state for the RNN.
-      If `cell.state_size` is an integer, this must be
-      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
-      If `cell.state_size` is a tuple, this should be a tuple of
-      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+    inputs: A length T list of inputs, each a `Tensor` of shape `[batch_size,
+      input_size]`, or a nested tuple of such elements.
+    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
+      is an integer, this must be a `Tensor` of appropriate type and shape
+      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
+      should be a tuple of tensors having shapes `[batch_size, s] for s in
+      cell.state_size`.
     dtype: (optional) The data type for the initial state and expected output.
       Required if initial_state is not provided or RNN state has a heterogeneous
       dtype.
-    sequence_length: Specifies the length of each sequence in inputs.
-      An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`.
+    sequence_length: Specifies the length of each sequence in inputs. An int32
+      or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`.
     scope: VariableScope for the created subgraph; defaults to "rnn".
 
   Returns:
@@ -1356,9 +1396,10 @@
         size = _concat(batch_size, output_size)
         output = array_ops.zeros(
             array_ops.stack(size), _infer_state_dtype(dtype, state))
-        shape = _concat(tensor_shape.dimension_value(fixed_batch_size),
-                        output_size,
-                        static=True)
+        shape = _concat(
+            tensor_shape.dimension_value(fixed_batch_size),
+            output_size,
+            static=True)
         output.set_shape(tensor_shape.TensorShape(shape))
         return output
 
@@ -1403,9 +1444,9 @@
     return (outputs, state)
 
 
-@deprecation.deprecated(
-    None, "Please use `keras.layers.RNN(cell, stateful=True)`, "
-    "which is equivalent to this API")
+@deprecation.deprecated(None,
+                        "Please use `keras.layers.RNN(cell, stateful=True)`, "
+                        "which is equivalent to this API")
 @tf_export(v1=["nn.static_state_saving_rnn"])
 def static_state_saving_rnn(cell,
                             inputs,
@@ -1417,16 +1458,15 @@
 
   Args:
     cell: An instance of `RNNCell`.
-    inputs: A length T list of inputs, each a `Tensor` of shape
-      `[batch_size, input_size]`.
+    inputs: A length T list of inputs, each a `Tensor` of shape `[batch_size,
+      input_size]`.
     state_saver: A state saver object with methods `state` and `save_state`.
     state_name: Python string or tuple of strings.  The name to use with the
-      state_saver. If the cell returns tuples of states (i.e.,
-      `cell.state_size` is a tuple) then `state_name` should be a tuple of
-      strings having the same length as `cell.state_size`.  Otherwise it should
-      be a single string.
-    sequence_length: (optional) An int32/int64 vector size [batch_size].
-      See the documentation for rnn() for more details about sequence_length.
+      state_saver. If the cell returns tuples of states (i.e., `cell.state_size`
+      is a tuple) then `state_name` should be a tuple of strings having the same
+      length as `cell.state_size`.  Otherwise it should be a single string.
+    sequence_length: (optional) An int32/int64 vector size [batch_size]. See the
+      documentation for rnn() for more details about sequence_length.
     scope: VariableScope for the created subgraph; defaults to "rnn".
 
   Returns:
@@ -1445,8 +1485,8 @@
 
   if state_is_tuple != state_name_tuple:
     raise ValueError("state_name should be the same type as cell.state_size.  "
-                     "state_name: %s, cell.state_size: %s" % (str(state_name),
-                                                              str(state_size)))
+                     "state_name: %s, cell.state_size: %s" %
+                     (str(state_name), str(state_size)))
 
   if state_is_tuple:
     state_name_flat = nest.flatten(state_name)
@@ -1524,17 +1564,17 @@
   Args:
     cell_fw: An instance of RNNCell, to be used for forward direction.
     cell_bw: An instance of RNNCell, to be used for backward direction.
-    inputs: A length T list of inputs, each a tensor of shape
-      [batch_size, input_size], or a nested tuple of such elements.
-    initial_state_fw: (optional) An initial state for the forward RNN.
-      This must be a tensor of appropriate type and shape
-      `[batch_size, cell_fw.state_size]`.
-      If `cell_fw.state_size` is a tuple, this should be a tuple of
-      tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
-    initial_state_bw: (optional) Same as for `initial_state_fw`, but using
-      the corresponding properties of `cell_bw`.
-    dtype: (optional) The data type for the initial state.  Required if
-      either of the initial states are not provided.
+    inputs: A length T list of inputs, each a tensor of shape [batch_size,
+      input_size], or a nested tuple of such elements.
+    initial_state_fw: (optional) An initial state for the forward RNN. This must
+      be a tensor of appropriate type and shape `[batch_size,
+      cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a
+      tuple of tensors having shapes `[batch_size, s] for s in
+      cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using the
+      corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial state.  Required if either
+      of the initial states are not provided.
     sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
       containing the actual lengths for each of the sequences.
     scope: VariableScope for the created subgraph; defaults to
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index dc545e1..19b7bfa 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -56,7 +56,6 @@
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
-
 _BIAS_VARIABLE_NAME = "bias"
 _WEIGHTS_VARIABLE_NAME = "kernel"
 
@@ -81,8 +80,8 @@
   ASSERT_LIKE_RNNCELL_ERROR_REGEXP.
 
   Args:
-    cell_name: A string to give a meaningful error referencing to the name
-      of the functionargument.
+    cell_name: A string to give a meaningful error referencing to the name of
+      the function argument.
     cell: The object which should behave like an RNNCell.
 
   Raises:
@@ -95,8 +94,7 @@
       callable(cell),
   ]
   errors = [
-      "'output_size' property is missing",
-      "'state_size' property is missing",
+      "'output_size' property is missing", "'state_size' property is missing",
       "either 'zero_state' or 'get_initial_state' method is required",
       "is not callable"
   ]
@@ -141,8 +139,9 @@
   else:
     p = tensor_shape.as_shape(prefix)
     p_static = p.as_list() if p.ndims is not None else None
-    p = (constant_op.constant(p.as_list(), dtype=dtypes.int32)
-         if p.is_fully_defined() else None)
+    p = (
+        constant_op.constant(p.as_list(), dtype=dtypes.int32)
+        if p.is_fully_defined() else None)
   if isinstance(suffix, ops.Tensor):
     s = suffix
     s_static = tensor_util.constant_value(suffix)
@@ -154,22 +153,24 @@
   else:
     s = tensor_shape.as_shape(suffix)
     s_static = s.as_list() if s.ndims is not None else None
-    s = (constant_op.constant(s.as_list(), dtype=dtypes.int32)
-         if s.is_fully_defined() else None)
+    s = (
+        constant_op.constant(s.as_list(), dtype=dtypes.int32)
+        if s.is_fully_defined() else None)
 
   if static:
     shape = tensor_shape.as_shape(p_static).concatenate(s_static)
     shape = shape.as_list() if shape.ndims is not None else None
   else:
     if p is None or s is None:
-      raise ValueError("Provided a prefix or suffix of None: %s and %s"
-                       % (prefix, suffix))
+      raise ValueError("Provided a prefix or suffix of None: %s and %s" %
+                       (prefix, suffix))
     shape = array_ops.concat((p, s), 0)
   return shape
 
 
 def _zero_state_tensors(state_size, batch_size, dtype):
   """Create tensors of zeros based on state_size, batch_size, and dtype."""
+
   def get_state_shape(s):
     """Combine s with batch_size to get a proper tensor shape."""
     c = _concat(batch_size, s)
@@ -178,6 +179,7 @@
       c_static = _concat(batch_size, s, static=True)
       size.set_shape(c_static)
     return size
+
   return nest.map_structure(get_state_shape, state_size)
 
 
@@ -220,8 +222,8 @@
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: if `self.state_size` is an integer, this should be a `2-D Tensor`
         with shape `[batch_size, self.state_size]`.  Otherwise, if
-        `self.state_size` is a tuple of integers, this should be a tuple
-        with shapes `[batch_size, s] for s in self.state_size`.
+        `self.state_size` is a tuple of integers, this should be a tuple with
+        shapes `[batch_size, s] for s in self.state_size`.
       scope: VariableScope for the created subgraph; defaults to class name.
 
     Returns:
@@ -232,15 +234,15 @@
         the arity and shapes of `state`.
     """
     if scope is not None:
-      with vs.variable_scope(scope,
-                             custom_getter=self._rnn_get_variable) as scope:
+      with vs.variable_scope(
+          scope, custom_getter=self._rnn_get_variable) as scope:
         return super(RNNCell, self).__call__(inputs, state, scope=scope)
     else:
       scope_attrname = "rnncell_scope"
       scope = getattr(self, scope_attrname, None)
       if scope is None:
-        scope = vs.variable_scope(vs.get_variable_scope(),
-                                  custom_getter=self._rnn_get_variable)
+        scope = vs.variable_scope(
+            vs.get_variable_scope(), custom_getter=self._rnn_get_variable)
         setattr(self, scope_attrname, scope)
       with scope:
         return super(RNNCell, self).__call__(inputs, state)
@@ -331,8 +333,7 @@
     if is_eager and _hasattr(self, "_last_zero_state"):
       (last_state_size, last_batch_size, last_dtype,
        last_output) = getattr(self, "_last_zero_state")
-      if (last_batch_size == batch_size and
-          last_dtype == dtype and
+      if (last_batch_size == batch_size and last_dtype == dtype and
           last_state_size == state_size):
         return last_output
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
@@ -346,14 +347,15 @@
   """Subclass of RNNCells that act like proper `tf.Layer` objects.
 
   For backwards compatibility purposes, most `RNNCell` instances allow their
-  `call` methods to instantiate variables via `tf.get_variable`.  The underlying
+  `call` methods to instantiate variables via `tf.compat.v1.get_variable`.  The
+  underlying
   variable scope thus keeps track of any variables, and returning cached
   versions.  This is atypical of `tf.layer` objects, which separate this
   part of layer building into a `build` method that is only called once.
 
   Here we provide a subclass for `RNNCell` objects that act exactly as
   `Layer` objects do.  They must provide a `build` method and their
-  `call` methods do not access Variables `tf.get_variable`.
+  `call` methods do not access Variables via `tf.compat.v1.get_variable`.
   """
 
   def __call__(self, inputs, state, scope=None, *args, **kwargs):
@@ -363,8 +365,8 @@
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: if `self.state_size` is an integer, this should be a `2-D Tensor`
         with shape `[batch_size, self.state_size]`.  Otherwise, if
-        `self.state_size` is a tuple of integers, this should be a tuple
-        with shapes `[batch_size, s] for s in self.state_size`.
+        `self.state_size` is a tuple of integers, this should be a tuple with
+        shapes `[batch_size, s] for s in self.state_size`.
       scope: optional cell scope.
       *args: Additional positional arguments.
       **kwargs: Additional keyword arguments.
@@ -379,8 +381,8 @@
     # Bypass RNNCell's variable capturing semantics for LayerRNNCell.
     # Instead, it is up to subclasses to provide a proper build
     # method.  See the class docstring for more details.
-    return base_layer.Layer.__call__(self, inputs, state, scope=scope,
-                                     *args, **kwargs)
+    return base_layer.Layer.__call__(
+        self, inputs, state, scope=scope, *args, **kwargs)
 
 
 @tf_export(v1=["nn.rnn_cell.BasicRNNCell"])
@@ -394,20 +396,19 @@
     num_units: int, The number of units in the RNN cell.
     activation: Nonlinearity to use.  Default: `tanh`. It could also be string
       that is within Keras activation function names.
-    reuse: (optional) Python boolean describing whether to reuse variables
-     in an existing scope.  If not `True`, and the existing scope already has
-     the given variables, an error is raised.
-    name: String, the name of the layer. Layers with the same name will
-      share weights, but to avoid mistakes we require reuse=True in such
-      cases.
-    dtype: Default dtype of the layer (default of `None` means use the type
-      of the first input). Required when `build` is called before `call`.
+    reuse: (optional) Python boolean describing whether to reuse variables in an
+      existing scope.  If not `True`, and the existing scope already has the
+      given variables, an error is raised.
+    name: String, the name of the layer. Layers with the same name will share
+      weights, but to avoid mistakes we require reuse=True in such cases.
+    dtype: Default dtype of the layer (default of `None` means use the type of
+      the first input). Required when `build` is called before `call`.
     **kwargs: Dict, keyword named properties for common layer attributes, like
       `trainable` etc when constructing the cell from configs of get_config().
   """
 
   @deprecated(None, "This class is equivalent as tf.keras.layers.SimpleRNNCell,"
-                    " and will be replaced by that in Tensorflow 2.0.")
+              " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                activation=None,
@@ -419,9 +420,10 @@
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
     _check_supported_dtypes(self.dtype)
     if context.executing_eagerly() and context.num_gpus() > 0:
-      logging.warn("%s: Note that this cell is not optimized for performance. "
-                   "Please use tf.contrib.cudnn_rnn.CudnnRNNTanh for better "
-                   "performance on GPU.", self)
+      logging.warn(
+          "%s: Note that this cell is not optimized for performance. "
+          "Please use tf.contrib.cudnn_rnn.CudnnRNNTanh for better "
+          "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
     self.input_spec = input_spec.InputSpec(ndim=2)
@@ -443,8 +445,8 @@
   @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
     if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
-                       % str(inputs_shape))
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       str(inputs_shape))
     _check_supported_dtypes(self.dtype)
 
     input_depth = inputs_shape[-1]
@@ -479,7 +481,9 @@
 
 @tf_export(v1=["nn.rnn_cell.GRUCell"])
 class GRUCell(LayerRNNCell):
-  """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
+  """Gated Recurrent Unit cell.
+
+  See http://arxiv.org/abs/1406.1078.
 
   Note that this cell is not optimized for performance. Please use
   `tf.contrib.cudnn_rnn.CudnnGRU` for better performance on GPU, or
@@ -488,23 +492,22 @@
   Args:
     num_units: int, The number of units in the GRU cell.
     activation: Nonlinearity to use.  Default: `tanh`.
-    reuse: (optional) Python boolean describing whether to reuse variables
-     in an existing scope.  If not `True`, and the existing scope already has
-     the given variables, an error is raised.
+    reuse: (optional) Python boolean describing whether to reuse variables in an
+      existing scope.  If not `True`, and the existing scope already has the
+      given variables, an error is raised.
     kernel_initializer: (optional) The initializer to use for the weight and
-    projection matrices.
+      projection matrices.
     bias_initializer: (optional) The initializer to use for the bias.
-    name: String, the name of the layer. Layers with the same name will
-      share weights, but to avoid mistakes we require reuse=True in such
-      cases.
-    dtype: Default dtype of the layer (default of `None` means use the type
-      of the first input). Required when `build` is called before `call`.
+    name: String, the name of the layer. Layers with the same name will share
+      weights, but to avoid mistakes we require reuse=True in such cases.
+    dtype: Default dtype of the layer (default of `None` means use the type of
+      the first input). Required when `build` is called before `call`.
     **kwargs: Dict, keyword named properties for common layer attributes, like
       `trainable` etc when constructing the cell from configs of get_config().
   """
 
   @deprecated(None, "This class is equivalent as tf.keras.layers.GRUCell,"
-                    " and will be replaced by that in Tensorflow 2.0.")
+              " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                activation=None,
@@ -519,9 +522,10 @@
     _check_supported_dtypes(self.dtype)
 
     if context.executing_eagerly() and context.num_gpus() > 0:
-      logging.warn("%s: Note that this cell is not optimized for performance. "
-                   "Please use tf.contrib.cudnn_rnn.CudnnGRU for better "
-                   "performance on GPU.", self)
+      logging.warn(
+          "%s: Note that this cell is not optimized for performance. "
+          "Please use tf.contrib.cudnn_rnn.CudnnGRU for better "
+          "performance on GPU.", self)
     # Inputs must be 2-dimensional.
     self.input_spec = input_spec.InputSpec(ndim=2)
 
@@ -544,8 +548,8 @@
   @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
     if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
-                       % str(inputs_shape))
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       str(inputs_shape))
     _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     self._gate_kernel = self.add_variable(
@@ -555,10 +559,9 @@
     self._gate_bias = self.add_variable(
         "gates/%s" % _BIAS_VARIABLE_NAME,
         shape=[2 * self._num_units],
-        initializer=(
-            self._bias_initializer
-            if self._bias_initializer is not None
-            else init_ops.constant_initializer(1.0, dtype=self.dtype)))
+        initializer=(self._bias_initializer
+                     if self._bias_initializer is not None else
+                     init_ops.constant_initializer(1.0, dtype=self.dtype)))
     self._candidate_kernel = self.add_variable(
         "candidate/%s" % _WEIGHTS_VARIABLE_NAME,
         shape=[input_depth + self._num_units, self._num_units],
@@ -566,10 +569,9 @@
     self._candidate_bias = self.add_variable(
         "candidate/%s" % _BIAS_VARIABLE_NAME,
         shape=[self._num_units],
-        initializer=(
-            self._bias_initializer
-            if self._bias_initializer is not None
-            else init_ops.zeros_initializer(dtype=self.dtype)))
+        initializer=(self._bias_initializer
+                     if self._bias_initializer is not None else
+                     init_ops.zeros_initializer(dtype=self.dtype)))
 
     self.built = True
 
@@ -631,7 +633,7 @@
 
 @tf_export(v1=["nn.rnn_cell.BasicLSTMCell"])
 class BasicLSTMCell(LayerRNNCell):
-  """DEPRECATED: Please use `tf.nn.rnn_cell.LSTMCell` instead.
+  """DEPRECATED: Please use `tf.compat.v1.nn.rnn_cell.LSTMCell` instead.
 
   Basic LSTM recurrent network cell.
 
@@ -643,7 +645,7 @@
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
 
-  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
+  For advanced models, please use the full `tf.compat.v1.nn.rnn_cell.LSTMCell`
   that follows.
 
   Note that this cell is not optimized for performance. Please use
@@ -653,7 +655,7 @@
   """
 
   @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
-                    " and will be replaced by that in Tensorflow 2.0.")
+              " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                forget_bias=1.0,
@@ -667,38 +669,37 @@
 
     Args:
       num_units: int, The number of units in the LSTM cell.
-      forget_bias: float, The bias added to forget gates (see above).
-        Must set to `0.0` manually when restoring from CudnnLSTM-trained
-        checkpoints.
-      state_is_tuple: If True, accepted and returned states are 2-tuples of
-        the `c_state` and `m_state`.  If False, they are concatenated
-        along the column axis.  The latter behavior will soon be deprecated.
+      forget_bias: float, The bias added to forget gates (see above). Must set
+        to `0.0` manually when restoring from CudnnLSTM-trained checkpoints.
+      state_is_tuple: If True, accepted and returned states are 2-tuples of the
+        `c_state` and `m_state`.  If False, they are concatenated along the
+        column axis.  The latter behavior will soon be deprecated.
       activation: Activation function of the inner states.  Default: `tanh`. It
         could also be string that is within Keras activation function names.
-      reuse: (optional) Python boolean describing whether to reuse variables
-        in an existing scope.  If not `True`, and the existing scope already has
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
-      name: String, the name of the layer. Layers with the same name will
-        share weights, but to avoid mistakes we require reuse=True in such
-        cases.
-      dtype: Default dtype of the layer (default of `None` means use the type
-        of the first input). Required when `build` is called before `call`.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
       **kwargs: Dict, keyword named properties for common layer attributes, like
         `trainable` etc when constructing the cell from configs of get_config().
-
-      When restoring from CudnnLSTM-trained checkpoints, must use
-      `CudnnCompatibleLSTMCell` instead.
+        When restoring from CudnnLSTM-trained checkpoints, must use
+        `CudnnCompatibleLSTMCell` instead.
     """
     super(BasicLSTMCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
     _check_supported_dtypes(self.dtype)
     if not state_is_tuple:
-      logging.warn("%s: Using a concatenated state is slower and will soon be "
-                   "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn(
+          "%s: Using a concatenated state is slower and will soon be "
+          "deprecated.  Use state_is_tuple=True.", self)
     if context.executing_eagerly() and context.num_gpus() > 0:
-      logging.warn("%s: Note that this cell is not optimized for performance. "
-                   "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better "
-                   "performance on GPU.", self)
+      logging.warn(
+          "%s: Note that this cell is not optimized for performance. "
+          "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better "
+          "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
     self.input_spec = input_spec.InputSpec(ndim=2)
@@ -723,8 +724,8 @@
   @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
     if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
-                       % str(inputs_shape))
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       str(inputs_shape))
     _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     h_depth = self._num_units
@@ -743,10 +744,9 @@
 
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
-      state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, num_units]`, if `state_is_tuple` has been set to
-        `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * num_units]`.
+      state: An `LSTMStateTuple` of state tensors, each shaped `[batch_size,
+        num_units]`, if `state_is_tuple` has been set to `True`.  Otherwise, a
+        `Tensor` shaped `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
@@ -776,8 +776,9 @@
     # performance improvement. So using those at the cost of readability.
     add = math_ops.add
     multiply = math_ops.multiply
-    new_c = add(multiply(c, sigmoid(add(f, forget_bias_tensor))),
-                multiply(sigmoid(i), self._activation(j)))
+    new_c = add(
+        multiply(c, sigmoid(add(f, forget_bias_tensor))),
+        multiply(sigmoid(i), self._activation(j)))
     new_h = multiply(self._activation(new_c), sigmoid(o))
 
     if self._state_is_tuple:
@@ -827,13 +828,23 @@
   """
 
   @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
-                    " and will be replaced by that in Tensorflow 2.0.")
-  def __init__(self, num_units,
-               use_peepholes=False, cell_clip=None,
-               initializer=None, num_proj=None, proj_clip=None,
-               num_unit_shards=None, num_proj_shards=None,
-               forget_bias=1.0, state_is_tuple=True,
-               activation=None, reuse=None, name=None, dtype=None, **kwargs):
+              " and will be replaced by that in Tensorflow 2.0.")
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               num_unit_shards=None,
+               num_proj_shards=None,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None,
+               **kwargs):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -848,48 +859,48 @@
       proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
         provided, then the projected values are clipped elementwise to within
         `[-proj_clip, proj_clip]`.
-      num_unit_shards: Deprecated, will be removed by Jan. 2017.
-        Use a variable_scope partitioner instead.
-      num_proj_shards: Deprecated, will be removed by Jan. 2017.
-        Use a variable_scope partitioner instead.
-      forget_bias: Biases of the forget gate are initialized by default to 1
-        in order to reduce the scale of forgetting at the beginning of
-        the training. Must set it manually to `0.0` when restoring from
-        CudnnLSTM trained checkpoints.
-      state_is_tuple: If True, accepted and returned states are 2-tuples of
-        the `c_state` and `m_state`.  If False, they are concatenated
-        along the column axis.  This latter behavior will soon be deprecated.
+      num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a
+        variable_scope partitioner instead.
+      num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a
+        variable_scope partitioner instead.
+      forget_bias: Biases of the forget gate are initialized by default to 1 in
+        order to reduce the scale of forgetting at the beginning of the
+        training. Must set it manually to `0.0` when restoring from CudnnLSTM
+        trained checkpoints.
+      state_is_tuple: If True, accepted and returned states are 2-tuples of the
+        `c_state` and `m_state`.  If False, they are concatenated along the
+        column axis.  This latter behavior will soon be deprecated.
       activation: Activation function of the inner states.  Default: `tanh`. It
         could also be string that is within Keras activation function names.
-      reuse: (optional) Python boolean describing whether to reuse variables
-        in an existing scope.  If not `True`, and the existing scope already has
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
-      name: String, the name of the layer. Layers with the same name will
-        share weights, but to avoid mistakes we require reuse=True in such
-        cases.
-      dtype: Default dtype of the layer (default of `None` means use the type
-        of the first input). Required when `build` is called before `call`.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
       **kwargs: Dict, keyword named properties for common layer attributes, like
         `trainable` etc when constructing the cell from configs of get_config().
-
-      When restoring from CudnnLSTM-trained checkpoints, use
-      `CudnnCompatibleLSTMCell` instead.
+        When restoring from CudnnLSTM-trained checkpoints, use
+        `CudnnCompatibleLSTMCell` instead.
     """
     super(LSTMCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
     _check_supported_dtypes(self.dtype)
     if not state_is_tuple:
-      logging.warn("%s: Using a concatenated state is slower and will soon be "
-                   "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn(
+          "%s: Using a concatenated state is slower and will soon be "
+          "deprecated.  Use state_is_tuple=True.", self)
     if num_unit_shards is not None or num_proj_shards is not None:
       logging.warn(
           "%s: The num_unit_shards and proj_unit_shards parameters are "
           "deprecated and will be removed in Jan 2017.  "
           "Use a variable scope with a partitioner instead.", self)
     if context.executing_eagerly() and context.num_gpus() > 0:
-      logging.warn("%s: Note that this cell is not optimized for performance. "
-                   "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better "
-                   "performance on GPU.", self)
+      logging.warn(
+          "%s: Note that this cell is not optimized for performance. "
+          "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better "
+          "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
     self.input_spec = input_spec.InputSpec(ndim=2)
@@ -911,13 +922,13 @@
 
     if num_proj:
       self._state_size = (
-          LSTMStateTuple(num_units, num_proj)
-          if state_is_tuple else num_units + num_proj)
+          LSTMStateTuple(num_units, num_proj) if state_is_tuple else num_units +
+          num_proj)
       self._output_size = num_proj
     else:
       self._state_size = (
-          LSTMStateTuple(num_units, num_units)
-          if state_is_tuple else 2 * num_units)
+          LSTMStateTuple(num_units, num_units) if state_is_tuple else 2 *
+          num_units)
       self._output_size = num_units
 
   @property
@@ -931,15 +942,14 @@
   @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
     if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
-                       % str(inputs_shape))
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       str(inputs_shape))
     _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     h_depth = self._num_units if self._num_proj is None else self._num_proj
     maybe_partitioner = (
         partitioned_variables.fixed_size_partitioner(self._num_unit_shards)
-        if self._num_unit_shards is not None
-        else None)
+        if self._num_unit_shards is not None else None)
     self._kernel = self.add_variable(
         _WEIGHTS_VARIABLE_NAME,
         shape=[input_depth + h_depth, 4 * self._num_units],
@@ -954,18 +964,17 @@
         shape=[4 * self._num_units],
         initializer=initializer)
     if self._use_peepholes:
-      self._w_f_diag = self.add_variable("w_f_diag", shape=[self._num_units],
-                                         initializer=self._initializer)
-      self._w_i_diag = self.add_variable("w_i_diag", shape=[self._num_units],
-                                         initializer=self._initializer)
-      self._w_o_diag = self.add_variable("w_o_diag", shape=[self._num_units],
-                                         initializer=self._initializer)
+      self._w_f_diag = self.add_variable(
+          "w_f_diag", shape=[self._num_units], initializer=self._initializer)
+      self._w_i_diag = self.add_variable(
+          "w_i_diag", shape=[self._num_units], initializer=self._initializer)
+      self._w_o_diag = self.add_variable(
+          "w_o_diag", shape=[self._num_units], initializer=self._initializer)
 
     if self._num_proj is not None:
       maybe_proj_partitioner = (
           partitioned_variables.fixed_size_partitioner(self._num_proj_shards)
-          if self._num_proj_shards is not None
-          else None)
+          if self._num_proj_shards is not None else None)
       self._proj_kernel = self.add_variable(
           "projection/%s" % _WEIGHTS_VARIABLE_NAME,
           shape=[self._num_units, self._num_proj],
@@ -979,10 +988,9 @@
 
     Args:
       inputs: input Tensor, must be 2-D, `[batch, input_size]`.
-      state: if `state_is_tuple` is False, this must be a state Tensor,
-        `2-D, [batch, state_size]`.  If `state_is_tuple` is True, this must be a
-        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
-        `m_state`.
+      state: if `state_is_tuple` is False, this must be a state Tensor, `2-D,
+        [batch, state_size]`.  If `state_is_tuple` is True, this must be a tuple
+        of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
 
     Returns:
       A tuple containing:
@@ -1023,11 +1031,13 @@
         value=lstm_matrix, num_or_size_splits=4, axis=1)
     # Diagonal connections
     if self._use_peepholes:
-      c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
-           sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
+      c = (
+          sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
+          sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
     else:
-      c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
-           self._activation(j))
+      c = (
+          sigmoid(f + self._forget_bias) * c_prev +
+          sigmoid(i) * self._activation(j))
 
     if self._cell_clip is not None:
       # pylint: disable=invalid-unary-operand-type
@@ -1046,8 +1056,9 @@
         m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
         # pylint: enable=invalid-unary-operand-type
 
-    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
-                 array_ops.concat([c, m], 1))
+    new_state = (
+        LSTMStateTuple(c, m)
+        if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
   def get_config(self):
@@ -1071,12 +1082,14 @@
 
 def _enumerated_map_structure_up_to(shallow_structure, map_fn, *args, **kwargs):
   ix = [0]
+
   def enumerated_fn(*inner_args, **inner_kwargs):
     r = map_fn(ix[0], *inner_args, **inner_kwargs)
     ix[0] += 1
     return r
-  return nest.map_structure_up_to(shallow_structure,
-                                  enumerated_fn, *args, **kwargs)
+
+  return nest.map_structure_up_to(shallow_structure, enumerated_fn, *args,
+                                  **kwargs)
 
 
 def _default_dropout_state_filter_visitor(substate):
@@ -1190,17 +1203,22 @@
 class DropoutWrapperBase(object):
   """Operator adding dropout to inputs and outputs of the given cell."""
 
-  def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
-               state_keep_prob=1.0, variational_recurrent=False,
-               input_size=None, dtype=None, seed=None,
+  def __init__(self,
+               cell,
+               input_keep_prob=1.0,
+               output_keep_prob=1.0,
+               state_keep_prob=1.0,
+               variational_recurrent=False,
+               input_size=None,
+               dtype=None,
+               seed=None,
                dropout_state_filter_visitor=None):
     """Create a cell with added input, state, and/or output dropout.
 
     If `variational_recurrent` is set to `True` (**NOT** the default behavior),
     then the same dropout mask is applied at every step, as described in:
-
-    Y. Gal, Z Ghahramani.  "A Theoretically Grounded Application of Dropout in
-    Recurrent Neural Networks".  https://arxiv.org/abs/1512.05287
+    [A Theoretically Grounded Application of Dropout in Recurrent
+    Neural Networks. Y. Gal, Z. Ghahramani](https://arxiv.org/abs/1512.05287).
 
     Otherwise a different dropout mask is applied at every time step.
 
@@ -1217,40 +1235,34 @@
         probability; if it is constant and 1, no output dropout will be added.
       state_keep_prob: unit Tensor or float between 0 and 1, output keep
         probability; if it is constant and 1, no output dropout will be added.
-        State dropout is performed on the outgoing states of the cell.
-        **Note** the state components to which dropout is applied when
-        `state_keep_prob` is in `(0, 1)` are also determined by
-        the argument `dropout_state_filter_visitor` (e.g. by default dropout
-        is never applied to the `c` component of an `LSTMStateTuple`).
-      variational_recurrent: Python bool.  If `True`, then the same
-        dropout pattern is applied across all time steps per run call.
-        If this parameter is set, `input_size` **must** be provided.
+        State dropout is performed on the outgoing states of the cell. **Note**
+        the state components to which dropout is applied when `state_keep_prob`
+        is in `(0, 1)` are also determined by the argument
+        `dropout_state_filter_visitor` (e.g. by default dropout is never applied
+        to the `c` component of an `LSTMStateTuple`).
+      variational_recurrent: Python bool.  If `True`, then the same dropout
+        pattern is applied across all time steps per run call. If this parameter
+        is set, `input_size` **must** be provided.
       input_size: (optional) (possibly nested tuple of) `TensorShape` objects
         containing the depth(s) of the input tensors expected to be passed in to
-        the `DropoutWrapper`.  Required and used **iff**
-         `variational_recurrent = True` and `input_keep_prob < 1`.
+        the `DropoutWrapper`.  Required and used **iff**
+        `variational_recurrent = True` and `input_keep_prob < 1`.
       dtype: (optional) The `dtype` of the input, state, and output tensors.
         Required and used **iff** `variational_recurrent = True`.
       seed: (optional) integer, the randomness seed.
       dropout_state_filter_visitor: (optional), default: (see below).  Function
-        that takes any hierarchical level of the state and returns
-        a scalar or depth=1 structure of Python booleans describing
-        which terms in the state should be dropped out.  In addition, if the
-        function returns `True`, dropout is applied across this sublevel.  If
-        the function returns `False`, dropout is not applied across this entire
-        sublevel.
+        that takes any hierarchical level of the state and returns a scalar or
+        depth=1 structure of Python booleans describing which terms in the state
+        should be dropped out.  In addition, if the function returns `True`,
+        dropout is applied across this sublevel.  If the function returns
+        `False`, dropout is not applied across this entire sublevel.
         Default behavior: perform dropout on all terms except the memory (`c`)
-        state of `LSTMCellState` objects, and don't try to apply dropout to
-        `TensorArray` objects:
-        ```
+        state of `LSTMCellState` objects, and never on `TensorArray` objects:
+        ```
         def dropout_state_filter_visitor(s):
-          if isinstance(s, LSTMCellState):
-            # Never perform dropout on the c state.
-            return LSTMCellState(c=False, h=True)
-          elif isinstance(s, TensorArray):
-            return False
-          return True
-        ```
+          if isinstance(s, LSTMCellState): return LSTMCellState(c=False, h=True)
+          return not isinstance(s, TensorArray)
+        ```
 
     Raises:
       TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is provided
@@ -1260,24 +1272,26 @@
     super(DropoutWrapperBase, self).__init__(cell)
     assert_like_rnncell("cell", cell)
 
-    if (dropout_state_filter_visitor is not None
-        and not callable(dropout_state_filter_visitor)):
+    if (dropout_state_filter_visitor is not None and
+        not callable(dropout_state_filter_visitor)):
       raise TypeError("dropout_state_filter_visitor must be callable")
     self._dropout_state_filter = (
         dropout_state_filter_visitor or _default_dropout_state_filter_visitor)
     with ops.name_scope("DropoutWrapperInit"):
+
       def tensor_and_const_value(v):
         tensor_value = ops.convert_to_tensor(v)
         const_value = tensor_util.constant_value(tensor_value)
         return (tensor_value, const_value)
+
       for prob, attr in [(input_keep_prob, "input_keep_prob"),
                          (state_keep_prob, "state_keep_prob"),
                          (output_keep_prob, "output_keep_prob")]:
         tensor_prob, const_prob = tensor_and_const_value(prob)
         if const_prob is not None:
           if const_prob < 0 or const_prob > 1:
-            raise ValueError("Parameter %s must be between 0 and 1: %d"
-                             % (attr, const_prob))
+            raise ValueError("Parameter %s must be between 0 and 1: %d" %
+                             (attr, const_prob))
           setattr(self, "_%s" % attr, float(const_prob))
         else:
           setattr(self, "_%s" % attr, tensor_prob)
@@ -1299,8 +1313,7 @@
         # Prepend a 1 for the batch dimension; for recurrent
         # variational dropout we use the same dropout mask for all
         # batch elements.
-        return array_ops.concat(
-            ([1], tensor_shape.TensorShape(s).as_list()), 0)
+        return array_ops.concat(([1], tensor_shape.TensorShape(s).as_list()), 0)
 
       def batch_noise(s, inner_seed):
         shape = convert_to_batch_shape(s)
@@ -1360,7 +1373,11 @@
     ret.set_shape(value.get_shape())
     return ret
 
-  def _dropout(self, values, salt_prefix, recurrent_noise, keep_prob,
+  def _dropout(self,
+               values,
+               salt_prefix,
+               recurrent_noise,
+               keep_prob,
                shallow_filtered_substructure=None):
     """Decides whether to perform standard dropout or recurrent dropout."""
 
@@ -1370,21 +1387,25 @@
       shallow_filtered_substructure = values
 
     if not self._variational_recurrent:
+
       def dropout(i, do_dropout, v):
         if not isinstance(do_dropout, bool) or do_dropout:
           return nn_ops.dropout_v2(
               v, rate=1. - keep_prob, seed=self._gen_seed(salt_prefix, i))
         else:
           return v
+
       return _enumerated_map_structure_up_to(
           shallow_filtered_substructure, dropout,
           *[shallow_filtered_substructure, values])
     else:
+
       def dropout(i, do_dropout, v, n):
         if not isinstance(do_dropout, bool) or do_dropout:
           return self._variational_recurrent_dropout_value(i, v, n, keep_prob)
         else:
           return v
+
       return _enumerated_map_structure_up_to(
           shallow_filtered_substructure, dropout,
           *[shallow_filtered_substructure, values, recurrent_noise])
@@ -1405,12 +1426,12 @@
       - Output: A tensor with cell's output.
       - New state: A tensor or tuple of tensors with new wrapped cell's state.
     """
+
     def _should_dropout(p):
       return (not isinstance(p, float)) or p < 1
 
     if _should_dropout(self._input_keep_prob):
-      inputs = self._dropout(inputs, "input",
-                             self._recurrent_input_noise,
+      inputs = self._dropout(inputs, "input", self._recurrent_input_noise,
                              self._input_keep_prob)
     output, new_state = cell_call_fn(inputs, state, **kwargs)
     if _should_dropout(self._state_keep_prob):
@@ -1418,13 +1439,11 @@
       # which ones to keep.
       shallow_filtered_substructure = nest.get_traverse_shallow_structure(
           self._dropout_state_filter, new_state)
-      new_state = self._dropout(new_state, "state",
-                                self._recurrent_state_noise,
+      new_state = self._dropout(new_state, "state", self._recurrent_state_noise,
                                 self._state_keep_prob,
                                 shallow_filtered_substructure)
     if _should_dropout(self._output_keep_prob):
-      output = self._dropout(output, "output",
-                             self._recurrent_output_noise,
+      output = self._dropout(output, "output", self._recurrent_output_noise,
                              self._output_keep_prob)
     return output, new_state
 
@@ -1460,7 +1479,7 @@
       residual_fn: (Optional) The function to map raw cell inputs and raw cell
         outputs to the actual cell outputs of the residual network.
         Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
-        and outputs.
+          and outputs.
     """
     super(ResidualWrapperBase, self).__init__(cell)
     self._residual_fn = residual_fn
@@ -1495,13 +1514,16 @@
       ValueError: If cell inputs and outputs have different structure (value).
     """
     outputs, new_state = cell_call_fn(inputs, state, **kwargs)
+
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
+
     def default_residual_fn(inputs, outputs):
       nest.assert_same_structure(inputs, outputs)
       nest.map_structure(assert_shape_match, inputs, outputs)
       return nest.map_structure(lambda inp, out: inp + out, inputs, outputs)
+
     res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs)
     return (res_outputs, new_state)
 
@@ -1593,17 +1615,16 @@
   """
 
   @deprecated(None, "This class is equivalent as "
-                    "tf.keras.layers.StackedRNNCells, and will be replaced by "
-                    "that in Tensorflow 2.0.")
+              "tf.keras.layers.StackedRNNCells, and will be replaced by "
+              "that in Tensorflow 2.0.")
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
 
     Args:
       cells: list of RNNCells that will be composed in this order.
       state_is_tuple: If True, accepted and returned states are n-tuples, where
-        `n = len(cells)`.  If False, the states are all
-        concatenated along the column axis.  This latter behavior will soon be
-        deprecated.
+        `n = len(cells)`.  If False, the states are all concatenated along the
+        column axis.  This latter behavior will soon be deprecated.
 
     Raises:
       ValueError: if cells is empty (not allowed), or at least one of the cells
@@ -1613,13 +1634,12 @@
     if not cells:
       raise ValueError("Must specify at least one cell for MultiRNNCell.")
     if not nest.is_sequence(cells):
-      raise TypeError(
-          "cells must be a list or tuple, but saw: %s." % cells)
+      raise TypeError("cells must be a list or tuple, but saw: %s." % cells)
 
     if len(set([id(cell) for cell in cells])) < len(cells):
-      logging.log_first_n(logging.WARN,
-                          "At least two cells provided to MultiRNNCell "
-                          "are the same object and will share weights.", 1)
+      logging.log_first_n(
+          logging.WARN, "At least two cells provided to MultiRNNCell "
+          "are the same object and will share weights.", 1)
 
     self._cells = cells
     for cell_number, cell in enumerate(self._cells):
@@ -1632,8 +1652,8 @@
     if not state_is_tuple:
       if any(nest.is_sequence(c.state_size) for c in self._cells):
         raise ValueError("Some cells return tuples of states, but the flag "
-                         "state_is_tuple is not set.  State sizes are: %s"
-                         % str([c.state_size for c in self._cells]))
+                         "state_is_tuple is not set.  State sizes are: %s" %
+                         str([c.state_size for c in self._cells]))
 
   @property
   def state_size(self):
@@ -1699,8 +1719,9 @@
         cur_inp, new_state = cell(cur_inp, cur_state)
         new_states.append(new_state)
 
-    new_states = (tuple(new_states) if self._state_is_tuple else
-                  array_ops.concat(new_states, 1))
+    new_states = (
+        tuple(new_states) if self._state_is_tuple else array_ops.concat(
+            new_states, 1))
 
     return cur_inp, new_states
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index d352666..bb41559 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -56,7 +56,7 @@
     Args:
       func: The function to wrap.
       Tout: A list of datatypes for the output; an empty list if the output is
-            None.
+        None.
       is_grad_func: Whether this EagerFunc is the gradient of another
         EagerPyFunc.
     """
@@ -229,6 +229,7 @@
       self._unique_id += 1
     return "pyfunc_%d" % uid
 
+
 # Global registry for py functions.
 _py_funcs = FuncRegistry()
 
@@ -326,15 +327,15 @@
     if tf.abs(x) <= m:
       return x**2
     else:
-      return m**2 * (1 - 2 * tf.log(m) + tf.log(x**2))
+      return m**2 * (1 - 2 * tf.math.log(m) + tf.math.log(x**2))
 
-  x = tf.placeholder(tf.float32)
-  m = tf.placeholder(tf.float32)
+  x = tf.compat.v1.placeholder(tf.float32)
+  m = tf.compat.v1.placeholder(tf.float32)
 
   y = tf.py_function(func=log_huber, inp=[x, m], Tout=tf.float32)
   dy_dx = tf.gradients(y, x)[0]
 
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
     # The session executes `log_huber` eagerly. Given the feed values below,
     # it will take the first branch, so `y` evaluates to 1.0 and
     # `dy_dx` evaluates to 2.0.
@@ -350,15 +351,16 @@
   For more information on eager execution, see the
   [Eager guide](https://tensorflow.org/guide/eager).
 
-  `tf.py_function` is similar in spirit to `tf.py_func`, but unlike
+  `tf.py_function` is similar in spirit to `tf.compat.v1.py_func`, but unlike
   the latter, the former lets you use TensorFlow operations in the wrapped
-  Python function. In particular, while `tf.py_func` only runs on CPUs and
+  Python function. In particular, while `tf.compat.v1.py_func` only runs on CPUs
+  and
   wraps functions that take NumPy arrays as inputs and return NumPy arrays as
   outputs, `tf.py_function` can be placed on GPUs and wraps functions
   that take Tensors as inputs, execute TensorFlow operations in their bodies,
   and return Tensors as outputs.
 
-  Like `tf.py_func`, `tf.py_function` has the following limitations
+  Like `tf.compat.v1.py_func`, `tf.py_function` has the following limitations
   with respect to serialization and distribution:
 
   * The body of the function (i.e. `func`) will not be serialized in a
@@ -367,17 +369,16 @@
 
   * The operation must run in the same address space as the Python program
     that calls `tf.py_function()`. If you are using distributed
-    TensorFlow, you must run a `tf.train.Server` in the same process as the
+    TensorFlow, you must run a `tf.distribute.Server` in the same process as the
     program that calls `tf.py_function()` and you must pin the created
     operation to a device in that server (e.g. using `with tf.device():`).
 
 
   Args:
-    func: A Python function which accepts a list of `Tensor` objects
-      having element types that match the corresponding `tf.Tensor` objects
-      in `inp` and returns a list of `Tensor` objects (or a single
-      `Tensor`, or `None`) having element types that match the
-      corresponding values in `Tout`.
+    func: A Python function which accepts a list of `Tensor` objects having
+      element types that match the corresponding `tf.Tensor` objects in `inp`
+      and returns a list of `Tensor` objects (or a single `Tensor`, or `None`)
+      having element types that match the corresponding values in `Tout`.
     inp: A list of `Tensor` objects.
     Tout: A list or tuple of tensorflow data types or a single tensorflow data
       type if there is only one, indicating what `func` returns; an empty list
@@ -404,43 +405,44 @@
   def my_func(x):
     # x will be a numpy array with the contents of the placeholder below
     return np.sinh(x)
-  input = tf.placeholder(tf.float32)
-  y = tf.py_func(my_func, [input], tf.float32)
+  input = tf.compat.v1.placeholder(tf.float32)
+  y = tf.compat.v1.py_func(my_func, [input], tf.float32)
   ```
 
-  **N.B.** The `tf.py_func()` operation has the following known limitations:
+  **N.B.** The `tf.compat.v1.py_func()` operation has the following known
+  limitations:
 
   * The body of the function (i.e. `func`) will not be serialized in a
     `GraphDef`. Therefore, you should not use this function if you need to
     serialize your model and restore it in a different environment.
 
   * The operation must run in the same address space as the Python program
-    that calls `tf.py_func()`. If you are using distributed TensorFlow, you
-    must run a `tf.train.Server` in the same process as the program that calls
-    `tf.py_func()` and you must pin the created operation to a device in that
+    that calls `tf.compat.v1.py_func()`. If you are using distributed
+    TensorFlow, you
+    must run a `tf.distribute.Server` in the same process as the program that
+    calls
+    `tf.compat.v1.py_func()` and you must pin the created operation to a device
+    in that
     server (e.g. using `with tf.device():`).
 
   Args:
     func: A Python function, which accepts `ndarray` objects as arguments and
       returns a list of `ndarray` objects (or a single `ndarray`). This function
       must accept as many arguments as there are tensors in `inp`, and these
-      argument types will match the corresponding `tf.Tensor` objects
-      in `inp`. The returns `ndarray`s must match the number and types defined
-      `Tout`.
+      argument types will match the corresponding `tf.Tensor` objects in `inp`.
+      The returned `ndarray`s must match the number and types defined in `Tout`.
       Important Note: Input and output numpy `ndarray`s of `func` are not
-      guaranteed to be copies. In some cases their underlying memory will be
-      shared with the corresponding TensorFlow tensors.
-      In-place modification or storing `func` input or return values in
-      python datastructures without explicit (np.)copy
-      can have non-deterministic consequences.
+      guaranteed to be copies. In some cases their underlying memory will be
+      shared with the corresponding TensorFlow tensors. In-place modification
+      or storing `func` input or return values in Python data structures
+      without explicit (np.)copy can have non-deterministic consequences.
     inp: A list of `Tensor` objects.
     Tout: A list or tuple of tensorflow data types or a single tensorflow data
       type if there is only one, indicating what `func` returns.
-    stateful: (Boolean.) If True, the function should be considered stateful.
-      If a function is stateless, when given the same input it will return the
-      same output and have no observable side effects. Optimizations such as
-      common subexpression elimination are only performed on stateless
-      operations.
+    stateful: (Boolean.) If True, the function should be considered stateful. If
+      a function is stateless, when given the same input it will return the same
+      output and have no observable side effects. Optimizations such as common
+      subexpression elimination are only performed on stateless operations.
     name: A name for the operation (optional).
 
   Returns:
@@ -489,6 +491,5 @@
 numpy_function.__doc__ = py_func_common.__doc__.replace("py_func",
                                                         "numpy_function")
 
-
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index c6cf2fe..733d30c 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -162,10 +162,10 @@
 
   ```python
   c = tf.multiply(a, b)
-  h = tf.get_session_handle(c)
+  h = tf.compat.v1.get_session_handle(c)
   h = sess.run(h)
 
-  p, a = tf.get_session_tensor(h.handle, tf.float32)
+  p, a = tf.compat.v1.get_session_tensor(h.handle, tf.float32)
   b = tf.multiply(a, 10)
   c = sess.run(b, feed_dict={p: h.handle})
   ```
@@ -203,10 +203,10 @@
 
   ```python
   c = tf.multiply(a, b)
-  h = tf.get_session_handle(c)
+  h = tf.compat.v1.get_session_handle(c)
   h = sess.run(h)
 
-  p, a = tf.get_session_tensor(h.handle, tf.float32)
+  p, a = tf.compat.v1.get_session_tensor(h.handle, tf.float32)
   b = tf.multiply(a, 10)
   c = sess.run(b, feed_dict={p: h.handle})
   ```
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index ee9c9b6..a5a64e3 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -169,8 +169,8 @@
     ])
     b = tf.SparseTensor(list(b.keys()), list(b.values()), dense_shape=[2, 2, 4])
 
-    # `tf.sets.set_intersection` is applied to each aligned pair of sets.
-    tf.sets.set_intersection(a, b)
+    # `tf.sets.intersection` is applied to each aligned pair of sets.
+    tf.sets.intersection(a, b)
 
     # The result will be equivalent to either of:
     #
@@ -202,7 +202,7 @@
 
 
 @tf_export(
-	   "sets.difference", v1=["sets.difference", "sets.set_difference"])
+    "sets.difference", v1=["sets.difference", "sets.set_difference"])
 def set_difference(a, b, aminusb=True, validate_indices=True):
   """Compute set difference of elements in last dimension of `a` and `b`.
 
@@ -241,7 +241,7 @@
     b = tf.SparseTensor(list(b.keys()), list(b.values()), dense_shape=[2, 2, 4])
 
     # `set_difference` is applied to each aligned pair of sets.
-    tf.sets.set_difference(a, b)
+    tf.sets.difference(a, b)
 
     # The result will be equivalent to either of:
     #
@@ -274,7 +274,7 @@
 
 
 @tf_export(
-	   "sets.union", v1=["sets.union", "sets.set_union"])
+    "sets.union", v1=["sets.union", "sets.set_union"])
 def set_union(a, b, validate_indices=True):
   """Compute set union of elements in last dimension of `a` and `b`.
 
@@ -312,7 +312,7 @@
     b = tf.SparseTensor(list(b.keys()), list(b.values()), dense_shape=[2, 2, 4])
 
     # `set_union` is applied to each aligned pair of sets.
-    tf.sets.set_union(a, b)
+    tf.sets.union(a, b)
 
     # The result will be a equivalent to either of:
     #
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py
index 617ad62..032afae 100644
--- a/tensorflow/python/ops/signal/dct_ops.py
+++ b/tensorflow/python/ops/signal/dct_ops.py
@@ -54,11 +54,11 @@
   """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
 
   Currently only Types I, II and III are supported.
-  Type I is implemented using a length `2N` padded `tf.spectral.rfft`.
-  Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as
+  Type I is implemented using a length `2N` padded `tf.signal.rfft`.
+  Type II is implemented using a length `2N` padded `tf.signal.rfft`, as
   described here: [Type 2 DCT using 2N FFT padded (Makhoul)](https://dsp.stackexchange.com/a/10606).
   Type III is a fairly straightforward inverse of Type II
-  (i.e. using a length `2N` padded `tf.spectral.irfft`).
+  (i.e. using a length `2N` padded `tf.signal.irfft`).
 
   @compatibility(scipy)
   Equivalent to [scipy.fftpack.dct](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html)
diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py
index 675d60ee..b1465ff 100644
--- a/tensorflow/python/ops/signal/mfcc_ops.py
+++ b/tensorflow/python/ops/signal/mfcc_ops.py
@@ -47,7 +47,7 @@
   ```python
   sample_rate = 16000.0
   # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
-  pcm = tf.placeholder(tf.float32, [None, None])
+  pcm = tf.compat.v1.placeholder(tf.float32, [None, None])
 
   # A 1024-point STFT with frames of 64 ms and 75% overlap.
   stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
@@ -66,7 +66,7 @@
     linear_to_mel_weight_matrix.shape[-1:]))
 
   # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
-  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
+  log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
 
   # Compute MFCCs from log_mel_spectrograms and take the first 13.
   mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py
index ae9c2ef..e76ce18 100644
--- a/tensorflow/python/ops/signal/shape_ops.py
+++ b/tensorflow/python/ops/signal/shape_ops.py
@@ -69,7 +69,7 @@
   For example:
 
   ```python
-  pcm = tf.placeholder(tf.float32, [None, 9152])
+  pcm = tf.compat.v1.placeholder(tf.float32, [None, 9152])
   frames = tf.signal.frame(pcm, 512, 180)
   magspec = tf.abs(tf.signal.rfft(frames, [512]))
   image = tf.expand_dims(magspec, 3)
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py
index ba1709b..59ea2ae 100644
--- a/tensorflow/python/ops/signal/spectral_ops.py
+++ b/tensorflow/python/ops/signal/spectral_ops.py
@@ -171,7 +171,7 @@
   ```python
   frame_length = 400
   frame_step = 160
-  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  waveform = tf.compat.v1.placeholder(dtype=tf.float32, shape=[1000])
   stft = tf.signal.stft(waveform, frame_length, frame_step)
   inverse_stft = tf.signal.inverse_stft(
       stft, frame_length, frame_step,
@@ -185,7 +185,7 @@
   frame_length = 400
   frame_step = 160
   window_fn = functools.partial(window_ops.hamming_window, periodic=True),
-  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  waveform = tf.compat.v1.placeholder(dtype=tf.float32, shape=[1000])
   stft = tf.signal.stft(
       waveform, frame_length, frame_step, window_fn=window_fn)
   inverse_stft = tf.signal.inverse_stft(
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 3af5516..ab7df72 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -373,10 +373,10 @@
 
   Then,
 
-      * `thresh == 0` (the default): all 5 index/value pairs will be returned.
-      * `thresh == 0.11`: only .1 and 0 will vanish, and the remaining three
-          index/value pairs will be returned.
-      * `thresh == 0.21`: .1, 0, and -.2 will vanish.
+  * `thresh == 0` (the default): all 5 index/value pairs will be returned.
+  * `thresh == 0.11`: only .1 and 0 will vanish, and the remaining three
+      index/value pairs will be returned.
+  * `thresh == 0.21`: .1, 0, and -.2 will vanish.
 
   Args:
     a: The first operand; `SparseTensor` or `Tensor`.
@@ -434,11 +434,11 @@
 
   Then,
 
-      * `threshold == 0` (the default): all 5 index/value pairs will be
-          returned.
-      * `threshold == 0.11`: only .1 and 0 will vanish, and the remaining three
-          index/value pairs will be returned.
-      * `threshold == 0.21`: .1, 0, and -.2 will vanish.
+  * `threshold == 0` (the default): all 5 index/value pairs will be
+      returned.
+  * `threshold == 0.11`: only .1 and 0 will vanish, and the remaining three
+      index/value pairs will be returned.
+  * `threshold == 0.21`: .1, 0, and -.2 will vanish.
 
   Args:
     a: The first operand; `SparseTensor` or `Tensor`.
@@ -1508,7 +1508,7 @@
     sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values,
                                            sp_input.dense_shape)
 
-    sp_new = sparse_merge(sp_input, sp_values, vocab_size, name)
+    sp_new = sparse_merge_impl(sp_input, sp_values, vocab_size, name)
 
     # validate_indices may be False because we allow duplicates in new_indices:
     # repeated indices are allowed when creating an indicator matrix.
@@ -1611,6 +1611,15 @@
       `vocab_size` is not a or list thereof and `sp_ids` is a list.
     ValueError: If `sp_ids` and `vocab_size` are lists of different lengths.
   """
+  return sparse_merge_impl(sp_ids, sp_values, vocab_size, name, already_sorted)
+
+
+def sparse_merge_impl(sp_ids,
+                      sp_values,
+                      vocab_size,
+                      name=None,
+                      already_sorted=False):
+  """Internal implementation for sparse_merge to avoid deprecation warnings."""
   if isinstance(sp_ids, sparse_tensor.SparseTensorValue) or isinstance(
       sp_ids, sparse_tensor.SparseTensor):
     sp_ids = [sp_ids]
@@ -2384,7 +2393,7 @@
   values = np.asarray([[[0., np.e], [1., 0.]], [[np.e, 0.], [np.e, np.e]]])
   indices = np.vstack(np.where(values)).astype(np.int64).T
 
-  result = tf.sparse_softmax(tf.SparseTensor(indices, values, shape))
+  result = tf.sparse.softmax(tf.SparseTensor(indices, values, shape))
   # ...returning a 3-D SparseTensor, equivalent to:
   # [?   1.]     [1    ?]
   # [1.  ? ] and [.5  .5]
@@ -2416,7 +2425,7 @@
   ```python
   sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
   sp_one = sparse_tensor.SparseTensor([[1]], [1], [7])
-  res = tf.sparse_maximum(sp_zero, sp_one).eval()
+  res = tf.sparse.maximum(sp_zero, sp_one).eval()
   # "res" should be equal to SparseTensor([[0], [1]], [0, 1], [7]).
   ```
 
@@ -2454,7 +2463,7 @@
   ```python
   sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
   sp_one = sparse_tensor.SparseTensor([[1]], [1], [7])
-  res = tf.sparse_minimum(sp_zero, sp_one).eval()
+  res = tf.sparse.minimum(sp_zero, sp_one).eval()
   # "res" should be equal to SparseTensor([[0], [1]], [0, 0], [7]).
   ```
 
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 3048e49..6b47f7e 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -26,8 +26,10 @@
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.compiler.tf2xla.ops import gen_xla_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
@@ -132,6 +134,39 @@
     return math_ops.exp(math_ops.abs(x)) * math_ops.bessel_i1e(x)
 
 
+@ops.RegisterGradient('XlaEinsum')
+def _einsum_grad(op, grad):
+  equation = op.get_attr('equation')
+  if isinstance(equation, bytes):
+    equation = equation.decode()
+
+  inputs, output = equation.split('->')
+  left, right = inputs.split(',')
+
+  return [
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[1],
+          equation='{},{}->{}'.format(output, right, left),
+          name=None),
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[0],
+          equation='{},{}->{}'.format(output, left, right),
+          name=None)
+  ]
+
+
+def _enclosing_tpu_context():
+  # pylint: disable=protected-access
+  context = ops.get_default_graph()._get_control_flow_context()
+  # pylint: enable=protected-access
+  while context is not None and not isinstance(
+      context, control_flow_ops.XLAControlFlowContext):
+    context = context.outer_context
+  return context
+
+
 @tf_export('einsum', 'linalg.einsum')
 def einsum(equation, *inputs, **kwargs):
   """A generalized contraction between tensors of arbitrary dimension.
@@ -240,6 +275,12 @@
             ' because index "%s" is summed over more than two inputs.', a)
         return _exponential_space_einsum(equation, *inputs)
 
+    # Use xla_einsum if executing on TPU and if the operation is a 2 input
+    # einsum supported by XlaEinsumOp.
+    if _enclosing_tpu_context() is not None and len(inputs) == 2:
+      return gen_xla_ops.xla_einsum(
+          inputs[0], inputs[1], input_axis_labels[0] + ',' +
+          input_axis_labels[1] + '->' + output_axis_labels)
     temp = inputs[0]
     temp_axis_labels = input_axis_labels[0]
     for i in xrange(len(inputs) - 1):
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index ce75de7..f007e1f 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -105,6 +105,8 @@
 from tensorflow.python.ops.tensor_array_ops import *
 from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
+from tensorflow.python.ops.parallel_for.control_flow_ops import vectorized_map
+
 # pylint: enable=wildcard-import
 # pylint: enable=g-bad-import-order
 
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index c922326..7ee7e4e 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -332,8 +332,8 @@
       ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
       indices = tf.constant([[4], [3], [1] ,[7]])
       updates = tf.constant([9, 10, 11, 12])
-      update = tf.scatter_nd_update(ref, indices, updates)
-      with tf.Session() as sess:
+      update = tf.compat.v1.scatter_nd_update(ref, indices, updates)
+      with tf.compat.v1.Session() as sess:
         print sess.run(update)
   ```
 
@@ -446,8 +446,8 @@
   ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
   indices = tf.constant([[4], [3], [1], [7]])
   updates = tf.constant([9, 10, 11, 12])
-  add = tf.scatter_nd_add(ref, indices, updates)
-  with tf.Session() as sess:
+  add = tf.compat.v1.scatter_nd_add(ref, indices, updates)
+  with tf.compat.v1.Session() as sess:
     print sess.run(add)
   ```
 
@@ -563,8 +563,8 @@
   ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
   indices = tf.constant([[4], [3], [1] ,[7]])
   updates = tf.constant([9, 10, 11, 12])
-  op = tf.scatter_nd_sub(ref, indices, updates)
-  with tf.Session() as sess:
+  op = tf.compat.v1.scatter_nd_sub(ref, indices, updates)
+  with tf.compat.v1.Session() as sess:
     print sess.run(op)
   ```
 
@@ -819,7 +819,7 @@
 @deprecation.deprecated(
     "2018-11-29", "Use the batch_scatter_update method of Variable instead.")
 def batch_scatter_update(ref, indices, updates, use_locking=True, name=None):
-  """Generalization of `tf.scatter_update` to axis different than 0.
+  """Generalization of `tf.compat.v1.scatter_update` to axis different than 0.
 
   Analogous to `batch_gather`. This assumes that `ref`, `indices` and `updates`
   have a series of leading dimensions that are the same for all of them, and the
@@ -841,18 +841,19 @@
   `var[i_1, ..., i_n, indices[i_1, ..., i_n, j]] = updates[i_1, ..., i_n, j]`
 
   When indices is a 1D tensor, this operation is equivalent to
-  `tf.scatter_update`.
+  `tf.compat.v1.scatter_update`.
 
   To avoid this operation there would be 2 alternatives:
   1) Reshaping the variable by merging the first `ndims` dimensions. However,
      this is not possible because `tf.reshape` returns a Tensor, which we
-     cannot use `tf.scatter_update` on.
+     cannot use `tf.compat.v1.scatter_update` on.
   2) Looping over the first `ndims` of the variable and using
-     `tf.scatter_update` on the subtensors that result of slicing the first
+     `tf.compat.v1.scatter_update` on the subtensors that result from slicing
+     the first
      dimension. This is a valid option for `ndims = 1`, but less efficient than
      this implementation.
 
-  See also `tf.scatter_update` and `tf.scatter_nd_update`.
+  See also `tf.compat.v1.scatter_update` and `tf.compat.v1.scatter_nd_update`.
 
   Args:
     ref: `Variable` to scatter onto.
@@ -887,7 +888,7 @@
     # coordinates we created with the original indices.
 
     # For example if indices.shape = [2, 3, 4], we should generate the following
-    # indices for tf.scatter_nd_update:
+    # indices for tf.compat.v1.scatter_nd_update:
     # nd_indices[:, :, 0] = [[0, 0, 0], [1, 1, 1]]
     # nd_indices[:, :, 1] = [[0, 1, 2], [0, 1, 2]]
     # nd_indices[:, :, 2] = indices
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
index 9f88842..29b77e7 100644
--- a/tensorflow/python/ops/stateful_random_ops.py
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -195,11 +195,15 @@
                                        dtype=SEED_TYPE)
       else:
         state = create_rng_state(seed, algorithm)
-      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._state_var = variables.Variable(state,
+                                           dtype=STATE_TYPE,
+                                           trainable=False)
       self._alg_var = algorithm
     else:
       assert seed is None
-      self._state_var = variables.Variable(copy_from.state, dtype=STATE_TYPE)
+      self._state_var = variables.Variable(copy_from.state,
+                                           dtype=STATE_TYPE,
+                                           trainable=False)
       self._alg_var = copy_from.algorithm
 
   def reset(self, seed):
@@ -225,6 +229,40 @@
     return gen_stateful_random_ops.stateful_standard_normal_v2(
         self.state.handle, self.algorithm, shape, dtype=dtype)
 
+  @property
+  def key(self):
+    """The 'key' part of the state of a counter-based RNG.
+
+    For a counter-based RNG algorithm such as Philox and ThreeFry (as
+    described in paper 'Parallel Random Numbers: As Easy as 1, 2, 3'
+    (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)),
+    the RNG state consists of two parts: counter and key. The output is
+    generated via the formula: output=hash(key, counter), i.e. a hashing of
+    the counter parametrized by the key. Two RNGs with two different keys can
+    be thought of as generating two independent random-number streams (a stream
+    is formed by increasing the counter).
+
+    Returns:
+      A scalar which is the 'key' part of the state, if the RNG algorithm is
+        counter-based; otherwise it raises a ValueError.
+    """
+    alg = self.algorithm
+    if alg == RNG_ALG_PHILOX or alg == RNG_ALG_THREEFRY:
+      return self._state_var[-1]
+    else:
+      raise ValueError("Unsupported algorithm id: %s" % alg)
+
+  def skip(self, delta):
+    """Advance the counter of a counter-based RNG.
+
+    Args:
+      delta: the amount of advancement. The state of the RNG after
+        `skip(n)` will be the same as that after `normal([n])`
+        (or any other distribution). The actual increment added to the
+        counter is an unspecified implementation detail.
+    """
+    gen_stateful_random_ops.rng_skip(self.state.handle, self.algorithm, delta)
+
   # The following functions return a tensor and as a side effect update
   # self._state_var.
   def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
index 852654e..2f4a7cf 100644
--- a/tensorflow/python/ops/stateful_random_ops_test.py
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -257,6 +257,28 @@
     compare(True, True)
     compare(True, False)
 
+  @test_util.run_v2_only
+  def testKey(self):
+    key = 1234
+    gen = random.Generator(seed=[0, 0, key])
+    got = gen.key
+    self.assertAllEqual(key, got)
+    @def_function.function
+    def f():
+      return gen.key
+    got = f()
+    self.assertAllEqual(key, got)
+
+  @test_util.run_v2_only
+  def testSkip(self):
+    key = 1234
+    counter = 5678
+    gen = random.Generator(seed=[counter, 0, key])
+    delta = 432
+    gen.skip(delta)
+    new_counter = gen._state_var[0]
+    self.assertAllEqual(counter + delta * 256, new_counter)
+
   def _sameAsOldRandomOps(self, device, floats):
     def compare(dtype, old, new):
       seed1, seed2 = 79, 25
@@ -331,6 +353,21 @@
     """
     self._sameAsOldRandomOps(test_util.gpu_device_name(), GPU_FLOATS)
 
+  @parameterized.parameters(INTS + [dtypes.uint32, dtypes.uint64])
+  @test_util.run_v2_only
+  @test_util.run_cuda_only
+  def testGPUEqualsCPU(self, dtype):
+    """Tests that GPU and CPU generate the same integer outputs."""
+    seed = 1234
+    shape = [315, 49]
+    with ops.device("/device:CPU:0"):
+      cpu = random.Generator(seed=seed).uniform_full_int(
+          shape=shape, dtype=dtype)
+    with ops.device(test_util.gpu_device_name()):
+      gpu = random.Generator(seed=seed).uniform_full_int(
+          shape=shape, dtype=dtype)
+    self.assertAllEqual(cpu, gpu)
+
   @parameterized.parameters(FLOATS + INTS)
   @test_util.run_v2_only
   def testUniformIsInRange(self, dtype):
@@ -392,10 +429,15 @@
     gen = random.Generator(seed=1234)
     with self.assertRaisesWithPredicateMatch(
         errors.InvalidArgumentError,
-        r"algorithm must be of shape \[\], not"):
+        r"must have shape \[\], not"):
       gen_stateful_random_ops.stateful_standard_normal_v2(
           gen.state.handle, [0, 0], shape)
     with self.assertRaisesWithPredicateMatch(
+        errors.InvalidArgumentError,
+        r"must have shape \[\], not"):
+      gen_stateful_random_ops.rng_skip(
+          gen.state.handle, gen.algorithm, [0, 0])
+    with self.assertRaisesWithPredicateMatch(
         TypeError, "Requested dtype: int64"):
       gen_stateful_random_ops.stateful_standard_normal_v2(
           gen.state.handle, 1.1, shape)
diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py
index 0576f6e..bd8a093 100644
--- a/tensorflow/python/ops/stateless_random_ops.py
+++ b/tensorflow/python/ops/stateless_random_ops.py
@@ -43,7 +43,7 @@
                              name=None):
   """Outputs deterministic pseudorandom values from a uniform distribution.
 
-  This is a stateless version of `tf.random_uniform`: if run twice with the
+  This is a stateless version of `tf.random.uniform`: if run twice with the
   same seeds, it will produce the same pseudorandom numbers.  The output is
   consistent across multiple runs on the same hardware (and between CPU
   and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
@@ -110,7 +110,7 @@
                             name=None):
   """Outputs deterministic pseudorandom values from a normal distribution.
 
-  This is a stateless version of `tf.random_normal`: if run twice with the
+  This is a stateless version of `tf.random.normal`: if run twice with the
   same seeds, it will produce the same pseudorandom numbers.  The output is
   consistent across multiple runs on the same hardware (and between CPU
   and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
@@ -147,7 +147,8 @@
                                name=None):
   """Outputs deterministic pseudorandom values, truncated normally distributed.
 
-  This is a stateless version of `tf.truncated_normal`: if run twice with the
+  This is a stateless version of `tf.random.truncated_normal`: if run twice with
+  the
   same seeds, it will produce the same pseudorandom numbers.  The output is
   consistent across multiple runs on the same hardware (and between CPU
   and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
@@ -190,7 +191,7 @@
                           name=None):
   """Draws deterministic pseudorandom samples from a multinomial distribution.
 
-  This is a stateless version of `tf.multinomial`: if run twice with the
+  This is a stateless version of `tf.random.categorical`: if run twice with the
   same seeds, it will produce the same pseudorandom numbers.  The output is
   consistent across multiple runs on the same hardware (and between CPU
   and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
@@ -201,8 +202,8 @@
   ```python
   # samples has shape [1, 5], where each value is either 0 or 1 with equal
   # probability.
-  samples = tf.random.stateless_multinomial(
-      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  samples = tf.random.stateless_categorical(
+      tf.math.log([[10., 10.]]), 5, seed=[7, 17])
   ```
 
   Args:
@@ -241,7 +242,7 @@
   # samples has shape [1, 5], where each value is either 0 or 1 with equal
   # probability.
   samples = tf.random.stateless_categorical(
-      tf.log([[10., 10.]]), 5, seed=[7, 17])
+      tf.math.log([[10., 10.]]), 5, seed=[7, 17])
   ```
 
   Args:
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 7c11bf9..c27d845 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -123,7 +123,7 @@
   Example:
     Formatting a single-tensor template:
     ```python
-    sess = tf.Session()
+    sess = tf.compat.v1.Session()
     with sess.as_default():
         tensor = tf.range(10)
         formatted = tf.strings.format("tensor: {}, suffix", tensor)
@@ -135,7 +135,7 @@
 
     Formatting a multi-tensor template:
     ```python
-    sess = tf.Session()
+    sess = tf.compat.v1.Session()
     with sess.as_default():
         tensor_one = tf.reshape(tf.range(100), [10, 10])
         tensor_two = tf.range(10)
@@ -459,7 +459,7 @@
 
   Note that the hash function may change from time to time.
   This functionality will be deprecated and it's recommended to use
-  `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+  `tf.strings.to_hash_bucket_fast()` or `tf.strings.to_hash_bucket_strong()`.
 
   Args:
     input: A `Tensor` of type `string`.
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 8393828..370f17c 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -304,11 +304,11 @@
   which can happen before or after this function is called.
 
   Args:
-    graph: A `tf.Graph` or `tf.GraphDef` to output to the writer.
+    graph: A `tf.Graph` or `tf.compat.v1.GraphDef` to output to the writer.
       This function will not write the default graph by default. When
       writing to an event log file, the associated step will be zero.
     session: So this method can call `tf.Session.run`. This defaults
-      to `tf.get_default_session`.
+      to `tf.compat.v1.get_default_session`.
 
   Raises:
     RuntimeError: If  the current thread has no default
@@ -510,13 +510,16 @@
   return constant_op.constant(False)
 
 
-def all_summary_ops():
-  """Graph-mode only. Returns all summary ops.
+@tf_export(v1=["summary.all_v2_summary_ops"])
+def all_v2_summary_ops():
+  """Returns all V2-style summary ops defined in the current default graph.
 
-  Please note this excludes `tf.summary.graph` ops.
+  This includes ops from TF 2.0 tf.summary and TF 1.x tf.contrib.summary (except
+  for `tf.contrib.summary.graph` and `tf.contrib.summary.import_event`), but
+  does *not* include TF 1.x tf.summary ops.
 
   Returns:
-    The summary ops.
+    List of summary ops, or None if called under eager execution.
   """
   if context.executing_eagerly():
     return None
@@ -638,8 +641,12 @@
         with ops.control_dependencies([write_summary_op]):
           return constant_op.constant(True)
 
-    return smart_cond.smart_cond(
-        _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+    with ops.device("cpu:0"):
+      op = smart_cond.smart_cond(
+          _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
+      return op
 
 
 @tf_export("summary.experimental.write_raw_pb", v1=[])
@@ -686,8 +693,12 @@
         with ops.control_dependencies([raw_summary_op]):
           return constant_op.constant(True)
 
-    return smart_cond.smart_cond(
-        _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+    with ops.device("cpu:0"):
+      op = smart_cond.smart_cond(
+          _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
+      return op
 
 
 def summary_writer_function(name, tensor, function, family=None):
@@ -756,7 +767,7 @@
       `int8`, `uint16`, `half`, `uint32`, `uint64`.
     family: Optional, the summary's family.
     step: The `int64` monotonic step variable, which defaults
-      to `tf.train.get_global_step`.
+      to `tf.compat.v1.train.get_global_step`.
 
   Returns:
     The created `tf.Operation` or a `tf.no_op` if summary writing has
@@ -836,14 +847,14 @@
   TensorBoard.
 
   When not using eager execution mode, the user should consider passing
-  the `graph` parameter to `tf.contrib.summary.initialize` instead of
+  the `graph` parameter to `tf.compat.v1.summary.initialize` instead of
   calling this function. Otherwise special care needs to be taken when
   using the graph to record the graph.
 
   Args:
     param: A `tf.Tensor` containing a serialized graph proto. When
       eager execution is enabled, this function will automatically
-      coerce `tf.Graph`, `tf.GraphDef`, and string types.
+      coerce `tf.Graph`, `tf.compat.v1.GraphDef`, and string types.
     step: The global step variable. This doesn't have useful semantics
       for graph summaries, but is used anyway, due to the structure of
       event log files. This defaults to the global step.
@@ -875,7 +886,7 @@
 
 
 def import_event(tensor, name=None):
-  """Writes a `tf.Event` binary proto.
+  """Writes a `tf.compat.v1.Event` binary proto.
 
   This can be used to import existing event logs into a new summary writer sink.
   Please note that this is lower level than the other summary functions and
@@ -883,7 +894,7 @@
 
   Args:
     tensor: A `tf.Tensor` of type `string` containing a serialized
-      `tf.Event` proto.
+      `tf.compat.v1.Event` proto.
     name: A name for the operation (optional).
 
   Returns:
@@ -1009,10 +1020,12 @@
   with summary_scope(name,
                      "graph_run_metadata_summary",
                      [data, step]) as (tag, _):
+    with ops.device("cpu:0"):
+      tensor = constant_op.constant(data.SerializeToString(),
+                                    dtype=dtypes.string)
     return write(
         tag=tag,
-        tensor=constant_op.constant(
-            data.SerializeToString(), dtype=dtypes.string),
+        tensor=tensor,
         step=step,
         metadata=summary_metadata)
 
@@ -1050,10 +1063,12 @@
   with summary_scope(name,
                      "graph_run_metadata_graph_summary",
                      [data, step]) as (tag, _):
+    with ops.device("cpu:0"):
+      tensor = constant_op.constant(data.SerializeToString(),
+                                    dtype=dtypes.string)
     return write(
         tag=tag,
-        tensor=constant_op.constant(
-            data.SerializeToString(), dtype=dtypes.string),
+        tensor=tensor,
         step=step,
         metadata=summary_metadata)
 
@@ -1096,9 +1111,11 @@
     return False
 
   with summary_scope(name, "graph_keras_model", [data, step]) as (tag, _):
+    with ops.device("cpu:0"):
+      tensor = constant_op.constant(json_string, dtype=dtypes.string)
     return write(
         tag=tag,
-        tensor=constant_op.constant(json_string, dtype=dtypes.string),
+        tensor=tensor,
         step=step,
         metadata=summary_metadata)
 
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 3ca9799..c13b9b8 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Provides templates which allow variable sharing."""
 from __future__ import absolute_import
 from __future__ import division
@@ -33,13 +32,16 @@
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
-
 __all__ = ["make_template"]
 
 
 @tf_export(v1=["make_template"])
-def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
-                  custom_getter_=None, **kwargs):
+def make_template(name_,
+                  func_,
+                  create_scope_now_=False,
+                  unique_name_=None,
+                  custom_getter_=None,
+                  **kwargs):
   """Given an arbitrary function, wrap it so that it does variable sharing.
 
   This wraps `func_` in a Template and partially evaluates it. Templates are
@@ -48,12 +50,14 @@
   have the following properties:
 
   * The function should create all trainable variables and any variables that
-     should be reused by calling `tf.get_variable`. If a trainable variable is
+     should be reused by calling `tf.compat.v1.get_variable`. If a trainable
+     variable is
      created using `tf.Variable`, then a ValueError will be thrown. Variables
      that are intended to be locals can be created by specifying
      `tf.Variable(..., trainable=false)`.
   * The function may use variable scopes and other templates internally to
-      create and reuse variables, but it shouldn't use `tf.global_variables` to
+      create and reuse variables, but it shouldn't use
+      `tf.compat.v1.global_variables` to
       capture variables that are defined outside of the scope of the function.
   * Internal scopes and variable names should not depend on any arguments that
       are not supplied to `make_template`. In general you will get a ValueError
@@ -67,12 +71,12 @@
 
   ```python
   def my_op(x, scalar_name):
-    var1 = tf.get_variable(scalar_name,
+    var1 = tf.compat.v1.get_variable(scalar_name,
                            shape=[],
-                           initializer=tf.constant_initializer(1))
+                           initializer=tf.compat.v1.constant_initializer(1))
     return x * var1
 
-  scale_by_y = tf.make_template('scale_by_y', my_op, scalar_name='y')
+  scale_by_y = tf.compat.v1.make_template('scale_by_y', my_op, scalar_name='y')
 
   z = scale_by_y(input1)
   w = scale_by_y(input2)
@@ -91,19 +95,21 @@
 
   ```python
   def my_op(x, scalar_name):
-    var1 = tf.get_variable(scalar_name,
+    var1 = tf.compat.v1.get_variable(scalar_name,
                            shape=[],
-                           initializer=tf.constant_initializer(1))
+                           initializer=tf.compat.v1.constant_initializer(1))
     return x * var1
 
-  with tf.variable_scope('scope') as vs:
-    scale_by_y = tf.make_template('scale_by_y', my_op, scalar_name='y')
+  with tf.compat.v1.variable_scope('scope') as vs:
+    scale_by_y = tf.compat.v1.make_template('scale_by_y', my_op,
+        scalar_name='y')
     z = scale_by_y(input1)
     w = scale_by_y(input2)
 
   # Creates a template that reuses the variables above.
-  with tf.variable_scope(vs, reuse=True):
-    scale_by_y2 = tf.make_template('scale_by_y', my_op, scalar_name='y')
+  with tf.compat.v1.variable_scope(vs, reuse=True):
+    scale_by_y2 = tf.compat.v1.make_template('scale_by_y', my_op,
+        scalar_name='y')
     z2 = scale_by_y2(input1)
     w2 = scale_by_y2(input2)
   ```
@@ -128,8 +134,8 @@
       template of the same scope/unique_name already exists and reuse is false,
       an error is raised. Defaults to None.
     custom_getter_: Optional custom getter for variables used in `func_`. See
-      the `tf.get_variable` `custom_getter` documentation for
-      more information.
+      the `tf.compat.v1.get_variable` `custom_getter` documentation for more
+      information.
     **kwargs: Keyword arguments to apply to `func_`.
 
   Returns:
@@ -176,16 +182,16 @@
       template of the same scope/unique_name already exists and reuse is false,
       an error is raised. Defaults to None. If executing eagerly, must be None.
     custom_getter_: Optional custom getter for variables used in `func_`. See
-      the `tf.get_variable` `custom_getter` documentation for
-      more information.
+      the `tf.compat.v1.get_variable` `custom_getter` documentation for more
+      information.
     create_graph_function_: When True, `func_` will be executed as a graph
       function. This implies that `func_` must satisfy the properties that
       `function.defun` requires of functions: See the documentation of
-      `function.defun` for details. When executing eagerly, setting this flag to
-      True can improve performance. Regardless of whether eager execution is
-      enabled, enabling this flag gives the caller access to graph-function
-      semantics, i.e., accesses to variables are totally ordered and
-      side-effecting ops are not pruned.
+      `function.defun` for details. When executing eagerly, setting this flag
+      to True can improve performance. Regardless of whether eager execution
+      is enabled, enabling this flag gives the caller access to graph-function
+      semantics, i.e., accesses to variables are totally ordered and
+      side-effecting ops are not pruned.
     **kwargs: Keyword arguments to apply to `func_`.
 
   Returns:
@@ -203,8 +209,8 @@
   """
 
   if kwargs:
-    func_ = tf_decorator.make_decorator(func_, functools.partial(
-        func_, **kwargs))
+    func_ = tf_decorator.make_decorator(func_,
+                                        functools.partial(func_, **kwargs))
   if context.executing_eagerly():
     if unique_name_ is not None:
       raise ValueError(
@@ -244,24 +250,29 @@
   call.
   """
 
-  def __init__(self, name, func, create_scope_now=False, unique_name=None,
-               custom_getter=None, create_graph_function=False):
+  def __init__(self,
+               name,
+               func,
+               create_scope_now=False,
+               unique_name=None,
+               custom_getter=None,
+               create_graph_function=False):
     """Creates a template for the given function.
 
     Args:
-      name: A name for the scope created by this template. The
-        name will be made unique by appending `_N` to the it (see how
-        `tf.variable_scope` treats the `default_name` for details).
+      name: A name for the scope created by this template. The name will be made
+        unique by appending `_N` to it (see how
+        `tf.compat.v1.variable_scope` treats the `default_name` for details).
       func: The function to apply each time.
       create_scope_now: Whether to create the scope at Template construction
         time, rather than first call. Defaults to false. Creating the scope at
         construction time may be more convenient if the template is to passed
-        through much lower level code, and you want to be sure of the scope
-        name without knowing exactly where it will be first called. If set to
-        True, the scope will be created in the constructor, and all subsequent
-        times in `__call__`, leading to a trailing numeral being added to the
-        names of all created Tensors. If set to False, the scope will be created
-        at the first call location.
+        through much lower level code, and you want to be sure of the scope name
+        without knowing exactly where it will be first called. If set to True,
+        the scope will be created in the constructor, and all subsequent times
+        in `__call__`, leading to a trailing numeral being added to the names of
+        all created Tensors. If set to False, the scope will be created at the
+        first call location.
       unique_name: When used, it overrides `name` and is not made unique. If a
         template of the same scope/unique_name already exists and reuse is
         false, an error is raised. Defaults to None.
@@ -330,10 +341,10 @@
         # so log it.
         variables = ops.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES)
         if vars_at_start != len(variables):
-          logging.info("New variables created when calling a template after "
-                       "the first time, perhaps you used tf.Variable when you "
-                       "meant tf.get_variable: %s",
-                       variables[vars_at_start:])
+          logging.info(
+              "New variables created when calling a template after "
+              "the first time, perhaps you used tf.Variable when you "
+              "meant tf.get_variable: %s", variables[vars_at_start:])
       elif self._first_call:
         self._first_call = False
         try:
@@ -356,8 +367,9 @@
         arg0 = ""
       else:
         arg0 = args[0]
-      trace = "".join(_skip_common_stack_elements(self._stacktrace,
-                                                  traceback.format_stack()))
+      trace = "".join(
+          _skip_common_stack_elements(self._stacktrace,
+                                      traceback.format_stack()))
       arg0 = "%s\n\noriginally defined at:\n%s" % (arg0, trace)
       new_args = [arg0]
       new_args.extend(args[1:])
@@ -407,8 +419,7 @@
 
   @property
   def variables(self):
-    """Returns the list of global and local variables created by the Template.
-    """
+    """Returns the list of global and local variables created by the Template."""
     return self.global_variables + self.local_variables
 
   @property
@@ -462,17 +473,16 @@
     return self.non_trainable_variables
 
   @property
-  @deprecated(
-      "2017-02-21", "The .var_scope property is deprecated. Please change your "
-      "code to use the .variable_scope property")
+  @deprecated("2017-02-21",
+              "The .var_scope property is deprecated. Please change your "
+              "code to use the .variable_scope property")
   def var_scope(self):
     """Returns the variable scope object created by this Template."""
     return self._variable_scope
 
 
 class _EagerTemplateVariableStore(object):
-  """Wrapper around EagerVariableStore to support nesting EagerTemplates.
-  """
+  """Wrapper around EagerVariableStore to support nesting EagerTemplates."""
 
   def __init__(self, variable_scope_name):
     self._variable_scope_name = variable_scope_name
@@ -537,24 +547,28 @@
   call.
   """
 
-  def __init__(self, name, func, create_scope_now=False, custom_getter=None,
+  def __init__(self,
+               name,
+               func,
+               create_scope_now=False,
+               custom_getter=None,
                create_graph_function=False):
     """Creates a template for the given function.
 
     Args:
-      name: A name for the scope created by this template. The
-        name will be made unique by appending `_N` to the it (see how
-        `tf.variable_scope` treats the `default_name` for details).
+      name: A name for the scope created by this template. The name will be made
+        unique by appending `_N` to it (see how
+        `tf.compat.v1.variable_scope` treats the `default_name` for details).
       func: The function to apply each time.
       create_scope_now: Whether to create the scope at Template construction
         time, rather than first call. Defaults to false. Creating the scope at
         construction time may be more convenient if the template is passed
-        through much lower level code, and you want to be sure of the scope
-        name without knowing exactly where it will be first called. If set to
-        True, the scope will be created in the constructor, and all subsequent
-        times in `__call__`, leading to a trailing numeral being added to the
-        names of all created Tensors. If set to False, the scope will be created
-        at the first call location.
+        through much lower level code, and you want to be sure of the scope name
+        without knowing exactly where it will be first called. If set to True,
+        the scope will be created in the constructor, and all subsequent times
+        in `__call__`, leading to a trailing numeral being added to the names of
+        all created Tensors. If set to False, the scope will be created at the
+        first call location.
       custom_getter: optional custom getter to pass to `variable_scope()`
       create_graph_function: When True, `func` will be executed as a graph
         function. Enabling this flag allows the caller to reap the performance
@@ -568,8 +582,7 @@
     if not context.executing_eagerly():
       raise RuntimeError(
           "{} objects can only be used when eager execution is enabled, use "
-          "tf.Template for graph construction".
-          format(type(self)))
+          "tf.Template for graph construction".format(type(self)))
     super(EagerTemplate, self).__init__(name, func, create_scope_now, None,
                                         custom_getter, create_graph_function)
     if self._variable_scope is not None:
@@ -601,21 +614,22 @@
         # If a variable that we intend to train is created as a side effect
         # of creating a template, then that is almost certainly an error.
         if len(trainable_at_start) != len(trainable_variables):
-          raise ValueError("Trainable variable created when calling a template "
-                           "after the first time, perhaps you used tf.Variable "
-                           "when you meant tf.get_variable: %s" %
-                           list(set(trainable_variables) -
-                                set(trainable_at_start)))
+          raise ValueError(
+              "Trainable variable created when calling a template "
+              "after the first time, perhaps you used tf.Variable "
+              "when you meant tf.get_variable: %s" %
+              list(set(trainable_variables) - set(trainable_at_start)))
 
         # Non-trainable tracking variables are a legitimate reason why a new
         # variable would be created, but it is a relatively advanced use-case,
         # so log it.
         variables = self._template_store.variables()
         if len(vars_at_start) != len(variables):
-          logging.info("New variables created when calling a template after "
-                       "the first time, perhaps you used tf.Variable when you "
-                       "meant tf.get_variable: %s",
-                       list(set(variables) - set(vars_at_start)))
+          logging.info(
+              "New variables created when calling a template after "
+              "the first time, perhaps you used tf.Variable when you "
+              "meant tf.get_variable: %s",
+              list(set(variables) - set(vars_at_start)))
       else:
         self._variables_created = True
       return result
@@ -627,8 +641,9 @@
         arg0 = ""
       else:
         arg0 = args[0]
-      trace = "".join(_skip_common_stack_elements(self._stacktrace,
-                                                  traceback.format_stack()))
+      trace = "".join(
+          _skip_common_stack_elements(self._stacktrace,
+                                      traceback.format_stack()))
       arg0 = "%s\n\noriginally defined at:\n%s" % (arg0, trace)
       new_args = [arg0]
       new_args.extend(args[1:])
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index d75f423..aeb7b54 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A class to store named variables and a scope operator to manage sharing."""
 
 from __future__ import absolute_import
@@ -55,15 +54,14 @@
 
 
 class _PartitionInfo(object):
-  """Holds partition info used by initializer functions.
-  """
+  """Holds partition info used by initializer functions."""
 
   def __init__(self, full_shape, var_offset):
     """Constructor.
 
     Args:
-      full_shape: Tuple or list of `int` indicating the full combined shape
-        of the partitioned variables.
+      full_shape: Tuple or list of `int` indicating the full combined shape of
+        the partitioned variables.
       var_offset: Tuple or list of `int` specifying offset of this partition
         with respect to the full variable for each dimension.
 
@@ -157,8 +155,9 @@
     if len(shape) != len(self.full_shape):
       raise ValueError(
           "Expected equal length, but received shape={} of length {} while "
-          "self.full_shape={} is of length {}.".format(shape, len(
-              shape), self.full_shape, len(self.full_shape)))
+          "self.full_shape={} is of length {}.".format(shape, len(shape),
+                                                       self.full_shape,
+                                                       len(self.full_shape)))
 
     for i in xrange(len(shape)):
       if self.var_offset[i] + shape[i] > self.full_shape[i]:
@@ -206,7 +205,6 @@
 it does exist, simply return it.
 """
 
-
 _DEFAULT_USE_RESOURCE = tf2.enabled()
 
 
@@ -270,8 +268,8 @@
   variables are initialized with the initializer passed to __init__.
 
   Attributes:
-    vars: a dictionary with string names (same as passed in GetVar) as keys
-          and the corresponding TensorFlow Variables as values.
+    vars: a dictionary with string names (same as passed in GetVar) as keys and
+      the corresponding TensorFlow Variables as values.
   """
 
   def __init__(self):
@@ -304,7 +302,7 @@
 
     Set `reuse` to `True` when you only want to reuse existing Variables.
     Set `reuse` to `False` when you only want to create new Variables.
-    Set `reuse` to None (the default) or tf.AUTO_REUSE when you want
+    Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you want
     variables to be created if they don't exist or returned if they do.
 
     If initializer is `None` (the default), the default initializer passed in
@@ -324,16 +322,15 @@
       shape: Shape of the new or existing variable.
       dtype: Type of the new or existing variable (defaults to `DT_FLOAT`).
       initializer: Initializer for the variable.
-      regularizer: A (Tensor -> Tensor or None) function; the result of
-        applying it on a newly created variable will be added to the collection
+      regularizer: A (Tensor -> Tensor or None) function; the result of applying
+        it on a newly created variable will be added to the collection
         GraphKeys.REGULARIZATION_LOSSES and can be used for regularization.
-      reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation
-        of variables. When eager execution is enabled  this argument is always
+      reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation of
+        variables. When eager execution is enabled this argument is always
         forced to be False.
       trainable: If `True` also add the variable to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-        `trainable` defaults to `True` unless `synchronization` is
-        set to `ON_READ`.
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable`
+        defaults to `True` unless `synchronization` is set to `ON_READ`.
       collections: List of graph collections keys to add the `Variable` to.
         Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`).
       caching_device: Optional device string or function describing where the
@@ -349,34 +346,33 @@
         must be known.
       use_resource: If False, creates a regular Variable. If True, creates
         instead an experimental ResourceVariable which has well-defined
-        semantics. Defaults to False (will later change to True).
-        When eager execution is enabled this argument is always forced to be
-        true.
+        semantics. Defaults to False (will later change to True). When eager
+        execution is enabled this argument is always forced to be true.
       custom_getter: Callable that takes as a first argument the true getter,
-        and allows overwriting the internal get_variable method.
-        The signature of `custom_getter` should match that of this method,
-        but the most future-proof version will allow for changes:
-        `def custom_getter(getter, *args, **kwargs)`.  Direct access to
-        all `get_variable` parameters is also allowed:
+        and allows overwriting the internal get_variable method. The signature
+        of `custom_getter` should match that of this method, but the most
+        future-proof version will allow for changes:
+        `def custom_getter(getter, *args, **kwargs)`.  Direct access to all
+        `get_variable` parameters is also allowed:
         `def custom_getter(getter, name, *args, **kwargs)`.  A simple identity
         custom getter that simply creates variables with modified names is:
         ```python
         def custom_getter(getter, name, *args, **kwargs):
           return getter(name + '_suffix', *args, **kwargs)
         ```
       constraint: An optional projection function to be applied to the variable
         after being updated by an `Optimizer` (e.g. used to implement norm
         constraints or value constraints for layer weights). The function must
         take as input the unprojected Tensor representing the value of the
-        variable and return the Tensor for the projected value
-        (which must have the same shape). Constraints are not safe to
-        use when doing asynchronous distributed training.
+        variable and return the Tensor for the projected value (which must have
+        the same shape). Constraints are not safe to use when doing asynchronous
+        distributed training.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
         `tf.VariableSynchronization`. By default the synchronization is set to
-        `AUTO` and the current `DistributionStrategy` chooses
-        when to synchronize. If `synchronization` is set to `ON_READ`,
-        `trainable` must not be set to `True`.
+        `AUTO` and the current `DistributionStrategy` chooses when to
+        synchronize. If `synchronization` is set to `ON_READ`, `trainable` must
+        not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
         `tf.VariableAggregation`.
@@ -393,8 +388,8 @@
         EagerVariableStore.
     """
     if custom_getter is not None and not callable(custom_getter):
-      raise ValueError(
-          "Passed a custom_getter which is not callable: %s" % custom_getter)
+      raise ValueError("Passed a custom_getter which is not callable: %s" %
+                       custom_getter)
 
     with ops.init_scope():
       if context.executing_eagerly():
@@ -448,14 +443,14 @@
         constraint=None,
         synchronization=VariableSynchronization.AUTO,
         aggregation=VariableAggregation.NONE):
-      is_scalar = (shape is not None
-                   and isinstance(shape, collections_lib.Sequence)
-                   and not shape)
+      is_scalar = (
+          shape is not None and isinstance(shape, collections_lib.Sequence) and
+          not shape)
       # Partitioned variable case
       if partitioner is not None and not is_scalar:
         if not callable(partitioner):
-          raise ValueError(
-              "Partitioner must be callable, but received: %s" % partitioner)
+          raise ValueError("Partitioner must be callable, but received: %s" %
+                           partitioner)
         with ops.name_scope(None):
           return self._get_partitioned_variable(
               name=name,
@@ -596,7 +591,7 @@
 
     Set `reuse` to `True` when you only want to reuse existing Variables.
     Set `reuse` to `False` when you only want to create new Variables.
-    Set `reuse` to None (the default) or tf.AUTO_REUSE when you want
+    Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you want
     variables to be created if they don't exist or returned if they do.
 
     If initializer is `None` (the default), the default initializer passed in
@@ -617,14 +612,14 @@
         and `dtype` of the Variable to be created, and returns a list of
         partitions for each axis (currently only one axis can be partitioned).
       shape: shape of the new or existing sharded variable.
-      dtype: type of the new or existing sharded variable
-        (defaults to `DT_FLOAT`).
+      dtype: type of the new or existing sharded variable (defaults to
+        `DT_FLOAT`).
       initializer: initializer for the sharded variable.
-      regularizer: a (Tensor -> Tensor or None) function; the result of
-        applying it on a newly created variable will be added to the collection
+      regularizer: a (Tensor -> Tensor or None) function; the result of applying
+        it on a newly created variable will be added to the collection
         GraphKeys.REGULARIZATION_LOSSES and can be used for regularization.
-      reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation
-        of variables.
+      reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation of
+        variables.
       trainable: If `True` also add the variable to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
       collections: List of graph collections keys to add the Variable to.
@@ -644,15 +639,15 @@
         after being updated by an `Optimizer` (e.g. used to implement norm
         constraints or value constraints for layer weights). The function must
         take as input the unprojected Tensor representing the value of the
-        variable and return the Tensor for the projected value
-        (which must have the same shape). Constraints are not safe to
-        use when doing asynchronous distributed training.
+        variable and return the Tensor for the projected value (which must have
+        the same shape). Constraints are not safe to use when doing asynchronous
+        distributed training.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
         `tf.VariableSynchronization`. By default the synchronization is set to
-        `AUTO` and the current `DistributionStrategy` chooses
-        when to synchronize. If `synchronization` is set to `ON_READ`,
-        `trainable` must not be set to `True`.
+        `AUTO` and the current `DistributionStrategy` chooses when to
+        synchronize. If `synchronization` is set to `ON_READ`, `trainable` must
+        not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
         `tf.VariableAggregation`.
@@ -686,20 +681,17 @@
       if reuse is False:
         raise ValueError(
             "Partitioned variable with name %s already exists. Did you mean to "
-            "set reuse=True or reuse=tf.AUTO_REUSE in VarScope?"
-            % name)
+            "set reuse=True or reuse=tf.AUTO_REUSE in VarScope?" % name)
 
       existing_var = self._partitioned_vars[name]
       if not shape.is_compatible_with(existing_var.get_shape()):
         raise ValueError(
             "Trying to reuse partitioned variable %s, but specified shape %s "
-            "and found shape %s."
-            % (name, shape, existing_var.get_shape()))
+            "and found shape %s." % (name, shape, existing_var.get_shape()))
       if not dtype.is_compatible_with(existing_var.dtype):
         raise ValueError(
             "Trying to reuse partitioned variable %s, but specified dtype %s "
-            "and found dtype %s."
-            % (name, dtype.name, existing_var.dtype.name))
+            "and found dtype %s." % (name, dtype.name, existing_var.dtype.name))
 
       # pylint: disable=protected-access
       if (partitions is not None and
@@ -724,21 +716,18 @@
         raise ValueError(
             "Partitioner returned a different partitioning than what was "
             "already found.  Partitioner returned %d shards, and shard "
-            "%s/part_0 was found, but %s/part_%d was not."
-            % (num_slices, name, name, num_slices - 1))
+            "%s/part_0 was found, but %s/part_%d was not." %
+            (num_slices, name, name, num_slices - 1))
       if "%s/part_%d" % (name, num_slices) in self._vars:
         raise ValueError(
             "Partitioner returned a different partitioning than what was "
             "already found.  Partitioner returned %d shards, and shard "
-            "%s/part_0 was found, but so was the extra shard %s/part_%d."
-            % (num_slices, name, name, num_slices))
+            "%s/part_0 was found, but so was the extra shard %s/part_%d." %
+            (num_slices, name, name, num_slices))
 
     vs = []
-    for i, (var_offset, var_shape) in enumerate(_iter_slices(
-        shape.as_list(),
-        num_slices,
-        slice_dim
-    )):
+    for i, (var_offset, var_shape) in enumerate(
+        _iter_slices(shape.as_list(), num_slices, slice_dim)):
       partition_info = _PartitionInfo(
           full_shape=shape.as_list(), var_offset=var_offset)
       var_full_name = "%s/part_%d" % (name, i)
@@ -783,16 +772,18 @@
             aggregation=aggregation)
 
       # pylint: disable=protected-access
-      var._set_save_slice_info(variables.Variable.SaveSliceInfo(
-          name, shape.as_list(), var_offset, var_shape))
+      var._set_save_slice_info(
+          variables.Variable.SaveSliceInfo(name, shape.as_list(), var_offset,
+                                           var_shape))
       vs.append(var)
       # pylint: enable=protected-access
 
-    partitioned_var = variables.PartitionedVariable(name=name,
-                                                    shape=shape,
-                                                    dtype=dtype,
-                                                    variable_list=vs,
-                                                    partitions=partitions)
+    partitioned_var = variables.PartitionedVariable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        variable_list=vs,
+        partitions=partitions)
     if not context.executing_eagerly() or self._store_eager_variables:
       self._partitioned_vars[name] = partitioned_var
     return partitioned_var
@@ -813,7 +804,9 @@
                            constraint=None,
                            synchronization=VariableSynchronization.AUTO,
                            aggregation=VariableAggregation.NONE):
-    """Get or create a single Variable (e.g. a shard or entire variable).
+    """Get or create a single Variable.
+
+    For example, the result may be a shard or an entire variable.
 
     See the documentation of get_variable above (ignore partitioning components)
     for details.
@@ -867,19 +860,19 @@
         # functions to create variables) so we take more than needed in the
         # default case.
         tb = [x for x in tb if "tensorflow/python" not in x[0]][:5]
-        raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(
-            traceback.format_list(tb))))
+        raise ValueError("%s Originally defined at:\n\n%s" %
+                         (err_msg, "".join(traceback.format_list(tb))))
       found_var = self._vars[name]
       if not shape.is_compatible_with(found_var.get_shape()):
         raise ValueError("Trying to share variable %s, but specified shape %s"
-                         " and found shape %s." % (name, shape,
-                                                   found_var.get_shape()))
+                         " and found shape %s." %
+                         (name, shape, found_var.get_shape()))
       if not dtype.is_compatible_with(found_var.dtype):
         dtype_str = dtype.name
         found_type_str = found_var.dtype.name
         raise ValueError("Trying to share variable %s, but specified dtype %s"
-                         " and found dtype %s." % (name, dtype_str,
-                                                   found_type_str))
+                         " and found dtype %s." %
+                         (name, dtype_str, found_type_str))
       return found_var
 
     # The code below handles only the case of creating a new variable.
@@ -900,10 +893,12 @@
       else:
         # Instantiate initializer if provided initializer is a type object.
         if tf_inspect.isclass(initializer):
-          initializer = initializer(dtype=dtype)
+          initializer = initializer()
         if shape is not None and shape.is_fully_defined():
           init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-              shape.as_list(), dtype=dtype, partition_info=partition_info)
+              shape.as_list(),
+              dtype=dtype,
+              partition_info=partition_info)
           variable_dtype = dtype.base_dtype
         elif len(tf_inspect.getargspec(initializer).args) == len(
             tf_inspect.getargspec(initializer).defaults or []):
@@ -960,8 +955,9 @@
           else:
             v_name = v.name
             loss_name = loss.name
-          logging.vlog(1, "Applied regularizer to %s and added the result %s "
-                       "to REGULARIZATION_LOSSES.", v_name, loss_name)
+          logging.vlog(
+              1, "Applied regularizer to %s and added the result %s "
+              "to REGULARIZATION_LOSSES.", v_name, loss_name)
           ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, loss)
     return v
 
@@ -987,20 +983,20 @@
       initializing_from_value = False
     # If dtype is DT_INT/DT_UINT, provide a default value `zero`
     # If dtype is DT_BOOL, provide a default value `FALSE`
-    elif (dtype.is_integer or dtype.is_unsigned or dtype.is_bool
-          or dtype == dtypes.string):
+    elif (dtype.is_integer or dtype.is_unsigned or dtype.is_bool or
+          dtype == dtypes.string):
       initializer = init_ops.zeros_initializer()
       initializing_from_value = False
     # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
     else:
-      raise ValueError("An initializer for variable %s of %s is required"
-                       % (name, dtype.base_dtype))
+      raise ValueError("An initializer for variable %s of %s is required" %
+                       (name, dtype.base_dtype))
 
     return initializer, initializing_from_value
 
 
 # To stop regularization, use this regularizer
-@tf_export("no_regularizer")
+@tf_export(v1=["no_regularizer"])
 def no_regularizer(_):
   """Use this function to prevent regularization of variables."""
   return None
@@ -1018,7 +1014,7 @@
     name: name of the current scope, used as prefix in get_variable.
     initializer: default initializer passed to get_variable.
     regularizer: default regularizer passed to get_variable.
-    reuse: Boolean, None, or tf.AUTO_REUSE, setting the reuse in
+    reuse: Boolean, None, or tf.compat.v1.AUTO_REUSE, setting the reuse in
       get_variable. When eager execution is enabled this argument is always
       forced to be False.
     caching_device: string, callable, or None: the caching device passed to
@@ -1028,16 +1024,16 @@
     name_scope: The name passed to `tf.name_scope`.
     dtype: default type passed to get_variable (defaults to DT_FLOAT).
     use_resource: if False, create a normal Variable; if True create an
-      experimental ResourceVariable with well-defined semantics. Defaults
-      to False (will later change to True). When eager execution is enabled
-      this argument is always forced to be True.
+      experimental ResourceVariable with well-defined semantics. Defaults to
+      False (will later change to True). When eager execution is enabled this
+      argument is always forced to be True.
     constraint: An optional projection function to be applied to the variable
       after being updated by an `Optimizer` (e.g. used to implement norm
       constraints or value constraints for layer weights). The function must
       take as input the unprojected Tensor representing the value of the
-      variable and return the Tensor for the projected value
-      (which must have the same shape). Constraints are not safe to
-      use when doing asynchronous distributed training.
+      variable and return the Tensor for the projected value (which must have
+      the same shape). Constraints are not safe to use when doing asynchronous
+      distributed training.
   """
 
   def __init__(self,
@@ -1399,7 +1395,7 @@
     container = tfe.EagerVariableStore()
     for input in dataset_iterator:
       with container.as_default():
-        x = tf.layers.dense(input, name="l1")
+        x = tf.compat.v1.layers.dense(input, name="l1")
     print(container.variables)  # Should print the variables used in the layer.
   ```
   """
@@ -1456,9 +1452,7 @@
 
       # Create new variable with same value, name, and "trainable" flag.
       new_var = resource_variable_ops.ResourceVariable(
-          var.read_value(),
-          name=stripped_var_name,
-          trainable=var.trainable)
+          var.read_value(), name=stripped_var_name, trainable=var.trainable)
       new_store._store._vars[key] = new_var
     return new_store
     # pylint: enable=protected-access
@@ -1603,8 +1597,7 @@
     and `dtype` don't match. Reuse is set inside `variable_scope`.
 """)
 get_variable.__doc__ = get_variable_or_local_docstring % (
-    "Gets an existing variable with these parameters or create a new one.",
-    "",
+    "Gets an existing variable with these parameters or create a new one.", "",
     "trainable: If `True` also add the variable to the graph collection\n"
     "    `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).\n  ",
     "GraphKeys.GLOBAL_VARIABLES")
@@ -1655,9 +1648,7 @@
     "Gets an existing *local* variable or creates a new one.",
     "Behavior is the same as in `get_variable`, except that variables are\n"
     "added to the `LOCAL_VARIABLES` collection and `trainable` is set to\n"
-    "`False`.\n",
-    "",
-    "GraphKeys.LOCAL_VARIABLES")
+    "`False`.\n", "", "GraphKeys.LOCAL_VARIABLES")
 
 
 def _get_partitioned_variable(name,
@@ -1703,24 +1694,24 @@
     shape: Shape of the new or existing variable.
     dtype: Type of the new or existing variable (defaults to `DT_FLOAT`).
     initializer: Initializer for the variable if one is created.
-    regularizer: A (Tensor -> Tensor or None) function; the result of
-      applying it on a newly created variable will be added to the collection
+    regularizer: A (Tensor -> Tensor or None) function; the result of applying
+      it on a newly created variable will be added to the collection
       GraphKeys.REGULARIZATION_LOSSES and can be used for regularization.
     trainable: If `True` also add the variable to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    collections: List of graph collections keys to add the Variable to.
-      Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`).
+    collections: List of graph collections keys to add the Variable to. Defaults
+      to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`).
     caching_device: Optional device string or function describing where the
-      Variable should be cached for reading.  Defaults to the Variable's
-      device.  If not `None`, caches on another device.  Typical use is to
-      cache on the device where the Ops using the Variable reside, to
-      deduplicate copying through `Switch` and other conditional statements.
+      Variable should be cached for reading.  Defaults to the Variable's device.
+      If not `None`, caches on another device.  Typical use is to cache on the
+      device where the Ops using the Variable reside, to deduplicate copying
+      through `Switch` and other conditional statements.
     partitioner: Optional callable that accepts a fully defined `TensorShape`
       and `dtype` of the Variable to be created, and returns a list of
       partitions for each axis (currently only one axis can be partitioned).
-    validate_shape: If False, allows the variable to be initialized with a
-        value of unknown shape. If True, the default, the shape of initial_value
-        must be known.
+    validate_shape: If False, allows the variable to be initialized with a value
+      of unknown shape. If True, the default, the shape of initial_value must be
+      known.
     use_resource: If False, creates a regular Variable. If True, creates an
       experimental ResourceVariable instead which has well-defined semantics.
       Defaults to False (will later change to True).
@@ -1728,15 +1719,15 @@
       after being updated by an `Optimizer` (e.g. used to implement norm
       constraints or value constraints for layer weights). The function must
       take as input the unprojected Tensor representing the value of the
-      variable and return the Tensor for the projected value
-      (which must have the same shape). Constraints are not safe to
-      use when doing asynchronous distributed training.
-    synchronization: Indicates when a distributed a variable will be
-      aggregated. Accepted values are constants defined in the class
+      variable and return the Tensor for the projected value (which must have
+      the same shape). Constraints are not safe to use when doing asynchronous
+      distributed training.
+    synchronization: Indicates when a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
       `tf.VariableSynchronization`. By default the synchronization is set to
-      `AUTO` and the current `DistributionStrategy` chooses
-      when to synchronize. If `synchronization` is set to `ON_READ`,
-      `trainable` must not be set to `True`.
+      `AUTO` and the current `DistributionStrategy` chooses when to synchronize.
+      If `synchronization` is set to `ON_READ`, `trainable` must not be set to
+      `True`.
     aggregation: Indicates how a distributed variable will be aggregated.
       Accepted values are constants defined in the class
       `tf.VariableAggregation`.
@@ -1802,8 +1793,8 @@
 
     Args:
       name_or_scope: `string` or `VariableScope`: the scope to open.
-      reuse: `True` or None, or tf.AUTO_REUSE; if `None`, we inherit the parent
-        scope's reuse flag.
+      reuse: `True`, `None`, or `tf.compat.v1.AUTO_REUSE`; if `None`, we inherit
+        the parent scope's reuse flag.
       initializer: default initializer for variables within this scope.
       regularizer: default regularizer for variables within this scope.
       caching_device: default caching device for variables within this scope.
@@ -1818,9 +1809,9 @@
         after being updated by an `Optimizer` (e.g. used to implement norm
         constraints or value constraints for layer weights). The function must
         take as input the unprojected Tensor representing the value of the
-        variable and return the Tensor for the projected value
-        (which must have the same shape). Constraints are not safe to
-        use when doing asynchronous distributed training.
+        variable and return the Tensor for the projected value (which must have
+        the same shape). Constraints are not safe to use when doing asynchronous
+        distributed training.
     """
     self._name_or_scope = name_or_scope
     self._reuse = reuse
@@ -1865,8 +1856,8 @@
         variable_scope_object.set_partitioner(self._partitioner)
       if self._custom_getter is not None:
         variable_scope_object.set_custom_getter(
-            _maybe_wrap_custom_getter(
-                self._custom_getter, self._name_or_scope.custom_getter))
+            _maybe_wrap_custom_getter(self._custom_getter,
+                                      self._name_or_scope.custom_getter))
       if self._dtype is not None:
         variable_scope_object.set_dtype(self._dtype)
       if self._use_resource is not None:
@@ -1894,10 +1885,10 @@
       #   VariableScope with name extended by the provided one, and inherited
       #   reuse and initializer (except if the user provided values to set).
       self._new_name = (
-          self._old.name + "/" + self._name_or_scope if self._old.name
-          else self._name_or_scope)
-      self._reuse = (self._reuse
-                     or self._old.reuse)  # Re-using is inherited by sub-scopes.
+          self._old.name + "/" +
+          self._name_or_scope if self._old.name else self._name_or_scope)
+      self._reuse = (self._reuse or
+                     self._old.reuse)  # Re-using is inherited by sub-scopes.
       if self._old_name_scope is None:
         name_scope = self._name_or_scope
       else:
@@ -1936,8 +1927,8 @@
     return variable_scope_object
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
-    if (self._var_scope_store.current_scope is not
-        self._last_variable_scope_object):
+    if (self._var_scope_store.current_scope is
+        not self._last_variable_scope_object):
       raise RuntimeError("Improper nesting of variable_scope.")
     # If jumping out from a non-prolonged scope, restore counts.
     if isinstance(self._name_or_scope, VariableScope):
@@ -1961,9 +1952,8 @@
     # will call the true_getter, perform any intermediate
     # processing, and return the results to the current
     # getter, which will also perform additional processing.
-    return custom_getter(
-        functools.partial(old_getter, getter),
-        *args, **kwargs)
+    return custom_getter(functools.partial(old_getter, getter), *args, **kwargs)
+
   return wrapped_custom_getter
 
 
@@ -2004,24 +1994,24 @@
   Simple example of how to create a new variable:
 
   ```python
-  with tf.variable_scope("foo"):
-      with tf.variable_scope("bar"):
-          v = tf.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo"):
+      with tf.compat.v1.variable_scope("bar"):
+          v = tf.compat.v1.get_variable("v", [1])
           assert v.name == "foo/bar/v:0"
   ```
 
   Simple example of how to reenter a premade variable scope safely:
 
   ```python
-  with tf.variable_scope("foo") as vs:
+  with tf.compat.v1.variable_scope("foo") as vs:
     pass
 
   # Re-enter the variable scope.
-  with tf.variable_scope(vs,
+  with tf.compat.v1.variable_scope(vs,
                          auxiliary_name_scope=False) as vs1:
     # Restore the original name_scope.
     with tf.name_scope(vs1.original_name_scope):
-        v = tf.get_variable("v", [1])
+        v = tf.compat.v1.get_variable("v", [1])
         assert v.name == "foo/v:0"
         c = tf.constant([1], name="c")
         assert c.name == "foo/c:0"
@@ -2031,8 +2021,8 @@
 
   ```python
   def foo():
-    with tf.variable_scope("foo", reuse=tf.AUTO_REUSE):
-      v = tf.get_variable("v", [1])
+    with tf.compat.v1.variable_scope("foo", reuse=tf.compat.v1.AUTO_REUSE):
+      v = tf.compat.v1.get_variable("v", [1])
     return v
 
   v1 = foo()  # Creates v.
@@ -2043,20 +2033,20 @@
   Basic example of sharing a variable with reuse=True:
 
   ```python
-  with tf.variable_scope("foo"):
-      v = tf.get_variable("v", [1])
-  with tf.variable_scope("foo", reuse=True):
-      v1 = tf.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo"):
+      v = tf.compat.v1.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo", reuse=True):
+      v1 = tf.compat.v1.get_variable("v", [1])
   assert v1 == v
   ```
 
   Sharing a variable by capturing a scope and setting reuse:
 
   ```python
-  with tf.variable_scope("foo") as scope:
-      v = tf.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo") as scope:
+      v = tf.compat.v1.get_variable("v", [1])
       scope.reuse_variables()
-      v1 = tf.get_variable("v", [1])
+      v1 = tf.compat.v1.get_variable("v", [1])
   assert v1 == v
   ```
 
@@ -2064,9 +2054,9 @@
   an existing variable in a non-reusing scope.
 
   ```python
-  with tf.variable_scope("foo"):
-      v = tf.get_variable("v", [1])
-      v1 = tf.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo"):
+      v = tf.compat.v1.get_variable("v", [1])
+      v1 = tf.compat.v1.get_variable("v", [1])
       #  Raises ValueError("... v already exists ...").
   ```
 
@@ -2074,8 +2064,8 @@
   exist in reuse mode.
 
   ```python
-  with tf.variable_scope("foo", reuse=True):
-      v = tf.get_variable("v", [1])
+  with tf.compat.v1.variable_scope("foo", reuse=True):
+      v = tf.compat.v1.get_variable("v", [1])
       #  Raises ValueError("... v does not exists ...").
   ```
 
@@ -2145,14 +2135,14 @@
       caching_device: default caching device for variables within this scope.
       partitioner: default partitioner for variables within this scope.
       custom_getter: default custom getter for variables within this scope.
-      reuse: `True`, None, or tf.AUTO_REUSE; if `True`, we go into reuse mode
-        for this scope as well as all sub-scopes; if tf.AUTO_REUSE, we create
-        variables if they do not exist, and return them otherwise; if None, we
-        inherit the parent scope's reuse flag. When eager execution is enabled,
-        new variables are always created unless an EagerVariableStore or
-        template is currently active.
-      dtype: type of variables created in this scope (defaults to the type
-        in the passed scope, or inherited from parent scope).
+      reuse: `True`, None, or tf.compat.v1.AUTO_REUSE; if `True`, we go into
+        reuse mode for this scope as well as all sub-scopes; if
+        tf.compat.v1.AUTO_REUSE, we create variables if they do not exist, and
+        return them otherwise; if None, we inherit the parent scope's reuse
+        flag. When eager execution is enabled, new variables are always created
+        unless an EagerVariableStore or template is currently active.
+      dtype: type of variables created in this scope (defaults to the type in
+        the passed scope, or inherited from parent scope).
       use_resource: If False, all variables will be regular Variables. If True,
         experimental ResourceVariables with well-defined semantics will be used
         instead. Defaults to False (will later change to True). When eager
@@ -2161,13 +2151,13 @@
         after being updated by an `Optimizer` (e.g. used to implement norm
         constraints or value constraints for layer weights). The function must
         take as input the unprojected Tensor representing the value of the
-        variable and return the Tensor for the projected value
-        (which must have the same shape). Constraints are not safe to
-        use when doing asynchronous distributed training.
+        variable and return the Tensor for the projected value (which must have
+        the same shape). Constraints are not safe to use when doing asynchronous
+        distributed training.
       auxiliary_name_scope: If `True`, we create an auxiliary name scope with
-        the scope. If `False`, we don't create it. Note that the argument is
-        not inherited, and it only takes effect for once when creating. You
-        should only use it for re-entering a premade variable scope.
+        the scope. If `False`, we don't create it. Note that the argument is not
+        inherited, and it only takes effect once when creating. You should
+        only use it for re-entering a premade variable scope.
 
     Returns:
       A scope that can be captured and reused.
@@ -2355,8 +2345,8 @@
       return entered_pure_variable_scope
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
-    self._cached_pure_variable_scope.__exit__(
-        type_arg, value_arg, traceback_arg)
+    self._cached_pure_variable_scope.__exit__(type_arg, value_arg,
+                                              traceback_arg)
     if self._current_name_scope:
       self._current_name_scope.__exit__(type_arg, value_arg, traceback_arg)
     if self._in_graph_mode and not self._building_function:
@@ -2381,18 +2371,19 @@
   """Deprecated: context manager for defining an op that creates variables."""
   logging.warn("tf.variable_op_scope(values, name, default_name) is deprecated,"
                " use tf.variable_scope(name, default_name, values)")
-  with variable_scope(name_or_scope,
-                      default_name=default_name,
-                      values=values,
-                      initializer=initializer,
-                      regularizer=regularizer,
-                      caching_device=caching_device,
-                      partitioner=partitioner,
-                      custom_getter=custom_getter,
-                      reuse=reuse,
-                      dtype=dtype,
-                      use_resource=use_resource,
-                      constraint=constraint) as scope:
+  with variable_scope(
+      name_or_scope,
+      default_name=default_name,
+      values=values,
+      initializer=initializer,
+      regularizer=regularizer,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      custom_getter=custom_getter,
+      reuse=reuse,
+      dtype=dtype,
+      use_resource=use_resource,
+      constraint=constraint) as scope:
     yield scope
 
 
@@ -2400,10 +2391,10 @@
   """Call partitioner validating its inputs/output.
 
   Args:
-    partitioner: a function mapping `Tensor` shape and dtype to a
-        list of partitions.
+    partitioner: a function mapping `Tensor` shape and dtype to a list of
+      partitions.
     shape: shape of the `Tensor` to partition, must have at least two
-        dimensions.
+      dimensions.
     dtype: dtype of the elements in the `Tensor`.
 
   Returns:
@@ -2419,20 +2410,18 @@
 
   slicing = partitioner(shape=shape, dtype=dtype)
   if not isinstance(slicing, collections_lib.Sequence):
-    raise ValueError("Partitioner must return a sequence, but saw: %s"
-                     % slicing)
+    raise ValueError("Partitioner must return a sequence, but saw: %s" %
+                     slicing)
   if len(slicing) != shape.ndims:
     raise ValueError(
         "Partitioner returned a partition list that does not match the "
         "Variable's rank: %s vs. %s" % (slicing, shape))
   if any(p < 1 for p in slicing):
-    raise ValueError(
-        "Partitioner returned zero partitions for some axes: %s" %
-        slicing)
+    raise ValueError("Partitioner returned zero partitions for some axes: %s" %
+                     slicing)
   if sum(p > 1 for p in slicing) > 1:
-    raise ValueError(
-        "Can only slice a variable along one dimension: "
-        "shape: %s, partitioning: %s" % (shape, slicing))
+    raise ValueError("Can only slice a variable along one dimension: "
+                     "shape: %s, partitioning: %s" % (shape, slicing))
   return slicing
 
 
@@ -2489,20 +2478,34 @@
   if use_resource:
     distribute_strategy = kwargs.get("distribute_strategy", None)
     return resource_variable_ops.ResourceVariable(
-        initial_value=initial_value, trainable=trainable,
-        collections=collections, validate_shape=validate_shape,
-        caching_device=caching_device, name=name, dtype=dtype,
-        constraint=constraint, variable_def=variable_def,
-        import_scope=import_scope, distribute_strategy=distribute_strategy,
-        synchronization=synchronization, aggregation=aggregation)
+        initial_value=initial_value,
+        trainable=trainable,
+        collections=collections,
+        validate_shape=validate_shape,
+        caching_device=caching_device,
+        name=name,
+        dtype=dtype,
+        constraint=constraint,
+        variable_def=variable_def,
+        import_scope=import_scope,
+        distribute_strategy=distribute_strategy,
+        synchronization=synchronization,
+        aggregation=aggregation)
   else:
     return variables.RefVariable(
-        initial_value=initial_value, trainable=trainable,
-        collections=collections, validate_shape=validate_shape,
-        caching_device=caching_device, name=name, dtype=dtype,
-        constraint=constraint, variable_def=variable_def,
-        expected_shape=expected_shape, import_scope=import_scope,
-        synchronization=synchronization, aggregation=aggregation)
+        initial_value=initial_value,
+        trainable=trainable,
+        collections=collections,
+        validate_shape=validate_shape,
+        caching_device=caching_device,
+        name=name,
+        dtype=dtype,
+        constraint=constraint,
+        variable_def=variable_def,
+        expected_shape=expected_shape,
+        import_scope=import_scope,
+        synchronization=synchronization,
+        aggregation=aggregation)
 
 
 def default_variable_creator_v2(next_creator=None, **kwargs):
@@ -2522,11 +2525,18 @@
   aggregation = kwargs.get("aggregation", None)
 
   return resource_variable_ops.ResourceVariable(
-      initial_value=initial_value, trainable=trainable,
-      validate_shape=validate_shape, caching_device=caching_device,
-      name=name, dtype=dtype, constraint=constraint, variable_def=variable_def,
-      import_scope=import_scope, distribute_strategy=distribute_strategy,
-      synchronization=synchronization, aggregation=aggregation)
+      initial_value=initial_value,
+      trainable=trainable,
+      validate_shape=validate_shape,
+      caching_device=caching_device,
+      name=name,
+      dtype=dtype,
+      constraint=constraint,
+      variable_def=variable_def,
+      import_scope=import_scope,
+      distribute_strategy=distribute_strategy,
+      synchronization=synchronization,
+      aggregation=aggregation)
 
 
 variables.default_variable_creator = default_variable_creator
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 08326ea..6970bfc 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -17,6 +17,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import enum  # pylint: disable=g-bad-import-order
 import functools
 import os
@@ -85,9 +86,9 @@
 class VariableAggregationV2(enum.Enum):
   """Indicates how a distributed variable will be aggregated.
 
-  `tf.contrib.distribute.DistributionStrategy` distributes a model by making
-  multiple copies (called "replicas") acting data-parallel on different elements
-  of the input batch. When performing some variable-update operation, say
+  `tf.distribute.Strategy` distributes a model by making multiple copies
+  (called "replicas") acting data-parallel on different elements of the input
+  batch. When performing some variable-update operation, say
   `var.assign_add(x)`, in a model, we need to resolve how to combine the
   different values for `x` computed in the different replicas.
 
@@ -306,7 +307,7 @@
 
   ```python
   # Launch the graph in a session.
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       # Run the variable initializer.
       sess.run(w.initializer)
       # ...you now can run ops that use the value of 'w'...
@@ -318,10 +319,10 @@
 
   ```python
   # Add an Op to initialize global variables.
-  init_op = tf.global_variables_initializer()
+  init_op = tf.compat.v1.global_variables_initializer()
 
   # Launch the graph in a session.
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       # Run the Op that initializes global variables.
       sess.run(init_op)
       # ...you can now run any Op that uses variable values...
@@ -372,8 +373,8 @@
   not have these issues:
 
   * Add `use_resource=True` when constructing `tf.Variable`;
-  * Call `tf.get_variable_scope().set_use_resource(True)` inside a
-    `tf.variable_scope` before the `tf.get_variable()` call.
+  * Call `tf.compat.v1.get_variable_scope().set_use_resource(True)` inside a
+    `tf.compat.v1.variable_scope` before the `tf.compat.v1.get_variable()` call.
   """
 
   def __init__(self,
@@ -512,14 +513,14 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See `tf.Session` for more
+    passed, the default session is used.  See `tf.compat.v1.Session` for more
     information on launching a graph and on sessions.
 
     ```python
     v = tf.Variable([1, 2])
-    init = tf.global_variables_initializer()
+    init = tf.compat.v1.global_variables_initializer()
 
-    with tf.Session() as sess:
+    with tf.compat.v1.Session() as sess:
         sess.run(init)
         # Usage passing the session explicitly.
         print(v.eval(sess))
@@ -549,7 +550,7 @@
 
     ```python
     # Initialize 'v' with a random tensor.
-    v = tf.Variable(tf.truncated_normal([10, 40]))
+    v = tf.Variable(tf.random.truncated_normal([10, 40]))
     # Use `initialized_value` to guarantee that `v` has been
     # initialized before its value is used to initialize `w`.
     # The random values are picked only once.
@@ -670,7 +671,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered addition has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -687,7 +688,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -734,7 +735,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -767,7 +768,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = v.scatter_nd_sub(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -818,7 +819,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         add = v.scatter_nd_add(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(add)
     ```
 
@@ -836,7 +837,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered addition has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -869,7 +870,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = v.scatter_nd_assign(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -887,7 +888,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -958,14 +959,14 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See `tf.Session` for more
+    passed, the default session is used.  See `tf.compat.v1.Session` for more
     information on launching a graph and on sessions.
 
     ```python
     v = tf.Variable([1, 2])
-    init = tf.global_variables_initializer()
+    init = tf.compat.v1.global_variables_initializer()
 
-    with tf.Session() as sess:
+    with tf.compat.v1.Session() as sess:
         sess.run(init)
         # Usage passing the session explicitly.
         v.load([2, 3], sess)
@@ -1280,7 +1281,7 @@
 
   ```python
   # Launch the graph in a session.
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       # Run the variable initializer.
       sess.run(w.initializer)
       # ...you now can run ops that use the value of 'w'...
@@ -1292,10 +1293,10 @@
 
   ```python
   # Add an Op to initialize global variables.
-  init_op = tf.global_variables_initializer()
+  init_op = tf.compat.v1.global_variables_initializer()
 
   # Launch the graph in a session.
-  with tf.Session() as sess:
+  with tf.compat.v1.Session() as sess:
       # Run the Op that initializes global variables.
       sess.run(init_op)
       # ...you can now run any Op that uses variable values...
@@ -1345,8 +1346,8 @@
   not have these issues:
 
   * Add `use_resource=True` when constructing `tf.Variable`;
-  * Call `tf.get_variable_scope().set_use_resource(True)` inside a
-    `tf.variable_scope` before the `tf.get_variable()` call.
+  * Call `tf.compat.v1.get_variable_scope().set_use_resource(True)` inside a
+    `tf.compat.v1.variable_scope` before the `tf.compat.v1.get_variable()` call.
   """
 
   def __init__(self,  # pylint: disable=super-init-not-called
@@ -1844,14 +1845,14 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See `tf.Session` for more
+    passed, the default session is used.  See `tf.compat.v1.Session` for more
     information on launching a graph and on sessions.
 
     ```python
     v = tf.Variable([1, 2])
-    init = tf.global_variables_initializer()
+    init = tf.compat.v1.global_variables_initializer()
 
-    with tf.Session() as sess:
+    with tf.compat.v1.Session() as sess:
         sess.run(init)
         # Usage passing the session explicitly.
         print(v.eval(sess))
@@ -1993,7 +1994,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered addition has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -2017,7 +2018,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -2071,7 +2072,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -2106,7 +2107,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = ref.scatter_nd_sub(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -2158,7 +2159,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         add = ref.scatter_nd_add(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(add)
     ```
 
@@ -2176,7 +2177,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered addition has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -2210,7 +2211,7 @@
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
         op = ref.scatter_nd_update(indices, updates)
-        with tf.Session() as sess:
+        with tf.compat.v1.Session() as sess:
           print sess.run(op)
     ```
 
@@ -2228,7 +2229,7 @@
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
-      the scattered subtraction has completed.
+      the scattered assignment has completed.
 
     Raises:
       ValueError: if `sparse_delta` is not an `IndexedSlices`.
@@ -2822,7 +2823,7 @@
   This convenience function returns the contents of that collection.
 
   An alternative to global variables are local variables. See
-  `tf.local_variables`
+  `tf.compat.v1.local_variables`
 
   Args:
     scope: (Optional.) A string. If supplied, the resulting list is filtered
@@ -2840,7 +2841,7 @@
 @tf_export(v1=["all_variables"])
 @deprecated("2017-03-02", "Please use tf.global_variables instead.")
 def all_variables():
-  """Use `tf.global_variables` instead."""
+  """Use `tf.compat.v1.global_variables` instead."""
   return global_variables()
 
 
@@ -2875,7 +2876,7 @@
   This convenience function returns the contents of that collection.
 
   An alternative to local variables are global variables. See
-  `tf.global_variables`
+  `tf.compat.v1.global_variables`
 
   Args:
     scope: (Optional.) A string. If supplied, the resulting list is filtered
@@ -2981,7 +2982,7 @@
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.variables_initializer` instead.")
 def initialize_variables(var_list, name="init"):
-  """See `tf.variables_initializer`."""
+  """See `tf.compat.v1.variables_initializer`."""
   return variables_initializer(var_list, name=name)
 
 
@@ -3003,7 +3004,7 @@
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.global_variables_initializer` instead.")
 def initialize_all_variables():
-  """See `tf.global_variables_initializer`."""
+  """See `tf.compat.v1.global_variables_initializer`."""
   return global_variables_initializer()
 
 
@@ -3025,7 +3026,7 @@
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.local_variables_initializer` instead.")
 def initialize_local_variables():
-  """See `tf.local_variables_initializer`."""
+  """See `tf.compat.v1.local_variables_initializer`."""
   return local_variables_initializer()
 
 
@@ -3138,3 +3139,14 @@
 ops.register_tensor_conversion_function(
     PartitionedVariable,
     PartitionedVariable._TensorConversionFunction)  # pylint: disable=protected-access
+
+
+class AbstractVariableMetaclass(VariableMetaclass, abc.ABCMeta):
+  """Metaclass combining `VariableMetaclass` and `abc.ABCMeta`."""
+  pass
+
+
+@six.add_metaclass(AbstractVariableMetaclass)
+class AbstractVariable(Variable):
+  """`Variable`, but abstract."""
+  pass
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index cbbc0de..2db28ec 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -32,7 +32,6 @@
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
@@ -72,12 +71,18 @@
   # `wrapped_body` below.
   loop_vars = list(_tensor_array_to_flow(orig_loop_vars))
   loop_vars = nest.map_structure(
-      ops.internal_convert_to_tensor_or_indexed_slices, loop_vars)
+      ops.internal_convert_to_tensor_or_indexed_slices, loop_vars,
+      expand_composites=True)
   if shape_invariants is not None:
-    nest.assert_same_structure(orig_loop_vars, shape_invariants)
+    nest.assert_same_structure(orig_loop_vars, shape_invariants,
+                               expand_composites=False)
+    shape_invariants = nest.map_structure(
+        control_flow_ops._get_shape_invariant, loop_vars,
+        list(shape_invariants), expand_composites=False)
   else:
-    shape_invariants = nest.map_structure(lambda t: t.shape, loop_vars)
-
+    shape_invariants = nest.map_structure(
+        control_flow_ops._get_shape_invariant, loop_vars,
+        expand_composites=False)
   if not name:
     name = "while"
 
@@ -150,11 +155,12 @@
       # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays
       # and packs it into the structure of `orig_loop_vars`.
       outputs = body(*_pack_sequence_as(orig_loop_vars, args))
-      if not nest.is_sequence(outputs):
+      if not nest.is_sequence_or_composite(outputs):
         outputs = [outputs]
       # Compare the structure of input and output of body converting the
       # top-level tuples to list to be compatible with legacy while_loop.
-      nest.assert_same_structure(list(outputs), list(orig_loop_vars))
+      nest.assert_same_structure(list(outputs), list(orig_loop_vars),
+                                 expand_composites=True)
 
       outputs = _tensor_array_to_flow(outputs)
 
@@ -193,7 +199,8 @@
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
     # specified.
-    num_flattened_outputs = len(nest.flatten(orig_loop_vars))
+    num_flattened_outputs = len(nest.flatten(orig_loop_vars,
+                                             expand_composites=True))
     # First var is loop counter and second var is maximum_iterations.
     first_loop_var_index = 2
     _check_shapes_compat(
@@ -201,10 +208,10 @@
                            num_flattened_outputs],
         nest.flatten(
             shape_invariants[first_loop_var_index:first_loop_var_index +
-                             len_orig_loop_vars]),
+                             len_orig_loop_vars], expand_composites=True),
         nest.flatten(loop_vars[first_loop_var_index:first_loop_var_index +
-                               len_orig_loop_vars]))
-    flattened_loop_vars = nest.flatten(loop_vars)
+                               len_orig_loop_vars], expand_composites=True))
+    flattened_loop_vars = nest.flatten(loop_vars, expand_composites=True)
     _check_num_inputs_outputs(cond_graph, body_graph,
                               len(flattened_loop_vars))
 
@@ -237,7 +244,7 @@
   if return_same_structure:
     return outputs
 
-  flattened_outputs = nest.flatten(outputs)
+  flattened_outputs = nest.flatten(outputs, expand_composites=True)
   if len(flattened_outputs) == 1:
     return flattened_outputs[0]
   else:
@@ -877,19 +884,6 @@
     custom_gradient.copy_handle_data(src_t, tgt_t)
 
 
-# TODO(srbs): This method should be in control_flow_util but that introduces
-# a circular dependency ops -> control_flow_util -> ops.
-def _is_in_xla_context():
-  """Returns whether the current context is inside an XLA context."""
-  outer_graph = ops.get_default_graph()
-  # The `_control_flow_context` is not copied when building a FuncGraph so
-  # we look it up from the base graph.
-  while isinstance(outer_graph, func_graph_module.FuncGraph):
-    outer_graph = outer_graph.outer_graph
-  cur_ctxt = outer_graph._get_control_flow_context()  # pylint: disable=protected-access
-  return control_flow_util.GetContainingXLAContext(cur_ctxt) is not None
-
-
 def _graph_name(graph):
   if isinstance(graph, func_graph_module.FuncGraph):
     return graph.name
@@ -905,9 +899,11 @@
 
   flattened_loop_vars = [
       flow_to_tensor_array(*z)
-      for z in zip(nest.flatten(loop_vars), nest.flatten(structure_with_tas))
+      for z in zip(nest.flatten(loop_vars, expand_composites=True),
+                   nest.flatten(structure_with_tas, expand_composites=True))
   ]
-  return nest.pack_sequence_as(structure_with_tas, flattened_loop_vars)
+  return nest.pack_sequence_as(structure_with_tas, flattened_loop_vars,
+                               expand_composites=True)
 
 
 def _tensor_array_to_flow(loop_vars):
@@ -917,14 +913,15 @@
       return maybe_ta.flow
     return maybe_ta
 
-  return nest.map_structure(f, loop_vars)
+  return nest.map_structure(f, loop_vars, expand_composites=True)
 
 
 def _build_signature(loop_vars, shape_invariants):
   return nest.pack_sequence_as(loop_vars, [
       tensor_spec.TensorSpec(s, t.dtype, name=t.op.name)
-      for s, t in zip(nest.flatten(shape_invariants), nest.flatten(loop_vars))
-  ])
+      for s, t in zip(nest.flatten(shape_invariants, expand_composites=True),
+                      nest.flatten(loop_vars, expand_composites=True))
+  ], expand_composites=True)
 
 
 def _build_maximum_iterations_loop_var(maximum_iterations):
diff --git a/tensorflow/python/platform/benchmark_test.py b/tensorflow/python/platform/benchmark_test.py
index 64b6163..1760598 100644
--- a/tensorflow/python/platform/benchmark_test.py
+++ b/tensorflow/python/platform/benchmark_test.py
@@ -24,7 +24,6 @@
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
-
 class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark):
 
   def testReportBenchmark(self):
@@ -39,14 +38,17 @@
         iters=2000,
         wall_time=1000,
         name='testReportBenchmark',
-        metrics=[{'name': 'metric_name', 'value': 99, 'min_value': 1}])
+        metrics=[{'name': 'metric_name_1', 'value': 0, 'min_value': 1},
+                 {'name': 'metric_name_2', 'value': 90, 'min_value': 0,
+                  'max_value': 95}])
 
     with open(proto_file_path, 'rb') as f:
       benchmark_entries = test_log_pb2.BenchmarkEntries()
       benchmark_entries.ParseFromString(f.read())
 
       actual_result = json_format.MessageToDict(
-          benchmark_entries, preserving_proto_field_name=True)['entry'][0]
+          benchmark_entries, preserving_proto_field_name=True,
+          including_default_value_fields=True)['entry'][0]
     os.remove(proto_file_path)
 
     expected_result = {
@@ -55,11 +57,22 @@
         # int64 field to string.
         'iters': '2000',
         'wall_time': 1000,
-        'metrics': [{
-            'name': 'metric_name',
-            'value': 99,
-            'min_value': 1
-        }]
+        'cpu_time': 0,
+        'throughput': 0,
+        'extras': {},
+        'metrics': [
+            {
+                'name': 'metric_name_1',
+                'value': 0,
+                'min_value': 1
+            },
+            {
+                'name': 'metric_name_2',
+                'value': 90,
+                'min_value': 0,
+                'max_value': 95
+            }
+        ]
     }
 
     self.assertEqual(2000, benchmark_entries.entry[0].iters)
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 813bcb8..86a4957 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -158,7 +158,7 @@
 
 @tf_export(v1=['logging.warn'])
 def warn(msg, *args, **kwargs):
-  get_logger().warn(msg, *args, **kwargs)
+  get_logger().warning(msg, *args, **kwargs)
 
 
 @tf_export(v1=['logging.warning'])
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index fcab57c..f2796e4 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -147,6 +147,7 @@
     size = "small",
     srcs = ["pprof_profiler_test.py"],
     main = "pprof_profiler_test.py",
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = ["no_pip"],  # TODO(annarev): get it working with pip.
     deps = [
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index f3810a0..c7fc481 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -17,6 +17,9 @@
 
 %ignore "";
 
+%rename("%s") TF_SetXlaEnableLazyCompilation;
+%rename("%s") TF_SetXLaAutoJitMode;
+%rename("%s") TF_SetXlaMinClusterSize;
 %rename("%s") TFE_NewContext;
 %rename("%s") TFE_DeleteContext;
 %rename("%s") TFE_ContextListDevices;
@@ -84,8 +87,6 @@
 %rename("%s") TFE_EnableCollectiveOps;
 %rename("%s") TF_ListPhysicalDevices;
 %rename("%s") TF_PickUnusedPortOrDie;
-%rename("%s") TFE_MonitoringSetGauge;
-%rename("%s") TFE_MonitoringAddSampler;
 %rename("%s") TFE_MonitoringCounterCellIncrementBy;
 %rename("%s") TFE_MonitoringCounterCellValue;
 %rename("%s") TFE_MonitoringNewCounter0;
@@ -97,6 +98,52 @@
 %rename("%s") TFE_MonitoringNewCounter2;
 %rename("%s") TFE_MonitoringDeleteCounter2;
 %rename("%s") TFE_MonitoringGetCellCounter2;
+%rename("%s") TFE_MonitoringIntGaugeCellSet;
+%rename("%s") TFE_MonitoringIntGaugeCellValue;
+%rename("%s") TFE_MonitoringNewIntGauge0;
+%rename("%s") TFE_MonitoringDeleteIntGauge0;
+%rename("%s") TFE_MonitoringGetCellIntGauge0;
+%rename("%s") TFE_MonitoringNewIntGauge1;
+%rename("%s") TFE_MonitoringDeleteIntGauge1;
+%rename("%s") TFE_MonitoringGetCellIntGauge1;
+%rename("%s") TFE_MonitoringNewIntGauge2;
+%rename("%s") TFE_MonitoringDeleteIntGauge2;
+%rename("%s") TFE_MonitoringGetCellIntGauge2;
+%rename("%s") TFE_MonitoringStringGaugeCellSet;
+%rename("%s") TFE_MonitoringStringGaugeCellValue;
+%rename("%s") TFE_MonitoringNewStringGauge0;
+%rename("%s") TFE_MonitoringDeleteStringGauge0;
+%rename("%s") TFE_MonitoringGetCellStringGauge0;
+%rename("%s") TFE_MonitoringNewStringGauge1;
+%rename("%s") TFE_MonitoringDeleteStringGauge1;
+%rename("%s") TFE_MonitoringGetCellStringGauge1;
+%rename("%s") TFE_MonitoringNewStringGauge2;
+%rename("%s") TFE_MonitoringDeleteStringGauge2;
+%rename("%s") TFE_MonitoringGetCellStringGauge2;
+%rename("%s") TFE_MonitoringBoolGaugeCellSet;
+%rename("%s") TFE_MonitoringBoolGaugeCellValue;
+%rename("%s") TFE_MonitoringNewBoolGauge0;
+%rename("%s") TFE_MonitoringDeleteBoolGauge0;
+%rename("%s") TFE_MonitoringGetCellBoolGauge0;
+%rename("%s") TFE_MonitoringNewBoolGauge1;
+%rename("%s") TFE_MonitoringDeleteBoolGauge1;
+%rename("%s") TFE_MonitoringGetCellBoolGauge1;
+%rename("%s") TFE_MonitoringNewBoolGauge2;
+%rename("%s") TFE_MonitoringDeleteBoolGauge2;
+%rename("%s") TFE_MonitoringGetCellBoolGauge2;
+%rename("%s") TFE_MonitoringSamplerCellAdd;
+%rename("%s") TFE_MonitoringSamplerCellValue;
+%rename("%s") TFE_MonitoringNewExponentialBuckets;
+%rename("%s") TFE_MonitoringDeleteBuckets;
+%rename("%s") TFE_MonitoringNewSampler0;
+%rename("%s") TFE_MonitoringDeleteSampler0;
+%rename("%s") TFE_MonitoringGetCellSampler0;
+%rename("%s") TFE_MonitoringNewSampler1;
+%rename("%s") TFE_MonitoringDeleteSampler1;
+%rename("%s") TFE_MonitoringGetCellSampler1;
+%rename("%s") TFE_MonitoringNewSampler2;
+%rename("%s") TFE_MonitoringDeleteSampler2;
+%rename("%s") TFE_MonitoringGetCellSampler2;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 1f7c631..b76aab8 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -11,6 +11,7 @@
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
@@ -378,7 +379,7 @@
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "load_test",
     srcs = ["load_test.py"],
     additional_deps = [
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index b932e1b..a0e1e48 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -395,7 +395,12 @@
     in serialized format.
 
     Args:
-      as_text: Writes the SavedModel protocol buffer in text format to disk.
+      as_text: Writes the SavedModel protocol buffer in text format to
+        disk. Protocol buffers in text format are useful for debugging, but
+        parsing fails when an unknown field is encountered, so the format is
+        not forward compatible. This means changes to TensorFlow may prevent
+        deployment of new text format SavedModels to existing serving
+        binaries. Do not deploy `as_text` SavedModels to production.
 
     Returns:
       The path to which the SavedModel protocol buffer was written.
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 5806776..13eeead 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -23,6 +23,7 @@
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import init_ops
@@ -65,7 +66,7 @@
     self._restore_checkpoint()
 
     for node in self._nodes:
-      if isinstance(node, tracking.TrackableResource):
+      if isinstance(node, tracking.CapturableResource):
         init_op = node._initialize()  # pylint: disable=protected-access
         if not context.executing_eagerly():
           ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
@@ -121,8 +122,8 @@
         return obj.asset_path
       elif tensor_util.is_tensor(obj):
         return obj
-      elif isinstance(obj, tracking.TrackableResource):
-        # Note: this executes restored functions in the TrackableResource.
+      elif isinstance(obj, tracking.CapturableResource):
+        # Note: this executes restored functions in the CapturableResource.
         return obj.resource_handle
       raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
@@ -186,7 +187,8 @@
     # TODO(andresp): Clean use of private methods of TrackableSaver.
     # pylint: disable=protected-access
     saver = util.TrackableSaver(graph_view.ObjectGraphView(self.get(0)))
-    saver._file_prefix_placeholder = constant_op.constant(variables_path)
+    with ops.device("CPU"):
+      saver._file_prefix_placeholder = constant_op.constant(variables_path)
     load_status = saver.restore(variables_path)
     load_status.assert_existing_objects_matched()
     checkpoint = load_status._checkpoint
@@ -280,13 +282,16 @@
 
   def _recreate_constant(self, proto):
     tensor_proto = self._operation_attributes[proto.operation]["value"].tensor
-    imported_constant = constant_op.constant(
-        tensor_util.MakeNdarray(tensor_proto))
+    ndarray = tensor_util.MakeNdarray(tensor_proto)
+    if dtypes.as_dtype(tensor_proto.dtype) == dtypes.string:
+      with ops.device("CPU"):
+        imported_constant = constant_op.constant(ndarray)
+    else:
+      imported_constant = constant_op.constant(ndarray)
     return imported_constant, setattr
 
   def _recreate_resource(self, proto):
-    del proto
-    return _RestoredResource(), setattr
+    return _RestoredResource(device=proto.device), setattr
 
 
 # TODO(b/124205571,b/124092991): Solve destruction of resources.
@@ -339,6 +344,27 @@
   assert 6. == imported.f(x=tf.constant(2.)).numpy()
   ```
 
+  _Importing SavedModels from TensorFlow 1.x_
+
+  SavedModels from `tf.estimator.Estimator` or 1.x SavedModel APIs have a flat
+  graph instead of `tf.function` objects. These SavedModels will have functions
+  corresponding to their signatures in the `.signatures` attribute, but also
+  have a `.prune` method which allows you to extract functions for new
+  subgraphs. This is equivalent to importing the SavedModel and naming feeds and
+  fetches in a Session from TensorFlow 1.x.
+
+  ```python
+  imported = tf.saved_model.load(path_to_v1_saved_model)
+  pruned = imported.prune("x:0", "out:0")
+  pruned(tf.ones([]))
+  ```
+
+  See `tf.compat.v1.wrap_function` for details. These SavedModels also have a
+  `.variables` attribute containing imported variables, and a `.graph` attribute
+  representing the whole imported graph. For SavedModels exported from
+  `tf.saved_model.save`, variables are instead assigned to whichever attributes
+  they were assigned to before export.
+
   Args:
     export_dir: The SavedModel directory to load from.
     tags: A tag or sequence of tags identifying the MetaGraph to load. Optional
@@ -375,6 +401,9 @@
                        saved_model_proto,
                        export_dir)
       root = loader.get(0)
+    root.tensorflow_version = meta_graph_def.meta_info_def.tensorflow_version
+    root.tensorflow_git_version = (
+        meta_graph_def.meta_info_def.tensorflow_git_version)
   else:
     with ops.init_scope():
       root = load_v1_in_v2.load(export_dir, tags)
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index e33bff7..6a644be 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -21,6 +21,7 @@
 import collections
 import functools
 import os
+import sys
 import tempfile
 import weakref
 
@@ -36,6 +37,8 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.framework import versions
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import input_layer
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training as training_lib
@@ -70,8 +73,12 @@
     # point w.r.t. saving/restoring, ideally after 2nd saving.
     for _ in range(cycles):
       path = tempfile.mkdtemp(prefix=self.get_temp_dir())
-      save.save(to_save, path, signatures)
-      loaded = load.load(path)
+      # If available, we'll run the save and restore preferring the GPU. This
+      # just makes sure we aren't throwing errors and have enough
+      # device("CPU") blocks to satisfy the placer.
+      with test_util.use_gpu():
+        save.save(to_save, path, signatures)
+        loaded = load.load(path)
       to_save = loaded
     return loaded
 
@@ -1290,6 +1297,24 @@
     root = self.cycle(root, cycles)
     self.assertEqual(root.f(constant_op.constant(5)).numpy(), 45)
 
+  def test_partial_bind_only_first_argument(self, cycles):
+    if sys.version_info[0] < 3:
+      self.skipTest("Test is only valid in python3. Only then we get some more "
+                    "advanced inspection of partials where this is allowed.")
+
+    def f(x, y):
+      return x + y
+
+    partial_func = functools.partial(f, x=5)
+    tf_func = def_function.function(partial_func)
+
+    root = tracking.AutoTrackable()
+    root.f = tf_func
+    self.assertAllEqual(root.f(y=constant_op.constant(7)), 12)
+
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(root.f(y=constant_op.constant(9)), 14)
+
   def test_partial_with_passed_fn_as_default(self, cycles):
 
     def f(x, y):
@@ -1307,6 +1332,26 @@
     root = self.cycle(root, cycles)
     self.assertEqual(root.f(constant_op.constant(3)).numpy(), 9)
 
+  def test_partial_with_input_signature(self, cycles):
+
+    def full_function(a, b, c=3.0):
+      return a, b, c
+
+    partial = functools.partial(full_function, 1, c=4)
+    self.assertAllEqual((1, 2.0, 4), partial(2.0))
+
+    signature = [tensor_spec.TensorSpec([], dtypes.float32)]
+    func = def_function.function(partial, input_signature=signature)
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    a, b, c = root.f(2.0)
+    self.assertAllEqual([a.numpy(), b.numpy(), c.numpy()], (1, 2.0, 4))
+
+    root = self.cycle(root, cycles)
+    a, b, c = root.f(3.0)
+    self.assertAllEqual([a.numpy(), b.numpy(), c.numpy()], (1, 3.0, 4))
+
   def test_convert_to_input_signature(self, cycles):
 
     @def_function.function(
@@ -1336,6 +1381,9 @@
             b=tensor_spec.TensorSpec(None, dtypes.float32, name="b")))
     obj = tracking.AutoTrackable()
     obj.__call__ = f
+    if sys.version_info.major == 3 and sys.version_info.minor < 5:
+      # TODO(allenl): figure out why this doesn't work in Python3.4
+      self.skipTest("Not working in Python 3.4")
     imported = self.cycle(obj, cycles)
     self.assertAllClose(3.,
                         imported(NamedTupleType(a=constant_op.constant(1.),
@@ -1434,12 +1482,15 @@
         return current_sum
 
     root = HasDataset()
-    self.assertEqual(3 * (1 + 4 + 9 + 16),
-                     root(constant_op.constant(3, dtype=dtypes.int64)).numpy())
+    self.assertEqual(
+        3 * (1 + 4 + 9 + 16),
+        root(constant_op.constant(3, dtype=dtypes.int64)).numpy())
     root = self.cycle(root, cycles)
-    self.assertEqual(3 * (1 + 4 + 9 + 16),
-                     root(constant_op.constant(3, dtype=dtypes.int64)).numpy())
+    self.assertEqual(
+        3 * (1 + 4 + 9 + 16),
+        root(constant_op.constant(3, dtype=dtypes.int64)).numpy())
 
+  @test_util.run_in_graph_and_eager_modes
   def test_dense_features_layer(self, cycles):
     columns = [feature_column_v2.numeric_column("x"),
                feature_column_v2.numeric_column("y")]
@@ -1447,7 +1498,7 @@
     model = sequential.Sequential([layer])
     model_input = {"x": constant_op.constant([[1.]]),
                    "y": constant_op.constant([[2.]])}
-    self.assertAllClose([[1., 2.]], model.predict(model_input))
+    self.assertAllClose([[1., 2.]], model.predict(model_input, steps=1))
     loaded = self.cycle(model, cycles)
     output, = loaded._default_save_signature(model_input).values()
     self.assertAllClose([[1., 2.]], output)
@@ -1467,6 +1518,53 @@
     loaded._default_save_signature(model_input)
     loaded.signatures["serving_default"](**model_input)
 
+  def test_multi_output_layer(self, cycles):
+
+    inp = input_layer.Input(name="inp", shape=(None,), dtype=dtypes.float32)
+
+    class _MultiOutput(base_layer.Layer):
+
+      def call(self, x):
+        return x + 1., x + 2.
+
+    out = _MultiOutput(name="out")(inp)
+    model = training_lib.Model(inp, out)
+    loaded = self.cycle(model, cycles)
+    self.assertAllClose(
+        dict(out=2., out_1=3.),
+        loaded.signatures["serving_default"](constant_op.constant(1.)))
+
+  def test_tuple_signature(self, cycles):
+    root = util.Checkpoint()
+    root.f = def_function.function(
+        lambda: (array_ops.ones([]), array_ops.zeros([])),
+        input_signature=())
+    for _ in range(cycles):
+      root = self.cycle(root, 1, signatures=root.f)
+    self.assertEqual(({"output_0": 1., "output_1": 0.}),
+                     self.evaluate(root.signatures["serving_default"]()))
+
+  def test_model_with_custom_function_attached(self, cycles):
+    root = util.Checkpoint(model=sequential.Sequential([core.Dense(2)]))
+
+    @def_function.function
+    def _use_sequential(x):
+      return root.model.call(x)
+
+    root.model.traced_call = _use_sequential
+
+    original = root.model.traced_call(array_ops.zeros([1, 1])).numpy()
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(
+        original,
+        root.model.traced_call(array_ops.zeros([1, 1])).numpy())
+
+  def test_version_info(self, cycles):
+    root = util.Checkpoint()
+    root = self.cycle(root, cycles)
+    self.assertEqual(versions.__version__, root.tensorflow_version)
+    self.assertEqual(versions.__git_version__, root.tensorflow_git_version)
+
   def test_functional_model_with_conv(self, cycles):
     x = input_layer.Input(name="x", shape=(None, None, 3), dtype=dtypes.float32)
     conved = convolutional.Conv2D(filters=3, kernel_size=3, dilation_rate=2)(x)
diff --git a/tensorflow/python/saved_model/load_v1_in_v2.py b/tensorflow/python/saved_model/load_v1_in_v2.py
index 3c62e9c..3d076c5 100644
--- a/tensorflow/python/saved_model/load_v1_in_v2.py
+++ b/tensorflow/python/saved_model/load_v1_in_v2.py
@@ -20,17 +20,21 @@
 
 import functools
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import signature_serialization
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.training.tracking import tracking
 
 
-class _Initializer(tracking.TrackableResource):
+class _Initializer(tracking.CapturableResource):
   """Represents an initialization operation restored from a SavedModel.
 
   Without this object re-export of imported 1.x SavedModels would omit the
@@ -54,7 +58,7 @@
         dtype=dtypes.resource, shape=[], name="unused_resource")
 
   def _initialize(self):
-    self._init_fn(*[path.asset_path for path in self._asset_paths])
+    return self._init_fn(*[path.asset_path for path in self._asset_paths])
 
 
 class _EagerSavedModelLoader(loader_impl.SavedModelLoader):
@@ -88,11 +92,21 @@
     """Restores variables from the checkpoint."""
     if saver is not None:
       saver_def = saver.saver_def
+      filename_tensor = wrapped.graph.as_graph_element(
+          saver_def.filename_tensor_name)
+      # We both feed and fetch filename_tensor so we have an operation to use to
+      # feed into variable initializers (only relevant for v1 graph building).
       restore_fn = wrapped.prune(
-          feeds=[wrapped.graph.as_graph_element(
-              saver_def.filename_tensor_name)],
-          fetches=[wrapped.graph.as_graph_element(saver_def.restore_op_name)])
-      restore_fn(constant_op.constant(self._variables_path))
+          feeds=[filename_tensor],
+          fetches=[filename_tensor,
+                   wrapped.graph.as_graph_element(saver_def.restore_op_name)])
+      initializer, _ = restore_fn(constant_op.constant(self._variables_path))
+      if not ops.executing_eagerly_outside_functions():
+        for variable in wrapped.graph.get_collection_ref(
+            ops.GraphKeys.GLOBAL_VARIABLES):
+          # pylint: disable=protected-access
+          variable._initializer_op = initializer
+          # pylint: enable=protected-access
 
   def _extract_signatures(self, wrapped, meta_graph_def):
     """Creates ConcreteFunctions for signatures in `meta_graph_def`."""
@@ -104,11 +118,27 @@
         input_names = []
         input_specs = []
       # TODO(allenl): Support optional arguments
-      signature_fn = wrapped.prune(
-          feeds=[wrapped.graph.as_graph_element(inp.name)
-                 for inp in input_specs],
-          fetches={name: wrapped.graph.as_graph_element(out.name)
-                   for name, out in signature_def.outputs.items()})
+      feeds = [wrapped.graph.as_graph_element(inp.name)
+               for inp in input_specs]
+      fetches = {name: wrapped.graph.as_graph_element(out.name)
+                 for name, out in signature_def.outputs.items()}
+      try:
+        signature_fn = wrapped.prune(feeds=feeds, fetches=fetches)
+      except lift_to_graph.UnliftableError as ex:
+        # Mutate the exception to add a bit more detail.
+        args = ex.args
+        if not args:
+          message = ""
+        else:
+          message = args[0]
+        message = (
+            ("A SavedModel signature needs an input for each placeholder the "
+             "signature's outputs use. An output for signature '{}' depends on "
+             "a placeholder which is not an input (i.e. the placeholder is not "
+             "fed a value).\n\n").format(signature_key)
+            + message)
+        ex.args = (message,) + args[1:]
+        raise
       # pylint: disable=protected-access
       signature_fn._arg_keywords = input_names
       if len(input_names) == 1:
@@ -131,29 +161,46 @@
     saver, = load_graph_returns
     self.restore_variables(wrapped, saver)
     with wrapped.graph.as_default():
-      init_op = loader_impl.get_init_op(meta_graph_def)
+      init_op = loader_impl.get_init_op(
+          meta_graph_def) or monitored_session.Scaffold.default_local_init_op()
+      # Add a dummy Tensor we know we can fetch to add control dependencies to.
+      init_anchor = constant_op.constant(0., name="dummy_fetch")
+
     root = tracking.AutoTrackable()
-    if init_op is not None:
-      asset_feed_tensors = []
-      asset_paths = []
-      for tensor_name, value in loader_impl.get_asset_tensors(
-          self._export_dir, meta_graph_def).items():
-        asset_feed_tensors.append(wrapped.graph.as_graph_element(tensor_name))
-        asset_paths.append(tracking.TrackableAsset(value))
-      init_fn = wrapped.prune(
-          feeds=asset_feed_tensors,
-          fetches=[wrapped.graph.as_graph_element(init_op)])
-      initializer = _Initializer(init_fn, asset_paths)
-      initializer._initialize()  # pylint: disable=protected-access
-      root.initializer = initializer
-      root.asset_paths = asset_paths
-    else:
-      root.asset_paths = []
+    asset_feed_tensors = []
+    asset_paths = []
+    for tensor_name, value in loader_impl.get_asset_tensors(
+        self._export_dir, meta_graph_def).items():
+      asset_feed_tensors.append(wrapped.graph.as_graph_element(tensor_name))
+      asset_paths.append(tracking.TrackableAsset(value))
+    init_fn = wrapped.prune(
+        feeds=asset_feed_tensors,
+        fetches=[init_anchor, wrapped.graph.as_graph_element(init_op)])
+    initializer = _Initializer(init_fn, asset_paths)
+    # pylint: disable=protected-access
+    local_init_op, _ = initializer._initialize()
+    # pylint: enable=protected-access
+    with ops.init_scope():
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, local_init_op)
+        for variable in wrapped.graph.get_collection_ref(
+            ops.GraphKeys.LOCAL_VARIABLES):
+          # pylint: disable=protected-access
+          variable._initializer_op = local_init_op
+          # pylint: enable=protected-access
+    root.initializer = initializer
+    root.asset_paths = asset_paths
     signature_functions = self._extract_signatures(wrapped, meta_graph_def)
 
     root.signatures = signature_serialization.create_signature_map(
         signature_functions)
     root.variables = list(wrapped.graph.variables)
+    root.tensorflow_version = (
+        meta_graph_def.meta_info_def.tensorflow_version)
+    root.tensorflow_git_version = (
+        meta_graph_def.meta_info_def.tensorflow_git_version)
+    root.graph = wrapped.graph
+    root.prune = wrapped.prune
     return root
 
 
@@ -161,3 +208,4 @@
   """Load a v1-style SavedModel as an object."""
   loader = _EagerSavedModelLoader(export_dir)
   return loader.load(tags=tags)
+
diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py
index 43c79f2..670e00e 100644
--- a/tensorflow/python/saved_model/load_v1_in_v2_test.py
+++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py
@@ -23,10 +23,13 @@
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.framework import versions
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -49,7 +52,7 @@
     export_graph = ops.Graph()
     with export_graph.as_default():
       start = array_ops.placeholder(
-          shape=[None], dtype=dtypes.float32, name="start")
+          shape=None, dtype=dtypes.float32, name="start")
       if use_resource:
         distractor = variables.RefVariable(-1., name="distractor")
         v = resource_variable_ops.ResourceVariable(3., name="v")
@@ -79,17 +82,20 @@
             legacy_init_op=local_variable.initializer)
     return path
 
+  @test_util.run_in_graph_and_eager_modes
   def test_resource_variable_import(self):
     imported = load.load(self._v1_single_metagraph_saved_model(
         use_resource=True))
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.local_variables_initializer())
     fn = imported.signatures["serving_default"]
     self.assertEqual({"output": 6.},
                      self.evaluate(fn(constant_op.constant(2.))))
     self.assertAllEqual([3., 1.], self.evaluate(imported.variables))
-    imported.variables[0].assign(4.)
+    self.evaluate(imported.variables[0].assign(4.))
     self.assertEqual({"output": 8.},
                      self.evaluate(fn(start=constant_op.constant(2.))))
-    imported.variables[1].assign(2.)
+    self.evaluate(imported.variables[1].assign(2.))
     self.assertEqual({"output": 24.},
                      self.evaluate(fn(start=constant_op.constant(3.))))
     self.assertTrue(imported.variables[0].trainable)
@@ -97,7 +103,9 @@
     with backprop.GradientTape() as tape:
       output = fn(start=constant_op.constant(4.))
     self.assertEqual(imported.variables[:1], list(tape.watched_variables()))
-    self.assertEqual(8., tape.gradient(output, imported.variables[0]).numpy())
+    self.assertEqual(
+        8.,
+        self.evaluate(tape.gradient(output, imported.variables[0])))
 
   def test_ref_variable_import(self):
     saved = self._v1_single_metagraph_saved_model(use_resource=False)
@@ -183,9 +191,11 @@
     file_io.delete_file(vocab_path)
     return path
 
+  @test_util.run_in_graph_and_eager_modes
   def test_asset_loading(self):
     first_path = self._v1_asset_saved_model()
     imported = load.load(first_path)
+    self.evaluate(lookup_ops.tables_initializer())
     fn = imported.signatures["serving_default"]
     self.assertAllClose({"output": [2, 0]},
                         fn(start=constant_op.constant(["gamma", "alpha"])))
@@ -193,7 +203,9 @@
                                str(ops.uid()))
     save.save(imported, second_path, signatures=imported.signatures)
     shutil.rmtree(first_path)
+    del ops.get_collection_ref(ops.GraphKeys.TABLE_INITIALIZERS)[:]
     second_import = load.load(second_path)
+    self.evaluate(lookup_ops.tables_initializer())
     fn = second_import.signatures["serving_default"]
     self.assertAllClose({"output": [2, 0]},
                         fn(start=constant_op.constant(["gamma", "alpha"])))
@@ -202,7 +214,9 @@
                               str(ops.uid()))
     save.save(second_import, third_path, signatures=second_import.signatures)
     shutil.rmtree(second_path)
+    del ops.get_collection_ref(ops.GraphKeys.TABLE_INITIALIZERS)[:]
     third_import = load.load(third_path)
+    self.evaluate(lookup_ops.tables_initializer())
     fn = third_import.signatures["serving_default"]
     self.assertAllClose({"output": [2, 0]},
                         fn(start=constant_op.constant(["gamma", "alpha"])))
@@ -293,8 +307,8 @@
   def _no_signatures_model(self):
     export_graph = ops.Graph()
     with export_graph.as_default():
-      array_ops.placeholder(name="x", shape=[], dtype=dtypes.float32)
-
+      inp = array_ops.placeholder(name="x", shape=[], dtype=dtypes.float32)
+      array_ops.identity(inp + 1., name="out")
       with session_lib.Session() as session:
         path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
         b = builder_impl.SavedModelBuilder(path)
@@ -333,5 +347,43 @@
     imported = load.load(path)
     self.assertEqual([2], imported.signatures["key"]()["value"].shape)
 
+  def test_version_info(self):
+    path = self._signature_with_no_inputs()
+    imported = load.load(path)
+    self.assertEqual(versions.__version__, imported.tensorflow_version)
+    self.assertEqual(versions.__git_version__,
+                     imported.tensorflow_git_version)
+
+  def _unfed_placeholder_signature(self):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      x = array_ops.placeholder(name="x", shape=[], dtype=dtypes.float32)
+      output = x * random_ops.random_normal([2])
+      with session_lib.Session() as session:
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        b = builder_impl.SavedModelBuilder(path)
+        b.add_meta_graph_and_variables(
+            session,
+            tags=[tag_constants.SERVING],
+            signature_def_map={
+                "key": signature_def_utils.build_signature_def(
+                    {}, dict(value=utils_impl.build_tensor_info(output)))})
+        b.save()
+    return path
+
+  def test_unfed_placeholder_exception(self):
+    path = self._unfed_placeholder_signature()
+    with self.assertRaisesRegexp(
+        lift_to_graph.UnliftableError,
+        "signature needs an input for each placeholder.*\n\nUnable to lift"):
+      load.load(path)
+
+  def test_custom_pruning(self):
+    path = self._no_signatures_model()
+    root = load.load(path)
+    fn = root.prune("x:0", "out:0")
+    self.assertEqual(2., self.evaluate(fn(x=array_ops.ones([]))))
+    root.graph.as_graph_element("x:0")
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD
index 493574a..7c2a5d0 100644
--- a/tensorflow/python/saved_model/model_utils/BUILD
+++ b/tensorflow/python/saved_model/model_utils/BUILD
@@ -49,6 +49,7 @@
 py_test(
     name = "export_output_test",
     srcs = ["export_output_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":export_output",
@@ -83,6 +84,7 @@
 py_test(
     name = "export_test",
     srcs = ["export_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":export_utils",
@@ -109,6 +111,7 @@
 py_test(
     name = "mode_keys_test",
     srcs = ["mode_keys_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":mode_keys",
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index 3ad6862..602e2e3 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -220,8 +220,11 @@
         asset_filename_map={},
         asset_index={})
     for node_id, obj in enumerate(self.nodes):
-      if isinstance(obj, tracking.TrackableResource):
-        new_resource = obj._create_resource()  # pylint: disable=protected-access
+      if isinstance(obj, tracking.CapturableResource):
+        # pylint: disable=protected-access
+        with ops.device(obj._resource_device):
+          new_resource = obj._create_resource()
+        # pylint: enable=protected-access
         resource_map[obj.resource_handle] = new_resource
         self.captured_tensor_node_ids[obj.resource_handle] = node_id
       elif resource_variable_ops.is_resource_variable(obj):
@@ -421,7 +424,7 @@
 
 
 def _trace_resource_initializers(accessible_objects):
-  """Create concrete functions from `TrackableResource` objects."""
+  """Create concrete functions from `CapturableResource` objects."""
   resource_initializers = []
 
   def _wrap_initializer(obj):
@@ -432,7 +435,7 @@
     return lambda: _wrap_initializer(obj)
 
   for obj in accessible_objects:
-    if isinstance(obj, tracking.TrackableResource):
+    if isinstance(obj, tracking.CapturableResource):
       resource_initializers.append(def_function.function(
           _wrap_obj_initializer(obj),
           # All inputs are captures.
@@ -605,8 +608,8 @@
         function_serialization.serialize_bare_concrete_function(obj))
   elif isinstance(obj, _CapturedConstant):
     proto.constant.operation = obj.graph_tensor.op.name
-  elif isinstance(obj, tracking.TrackableResource):
-    proto.resource.SetInParent()
+  elif isinstance(obj, tracking.CapturableResource):
+    proto.resource.device = obj._resource_device  # pylint: disable=protected-access
   else:
     registered_type_proto = revived_types.serialize(obj)
     if registered_type_proto is None:
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index ace3033..14cb5ab 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -33,6 +33,7 @@
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
+from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.lib.io import file_io
@@ -119,6 +120,10 @@
         _import_and_infer(
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
+  def test_unbuilt_model_does_not_prevent_saving(self):
+    root = util.Checkpoint(model=sequential.Sequential([core.Dense(2)]))
+    save.save(root, os.path.join(self.get_temp_dir(), "saved_model"))
+
   def test_version_information_included(self):
     root = tracking.AutoTrackable()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -330,6 +335,9 @@
     save.save(root, save_dir)
 
   def test_function_with_captured_dataset(self):
+    if test_util.is_gpu_available():
+      self.skipTest("Currently broken when a GPU is available.")
+
     class HasDataset(module.Module):
 
       def __init__(self):
diff --git a/tensorflow/python/saved_model/signature_serialization.py b/tensorflow/python/saved_model/signature_serialization.py
index 0cd64ee..dfb2d45 100644
--- a/tensorflow/python/saved_model/signature_serialization.py
+++ b/tensorflow/python/saved_model/signature_serialization.py
@@ -136,7 +136,7 @@
 def _is_flat(sequence):
   sequence_flat = nest.flatten(sequence)
   try:
-    nest.assert_same_structure(sequence_flat, sequence)
+    nest.assert_same_structure(sequence_flat, sequence, check_types=False)
     return True
   except ValueError:
     return False
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index e483155..b66b38a 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -162,8 +162,8 @@
 )
 
 py_library(
-    name = "component_api_helper",
-    srcs = ["component_api_helper.py"],
+    name = "module_util",
+    srcs = ["module_util.py"],
     srcs_version = "PY2AND3",
 )
 
diff --git a/tensorflow/python/tools/component_api_helper.py b/tensorflow/python/tools/component_api_helper.py
deleted file mode 100644
index 97f4671..0000000
--- a/tensorflow/python/tools/component_api_helper.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Helper functions to help integrate TensorFlow components into TF API.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import importlib
-import os
-
-
-def package_hook(parent_package_str, child_package_str, error_msg=None):
-  """Used to hook in an external package into the TensorFlow namespace.
-
-  Example usage:
-  ### tensorflow/__init__.py
-  from tensorflow.python.tools import component_api_helper
-  component_api_helper.package_hook(
-      'tensorflow', 'tensorflow_estimator.python')
-  component_api_helper(
-      'tensorflow.contrib', 'tensorflow_estimator.contrib.python')
-  del component_api_helper
-
-  TODO(mikecase): This function has a minor issue, where if the child package
-  does not exist alone in its directory, sibling packages to it will also be
-  accessible from the parent. This is because we just add
-  `child_pkg.__file__/..` to the subpackage search path. This should not be
-  a big issue because of how our API generation scripts work (the child package
-  we are hooking up should always be alone). But there might be a better way
-  of doing this.
-
-  Args:
-    parent_package_str: Parent package name as a string such as 'tensorflow' or
-      'tensorflow.contrib'. This will become the parent package for the
-      component package being hooked in.
-    child_package_str: Child package name as a string such as
-      'tensorflow_estimator.python'. This package will be added as a subpackage
-      of the parent.
-    error_msg: Message to print if child package cannot be found.
-  """
-  parent_pkg = importlib.import_module(parent_package_str)
-  try:
-    child_pkg = importlib.import_module(child_package_str)
-  except ImportError:
-    if error_msg:
-      print(error_msg)
-    return
-
-  def set_child_as_subpackage():
-    """Sets child package as a subpackage of parent package.
-
-    Will allow the following import statement to work.
-    >>> import parent.child
-    """
-    child_pkg_path = [os.path.abspath(
-        os.path.join(os.path.dirname(child_pkg.__file__), ".."))]
-    try:
-      parent_pkg.__path__ = child_pkg_path + parent_pkg.__path__
-    except AttributeError:
-      parent_pkg.__path__ = child_pkg_path
-
-  def set_child_as_attr():
-    """Sets child package as a attr of the parent package.
-
-    Will allow for the following.
-    >>> import parent
-    >>> parent.child
-    """
-    child_pkg_attr_name = child_pkg.__name__.split(".")[-1]
-    setattr(parent_pkg, child_pkg_attr_name, child_pkg)
-
-  set_child_as_subpackage()
-  set_child_as_attr()
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 82e1b5d..e955e7b 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -125,12 +125,12 @@
   # 'input_checkpoint' may be a prefix if we're using Saver V2 format
   if (not input_saved_model_dir and
       not checkpoint_management.checkpoint_exists(input_checkpoint)):
-    print("Input checkpoint '" + input_checkpoint + "' doesn't exist!")
-    return -1
+    raise ValueError("Input checkpoint '" + input_checkpoint +
+                     "' doesn't exist!")
 
   if not output_node_names:
-    print("You need to supply the name of a node to --output_node_names.")
-    return -1
+    raise ValueError(
+        "You need to supply the name of a node to --output_node_names.")
 
   # Remove all the explicit device specifications for this node. This helps to
   # make the graph more portable.
@@ -193,14 +193,14 @@
         # tensors. Partition variables are Identity tensors that cannot be
         # handled by Saver.
         if has_partition_var:
-          print("Models containing partition variables cannot be converted "
-                "from checkpoint files. Please pass in a SavedModel using "
-                "the flag --input_saved_model_dir.")
-          return -1
+          raise ValueError(
+              "Models containing partition variables cannot be converted "
+              "from checkpoint files. Please pass in a SavedModel using "
+              "the flag --input_saved_model_dir.")
         # Models that have been frozen previously do not contain Variables.
         elif _has_no_variables(sess):
-          print("No variables were found in this model. It is likely the model "
-                "was frozen previously. You cannot freeze a graph twice.")
-          return 0
+          raise ValueError(
+              "No variables were found in this model. It is likely the model "
+              "was frozen previously. You cannot freeze a graph twice.")
         else:
           raise e
@@ -242,8 +242,7 @@
 def _parse_input_graph_proto(input_graph, input_binary):
   """Parses input tensorflow graph into GraphDef proto."""
   if not gfile.Exists(input_graph):
-    print("Input graph file '" + input_graph + "' does not exist!")
-    return -1
+    raise IOError("Input graph file '" + input_graph + "' does not exist!")
   input_graph_def = graph_pb2.GraphDef()
   mode = "rb" if input_binary else "r"
   with gfile.GFile(input_graph, mode) as f:
@@ -257,8 +256,7 @@
 def _parse_input_meta_graph_proto(input_graph, input_binary):
   """Parses input tensorflow graph into MetaGraphDef proto."""
   if not gfile.Exists(input_graph):
-    print("Input meta graph file '" + input_graph + "' does not exist!")
-    return -1
+    raise IOError("Input meta graph file '" + input_graph + "' does not exist!")
   input_meta_graph_def = MetaGraphDef()
   mode = "rb" if input_binary else "r"
   with gfile.GFile(input_graph, mode) as f:
@@ -273,8 +271,7 @@
 def _parse_input_saver_proto(input_saver, input_binary):
   """Parses input tensorflow Saver into SaverDef proto."""
   if not gfile.Exists(input_saver):
-    print("Input saver file '" + input_saver + "' does not exist!")
-    return -1
+    raise IOError("Input saver file '" + input_saver + "' does not exist!")
   mode = "rb" if input_binary else "r"
   with gfile.GFile(input_saver, mode) as f:
     saver_def = saver_pb2.SaverDef()
@@ -369,9 +366,8 @@
   elif flags.checkpoint_version == 2:
     checkpoint_version = saver_pb2.SaverDef.V2
   else:
-    print("Invalid checkpoint version (must be '1' or '2'): %d" %
-          flags.checkpoint_version)
-    return -1
+    raise ValueError("Invalid checkpoint version (must be '1' or '2'): %d" %
+                     flags.checkpoint_version)
   freeze_graph(flags.input_graph, flags.input_saver, flags.input_binary,
                flags.input_checkpoint, flags.output_node_names,
                flags.restore_op_name, flags.filename_tensor_name,
@@ -380,7 +376,9 @@
                flags.input_meta_graph, flags.input_saved_model_dir,
                flags.saved_model_tags, checkpoint_version)
 
+
 def run_main():
+  """Main function of freeze_graph."""
   parser = argparse.ArgumentParser()
   parser.register("type", "bool", lambda v: v.lower() == "true")
   parser.add_argument(
@@ -487,5 +485,6 @@
   my_main = lambda unused_args: main(unused_args, flags)
   app.run(main=my_main, argv=[sys.argv[0]] + unparsed)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
   run_main()
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index d7edf4e..0d054c0 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -316,17 +316,17 @@
     output_node_names = "save/restore_all"
     output_graph_path = os.path.join(self.get_temp_dir(), output_graph_name)
 
-    return_value = freeze_graph.freeze_graph_with_def_protos(
-        input_graph_def=sess.graph_def,
-        input_saver_def=None,
-        input_checkpoint=checkpoint_path,
-        output_node_names=output_node_names,
-        restore_op_name="save/restore_all",  # default value
-        filename_tensor_name="save/Const:0",  # default value
-        output_graph=output_graph_path,
-        clear_devices=False,
-        initializer_nodes="")
-    self.assertTrue(return_value, -1)
+    with self.assertRaises(ValueError):
+      freeze_graph.freeze_graph_with_def_protos(
+          input_graph_def=sess.graph_def,
+          input_saver_def=None,
+          input_checkpoint=checkpoint_path,
+          output_node_names=output_node_names,
+          restore_op_name="save/restore_all",  # default value
+          filename_tensor_name="save/Const:0",  # default value
+          output_graph=output_graph_path,
+          clear_devices=False,
+          initializer_nodes="")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 7b35751..288abf9 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 import argparse
+import re
 import sys
 
 import numpy as np
@@ -29,15 +30,33 @@
 FLAGS = None
 
 
-def _count_total_params(reader):
+def _count_total_params(reader, count_exclude_pattern=""):
   """Count total number of variables."""
   var_to_shape_map = reader.get_variable_to_shape_map()
+
+  # Filter out tensors that we don't want to count
+  if count_exclude_pattern:
+    regex_pattern = re.compile(count_exclude_pattern)
+    new_var_to_shape_map = {}
+    exclude_num_tensors = 0
+    exclude_num_params = 0
+    for v in var_to_shape_map:
+      if regex_pattern.search(v):
+        exclude_num_tensors += 1
+        exclude_num_params += np.prod(var_to_shape_map[v])
+      else:
+        new_var_to_shape_map[v] = var_to_shape_map[v]
+    var_to_shape_map = new_var_to_shape_map
+    print("# Excluding %d tensors (%d params) that match %s when counting." % (
+        exclude_num_tensors, exclude_num_params, count_exclude_pattern))
+
   var_sizes = [np.prod(var_to_shape_map[v]) for v in var_to_shape_map]
   return np.sum(var_sizes, dtype=int)
 
 
 def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
-                                     all_tensor_names=False):
+                                     all_tensor_names=False,
+                                     count_exclude_pattern=""):
   """Prints tensors in a checkpoint file.
 
   If no `tensor_name` is provided, prints the tensor names and shapes
@@ -50,6 +69,7 @@
     tensor_name: Name of the tensor in the checkpoint file to print.
     all_tensors: Boolean indicating whether to print all tensors.
     all_tensor_names: Boolean indicating whether to print all tensor names.
+    count_exclude_pattern: Regex pattern; matching tensors are not counted.
   """
   try:
     reader = pywrap_tensorflow.NewCheckpointReader(file_name)
@@ -66,7 +86,8 @@
       print(reader.get_tensor(tensor_name))
 
     # Count total number of parameters
-    print("# Total number of params: %d" % _count_total_params(reader))
+    print("# Total number of params: %d" % _count_total_params(
+        reader, count_exclude_pattern=count_exclude_pattern))
   except Exception as e:  # pylint: disable=broad-except
     print(str(e))
     if "corrupted compressed block contents" in str(e):
@@ -124,8 +145,10 @@
           "[--printoptions]")
     sys.exit(1)
   else:
-    print_tensors_in_checkpoint_file(FLAGS.file_name, FLAGS.tensor_name,
-                                     FLAGS.all_tensors, FLAGS.all_tensor_names)
+    print_tensors_in_checkpoint_file(
+        FLAGS.file_name, FLAGS.tensor_name,
+        FLAGS.all_tensors, FLAGS.all_tensor_names,
+        count_exclude_pattern=FLAGS.count_exclude_pattern)
 
 
 if __name__ == "__main__":
@@ -144,6 +167,11 @@
       default="",
       help="Name of the tensor to inspect")
   parser.add_argument(
+      "--count_exclude_pattern",
+      type=str,
+      default="",
+      help="Pattern to exclude tensors, e.g., from optimizers, when counting.")
+  parser.add_argument(
       "--all_tensors",
       nargs="?",
       const=True,
diff --git a/tensorflow/tools/docker/simple_console.py b/tensorflow/python/tools/module_util.py
similarity index 71%
rename from tensorflow/tools/docker/simple_console.py
rename to tensorflow/python/tools/module_util.py
index 106528b..26604d9 100644
--- a/tensorflow/tools/docker/simple_console.py
+++ b/tensorflow/python/tools/module_util.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,22 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-"""Start a simple interactive console with TensorFlow available."""
-
+"""Helper functions for modules."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import code
-import sys
+import os
 
 
-def main(_):
-  """Run an interactive console."""
-  code.interact()
-  return 0
+def get_parent_dir(module):
+  """Returns the absolute path of the directory above `module`'s directory.
+
+  Args:
+    module: An imported module object; its `__file__` attribute is read.
+
+  Returns:
+    Absolute path of the parent of the directory holding `module.__file__`.
+  """
+  return os.path.abspath(os.path.join(os.path.dirname(module.__file__), ".."))
-
-
-
-if __name__ == '__main__':
-  sys.exit(main(sys.argv))
diff --git a/tensorflow/python/tools/strip_unused_lib.py b/tensorflow/python/tools/strip_unused_lib.py
index decd7e2..a2766be 100644
--- a/tensorflow/python/tools/strip_unused_lib.py
+++ b/tensorflow/python/tools/strip_unused_lib.py
@@ -80,7 +80,7 @@
       inputs_replaced_graph_def.node.extend([copy.deepcopy(node)])
 
   if not_found:
-    raise KeyError("The following input nodes were not found: %s\n" % not_found)
+    raise KeyError("The following input nodes were not found: %s" % not_found)
 
   output_graph_def = graph_util.extract_sub_graph(inputs_replaced_graph_def,
                                                   output_node_names)
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index 85c0ad9..171d44c 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -236,7 +236,10 @@
         "//tensorflow/python:framework",
         "//tensorflow/python:layers",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_oss",  # TODO(b/131157871): Reenable in OSS when fixed
+        "no_windows",  # TODO: needs investigation on Windows
+    ],
 )
 
 tf_py_test(
@@ -273,30 +276,6 @@
 )
 
 tf_py_test(
-    name = "tpu_config_test",
-    size = "small",
-    srcs = ["tpu_config_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_estimator_signals_test",
-    size = "small",
-    srcs = ["tpu_estimator_signals_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-    # TODO(jhseu): Remove. Fails in OSS on Python 3.
-    tags = ["no_oss"],
-)
-
-tf_py_test(
     name = "topology_test",
     size = "medium",
     srcs = ["topology_test.py"],
diff --git a/tensorflow/python/tpu/_tpu_estimator_embedding.py b/tensorflow/python/tpu/_tpu_estimator_embedding.py
index 4a832db..d85aae6 100644
--- a/tensorflow/python/tpu/_tpu_estimator_embedding.py
+++ b/tensorflow/python/tpu/_tpu_estimator_embedding.py
@@ -1,366 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""Tooling for support TPU embedding in TPUEstimator."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.feature_column import feature_column as core_fc
-from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import math_ops
-from tensorflow.python.tpu import feature_column as tpu_fc
-from tensorflow.python.tpu import tpu_embedding
-from tensorflow.python.tpu.tpu_embedding import AdagradParameters
-from tensorflow.python.tpu.tpu_embedding import AdamParameters
-from tensorflow.python.tpu.tpu_embedding import StochasticGradientDescentParameters
-from tensorflow.python.training import training
-
-# pylint: disable=protected-access
-_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
-                                 tpu_fc._TPUSharedEmbeddingColumn)
-_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
-                             core_fc_lib.EmbeddingColumn,
-                             core_fc._SharedEmbeddingColumn)
-_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
-_SUPPORTED_OPTIMIZERS = (AdagradParameters, AdamParameters,
-                         StochasticGradientDescentParameters)
-
-# pylint: enable=protected-access
-
-_TABLE_NAME_PREFIX = 'tbl_'
-_LEN_TABLE_NAME_PREFIX = len(_TABLE_NAME_PREFIX)
-
-
-def _get_table_name_from_embedding_var_name(embedding_var_name):
-  return '{}{}'.format(_TABLE_NAME_PREFIX, embedding_var_name)
-
-
-def _get_embedding_var_name_from_table_name(table_name):
-  return table_name[_LEN_TABLE_NAME_PREFIX:]
-
-
-def _get_embedding_variable_name(scope_name, var_name):
-  return '{}/{}'.format(scope_name, var_name)
-
-
-def _get_slot_variable_names(scope_name, var_name, optimization_parameters):
-  """Return embedding variable names which are consistent with CPU runs."""
-  if isinstance(optimization_parameters, tpu_embedding.AdagradParameters):
-    return tpu_embedding.AdagradSlotVariableName(
-        '{}/{}/Adagrad'.format(scope_name, var_name)
-    )
-  elif isinstance(optimization_parameters, tpu_embedding.AdamParameters):
-    return tpu_embedding.AdamSlotVariableNames(
-        '{}/{}/Adam/m'.format(scope_name, var_name),
-        '{}/{}/Adam/v'.format(scope_name, var_name)
-    )
-  elif isinstance(optimization_parameters,
-                  tpu_embedding.StochasticGradientDescentParameters):
-    return None
-  else:
-    raise ValueError('Support to infer full variable name '
-                     'for optimization_parameter {} has not been added.'
-                     .format(optimization_parameters))
-
-
-def get_full_variable_names(
-    graph, table_to_config_dict, optimization_parameters=None):
-  """Return embedding variable names and slot variables which are consistent with CPU runs."""
-  collection = graph.get_collection_ref(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
-  if not collection:
-    raise RuntimeError(
-        'Embedding feature column did not capture any thing. Make sure the '
-        'feature columns passed to TPUEstimator constructor is properly '
-        'used in model_fn.')
-
-  embedding_variable_name_by_table = {}
-  slot_variable_names_by_table = {}
-  for table_name in table_to_config_dict:
-    embedding_var_name = _get_embedding_var_name_from_table_name(table_name)
-    (scope_name, var_name) = collection[0][embedding_var_name]
-    embedding_variable_name_by_table[table_name] = (
-        _get_embedding_variable_name(scope_name, var_name))
-    if optimization_parameters:
-      slot_variable_names_by_table[table_name] = _get_slot_variable_names(
-          scope_name, var_name, optimization_parameters)
-
-  graph.clear_collection(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
-  return embedding_variable_name_by_table, slot_variable_names_by_table
-
-
-def get_configs_from_feature_columns(feature_columns):
-  """Create configs for TPUEmbedding etc from a list of feature columns.
-
-  Args:
-    feature_columns: a list of supported feature columns.
-
-  Returns:
-    A tuple of dicts, the first maps tables to their config, the second maps
-    features to their config, and the third maps features to weight key names.
-  """
-
-  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
-
-  for column in feature_columns:
-    if not isinstance(column, allowed):
-      raise TypeError(
-          'Unsupported feature column {}. Supported types are {}.'.format(
-              type(column), allowed))
-
-  table_to_config = {}
-  feature_to_config = {}
-  feature_to_weight_key_name = {}
-  for column in feature_columns:
-    feature_name = column.get_feature_key_name()
-    table_name = _get_table_name_from_embedding_var_name(
-        column.get_embedding_var_name())
-    if feature_name in feature_to_config:
-      raise ValueError(
-          'Feature column {} is used with multiple embeddings and this is '
-          'not supported.'.format(feature_name))
-    feature_to_config[feature_name] = tpu_embedding.FeatureConfig(
-        table_id=table_name)
-    feature_to_weight_key_name[feature_name] = column.get_weight_key_name()
-    vocabulary_size, dimension = column.get_embedding_table_size()
-    table_to_config[table_name] = tpu_embedding.TableConfig(
-        vocabulary_size=vocabulary_size,
-        dimension=dimension,
-        initializer=column.get_initializer(),
-        combiner=column.get_combiner())
-
-  return table_to_config, feature_to_config, feature_to_weight_key_name
-
-
-class EmbeddingConfigSpec(
-    collections.namedtuple('EmbeddingConfigSpec', [
-        'feature_columns', 'optimization_parameters', 'clipping_limit',
-        'pipeline_execution_with_tensor_core',
-        'experimental_gradient_multiplier_fn'
-    ])):
-  """Class to keep track of embedding config specification."""
-
-  def __new__(cls,
-              feature_columns,
-              optimization_parameters,
-              clipping_limit=None,
-              pipeline_execution_with_tensor_core=False,
-              experimental_gradient_multiplier_fn=None):
-    """Creates an EmbeddingConfigSpec instance.
-
-    Args:
-      feature_columns: All `FeatureColumn`s used by model.
-      optimization_parameters: An instance of `AdagradParameters`,
-        `AdamParameters` or `StochasticGradientDescentParameters`. This
-        optimizer will be applied to all embedding variables specified by
-        `feature_columns`.
-      clipping_limit: (Optional) Clipping limit (absolute value).
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding IDs. Please see
-        `tpu_embedding_configuration.proto` for details.
-      experimental_gradient_multiplier_fn: (Optional) A Fn taking global step as
-        input returning the current multiplier for all embedding gradients.
-
-    Returns:
-      An EmbeddingConfigSpec instance.
-
-    Raises:
-      ValueError: If the feature_columns are not specified.
-      TypeError: If the feature columns are not of ths correct type (one of
-        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
-        _EMBEDDING_COLUMN_CLASSES).
-      ValueError: If `optimization_parameters` is not one of the required types.
-    """
-    if not feature_columns:
-      raise ValueError('`feature_columns` cannot be `None` or empty.')
-
-    # It is unknown at this moment, whether the TPUEstimator is running in CPU
-    # or TPU mode. So allow non-TPU embedding columns also.
-    supported_classes = tuple(
-        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
-        list(_EMBEDDING_COLUMN_CLASSES))
-
-    for column in feature_columns:
-      if not isinstance(column, supported_classes):
-        raise TypeError(
-            'All feature columns must be supported types in {}. Got {}'.format(
-                supported_classes, type(column)))
-
-    if not isinstance(optimization_parameters, _SUPPORTED_OPTIMIZERS):
-      raise ValueError('optimization_parameters must be an instance of type '
-                       '{}. Got {}.'.format(_SUPPORTED_OPTIMIZERS,
-                                            type(optimization_parameters)))
-
-    return super(EmbeddingConfigSpec, cls).__new__(
-        cls,
-        feature_columns=feature_columns,
-        optimization_parameters=optimization_parameters,
-        clipping_limit=clipping_limit,
-        pipeline_execution_with_tensor_core=pipeline_execution_with_tensor_core,
-        experimental_gradient_multiplier_fn=experimental_gradient_multiplier_fn)
-
-
-class EmbeddingConfig(object):
-  """This is the internal immutable object for embedding config.
-
-  `_EmbeddingConfig` is responsible to _translate_ user provided
-  `EmbeddingConfigSpec` to internal data structures, mostly constructor
-  arguments of `TPUEmbedding`.
-  """
-
-  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
-               num_hosts, num_cores, run_config):
-    if not embedding_config_spec:
-      raise ValueError('embedding_config_spec cannot be None.')
-
-    self._embedding_config_spec = embedding_config_spec
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._num_hosts = num_hosts
-    self._num_cores = num_cores
-    self._run_config = run_config
-
-    (self._table_to_config_dict, self._feature_to_config_dict,
-     self.feature_to_weight_key_name_dict) = (
-         get_configs_from_feature_columns(
-             embedding_config_spec.feature_columns))
-    self._mode_to_tpu_embedding_dict = {}
-    self.dummy_table_variables = None
-
-    self._grad_multiplier_fn = (
-        embedding_config_spec.experimental_gradient_multiplier_fn)
-
-  def get_grad_multiplier(self):
-    if self._grad_multiplier_fn:
-      return ops.convert_to_tensor(
-          self._grad_multiplier_fn(training.get_global_step()),
-          dtype=dtypes.float32)
-
-  def has_embedding_tables(self):
-    return bool(self._table_to_config_dict)
-
-  def _create_tpu_embedding(self, mode):
-    """Create tpu_embedding.TPUEmbedding based on mode."""
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      batch_size = self._train_batch_size
-    else:
-      batch_size = self._eval_batch_size
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      tpu_embedding_mode = tpu_embedding.TRAINING
-      optimization_parameters = (
-          self._embedding_config_spec.optimization_parameters)
-    elif (mode == model_fn_lib.ModeKeys.EVAL or
-          mode == model_fn_lib.ModeKeys.PREDICT):
-      tpu_embedding_mode = tpu_embedding.INFERENCE
-      optimization_parameters = None
-    else:
-      raise ValueError('Mode {} is not supported.'.format(mode))
-
-    if self._run_config.cluster:
-      master = self._run_config.cluster.master()
-      cluster_spec = self._run_config.cluster.cluster_spec()
-      cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
-    else:
-      master = (
-          self._run_config.evaluation_master
-          if mode == model_fn_lib.ModeKeys.EVAL else self._run_config.master)
-      cluster_def = None
-    tpu_embedding_ = tpu_embedding.TPUEmbedding(
-        self._table_to_config_dict,
-        self._feature_to_config_dict,
-        batch_size,
-        tpu_embedding_mode,
-        master,
-        optimization_parameters,
-        cluster_def,
-        pipeline_execution_with_tensor_core=self._embedding_config_spec
-        .pipeline_execution_with_tensor_core)
-    return tpu_embedding_
-
-  def get_tpu_embedding(self, mode):
-    if mode not in self._mode_to_tpu_embedding_dict:
-      self._mode_to_tpu_embedding_dict[mode] = (
-          self._create_tpu_embedding(mode))
-    return self._mode_to_tpu_embedding_dict[mode]
-
-
-def split_inputs(ctx, features, labels):
-  """Splits the dense and sparse tensors inside the features and labels."""
-  enqueue_datas = collections.OrderedDict()
-  if ctx.embedding_config:
-    tpu_embedding_ = ctx.embedding_config.tpu_embedding
-    feature_to_weight_key_name_dict = (
-        ctx.embedding_config.feature_to_weight_key_name_dict)
-    for feature_key in tpu_embedding_.feature_to_config_dict:
-      sparse_feature = _get_sparse_feature_from_feature(feature_key, features)
-      weight_key_name = feature_to_weight_key_name_dict[feature_key]
-      if isinstance(sparse_feature, sparse_tensor.SparseTensor):
-        weights = _get_weights_from_features(weight_key_name, features)
-        enqueue_data = tpu_embedding.EnqueueData.from_sparse_tensor(
-            sparse_feature, weights)
-      else:
-        if weight_key_name is not None:
-          raise ValueError(
-              'Found weights {} for weighted_categorical_column, which is not'
-              'compatible with sparse feature {} enqueued as dense tensor.'
-              .format(weight_key_name, feature_key))
-        enqueue_data = tpu_embedding.EnqueueData(sparse_feature)
-      enqueue_datas[feature_key] = enqueue_data
-
-  return features, labels, enqueue_datas
-
-
-def _get_sparse_feature_from_feature(feature_key, features):
-  """Pop and return sparse feature."""
-  sparse_feature = features.pop(feature_key)
-  if not sparse_feature.dtype.is_integer:
-    raise ValueError('SparseTensor with string as values are not supported. '
-                     'If you are using vocabulary_file_categorical_column or '
-                     'vocabulary_list_categorical_column, please call '
-                     'your_column.categorical_column._transform_feature({{'
-                     'your_column.key: features[your_column.key]}}) in'
-                     'your input_fn() to convert string to int. '
-                     'feature_key = {}.'.format(feature_key))
-  return sparse_feature
-
-
-def _get_weights_from_features(weight_key_name, features):
-  """Pop and return feature for weights, possibly None."""
-  weights = None
-  if weight_key_name is not None:
-    if weight_key_name in features:
-      weights = features.pop(weight_key_name)
-    else:
-      raise ValueError(
-          'Cannot find weights {} for weighted_categorical_column.'
-          ' Please check if the weights are present in feature dict. Also'
-          ' note weight-sharing among weighted_categorical_column is not '
-          'supported on TPU.'.format(weight_key_name))
-    if not isinstance(weights, sparse_tensor.SparseTensor):
-      raise ValueError(
-          'weighted_categorical_column with weight key name {} has dense '
-          'weights. Dense weights are not supported on TPU. Please use '
-          'sparse weights instead.'.format(weight_key_name))
-    if weights.dtype is not dtypes.float32:
-      weights = math_ops.to_float(weights)
-  return weights
+# pylint: disable=wildcard-import,unused-import
+from tensorflow_estimator.python.estimator.tpu._tpu_estimator_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/tpu/datasets.py b/tensorflow/python/tpu/datasets.py
index 30a0f96..eab0249 100644
--- a/tensorflow/python/tpu/datasets.py
+++ b/tensorflow/python/tpu/datasets.py
@@ -130,8 +130,8 @@
   if sloppy is None:
     sloppy = True
 
-  if file_reader_job == 'cordinator':
-    file_reader_device = '/job:%s/task:0' % file_reader_job
+  if file_reader_job == 'coordinator':
+    file_reader_device = '/job:coordinator/task:0'
   else:
     file_reader_device = '/job:%s' % file_reader_job
 
diff --git a/tensorflow/python/tpu/device_assignment.py b/tensorflow/python/tpu/device_assignment.py
index 51a301b..863d5e6 100644
--- a/tensorflow/python/tpu/device_assignment.py
+++ b/tensorflow/python/tpu/device_assignment.py
@@ -22,6 +22,7 @@
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.tpu.topology import Topology
 from tensorflow.python.util.tf_export import tf_export
 
@@ -174,6 +175,43 @@
                              num_replicas)
 
 
+def _ring_2d(height, width):
+  """Returns the cells of a height x width mesh in ring order.
+
+  For example, in a 4x4 mesh, cells are visited in the following order.
+    0 -- 1 -- 2 -- 3
+    |    |    |    |
+    15-- 6 -- 5 -- 4
+    |    |    |    |
+    14-- 7 -- 8 -- 9
+    |    |    |    |
+    13-- 12-- 11-- 10
+
+  Args:
+    height: An integer, the number of rows in the mesh.
+    width: An integer, the number of columns in the mesh.
+
+  Returns:
+    A list of (y, x) tuples; column-major order if height is odd.
+  """
+  if height == 1:
+    return [(0, i) for i in range(width)]
+  if width == 1:
+    return [(i, 0) for i in range(height)]
+  if height % 2 != 0:  # Odd height: warn and fall back to column-major order.
+    logging.warning("Odd dimension")
+    return [(i % height, i // height) for i in range(width * height)]
+  ret = [(0, 0)]
+  for i in range(height // 2):  # Snake through columns 1..width-1, two rows at a time.
+    for j in range(1, width):
+      ret.append((2 * i, j))
+    for j in range(width - 1, 0, -1):
+      ret.append((2 * i + 1, j))
+  for i in range(height - 1, 0, -1):
+    ret.append((i, 0))
+  return ret
+
+
 def device_assignment(topology,
                       computation_shape=None,
                       computation_stride=None,
@@ -296,28 +334,53 @@
 
   # Assigns an offset to each replica such that no two replicas overlap.
   replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
-  for replica in xrange(num_replicas):
-    # Chooses a replica number in each axis.
-    t = replica
-    pos = []
-    for dim in replica_shape[::-1]:
-      pos.append(t % dim)
-      t //= dim
-    replica_pos = np.array(pos[::-1], dtype=np.int32)
 
-    # Determines where that replica starts in each axis.
-    outer = replica_pos // computation_stride
-    inner = replica_pos % computation_stride
-    replica_offsets[replica, :] = outer * computation_footprint + inner
+  # TODO(ylc): Revisit here when topology_rank > 3.
+  enable_2d_tiling = (
+      topology_rank == 3 and
+      computation_shape[-1] == 2  # Only handle 2D case.
+      and np.prod(computation_stride) == 1  # Ensure no stride.
+      and num_replicas == max_replicas)  # Full replication.
+  logging.info("enable_2d_tiling: {}".format(enable_2d_tiling))
+  if enable_2d_tiling:
+    assignment = []
+    inner_ring = _ring_2d(computation_shape[0], computation_shape[1])
+    outer_ring = _ring_2d(replica_shape[0], replica_shape[1])
 
-  # Computes a complete logical core -> physical core mapping for each replica.
-  indices = [
-      np.arange(0, computation_shape[i] * computation_stride[i],
-                computation_stride[i]) for i in xrange(topology_rank)
-  ]
-  indices = np.concatenate(
-      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
-      axis=-1)
-  indices = indices.reshape((-1, topology_rank))
-  assignment = indices + replica_offsets[:, np.newaxis, :]
+    for replica in xrange(num_replicas):
+      outer_x, outer_y = outer_ring[replica]
+      per_replica_assignment = []
+      for index in xrange(np.prod(computation_shape)):
+        inner_x, inner_y = inner_ring[index // 2]
+        px = outer_x * computation_shape[0] + inner_x
+        py = outer_y * computation_shape[1] + inner_y
+        pz = index % 2
+        per_replica_assignment.append([px, py, pz])
+      assignment.append(per_replica_assignment)
+  else:
+    for replica in xrange(num_replicas):
+      # Chooses a replica number in each axis.
+      t = replica
+      pos = []
+      for dim in replica_shape[::-1]:
+        pos.append(t % dim)
+        t //= dim
+      replica_pos = np.array(pos[::-1], dtype=np.int32)
+
+      # Determines where that replica starts in each axis.
+      outer = replica_pos // computation_stride
+      inner = replica_pos % computation_stride
+      replica_offsets[replica, :] = outer * computation_footprint + inner
+
+    # Computes a logical core -> physical core mapping for each replica.
+    indices = [
+        np.arange(0, computation_shape[i] * computation_stride[i],
+                  computation_stride[i]) for i in xrange(topology_rank)
+    ]
+    indices = np.concatenate(
+        [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+        axis=-1)
+    indices = indices.reshape((-1, topology_rank))
+    assignment = indices + replica_offsets[:, np.newaxis, :]
+
   return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py
index 87f0b30..9cbb508 100644
--- a/tensorflow/python/tpu/error_handling.py
+++ b/tensorflow/python/tpu/error_handling.py
@@ -1,135 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""ErrorRendezvous handler for collecting errors from multiple threads."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import sys
-import threading
-import time
-
-import six
-
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import tf_logging as logging
-
-_UNINTERESTING_ERRORS = (errors.CancelledError,)
-
-
-class ErrorRendezvous(object):
-  """Resolve errors from multiple threads during TPU execution.
-
-  TPU errors can occur on the infeed or outfeed threads as well as the main
-  training thread.
-
-  Depending on which thread "wins" and receives the session error first, we may
-  end up showing users a confusing and non-actionable error message (session
-  cancelled) instead of a root cause (e.g. a bad filename).
-
-  The rendezvous object provides a location to capture these errors until all
-  threads terminate.  At that point we can choose the most informative error
-  to report.
-  """
-
-  def __init__(self, num_sources):
-    # string -> (message, traceback)
-    self._errors = {}
-    self._num_sources = num_sources
-    self._session_cancel_timer = None
-
-  def record_error(self, source, exc_info, session=None):
-    """Report an exception from the given source.
-
-    If a session is passed, a timer will be registered to close it after a few
-    seconds.  This is necessary to ensure the main training loop does not hang
-    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
-    interesting error from another thread to propagate.
-
-    Args:
-      source: string, source of the error
-      exc_info: Output from `sys.exc_info` (type, value, traceback)
-      session: Session to close after delay.
-    """
-    _, value, _ = exc_info
-    self._errors[source] = exc_info
-    logging.error('Error recorded from %s: %s', source, value)
-
-    if session is not None and self._session_cancel_timer is None:
-
-      def _cancel_session():
-        time.sleep(5)
-        logging.error('Closing session due to error %s' % value)
-        try:
-          session.close()
-        except:  # pylint: disable=bare-except
-          logging.error(
-              '\n\n\nFailed to close session after error.'
-              'Other threads may hang.\n\n\n')
-
-      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
-      self._session_cancel_timer.daemon = True
-      self._session_cancel_timer.start()
-
-  def record_done(self, source):
-    """Mark execution source `source` as done.
-
-    If an error was originally reported from `source` it is left intact.
-
-    Args:
-      source: `str`, source being recorded
-    """
-    logging.info('%s marked as finished', source)
-    if source not in self._errors:
-      self._errors[source] = None
-
-  @contextlib.contextmanager
-  def catch_errors(self, source, session=None):
-    """Context manager to report any errors within a block."""
-    try:
-      yield
-    except Exception:  # pylint: disable=broad-except
-      self.record_error(source, sys.exc_info(), session)
-
-  def raise_errors(self, timeout_sec=0):
-    """Wait for up to `timeout` seconds for all error sources to finish.
-
-    Preferentially raise "interesting" errors (errors not in the
-    _UNINTERESTING_ERRORS) set.
-
-    Args:
-      timeout_sec: Seconds to wait for other error sources.
-    """
-    for _ in range(timeout_sec):
-      if len(self._errors) == self._num_sources:
-        break
-      time.sleep(1)
-
-    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
-
-    # First check for any interesting errors, then fall back on the session
-    # cancelled errors etc.
-    for k, (typ, value, traceback) in kept_errors:
-      if isinstance(value, _UNINTERESTING_ERRORS):
-        continue
-      else:
-        logging.warn('Reraising captured error')
-        six.reraise(typ, value, traceback)
-
-    for k, (typ, value, traceback) in kept_errors:
-      logging.warn('Reraising captured error')
-      six.reraise(typ, value, traceback)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow_estimator.python.estimator.tpu.error_handling import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/tpu/feature_column.py b/tensorflow/python/tpu/feature_column.py
index 50daff8..57eb9dd 100644
--- a/tensorflow/python/tpu/feature_column.py
+++ b/tensorflow/python/tpu/feature_column.py
@@ -22,6 +22,7 @@
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.feature_column import feature_column_lib as fc_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.tpu import tpu
@@ -30,6 +31,9 @@
 
 
 _TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
+_SUPPORTED_SEQUENCE_COLUMNS = (fc._SequenceCategoricalColumn,
+                               fc_lib.SequenceCategoricalColumn)
+
 _SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
                                   fc._VocabularyFileCategoricalColumn,
                                   fc._VocabularyListCategoricalColumn,
@@ -37,13 +41,17 @@
                                   fc_lib.IdentityCategoricalColumn,
                                   fc_lib.VocabularyFileCategoricalColumn,
                                   fc_lib.VocabularyListCategoricalColumn,
-                                  fc_lib.WeightedCategoricalColumn)
+                                  fc_lib.WeightedCategoricalColumn
+                                 ) + _SUPPORTED_SEQUENCE_COLUMNS
+_SEQUENCE_FEATURE_LENGTH_POSTFIX = '_seq_length_'
 
 
 def embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
-                     initializer=None):
+                     initializer=None,
+                     max_sequence_length=0,
+                     partition_strategy='div'):
   """TPU embedding_column for `tf.feature_column.embedding_column`.
 
   Note that the interface for TPU embedding_column is different from the non-TPU
@@ -52,17 +60,29 @@
 
   Args:
     categorical_column: A categorical_column returned from
-        categorical_column_with_identity,  weighted_categorical_column,
-        categorical_column_with_vocabulary_list or
-        categorical_column_with_vocabulary_file.
+        categorical_column_with_identity, weighted_categorical_column,
+        categorical_column_with_vocabulary_file,
+        categorical_column_with_vocabulary_list,
+        sequence_categorical_column_with_identity,
+        sequence_categorical_column_with_vocabulary_file,
+        sequence_categorical_column_with_vocabulary_list
     dimension: An integer specifying dimension of the embedding, must be > 0.
     combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. For more information, see
+      in a single row for a non-sequence column. For more information, see
       `tf.feature_column.embedding_column`.
     initializer: A variable initializer function to be used in embedding
       variable initialization. If not specified, defaults to
       `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and
       standard deviation `1/sqrt(dimension)`.
+    max_sequence_length: A non-negative integer specifying the max sequence
+      length. Any sequence shorter than this will be padded with 0 embeddings
+      and any sequence longer will be truncated. This must be positive for
+      sequence features and 0 for non-sequence features.
+    partition_strategy: Determines how tensors are sharded on the tpu hosts. See
+      `tf.nn.safe_embedding_lookup_sparse` for more details. Allowed values are
+      `"div"` and `"mod"`. If `"mod"` is used, evaluation and exporting the
+      model to CPU will not work; to do either, you must first shuffle the
+      embedding tensors into a single shard.
 
   Returns:
     A  _TPUEmbeddingColumn.
@@ -107,7 +127,9 @@
       ckpt_to_load_from=None,
       tensor_name_in_ckpt=None,
       max_norm=None,
-      trainable=True)
+      trainable=True,
+      max_sequence_length=max_sequence_length,
+      partition_strategy=partition_strategy)
   # For Embedding column, the initializer is hidden inside the creator Fn, which
   # is not accessiable later. So, we attach it to a speicial field. Also note
   # that non-TPU Embedding column and non-TPU shared Embedding column handle the
@@ -120,8 +142,56 @@
                              dimension,
                              combiner='mean',
                              initializer=None,
-                             shared_embedding_collection_name=None):
-  """List of dense columns that convert from sparse, categorical input."""
+                             shared_embedding_collection_name=None,
+                             max_sequence_lengths=None,
+                             partition_strategy='div'):
+  """List of dense columns that convert from sparse, categorical input.
+
+  Note that the interface for TPU embedding_column is different from the non-TPU
+  version. The following args available for the non-TPU version are NOT
+  supported: ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable.
+
+  Args:
+    categorical_columns: A list of categorical_columns returned from
+        categorical_column_with_identity, weighted_categorical_column,
+        categorical_column_with_vocabulary_file,
+        categorical_column_with_vocabulary_list,
+        sequence_categorical_column_with_identity,
+        sequence_categorical_column_with_vocabulary_file,
+        sequence_categorical_column_with_vocabulary_list
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row for a non-sequence column. For more information, see
+      `tf.feature_column.embedding_column`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    shared_embedding_collection_name: Optional name of the collection where
+      shared embedding weights are added. If not given, a reasonable name will
+      be chosen based on the names of `categorical_columns`. This is also used
+      in `variable_scope` when creating shared embedding weights.
+    max_sequence_lengths: A list of non-negative integers, either None or
+      empty or the same length as the argument categorical_columns. Entries
+      corresponding to non-sequence columns must be 0 and entries corresponding
+      to sequence columns specify the max sequence length for the column. Any
+      sequence shorter than this will be padded with 0 embeddings and any
+      sequence longer will be truncated.
+    partition_strategy: Determines how tensors are sharded on the tpu hosts. See
+      `tf.nn.safe_embedding_lookup_sparse` for more details. Allowed values are
+      `"div"` and `"mod"`.
+
+  Returns:
+    A list of _TPUSharedEmbeddingColumn.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if `initializer` is specified but not callable.
+    ValueError: if `max_sequence_lengths` is specified and not the same length
+      as `categorical_columns`.
+    ValueError: if `max_sequence_lengths` is positive for a non sequence column
+      or 0 for a sequence column.
+  """
   for categorical_column in categorical_columns:
     if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
       raise TypeError(
@@ -129,25 +199,46 @@
           ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
               cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
           ]), type(categorical_column)))
-  columns = fc_lib.shared_embedding_columns(
-      categorical_columns,
-      dimension,
-      combiner=combiner,
-      initializer=initializer,
-      shared_embedding_collection_name=shared_embedding_collection_name,
-      ckpt_to_load_from=None,
-      tensor_name_in_ckpt=None,
-      max_norm=None,
-      trainable=True)
 
-  # Use the initializer and shared_embedding_collection_name to create TPU
-  # version
-  initializer = columns[0].initializer
-  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
+  if not max_sequence_lengths:
+    max_sequence_lengths = [0] * len(categorical_columns)
+  if len(max_sequence_lengths) != len(categorical_columns):
+    raise ValueError('max_sequence_lengths and categorical_columns must be of '
+                     'the same length. len(max_sequence_lengths)={} '
+                     'len(categorical_columns)={}.'.format(
+                         len(max_sequence_lengths), len(categorical_columns)))
+
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. ')
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+  num_buckets = sorted_columns[0]._num_buckets  # pylint: disable=protected-access
+
+  for c in sorted_columns[1:]:
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              sorted_columns[0], num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
+
+  if not shared_embedding_collection_name:
+    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
+    shared_embedding_collection_name += '_shared_embedding'
+
   tpu_columns = []
 
   # Create the state (_SharedEmbeddingColumnLayer) here.
-  for categorical_column in categorical_columns:
+  for categorical_column, max_sequence_length in zip(
+      categorical_columns, max_sequence_lengths):
     column = _TPUSharedEmbeddingColumn(
         categorical_column=categorical_column,
         dimension=dimension,
@@ -157,7 +248,9 @@
         ckpt_to_load_from=None,
         tensor_name_in_ckpt=None,
         max_norm=None,
-        trainable=True)
+        trainable=True,
+        max_sequence_length=max_sequence_length,
+        partition_strategy=partition_strategy)
     tpu_columns.append(column)
 
   return tpu_columns
@@ -166,8 +259,23 @@
 class _TPUBaseEmbeddingColumn(object):
   """Base class for TPU Embedding Column."""
 
-  def __init__(self, categorical_column):
+  def __init__(self, categorical_column, max_sequence_length=0,
+               partition_strategy='div'):
     self._tpu_categorical_column = categorical_column
+    self._max_sequence_length = max_sequence_length
+    if (self.is_sequence_column() and max_sequence_length < 1):
+      raise ValueError('max_sequence_length must be greater than 0 for '
+                       'sequence columns. Got max_sequence_length={} for '
+                       'sequence column {}.'.format(max_sequence_length,
+                                                    categorical_column.name))
+    if (not self.is_sequence_column() and max_sequence_length != 0):
+      raise ValueError('Non zero max_seq_length={} specified for non '
+                       'sequence column {}.'.format(max_sequence_length,
+                                                    categorical_column.name))
+    self._partition_strategy = partition_strategy
+    if partition_strategy not in ('mod', 'div'):
+      raise ValueError('partition_strategy must be one of `mod` or `div`. '
+                       'Received {}.'.format(partition_strategy))
 
   def get_combiner(self):
     """Returns the embedding combiner."""
@@ -201,6 +309,20 @@
     """Check if the categorical column of the embedding column is weighted."""
     raise NotImplementedError('not impl')
 
+  def is_sequence_column(self):
+    return isinstance(self._tpu_categorical_column, _SUPPORTED_SEQUENCE_COLUMNS)
+
+  def get_max_sequence_length(self):
+    return self._max_sequence_length
+
+  def get_sequence_length_feature_key_name(self):
+    """Get the key for the associated sequence length feature."""
+    return get_sequence_length_feature_key_name_from_feature_key_name(
+        self.get_feature_key_name())
+
+  def get_partition_strategy(self):
+    return self._partition_strategy
+
 
 class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
   """Core Embedding Column."""
@@ -213,7 +335,9 @@
               ckpt_to_load_from=None,
               tensor_name_in_ckpt=None,
               max_norm=None,
-              trainable=True):
+              trainable=True,
+              max_sequence_length=0,
+              partition_strategy='div'):
     # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
     # are not supported on TPU. They are solely for matching the signature of
     # __new__ of parent class fc._EmbeddingColumn.
@@ -236,8 +360,12 @@
                ckpt_to_load_from=None,
                tensor_name_in_ckpt=None,
                max_norm=None,
-               trainable=True):
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+               trainable=True,
+               max_sequence_length=0,
+               partition_strategy='div'):
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column,
+                                     max_sequence_length=max_sequence_length,
+                                     partition_strategy=partition_strategy)
     self._key = None
 
   def get_combiner(self):
@@ -278,12 +406,18 @@
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('Export saved model does not support MOD '
+                                  'sharded embeddings.')
       def host_computation():
         return fc._EmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
       return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('TPUEmbedding on CPU does not support MOD '
+                                  'sharded embeddings.')
       return fc._EmbeddingColumn._get_dense_tensor(
           self, inputs, weight_collections, trainable)
 
@@ -297,6 +431,38 @@
 
     return tensor
 
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('Export saved model does not support MOD '
+                                  'sharded embeddings.')
+      def host_computation():
+        return fc._EmbeddingColumn._get_sequence_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('TPUEmbedding on CPU does not support MOD '
+                                  'sharded embeddings.')
+      return fc._EmbeddingColumn._get_sequence_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    tensor = inputs.get(self.get_feature_key_name())
+    tensor_lengths = inputs.get(self.get_sequence_length_feature_key_name())
+
+    # inputs is a _LazyBuilder and for rank 1 tensors, it calls expand_dims(-1).
+    # We need to undo this to match the standard CPU sequence embedding.
+    tensor_lengths = array_ops.squeeze(tensor_lengths, -1)
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(self.get_embedding_var_name(),
+                                    'embedding_weights')
+
+    return fc._SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=tensor, sequence_length=tensor_lengths)
+
 
 class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
                                 fc._SharedEmbeddingColumn):
@@ -311,7 +477,9 @@
               ckpt_to_load_from=None,
               tensor_name_in_ckpt=None,
               max_norm=None,
-              trainable=True):
+              trainable=True,
+              max_sequence_length=0,
+              partition_strategy='div'):
     return fc._SharedEmbeddingColumn.__new__(
         cls,
         categorical_column,
@@ -333,9 +501,13 @@
                ckpt_to_load_from=None,
                tensor_name_in_ckpt=None,
                max_norm=None,
-               trainable=True):
+               trainable=True,
+               max_sequence_length=0,
+               partition_strategy='div'):
 
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column,
+                                     max_sequence_length=max_sequence_length,
+                                     partition_strategy=partition_strategy)
     self._key = None
 
   def get_combiner(self):
@@ -376,12 +548,18 @@
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('Export saved model does not support MOD '
+                                  'sharded embeddings.')
       def host_computation():
         return fc._SharedEmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
       return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('TPUEmbedding on CPU does not support MOD '
+                                  'sharded embeddings.')
       return fc._SharedEmbeddingColumn._get_dense_tensor(
           self, inputs, weight_collections, trainable)
 
@@ -396,6 +574,36 @@
         is_shared_embedding=True)
     return tensor
 
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('Export saved model does not support MOD '
+                                  'sharded embeddings.')
+      def host_computation():
+        return fc._SharedEmbeddingColumn._get_sequence_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      if self._partition_strategy == 'mod':
+        raise NotImplementedError('TPUEmbedding on CPU does not support MOD '
+                                  'sharded embeddings.')
+      return fc._SharedEmbeddingColumn._get_sequence_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    tensor = inputs.get(self.get_feature_key_name())
+    tensor_lengths = inputs.get(self.get_sequence_length_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(
+        self.get_embedding_var_name(),
+        'embedding_weights',
+        is_shared_embedding=True)
+
+    return fc._SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=tensor, sequence_length=tensor_lengths)
+
 
 def _record_variable_scope_and_name(embedding_var_name,
                                     embedding_var_name_in_fc,
@@ -433,3 +641,50 @@
 def _is_running_on_cpu():
   """Returns True if the current context is CPU model."""
   return tpu_function.get_tpu_context().number_of_shards is None
+
+
+def get_sequence_length_feature_key_name_from_feature_key_name(feature_name):
+  """Gets the name of the sequence length feature from that of the base feature.
+
+  Args:
+    feature_name: The feature key of a sequence column.
+
+  Returns:
+    A string which is the feature key for the associated feature length column.
+  """
+  return feature_name + _SEQUENCE_FEATURE_LENGTH_POSTFIX
+
+
+def split_sequence_columns(feature_columns):
+  """Split a list of _TPUEmbeddingColumn into sequence and non-sequence columns.
+
+  For use in a TPUEstimator model_fn function. E.g.
+
+  def model_fn(features):
+    sequence_columns, feature_columns = (
+        tf.tpu.feature_column.split_sequence_columns(feature_columns))
+    input = tf.feature_column.input_layer(
+        features=features, feature_columns=feature_columns)
+    sequence_features, sequence_lengths = (
+        tf.contrib.feature_column.sequence_input_layer(
+            features=features, feature_columns=sequence_columns))
+
+  Args:
+    feature_columns: A list of TPU embedding columns (shared or not) to split.
+
+  Returns:
+    Two lists of _TPUEmbeddingColumns: the first holds the sequence columns and
+    the second holds the non-sequence columns.
+  """
+  sequence_columns = []
+  non_sequence_columns = []
+  for column in feature_columns:
+    if not isinstance(column, (_TPUEmbeddingColumn, _TPUSharedEmbeddingColumn)):
+      raise TypeError(
+          'column must be a _TPUEmbeddingColumn or  _TPUSharedEmbeddingColumn '
+          'but got %s instead.' % (type(column)))
+    if column.is_sequence_column():
+      sequence_columns.append(column)
+    else:
+      non_sequence_columns.append(column)
+  return sequence_columns, non_sequence_columns
diff --git a/tensorflow/python/tpu/profiler/BUILD b/tensorflow/python/tpu/profiler/BUILD
index 60ff0a5..ddfc26e 100644
--- a/tensorflow/python/tpu/profiler/BUILD
+++ b/tensorflow/python/tpu/profiler/BUILD
@@ -11,7 +11,7 @@
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tpu_profiler_analysis_pb2_grpc",
+        ":profiler_analysis_pb2_grpc",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/core/profiler:profiler_analysis_proto_py",
         "//tensorflow/core/profiler:protos_all_py",
@@ -20,8 +20,8 @@
 )
 
 py_library(
-    name = "tpu_profiler_analysis_pb2_grpc",
-    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    name = "profiler_analysis_pb2_grpc",
+    srcs = ["profiler_analysis_pb2_grpc.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = ["//tensorflow/core/profiler:profiler_analysis_proto_py"],
diff --git a/tensorflow/python/tpu/profiler/profiler_analysis_pb2_grpc.py b/tensorflow/python/tpu/profiler/profiler_analysis_pb2_grpc.py
new file mode 100644
index 0000000..c28d168
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/profiler_analysis_pb2_grpc.py
@@ -0,0 +1,125 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+#
+# Do not use pylint on generated code.
+# pylint: disable=missing-docstring,g-short-docstring-punctuation,g-no-space-after-docstring-summary,invalid-name,line-too-long,unused-argument,g-doc-args
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import grpc
+
+from tensorflow.core.profiler import profiler_analysis_pb2 as third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+
+
+class ProfileAnalysisStub(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  ProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def __init__(self, channel):
+    """Constructor.
+
+    Args:
+      channel: A grpc.Channel.
+    """
+    self.NewSession = channel.unary_unary(
+        '/tensorflow.ProfileAnalysis/NewSession',
+        request_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .NewProfileSessionRequest.SerializeToString,
+        response_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .NewProfileSessionResponse.FromString,
+    )
+    self.EnumSessions = channel.unary_unary(
+        '/tensorflow.ProfileAnalysis/EnumSessions',
+        request_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .EnumProfileSessionsAndToolsRequest.SerializeToString,
+        response_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .EnumProfileSessionsAndToolsResponse.FromString,
+    )
+    self.GetSessionToolData = channel.unary_unary(
+        '/tensorflow.ProfileAnalysis/GetSessionToolData',
+        request_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .ProfileSessionDataRequest.SerializeToString,
+        response_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+        .ProfileSessionDataResponse.FromString,
+    )
+
+
+class ProfileAnalysisServicer(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  ProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def NewSession(self, request, context):
+    """Starts a profiling session, blocks until it completes.
+
+    TPUProfileAnalysis service delegate this to TPUProfiler service.
+    Populate the profiled data in repository, then return status to caller.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def EnumSessions(self, request, context):
+    """Enumerate existing sessions and return available profile tools."""
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def GetSessionToolData(self, request, context):
+    """Retrieve specific tool's data for specific session."""
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+
+def add_ProfileAnalysisServicer_to_server(servicer, server):
+  rpc_method_handlers = {
+      'NewSession':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.NewSession,
+              request_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .NewProfileSessionRequest.FromString,
+              response_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .NewProfileSessionResponse.SerializeToString,
+          ),
+      'EnumSessions':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.EnumSessions,
+              request_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .EnumProfileSessionsAndToolsRequest.FromString,
+              response_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .EnumProfileSessionsAndToolsResponse.SerializeToString,
+          ),
+      'GetSessionToolData':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.GetSessionToolData,
+              request_deserializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .ProfileSessionDataRequest.FromString,
+              response_serializer=third__party_dot_tensorflow_dot_core_dot_profiler_dot_profiler__analysis__pb2
+              .ProfileSessionDataResponse.SerializeToString,
+          ),
+  }
+  generic_handler = grpc.method_handlers_generic_handler(
+      'tensorflow.ProfileAnalysis', rpc_method_handlers)
+  server.add_generic_rpc_handlers((generic_handler,))
diff --git a/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
deleted file mode 100644
index 8f51488..0000000
--- a/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
-#
-# Do not use pylint on generated code.
-# pylint: disable=missing-docstring,g-short-docstring-punctuation,g-no-space-after-docstring-summary,invalid-name,line-too-long,unused-argument,g-doc-args
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import grpc
-
-from tensorflow.contrib.tpu.profiler import tpu_profiler_analysis_pb2 as third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2
-
-
-class TPUProfileAnalysisStub(object):
-  """//////////////////////////////////////////////////////////////////////////////
-
-  TPUProfileAnalysis service provide entry point for profiling TPU and for
-  serving profiled data to Tensorboard through GRPC
-  //////////////////////////////////////////////////////////////////////////////
-  """
-
-  def __init__(self, channel):
-    """Constructor.
-
-    Args:
-      channel: A grpc.Channel.
-    """
-    self.NewSession = channel.unary_unary(
-        '/tensorflow.TPUProfileAnalysis/NewSession',
-        request_serializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        NewProfileSessionRequest.SerializeToString,
-        response_deserializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        NewProfileSessionResponse.FromString,
-    )
-    self.EnumSessions = channel.unary_unary(
-        '/tensorflow.TPUProfileAnalysis/EnumSessions',
-        request_serializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        EnumProfileSessionsAndToolsRequest.SerializeToString,
-        response_deserializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        EnumProfileSessionsAndToolsResponse.FromString,
-    )
-    self.GetSessionToolData = channel.unary_unary(
-        '/tensorflow.TPUProfileAnalysis/GetSessionToolData',
-        request_serializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        ProfileSessionDataRequest.SerializeToString,
-        response_deserializer=
-        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-        ProfileSessionDataResponse.FromString,
-    )
-
-
-class TPUProfileAnalysisServicer(object):
-  """//////////////////////////////////////////////////////////////////////////////
-
-  TPUProfileAnalysis service provide entry point for profiling TPU and for
-  serving profiled data to Tensorboard through GRPC
-  //////////////////////////////////////////////////////////////////////////////
-  """
-
-  def NewSession(self, request, context):
-    """Starts a profiling session, blocks until it completes.
-    TPUProfileAnalysis service delegate this to TPUProfiler service.
-    Populate the profiled data in repository, then return status to caller.
-    """
-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-    context.set_details('Method not implemented!')
-    raise NotImplementedError('Method not implemented!')
-
-  def EnumSessions(self, request, context):
-    """Enumerate existing sessions and return available profile tools.
-    """
-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-    context.set_details('Method not implemented!')
-    raise NotImplementedError('Method not implemented!')
-
-  def GetSessionToolData(self, request, context):
-    """Retrieve specific tool's data for specific session.
-    """
-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-    context.set_details('Method not implemented!')
-    raise NotImplementedError('Method not implemented!')
-
-
-def add_TPUProfileAnalysisServicer_to_server(servicer, server):
-  rpc_method_handlers = {
-      'NewSession':
-          grpc.unary_unary_rpc_method_handler(
-              servicer.NewSession,
-              request_deserializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              NewProfileSessionRequest.FromString,
-              response_serializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              NewProfileSessionResponse.SerializeToString,
-          ),
-      'EnumSessions':
-          grpc.unary_unary_rpc_method_handler(
-              servicer.EnumSessions,
-              request_deserializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              EnumProfileSessionsAndToolsRequest.FromString,
-              response_serializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              EnumProfileSessionsAndToolsResponse.SerializeToString,
-          ),
-      'GetSessionToolData':
-          grpc.unary_unary_rpc_method_handler(
-              servicer.GetSessionToolData,
-              request_deserializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              ProfileSessionDataRequest.FromString,
-              response_serializer=
-              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
-              ProfileSessionDataResponse.SerializeToString,
-          ),
-  }
-  generic_handler = grpc.method_handlers_generic_handler(
-      'tensorflow.TPUProfileAnalysis', rpc_method_handlers)
-  server.add_generic_rpc_handlers((generic_handler,))
diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/tpu/tpu_config.py
index 6ae65ac..2c9bce0 100644
--- a/tensorflow/python/tpu/tpu_config.py
+++ b/tensorflow/python/tpu/tpu_config.py
@@ -1,295 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""A RunConfig subclass with TPU support."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import json
-import os
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.tpu import util as util_lib
-
-# pylint: disable=protected-access
-_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
-_SERVICE_KEY = run_config_lib._SERVICE_KEY
-_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
-# pylint: enable=protected-access
-
-
-class InputPipelineConfig(object):
-  r"""Please see the definition of these values in TPUConfig."""
-  PER_SHARD_V1 = 1
-  PER_HOST_V1 = 2
-  PER_HOST_V2 = 3
-  BROADCAST = 4
-  SLICED = 5
-
-
-class TPUConfig(
-    collections.namedtuple('TPUConfig', [
-        'iterations_per_loop',
-        'num_shards',
-        'num_cores_per_replica',
-        'per_host_input_for_training',
-        'tpu_job_name',
-        'initial_infeed_sleep_secs',
-        'input_partition_dims',
-        'eval_training_input_configuration',
-    ])):
-  r"""TPU related configuration required by `TPUEstimator`.
-
-  Args:
-    iterations_per_loop: This is the number of train steps running in TPU
-      system before returning to CPU host for each `Session.run`. This means
-      global step is increased `iterations_per_loop` times in one `Session.run`.
-      It is recommended to be set as number of global steps for next checkpoint.
-      Note that in evaluation don't use this value, instead we run total eval
-      `steps` on TPU for a single `Session.run`.
-    num_shards: (Deprecated, ignored by TPUEstimator).
-      The number of model replicas in the system. For non-model-parallelism
-      case, this number equals the total number of TPU cores. For
-      model-parallelism, the total number of TPU cores equals
-      num_cores_per_replica * num_shards.
-    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
-      An integer which describes the number of TPU cores per model replica. This
-      is required by model-parallelism which enables partitioning
-      the model to multiple cores. Currently num_cores_per_replica must be
-      1, 2, 4, or 8.
-    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
-      `input_fn` is invoked once on each host. With the per-core input pipeline
-      configuration, it is invoked once for each core.
-      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
-      the batch size for each shard is `train_batch_size` // #hosts in the
-      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
-      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
-      invoked once on host 0 and the tensors are broadcasted to all other
-      replicas. The batch size equals to train_batch_size`. With the per-core
-      input pipeline configuration, the shard batch size is also
-      `train_batch_size` // #cores.
-      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
-    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
-      within TPUEstimator, however when using ClusterSpec propagation in more
-      esoteric cluster configurations, you may need to specify the job name as a
-      string.
-    initial_infeed_sleep_secs: The number of seconds the infeed thread should
-      wait before enqueueing the first batch. This helps avoid timeouts for
-      models that require a long compilation time.
-    input_partition_dims: A nested list to describe the partition dims
-      for all the tensors from input_fn(). The structure of
-      input_partition_dims must match the structure of `features` and
-      `labels` from input_fn(). The total number of partitions must match
-      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
-      images with shape [N, H, W, C] and labels [N].
-      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
-      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
-      to all the TPU cores since the partition dims is `None`.
-      Current limitations: This feature is only supported with the PER_HOST_V2
-      input mode.
-    eval_training_input_configuration: If `SLICED`, `input_fn` is only
-      invoked once on host 0 and the tensors are broadcasted to all other
-      replicas. Unlike per_host_input_for_training=BROADCAST, each replica will
-      only get a slice of the data instead of a whole copy. If `PER_HOST_V1`,
-      the behaviour is determined by per_host_input_for_training.
-
-    Raises:
-      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
-  """
-
-  def __new__(
-      cls,
-      iterations_per_loop=2,
-      num_shards=None,
-      num_cores_per_replica=None,
-      per_host_input_for_training=True,
-      tpu_job_name=None,
-      initial_infeed_sleep_secs=None,
-      input_partition_dims=None,
-      eval_training_input_configuration=InputPipelineConfig.PER_HOST_V1):
-
-    # Check iterations_per_loop.
-    util_lib.check_positive_integer(iterations_per_loop,
-                                    'TPUConfig iterations_per_loop')
-
-    # Check num_shards.
-    if num_shards is not None:
-      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
-
-    if input_partition_dims is not None:
-      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
-        raise ValueError(
-            'input_partition_dims must be a list/tuple with one or two'
-            ' elements.')
-
-      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
-        raise ValueError(
-            'input_partition_dims is only supported in PER_HOST_V2 mode.')
-
-      if num_cores_per_replica is None:
-        raise ValueError(
-            'input_partition_dims requires setting num_cores_per_replica.')
-
-    # Check num_cores_per_replica
-    if num_cores_per_replica is not None:
-      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
-        raise ValueError(
-            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
-                str(num_cores_per_replica)))
-
-    if eval_training_input_configuration not in [
-        InputPipelineConfig.PER_HOST_V1, InputPipelineConfig.SLICED
-    ]:
-      raise ValueError(
-          'eval_training_input_configuration must be PER_HOST_V1 or SLICED;'
-          ' got {}'.format(str(eval_training_input_configuration)))
-
-    # per_host_input_for_training may be True, False, or integer in [1..3].
-    # Map legacy values (True, False) to numeric values.
-    if per_host_input_for_training is False:
-      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
-    elif per_host_input_for_training is True:
-      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
-
-    # Check initial_infeed_sleep_secs.
-    if initial_infeed_sleep_secs:
-      util_lib.check_positive_integer(initial_infeed_sleep_secs,
-                                      'TPUConfig initial_infeed_sleep_secs')
-
-    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
-
-    return super(TPUConfig, cls).__new__(
-        cls,
-        iterations_per_loop=iterations_per_loop,
-        num_shards=num_shards,
-        num_cores_per_replica=num_cores_per_replica,
-        per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name,
-        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
-        input_partition_dims=input_partition_dims,
-        eval_training_input_configuration=eval_training_input_configuration)
-
-
-class RunConfig(run_config_lib.RunConfig):
-  """RunConfig with TPU support."""
-
-  def __init__(self,
-               tpu_config=None,
-               evaluation_master=None,
-               master=None,
-               cluster=None,
-               **kwargs):
-    """Constructs a RunConfig.
-
-    Args:
-      tpu_config: the TPUConfig that specifies TPU-specific configuration.
-      evaluation_master: a string. The address of the master to use for eval.
-        Defaults to master if not set.
-      master: a string. The address of the master to use for training.
-      cluster: a ClusterResolver
-      **kwargs: keyword config parameters.
-
-    Raises:
-      ValueError: if cluster is not None and the provided session_config has a
-        cluster_def already.
-    """
-    super(RunConfig, self).__init__(**kwargs)
-    self._tpu_config = tpu_config or TPUConfig()
-    self._cluster = cluster
-
-    # If user sets master and/or evaluation_master explicitly, including empty
-    # string '', take it. Otherwise, take the values set by parent class.
-    if master is not None:
-      if cluster is not None:
-        raise ValueError('Both master and cluster are set.')
-      self._master = master
-    else:
-      if cluster:
-        self._master = cluster.master()
-
-    if evaluation_master is not None:
-      self._evaluation_master = evaluation_master
-    elif (not self._evaluation_master and
-          self.task_type != run_config_lib.TaskType.EVALUATOR):
-      # If the task type is EVALUATOR, it means some cluster manager sets the
-      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
-      #
-      # Otherwise, it means user executes the code without external cluster
-      # manager. For that, we optimize the user experience by setting
-      # evaluation_master to master, unless user overwrites it.
-      self._evaluation_master = self._master
-
-    # Set the ClusterSpec to use
-    if cluster:
-      self._cluster_spec = cluster.cluster_spec()
-
-      # Merge the cluster_def into the ConfigProto.
-      if self._session_config is None:  # pylint: disable=access-member-before-definition
-        self._session_config = config_pb2.ConfigProto(
-            allow_soft_placement=True, isolate_session_state=True)
-      if self._session_config.HasField('cluster_def'):
-        raise ValueError(
-            'You cannot provide a ClusterResolver and '
-            'session_config.cluster_def.')
-      if self._cluster_spec:
-        self._session_config.cluster_def.CopyFrom(
-            self._cluster_spec.as_cluster_def())
-
-  def _maybe_overwrite_session_config_for_distributed_training(self):
-    # Overrides the parent class session_config overwrite for between-graph. TPU
-    # runs with in-graph, which should not have device filter. Doing nothing
-    # ("pass") basically disables it.
-    pass
-
-  @property
-  def evaluation_master(self):
-    return self._evaluation_master
-
-  @property
-  def master(self):
-    return self._master
-
-  @property
-  def tpu_config(self):
-    return self._tpu_config
-
-  @property
-  def cluster(self):
-    return self._cluster
-
-  def replace(self, **kwargs):
-    if 'tpu_config' not in kwargs:
-      return super(RunConfig, self).replace(**kwargs)
-
-    tpu_config = kwargs.pop('tpu_config')
-    new_instance = super(RunConfig, self).replace(**kwargs)
-    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
-    return new_instance
-
-
-def _get_tpu_job_name_from_tf_config():
-  """Extracts the TPU job name from TF_CONFIG env variable."""
-  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
-  # spec propagation.
-  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
-  if tpu_job_name:
-    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
-  return tpu_job_name
+# pylint: disable=wildcard-import,unused-import
+from tensorflow_estimator.python.estimator.tpu.tpu_config import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/tpu/tpu_config_test.py b/tensorflow/python/tpu/tpu_config_test.py
deleted file mode 100644
index 22fb303..0000000
--- a/tensorflow/python/tpu/tpu_config_test.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TPU RunConfig tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import test
-from tensorflow.python.tpu import tpu_config as tpu_config_lib
-
-
-def _set_tf_config_env_variable(tf_config):
-  return test.mock.patch.dict('os.environ', {
-      'TF_CONFIG': json.dumps(tf_config)
-  })
-
-
-class TPURunConfigTest(test.TestCase):
-
-  def test_no_session_config_set_in_local_case(self):
-    run_config = tpu_config_lib.RunConfig()
-    self.assertIsNone(run_config.session_config)
-
-  def test_no_session_config_overwrite_in_local_case(self):
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    run_config = tpu_config_lib.RunConfig(session_config=session_config)
-    self.assertEqual(session_config, run_config.session_config)
-
-  def test_no_session_config_set_with_cluster_spec(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3'],
-            run_config_lib.TaskType.WORKER: ['host3:4']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig()
-      self.assertIsNone(run_config.session_config)
-
-  def test_no_session_config_overwrite_with_cluster_spec(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3'],
-            run_config_lib.TaskType.WORKER: ['host3:4']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    with _set_tf_config_env_variable(tf_config):
-      session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-      run_config = tpu_config_lib.RunConfig(session_config=session_config)
-      self.assertEqual(session_config, run_config.session_config)
-
-  def test_fail_with_invalid_num_shards(self):
-    with self.assertRaisesRegexp(ValueError, 'must be positive'):
-      tpu_config_lib.RunConfig(
-          tpu_config=tpu_config_lib.TPUConfig(num_shards=0))
-
-  def test_fail_with_iterations_per_loop(self):
-    with self.assertRaisesRegexp(ValueError, 'must be positive'):
-      tpu_config_lib.RunConfig(
-          tpu_config=tpu_config_lib.TPUConfig(iterations_per_loop=0))
-
-  def test_fail_with_invalid_num_cores_per_replica(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'num_cores_per_replica must be 1, 2, 4, 8, or 16;'
-        ' got 7'):
-      tpu_config_lib.TPUConfig(num_cores_per_replica=7)
-
-
-class TPURunConfigMasterTest(test.TestCase):
-
-  def test_default_values(self):
-    run_config = tpu_config_lib.RunConfig()
-    self.assertEqual('', run_config.master)
-    self.assertEqual('', run_config.evaluation_master)
-
-  def test_user_provided_master_and_evaluation_master(self):
-    run_config = tpu_config_lib.RunConfig(
-        master='_master_123', evaluation_master='_eval_master_123')
-    self.assertEqual('_master_123', run_config.master)
-    self.assertEqual('_eval_master_123', run_config.evaluation_master)
-
-  def test_evaluation_master_defaults_to_master(self):
-    run_config = tpu_config_lib.RunConfig(master='_master_123')
-    self.assertEqual('_master_123', run_config.master)
-    self.assertEqual('_master_123', run_config.evaluation_master)
-
-  def test_tf_config(self):
-    tf_config = {
-        'session_master': '_master_123',
-        'eval_session_master': '_eval_master_123'
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig()
-      self.assertEqual('_master_123', run_config.master)
-      self.assertEqual('_eval_master_123', run_config.evaluation_master)
-
-  def test_evaluation_master_defaults_to_master_in_tf_config(self):
-    tf_config = {
-        'session_master': '_master_123',
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig()
-      self.assertEqual('_master_123', run_config.master)
-      self.assertEqual('_master_123', run_config.evaluation_master)
-
-  def test_respect_evaluation_master_in_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 0
-        },
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig(master='_something')
-      self.assertEqual('', run_config.evaluation_master)
-
-  def test_user_overwrites_tf_config(self):
-    tf_config = {
-        'session_master': '_master_123',
-        'eval_session_master': '_eval_master_123'
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig(
-          master='_new_master_123', evaluation_master='_new_eval_master_123')
-      self.assertEqual('_new_master_123', run_config.master)
-      self.assertEqual('_new_eval_master_123', run_config.evaluation_master)
-
-  def test_user_overwrites_master_in_tf_config(self):
-    tf_config = {
-        'session_master': '_master_123',
-        'eval_session_master': '_eval_master_123'
-    }
-    with _set_tf_config_env_variable(tf_config):
-      run_config = tpu_config_lib.RunConfig(master='_new_master_123')
-      self.assertEqual('_new_master_123', run_config.master)
-      self.assertEqual('_eval_master_123', run_config.evaluation_master)
-
-
-class TPUJobNameTest(test.TestCase):
-
-  def test_default_name(self):
-    config = tpu_config_lib.RunConfig()
-    self.assertIsNone(config.tpu_config.tpu_job_name)
-
-  def test_with_tf_config(self):
-    tf_config = {'service': {'tpu_worker_job_name': '_my_new_name',}}
-    with _set_tf_config_env_variable(tf_config):
-      config = tpu_config_lib.RunConfig()
-      self.assertEqual('_my_new_name', config.tpu_config.tpu_job_name)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py
index c6d2504..573f49b 100644
--- a/tensorflow/python/tpu/tpu_context.py
+++ b/tensorflow/python/tpu/tpu_context.py
@@ -1,749 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from contextlib import contextmanager
-import copy
-
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.tpu import _tpu_estimator_embedding
-from tensorflow.python.tpu import device_assignment as tpu_device_assignment
-from tensorflow.python.tpu import tpu_config
-from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-
-
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-_NUM_CORES_TO_COMPUTATION_SHAPE = {
-    1: [1, 1, 1],
-    2: [1, 1, 2],
-    4: [1, 2, 2],
-    8: [2, 2, 2],
-    16: [4, 2, 2],
-}
-
-
-class TPUContext(object):
-  """A context that holds the current configuration of the TPU computation."""
-
-  def __init__(self,
-               internal_ctx,
-               input_device=None,
-               invocation_index=None,
-               call_from_input_fn=True):
-    self._internal_ctx = internal_ctx
-    self._input_device = input_device
-    self._invocation_index = invocation_index
-    self._call_from_input_fn = call_from_input_fn
-
-  def current_input_fn_deployment(self):
-    """The configuration of the current input_fn invocation.
-
-    The configuration depends on `TPUConfig.per_host_input_for_training`. See
-    `TPUConfig` for details.
-
-    Only set in params dict of input_fn
-
-    Returns:
-      A tuple of
-        1. Device spec string: String, is the current CPU host where the
-           input_fn is invoked.
-        2. Current invocation index: Int, 0-based index of the input_fn
-           invocation. See next item for details.
-        3. Total invocation count: Int, the total number of times to invoke the
-           input_fn on all CPU hosts. Each invocation will be passed with a new
-           `TPUContext` instance with current invocation index set properly.
-        4. Total number of replicas consumed by current_invocation: Int, the
-           number of replicas fed by the data returned by current input_fn. For
-           example, for per_core input pipeline deployment
-           and non-model-parallelism, total invocation count is equal to
-           the number of cores in the system and num replicas consumed by
-           current invocation is 1. For per-host v2 input pipeline deployment,
-           total invocation count is equal to the number of hosts in the system
-           and num replicas consumed by current invocation is equal to number of
-           cores per host.
-
-    Raises:
-      RuntimeError: If this method must not be called from input_fn.
-    """
-    if not self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' model_fn.')
-
-    if self._internal_ctx.is_input_sharded_per_core():
-      total_invocation_count = (self._internal_ctx.num_hosts
-                                * self._internal_ctx.num_of_replicas_per_host)
-      replicas_consumed = 1
-    elif self._internal_ctx.is_input_broadcast_with_iterators():
-      total_invocation_count = 1
-      replicas_consumed = self._internal_ctx.num_replicas
-    else:
-      total_invocation_count = self._internal_ctx.num_hosts
-      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
-    return (self._input_device, self._invocation_index,
-            total_invocation_count, replicas_consumed)
-
-  @property
-  def num_replicas(self):
-    """The total number of replicas.
-
-    For non-model-parallelism, num_replicas should be the total num of TPU
-    cores in the system.
-
-    Returns:
-      The number of replicas.
-    """
-    return self._internal_ctx.num_replicas
-
-  @property
-  def num_hosts(self):
-    """The number of hosts for the TPU system."""
-    return self._internal_ctx.num_hosts
-
-  @property
-  def current_host(self):
-    """The current host index for the TPU system."""
-    return self._invocation_index
-
-  @property
-  def num_of_replicas_per_host(self):
-    """The number of replicas for each host."""
-    if self._internal_ctx.model_parallelism_enabled:
-      raise ValueError(
-          'num_of_replicas_per_host is not supported for model_parallelism')
-    return self._internal_ctx.num_of_replicas_per_host
-
-  @property
-  def device_assignment(self):
-    """Returns device_assignment object."""
-    if self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' input_fn.')
-    return self._internal_ctx.device_assignment
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    # Note that: For the non-model parallelism, the mapping could be
-    # a random permutation. The order should not matter in most cases
-    # as far as model is replicated to all cores in the system.
-    return self._internal_ctx.device_for_replica(replica_id)
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function.
-
-    The place function takes host_id as the input and returns the TF device
-    for the correspoding host.
-    """
-
-    def _placement_function(host_id):
-      """Return the host device given host_id."""
-      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
-
-    return _placement_function
-
-
-class _InternalTPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, based on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  if eval_on_tpu is False, then execution of eval on TPU is disabled.
-  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
-  and TPU execution is disabled for all modes.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _InternalTPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self,
-               config,
-               train_batch_size,
-               eval_batch_size,
-               predict_batch_size,
-               use_tpu,
-               eval_on_tpu=True,
-               embedding_config_spec=None):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._predict_batch_size = predict_batch_size
-    self._use_tpu = use_tpu
-    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
-    if not use_tpu and eval_on_tpu:
-      logging.warning('eval_on_tpu ignored because use_tpu is False.')
-
-    self._eval_on_tpu = eval_on_tpu
-    self._model_parallelism_enabled = (
-        use_tpu and config.tpu_config.num_cores_per_replica)
-    self._mode = None
-    num_cores_per_replica = config.tpu_config.num_cores_per_replica
-    if self._model_parallelism_enabled:
-      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
-          num_cores_per_replica]
-    else:
-      self._computation_shape = None
-    self._lazy_tpu_system_metadata_dict = {}  # key by master address
-    self._lazy_device_assignment_dict = {}  # key by master address
-    self._lazy_validation_dict = {}  # key by ModeKeys
-    self._embedding_config_spec = embedding_config_spec
-    self._lazy_embedding_config_dict = {}  # key by master address
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @contextmanager
-  def with_mode(self, mode):
-    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
-    # such as _lazy_tpu_system_metadata_dict between new copy and the original
-    # one. Note that all lazy states stored in properties _lazy_foo are sort of
-    # immutable as they should be same for the process lifetime.
-    new_ctx = copy.copy(self)
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  def _get_master_address(self):
-    mode = self._assert_mode()
-    config = self._config
-    master = (
-        config.master
-        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
-    return master
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    cluster_def = None
-    if (self._config.session_config and
-        self._config.session_config.cluster_def.job):
-      cluster_def = self._config.session_config.cluster_def
-
-    # pylint: disable=protected-access
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(
-            master,
-            cluster_def=cluster_def,
-            query_topology=self.model_parallelism_enabled))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-  def _get_device_assignment(self):
-    """Gets the (maybe cached) TPU device assignment."""
-    master = self._get_master_address()
-    device_assignment = self._lazy_device_assignment_dict.get(master)
-    if device_assignment is not None:
-      return device_assignment
-
-    tpu_system_metadata = self._get_tpu_system_metadata()
-
-    device_assignment = tpu_device_assignment.device_assignment(
-        tpu_system_metadata.topology,
-        computation_shape=self._computation_shape,
-        num_replicas=self.num_replicas)
-
-    logging.info('num_cores_per_replica: %s',
-                 str(self._config.tpu_config.num_cores_per_replica))
-    logging.info('computation_shape: %s', str(self._computation_shape))
-    logging.info('num_replicas: %d', self.num_replicas)
-    logging.info('device_assignment.topology.device_coordinates: %s',
-                 str(device_assignment.topology.device_coordinates))
-    logging.info('device_assignment.core_assignment: %s',
-                 str(device_assignment.core_assignment))
-
-    self._lazy_device_assignment_dict[master] = device_assignment
-    return device_assignment
-
-  @property
-  def embedding_config(self):
-    """Returns the embedding config based on current mode."""
-    master = self._get_master_address()
-    if master in self._lazy_embedding_config_dict:
-      embedding_config = self._lazy_embedding_config_dict[master]
-    else:
-      embedding_config = None
-      if self._use_tpu and self._embedding_config_spec:
-        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
-            self._embedding_config_spec, self._train_batch_size,
-            self._eval_batch_size, self.num_hosts, self.num_cores, self.config)
-        if not embedding_config.has_embedding_tables():
-          embedding_config = None
-      self._lazy_embedding_config_dict[master] = embedding_config
-
-    if embedding_config is not None:
-      mode = self._assert_mode()
-      # Dynamically attach tpu_embedding based on mode. With
-      # this, we could keep embedding_config immutable but call site always
-      # accesses the unified API '.tpu_embedding'.
-      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
-    return embedding_config
-
-  @property
-  def model_parallelism_enabled(self):
-    return self._model_parallelism_enabled
-
-  @property
-  def input_partition_dims(self):
-    return self._config.tpu_config.input_partition_dims
-
-  @property
-  def device_assignment(self):
-    return (self._get_device_assignment()
-            if self._model_parallelism_enabled else None)
-
-  @property
-  def num_of_cores_per_host(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_of_cores_per_host
-
-  @property
-  def num_cores(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_cores
-
-  @property
-  def num_of_replicas_per_host(self):
-    """Return the number of replicas per host."""
-    if self.model_parallelism_enabled:
-      return self.num_replicas // self.num_hosts
-    else:
-      return self.num_of_cores_per_host
-
-  @property
-  def num_replicas(self):
-    num_cores_in_system = self.num_cores
-
-    if self.model_parallelism_enabled:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      if num_cores_per_replica > num_cores_in_system:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the total num of '
-            'TPU cores in the system. num_cores_per_replica: {}, num cores '
-            'in the system: {}'.format(num_cores_per_replica,
-                                       num_cores_in_system))
-
-      if num_cores_in_system % num_cores_per_replica != 0:
-        raise RuntimeError(
-            'The num of cores in the system ({}) is not divisible by the num '
-            'of cores ({}) required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
-                num_cores_in_system, num_cores_per_replica))
-
-      return num_cores_in_system // num_cores_per_replica
-    else:
-      return num_cores_in_system
-
-  @property
-  def num_hosts(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_hosts
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    mode = self._assert_mode()
-    return (mode == model_fn_lib.ModeKeys.TRAIN and
-            (self._config.tpu_config.per_host_input_for_training is
-             tpu_config.InputPipelineConfig.PER_SHARD_V1))
-
-  def is_input_per_host_with_iterators(self):
-    """Return true if input_fn should be run in the per-host v2 config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.PER_HOST_V2)
-
-  def is_input_broadcast_with_iterators(self):
-    """Return true if input_fn should be run in the full_replicae config."""
-    mode = self._assert_mode()
-    return ((self._config.tpu_config.per_host_input_for_training is
-             tpu_config.InputPipelineConfig.BROADCAST) or
-            (mode != model_fn_lib.ModeKeys.TRAIN and
-             self._config.tpu_config.eval_training_input_configuration is
-             tpu_config.InputPipelineConfig.SLICED))
-
-  def is_running_on_cpu(self, is_export_mode=False):
-    """Determines whether the input_fn and model_fn should be invoked on CPU.
-
-    This API also validates user provided configuration, such as batch size,
-    according the lazy initialized TPU system metadata.
-
-    Args:
-      is_export_mode: Indicates whether the current mode is for exporting the
-        model, when mode == PREDICT. Only with this bool, we could
-        tell whether user is calling the Estimator.predict or
-        Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distinguish these two.
-
-    Returns:
-      bool, whether current input_fn or model_fn should be running on CPU.
-
-    Raises:
-      ValueError: any configuration is invalid.
-    """
-
-    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
-    if not is_running_on_cpu:
-      self._validate_tpu_configuration()
-    return is_running_on_cpu
-
-  def _is_running_on_cpu(self, is_export_mode):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
-    mode = self._assert_mode()
-
-    if not self._use_tpu:
-      return True
-
-    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
-      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
-      return True
-
-    if is_export_mode:
-      return True
-
-    return False
-
-  @property
-  def global_batch_size(self):
-    mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return self._eval_batch_size
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return self._predict_batch_size
-    else:
-      return None
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    global_batch_size = self.global_batch_size
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU
-    if self.is_input_sharded_per_core() or (
-        self.is_input_per_host_with_iterators()):
-      return global_batch_size // self.num_replicas
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU. always sharded per shard.
-    return global_batch_size // self.num_replicas
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (
-        run_config.evaluation_master
-        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
-    cluster_def = (run_config.session_config.cluster_def
-                   if run_config.session_config else None)
-
-    return tpu_system_metadata_lib.master_job(master, cluster_def)
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-
-    master = self.master_job
-
-    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
-      """Return the host device given replica_id or host_id."""
-      assert _sentinal is None
-      if replica_id is not None and host_id is not None:
-        raise RuntimeError(
-            'replica_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        if replica_id is not None:
-          if self.model_parallelism_enabled:
-            return self.device_assignment.host_device(
-                replica=replica_id, job=master)
-          else:
-            host_id = replica_id / self.num_of_cores_per_host
-
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    """Returns a TPU device placement Fn."""
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    def _placement_function(i):
-      if self.model_parallelism_enabled:
-        return self.device_assignment.tpu_device(replica=i, job=master)
-      else:
-        num_of_cores_per_host = self.num_of_cores_per_host
-        host_id = i / num_of_cores_per_host
-        ordinal_id = i % num_of_cores_per_host
-        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
-
-    return _placement_function
-
-  def tpu_ordinal_function(self, host_id):
-    """Returns the TPU ordinal fn."""
-
-    def _tpu_ordinal_function(shard_index_in_host):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        shard_index_in_host: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      if self.model_parallelism_enabled:
-        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
-        replica = self.device_assignment.lookup_replicas(host_id,
-                                                         0)[shard_index_in_host]
-        return self.device_assignment.tpu_ordinal(replica=replica)
-      else:
-        return shard_index_in_host % self.num_of_cores_per_host
-
-    return _tpu_ordinal_function
-
-  def _validate_tpu_configuration(self):
-    """Validates the configuration based on the TPU system metadata."""
-    mode = self._assert_mode()
-    if self._lazy_validation_dict.get(mode):
-      return
-
-    # All following information is obtained from TPU system metadata.
-    num_cores = self.num_cores
-    num_replicas = self.num_replicas
-    num_hosts = self.num_hosts
-
-    if not num_cores:
-      tpu_system_metadata = self._get_tpu_system_metadata()
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system. Please double check '
-          'Tensorflow master address and TPU worker(s). Available devices '
-          'are {}.'.format(tpu_system_metadata.devices))
-
-    if self._config.tpu_config.num_shards:
-      user_provided_num_replicas = self._config.tpu_config.num_shards
-      if user_provided_num_replicas != num_replicas:
-        message = (
-            'TPUConfig.num_shards is not set correctly. According to TPU '
-            'system metadata for Tensorflow master ({}): num_replicas should '
-            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
-            'be the total num of TPU cores in the system. For '
-            'model-parallelism, the total number of TPU cores should be '
-            'num_cores_per_replica * num_replicas. Please set it '
-            'accordingly or leave it as `None`'.format(
-                self._get_master_address(), num_replicas,
-                user_provided_num_replicas))
-
-        raise ValueError(message)
-
-    if self._config.tpu_config.num_cores_per_replica:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
-      if num_cores_per_replica > num_cores_per_host:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the '
-            'num_cores_per_host. num_cores_per_replica: {}, '
-            'num_cores_per_host: {}'.format(num_cores_per_replica,
-                                            num_cores_per_host))
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      if (self._train_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'train batch size {} must be divisible by number of replicas {}'
-            .format(self._train_batch_size, num_replicas))
-
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      if self._eval_batch_size is None:
-        raise ValueError(
-            'eval_batch_size in TPUEstimator constructor cannot be `None`'
-            'if .evaluate is running on TPU.')
-      if (self._eval_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'eval batch size {} must be divisible by number of replicas {}'
-            .format(self._eval_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.evaluate should be running on single TPU'
-            ' instead of a Pod.')
-    else:
-      assert mode == model_fn_lib.ModeKeys.PREDICT
-      if self._predict_batch_size is None:
-        raise ValueError(
-            'predict_batch_size in TPUEstimator constructor should not be '
-            '`None` if .predict is running on TPU.')
-      if (self._predict_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'predict batch size {} must be divisible by number of replicas {}'
-            .format(self._predict_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.predict should be running on single TPU worker. '
-            'got {}.'.format(num_hosts))
-
-    # Record the state "validated" into lazy dictionary.
-    self._lazy_validation_dict[mode] = True
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    master = self.master_job
-
-    if self.model_parallelism_enabled:
-      return (self.device_assignment.host_device(
-          replica=replica_id, job=master),
-              self.device_assignment.tpu_ordinal(replica=replica_id))
-
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    num_of_replicas_per_host = self.num_of_replicas_per_host
-    host_id = replica_id / num_of_replicas_per_host
-    ordinal_id = replica_id % num_of_replicas_per_host
-
-    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
-    return (host_device, ordinal_id)
-
-
-class _OneCoreTPUContext(_InternalTPUContext):
-  """Special _InternalTPUContext for one core usage."""
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
-
-    super(_OneCoreTPUContext, self).__init__(
-        config, train_batch_size, eval_batch_size,
-        predict_batch_size, use_tpu)
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
-            num_cores=1,
-            num_hosts=1,
-            num_of_cores_per_host=1,
-            topology=None,
-            devices=[]))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-
-def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu, eval_on_tpu,
-                     embedding_config_spec):
-  """Returns an instance of `_InternalTPUContext`."""
-
-  if (config.tpu_config.num_shards == 1 and
-      config.tpu_config.num_cores_per_replica is None):
-    if embedding_config_spec is not None:
-      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
-                       'when embedding_config_spec is not None.')
-    logging.warning(
-        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
-        'Please fix as soon as possible (leaving num_shards as None.)')
-    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
-                              predict_batch_size, use_tpu)
-
-  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
-                             predict_batch_size, use_tpu, eval_on_tpu,
-                             embedding_config_spec)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow_estimator.python.estimator.tpu.tpu_context import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py
index 6836c12..d3f23a1 100644
--- a/tensorflow/python/tpu/tpu_embedding.py
+++ b/tensorflow/python/tpu/tpu_embedding.py
@@ -34,6 +34,7 @@
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.python.tpu.ops import tpu_ops
 
@@ -322,6 +323,10 @@
           self).__init__(learning_rate, False, clip_weight_min, clip_weight_max)
 
 
+DeviceConfig = collections.namedtuple('DeviceConfig',
+                                      ['num_hosts', 'num_cores', 'job_name'])
+
+
 class TPUEmbedding(object):
   """API for using TPU for embedding.
 
@@ -409,10 +414,12 @@
                feature_to_config_dict,
                batch_size,
                mode,
-               master,
+               master=None,
                optimization_parameters=None,
                cluster_def=None,
-               pipeline_execution_with_tensor_core=False):
+               pipeline_execution_with_tensor_core=False,
+               partition_strategy='div',
+               device_config=None):
     """API for using TPU for embedding lookups.
 
     Args:
@@ -433,10 +440,20 @@
         faster, but trained model will be different if step N and step N+1
         involve the same set of embedding IDs. Please see
         `tpu_embedding_configuration.proto` for details.
+      partition_strategy: A string, either 'mod' or 'div', specifying how to map
+        the lookup id to the embedding tensor. For more information see
+        `tf.nn.embedding_lookup_sparse`.
+      device_config: A DeviceConfig instance, used when `master` and
+        `cluster_def` are both `None`.
 
     Raises:
       ValueError: if any input is invalid.
     """
+    if partition_strategy not in ('div', 'mod'):
+      raise ValueError(
+          'Invalid partition_strategy {}'.format(partition_strategy))
+    self._partition_strategy = partition_strategy
+
     _validate_table_to_config_dict(table_to_config_dict)
     # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
     self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
@@ -452,23 +469,38 @@
 
     self._batch_size = batch_size
 
-    self._master = master
-    self._cluster_def = cluster_def
-    self._tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
-            self._master, cluster_def=self._cluster_def))
-    if self._tpu_system_metadata.num_cores == 0:
-      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
-                       'TPUs.'.format(self._master))
-    self._num_hosts = self._tpu_system_metadata.num_hosts
-    master_job_name = tpu_system_metadata_lib.master_job(self._master,
-                                                         self._cluster_def)
-    self._hosts = sorted([
-        device.name for device in self._tpu_system_metadata.devices
-        if 'device:CPU:' in device.name and (master_job_name is None or
-                                             master_job_name in device.name)])
-    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
-    self._num_cores = self._tpu_system_metadata.num_cores
+    if master is None and cluster_def is None:
+      if device_config is None:
+        raise ValueError('When master and cluster_def are both None, '
+                         'device_config must be set but is not.')
+      if device_config.num_cores % device_config.num_hosts:
+        raise ValueError('num_hosts ({}) should divide num_cores ({}) '
+                         'but does not.'.format(device_config.num_hosts,
+                                                device_config.num_cores))
+      self._num_hosts = device_config.num_hosts
+      self._num_cores = device_config.num_cores
+      self._num_cores_per_host = self._num_cores // self._num_hosts
+      self._hosts = [
+          '{}/replica:0/task:{}/device:CPU:0'.format(device_config.job_name, i)
+          for i in range(self._num_hosts)
+      ]
+    else:
+      tpu_system_metadata = (
+          tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
+              master,
+              cluster_def=cluster_def))
+      if tpu_system_metadata.num_cores == 0:
+        raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
+                         'TPUs.'.format(master))
+      self._num_hosts = tpu_system_metadata.num_hosts
+      master_job_name = tpu_system_metadata_lib.master_job(master, cluster_def)
+      self._hosts = []
+      for device in tpu_system_metadata.devices:
+        if 'device:CPU:' in device.name and (
+            master_job_name is None or master_job_name in device.name):
+          self._hosts.append(device.name)
+      self._num_cores_per_host = tpu_system_metadata.num_of_cores_per_host
+      self._num_cores = tpu_system_metadata.num_cores
 
     _validate_batch_size(self._batch_size, self._num_cores)
     self._batch_size_per_core = self._batch_size // self._num_cores
@@ -575,7 +607,10 @@
       table_descriptor.name = table
 
       table_config = self._table_to_config_dict[table]
-      table_descriptor.vocabulary_size = table_config.vocabulary_size
+      # For small tables, we pad to the number of hosts so that at least one
+      # id will be assigned to each host.
+      table_descriptor.vocabulary_size = max(table_config.vocabulary_size,
+                                             len(self.hosts))
       table_descriptor.dimension = table_config.dimension
 
       table_descriptor.num_features = self._table_to_num_features_dict[table]
@@ -598,7 +633,10 @@
     config_proto.batch_size_per_tensor_core = self._batch_size_per_core
     config_proto.num_hosts = self._num_hosts
     config_proto.num_tensor_cores = self._num_cores
-    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+    config_proto.sharding_strategy = (
+        elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+        if self._partition_strategy == 'div' else
+        elc.TPUEmbeddingConfiguration.MOD)
     config_proto.pipeline_execution_with_tensor_core = (
         self._pipeline_execution_with_tensor_core)
 
@@ -896,7 +934,7 @@
         table_gradients.append(gradient)
       interleaved_table_grads = array_ops.reshape(
           array_ops.concat(table_gradients, axis=1),
-          [-1, table_gradients[0].shape[-1]])
+          [-1, array_ops.shape(table_gradients[0])[-1]])
       gradients.append(interleaved_table_grads)
     return tpu_ops.send_tpu_embedding_gradients(
         inputs=gradients, config=self.config_proto.SerializeToString())
@@ -1244,14 +1282,19 @@
   def device_fn(op):
     """Returns the `device` for `op`."""
     part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
+    dummy_match = re.match(r'.*dummy_(\d+).*', op.name)
+    if not part_match and not dummy_match:
+      raise RuntimeError(
+          'Internal Error: Expected {} to contain /part_* or dummy_*'.format(
+              op.name))
 
     if part_match:
       idx = int(part_match.group(1))
     else:
-      raise RuntimeError('Internal Error: '
-                         'Expected %s to contain /part_*.' % op.name)
+      idx = int(dummy_match.group(1))
 
     device = hosts[idx]
+    logging.debug('assigning %s to %s.', op, device)
     return device
 
   return device_fn
@@ -1264,17 +1307,31 @@
                                   initializer,
                                   collections=None):  # pylint: disable=redefined-outer-name
   """Creates ParitionedVariables based on `num_hosts` for `table`."""
-  # TODO(shizhiw): automatically place embedding lookup elsewhere?
-  if vocabulary_size < num_hosts:
-    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
-                     'As TPU embedding is not optimized for small tables, '
-                     'please consider other ways for this embedding lookup.')
 
-  return list(variable_scope.get_variable(
-      name,
-      shape=(vocabulary_size, embedding_dimension),
-      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
-      dtype=dtypes.float32,
-      initializer=initializer,
-      collections=collections,
-      trainable=False))
+  num_slices = min(vocabulary_size, num_hosts)
+
+  var_list = list(
+      variable_scope.get_variable(
+          name,
+          shape=(vocabulary_size, embedding_dimension),
+          partitioner=partitioned_variables.fixed_size_partitioner(num_slices),
+          dtype=dtypes.float32,
+          initializer=initializer,
+          collections=collections,
+          trainable=False))
+
+  if vocabulary_size >= num_hosts:
+    return var_list
+
+  # For padded part, define the dummy variable to be loaded into TPU system.
+  for idx in range(num_hosts - vocabulary_size):
+    var_list.append(
+        variable_scope.get_variable(
+            'dummy_{}_{}'.format(vocabulary_size + idx, name),
+            shape=(1, embedding_dimension),
+            dtype=dtypes.float32,
+            initializer=initializer,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            trainable=False))
+
+  return var_list
diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py
index 0699407..0ee4906 100644
--- a/tensorflow/python/tpu/tpu_estimator.py
+++ b/tensorflow/python/tpu/tpu_estimator.py
@@ -1,4068 +1,33 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPUEstimator class."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import enum
-import os
-import signal
-import sys
-import threading
-import time
-
-import numpy as np
-import six
-from six.moves import queue as Queue  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.framework import variable_pb2
-from tensorflow.core.framework.summary_pb2 import Summary
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest as data_nest
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import batch_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as contrib_summary
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.tpu import _tpu_estimator_embedding
-from tensorflow.python.tpu import error_handling
-from tensorflow.python.tpu import functional as tpu_functional
-from tensorflow.python.tpu import preempted_hook
-from tensorflow.python.tpu import session_support
-from tensorflow.python.tpu import tensor_tracer
-from tensorflow.python.tpu import tpu
-from tensorflow.python.tpu import tpu_config
-from tensorflow.python.tpu import tpu_context
-from tensorflow.python.tpu import tpu_embedding_gradient
-from tensorflow.python.tpu import tpu_feed
-from tensorflow.python.tpu import tpu_function
-from tensorflow.python.tpu import training_loop
-from tensorflow.python.tpu import util as util_lib
-from tensorflow.python.tpu._tpu_estimator_embedding import AdagradParameters  # pylint: disable=unused-import
-from tensorflow.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
-from tensorflow.python.tpu._tpu_estimator_embedding import StochasticGradientDescentParameters  # pylint: disable=unused-import
-from tensorflow.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
-from tensorflow.python.tpu.ops import tpu_ops
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import evaluation
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-from tensorflow.python.util import function_utils
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
-
-_INITIAL_LOSS = 1e7
-_ZERO_LOSS = 0.
-_TPU_ESTIMATOR = 'tpu_estimator'
-_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
-_BATCH_SIZE_KEY = 'batch_size'
-_CTX_KEY = 'context'
-_USE_TPU_KEY = 'use_tpu'
-_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
-_ONE_GIGABYTE = 1024 * 1024 * 1024
-_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
-_TPU_TRAIN_OP = '_tpu_train_op'
-_INFERENCE_ON_TPU_MODE = '_inference_on_tpu'
-_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
-
-# Ideally _USE_TPU_KEY should be reserved as well. However there are already
-# models that make use of this key, thus it can not be reserved now to prevent
-# breakage. In the long run, we would like to mitigate this by migrating models
-# off of using _USE_TPU_KEY.
-_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
-
-# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
-# only used for per-core based deployments. For per-host based pipelines, if a
-# user returns a Dataset instance it will be automatically wrapped in a
-# tf.while_loop (This can be disabled by returning features and labels
-# explicitly).
-_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
-
-if ops.get_to_proto_function(
-    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)) is None:
-  ops.register_proto_function(
-      '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
-      proto_type=variable_pb2.VariableDef,
-      to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
-      from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
-
-
-def _is_iterable(obj):
-  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
-  try:
-    iter(obj)
-    return True
-  except TypeError:
-    return False
-
-
-class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
-
-  def AddOp(self, op):
-    if op.type in [
-        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
-        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
-    ]:
-      raise ValueError('Use tf.contrib.summary inside of host_calls.')
-
-
-def _create_global_step(graph):
-  graph = graph or ops.get_default_graph()
-  if training.get_global_step(graph) is not None:
-    raise ValueError('"global_step" already exists.')
-  # Create in proper graph and base name_scope.
-  with graph.as_default() as g, g.name_scope(None):
-    return variable_scope.get_variable(
-        ops.GraphKeys.GLOBAL_STEP,
-        shape=[],
-        dtype=dtypes.int64,
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
-
-
-def _create_or_get_iterations_per_loop():
-  """Creates or gets the iterations_per_loop variable.
-
-  In TPUEstimator, the user provided computation, the model_fn, is wrapped
-  inside a tf.while_loop for peak performance. The iterations of the loop are
-  specified by this variable, which adjusts its value on the CPU after each TPU
-  program execution and before the next TPU execution.
-
-  The purpose of using a variable, rather then a constant, is to allow
-  TPUEstimator adapt the TPU training iterations according to the final steps
-  specified by users. For example, if the user sets the iterations_per_loop as 4
-  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
-  variable will have the following value before each TPU training.
-
-      - 1-th TPU execution: iterations_per_loop = 4
-      - 2-th TPU execution: iterations_per_loop = 4
-      - 3-th TPU execution: iterations_per_loop = 2
-
-  As model_fn increases the global step once per train_op invocation, the global
-  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
-  users.
-
-  Returns:
-    A TF non-trainable resource variable.
-
-  Raises:
-    RuntimeError: If multi iterations_per_loop variables were found.
-  """
-  graph = ops.get_default_graph()
-  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
-  iter_vars = graph.get_collection(collection_name)
-  if len(iter_vars) == 1:
-    return iter_vars[0]
-  elif len(iter_vars) > 1:
-    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
-
-  with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(
-        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
-      return variable_scope.get_variable(
-          _ITERATIONS_PER_LOOP_VAR,
-          initializer=init_ops.zeros_initializer(),
-          shape=[],
-          dtype=dtypes.int32,
-          trainable=False,
-          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
-          use_resource=True)
-
-
-def _sync_variables_ops(ctx):
-  """Create varriables synchronization ops.
-
-  Gets the variables back from TPU nodes. This means the variables updated
-  by TPU will now be *synced* to host memory.
-  In BROADCAST mode, we skip this sync since the variables are ususally too
-  big to transmit via RPC.
-
-  Args:
-    ctx: A `_InternalTPUContext` instance with mode.
-
-  Returns:
-    A list of sync ops.
-  """
-
-  if not ctx.is_input_broadcast_with_iterators():
-    return [
-        array_ops.check_numerics(v.read_value(),
-                                 'Gradient for %s is NaN' % v.name).op
-        for v in variables.trainable_variables()
-    ]
-  else:
-    return [control_flow_ops.no_op()]
-
-
-def _increase_eval_step_op(iterations_per_loop):
-  """Returns an op to increase the eval step for TPU evaluation.
-
-  Args:
-    iterations_per_loop: Tensor. The number of eval steps running in TPU system
-      before returning to CPU host for each `Session.run`.
-
-  Returns:
-    An operation
-  """
-  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
-  # Estimator evaluate increases 1 by default. So, we increase the difference.
-  return state_ops.assign_add(
-      eval_step,
-      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
-      use_locking=True)
-
-
-def _extract_key_names(tensor_or_dict):
-  if isinstance(tensor_or_dict, dict):
-    return sorted(tensor_or_dict.keys())
-  return []
-
-
-class PeriodicLogger(object):
-
-  def __init__(self, seconds):
-    self._log_every_n_seconds = seconds
-    self._last_log_time = 0
-
-  def log(self, msg, *args, **kw):
-    if time.time() - self._last_log_time > self._log_every_n_seconds:
-      self._last_log_time = time.time()
-      logging.info(msg, *args, **kw)
-
-
-class _SIGNAL(object):
-  """Signal used to control the thread of infeed/outfeed.
-
-  All preserved signals must be negative numbers. Positive numbers are used to
-  indicate the number of iterations for next training/evaluation loop.
-  """
-  NEXT_BATCH = -1
-  STOP = -2
-
-
-class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
-
-  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
-  `export_outputs`.
-
-  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
-  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
-  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
-  To be precise, TPU evaluation expects a slightly different signature from the
-  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
-  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
-  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
-  `tensors` usually specify the model logits, which are transferred back from
-  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
-  size is the first dimension. Once all tensors are available at CPU host from
-  all shards, they are concatenated (on CPU) and passed as positional arguments
-  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
-  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
-  name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
-  `eval_metrics`.
-
-  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
-  function should not capture any Tensors in `model_fn`.
-
-  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
-  to pass to that function and returns a list of Tensors. `host_call` currently
-  works for train() and evaluate(). The Tensors returned by the function is
-  executed on the CPU on every step, so there is communication overhead when
-  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
-  size of the tensors. The `tensors` are concatenated along their major (batch)
-  dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with `tf.contrib.summary.create_file_writer`.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metrics=None,
-              export_outputs=None,
-              scaffold_fn=None,
-              host_call=None,
-              training_hooks=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a validated `TPUEstimatorSpec` instance."""
-    host_calls = {}
-    if eval_metrics is not None:
-      host_calls['eval_metrics'] = eval_metrics
-    if host_call is not None:
-      host_calls['host_call'] = host_call
-    _OutfeedHostCall.validate(host_calls)
-
-    training_hooks = tuple(training_hooks or [])
-    evaluation_hooks = tuple(evaluation_hooks or [])
-    prediction_hooks = tuple(prediction_hooks or [])
-
-    for hook in training_hooks + evaluation_hooks + prediction_hooks:
-      if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
-                        .format(hook))
-
-    return super(TPUEstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metrics=eval_metrics,
-        export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn,
-        host_call=host_call,
-        training_hooks=training_hooks,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def as_estimator_spec(self):
-    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    host_calls = {}
-    if self.eval_metrics is not None:
-      host_calls['eval_metrics'] = self.eval_metrics
-    if self.host_call is not None:
-      host_calls['host_call'] = self.host_call
-    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
-    eval_metric_ops = None
-    if self.eval_metrics is not None:
-      eval_metric_ops = host_call_ret['eval_metrics']
-    hooks = None
-    if self.host_call is not None:
-      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    loss = self.loss
-    if tensor_tracer.TensorTracer.is_enabled() \
-       and self.train_op is not None:
-      tt = tensor_tracer.TensorTracer()
-      loss = tt.trace_cpu(ops.get_default_graph(), loss, self.train_op)
-
-    hooks = tuple(hooks or [])
-    scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(
-        mode=self.mode,
-        predictions=self.predictions,
-        loss=loss,
-        train_op=self.train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=self.export_outputs,
-        scaffold=scaffold,
-        training_hooks=self.training_hooks + hooks,
-        evaluation_hooks=self.evaluation_hooks + hooks,
-        prediction_hooks=self.prediction_hooks + hooks)
-
-
-class _OpQueueContext(object):
-  """Manages work queue and thread for a infeed/outfeed thread."""
-
-  def __init__(self, name, target, args):
-    self._name = name
-    self._queue = Queue.Queue()
-    args = (self,) + args
-    self._thread = threading.Thread(name=name, target=target, args=args)
-    self._thread.daemon = True
-    self._thread.start()
-
-  def stop(self):
-    self._queue.put(_SIGNAL.STOP)
-
-  def send_next_batch_signal(self, iterations):
-    self._queue.put(iterations)
-
-  def read_iteration_counts(self):
-    while True:
-      iterations = self._queue.get(block=True)
-      logging.debug('%s read iterations %s', self._name, iterations)
-      if iterations == _SIGNAL.STOP:
-        logging.info('%s received shutdown signal, stopping.', self._name)
-        return
-      yield iterations
-
-  def join(self):
-    logging.info('Shutting down %s thread.', self._name)
-    self.stop()
-    self._thread.join()
-
-
-class _OpSignalOnceQueueContext(_OpQueueContext):
-  """Manages work queue and thread for a infeed/outfeed thread.
-
-  This subclass only signals once.
-  """
-
-  def __init__(self, name, target, args):
-    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
-    self._has_signaled = False
-
-  def send_next_batch_signal(self, iterations):
-    if not self._has_signaled:
-      self._queue.put(iterations)
-      self._has_signaled = True
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
-
-  def __init__(self,
-               ctx,
-               enqueue_ops,
-               dequeue_ops,
-               tpu_compile_op,
-               run_infeed_loop_on_coordinator=True,
-               rendezvous=None,
-               master=None,
-               session_config=None,
-               tpu_init_ops=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._rendezvous = rendezvous
-    self._master = master
-    self._session_config = session_config
-    self._init_ops = list(tpu_init_ops or [])
-    if ctx.embedding_config is None:
-      self._embedding_layer_config = None
-    else:
-      self._embedding_layer_config = (
-          ctx.embedding_config.tpu_embedding.config_proto)
-    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
-
-    # When using model parallelism, the TPU is pre-initialized at startup to
-    # fetch mesh information.  We skip re-initializing it here to avoid
-    # suspected issues due to the mesh layout changing on the second
-    # initialization.
-    self._should_initialize_tpu = not ctx.model_parallelism_enabled
-    self._tpu_compile_op = tpu_compile_op
-
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    if self._should_initialize_tpu:
-      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
-    else:
-      self._finalize_ops = []
-
-    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
-    self._init_ops.extend(summary_writer_init_ops)
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    for op in summary_writer_init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def _run_infeed(self, queue_ctx, session):
-    logging.info('Starting infeed thread controller.')
-    if self._initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   self._initial_infeed_sleep_secs)
-      time.sleep(self._initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-
-    with self._rendezvous.catch_errors(source='infeed', session=session):
-      if self._run_infeed_loop_on_coordinator:
-        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-          for i in xrange(steps):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(self._enqueue_ops)
-      else:
-        for _ in queue_ctx.read_iteration_counts():
-          session.run(self._enqueue_ops)
-      logging.info('Infeed thread finished, shutting down.')
-
-  def _run_outfeed(self, queue_ctx, session):
-    logging.info('Starting outfeed thread controller.')
-    status_logger = PeriodicLogger(seconds=60)
-    with self._rendezvous.catch_errors(source='outfeed', session=session):
-      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-        for i in xrange(steps):
-          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-          session.run(self._dequeue_ops)
-          status_logger.log('Outfeed finished for iteration (%d, %d)', count, i)
-      logging.info('Outfeed thread finished, shutting down.')
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpQueueContext(name=name, target=target, args=args)
-
-  def _assertCompilationSucceeded(self, result, coord):
-    proto = tpu_compilation_result.CompilationResultProto()
-    proto.ParseFromString(result)
-    if proto.status_error_message:
-      logging.error('Compilation failed: {}'.format(proto.status_error_message))
-      coord.request_stop()
-    else:
-      logging.info('Compilation succeeded')
-
-  def after_create_session(self, session, coord):
-    if self._should_initialize_tpu:
-      logging.info('Init TPU system')
-      start = time.time()
-      with ops.Graph().as_default():
-        with tf_session.Session(
-            self._master, config=self._session_config) as sess:
-          sess.run(
-              tpu.initialize_system(
-                  job=self._master_job,
-                  embedding_config=self._embedding_layer_config))
-      logging.info('Initialized TPU in %d seconds', time.time() - start)
-
-    session.run(self._init_ops,
-                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-
-    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
-      logging.info('Compiling user program: this may take a while...')
-      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
-
-    self._infeed_controller = self._create_infeed_controller(
-        name='InfeedController', target=self._run_infeed, args=(session,))
-
-    self._outfeed_controller = _OpQueueContext(
-        name='OutfeedController', target=self._run_outfeed, args=(session,))
-
-    # Enable the worker watchdog to terminate workers on coordinator exit.
-    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
-    if watchdog_timeout > 0:
-      session_support.start_worker_watchdog(session,
-                                            shutdown_timeout=watchdog_timeout)
-
-  def before_run(self, run_context):
-    iterations = run_context.session.run(self._iterations_per_loop_var)
-
-    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
-    self._infeed_controller.send_next_batch_signal(iterations)
-
-    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
-                 iterations)
-    self._outfeed_controller.send_next_batch_signal(iterations)
-
-  def end(self, session):
-    logging.info('Stop infeed thread controller')
-    self._infeed_controller.join()
-    self._rendezvous.record_done('infeed')
-
-    logging.info('Stop output thread controller')
-    self._outfeed_controller.join()
-    self._rendezvous.record_done('outfeed')
-
-    logging.info('Shutdown TPU system.')
-    session.run(self._finalize_ops)
-
-
-class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
-
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
-               rendezvous=None, master=None, session_config=None):
-    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx,
-        enqueue_ops,
-        dequeue_ops,
-        tpu_compile_op=tpu_compile_op,
-        run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous,
-        master=master,
-        session_config=session_config)
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
-
-
-class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step.
-
-  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
-  following differences for TPU training:
-
-  1. This hook sets the variable for iterations_per_loop, which is used by
-     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
-     As the hook execution order is not guaranteed, the variable update is
-     handled in `after_create_session` and `after_run` as
-     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
-
-  2. For each training loop (session.run), the global step could be increased
-     multiple times on TPU. The global step tensor value will be explicitly read
-     again in `after_run` to ensure the latest value is retrieved to avoid race
-     condition.
-  """
-
-  def __init__(self, iterations, num_steps=None, last_step=None):
-    """Initializes a `StopAtStepHook`.
-
-    Args:
-      iterations: The number of iterations to run optimizer per training loop.
-      num_steps: Number of steps to execute.
-      last_step: Step after which to stop.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
-    if num_steps is None and last_step is None:
-      raise ValueError('One of num_steps or last_step must be specified.')
-    if num_steps is not None and last_step is not None:
-      raise ValueError('Only one of num_steps or last_step can be specified.')
-    self._num_steps = num_steps
-    self._last_step = last_step
-    self._iterations = iterations
-
-  def _next_iterations(self, global_step, last_step):
-    gap = last_step - global_step
-    return min(gap, self._iterations)
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError('Global step should be created.')
-
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-    if self._last_step is None:
-      self._last_step = global_step + self._num_steps
-
-    iterations = self._next_iterations(global_step, self._last_step)
-
-    self._iterations_per_loop_var.load(iterations, session=session)
-
-  def after_run(self, run_context, run_values):
-    # Global step cannot be retrieved via SessionRunArgs and before_run due to
-    # race condition.
-    global_step = run_context.session.run(self._global_step_tensor)
-    if global_step >= self._last_step:
-      run_context.request_stop()
-    else:
-      iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(
-          iterations, session=run_context.session)
-
-
-class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step."""
-
-  def __init__(self, num_steps):
-    """Initializes a `_SetEvalIterationsHook`.
-
-    Args:
-      num_steps: Number of steps to execute.
-    """
-    self._num_steps = num_steps
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    self._iterations_per_loop_var.load(self._num_steps, session=session)
-
-
-class _StoppingPredictHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop according to the stopping signal in prediction."""
-
-  def __init__(self, scalar_stopping_signal):
-    self._scalar_stopping_signal = scalar_stopping_signal
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
-    # in side threads for prediction model. But it makes the
-    # TPUInfeedOutfeedSessionHook prints nice message.
-    self._iterations_per_loop_var.load(1, session=session)
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-    scalar_stopping_signal = run_values.results
-    if _StopSignals.should_stop(scalar_stopping_signal):
-      # NOTE(xiejw): In prediction, stopping signals are inserted for each
-      # batch. And we append one more batch to signal the system it should stop.
-      # The data flow might look like
-      #
-      #  batch   0: images, labels, stop = 0  (user provided)
-      #  batch   1: images, labels, stop = 0  (user provided)
-      #  ...
-      #  batch  99: images, labels, stop = 0  (user provided)
-      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
-      #
-      # where the final batch (id = 100) is appended by TPUEstimator, so we
-      # should drop it before returning the predictions to user.
-      # To achieve that, we throw the OutOfRangeError in after_run. Once
-      # Monitored Session sees this error in SessionRunHook.after_run, the
-      # "current" prediction, i.e., batch with id=100, will be discarded
-      # immediately
-      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
-
-
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
-  """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A fn returns enqueue_ops."""
-    num_cores_per_host = ctx.num_of_cores_per_host
-    per_host_sharded_inputs = []
-    for core_ordinal in range(num_cores_per_host):
-      with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        user_context = tpu_context.TPUContext(
-            internal_ctx=ctx,
-            input_device=host_device,
-            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
-        inputs = _Inputs.from_input_fn(input_fn(user_context))
-        if inputs.is_dataset:
-          raise TypeError(
-              '`input_fn` returning `Dataset`  is not yet supported in '
-              'per-Core input pipeline deployment yet. Please set '
-              'TPUConfig.per_host_input_for_training to True or return '
-              '`features` and `labels` from `input_fn`')
-        features, labels = inputs.features_and_labels()
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels))
-        per_host_sharded_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-
-    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
-    return per_host_enqueue_ops
-
-  return enqueue_ops_fn, captured_infeed_queue
-
-
-def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-      if batch_axis is not None:
-        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A Fn returning the TPU infeed enqueue ops.
-
-    By providing as a Fn, it can be invoked inside the tf.while_loop such that
-    the input pipeline for multiple iterations can be executed by one
-    Session.run call.
-
-    Returns:
-      list of dict of ops.
-    """
-    with ops.device(device):
-      num_of_replicas_per_host = ctx.num_of_replicas_per_host
-      # Convert user input to features and labels.  If the user returns a
-      # dataset, it is initialized and the features and labels extracted via
-      # `dataset.iterator.get_next()`
-      features, labels = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      inputs_structure_recorder.validate_and_record_structure(features, labels)
-      unsharded_tensor_list = (
-          inputs_structure_recorder.flatten_features_and_labels(
-              features, labels, signals))
-
-      infeed_queue = tpu_feed.InfeedQueue(
-          tuple_types=[t.dtype for t in unsharded_tensor_list],
-          tuple_shapes=[t.shape for t in unsharded_tensor_list],
-          shard_dimensions=batch_axis)
-      captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
-      per_host_enqueue_ops = (
-          infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device,
-              tpu_ordinal_function=tpu_ordinal_function_impl))
-      if signals is None:
-        return per_host_enqueue_ops
-      else:
-        return {
-            'ops': per_host_enqueue_ops,
-            'signals': signals,
-        }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_per_host_v2_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if not is_dataset:
-      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
-                      'input pipeline configuration.')
-
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True,
-          num_invocations_per_step=ctx.num_of_replicas_per_host)
-
-    dataset_initializer = inputs.dataset_initializer()
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """Generates the per_host enqueue ops."""
-    control_deps = []
-    per_host_sharded_inputs = []
-    enqueue_datas_list = []
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-    cached_signals = None
-    with ops.device(device):
-      if not inputs.is_dataset:
-        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
-      for _ in range(num_replicas_per_host):
-        # Use control dependencies to ensure a deterministic ordering.
-        with ops.control_dependencies(control_deps):
-          features, labels = inputs.features_and_labels()  # Calls get_next()
-          signals = inputs.signals()
-
-          # All the replicas share the replica 0's stopping singal.
-          # This avoids inconsistent state among different model replcias.
-          if cached_signals:
-            signals['stopping'] = cached_signals['stopping']
-          else:
-            cached_signals = signals
-
-        features, labels, enqueue_data = (
-            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
-        enqueue_datas_list.append(enqueue_data)
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels, signals))
-        control_deps.extend(flattened_inputs)
-        per_host_sharded_inputs.append(flattened_inputs)
-
-      if inputs_structure_recorder.flattened_input_dims:
-        input_partition_dims = inputs_structure_recorder.flattened_input_dims
-        if signals:
-          input_partition_dims += [None] * len(signals)
-        # pylint: disable=protected-access
-        infeed_queue = tpu_feed._PartitionedInfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
-            host_id=host_id,
-            input_partition_dims=input_partition_dims,
-            device_assignment=ctx.device_assignment)
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs)
-      else:
-        infeed_queue = tpu_feed.InfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs,
-            tpu_ordinal_function=tpu_ordinal_function_impl)
-      captured_infeed_queue.capture(infeed_queue)
-
-    if ctx.embedding_config:
-      per_host_enqueue_ops.extend(
-          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
-              enqueue_datas_list))
-
-    if signals is None:
-      return per_host_enqueue_ops
-    else:
-      return {
-          'ops': per_host_enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
-                                      num_hosts):
-  """Generates infeed enqueue ops for one input_fn on all the hosts."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device_0, invocation_index=0)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-
-  def tpu_ordinal_function_impl(replica_id):
-    if ctx.device_assignment:
-      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
-    else:
-      return replica_id % num_replicas_per_host
-
-  def device_function_impl(replica_id):
-    return ctx.tpu_host_placement_function(replica_id=replica_id)
-
-  def enqueue_ops_fn():
-    """Generates enqueue ops for all the hosts."""
-    broadcasted_inputs = []
-    flattened_inputs = None  # Cache result from input_fn.
-    signals = None
-    num_replicas = ctx.num_replicas
-    core_id = 0
-    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
-        for _ in xrange(ctx.num_of_replicas_per_host):
-          # Note: input_fn is only called once at host 0 for the first replica.
-          # The features and labels returned from that invocation are
-          # broadcasted to other replicas(including the replicas on other
-          # hosts).
-          if flattened_inputs is None:
-            features, labels = inputs.features_and_labels()  # Calls get_next()
-            signals = inputs.signals()
-
-            inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
-            flattened_inputs = (
-                inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels, signals))
-            if (ctx.config.tpu_config.eval_training_input_configuration is
-                tpu_config.InputPipelineConfig.SLICED):
-              input_slices = [
-                  array_ops.split(x, num_replicas) for x in flattened_inputs
-              ]
-          if (ctx.config.tpu_config.eval_training_input_configuration is
-              tpu_config.InputPipelineConfig.SLICED):
-            # for each core, slice out the flattened_inputs for each core.
-            broadcasted_inputs.append([x[core_id] for x in input_slices])
-            core_id += 1
-          else:
-            broadcasted_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(broadcasted_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-    enqueue_ops = infeed_queue.generate_enqueue_ops(
-        broadcasted_inputs,
-        tpu_ordinal_function=tpu_ordinal_function_impl,
-        placement_function=device_function_impl)
-
-    if signals is None:
-      return enqueue_ops
-    else:
-      return {
-          'ops': enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-class _InputPipeline(object):
-  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
-
-  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
-  call site.  To be precise, based on the configuration in
-  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
-  multi-host TPU training) or for one host (usually for single-host TPU
-  evaluation), and sends all `features` and `labels` returned by `input_fn` to
-  TPU infeed. For per-core invocation, `features` and `labels` are piped to
-  infeed directly, one tuple for each core. For per-host invocation,  `features`
-  and `labels` are split at host (with respect to `batch_axis`) and piped to all
-  cores accordingly.
-
-  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
-  inputs returned by the `input_fn` can have one of the following forms:
-  1. features
-  2. (features, labels)
-  3. ((arbitrarily nested structure of features), labels)
-
-  Internally, form 1 is reformed to `(features, None)` as features and labels
-  are passed separately to underlying methods. For TPU training, TPUEstimator
-  may expect multiple `features` and `labels` tuples one for each core.
-
-  TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  Both `features` and `labels` can be any nested sturcture
-  supported by TF nest (namely, dict, tuples, namedtuples or any nested
-  structure of such of Tensors).  `labels` could be `None` as well.
-
-  These are flattened before they are passed to the infeed/outfeed library
-  as that expectes flattend lists.
-  """
-
-  class InputsStructureRecorder(object):
-    """The recorder to record inputs structure."""
-
-    def __init__(self, input_partition_dims=None):
-      # Holds the structure of inputs
-      self._feature_structure = {}
-      self._flattened_input_dims = None
-
-      if input_partition_dims:
-        # This should have been validated in TPUConfig.
-        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
-        if len(input_partition_dims) == 2:
-          self._feature_dims, self._label_dims = input_partition_dims
-        else:
-          self._feature_dims = input_partition_dims[0]
-          self._label_dims = None
-
-        assert self._feature_dims is not None, ('input_partition_dims[0] must '
-                                                'not be None')
-      else:
-        self._feature_dims = None
-        self._label_dims = None
-
-      # Internal state.
-      self._initialized = False
-
-    @property
-    def flattened_input_dims(self):
-      assert self._initialized, 'InputsStructureRecorder is not initialized.'
-      return self._flattened_input_dims
-
-    def has_labels(self):
-      return 'labels' in self._feature_structure
-
-    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
-                            label_dims_names, label_names, has_labels):
-      """Flatten input dims with the same order as flattened input tensors."""
-      flattened_input_dims = []
-      if feature_dims_names:
-        # We need a fixed ordering for matching the tensors in features.
-        flattened_input_dims.extend(
-            [feature_dims[name] for name in feature_dims_names])
-      else:
-        flattened_input_dims.append(feature_dims)
-
-      if label_dims_names:
-        # We need a fixed ordering for matching the tensors in labels.
-        flattened_input_dims.extend(
-            [label_dims[name] for name in label_dims_names])
-      else:
-        if label_names:
-          num_tensors_in_label = len(label_names)
-        else:
-          num_tensors_in_label = int(has_labels)
-        # Setting `None` in input_partition_dims[1] will apply `None` to
-        # all the tensors in labels, regardless of internal structure.
-        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
-
-      return flattened_input_dims
-
-    def validate_and_record_structure(self, features, labels):
-      """Validates and records the structure of `features` and `labels`."""
-      # Extract structure.
-      has_labels = labels is not None
-      feature_names = _extract_key_names(features)
-      label_names = _extract_key_names(labels)
-
-      if not self._initialized:
-        # Record structure.
-        self._initialized = True
-        if self._feature_dims is not None:
-          feature_dims_names = _extract_key_names(self._feature_dims)
-          if feature_dims_names != feature_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[0] mismatched feature'
-                ' keys. Expected {}, got {}'.format(feature_names,
-                                                    feature_dims_names))
-
-          label_dims_names = _extract_key_names(self._label_dims)
-          if self._label_dims is not None and label_dims_names != label_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[1] mismatched label'
-                ' keys. Expected {}, got {}'.format(label_names,
-                                                    label_dims_names))
-
-          self._flattened_input_dims = self._flatten_input_dims(
-              self._feature_dims, feature_dims_names, self._label_dims,
-              label_dims_names, label_names, has_labels)
-
-    def flatten_features_and_labels(self, features, labels, signals=None):
-      """Flattens the `features` and `labels` to a single tensor list."""
-      self._feature_structure['features'] = features
-      if labels is not None:
-        self._feature_structure['labels'] = labels
-      if signals is not None:
-        self._feature_structure['signals'] = signals
-      return data_nest.flatten(self._feature_structure)
-
-    def unflatten_features_and_labels(self, flattened_inputs):
-      """Restores the flattened inputs to original features and labels form.
-
-      Args:
-        flattened_inputs: Flattened inputs for each shard.
-
-      Returns:
-        A tuple of (`features`, `labels`), where `labels` could be None.
-        Each one, if present, should have identical structure (single tensor vs
-        dict) as the one returned by input_fn.
-
-      Raises:
-        ValueError: If the number of expected tensors from `flattened_inputs`
-          mismatches the recorded structure.
-      """
-
-      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
-                                                      flattened_inputs)
-      return _Inputs(
-          unflattened_inputs['features'],
-          unflattened_inputs.get('labels'),
-          signals=unflattened_inputs.get('signals'))
-
-  def __init__(self, input_fn, batch_axis, ctx):
-    """Constructor.
-
-    Args:
-      input_fn: input fn for train or eval.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards.
-      ctx: A `_InternalTPUContext` instance with mode.
-
-    Raises:
-      ValueError: If both `sharded_features` and `num_cores` are `None`.
-    """
-    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
-        ctx.input_partition_dims)
-
-    self._sharded_per_core = ctx.is_input_sharded_per_core()
-    self._input_fn = input_fn
-    self._infeed_queue = None
-    self._ctx = ctx
-    self._batch_axis = batch_axis
-
-  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
-    """Generates infeed enqueue ops and dequeue_fn."""
-    # While tf.while_loop is called, the body function, which invokes
-    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
-    # structure is recorded.
-    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
-        self._invoke_input_fn_and_record_structure())
-
-    self._validate_input_pipeline()
-
-    def dequeue_fn():
-      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
-      # computations must agree on the core on which infeed takes place. We
-      # choose to perform infeed on logical core 0 of each replica.
-      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
-      # The unflatten process uses the structure information recorded above.
-      return self._inputs_structure_recorder.unflatten_features_and_labels(
-          values)
-
-    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
-
-  def _invoke_input_fn_and_record_structure(self):
-    """Deploys the input pipeline and record input structure."""
-    enqueue_ops = []
-    infeed_queues = []
-    all_dataset_initializers = []
-    num_hosts = self._ctx.num_hosts
-    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
-
-    run_infeed_loop_on_coordinator = True
-
-    if self._sharded_per_core:
-      # Per-Core input pipeline deployment.
-      # Invoke input pipeline for each core and placed on the corresponding
-      # host.
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
-                generate_per_core_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    host_device, host_id))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              run_infeed_loop_on_coordinator = False
-              enqueue_ops.append(
-                  _wrap_computation_in_while_loop(
-                      device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(captured_infeed_queue.get())
-
-    elif self._ctx.is_input_broadcast_with_iterators():
-      # Only calls input_fn in host 0.
-      host_device = tpu_host_placement_fn(host_id=0)
-      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
-                                            self._inputs_structure_recorder,
-                                            num_hosts))
-      if dataset_initializer:
-        all_dataset_initializers.append(dataset_initializer)
-        run_infeed_loop_on_coordinator = False
-        wrap_fn = (
-            _wrap_computation_in_while_loop
-            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-            _wrap_computation_in_while_loop_with_stopping_signals)
-        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-      else:
-        enqueue_ops.append(enqueue_ops_fn())
-      infeed_queues.append(captured_infeed_queue.get())
-    else:
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            if self._ctx.is_input_per_host_with_iterators():
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_v2_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, host_device, host_id))
-            else:
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, self._batch_axis,
-                      host_device, host_id))
-
-            # NOTE(xiejw): We dispatch here based on the return type of the
-            # users `input_fn`.
-            #
-            # 1. If input_fn returns a Dataset instance, we initialize the
-            # iterator outside of tf.while_loop, and call the iterator.get_next
-            # inside tf.while_loop.  This should be always safe.
-            #
-            # 2. If input_fn returns (features, labels), it is too late to wrap
-            # them inside tf.while_loop, as resource initialization cannot be
-            # handled in TF control flow properly. In this case, we will use
-            # python loop to enqueue the data into TPU system.  This may be
-            # slow compared to the previous case.
-            if dataset_initializer:
-              all_dataset_initializers.append(dataset_initializer)
-              run_infeed_loop_on_coordinator = False
-              wrap_fn = (
-                  _wrap_computation_in_while_loop
-                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-                  _wrap_computation_in_while_loop_with_stopping_signals)
-              enqueue_ops.append(
-                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(captured_infeed_queue.get())
-    # infeed_queue is used to generate dequeue ops. The only thing it uses for
-    # dequeue is dtypes and types. So, any one can be used. Here, grab the
-    # first one.
-    self._infeed_queue = infeed_queues[0]
-    return enqueue_ops, [
-        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
-    ], run_infeed_loop_on_coordinator
-
-  def _validate_input_pipeline(self):
-    """Validates the input pipeline.
-
-    Perform some sanity checks to log user friendly information. We should
-    error out to give users better error message. But, if
-    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
-    user code, so, log a warning.
-
-    Raises:
-      RuntimeError: If the validation failed.
-    """
-    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
-      err_msg = ('Input pipeline contains one or more QueueRunners. '
-                 'It could be slow and not scalable. Please consider '
-                 'converting your input pipeline to use `tf.data` instead (see '
-                 'https://www.tensorflow.org/guide/datasets for '
-                 'instructions.')
-      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        raise RuntimeError(err_msg)
-      else:
-        logging.warn(err_msg)
-
-
-def call_computation(computation_inputs,
-                     computation,
-                     experimental_export_device_assignment,
-                     batch_config=None):
-  """Call computation.
-
-  Args:
-    computation_inputs: A tensor or dict of tensors, the inputs to the
-      computation.
-    computation: A Python function that takes no inputs and builds computation
-      graph. If `computation` returns m outputs, this function will return a
-      list of m Tensors.
-    experimental_export_device_assignment: If `True`, use user-provided device
-      assignment. If `False`, round-robin computation among all TPU cores
-      visible to the host.
-    batch_config: A BatchConfig named tuple specifying the batching
-      configuration to use for inference batching.
-
-  Returns:
-    A list of output tensors.
-  """
-  if experimental_export_device_assignment:
-    return computation(computation_inputs)
-
-  # Using `TPUPartitionedCall` makes it possible to target a different
-  # TPU core with every `Session.run()` call. Note that the entire inference
-  # graph executes on a single core, and that invocations of this graph
-  # will round-robin among the cores attached to a host.
-  def tpu_partitioned_call(partition_inputs):
-
-    # capture_resource_var_by_value enables variables to be mirrored on TPU
-    # to avoid fetching from CPU, since variables do not change during
-    # inference.
-    @function.Defun(capture_resource_var_by_value=False)
-    def tpu_subgraph():
-      return computation(partition_inputs)
-
-    return tpu_functional.TPUPartitionedCall(
-        args=tpu_subgraph.captured_inputs,
-        device_ordinal=tpu_ops.tpu_ordinal_selector(),
-        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
-        f=tpu_subgraph)
-
-  # Not using Batching Function but use TPUPartitionedCall/all cores.
-  if not batch_config:
-    return tpu_partitioned_call(computation_inputs)
-
-  # Use Batching Function and TPUPartitionedCall/all cores.
-  # Note that BatchingFunction requires a list of tensors and doesn't support
-  # a dict of tensors. So we preserve the structure by deterministically
-  # flattening the dict before batching and then recomposing it after batching
-  # to feed into the computation.
-  ordered_inputs_list = nest.flatten(computation_inputs)
-
-  @batch_ops.batch_function(
-      num_batch_threads=batch_config.num_batch_threads,
-      max_batch_size=batch_config.max_batch_size,
-      batch_timeout_micros=batch_config.batch_timeout_micros,
-      allowed_batch_sizes=batch_config.allowed_batch_sizes,
-      max_enqueued_batches=batch_config.max_enqueued_batches,
-      autograph=False)
-  def batched_tpu_computation(*tensor_args):
-    """Recompose the input feature dict and calls the TPU computation."""
-    computation_feature_input = nest.pack_sequence_as(computation_inputs,
-                                                      tensor_args)
-    return tpu_partitioned_call(computation_feature_input)
-
-  return batched_tpu_computation(*ordered_inputs_list)
-
-
-class _ModelFnWrapper(object):
-  """A `model_fn` wrapper.
-
-  This makes calling model_fn on CPU and TPU easier and more consistent and
-  performs necessary check and mutation required by TPU training and evaluation.
-
-  In addition, this wrapper manages converting the `model_fn` to a single TPU
-  train and eval step.
-  """
-
-  def __init__(self, model_fn, config, params, ctx):
-    self._model_fn = model_fn
-    self._config = config
-    self._params = params
-    self._ctx = ctx
-
-  def call_without_tpu(self, features, labels, is_export_mode):
-    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
-
-  def _add_embedding_features(self, features, hook_dummy_table_variables):
-    """Add embedding features, optionally add hook to intercept gradient."""
-    if self._ctx.embedding_config:
-      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
-      embedding_activations = tpu_embedding_.get_activations()
-      if hook_dummy_table_variables:
-        new_embedding_activations = (
-            tpu_embedding_gradient.hook_dummy_table_variables_to_activations(
-                tpu_embedding_, embedding_activations,
-                self._ctx.embedding_config.dummy_table_variables))
-        features.update(new_embedding_activations)
-      else:
-        features.update(embedding_activations)
-
-  def convert_to_single_tpu_train_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single train step on TPU.
-
-    The user provided `model_fn` takes input tuple
-    (features, labels) and produces the EstimatorSpec with train_op and loss for
-    train `mode`. This usually represents a single train computation on CPU.
-
-    For TPU training, a train (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input should be taken from TPU infeed rather
-    than input pipeline (input_fn) directly. To fit TPU loop and replicate
-    pattern, the original train computation should be reformed, which is the
-    returned `train_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
-      representing the train step for TPU.
-    """
-
-    host_call = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_training_hooks = _CapturedObject()
-
-    def train_step(loss):
-      """Training step function for use inside a while loop."""
-      del loss  # unused; required in function signature.
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      self._add_embedding_features(features, True)
-
-      estimator_spec = self._verify_estimator_spec(
-          self._call_model_fn(features, labels))
-      loss, train_op = estimator_spec.loss, estimator_spec.train_op
-
-      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
-      else:
-        captured_scaffold_fn.capture(None)
-
-      captured_training_hooks.capture(estimator_spec.training_hooks)
-
-      if self._ctx.embedding_config is None:
-        apply_sparse_grads = []
-      else:
-        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
-        gradients = (
-            tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
-                tpu_embedding_)
-        )
-        grad_multiplier = self._ctx.embedding_config.get_grad_multiplier()
-        if grad_multiplier is not None:
-          scaled_gradients = collections.OrderedDict(
-              (k, v * grad_multiplier) for k, v in six.iteritems(gradients))
-        else:
-          scaled_gradients = gradients
-        apply_sparse_grads = [
-            tpu_embedding_.generate_send_gradients_op(scaled_gradients)
-        ]
-
-      # We must run train_op to update the variables prior to running the
-      # outfeed.
-      with ops.control_dependencies([train_op] + apply_sparse_grads):
-        host_call_outfeed_ops = []
-        host_call_fn, host_call_args = None, []
-
-        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
-            and estimator_spec.host_call is not None):
-          host_call_fn, host_call_args = estimator_spec.host_call
-
-        if host_call_fn:
-          # Ignore dummy hostcalls (no arguments)
-          if host_call_args:
-            host_call.record({'host_call': estimator_spec.host_call})
-            host_call_outfeed_ops = host_call.create_enqueue_op()
-        else:
-          # Create a host call for the loss to track execution progress
-          # Without this, we don't have any indication of the state of the
-          # TPU program.
-          host_call.record({
-              'host_call': (lambda loss_t: loss_t,
-                            [array_ops.reshape(loss, [1])])
-          })
-          host_call_outfeed_ops = host_call.create_enqueue_op()
-
-        with ops.control_dependencies(host_call_outfeed_ops):
-          return array_ops.identity(loss)
-
-    return (train_step, host_call, captured_scaffold_fn,
-            captured_training_hooks)
-
-  def convert_to_single_tpu_eval_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single eval step on TPU.
-
-    Similar to training, the user provided `model_fn` takes input tuple
-    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
-    eval `mode`. This usually represents a single evaluation computation on CPU.
-
-    For TPU evaluation, a eval (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input and output are slightly different. Input,
-    features and labels, should be taken from TPU infeed rather than input
-    pipeline (input_fn) directly. Output is managed in two stages.  First, the
-    model outputs as the result of evaluation computation, usually model logits,
-    should be transferred from TPU system to CPU. Then, all model outputs are
-    concatenated first on CPU and sent to the metric_fn for metrics computation.
-    To fit TPU evaluation pattern, the original eval computation should be
-    reformed, which is the returned `eval_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
-      representing the eval step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_eval_hooks = _CapturedObject()
-
-    def eval_step(total_loss):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      self._add_embedding_features(features, False)
-
-      tpu_estimator_spec = self._call_model_fn(features, labels)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU evaluation must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      loss = tpu_estimator_spec.loss
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
-
-      to_record = {}
-      if tpu_estimator_spec.eval_metrics:
-        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
-      if tpu_estimator_spec.host_call is not None:
-        # We assume that evaluate won't update global step, so we don't wrap
-        # this host_call.
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return math_ops.add(total_loss, loss)
-
-    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-
-  def convert_to_single_tpu_predict_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single predict step on TPU.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
-      predict_fn representing the predict step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_predict_hooks = _CapturedObject()
-
-    def predict_step(unused_scalar_stopping_signal):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      stopping_signals = inputs.signals()
-
-      assert stopping_signals is not None, (
-          'Internal Error: `signals` is missing.')
-
-      tpu_estimator_spec = self._call_model_fn(
-          features, labels, is_export_mode=False)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU prediction must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
-
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
-      to_record = {}
-      identity_fn = lambda **kwargs: kwargs
-      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
-      to_record['signals'] = [identity_fn, stopping_signals]
-      if tpu_estimator_spec.host_call is not None:
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
-
-    return (predict_step, host_calls, captured_scaffold_fn,
-            captured_predict_hooks)
-
-  def _verify_tpu_spec_predictions(self, predictions):
-    """Validates TPUEstimatorSpec.predictions dict."""
-    # TODO(xiejw): Adds validation for prediction dictionrary.
-    # TODO(xiejw): Adds support for single tensor as predictions.
-    if not isinstance(predictions, dict):
-      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
-
-    for (key, tensor) in predictions.items():
-      if tensor.shape.dims[0].value is None:
-        raise ValueError(
-            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
-            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
-    return predictions
-
-  def _validate_model_features_and_labels(self, features, labels,
-                                          is_export_mode):
-    """Validates that the features and labels for the model function are valid.
-
-    A valid features/labels object is the one with:
-    - Type: A tensor or any nested structure of tensors supported by TF nest,
-        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
-    - Static shape if is_export_mode is False.
-
-    Args:
-      features: the features that would be input to the model function.
-      labels: the labels that would be input to the model function.
-      is_export_mode: boolean value specifying if in export mode.
-
-    Raises:
-      TypeError: If features/labels are not of the correct type.
-      ValueError: If features/labels have dynamic shape.
-    """
-
-    def validate(obj, obj_name):
-      """Helper validate function."""
-      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
-        return
-      if isinstance(obj, ops.Tensor):
-        if not obj.get_shape().is_fully_defined():
-          raise ValueError(
-              'The {} to the model returned by input_fn must have static shape.'
-              ' Tensor: {}'.format(obj_name, obj))
-      else:
-        for tensor in data_nest.flatten(obj):
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                ('The {} to the model returned by input_fn must have static '
-                 'shape. Tensor: {}').format(obj_name, tensor))
-
-    validate(features, 'features')
-    if labels is not None:
-      validate(labels, 'labels')
-
-  def _call_model_fn(self, features, labels, is_export_mode=False):
-    """Calls the model_fn with required parameters."""
-    self._validate_model_features_and_labels(features, labels, is_export_mode)
-    model_fn_args = function_utils.fn_args(self._model_fn)
-    kwargs = {}
-
-    # Makes deep copy with `config` and params` in case user mutates them.
-    config = copy.deepcopy(self._config)
-    params = copy.deepcopy(self._params)
-
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    elif labels is not None:
-      raise ValueError(
-          'model_fn does not take labels, but input_fn returns labels.')
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = self._ctx.mode
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-    if 'params' in model_fn_args:
-      kwargs['params'] = params
-
-    if 'params' not in model_fn_args:
-      raise ValueError('model_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params[\'batch_size\']'.format(self._model_fn))
-
-    if is_export_mode:
-      batch_size_for_model_fn = None
-    else:
-      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
-
-    if batch_size_for_model_fn is not None:
-      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
-
-    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
-    # In export mode, params['use_tpu'] has already been set based on mode
-    # (i.e. True for _REWRITE_FOR_INFERENCE_MODE, False otherwise).
-    if not is_export_mode:
-      _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
-
-    if not running_on_cpu:
-      user_context = tpu_context.TPUContext(
-          internal_ctx=self._ctx, call_from_input_fn=False)
-      _add_item_to_params(params, _CTX_KEY, user_context)
-
-    estimator_spec = self._model_fn(features=features, **kwargs)
-    if (running_on_cpu and
-        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
-      # The estimator_spec will be passed to `Estimator` directly, which expects
-      # type `EstimatorSpec`.
-      return estimator_spec.as_estimator_spec()
-    else:
-      return estimator_spec
-
-  def _verify_estimator_spec(self, estimator_spec):
-    """Validates the estimator_spec."""
-    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-      return estimator_spec
-
-    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
-    if estimator_spec.training_chief_hooks:
-      raise ValueError(
-          err_msg.format('training_chief_hooks') + 'If you want' +
-          ' to pass training hooks, please pass via training_hooks.')
-
-    if estimator_spec.scaffold:
-      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
-                      'Please use TPUEstimatorSpec.')
-    return estimator_spec
-
-
-class _OutfeedHostCall(object):
-  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
-
-  def __init__(self, ctx):
-    self._ctx = ctx
-    self._names = []
-    # All of these are dictionaries of lists keyed on the name.
-    self._host_fns = {}
-    self._tensor_keys = collections.defaultdict(list)
-    self._tensors = collections.defaultdict(list)
-    self._tensor_dtypes = collections.defaultdict(list)
-    self._tensor_shapes = collections.defaultdict(list)
-
-  @staticmethod
-  def validate(host_calls):
-    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
-
-    for name, host_call in host_calls.items():
-      if not isinstance(host_call, (tuple, list)):
-        raise ValueError('{} should be tuple or list'.format(name))
-      if len(host_call) != 2:
-        raise ValueError('{} should have two elements.'.format(name))
-      if not callable(host_call[0]):
-        raise TypeError('{}[0] should be callable.'.format(name))
-      if not isinstance(host_call[1], (tuple, list, dict)):
-        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
-
-      if isinstance(host_call[1], (tuple, list)):
-        fullargspec = tf_inspect.getfullargspec(host_call[0])
-        fn_args = function_utils.fn_args(host_call[0])
-        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
-        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
-          raise RuntimeError(
-              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
-              'method args of the function, which takes {}.'.format(
-                  name, len(host_call[1]), len(fn_args)))
-
-  @staticmethod
-  def create_cpu_hostcall(host_calls):
-    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
-
-    _OutfeedHostCall.validate(host_calls)
-    ret = {}
-    for name, host_call in host_calls.items():
-      host_fn, tensors = host_call
-      if isinstance(tensors, (tuple, list)):
-        ret[name] = host_fn(*tensors)
-      else:
-        # Must be dict.
-        try:
-          ret[name] = host_fn(**tensors)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling %s: %s. It is likely the tensors '
-              '(%s[1]) do not match the '
-              'function\'s arguments', name, e, name)
-          raise
-    return ret
-
-  def record(self, host_calls):
-    """Records the host_call structure."""
-
-    for name, host_call in host_calls.items():
-      host_fn, tensor_list_or_dict = host_call
-      self._names.append(name)
-      self._host_fns[name] = host_fn
-
-      if isinstance(tensor_list_or_dict, dict):
-        for (key, tensor) in six.iteritems(tensor_list_or_dict):
-          self._tensor_keys[name].append(key)
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-      else:
-        # List or tuple.
-        self._tensor_keys[name] = None
-        for tensor in tensor_list_or_dict:
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-
-  def create_enqueue_op(self):
-    """Create the op to enqueue the recorded host_calls.
-
-    Returns:
-      A list of enqueue ops, which is empty if there are no host calls.
-    """
-    if not self._names:
-      return []
-
-    tensors = []
-    # TODO(jhseu): Consider deduping tensors.
-    for name in self._names:
-      tensors.extend(self._tensors[name])
-
-    with ops.device(tpu.core(0)):
-      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
-
-  def create_tpu_hostcall(self):
-    """Sends the tensors through outfeed and runs the host_fn on CPU.
-
-    The tensors are concatenated along dimension 0 to form a global tensor
-    across all shards. The concatenated function is passed to the host_fn and
-    executed on the first host.
-
-    Returns:
-      A dictionary mapping name to the return type of the host_call by that
-      name.
-
-    Raises:
-      RuntimeError: If outfeed tensor is scalar.
-    """
-    if not self._names:
-      return {}
-
-    ret = {}
-    # For each i, dequeue_ops[i] is a list containing the tensors from all
-    # shards. This list is concatenated later.
-    dequeue_ops = []
-    tensor_dtypes = []
-    tensor_shapes = []
-    for name in self._names:
-      for _ in self._tensors[name]:
-        dequeue_ops.append([])
-      for dtype in self._tensor_dtypes[name]:
-        tensor_dtypes.append(dtype)
-      for shape in self._tensor_shapes[name]:
-        tensor_shapes.append(shape)
-
-    # Outfeed ops execute on each replica's first logical core. Note: we must
-    # constraint it such that we have at most one outfeed dequeue and enqueue
-    # per replica.
-    for i in xrange(self._ctx.num_replicas):
-      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
-        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=tensor_dtypes,
-            shapes=tensor_shapes,
-            device_ordinal=ordinal_id)
-        for j, item in enumerate(outfeed_tensors):
-          dequeue_ops[j].append(item)
-
-    # Deconstruct dequeue ops.
-    flat_dequeue_ops = []
-    for l in dequeue_ops:
-      flat_dequeue_ops.extend(l)
-
-    dequeue_ops_by_name = {}
-    pos = 0
-    for name in self._names:
-      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
-                                              len(self._tensors[name])]
-      pos += len(self._tensors[name])
-
-    def _call_host_fn(fn, *args, **kw):
-      context = CatchInvalidHostcallFunctions()
-      context.Enter()
-      result = fn(*args, **kw)
-      context.Exit()
-      context.ExitResult(result)
-      return result
-
-    # It is assumed evaluation always happens on single host TPU system. So,
-    # place all ops on tpu host if possible.
-    #
-    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
-      for name in self._names:
-        dequeue_ops = dequeue_ops_by_name[name]
-        for i, item in enumerate(dequeue_ops):
-          if dequeue_ops[i][0].shape.ndims == 0:
-            raise RuntimeError(
-                'All tensors outfed from TPU should preserve batch size '
-                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-          # TODO(xiejw): Make the specification of the outfeed combinaton
-          # function more explicit and well-documented.  We may want to give the
-          # user the option of concatenating along any axis.
-          if (self._ctx.config.tpu_config.per_host_input_for_training is
-              tpu_config.InputPipelineConfig.BROADCAST):
-            # If the infeed is in BROADCAST mode (each core recieving the same
-            # input), then we assume that the cores also produce identical
-            # copies of the same output, and we simply take the output from
-            # the first core.  This mode is used by Mesh-TensorFlow.
-            with ops.control_dependencies(dequeue_ops[i]):
-              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
-          else:
-            # Assume that the input has been batch-split and that axis 0 of the
-            # output tensors represents the batch size.  Concatenate along
-            # the axis 0 to re-combine the batch.
-            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
-
-        if self._tensor_keys[name] is not None:
-          # The user-provided eval_metrics[1] is a dict.
-          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
-          try:
-            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
-          except TypeError as e:
-            logging.warning(
-                'Exception while calling %s: %s. It is likely the tensors '
-                '(%s[1]) do not match the '
-                'function\'s arguments', name, e, name)
-            raise
-        else:
-          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
-
-    # force all dequeue operations to be run if not consumed by the host calls
-    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
-    return ret
-
-
-class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
-  """Hook to run host calls when use_tpu=False."""
-
-  def __init__(self, tensors):
-    self._tensors = tensors
-
-  def begin(self):
-    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
-    # create a separate hook to guarantee execution order, because summaries
-    # need to be initialized before the outfeed thread starts.
-    # TODO(jhseu): Make a wrapper hook instead?
-    self._init_ops = contrib_summary.summary_writer_initializer_op()
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    self._finalize_ops = []
-    for op in self._init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def after_create_session(self, session, coord):
-    session.run(self._init_ops)
-
-  def before_run(self, run_context):
-    return basic_session_run_hooks.SessionRunArgs(self._tensors)
-
-  def end(self, session):
-    session.run(self._finalize_ops)
-
-
-class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
-  """Calculate and report global_step/sec and examples/sec during runtime."""
-
-  def __init__(self,
-               batch_size,
-               every_n_steps=100,
-               every_n_secs=None,
-               output_dir=None,
-               summary_writer=None):
-    self._batch_size = batch_size
-    super(ExamplesPerSecondHook, self).__init__(
-        every_n_steps=every_n_steps,
-        every_n_secs=every_n_secs,
-        output_dir=output_dir,
-        summary_writer=summary_writer)
-
-  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
-    global_step_per_sec = elapsed_steps / elapsed_time
-    examples_per_sec = self._batch_size * global_step_per_sec
-    if self._summary_writer is not None:
-      global_step_summary = Summary(value=[
-          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
-      ])
-      example_summary = Summary(value=[
-          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
-      ])
-      self._summary_writer.add_summary(global_step_summary, global_step)
-      self._summary_writer.add_summary(example_summary, global_step)
-    logging.info('global_step/sec: %g', global_step_per_sec)
-    logging.info('examples/sec: %g', examples_per_sec)
-
-
-class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
-  """Change SIGINT (CTRL^C) handler to force quit the process.
-
-  The default behavior often results in hanging processes.
-  The original handler is restored after training/evaluation.
-  """
-
-  def __init__(self):
-    self._signal_fn = signal.getsignal(signal.SIGINT)
-
-  def before_run(self, run_context):
-    signal.signal(signal.SIGINT, signal.SIG_DFL)
-
-  def end(self, session):
-    signal.signal(signal.SIGINT, self._signal_fn)
-
-
-class ExportSavedModelApiVersion(enum.Enum):
-  V1 = 1
-  V2 = 2
-
-
-class BatchConfig(
-    collections.namedtuple('BatchConfig', [
-        'num_batch_threads', 'max_batch_size', 'batch_timeout_micros',
-        'allowed_batch_sizes', 'max_enqueued_batches'
-    ])):
-  """Class to handle config inputs into the batching function."""
-
-  def __new__(cls,
-              num_batch_threads,
-              max_batch_size,
-              batch_timeout_micros,
-              allowed_batch_sizes,
-              max_enqueued_batches=10):
-    """Creates an EmbeddingConfigSpec instance.
-
-    Args:
-     num_batch_threads: Number of scheduling threads for processing batches of
-       work. Determines the number of batches processed in parallel.
-      max_batch_size: Batch sizes will never be bigger than this.
-      batch_timeout_micros: Maximum number of microseconds to wait before
-        outputting an incomplete batch.
-      allowed_batch_sizes: Optional list of allowed batch sizes. If left empty,
-        does nothing. Otherwise, supplies a list of batch sizes, causing the op
-        to pad batches up to one of those sizes. The entries must increase
-        monotonically, and the final entry must equal max_batch_size.
-      max_enqueued_batches: The maximum depth of the batch queue. Defaults to
-        10.
-
-    Returns:
-      An BatchConfig instance.
-    """
-    return super(BatchConfig, cls).__new__(
-        cls,
-        num_batch_threads=num_batch_threads,
-        max_batch_size=max_batch_size,
-        batch_timeout_micros=batch_timeout_micros,
-        allowed_batch_sizes=allowed_batch_sizes,
-        max_enqueued_batches=max_enqueued_batches)
-
-
-class TPUEstimator(estimator_lib.Estimator):
-  """Estimator with TPU support.
-
-  TPUEstimator also supports training on CPU and GPU. You don't need to define
-  a separate `tf.estimator.Estimator`.
-
-  TPUEstimator handles many of the details of running on TPU devices, such as
-  replicating inputs and models for each core, and returning to host
-  periodically to run hooks.
-
-  TPUEstimator transforms a global batch size in params to a per-shard batch
-  size when calling the `input_fn` and `model_fn`. Users should specify
-  global batch size in constructor, and then get the batch size for each shard
-  in `input_fn` and `model_fn` by `params['batch_size']`.
-
-  - For training, `model_fn` gets per-core batch size; `input_fn` may get
-    per-core or per-host batch size depending on `per_host_input_for_training`
-    in `TPUConfig` (See docstring for TPUConfig for details).
-
-  - For evaluation and prediction, `model_fn` gets per-core batch size and
-    `input_fn` get per-host batch size.
-
-  Evaluation
-  ==========
-
-  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
-  for TPU evaluation. If eval_on_tpu is False, the evaluation will execute on
-  CPU or GPU; in this case the following discussion on TPU evaluation does not
-  apply.
-
-  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
-  `tensors` could be a list of any nested structure of `Tensor`s (See
-  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
-  a dict from metric string name to the result of calling a metric function,
-  namely a `(metric_tensor, update_op)` tuple.
-
-  One can set `use_tpu` to `False` for testing. All training, evaluation, and
-  predict will be executed on CPU. `input_fn` and `model_fn` will receive
-  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
-
-  Current limitations:
-  --------------------
-
-  1. TPU evaluation only works on a single host (one TPU worker) except
-     BROADCAST mode.
-
-  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
-     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
-     batches should have the same size.
-
-  Example (MNIST):
-  ----------------
-
-  ```
-  # The metric Fn which runs on CPU.
-  def metric_fn(labels, logits):
-    predictions = tf.argmax(logits, 1)
-    return {
-      'accuracy': tf.compat.v1.metrics.precision(
-          labels=labels, predictions=predictions),
-    }
-
-  # Your model Fn which runs on TPU (eval_metrics is list in this example)
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, [labels, logits]))
-
-  # or specify the eval_metrics tensors as dict.
-  def model_fn(features, labels, mode, config, params):
-    ...
-    final_layer_output = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, {
-              'labels': labels,
-              'logits': final_layer_output,
-          }))
-  ```
-
-  Prediction
-  ==========
-
-  Prediction on TPU is an experimental feature to support large batch inference.
-  It is not designed for latency-critical system. In addition, due to some
-  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
-  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
-  convenient.
-
-  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
-  *should* raise an end-of-input exception (`OutOfRangeError` or
-  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
-  precise, the ops created by `input_fn` produce one batch of the data.
-  The `predict()` API processes one batch at a time. When reaching the end of
-  the data source, an end-of-input exception should be raised by one of these
-  operations. The user usually does not need to do this manually. As long as the
-  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
-  exception automatically after the last batch has been produced.
-
-  Note: Estimator.predict returns a Python generator. Please consume all the
-  data from the generator so that TPUEstimator can shutdown the TPU system
-  properly for user.
-
-  Current limitations:
-  --------------------
-  1. TPU prediction only works on a single host (one TPU worker).
-
-  2. `input_fn` must return a `Dataset` instance rather than `features`. In
-  fact, .train() and .evaluate() also support Dataset as return value.
-
-  Example (MNIST):
-  ----------------
-  ```
-  height = 32
-  width = 32
-  total_examples = 100
-
-  def predict_input_fn(params):
-    batch_size = params['batch_size']
-
-    images = tf.random.uniform(
-        [total_examples, height, width, 3], minval=-1, maxval=1)
-
-    dataset = tf.data.Dataset.from_tensor_slices(images)
-    dataset = dataset.map(lambda images: {'image': images})
-
-    dataset = dataset.batch(batch_size)
-    return dataset
-
-  def model_fn(features, labels, params, mode):
-     # Generate predictions, called 'output', from features['image']
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      return tf.contrib.tpu.TPUEstimatorSpec(
-          mode=mode,
-          predictions={
-              'predictions': output,
-              'is_padding': features['is_padding']
-          })
-
-  tpu_est = TPUEstimator(
-      model_fn=model_fn,
-      ...,
-      predict_batch_size=16)
-
-  # Fully consume the generator so that TPUEstimator can shutdown the TPU
-  # system.
-  for item in tpu_est.predict(input_fn=input_fn):
-    # Filter out item if the `is_padding` is 1.
-    # Process the 'predictions'
-  ```
-
-  Exporting
-  =========
-
-  `export_savedmodel` exports 2 metagraphs, one with `saved_model.SERVING`,
-  and another with `saved_model.SERVING` and `saved_model.TPU`.
-  At serving time, these tags are used to select metagraph to load.
-
-  Before running the graph on TPU, TPU system needs to be initialized. If
-  TensorFlow Serving model-server is used, this is done automatically. If
-  not, please call `session.run(tpu.initialize_system())`.
-
-  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
-  `model_fn`.
-
-  Example:
-  ----------------
-
-  ```
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-    export_outputs = {
-      'logits': export_output_lib.PredictOutput(
-        {'logits': logits})
-    }
-
-    def host_call(logits):
-      class_ids = math_ops.argmax(logits)
-      classes = string_ops.as_string(class_ids)
-      export_outputs['classes'] =
-        export_output_lib.ClassificationOutput(classes=classes)
-
-    tpu.outside_compilation(host_call, logits)
-
-    ...
-  ```
-
-  """
-
-  def __init__(self,
-               model_fn=None,
-               model_dir=None,
-               config=None,
-               params=None,
-               use_tpu=True,
-               train_batch_size=None,
-               eval_batch_size=None,
-               predict_batch_size=None,
-               batch_axis=None,
-               eval_on_tpu=True,
-               export_to_tpu=True,
-               export_to_cpu=True,
-               warm_start_from=None,
-               experimental_export_device_assignment=False,
-               embedding_config_spec=None,
-               export_saved_model_api_version=ExportSavedModelApiVersion.V1):
-    """Constructs an `TPUEstimator` instance.
-
-    Args:
-      model_fn: Model function as required by `Estimator` which returns
-        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
-        and `prediction_hooks` must not capure any TPU Tensor inside the
-        model_fn.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
-      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
-      params: An optional `dict` of hyper parameters that will be passed into
-        `input_fn` and `model_fn`.  Keys are names of parameters, values are
-        basic python types. There are reserved keys for `TPUEstimator`,
-        including 'batch_size'.
-      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
-        TPU training and evaluation respect this bit, but eval_on_tpu can
-        override execution of eval. See below. - Predict still happens on CPU.
-      train_batch_size: An int representing the global training batch size.
-        TPUEstimator transforms this global batch size to a per-shard batch
-        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
-        number of replicas.
-      eval_batch_size: An int representing evaluation batch size. Must be
-        divisible by total number of replicas.
-      predict_batch_size: An int representing the prediction batch size. Must be
-        divisible by total number of replicas.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards. For example, if your input_fn produced (images, labels)
-        where the images tensor is in `HWCN` format, your shard dimensions would
-        be [3, 0], where 3 corresponds to the `N` dimension of your images
-        Tensor, and 0 corresponds to the dimension along which to split the
-        labels to match up with the corresponding images. If None is supplied,
-        and per_host_input_for_training is True, batches will be sharded based
-        on the major dimension. If tpu_config.per_host_input_for_training is
-        False or `PER_HOST_V2`, batch_axis is ignored.
-      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
-        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
-      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU. Note that unsupported export modes such as EVAL will be
-        ignored. For those modes, only a CPU model will be exported.
-        Currently, export_to_tpu only supports PREDICT.
-      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on CPU.
-      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
-        configure warm-starting.  If the string filepath is provided instead of
-        a `WarmStartSettings`, then all variables are warm-started, and it is
-        assumed that vocabularies and Tensor names are unchanged.
-      experimental_export_device_assignment: Whether to include the device
-        assignment in the exported model. Doing so is useful in case of model
-        parallel inference but will tie the exported model to the TPU topology
-        used to export the model.
-      embedding_config_spec: Optional EmbeddingConfigSpec instance
-        to support using TPU embedding.
-      export_saved_model_api_version: ExportSavedModelApiVersion, V1 or V2.
-        With V1, `export_savedmodel()` adds rewrite() and TPUPartitionedCallOp()
-        for user; while in v2, user is expected to add rewrite(),
-        TPUPartitionedCallOp() etc in their model_fn.
-        A helper function `inference_on_tpu` is provided for V2.
-        brn_tpu_estimator.py includes examples for both versions
-        i.e. TPUEstimatorExportTest and TPUEstimatorExportV2Test.
-
-    Raises:
-      ValueError: `params` has reserved keys already.
-    """
-    if config is None or not isinstance(config, tpu_config.RunConfig):
-      raise ValueError(
-          '`config` must be provided with type `tpu_config.RunConfig`')
-
-    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError('{} are reserved keys but existed in params {}.'.format(
-          _RESERVED_PARAMS_KEYS, params))
-
-    if use_tpu:
-      # Perform some very basic validations. More validations will be found in
-      # _InternalTPUContext.
-      if train_batch_size is None:
-        raise ValueError('`train_batch_size` cannot be `None`')
-      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
-
-      if (config.tpu_config.per_host_input_for_training is
-          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
-          config.tpu_config.num_cores_per_replica):
-        raise ValueError(
-            'Model parallelism only supports per host input for training. '
-            'Please adjust TPURunconfig.per_host_input_for_training.')
-
-      if eval_batch_size is not None:
-        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
-
-      if predict_batch_size is not None:
-        util_lib.check_positive_integer(predict_batch_size,
-                                        'predict_batch_size')
-
-      if embedding_config_spec:
-        if (config.tpu_config.per_host_input_for_training !=
-            tpu_config.InputPipelineConfig.PER_HOST_V2):
-          raise ValueError('Only PER_HOST_V2 is supported when using TPU '
-                           'Embedding; got {}.'.format(
-                               config.tpu_config.per_host_input_for_training))
-
-    # Verifies the model_fn signature according to Estimator framework.
-    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
-    # We cannot store config and params in this constructor as parent
-    # constructor might change them, such as assigning a temp dir for
-    # config.model_dir.
-    model_function = self._augment_model_fn(model_fn, batch_axis)
-
-    # Overwrite log_step_count_steps to disable TensorLoggingHook and
-    # StepCounterHook from being created in Estimator. TPUEstimator already
-    # added equivalent hooks in _augment_model_fn above.
-    self._log_every_n_steps = config.log_step_count_steps
-    config = config.replace(log_step_count_steps=None)
-
-    # Passing non-None params as wrapped model_fn has it.
-    params = params or {}
-    super(TPUEstimator, self).__init__(
-        model_fn=model_function,
-        model_dir=model_dir,
-        config=config,
-        params=params,
-        warm_start_from=warm_start_from)
-    self._iterations_per_training_loop = (
-        self._config.tpu_config.iterations_per_loop)
-
-    # All properties passed to _InternalTPUContext are immutable.
-    # pylint: disable=protected-access
-    self._ctx = tpu_context._get_tpu_context(
-        self._config, train_batch_size, eval_batch_size, predict_batch_size,
-        use_tpu, eval_on_tpu, embedding_config_spec)
-
-    self._export_to_cpu = export_to_cpu
-    self._export_to_tpu = export_to_tpu
-    self._experimental_export_device_assignment = (
-        experimental_export_device_assignment)
-
-    if not isinstance(export_saved_model_api_version,
-                      ExportSavedModelApiVersion):
-      raise ValueError('export_saved_model_api_version should be of type '
-                       'ExportSavedModelApiVersion; got {}.'.format(
-                           export_saved_model_api_version))
-    self._export_saved_model_api_version = export_saved_model_api_version
-    self._is_input_fn_invoked = None
-
-    self._rendezvous = {}
-
-  def _add_meta_graph_for_mode(self,
-                               builder,
-                               input_receiver_fn_map,
-                               checkpoint_path,
-                               save_variables=True,
-                               mode=model_fn_lib.ModeKeys.PREDICT,
-                               export_tags=None,
-                               check_variables=True,
-                               strip_default_attrs=True):
-    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
-                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
-                      'for TPU.'.format(mode))
-
-    if not self._export_to_cpu and not self._export_to_tpu:
-      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
-
-    if self._export_to_cpu:
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables,
-          strip_default_attrs=strip_default_attrs))
-
-    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
-      input_receiver_fn_map = {
-          _INFERENCE_ON_TPU_MODE: input_receiver_fn_map[mode]
-      }
-      export_tags = [tag_constants.SERVING, tag_constants.TPU]
-      mode = _INFERENCE_ON_TPU_MODE
-
-      # See b/110052256 for why `check_variables` is `False`.
-      if not self._export_to_cpu:
-        check_variables = save_variables = True
-      else:
-        check_variables = save_variables = False
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables=save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables,
-          strip_default_attrs=strip_default_attrs))
-
-  def _call_model_fn(self, features, labels, mode, config):
-    if self._export_saved_model_api_version == ExportSavedModelApiVersion.V1:
-      if mode == _INFERENCE_ON_TPU_MODE:
-        return self._call_model_fn_for_inference(features, labels, mode, config)
-      else:
-        return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
-                                                        config)
-    else:
-      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
-                                                      config)
-
-  def _call_model_fn_for_inference(self, features, labels, mode, config):
-    """Wraps `_call_model_fn` for `export_savedmodel`."""
-    if mode != _INFERENCE_ON_TPU_MODE:
-      raise ValueError('mode must be {}; '
-                       'got {}.'.format(_INFERENCE_ON_TPU_MODE, mode))
-    return model_fn_inference_on_tpu(
-        self._model_fn,
-        features,
-        labels,
-        config,
-        self._params,
-        batch_config=None,
-        experimental_export_device_assignment=self
-        ._experimental_export_device_assignment,
-        call_context=self._ctx)
-
-  def _create_global_step(self, graph):
-    """Creates a global step suitable for TPUs.
-
-    Args:
-      graph: The graph in which to create the global step.
-
-    Returns:
-      A global step `Tensor`.
-
-    Raises:
-      ValueError: if the global step tensor is already defined.
-    """
-    return _create_global_step(graph)
-
-  def _convert_train_steps_to_hooks(self, steps, max_steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
-            steps, max_steps)
-
-    # On TPU.
-    if steps is None and max_steps is None:
-      raise ValueError(
-          'For TPU training, one of `steps` or `max_steps` must be set. '
-          'Cannot be both `None`.')
-
-    # Estimator.train has explicit positiveness check.
-    if steps is not None:
-      util_lib.check_positive_integer(steps, 'Train steps')
-    if max_steps is not None:
-      util_lib.check_positive_integer(max_steps, 'Train max_steps')
-
-    return [
-        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
-    ]
-
-  def _convert_eval_steps_to_hooks(self, steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
-
-    if steps is None:
-      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
-
-    util_lib.check_positive_integer(steps, 'Eval steps')
-
-    return [
-        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
-            num_evals=steps),
-        _SetEvalIterationsHook(steps)
-    ]
-
-  def _call_input_fn(self, input_fn, mode):
-    """Calls the input function.
-
-    Args:
-      input_fn: The input function.
-      mode: ModeKeys
-
-    Returns:
-      In TPU mode, returns an input_fn to be called later in model_fn.
-      Otherwise, calls the input_fn and returns either fatures or
-        (features, labels).
-
-    Raises:
-      ValueError: if input_fn takes invalid arguments or does not have `params`.
-    """
-    input_fn_args = function_utils.fn_args(input_fn)
-    config = self.config  # a deep copy.
-    kwargs = {}
-    if 'params' in input_fn_args:
-      kwargs['params'] = self.params  # a deep copy.
-    else:
-      raise ValueError('input_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params["batch_size"]'.format(input_fn))
-    if 'config' in input_fn_args:
-      kwargs['config'] = config
-
-    if 'mode' in input_fn_args:
-      kwargs['mode'] = mode
-
-    # Records the fact input_fn has been invoked.
-    self._is_input_fn_invoked = True
-
-    with self._ctx.with_mode(mode) as ctx:
-      # Setting the batch size in params first. This helps user to have same
-      # input_fn for use_tpu=True/False.
-      batch_size_for_input_fn = ctx.batch_size_for_input_fn
-      if batch_size_for_input_fn is not None:
-        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
-                            batch_size_for_input_fn)
-
-      # For export_savedmodel, input_fn is never passed to Estimator. So,
-      # `is_export_mode` must be False.
-      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
-          return input_fn(**kwargs)
-
-      # For TPU computation, input_fn should be invoked in a tf.while_loop for
-      # performance. While constructing the tf.while_loop, the structure of
-      # inputs returned by the `input_fn` needs to be recorded. The structure
-      # includes whether features or labels is dict or single Tensor, dict keys,
-      # tensor shapes, and dtypes. The recorded structure is used to create the
-      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
-      # inside the TPU computation, as the TPU computation is wrapped inside a
-      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
-      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
-      # `features` in `model_fn` signature.
-      def _input_fn(ctx):
-        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
-        return input_fn(**kwargs)
-
-      return _input_fn
-
-  def _validate_features_in_predict_input(self, result):
-    """Skip the validation.
-
-    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
-    has stronger check. Parent class's check generates confusing warning msg.
-
-    Args:
-      result: `features` returned by input_fn.
-    """
-    pass
-
-  def train(self,
-            input_fn,
-            hooks=None,
-            steps=None,
-            max_steps=None,
-            saving_listeners=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
-    try:
-      return super(TPUEstimator, self).train(
-          input_fn=input_fn,
-          hooks=hooks,
-          steps=steps,
-          max_steps=max_steps,
-          saving_listeners=saving_listeners)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('training_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('training_loop')
-      rendezvous.raise_errors()
-
-  def evaluate(self,
-               input_fn,
-               steps=None,
-               hooks=None,
-               checkpoint_path=None,
-               name=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
-    try:
-      return super(TPUEstimator, self).evaluate(
-          input_fn,
-          steps=steps,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          name=name)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('evaluation_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('evaluation_loop')
-      rendezvous.raise_errors()
-
-  def predict(self,
-              input_fn,
-              predict_keys=None,
-              hooks=None,
-              checkpoint_path=None,
-              yield_single_examples=True):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
-    try:
-      for result in super(TPUEstimator, self).predict(
-          input_fn=input_fn,
-          predict_keys=predict_keys,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          yield_single_examples=yield_single_examples):
-        yield result
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('prediction_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('prediction_loop')
-      rendezvous.raise_errors()
-
-    rendezvous.record_done('prediction_loop')
-    rendezvous.raise_errors()
-
-  def _augment_model_fn(self, model_fn, batch_axis):
-    """Returns a new model_fn, which wraps the TPU support."""
-
-    def _model_fn(features, labels, mode, config, params):
-      """A Estimator `model_fn` for TPUEstimator."""
-
-      # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
-      # but not in `export_savedmodel()`.
-      if self._is_input_fn_invoked:
-        is_export_mode = False
-      else:
-        is_export_mode = True
-
-      # Clear the bit.
-      self._is_input_fn_invoked = None
-
-      if is_export_mode:
-        if mode == _INFERENCE_ON_TPU_MODE:
-          _add_item_to_params(params, _USE_TPU_KEY, True)
-          mode = model_fn_lib.ModeKeys.PREDICT
-        else:
-          _add_item_to_params(params, _USE_TPU_KEY, False)
-
-      with self._ctx.with_mode(mode) as ctx:
-        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
-
-        # examples_hook is added to training_hooks for both CPU and TPU
-        # execution.
-        if self._log_every_n_steps is not None:
-          examples_hook = ExamplesPerSecondHook(
-              ctx.global_batch_size,
-              # pylint:disable=g-long-ternary
-              output_dir=(self.model_dir
-                          if not config or config.save_summary_steps
-                          else None),
-              # pylint:enable=g-long-ternary
-              every_n_steps=self._log_every_n_steps)
-
-        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
-          logging.info('Running %s on CPU', mode)
-          estimator_spec = model_fn_wrapper.call_without_tpu(
-              features, labels, is_export_mode=is_export_mode)
-          if self._log_every_n_steps is not None:
-            estimator_spec = estimator_spec._replace(
-                training_hooks=estimator_spec.training_hooks + (examples_hook,))
-          return estimator_spec
-
-        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
-        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
-        assert callable(features), '`input_fn` is not callable.'
-        input_fn = features
-
-        tpu_init_ops = []
-        if ctx.embedding_config and mode == model_fn_lib.ModeKeys.TRAIN:
-          dummy_table_variables, dummy_table_variables_init = (
-              tpu_embedding_gradient.create_dummy_table_variables(
-                  ctx.embedding_config.tpu_embedding))
-          ctx.embedding_config.dummy_table_variables = dummy_table_variables
-          tpu_init_ops.append(dummy_table_variables_init)
-
-        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
-            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
-
-        graph = ops.get_default_graph()
-        for enqueue_op in enqueue_ops:
-          if isinstance(enqueue_op, list):
-            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
-          else:
-            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
-
-        if mode == model_fn_lib.ModeKeys.TRAIN:
-          compile_op, loss, host_call, scaffold_fn, training_hooks = (
-              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          if ctx.embedding_config:
-            g = ops.get_default_graph()
-            table_to_config_dict = (
-                ctx.embedding_config.tpu_embedding.table_to_config_dict)
-            optimization_parameters = (
-                ctx.embedding_config.tpu_embedding.optimization_parameters)
-            embedding_variable_name_by_table, slot_variable_names_by_table = (
-                _tpu_estimator_embedding.get_full_variable_names(
-                    g, table_to_config_dict, optimization_parameters
-                )
-            )
-            embedding_variables_and_ops = (
-                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
-                    embedding_variable_name_by_table,
-                    slot_variable_names_by_table
-                ))
-            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
-          # scaffold_fn must be called after variables for TPU embedding has
-          # been created on CPU, as user might reinitialize those from some
-          # checkpoint within scaffold_fn.
-          scaffold = _get_scaffold(scaffold_fn)
-
-          host_ops = host_call.create_tpu_hostcall()
-
-          shutdown_hooks = []
-          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
-                                         'reset_computation')
-          if shutdown_mode:
-            if shutdown_mode == 'shutdown_worker':
-              finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(),
-              ]
-            elif shutdown_mode == 'shutdown_all_workers':
-              finalizer_hooks = [
-                  session_support.ShutdownAllWorkers(),
-              ]
-            elif shutdown_mode == 'reset_computation':
-              finalizer_hooks = [
-                  session_support.ResetComputation(),
-              ]
-            elif not shutdown_mode:
-              finalizer_hooks = []
-            else:
-              raise ValueError(
-                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
-
-            if finalizer_hooks:
-              shutdown_hooks.append(
-                  session_support.GracefulShutdownHook(
-                      checkpoint_prefix=self.model_dir + '/model.ckpt',
-                      on_shutdown_hooks=finalizer_hooks))
-
-          with ops.control_dependencies([loss]):
-            global_step = array_ops.identity(training.get_global_step())
-          hooks = input_hooks + shutdown_hooks
-          hooks.extend([
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.master,
-                  session_config=self._session_config,
-                  tpu_init_ops=tpu_init_ops),
-              InstallSignalHandlerHook()
-          ])
-          if tpu_cluster_resolver.is_running_in_gce():
-            hooks.extend(
-                [preempted_hook.CloudTPUPreemptedHook(self._config.cluster)])
-          if self._log_every_n_steps is not None:
-            logging_hook_frequency = (  # Divide and round up
-                (self._log_every_n_steps +
-                 self._config.tpu_config.iterations_per_loop - 1) //
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(
-                training.LoggingTensorHook({
-                    'loss': array_ops.identity(loss),
-                    'step': global_step,
-                },
-                                           every_n_iter=logging_hook_frequency))
-            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(examples_hook)
-
-          if training_hooks:
-            hooks.extend(training_hooks)
-
-          chief_hooks = []
-          if (self._config.save_checkpoints_secs or
-              self._config.save_checkpoints_steps):
-            checkpoint_hook = training.CheckpointSaverHook(
-                self.model_dir,
-                save_secs=self._config.save_checkpoints_secs,
-                save_steps=self._config.save_checkpoints_steps,
-                scaffold=scaffold)
-            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            chief_hooks.append(checkpoint_hook)
-
-          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
-          with ops.control_dependencies([loss]):
-            update_ops = _sync_variables_ops(ctx)
-            if ctx.embedding_config:
-              update_ops.extend(embedding_variables_and_ops.retrieve_ops())
-
-          # Validate the TPU training graph to catch basic errors
-          _validate_tpu_training_graph()
-
-          train_op = control_flow_ops.group(*update_ops)
-          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=loss,
-              training_chief_hooks=chief_hooks,
-              training_hooks=hooks,
-              train_op=train_op,
-              scaffold=scaffold)
-
-        if mode == model_fn_lib.ModeKeys.EVAL:
-          compile_op, total_loss, host_calls, scaffold_fn, eval_hooks = (
-              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          if ctx.embedding_config:
-            g = ops.get_default_graph()
-            table_to_config_dict = (
-                ctx.embedding_config.tpu_embedding.table_to_config_dict)
-            embedding_variable_name_by_table, _ = (
-                _tpu_estimator_embedding.get_full_variable_names(
-                    g, table_to_config_dict)
-            )
-            embedding_variables_and_ops = (
-                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
-                    embedding_variable_name_by_table
-                ))
-            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
-          # scaffold_fn must be called after variables for TPU embedding has
-          # been created on CPU, as user might reinitialize those from some
-          # checkpoint within scaffold_fn.
-          scaffold = _get_scaffold(scaffold_fn)
-          iterations_per_loop_var = _create_or_get_iterations_per_loop()
-          mean_loss = math_ops.div(
-              total_loss,
-              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
-
-          with ops.control_dependencies([mean_loss]):
-            # After TPU evaluation computation is done (the mean_loss tensor),
-            # reads all variables back from TPU and updates the eval step
-            # counter properly
-            internal_ops_to_run = _sync_variables_ops(ctx)
-            internal_ops_to_run.append(
-                _increase_eval_step_op(iterations_per_loop_var))
-
-          host_call_ret = host_calls.create_tpu_hostcall()
-          eval_metric_ops = {}
-          eval_update_ops = []
-
-          eval_metrics = host_call_ret.get('eval_metrics', {})
-          if eval_metrics:
-            # Creates a dummy metric update_op for all metrics. Estimator
-            # expects all metrics in `eval_metric_ops` have update_op and calls
-            # them one by one. The real metric update_ops are invoked in a
-            # separated thread. So, here give Estimator the dummy op for all
-            # metrics.
-            with ops.control_dependencies(internal_ops_to_run):
-              dummy_update_op = control_flow_ops.no_op()
-
-            for k, v in eval_metrics.items():
-              eval_metric_ops[k] = (v[0], dummy_update_op)
-              eval_update_ops.append(v[1])
-          else:
-            # If no eval metrics are passed, create an identity node for the
-            # loss and add `internal_ops_to_run` to its dependencies. So
-            # `internal_ops_to_run` can be executed.
-            with ops.control_dependencies(internal_ops_to_run):
-              mean_loss = array_ops.identity(mean_loss)
-
-          if 'host_call' not in host_call_ret:
-            host_ops = []
-          else:
-            host_ops = host_call_ret['host_call']
-          hooks = [
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  eval_update_ops + host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.evaluation_master,
-                  session_config=self._session_config,
-                  tpu_init_ops=tpu_init_ops)
-          ] + input_hooks
-
-          if tpu_cluster_resolver.is_running_in_gce():
-            hooks.extend(
-                [preempted_hook.CloudTPUPreemptedHook(self._config.cluster)])
-
-          if eval_hooks:
-            hooks.extend(eval_hooks)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=mean_loss,
-              evaluation_hooks=hooks,
-              eval_metric_ops=eval_metric_ops,
-              scaffold=scaffold)
-
-        # Predict
-        assert mode == model_fn_lib.ModeKeys.PREDICT
-
-        (compile_op, dummy_predict_op, host_calls,
-         scaffold_fn, prediction_hooks) = _predict_on_tpu_system(
-             ctx, model_fn_wrapper, dequeue_fn)
-        scaffold = _get_scaffold(scaffold_fn)
-        with ops.control_dependencies([dummy_predict_op]):
-          internal_ops_to_run = _sync_variables_ops(ctx)
-          with ops.control_dependencies(internal_ops_to_run):
-            dummy_predict_op = control_flow_ops.no_op()
-
-        # In train and evaluation, the main TPU program is passed to monitored
-        # training session to run. Infeed enqueue and outfeed dequeue are
-        # executed in side threads. This is not the configuration for
-        # prediction mode.
-        #
-        # For prediction, the Estimator executes the EstimatorSpec.predictions
-        # directly and yield the element (via generator) to call site. So, the
-        # outfeed based prediction must be passed to MonitoredSession directly.
-        # Other parts of the TPU execution are organized as follows.
-        #
-        # 1. All outfeed based Tensors must be grouped with predictions Tensors
-        #    to form a single invocation. This avoid the issue we might trigger
-        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
-        #    placed in control_dependencies of `stopping_signals`, and
-        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
-        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
-        #    all SessionRunArgs with the fetch in session.run together.
-        #
-        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
-        #    are grouped together. They will be launched once and only once in
-        #    side threads and they quit naturally according to the SAME stopping
-        #    condition.
-        enqueue_ops.append(dummy_predict_op)
-
-        host_call_ret = host_calls.create_tpu_hostcall()
-        if 'host_call' not in host_call_ret:
-          host_ops = []
-        else:
-          host_ops = host_call_ret['host_call']
-
-        predictions = host_call_ret['predictions']
-        _verify_cross_hosts_transfer_size(
-            predictions,
-            message=(
-                'The estimated size for TPUEstimatorSpec.predictions is too '
-                'large.'))
-        signals = host_call_ret['signals']
-
-        with ops.control_dependencies(host_ops):
-          host_ops = []  # Empty, we do do not need it anymore.
-          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              signals)
-          predictions = _PaddingSignals.slice_tensor_or_dict(
-              predictions, signals)
-
-        hooks = [
-            _StoppingPredictHook(scalar_stopping_signal),
-            TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
-                tpu_compile_op=compile_op,
-                master=self._config.master,
-                session_config=self._session_config),
-        ] + input_hooks
-
-        if prediction_hooks:
-          hooks.extend(prediction_hooks)
-
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            prediction_hooks=hooks,
-            predictions=predictions,
-            scaffold=scaffold)
-
-    return _model_fn
-
-
-def _export_output_to_tensors(export_output):
-  """Get a list of `Tensors` used in `export_output`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-
-  Returns:
-    a list of tensors used in export_output.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    return [export_output.scores, export_output.classes]
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    return [export_output.value]
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return list(export_output.outputs.values())
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _clone_export_output_with_tensors(export_output, tensors):
-  """Clones `export_output` but with new `tensors`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-    tensors: a list of `Tensors` used to construct a new `export_output`.
-
-  Returns:
-    A dict similar to `export_output` but with `tensors`.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    if len(tensors) != 2:
-      raise ValueError('tensors must be of length 2; '
-                       'got {}.'.format(len(tensors)))
-    return export_output_lib.ClassificationOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    if len(tensors) != 1:
-      raise ValueError('tensors must be of length 1; '
-                       'got {}'.format(len(tensors)))
-    return export_output_lib.RegressionOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output_lib.PredictOutput(
-        dict(zip(export_output.outputs.keys(), tensors)))
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
-
-  @tpu_function.on_device_training_loop
-  def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
-                                [_ZERO_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_eval_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  return (compile_op, loss, host_calls, captured_scaffold_fn,
-          captured_eval_hooks.get())
-
-
-def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_train_step, host_call, captured_scaffold_fn,
-   captured_training_hooks) = (
-       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
-
-  @tpu_function.on_device_training_loop
-  def multi_tpu_train_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
-                                [_INITIAL_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_train_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  return (compile_op, loss, host_call, captured_scaffold_fn,
-          captured_training_hooks.get())
-
-
-def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
-   captured_predict_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
-
-  @tpu_function.on_device_training_loop
-  def multi_tpu_predict_steps_on_single_shard():
-
-    def cond(scalar_stopping_signal):
-      return math_ops.logical_not(
-          _StopSignals.should_stop(scalar_stopping_signal))
-
-    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
-    outputs = training_loop.while_loop(
-        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
-    return outputs
-
-  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
-      multi_tpu_predict_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  dummy_predict_op = dummy_predict_op[0]
-  return (compile_op, dummy_predict_op, host_calls, captured_scaffold_fn,
-          captured_predict_hooks.get())
-
-
-def _wrap_computation_in_while_loop(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def computation(i):
-    with ops.control_dependencies(op_fn()):
-      return i + 1
-
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    iterations = array_ops.identity(iterations_per_loop_var)
-    return control_flow_ops.while_loop(
-        lambda i: i < iterations,
-        computation, [constant_op.constant(0)],
-        parallel_iterations=1)
-
-
-def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def cond(scalar_stopping_signal):
-    return math_ops.logical_not(
-        _StopSignals.should_stop(scalar_stopping_signal))
-
-  def computation(unused_scalar_stopping_signal):
-    return_value = op_fn()
-    execute_ops = return_value['ops']
-    signals = return_value['signals']
-    with ops.control_dependencies(execute_ops):
-      return _StopSignals.as_scalar_stopping_signal(signals)
-
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    return control_flow_ops.while_loop(
-        cond,
-        computation, [_StopSignals.NON_STOPPING_SIGNAL],
-        parallel_iterations=1)
-
-
-def _validate_tpu_training_graph():
-  """Validate graph before running distributed training.
-
-  Raises:
-    ValueError: If the graph seems invalid for running on device
-  """
-  operations = ops.get_default_graph().get_operations()
-
-  # Check if there is atleast one CrossReplicaSum operation in the graph
-  # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [
-      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
-  ]
-  if not cross_replica_sum_ops:
-    raise ValueError(
-        'CrossShardOptimizer must be used for model training on TPUs.')
-
-
-class _CapturedObject(object):
-  """A placeholder to capture an object.
-
-  This is useful when we need to capture a Python object in the Tensorflow
-  control flow body function and use it outside the control flow.
-  """
-
-  def __init__(self):
-    self._object = None
-    self._captured = False
-
-  def capture(self, o):
-    if self._captured:
-      raise RuntimeError(
-          'InternalError: Object can capture only once. Please file bug.')
-
-    self._captured = True
-    self._object = o
-
-  def get(self):
-    if not self._captured:
-      raise RuntimeError(
-          'InternalError: Object is not captured properly before `get`. '
-          'Please file bug.')
-    return self._object
-
-
-def _get_scaffold(captured_scaffold_fn):
-  """Retrieves the Scaffold from `captured_scaffold_fn`."""
-  with _CapturingContext(message='Inside scaffold_fn'):
-    scaffold_fn = captured_scaffold_fn.get()
-    if scaffold_fn:
-      scaffold = scaffold_fn()
-      if scaffold is None:
-        raise ValueError(
-            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
-    else:
-      scaffold = None
-
-  if scaffold:
-    wrapped_finalize = scaffold.finalize
-
-    def _finalize():
-      with _CapturingContext('Inside Scaffold.finalize'):
-        wrapped_finalize()
-
-    scaffold.finalize = _finalize
-  return scaffold
-
-
-class _CapturingContext(control_flow_ops.ControlFlowContext):
-  """Tracks references to Tensors defined in TPU replication."""
-
-  def __init__(self, message):
-    control_flow_ops.ControlFlowContext.__init__(self)
-    self._message = message
-
-  def to_control_flow_context_def(self, context_def, export_scope=None):
-    # pylint: disable=useless-super-delegation
-    # NOTE(slebedev): the method is required by `ControlFlowContext`.
-    super(_CapturingContext, self).to_control_flow_context_def(
-        context_def, export_scope)
-
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    for c in op.inputs:
-      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError('{}: Op {} depends on TPU computation {}, '
-                         'which is not allowed.'.format(self._message, op, c))
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._old = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    # pylint: enable=protected-access
-
-  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
-    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
-
-
-class _Inputs(object):
-  """A data structure representing the input_fn returned values.
-
-  This also supports the returned value from input_fn as `Dataset`.
-  """
-
-  def __init__(self, features=None, labels=None, dataset=None, signals=None):
-    if dataset is not None and (features is not None or labels is not None or
-                                signals is not None):
-      raise RuntimeError('Internal Error: Either (features and labels) or '
-                         'dataset should be provided, not both. Please file '
-                         'bug')
-
-    self._features = features
-    self._labels = labels
-    self._signals = signals
-
-    self._dataset = dataset
-    self._iterator = None
-
-  @staticmethod
-  def from_input_fn(return_values):
-    """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.DatasetV2):
-      dataset = return_values
-      return _Inputs(dataset=dataset)
-
-    features, labels = _Inputs._parse_inputs(return_values)
-    return _Inputs(features, labels)
-
-  @staticmethod
-  def _parse_inputs(return_values):
-    if isinstance(return_values, tuple):
-      features, labels = return_values
-    else:
-      features, labels = return_values, None
-    return features, labels
-
-  @property
-  def is_dataset(self):
-    """Returns True if the return value from input_fn is Dataset."""
-    return self._dataset is not None
-
-  def dataset_initializer(self):
-    """Returns the dataset's initializer.
-
-    The initializer must be run before calling `features_and_labels`.
-    """
-    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return self._iterator.initializer
-
-  def features_and_labels(self):
-    """Gets `features` and `labels`."""
-    if self.is_dataset:
-      if self._iterator is None:
-        raise RuntimeError('Internal error: Must run dataset_initializer '
-                           'before calling features_and_labels(). Please file '
-                           'a bug!')
-      return _Inputs._parse_inputs(self._iterator.get_next())
-
-    return (self._features, self._labels)
-
-  def signals(self):
-    return self._signals
-
-  @property
-  def dataset(self):
-    return self._dataset
-
-
-class _InputsWithStoppingSignals(_Inputs):
-  """Inputs with `_StopSignals` inserted into the dataset."""
-
-  def __init__(self,
-               dataset,
-               batch_size,
-               add_padding=False,
-               num_invocations_per_step=1):
-
-    assert dataset is not None
-    user_provided_dataset = dataset.map(
-        _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size, add_padding=add_padding))
-    if num_invocations_per_step == 1:
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-    else:
-      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
-      # user_provided_dataset and stop properly.
-      # For example, if num_invocations_per_step is 2, we append 3 additional
-      # padding batches: b1, b2, b3.
-      # If user_provided_dataset contains two batches: a1, a2
-      # Step 1: [a1, a2]
-      # Step 2: [b1, b2] -> STOP
-      # If user_provided_dataset contains three batches: a1, a2, a3.
-      # The training loops:
-      # Step 1: [a1, a2]
-      # Step 2: [a3, b1]
-      # Step 3: [b2, b3] -> STOP.
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-      final_batch_dataset = final_batch_dataset.repeat(
-          2 * num_invocations_per_step - 1)
-
-      def _set_mask(data_dict):
-        signals = data_dict['signals']
-        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
-        data_dict['signals'] = signals
-        return data_dict
-
-      # Mask out the extra batch.
-      final_batch_dataset = final_batch_dataset.map(_set_mask)
-
-    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
-
-    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
-    self._current_inputs = None
-
-  def features_and_labels(self):
-    if self._current_inputs is not None:
-      raise RuntimeError(
-          'Internal Error: The previous inputs have not been properly '
-          'consumed. First call features_and_labels, then call signals.')
-
-    inputs_with_signals = self._iterator.get_next()
-    features = inputs_with_signals['features']
-    labels = inputs_with_signals.get('labels')
-
-    self._current_inputs = inputs_with_signals
-    return features, labels
-
-  def signals(self):
-    """Returns the `Signals` from `_Inputs`."""
-    if self._current_inputs is None:
-      raise RuntimeError(
-          'Internal Error: The current inputs have not been properly '
-          'generated. First call features_and_labels, then call signals.')
-    signals = self._current_inputs['signals']
-    self._current_inputs = None
-    return signals
-
-  @staticmethod
-  def insert_stopping_signal(stop, batch_size, add_padding=False):
-    """Inserts stopping_signal into dataset via _map_fn.
-
-    Here we change the data structure in the dataset, such that the return value
-    is a dictionary now and `features`, `labels`, and `signals` are three
-    distinguished keys in that dict. This provides a better structure, which
-    eases the process to decompose the inputs (see `features_and_labels`).
-
-    Args:
-      stop: bool, state of current stopping signals.
-      batch_size: int, batch size.
-      add_padding: bool, whether to pad the tensor to full batch size.
-
-    Returns:
-      A map_fn passed to dataset.map API.
-    """
-
-    def _map_fn(*args):
-      """The map fn to insert signals."""
-      if len(args) == 1:
-        # Unpack the single Tensor/dict argument as features. This is required
-        # for the input_fn returns no labels.
-        args = args[0]
-      features, labels = _Inputs._parse_inputs(args)
-      new_input_dict = {}
-
-      if add_padding:
-        padding_mask, features, labels = (
-            _PaddingSignals.pad_features_and_labels(features, labels,
-                                                    batch_size))
-
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-
-      else:
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-        padding_mask = None
-
-      new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size,
-          padding_mask=padding_mask).as_dict()
-
-      return new_input_dict
-
-    return _map_fn
-
-
-class _StopSignals(object):
-  """Signals class holding all logic to handle TPU stopping condition."""
-
-  NON_STOPPING_SIGNAL = False
-  STOPPING_SIGNAL = True
-
-  def __init__(self, stop, batch_size, padding_mask=None):
-    self._stop = stop
-    self._batch_size = batch_size
-    self._padding_mask = padding_mask
-
-  def as_dict(self):
-    """Returns the signals as Python dict."""
-    shape = [self._batch_size, 1]
-    dtype = dtypes.bool
-
-    if self._stop:
-      stopping = array_ops.ones(shape=shape, dtype=dtype)
-    else:
-      stopping = array_ops.zeros(shape=shape, dtype=dtype)
-
-    signals = {'stopping': stopping}
-    if self._padding_mask is not None:
-      signals['padding_mask'] = self._padding_mask
-    return signals
-
-  @staticmethod
-  def as_scalar_stopping_signal(signals):
-    return array_ops.identity(signals['stopping'][0][0])
-
-  @staticmethod
-  def should_stop(scalar_stopping_signal):
-    """Detects whether scalar_stopping_signal indicates stopping."""
-    if isinstance(scalar_stopping_signal, ops.Tensor):
-      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
-      # way to express the bool check whether scalar_stopping_signal is True.
-      return math_ops.logical_and(scalar_stopping_signal,
-                                  _StopSignals.STOPPING_SIGNAL)
-    else:
-      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
-      # the graph anymore. Here, we use pure Python.
-      return bool(scalar_stopping_signal)
-
-
-class _PaddingSignals(object):
-  """Signals class holding all logic to handle padding."""
-
-  @staticmethod
-  def pad_features_and_labels(features, labels, batch_size):
-    """Pads out the batch dimension of features and labels."""
-    real_batch_size = array_ops.shape(
-        _PaddingSignals._find_any_tensor(features))[0]
-
-    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
-
-    check_greater = check_ops.assert_greater_equal(
-        batch_size_tensor,
-        real_batch_size,
-        data=(batch_size_tensor, real_batch_size),
-        message='The real batch size should not be greater than batch_size.')
-
-    with ops.control_dependencies([check_greater]):
-      missing_count = batch_size_tensor - real_batch_size
-
-    def pad_single_tensor(tensor):
-      """Pads out the batch dimension of a tensor to the complete batch_size."""
-      rank = len(tensor.shape)
-      assert rank > 0
-      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
-      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
-      padded_tensor = array_ops.pad(tensor, padding)
-      padded_tensor.set_shape(padded_shape)
-      return padded_tensor
-
-    def nest_pad(tensor_or_dict):
-      return nest.map_structure(pad_single_tensor, tensor_or_dict)
-
-    features = nest_pad(features)
-    if labels is not None:
-      labels = nest_pad(labels)
-
-    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
-                                                 batch_size)
-
-    return padding_mask, features, labels
-
-  @staticmethod
-  def slice_tensor_or_dict(tensor_or_dict, signals):
-    """Slice the real Tensors according to padding mask in signals."""
-
-    padding_mask = signals['padding_mask']
-    batch_size = array_ops.shape(padding_mask)[0]
-
-    def verify_batch_size(tensor):
-      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
-      with ops.control_dependencies([check_batch_size]):
-        return array_ops.identity(tensor)
-
-    def slice_single_tensor(tensor):
-      rank = len(tensor.shape)
-      assert rank > 0
-      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
-      return verify_batch_size(tensor)[0:real_batch_size]
-
-    # As we split the Tensors to all TPU cores and concat them back, it is
-    # important to ensure the real data is placed before padded ones, i.e.,
-    # order is preserved. By that, the sliced padding mask should have all 0's.
-    # If this assertion failed, # the slice logic here would not hold.
-    sliced_padding_mask = slice_single_tensor(padding_mask)
-    assert_padding_mask = math_ops.equal(
-        math_ops.reduce_sum(sliced_padding_mask), 0)
-
-    with ops.control_dependencies([assert_padding_mask]):
-      should_stop = _StopSignals.should_stop(
-          _StopSignals.as_scalar_stopping_signal(signals))
-
-    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
-
-    def slice_fn(tensor):
-      # If the current batch is full batch or part of stopping signals, we do
-      # not need to slice to save performance.
-      return control_flow_ops.cond(
-          math_ops.logical_or(should_stop, is_full_batch),
-          (lambda: verify_batch_size(tensor)),
-          (lambda: slice_single_tensor(tensor)))
-
-    return nest.map_structure(slice_fn, tensor_or_dict)
-
-  @staticmethod
-  def _find_any_tensor(batch_features):
-    tensors = [
-        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
-    ]
-    if not tensors:
-      raise ValueError('Cannot find any Tensor in features dict.')
-    return tensors[0]
-
-  @staticmethod
-  def _padding_mask(real_batch_size, missing_count, batch_size):
-    padding_mask = array_ops.concat([
-        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
-        array_ops.ones((missing_count,), dtype=dtypes.int32)
-    ],
-                                    axis=0)
-    padding_mask.set_shape((batch_size,))
-    return padding_mask
-
-
-def _verify_cross_hosts_transfer_size(tensor_dict, message):
-  total_size = 0
-  tensor_structure = {}
-  for key, tensor in tensor_dict.items():
-    shape = tensor.shape
-    size = np.product(shape) * tensor.dtype.size
-    tensor_structure[key] = shape
-    total_size += size
-  if total_size >= _ONE_GIGABYTE:
-    raise ValueError(
-        '{} The transfer size is larger than the protobuf limit. Please '
-        'consider to use Tensors with smaller shapes or reduce batch '
-        'size. Given:\n'
-        '{}'.format(
-            message, '\n'.join([
-                ' -- Key: {}, Shape: {}'.format(k, v)
-                for k, v in tensor_structure.items()
-            ])))
-
-
-def _add_item_to_params(params, key, value):
-  """Adds a new item into `params`."""
-  if hasattr(params, 'set_hparam'):
-    # For HParams, we need to use special API.
-    if key in params:
-      params.set_hparam(key, value)
-    else:
-      params.add_hparam(key, value)
-  else:
-    # Now params is Python dict.
-    params[key] = value
-
-
-def export_estimator_savedmodel(estimator,
-                                export_dir_base,
-                                serving_input_receiver_fn,
-                                assets_extra=None,
-                                as_text=False,
-                                checkpoint_path=None,
-                                strip_default_attrs=False):
-  """Export `Estimator` trained model for TPU inference.
-
-  Args:
-    estimator: `Estimator` with which model has been trained.
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    serving_input_receiver_fn: A function that takes no argument and returns a
-      `ServingInputReceiver` or `TensorServingInputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs.
-
-  Returns:
-    The string path to the exported directory.
-  """
-  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
-  # `estimator.config`.
-  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
-  est = TPUEstimator(
-      estimator._model_fn,  # pylint: disable=protected-access
-      config=config,
-      params=estimator.params,
-      use_tpu=True,
-      train_batch_size=2048,  # Does not matter.
-      eval_batch_size=2048,  # Does not matter.
-  )
-  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                               assets_extra, as_text, checkpoint_path,
-                               strip_default_attrs)
-
-
-def model_fn_inference_on_tpu(model_fn,
-                              features,
-                              labels=None,
-                              config=None,
-                              params=None,
-                              batch_config=None,
-                              experimental_export_device_assignment=False,
-                              call_context=None):
-  """Convenience wrapper for export_saved_model API v2 for a model_fn.
-
-  It attempts to execute the entire model function on the TPU for prediction.
-  Note that this does not support features which are SparseTensors. If you have
-  SparseTensor features, consider partitioning your model function further and
-  use inference_on_tpu.
-
-  Args:
-    model_fn: the model_fn for which we want to inference on TPU.
-    features: a tensor or dict of tensors, serves as the feature inputs to the
-      model.
-    labels: a tensor or dict of tensors, serves as the labels inputs to the
-      model.
-    config: auxiliary config to the Estimator.
-    params: hparams that we want to pass to the model_fn.
-    batch_config: a named tuple to wrap the inference batching configuration
-      inputs.
-    experimental_export_device_assignment: Whether to include the device
-      assignment in the exported model. Doing so is useful in case of model
-      parallel inference but will tie the exported model to the TPU topology
-      used to export the model.
-    call_context: an optional TPUContext under which the TPU run configuartion
-      is stored.
-
-  Returns:
-    An EstimatorSpec containing the outputs in export_outputs and predictions.
-  """
-  computation, capture = _build_computation_for_inference(
-      model_fn, labels, config, params, experimental_export_device_assignment,
-      call_context)
-  tensors = call_computation(
-      features,
-      computation,
-      experimental_export_device_assignment=
-      experimental_export_device_assignment,
-      batch_config=batch_config)
-  estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
-      capture.get())
-  predictions_list = tensors[:len(predictions_dict)]
-  export_outputs_list_without_none = tensors[len(predictions_dict):]
-
-  # Reinsert `None`s which we've taken out in
-  # `_build_computation_for_inference()`.
-  export_outputs_list = []
-  while none_indices or export_outputs_list_without_none:
-    if none_indices and none_indices[0] == len(export_outputs_list):
-      export_outputs_list.append(None)
-      none_indices.pop(0)
-    else:
-      export_outputs_list.append(export_outputs_list_without_none.pop(0))
-
-  # Reconstruct `export_outputs` with updated tensors.
-  new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
-                                                  export_outputs_list)
-  export_outputs = estimator_spec.export_outputs
-  new_export_outputs = collections.OrderedDict(
-      (k, _clone_export_output_with_tensors(export_outputs[k], v))
-      for k, v in six.iteritems(new_export_outputs_dict))
-  # Reconstruct `predictions` with updated tensors.
-  new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
-  if (len(new_predictions) == 1 and
-      _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
-    new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
-
-  return estimator_spec._replace(
-      export_outputs=new_export_outputs, predictions=new_predictions)
-
-
-def _build_computation_for_inference(model_fn,
-                                     labels,
-                                     config,
-                                     params,
-                                     experimental_export_device_assignment,
-                                     call_context=None):
-  """Builds the computation with calls the model_fn for inference."""
-  capture = _CapturedObject()
-
-  def computation(computation_input):
-    """Computation to be passed to `TPUPartitionedCall()`."""
-    tpu_computation, tpu_capture = _build_tpu_computation_for_inference(
-        model_fn, computation_input, labels, config, params)
-
-    if experimental_export_device_assignment and call_context:
-      # Export the device assignment as part of the model. This is useful for
-      # model parallel usecases where the model relies on the mapping between
-      # logical and physical devices.
-      with call_context.with_mode(_INFERENCE_ON_TPU_MODE) as ctx:
-        device_assignment = ctx.device_assignment
-    else:
-      device_assignment = None
-
-    if experimental_export_device_assignment:
-      tensors_on_cpu = tpu.rewrite_for_inference(
-          tpu_computation, device_assignment=device_assignment)
-    else:
-      tensors_on_cpu = tpu.rewrite(
-          tpu_computation, device_assignment=device_assignment)
-      tpu.prune_unconnected_ops_from_xla(ops.get_default_graph())
-
-    (estimator_spec, export_outputs_dict, export_outputs_list,
-     predictions_dict) = (
-         tpu_capture.get())
-    predictions_list = tensors_on_cpu[:len(predictions_dict)]
-    export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
-
-    # Reconstruct tensors used in export_outputs, with TPU tensors replaced
-    # with their CPU counterpart returned from `rewrite_for_inference()`.
-    # `function.Defun()` does not like `None`s in return values, so we leave
-    # `None`s out but record their positions for later reconstruction.
-    export_outputs_list_without_none = []
-    none_indices = []
-    for i, t in enumerate(export_outputs_list):
-      if t is None:
-        none_indices.append(i)
-      else:
-        export_outputs_list_without_none.append(
-            export_outputs_tpu_on_cpu_list.pop(0))
-
-    capture.capture(
-        (estimator_spec, export_outputs_dict, predictions_dict, none_indices))
-    return predictions_list + export_outputs_list_without_none
-
-  return computation, capture
-
-
-def _build_tpu_computation_for_inference(model_fn, features, labels, config,
-                                         params):
-  """Builds the TPU computation for inference on TPU."""
-  capture = _CapturedObject()
-
-  def computation():
-    """Compute tpu tensors used in export_outputs.
-
-    Passed to rewrite_for_inference so that model_fn will be called under
-    the rewriting contexts. Only tpu tensors are returned, but export_outputs
-    and scaffold are captured.
-
-    Returns:
-       A list of Tensors used in export_outputs and not marked for
-       outside_compilation.
-    """
-    # We should only call model fn once and it should be inside `computation`
-    # so that building the graph will happen under `rewrite_for_inference`.
-
-    model_fn_args = function_utils.fn_args(model_fn)
-    kwargs = {}
-    # Makes deep copy with `config` and params` in case user mutates them.
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = model_fn_lib.ModeKeys.PREDICT
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-    if 'params' in model_fn_args:
-      kwargs['params'] = params
-    estimator_spec = model_fn(features, **kwargs)
-
-    # We pick the TPU tensors out from `export_output` and later return them
-    # from `computation` for rewriting.
-    export_outputs_dict = collections.OrderedDict(
-        (k, _export_output_to_tensors(v))
-        for k, v in six.iteritems(estimator_spec.export_outputs))
-    export_outputs_list = nest.flatten(export_outputs_dict)
-    export_outputs_tpu_list = [t for t in export_outputs_list if t is not None]
-
-    if isinstance(estimator_spec.predictions, dict):
-      predictions_dict = collections.OrderedDict(
-          (k, v) for k, v in six.iteritems(estimator_spec.predictions))
-    else:
-      predictions_dict = {
-          _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
-      }
-    predictions_list = nest.flatten(predictions_dict)
-
-    # We cannot return everything we want through the return values, so
-    # capture the rest here for later use.
-    capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
-                     predictions_dict))
-    return predictions_list + export_outputs_tpu_list
-
-  return computation, capture
-
-
-def inference_on_tpu(computation,
-                     inputs_to_tpu,
-                     num_batch_threads,
-                     max_batch_size,
-                     batch_timeout_micros,
-                     allowed_batch_sizes=None,
-                     max_enqueued_batches=10):
-  """Convenient wrapper for export_saved_model API v2 to wrap TPU computation.
-
-  It puts computation on TPU, add batching around it and round robin computation
-  between TPU cores.
-
-  See tpu_estimator_test.py for an example.
-
-  Args:
-    computation: computation to be put on TPU, which takes inputs_to_tpu as
-      arguments.
-    inputs_to_tpu: a list of tensors as input to computation.
-    num_batch_threads: Number of scheduling threads for processing batches of
-      work. Determines the number of batches processed in parallel.
-    max_batch_size: Batch sizes will never be bigger than this.
-    batch_timeout_micros: Maximum number of microseconds to wait before
-      outputting an incomplete batch.
-    allowed_batch_sizes: Optional list of allowed batch sizes. If left empty,
-      does nothing. Otherwise, supplies a list of batch sizes, causing the op to
-      pad batches up to one of those sizes. The entries must increase
-      monotonically, and the final entry must equal max_batch_size.
-    max_enqueued_batches: The maximum depth of the batch queue. Defaults to 10.
-
-  Returns:
-    The unbatched computation output Tensors.
-  """
-
-  @batch_ops.batch_function(num_batch_threads, max_batch_size,
-                            batch_timeout_micros, allowed_batch_sizes,
-                            max_enqueued_batches)
-  def batched_tpu_computation(*args):
-
-    @function.Defun(capture_resource_var_by_value=False)
-    def tpu_computation():
-      return tpu.rewrite(computation, args)
-
-    return tpu_functional.TPUPartitionedCall(
-        args=tpu_computation.captured_inputs,
-        device_ordinal=tpu_ops.tpu_ordinal_selector(),
-        Tout=[o.type for o in tpu_computation.definition.signature.output_arg],
-        f=tpu_computation)
-
-  return batched_tpu_computation(*inputs_to_tpu)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import *
+# used by tests
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _clone_export_output_with_tensors
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _create_global_step
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _export_output_to_tensors
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _get_scaffold
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _Inputs
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ESTIMATOR
+from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_TRAIN_OP
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/python/tpu/tpu_estimator_signals_test.py b/tensorflow/python/tpu/tpu_estimator_signals_test.py
deleted file mode 100644
index ca3eeaa..0000000
--- a/tensorflow/python/tpu/tpu_estimator_signals_test.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TPU Estimator Signalling Tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.tpu import tpu_estimator
-
-
-def make_input_fn(num_samples):
-  a = np.linspace(0, 100.0, num=num_samples)
-  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
-
-  def input_fn(params):
-    batch_size = params['batch_size']
-    da1 = dataset_ops.Dataset.from_tensor_slices(a)
-    da2 = dataset_ops.Dataset.from_tensor_slices(b)
-
-    dataset = dataset_ops.Dataset.zip((da1, da2))
-    dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
-    dataset = dataset.batch(batch_size)
-    return dataset
-  return input_fn, (a, b)
-
-
-def make_input_fn_with_labels(num_samples):
-  a = np.linspace(0, 100.0, num=num_samples)
-  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
-
-  def input_fn(params):
-    batch_size = params['batch_size']
-    da1 = dataset_ops.Dataset.from_tensor_slices(a)
-    da2 = dataset_ops.Dataset.from_tensor_slices(b)
-
-    dataset = dataset_ops.Dataset.zip((da1, da2))
-    dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
-    dataset = dataset.batch(batch_size)
-    return dataset
-  return input_fn, (a, b)
-
-
-class TPUEstimatorStoppingSignalsTest(test.TestCase):
-
-  def test_normal_output_without_signals(self):
-    num_samples = 4
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      features = dataset_ops.make_one_shot_iterator(dataset).get_next()
-
-      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
-      self.assertIsNone(features['a'].shape.as_list()[0])
-
-      with session.Session() as sess:
-        result = sess.run(features)
-        self.assertAllEqual(a[:batch_size], result['a'])
-        self.assertAllEqual(b[:batch_size], result['b'])
-
-        # This run should work as num_samples / batch_size = 2.
-        result = sess.run(features)
-        self.assertAllEqual(a[batch_size:num_samples], result['a'])
-        self.assertAllEqual(b[batch_size:num_samples], result['b'])
-
-        with self.assertRaises(errors.OutOfRangeError):
-          # Given num_samples and batch_size, this run should fail.
-          sess.run(features)
-
-  def test_output_with_stopping_signals(self):
-    num_samples = 4
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size)
-      dataset_initializer = inputs.dataset_initializer()
-      features, _ = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
-      self.assertIsNone(features['a'].shape.as_list()[0])
-
-      with session.Session() as sess:
-        sess.run(dataset_initializer)
-
-        result, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual(a[:batch_size], result['a'])
-        self.assertAllEqual(b[:batch_size], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # This run should work as num_samples / batch_size = 2.
-        result, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual(a[batch_size:num_samples], result['a'])
-        self.assertAllEqual(b[batch_size:num_samples], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # This run should work, *but* see STOP ('1') as signals
-        _, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(features)
-
-
-class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
-
-  def test_num_samples_divisible_by_batch_size(self):
-    num_samples = 4
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
-                                                        add_padding=True)
-      dataset_initializer = inputs.dataset_initializer()
-      features, _ = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      # With padding, all shapes are static now.
-      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
-
-      with session.Session() as sess:
-        sess.run(dataset_initializer)
-
-        result, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual(a[:batch_size], result['a'])
-        self.assertAllEqual(b[:batch_size], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([0.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        # This run should work as num_samples / batch_size = 2.
-        result, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual(a[batch_size:num_samples], result['a'])
-        self.assertAllEqual(b[batch_size:num_samples], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([0.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        # This run should work, *but* see STOP ('1') as signals
-        _, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(features)
-
-  def test_num_samples_not_divisible_by_batch_size(self):
-    num_samples = 5
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn_with_labels(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
-                                                        add_padding=True)
-      dataset_initializer = inputs.dataset_initializer()
-      features, labels = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      # With padding, all shapes are static.
-      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
-
-      with session.Session() as sess:
-        sess.run(dataset_initializer)
-
-        evaluated_features, evaluated_labels, evaluated_signals = (
-            sess.run([features, labels, signals]))
-        self.assertAllEqual(a[:batch_size], evaluated_features['a'])
-        self.assertAllEqual(b[:batch_size], evaluated_labels)
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([0.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        # This run should work as num_samples / batch_size >= 2.
-        evaluated_features, evaluated_labels, evaluated_signals = (
-            sess.run([features, labels, signals]))
-        self.assertAllEqual(a[batch_size:2*batch_size], evaluated_features['a'])
-        self.assertAllEqual(b[batch_size:2*batch_size], evaluated_labels)
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([0.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        # This is the final partial batch.
-        evaluated_features, evaluated_labels, evaluated_signals = (
-            sess.run([features, labels, signals]))
-        real_batch_size = num_samples % batch_size
-
-        # Assert the real part.
-        self.assertAllEqual(a[2*batch_size:num_samples],
-                            evaluated_features['a'][:real_batch_size])
-        self.assertAllEqual(b[2*batch_size:num_samples],
-                            evaluated_labels[:real_batch_size])
-        # Assert the padded part.
-        self.assertAllEqual([0.0] * (batch_size - real_batch_size),
-                            evaluated_features['a'][real_batch_size:])
-        self.assertAllEqual([[0.0]] * (batch_size - real_batch_size),
-                            evaluated_labels[real_batch_size:])
-
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        padding = ([.0] * real_batch_size
-                   + [1.] * (batch_size - real_batch_size))
-        self.assertAllEqual(padding, evaluated_signals['padding_mask'])
-
-        # This run should work, *but* see STOP ('1') as signals
-        _, evaluated_signals = sess.run([features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(features)
-
-  def test_slice(self):
-    num_samples = 3
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
-                                                        add_padding=True)
-      dataset_initializer = inputs.dataset_initializer()
-      features, _ = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      sliced_features = (
-          tpu_estimator._PaddingSignals.slice_tensor_or_dict(
-              features, signals))
-
-      with session.Session() as sess:
-        sess.run(dataset_initializer)
-
-        result, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual(a[:batch_size], result['a'])
-        self.assertAllEqual(b[:batch_size], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # This is the final partial batch.
-        result, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertEqual(1, len(result['a']))
-        self.assertAllEqual(a[batch_size:num_samples], result['a'])
-        self.assertAllEqual(b[batch_size:num_samples], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # This run should work, *but* see STOP ('1') as signals
-        _, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(sliced_features)
-
-  def test_slice_with_multi_invocations_per_step(self):
-    num_samples = 3
-    batch_size = 2
-
-    params = {'batch_size': batch_size}
-    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
-
-    with ops.Graph().as_default():
-      dataset = input_fn(params)
-      inputs = tpu_estimator._InputsWithStoppingSignals(
-          dataset, batch_size, add_padding=True, num_invocations_per_step=2)
-      dataset_initializer = inputs.dataset_initializer()
-      features, _ = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      sliced_features = (
-          tpu_estimator._PaddingSignals.slice_tensor_or_dict(features, signals))
-
-      with session.Session() as sess:
-        sess.run(dataset_initializer)
-
-        result, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual(a[:batch_size], result['a'])
-        self.assertAllEqual(b[:batch_size], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # This is the final partial batch.
-        result, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertEqual(1, len(result['a']))
-        self.assertAllEqual(a[batch_size:num_samples], result['a'])
-        self.assertAllEqual(b[batch_size:num_samples], result['b'])
-        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
-
-        # We should see 3 continuous batches with STOP ('1') as signals and all
-        # of them have mask 1.
-        _, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([1.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        _, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([1.] * batch_size,
-                            evaluated_signals['padding_mask'])
-
-        _, evaluated_signals = sess.run([sliced_features, signals])
-        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
-        self.assertAllEqual([1.] * batch_size,
-                            evaluated_signals['padding_mask'])
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(sliced_features)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/tpu/tpu_feed.py b/tensorflow/python/tpu/tpu_feed.py
index 159131c..e073b99 100644
--- a/tensorflow/python/tpu/tpu_feed.py
+++ b/tensorflow/python/tpu/tpu_feed.py
@@ -78,7 +78,7 @@
                 x, num_or_size_splits=num_or_size_splits, axis=axis))
       output = new_output
     else:
-      output = [array_ops.split(x, dim, axis=axis) for x in output]
+      output = [array_ops.split(x, int(dim), axis=axis) for x in output]
     output = nest.flatten(output)
   return output
 
diff --git a/tensorflow/python/tpu/tpu_strategy_util.py b/tensorflow/python/tpu/tpu_strategy_util.py
index a832819..7e18c3b 100644
--- a/tensorflow/python/tpu/tpu_strategy_util.py
+++ b/tensorflow/python/tpu/tpu_strategy_util.py
@@ -20,14 +20,11 @@
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_lib
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.tpu import functional as tpu_functional_ops
 from tensorflow.python.tpu import topology
 from tensorflow.python.tpu import tpu
 from tensorflow.python.util import compat
@@ -68,30 +65,25 @@
     # DistributedTPURewritePass. This pass actually adds real ops that
     # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
     # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
-    # The easiest way to trigger a rewrite is to run the function with
-    # TPUPartitionedCallOp.
     @function.defun
     def _tpu_init_fn():
       return tpu.initialize_system()
 
-    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
-    # see above) but need to define it to get it added to eager context
-    # and get its assigned name.
-    # pylint: disable=protected-access
-    graph_func = _tpu_init_fn._get_concrete_function_internal()
-    func_name = compat.as_str(graph_func._inference_function.name)
-    # pylint: enable=protected-access
-
     tpu_devices = sorted(
         [x for x in context.list_devices() if "device:TPU:" in x])
 
     if not tpu_devices:
       raise RuntimeError("Could not find any TPU devices")
 
-    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
-      output = tpu_functional_ops.TPUPartitionedCall(
-          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
-    serialized_topology = output[0].numpy()
+    # Replace the remote TPU device with the remote TPU_SYSTEM device. With a
+    # remote TPU device, we would try to compile the function instead of
+    # running it through the optimization passes and the TF Executor, whereas
+    # TPU_SYSTEM should work.
+    tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM")
+
+    with ops.device(tpu_system_device):
+      output = _tpu_init_fn()
+    serialized_topology = output.numpy()
   else:
     master = cluster_resolver.master()
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
diff --git a/tensorflow/python/tpu/tpu_test.py b/tensorflow/python/tpu/tpu_test.py
index 4cf40a9..6b94fe6 100644
--- a/tensorflow/python/tpu/tpu_test.py
+++ b/tensorflow/python/tpu/tpu_test.py
@@ -30,6 +30,7 @@
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.tpu import tpu
@@ -83,7 +84,6 @@
     # This should not throw an error.
     tpu.rewrite(loop)
 
-
 class TPUGraphPruneTest(test.TestCase):
 
   def test_prune_unconnected_ops(self):
@@ -141,6 +141,32 @@
         graph.get_operation_by_name("import/y").get_attr(
             tpu._TPU_REPLICATE_ATTR)
 
+def do_einsum():
+  a = array_ops.placeholder(dtype=dtypes.float32, name="a", shape=[2, 3, 4])
+  b = array_ops.placeholder(dtype=dtypes.float32, name="b", shape=[2, 4, 5])
+  return special_math_ops.einsum("abc,acd->abd", a, b)
+
+
+def find_einsum(g):
+  graph_def = g.as_graph_def()
+  for node in graph_def.node:
+    if node.op == "XlaEinsum":
+      return True
+  return False
+
+
+class TPUXlaEinsumTest(test.TestCase):
+
+  def test_tpu_rewrite_uses_xla_einsum(self):
+    with ops.Graph().as_default() as g:
+      tpu.rewrite(do_einsum)
+      self.assertTrue(find_einsum(g))
+
+  def test_default_does_not_use_xla_einsum(self):
+    with ops.Graph().as_default() as g:
+      do_einsum()
+      self.assertFalse(find_einsum(g))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py
index dfb8ce1..6e0da24 100644
--- a/tensorflow/python/tpu/util.py
+++ b/tensorflow/python/tpu/util.py
@@ -1,51 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Utilities for the functionalities."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-import six
-
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import training
-
-def check_positive_integer(value, name):
-  """Checks whether `value` is a positive integer."""
-  if not isinstance(value, six.integer_types):
-    raise TypeError('{} must be int, got {}'.format(name, type(value)))
-
-  if value <= 0:
-    raise ValueError('{} must be positive, got {}'.format(name, value))
-
-
-# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
-# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
-# python/estimator/util.py.
-class MultiHostDatasetInitializerHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes all passed iterators."""
-
-  def __init__(self, dataset_initializers):
-    self._initializers = dataset_initializers
-
-  def after_create_session(self, session, coord):
-    del coord
-    start = time.time()
-    session.run(self._initializers)
-    logging.info('Initialized dataset iterators in %d seconds',
-                 time.time() - start)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow_estimator.python.estimator.tpu.util import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 68fcb97..c418323 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -22,8 +22,11 @@
 
 
 @tf_export(v1=["train.basic_train_loop"])
-def basic_train_loop(supervisor, train_step_fn, args=None,
-                     kwargs=None, master=""):
+def basic_train_loop(supervisor,
+                     train_step_fn,
+                     args=None,
+                     kwargs=None,
+                     master=""):
   """Basic loop to train a model.
 
   Calls `train_step_fn` in a loop to train a model.  The function is called as:
@@ -32,17 +35,18 @@
   train_step_fn(session, *args, **kwargs)
   ```
 
-  It is passed a `tf.Session` in addition to `args` and `kwargs`.  The function
+  It is passed a `tf.compat.v1.Session` in addition to `args` and `kwargs`.  The
+  function
   typically runs one training step in the session.
 
   Args:
-    supervisor: `tf.train.Supervisor` to run the training services.
-    train_step_fn: Callable to execute one training step.  Called
-      repeatedly as `train_step_fn(session, *args **kwargs)`.
+    supervisor: `tf.compat.v1.train.Supervisor` to run the training services.
+    train_step_fn: Callable to execute one training step.  Called repeatedly as
+      `train_step_fn(session, *args, **kwargs)`.
     args: Optional positional arguments passed to `train_step_fn`.
     kwargs: Optional keyword arguments passed to `train_step_fn`.
-    master: Master to use to create the training session.  Defaults to
-      `""` which causes the session to be created in the local process.
+    master: Master to use to create the training session.  Defaults to `""`
+      which causes the session to be created in the local process.
   """
   if args is None:
     args = []
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index a71947f..763b079 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -42,7 +42,6 @@
 from tensorflow.python.training.summary_io import SummaryWriterCache
 from tensorflow.python.util.tf_export import tf_export
 
-
 _HOOKS = "hooks"
 _STEPS_PER_RUN_VAR = "steps_per_run"
 
@@ -85,8 +84,7 @@
 
 @tf_export(v1=["train.SecondOrStepTimer"])
 class SecondOrStepTimer(_HookTimer):
-  """Timer that triggers at most once every N seconds or once every N steps.
-  """
+  """Timer that triggers at most once every N seconds or once every N steps."""
 
   def __init__(self, every_secs=None, every_steps=None):
     self.reset()
@@ -171,29 +169,33 @@
   seeing the logs, you might want to add the following line after your imports:
 
   ```python
-    tf.logging.set_verbosity(tf.logging.INFO)
+    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   ```
 
   Note that if `at_end` is True, `tensors` should not include any tensor
   whose evaluation produces a side effect such as consuming additional inputs.
   """
 
-  def __init__(self, tensors, every_n_iter=None, every_n_secs=None,
-               at_end=False, formatter=None):
+  def __init__(self,
+               tensors,
+               every_n_iter=None,
+               every_n_secs=None,
+               at_end=False,
+               formatter=None):
     """Initializes a `LoggingTensorHook`.
 
     Args:
-      tensors: `dict` that maps string-valued tags to tensors/tensor names,
-          or `iterable` of tensors/tensor names.
+      tensors: `dict` that maps string-valued tags to tensors/tensor names, or
+        `iterable` of tensors/tensor names.
       every_n_iter: `int`, print the values of `tensors` once every N local
-          steps taken on the current worker.
+        steps taken on the current worker.
       every_n_secs: `int` or `float`, print the values of `tensors` once every N
-          seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
-          provided.
+        seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
+        provided.
       at_end: `bool` specifying whether to print the values of `tensors` at the
-          end of the run.
+        end of the run.
       formatter: function, takes dict of `tag`->`Tensor` and returns a string.
-          If `None` uses default printing all tensors.
+        If `None` uses default printing all tensors.
 
     Raises:
       ValueError: if `every_n_iter` is non-positive.
@@ -215,16 +217,18 @@
     self._tensors = tensors
     self._formatter = formatter
     self._timer = (
-        NeverTriggerTimer() if only_log_at_end else
-        SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter))
+        NeverTriggerTimer() if only_log_at_end else SecondOrStepTimer(
+            every_secs=every_n_secs, every_steps=every_n_iter))
     self._log_at_end = at_end
 
   def begin(self):
     self._timer.reset()
     self._iter_count = 0
     # Convert names to tensors if given
-    self._current_tensors = {tag: _as_graph_element(tensor)
-                             for (tag, tensor) in self._tensors.items()}
+    self._current_tensors = {
+        tag: _as_graph_element(tensor)
+        for (tag, tensor) in self._tensors.items()
+    }
 
   def before_run(self, run_context):  # pylint: disable=unused-argument
     self._should_trigger = self._timer.should_trigger_for_step(self._iter_count)
@@ -463,9 +467,10 @@
 
   ...
   listener = ExampleCheckpointSaverListener()
-  saver_hook = tf.train.CheckpointSaverHook(
+  saver_hook = tf.estimator.CheckpointSaverHook(
       checkpoint_dir, listeners=[listener])
-  with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
+  with tf.compat.v1.train.MonitoredTrainingSession(
+      chief_only_hooks=[saver_hook]):
     ...
   ```
 
@@ -516,9 +521,9 @@
       saver: `Saver` object, used for saving.
       checkpoint_basename: `str`, base name for the checkpoint files.
       scaffold: `Scaffold`, use to get saver object.
-      listeners: List of `CheckpointSaverListener` subclass instances.
-        Used for callbacks that run immediately before or after this hook saves
-        the checkpoint.
+      listeners: List of `CheckpointSaverListener` subclass instances. Used for
+        callbacks that run immediately before or after this hook saves the
+        checkpoint.
 
     Raises:
       ValueError: One of `save_steps` or `save_secs` should be set.
@@ -531,8 +536,8 @@
     self._checkpoint_dir = checkpoint_dir
     self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
     self._scaffold = scaffold
-    self._timer = SecondOrStepTimer(every_secs=save_secs,
-                                    every_steps=save_steps)
+    self._timer = SecondOrStepTimer(
+        every_secs=save_secs, every_steps=save_steps)
     self._listeners = listeners or []
     self._steps_per_run = 1
 
@@ -555,13 +560,11 @@
     # add variables in begin. Graph is finalized after all begin calls.
     training_util.write_graph(
         ops.get_default_graph().as_graph_def(add_shapes=True),
-        self._checkpoint_dir,
-        "graph.pbtxt")
+        self._checkpoint_dir, "graph.pbtxt")
     saver_def = self._get_saver().saver_def if self._get_saver() else None
     graph = ops.get_default_graph()
     meta_graph_def = meta_graph.create_meta_graph_def(
-        graph_def=graph.as_graph_def(add_shapes=True),
-        saver_def=saver_def)
+        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
     self._summary_writer.add_graph(graph)
     self._summary_writer.add_meta_graph(meta_graph_def)
     # The checkpoint saved here is the state at step "global_step".
@@ -573,8 +576,8 @@
 
   def after_run(self, run_context, run_values):
     stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(
-        stale_global_step + self._steps_per_run):
+    if self._timer.should_trigger_for_step(stale_global_step +
+                                           self._steps_per_run):
       # get the real value after train op.
       global_step = run_context.session.run(self._global_step_tensor)
       if self._timer.should_trigger_for_step(global_step):
@@ -627,8 +630,8 @@
     elif len(savers) > 1:
       raise RuntimeError(
           "More than one item in collection {}. "
-          "Please indicate which one to use by passing it to the constructor.".
-          format(collection_key))
+          "Please indicate which one to use by passing it to the constructor."
+          .format(collection_key))
 
     self._saver = savers[0]
     return savers[0]
@@ -647,8 +650,8 @@
     if (every_n_steps is None) == (every_n_secs is None):
       raise ValueError(
           "exactly one of every_n_steps and every_n_secs should be provided.")
-    self._timer = SecondOrStepTimer(every_steps=every_n_steps,
-                                    every_secs=every_n_secs)
+    self._timer = SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
 
     self._summary_writer = summary_writer
     self._output_dir = output_dir
@@ -673,8 +676,9 @@
   def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
     steps_per_sec = elapsed_steps / elapsed_time
     if self._summary_writer is not None:
-      summary = Summary(value=[Summary.Value(
-          tag=self._summary_tag, simple_value=steps_per_sec)])
+      summary = Summary(value=[
+          Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
+      ])
       self._summary_writer.add_summary(summary, global_step)
     logging.info("%s: %g", self._summary_tag, steps_per_sec)
 
@@ -682,8 +686,8 @@
     _ = run_context
 
     stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(
-        stale_global_step + self._steps_per_run):
+    if self._timer.should_trigger_for_step(stale_global_step +
+                                           self._steps_per_run):
       # get the real value after train op.
       global_step = run_context.session.run(self._global_step_tensor)
       if self._timer.should_trigger_for_step(global_step):
@@ -767,18 +771,18 @@
 
     Args:
       save_steps: `int`, save summaries every N steps. Exactly one of
-          `save_secs` and `save_steps` should be set.
+        `save_secs` and `save_steps` should be set.
       save_secs: `int`, save summaries every N seconds.
-      output_dir: `string`, the directory to save the summaries to. Only used
-          if no `summary_writer` is supplied.
+      output_dir: `string`, the directory to save the summaries to. Only used if
+        no `summary_writer` is supplied.
       summary_writer: `SummaryWriter`. If `None` and an `output_dir` was passed,
-          one will be created accordingly.
+        one will be created accordingly.
       scaffold: `Scaffold` to get summary_op if it's not provided.
       summary_op: `Tensor` of type `string` containing the serialized `Summary`
-          protocol buffer or a list of `Tensor`. They are most likely an output
-          by TF summary methods like `tf.summary.scalar` or
-          `tf.summary.merge_all`. It can be passed in as one tensor; if more
-          than one, they must be passed in as a list.
+        protocol buffer or a list of `Tensor`. They are most likely an output by
+        TF summary methods like `tf.compat.v1.summary.scalar` or
+        `tf.compat.v1.summary.merge_all`. It can be passed in as one tensor; if
+        more than one, they must be passed in as a list.
 
     Raises:
       ValueError: Exactly one of scaffold or summary_op should be set.
@@ -791,8 +795,8 @@
     self._summary_writer = summary_writer
     self._output_dir = output_dir
     self._scaffold = scaffold
-    self._timer = SecondOrStepTimer(every_secs=save_secs,
-                                    every_steps=save_steps)
+    self._timer = SecondOrStepTimer(
+        every_secs=save_secs, every_steps=save_steps)
     # TODO(mdan): Throw an error if output_dir and summary_writer are None.
 
   def begin(self):
@@ -903,8 +907,9 @@
         self._worker_is_started = True
         return None
       if current_step - last_logged_step > 1000:
-        logging.info("Waiting for global step %d before starting training. "
-                     "Current step is %d.", self._wait_until_step, current_step)
+        logging.info(
+            "Waiting for global step %d before starting training. "
+            "Current step is %d.", self._wait_until_step, current_step)
         last_logged_step = current_step
       time.sleep(0.5)
 
@@ -917,8 +922,8 @@
     """Initializes `FinalOpHook` with ops to run at the end of the session.
 
     Args:
-      final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of
-        names to `Tensors`.
+      final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
+        to `Tensors`.
       final_ops_feed_dict: A feed dictionary to use when running
         `final_ops_dict`.
     """
@@ -997,14 +1002,14 @@
 
     Args:
       save_steps: `int`, save profile traces every N steps. Exactly one of
-          `save_secs` and `save_steps` should be set.
+        `save_secs` and `save_steps` should be set.
       save_secs: `int` or `float`, save profile traces every N seconds.
       output_dir: `string`, the directory to save the profile traces to.
-          Defaults to the current directory.
+        Defaults to the current directory.
       show_dataflow: `bool`, if True, add flow events to the trace connecting
-          producers and consumers of tensors.
+        producers and consumers of tensors.
       show_memory: `bool`, if True, add object snapshot events to the trace
-          showing the sizes and lifetimes of tensors.
+        showing the sizes and lifetimes of tensors.
     """
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
     self._file_writer = SummaryWriterCache.get(output_dir)
@@ -1024,8 +1029,9 @@
         self._next_step is not None and
         self._timer.should_trigger_for_step(self._next_step))
     requests = {"global_step": self._global_step_tensor}
-    opts = (config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
-            if self._request_summary else None)
+    opts = (
+        config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
+        if self._request_summary else None)
 
     return SessionRunArgs(requests, options=opts)
 
@@ -1039,8 +1045,7 @@
     if self._request_summary:
       global_step = run_context.session.run(self._global_step_tensor)
       self._timer.update_last_triggered_step(global_step)
-      self._save(global_step,
-                 self._output_file.format(global_step),
+      self._save(global_step, self._output_file.format(global_step),
                  run_values.run_metadata.step_stats)
       self._file_writer.add_run_metadata(run_values.run_metadata,
                                          "step_%d" % global_step)
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index 131ecf7..8382990 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -391,6 +391,10 @@
   This is the recommended way to get the mtimes, since it takes into account
   the naming difference between V1 and V2 formats.
 
+  Note: If not all checkpoints exist, the length of the returned mtimes list
+  will be smaller than the length of the `checkpoint_prefixes` list, so mapping
+  checkpoints to corresponding mtimes will not be possible.
+
   Args:
     checkpoint_prefixes: a list of checkpoint paths, typically the results of
       `Saver.save()` or those of `tf.train.latest_checkpoint()`, regardless of
@@ -477,6 +481,7 @@
   """Deletes old checkpoints.
 
   Example usage:
+
   ```python
   import tensorflow as tf
   checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
@@ -663,7 +668,7 @@
 
     Returns:
       The path to the new checkpoint. It is also recorded in the `checkpoints`
-      and `latest_checkpoint` properies.
+      and `latest_checkpoint` properties.
     """
     # Save counter logic duplicated from tf.train.Checkpoint, soon to diverge
     # slightly with a custom numbering option.
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index b6c5d30..11f6a2f 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -106,7 +106,7 @@
   """Replaces `tf.Variable` initializers so they load from a checkpoint file.
 
   Values are not loaded immediately, but when the initializer is run
-  (typically by running a `tf.global_variables_initializer` op).
+  (typically by running a `tf.compat.v1.global_variables_initializer` op).
 
   Note: This overrides default initialization ops of specified variables and
   redefines dtype.
@@ -139,15 +139,15 @@
   #  -- name='old_scope_2/var3', shape=[100, 100]
 
   # Create new model's variables
-  with tf.variable_scope('new_scope_1'):
-    var1 = tf.get_variable('var1', shape=[20, 2],
-                           initializer=tf.zeros_initializer())
-  with tf.variable_scope('new_scope_2'):
-    var2 = tf.get_variable('var2', shape=[50, 4],
-                           initializer=tf.zeros_initializer())
+  with tf.compat.v1.variable_scope('new_scope_1'):
+    var1 = tf.compat.v1.get_variable('var1', shape=[20, 2],
+                           initializer=tf.compat.v1.zeros_initializer())
+  with tf.compat.v1.variable_scope('new_scope_2'):
+    var2 = tf.compat.v1.get_variable('var2', shape=[50, 4],
+                           initializer=tf.compat.v1.zeros_initializer())
     # Partition into 5 variables along the first axis.
-    var3 = tf.get_variable(name='var3', shape=[100, 100],
-                           initializer=tf.zeros_initializer(),
+    var3 = tf.compat.v1.get_variable(name='var3', shape=[100, 100],
+                           initializer=tf.compat.v1.zeros_initializer(),
                            partitioner=lambda shape, dtype: [5, 1])
 
   # Initialize all variables in `new_scope_1` from `old_scope_1`.
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 1f94679..ef6e059 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -131,9 +131,13 @@
 
 
 @tf_export(v1=["train.replica_device_setter"])
-def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
-                          worker_device="/job:worker", merge_devices=True,
-                          cluster=None, ps_ops=None, ps_strategy=None):
+def replica_device_setter(ps_tasks=0,
+                          ps_device="/job:ps",
+                          worker_device="/job:worker",
+                          merge_devices=True,
+                          cluster=None,
+                          ps_ops=None,
+                          ps_strategy=None):
   """Return a `device function` to use when building a Graph for replicas.
 
   Device Functions are used in `with tf.device(device_function):` statement to
@@ -158,7 +162,8 @@
   cluster_spec = {
       "ps": ["ps0:2222", "ps1:2222"],
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
-  with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
+  with tf.device(
+      tf.compat.v1.train.replica_device_setter(cluster=cluster_spec)):
     # Build your graph
     v1 = tf.Variable(...)  # assigned to /job:ps/task:0
     v2 = tf.Variable(...)  # assigned to /job:ps/task:1
@@ -218,6 +223,6 @@
     ps_strategy = _RoundRobinStrategy(ps_tasks)
   if not six.callable(ps_strategy):
     raise TypeError("ps_strategy must be callable")
-  chooser = _ReplicaDeviceChooser(
-      ps_tasks, ps_device, worker_device, merge_devices, ps_ops, ps_strategy)
+  chooser = _ReplicaDeviceChooser(ps_tasks, ps_device, worker_device,
+                                  merge_devices, ps_ops, ps_strategy)
   return chooser.device_function
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index 35f0b6e..d0aa3c3 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -65,8 +65,8 @@
   """Gets the eval step `Tensor` value after running `update_ops`.
 
   Args:
-    update_ops: A list of `Tensors` or a dictionary of names to `Tensors`,
-        which are run before reading the eval step value.
+    update_ops: A list of `Tensors` or a dictionary of names to `Tensors`, which
+      are run before reading the eval step value.
 
   Returns:
     A `Tensor` representing the value for the evaluation step.
@@ -102,21 +102,20 @@
 
   def after_create_session(self, session, coord):
     # Update number of steps to run in the first run call
-    if  self._num_evals is None:
+    if self._num_evals is None:
       steps = self._steps_per_run_initial_value
     else:
       steps = min(self._steps_per_run_initial_value, self._num_evals)
     self._steps_per_run_variable.load(steps, session=session)
 
   def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs({
-        'evals_completed': self._evals_completed
-    })
+    return session_run_hook.SessionRunArgs(
+        {'evals_completed': self._evals_completed})
 
   def after_run(self, run_context, run_values):
     evals_completed = run_values.results['evals_completed']
     # Update number of steps to run in the next iteration
-    if  self._num_evals is None:
+    if self._num_evals is None:
       steps = self._steps_per_run_initial_value
     else:
       steps = min(self._num_evals - evals_completed,
@@ -147,16 +146,15 @@
     self._evals_completed = None
     self._log_progress = log_progress
     # Reduce logging frequency if there are 20 or more evaluations.
-    self._log_frequency = (1 if (num_evals is None or num_evals < 20)
-                           else math.floor(num_evals / 10.))
+    self._log_frequency = (1 if (num_evals is None or num_evals < 20) else
+                           math.floor(num_evals / 10.))
 
   def _set_evals_completed_tensor(self, updated_eval_step):
     self._evals_completed = updated_eval_step
 
   def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs({
-        'evals_completed': self._evals_completed
-    })
+    return session_run_hook.SessionRunArgs(
+        {'evals_completed': self._evals_completed})
 
   def after_run(self, run_context, run_values):
     evals_completed = run_values.results['evals_completed']
@@ -205,20 +203,20 @@
   Args:
     checkpoint_path: The path to a checkpoint to use for evaluation.
     master: The BNS address of the TensorFlow master.
-    scaffold: An tf.train.Scaffold instance for initializing variables and
-      restoring variables. Note that `scaffold.init_fn` is used by the function
-      to restore the checkpoint. If you supply a custom init_fn, then it must
-      also take care of restoring the model from its checkpoint.
-    eval_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
-      to `Tensors`, which is run until the session is requested to stop,
-      commonly done by a `tf.contrib.training.StopAfterNEvalsHook`.
+    scaffold: A tf.compat.v1.train.Scaffold instance for initializing variables
+      and restoring variables. Note that `scaffold.init_fn` is used by the
+      function to restore the checkpoint. If you supply a custom init_fn, then
+      it must also take care of restoring the model from its checkpoint.
+    eval_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names to
+      `Tensors`, which is run until the session is requested to stop, commonly
+      done by a `tf.contrib.training.StopAfterNEvalsHook`.
     feed_dict: The feed dictionary to use when executing the `eval_ops`.
     final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
       to `Tensors`.
     final_ops_feed_dict: A feed dictionary to use when evaluating `final_ops`.
-    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside the
-      evaluation loop.
-    config: An instance of `tf.ConfigProto` that will be used to
+    hooks: List of `tf.estimator.SessionRunHook` callbacks which are run inside
+      the evaluation loop.
+    config: An instance of `tf.compat.v1.ConfigProto` that will be used to
       configure the `Session`. If left as `None`, the default will be used.
 
   Returns:
@@ -263,8 +261,8 @@
       master=master,
       config=config)
 
-  final_ops_hook = basic_session_run_hooks.FinalOpsHook(
-      final_ops, final_ops_feed_dict)
+  final_ops_hook = basic_session_run_hooks.FinalOpsHook(final_ops,
+                                                        final_ops_feed_dict)
   hooks.append(final_ops_hook)
 
   with monitored_session.MonitoredSession(
diff --git a/tensorflow/python/training/experimental/loss_scale_optimizer.py b/tensorflow/python/training/experimental/loss_scale_optimizer.py
index b0d101f..eaa3f02 100644
--- a/tensorflow/python/training/experimental/loss_scale_optimizer.py
+++ b/tensorflow/python/training/experimental/loss_scale_optimizer.py
@@ -119,8 +119,8 @@
 
     grads = [g for g, _ in grads_and_vars]
     variables = [v for _, v in grads_and_vars]
-    scaled_grads = self._scale_grads(grads)
-    return list(zip(scaled_grads, variables))
+    unscaled_grads = self._unscale_grads(grads)
+    return list(zip(unscaled_grads, variables))
 
   def _scale_loss(self, loss):
     loss_scale = self._loss_scale()
@@ -128,7 +128,7 @@
       return lambda: loss() * loss_scale
     return loss * loss_scale
 
-  def _scale_grads(self, grads):
+  def _unscale_grads(self, grads):
     loss_scale = self._loss_scale()
     loss_scale_reciprical = 1 / loss_scale
     return [
@@ -171,6 +171,7 @@
       return self._optimizer.apply_gradients(grads_and_vars, global_step, name)
 
     replica_context = distribution_strategy_context.get_replica_context()
+    grads_and_vars = tuple(grads_and_vars)
 
     # TODO(nluehr) cleanup GraphKeys.TRAIN_OP
     return replica_context.merge_call(
diff --git a/tensorflow/python/training/experimental/mixed_precision.py b/tensorflow/python/training/experimental/mixed_precision.py
index b96192b..9d4fa4b 100644
--- a/tensorflow/python/training/experimental/mixed_precision.py
+++ b/tensorflow/python/training/experimental/mixed_precision.py
@@ -19,8 +19,10 @@
 from __future__ import print_function
 
 from tensorflow.python.framework import config
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import optimizer
 from tensorflow.python.training.experimental import loss_scale_optimizer as loss_scale_optimizer_v1
+from tensorflow.python.training.experimental import mixed_precision_global_state
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
@@ -134,30 +136,6 @@
   `tf.gradients`/`tf.GradientTape` will not. If you do directly use
   `tf.gradients` or `tf.GradientTape`, your model may train to a worse quality.
 
-  Note: If you explicitly pass a ConfigProto to your Session, you must set the
-  `auto_mixed_precision` option to ON. If you do not pass any ConfigProto to
-  your Session, no extra work needs to be done. For example:
-
-  ```
-  loss, trainable_vars = ...
-  opt = tf.keras.optimizers.SGD(0.001)
-  opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
-  train_op = opt.minimize(loss, vars=trainable_vars)
-
-  # No extra work needs to be done, as no ConfigProto is passed to the Session
-  with tf.Session() as sess:
-    sess.run(train_op)
-
-  # If a ConfigProto is passed to Session, you MUST set the
-  # `auto_mixed_precision` field to ON.
-  config = tf.ConfigProto()
-  from tensorflow.core.protobuf import rewriter_config_pb2
-  config.graph_options.rewrite_options.auto_mixed_precision = (
-      rewriter_config_pb2.RewriterConfig.ON)
-  with tf.Session(config=config) as sess:
-    sess.run(train_op)
-  ```
-
   Currently, mixed precision is only enabled on Volta GPUs and above. TPU
   support is coming soon. CPUs are not supported, as CPUs do not run float16
   operations faster than float32 operations.
@@ -181,6 +159,13 @@
 def _enable_mixed_precision_graph_rewrite_base(opt, loss_scale,
                                                use_v1_behavior):
   """Enables mixed precision. See `enable_mixed_precision_graph_rewrite`."""
+  if mixed_precision_global_state.non_mixed_precision_session_created:
+    # TODO(reedwm): Give the stacktrace of the existing Sessions. And if the
+    # Sessions have already been closed, do not raise this error message.
+    tf_logging.warn('You already have existing Sessions that do not use mixed '
+                    'precision. enable_mixed_precision_graph_rewrite() will '
+                    'not affect these Sessions.')
   opt = _wrap_optimizer(opt, loss_scale, use_v1_behavior=use_v1_behavior)
   config.set_optimizer_experimental_options({'auto_mixed_precision': True})
+  mixed_precision_global_state.mixed_precision_is_enabled = True
   return opt
diff --git a/tensorflow/python/training/experimental/mixed_precision_global_state.py b/tensorflow/python/training/experimental/mixed_precision_global_state.py
new file mode 100644
index 0000000..ffb3f90
--- /dev/null
+++ b/tensorflow/python/training/experimental/mixed_precision_global_state.py
@@ -0,0 +1,34 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains global variables related to mixed precision.
+
+This is not part of mixed_precision.py to avoid a circular dependency.
+mixed_precision.py depends on Session, and Session depends on this file.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# Whether mixed precision has been enabled or not with
+# `enable_mixed_precision_graph_rewrite`. Used to turn on auto_mixed_precision
+# in ConfigProtos passed to Sessions.
+mixed_precision_is_enabled = False
+
+# True if a Session has been created without mixed precision being enabled. Used
+# to give a warning if mixed precision is enabled after a Session has already
+# been created.
+non_mixed_precision_session_created = False
diff --git a/tensorflow/python/training/experimental/mixed_precision_test.py b/tensorflow/python/training/experimental/mixed_precision_test.py
index 94bd438..a52e847 100644
--- a/tensorflow/python/training/experimental/mixed_precision_test.py
+++ b/tensorflow/python/training/experimental/mixed_precision_test.py
@@ -20,6 +20,7 @@
 import os
 from absl.testing import parameterized
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
@@ -32,9 +33,11 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import gradient_descent as gradient_descent_v1
 from tensorflow.python.training.experimental import loss_scale_optimizer as loss_scale_optimizer_v1
 from tensorflow.python.training.experimental import mixed_precision
+from tensorflow.python.training.experimental import mixed_precision_global_state
 
 
 if tf2.enabled():
@@ -55,6 +58,10 @@
     # to ignore performance and always transform the graph.
     self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR)
     os.environ[self.IGNORE_PERF_VAR] = '1'
+    # Set global variables to their original state, in case other tests modified
+    # them
+    mixed_precision_global_state.mixed_precision_is_enabled = False
+    mixed_precision_global_state.non_mixed_precision_session_created = False
 
   def tearDown(self):
     # Set auto_mixed_precision back to it's default value.
@@ -64,6 +71,9 @@
       os.environ[self.IGNORE_PERF_VAR] = self._original_ignore_perf_value
     else:
       del os.environ[self.IGNORE_PERF_VAR]
+    # Revert global variables
+    mixed_precision_global_state.mixed_precision_is_enabled = False
+    mixed_precision_global_state.non_mixed_precision_session_created = False
     super(MixedPrecisionTest, self).tearDown()
 
   @test_util.run_in_graph_and_eager_modes
@@ -137,6 +147,34 @@
         sess.run(var.initializer)
         self.assertEqual(sess.run(out), float('Inf'))
 
+      # Test Session will enable the auto_mixed_precision grappler pass in a
+      # ConfigProto passed by the user
+      with session.Session(config=config_pb2.ConfigProto()) as sess:
+        out = overflow_in_float16()
+        sess.run(var.initializer)
+        self.assertEqual(sess.run(out), float('Inf'))
+
+  @test.mock.patch.object(tf_logging, 'warn')
+  def test_warn_if_session_already_exists(self, mock_warn):
+    with session.Session():
+      enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
+      mock_warn.assert_any_call(
+          'You already have existing Sessions that do not use mixed precision. '
+          'enable_mixed_precision_graph_rewrite() will not affect these '
+          'Sessions.')
+
+  @test.mock.patch.object(tf_logging, 'warn')
+  def test_do_not_warn_if_session_does_not_already_exist(self, mock_warn):
+    enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))
+    with session.Session():
+      # Make sure the "You already have existing Sessions" warning was not
+      # issued, since the Session was only created after
+      # enable_mixed_precision_graph_rewrite.
+      for call_arg in mock_warn.call_args_list:
+        msg = call_arg[0][0]
+        self.assertNotIn('You already have existing Sessions that do not use '
+                         'mixed precision', msg)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 5095efa..756b9b5 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -1090,7 +1090,7 @@
 
   The `tensors_list` argument is a list of tuples of tensors, or a list of
   dictionaries of tensors.  Each element in the list is treated similarly
-  to the `tensors` argument of `tf.train.batch()`.
+  to the `tensors` argument of `tf.compat.v1.train.batch()`.
 
   WARNING: This function is nondeterministic, since it starts a separate thread
   for each tensor.
@@ -1284,7 +1284,7 @@
 
   ```python
   # Creates batches of 32 images and 32 labels.
-  image_batch, label_batch = tf.train.shuffle_batch(
+  image_batch, label_batch = tf.compat.v1.train.shuffle_batch(
         [single_image, single_label],
         batch_size=32,
         num_threads=4,
@@ -1425,7 +1425,7 @@
 
   The `tensors_list` argument is a list of tuples of tensors, or a list of
   dictionaries of tensors.  Each element in the list is treated similarly
-  to the `tensors` argument of `tf.train.shuffle_batch()`.
+  to the `tensors` argument of `tf.compat.v1.train.shuffle_batch()`.
 
   This version enqueues a different list of tensors in different threads.
   It adds the following to the current `Graph`:
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index ab9d923..3805175 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -56,24 +56,25 @@
   ...
   global_step = tf.Variable(0, trainable=False)
   starter_learning_rate = 0.1
-  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
+  learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate,
+  global_step,
                                              100000, 0.96, staircase=True)
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
+      tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
       .minimize(...my loss..., global_step=global_step)
   )
   ```
 
   Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    decay_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The decay rate.
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
+      The initial learning rate.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.  Must not be negative.
+    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must
+      be positive.  See the decay computation above.
+    decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
+      The decay rate.
     staircase: Boolean.  If `True` decay the learning rate at discrete intervals
     name: String.  Optional name of the operation.  Defaults to
       'ExponentialDecay'.
@@ -91,11 +92,8 @@
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_schedule.ExponentialDecay(learning_rate,
-                                                       decay_steps,
-                                                       decay_rate,
-                                                       staircase=staircase,
-                                                       name=name)
+  decayed_lr = learning_rate_schedule.ExponentialDecay(
+      learning_rate, decay_steps, decay_rate, staircase=staircase, name=name)
   if not context.executing_eagerly():
     decayed_lr = decayed_lr(global_step)
   else:
@@ -114,7 +112,8 @@
   global_step = tf.Variable(0, trainable=False)
   boundaries = [100000, 110000]
   values = [1.0, 0.5, 0.1]
-  learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
+  learning_rate = tf.compat.v1.train.piecewise_constant(
+      global_step, boundaries, values)
 
   # Later, whenever we perform an optimization step, we increment global_step.
   ```
@@ -202,27 +201,28 @@
   starter_learning_rate = 0.1
   end_learning_rate = 0.01
   decay_steps = 10000
-  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
+  learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate,
+  global_step,
                                             decay_steps, end_learning_rate,
                                             power=0.5)
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
+      tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
       .minimize(...my loss..., global_step=global_step)
   )
   ```
 
   Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The minimal end learning rate.
-    power: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The power of the polynomial. Defaults to linear, 1.0.
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
+      The initial learning rate.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.  Must not be negative.
+    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must
+      be positive.  See the decay computation above.
+    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
+      number.  The minimal end learning rate.
+    power: A scalar `float32` or `float64` `Tensor` or a Python number.  The
+      power of the polynomial. Defaults to linear, 1.0.
     cycle: A boolean, whether or not it should cycle beyond decay_steps.
     name: String.  Optional name of the operation. Defaults to
       'PolynomialDecay'.
@@ -292,21 +292,22 @@
   learning_rate = 0.1
   decay_steps = 5
   k = 0.5
-  learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
+  learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate,
+  global_step,
                                              decay_steps, k)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
+      tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
       .minimize(...my loss..., global_step=global_step)
   )
   ```
 
   Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
+      The initial learning rate.
+    global_step: A Python number. Global step to use for the decay computation.
+      Must not be negative.
     decay_steps: How often to apply decay.
     decay_rate: A Python number.  The decay rate.
     staircase: Whether to apply decay in a discrete staircase, as opposed to
@@ -329,7 +330,10 @@
   """
   natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate))
   decayed_lr = learning_rate_schedule.ExponentialDecay(
-      learning_rate, decay_steps, natural_exp_rate, staircase=staircase,
+      learning_rate,
+      decay_steps,
+      natural_exp_rate,
+      staircase=staircase,
       name=name)
 
   if not context.executing_eagerly():
@@ -376,21 +380,22 @@
   learning_rate = 0.1
   decay_steps = 1.0
   decay_rate = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
+  learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate,
+  global_step,
   decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
+      tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
       .minimize(...my loss..., global_step=global_step)
   )
   ```
 
   Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
+      The initial learning rate.
+    global_step: A Python number. Global step to use for the decay computation.
+      Must not be negative.
     decay_steps: How often to apply decay.
     decay_rate: A Python number.  The decay rate.
     staircase: Whether to apply decay in a discrete staircase, as opposed to
@@ -412,11 +417,7 @@
   @end_compatibility
   """
   decayed_lr = learning_rate_schedule.InverseTimeDecay(
-      learning_rate,
-      decay_steps,
-      decay_rate,
-      staircase=staircase,
-      name=name)
+      learning_rate, decay_steps, decay_rate, staircase=staircase, name=name)
 
   if not context.executing_eagerly():
     decayed_lr = decayed_lr(global_step)
@@ -455,13 +456,14 @@
   Args:
     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
       The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of learning_rate.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.
+    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
+      of steps to decay over.
+    alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum
+      learning rate value as a fraction of learning_rate.
     name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
+
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     learning rate.
@@ -519,17 +521,18 @@
   Args:
     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
       The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.
     first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
       Number of steps to decay over.
-    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Used to derive the number of iterations in the i-th period
+    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used to
+      derive the number of iterations in the i-th period
     m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
       Used to derive the initial learning rate of the i-th period:
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of the learning_rate.
+    alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum
+      learning rate value as a fraction of the learning_rate.
     name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
+
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     learning rate.
@@ -602,16 +605,17 @@
   Args:
     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
       The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.
+    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
+      of steps to decay over.
+    num_periods: Number of periods in the cosine part of the decay. See
+      computation above.
     alpha: See computation above.
     beta: See computation above.
     name: String.  Optional name of the operation.  Defaults to
       'LinearCosineDecay'.
+
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     learning rate.
@@ -690,18 +694,19 @@
   Args:
     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
       The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
+      step to use for the decay computation.
+    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
+      of steps to decay over.
     initial_variance: initial variance for the noise. See computation above.
     variance_decay: decay for the noise's variance. See computation above.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
+    num_periods: Number of periods in the cosine part of the decay. See
+      computation above.
     alpha: See computation above.
     beta: See computation above.
     name: String.  Optional name of the operation.  Defaults to
       'NoisyLinearCosineDecay'.
+
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     learning rate.
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index d313d13..46afabc 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -77,7 +77,8 @@
   The following pieces are directly accessible as attributes of the `Scaffold`
   object:
 
-  * `saver`: A `tf.train.Saver` object taking care of saving the variables.
+  * `saver`: A `tf.compat.v1.train.Saver` object taking care of saving the
+    variables.
     Picked from and stored into the `SAVERS` collection in the graph by default.
   * `init_op`: An op to run to initialize the variables.  Picked from and
     stored into the `INIT_OP` collection in the graph by default.
@@ -133,9 +134,9 @@
       local_init_op: Optional op to initialize local variables.
       summary_op: Optional op to gather all summaries.  Must return a scalar
         string tensor containing a serialized `Summary` proto.
-      saver: Optional `tf.train.Saver` object to use to save and restore
-        variables.  May also be a `tf.train.Checkpoint` object, in which case
-        object-based checkpoints are saved. This will also load some
+      saver: Optional `tf.compat.v1.train.Saver` object to use to save and
+        restore variables.  May also be a `tf.train.Checkpoint` object, in which
+        case object-based checkpoints are saved. This will also load some
         object-based checkpoints saved from elsewhere, but that loading may be
         fragile since it uses fixed keys rather than performing a full
         graph-based match. For example if a variable has two paths from the
@@ -199,8 +200,9 @@
             resources.report_uninitialized_resources()
         ], 0)
 
-      self._ready_op = Scaffold.get_or_default(
-          'ready_op', ops.GraphKeys.READY_OP, default_ready_op)
+      self._ready_op = Scaffold.get_or_default('ready_op',
+                                               ops.GraphKeys.READY_OP,
+                                               default_ready_op)
     if self._ready_for_local_init_op is None:
 
       def default_ready_for_local_init_op():
@@ -219,8 +221,9 @@
           'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
           Scaffold.default_local_init_op)
     if self._summary_op is None:
-      self._summary_op = Scaffold.get_or_default(
-          'summary_op', ops.GraphKeys.SUMMARY_OP, summary.merge_all)
+      self._summary_op = Scaffold.get_or_default('summary_op',
+                                                 ops.GraphKeys.SUMMARY_OP,
+                                                 summary.merge_all)
     # pylint: disable=g-long-lambda
     if self._saver is None:
       self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
@@ -292,7 +295,8 @@
 
     This op is used during session initialization when a Scaffold is
     initialized without specifying the local_init_op arg. It includes
-    `tf.local_variables_initializer`, `tf.tables_initializer`, and also
+    `tf.compat.v1.local_variables_initializer`,
+    `tf.compat.v1.tables_initializer`, and also
     initializes local session resources.
 
     Returns:
@@ -435,7 +439,8 @@
   For a chief, this utility sets proper session initializer/restorer. It also
   creates hooks related to checkpoint and summary saving. For workers, this
   utility sets proper session creator which waits for the chief to
-  initialize/restore. Please check `tf.train.MonitoredSession` for more
+  initialize/restore. Please check `tf.compat.v1.train.MonitoredSession` for
+  more
   information.
 
 
@@ -464,8 +469,9 @@
       to disk using a default summary saver.  If both `save_summaries_steps` and
       `save_summaries_secs` are set to `None`, then the default summary saver
       isn't used. Default not enabled.
-    config: an instance of `tf.ConfigProto` proto used to configure the session.
-      It's the `config` argument of constructor of `tf.Session`.
+    config: an instance of `tf.compat.v1.ConfigProto` proto used to configure
+      the session. It's the `config` argument of constructor of
+      `tf.compat.v1.Session`.
     stop_grace_period_secs: Number of seconds given to threads to stop after
       `close()` has been called.
     log_step_count_steps: The frequency, in number of global steps, that the
@@ -591,7 +597,7 @@
 
 @tf_export(v1=['train.ChiefSessionCreator'])
 class ChiefSessionCreator(SessionCreator):
-  """Creates a tf.Session for a chief."""
+  """Creates a tf.compat.v1.Session for a chief."""
 
   def __init__(self,
                scaffold=None,
@@ -643,7 +649,7 @@
 
 @tf_export(v1=['train.WorkerSessionCreator'])
 class WorkerSessionCreator(SessionCreator):
-  """Creates a tf.Session for a worker."""
+  """Creates a tf.compat.v1.Session for a worker."""
 
   def __init__(self,
                scaffold=None,
@@ -757,8 +763,9 @@
         `step_fn` will be returned from `run_step_fn`, unless a stop is
         requested.  In that case, the next `should_stop` call will return True.
         Example usage:  ```python
-           with tf.Graph().as_default(): c = tf.placeholder(dtypes.float32) v =
-             tf.add(c, 4.0) w = tf.add(c, 0.5)
+           with tf.Graph().as_default():
+             c = tf.compat.v1.placeholder(dtypes.float32)
+             v = tf.add(c, 4.0); w = tf.add(c, 0.5)
              def step_fn(step_context):
                a = step_context.session.run(fetches=v, feed_dict={c: 0.5})
                if a <= 4.5: step_context.request_stop()
@@ -808,7 +815,7 @@
       """Initializes the `step_context` argument for a `step_fn` invocation.
 
       Args:
-        session: An instance of `tf.Session`.
+        session: An instance of `tf.compat.v1.Session`.
         run_with_hooks_fn: A function for running fetches and hooks.
       """
       self._session = session
@@ -901,13 +908,13 @@
     return self._coordinated_creator.tf_sess is None
 
   def _tf_sess(self):
-    """Return underlying tf.Session object.
+    """Return underlying tf.compat.v1.Session object.
 
     Warning: accessing the returned object in user code is likely to cause races
     or "flaky tests".
 
     Returns:
-      A tf.Session object.
+      A tf.compat.v1.Session object.
     """
     return self._coordinated_creator.tf_sess
 
@@ -955,7 +962,7 @@
   * suppresses `OutOfRange` error which indicates that all inputs have been
     processed if the monitored_session is used as a context
 
-  How to set `tf.Session` arguments:
+  How to set `tf.compat.v1.Session` arguments:
 
   * In most cases you can set session arguments as follows:
 
@@ -973,7 +980,8 @@
 
   See `MonitoredTrainingSession` for an example usage based on chief or worker.
 
-  Note: This is not a `tf.Session`. For example, it cannot do following:
+  Note: This is not a `tf.compat.v1.Session`. For example, it cannot do the
+  following:
 
   * it cannot be set as default session.
   * it cannot be sent to saver.save.
@@ -1004,14 +1012,15 @@
   """Session-like object that handles initialization, restoring, and hooks.
 
   Please note that this utility is not recommended for distributed settings.
-  For distributed settings, please use `tf.train.MonitoredSession`. The
+  For distributed settings, please use `tf.compat.v1.train.MonitoredSession`.
+  The
   differences between `MonitoredSession` and `SingularMonitoredSession` are:
 
   * `MonitoredSession` handles `AbortedError` and `UnavailableError` for
     distributed settings, but `SingularMonitoredSession` does not.
   * `MonitoredSession` can be created in `chief` or `worker` modes.
     `SingularMonitoredSession` is always created as `chief`.
-  * You can access the raw `tf.Session` object used by
+  * You can access the raw `tf.compat.v1.Session` object used by
     `SingularMonitoredSession`, whereas in MonitoredSession the raw session is
     private. This can be used:
       - To `run` without hooks.
@@ -1093,7 +1102,7 @@
 
 
 class _WrappedSession(object):
-  """Wrapper around a `tf.Session`.
+  """Wrapper around a `tf.compat.v1.Session`.
 
   This wrapper is used as a base class for various session wrappers
   that provide additional functionality such as monitoring, coordination,
@@ -1108,7 +1117,8 @@
     """Creates a `_WrappedSession`.
 
     Args:
-      sess: A `tf.Session` or `_WrappedSession` object.  The wrapped session.
+      sess: A `tf.compat.v1.Session` or `_WrappedSession` object.  The wrapped
+        session.
     """
     self._sess = sess
     self._wrapped_is_stoppable = isinstance(self._sess, _WrappedSession)
@@ -1293,7 +1303,7 @@
     """Create a new `_CoordinatedSession`.
 
     Args:
-      sess: A `tf.Session` object.  The wrapped session.
+      sess: A `tf.compat.v1.Session` object.  The wrapped session.
       coord: A `tf.train.Coordinator` object.
       stop_grace_period_secs: Number of seconds given to threads to stop after
         `close()` has been called.
@@ -1364,7 +1374,7 @@
     """Initializes a _HookedSession object.
 
     Args:
-      sess: A `tf.Session` or a `_WrappedSession` object.
+      sess: A `tf.compat.v1.Session` or a `_WrappedSession` object.
       hooks: An iterable of `SessionRunHook' objects.
     """
 
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index cc58a95..ffab954 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -56,9 +56,9 @@
   E.g.:
 
   ```
-    with tf.variable_scope('scope1'):
-      with tf.variable_scope('scope2'):
-        var = tf.get_variable('foo')
+    with tf.compat.v1.variable_scope('scope1'):
+      with tf.compat.v1.variable_scope('scope2'):
+        var = tf.compat.v1.get_variable('foo')
         update_1 = tf.assign_moving_average(var, 0.0, 1.0)
         update_2 = tf.assign_moving_average(var, 0.0, 0.9)
 
@@ -73,13 +73,14 @@
     decay: A float Tensor or float value.  The moving average decay.
     zero_debias: A python bool. If true, assume the variable is 0-initialized
       and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
-      `_zero_debias` for more details.
+        `_zero_debias` for more details.
     name: Optional name of the returned operation.
 
   Returns:
     A tensor which if evaluated will compute and return the new moving average.
   """
-  def update_fn(v, value, decay=decay):
+
+  def update_delta_fn(v, value, decay=decay):
     decay = ops.convert_to_tensor(1.0 - decay, name="decay")
     if decay.dtype != v.dtype.base_dtype:
       decay = math_ops.cast(decay, v.dtype.base_dtype)
@@ -87,6 +88,9 @@
       update_delta = _zero_debias(v, value, decay)
     else:
       update_delta = (v - value) * decay
+    return update_delta
+
+  def update_fn(v, update_delta):
     return state_ops.assign_sub(v, update_delta, name=scope)
 
   with ops.name_scope(name, "AssignMovingAvg",
@@ -96,14 +100,16 @@
       # In a replica context, we update variable using the mean of value across
       # replicas.
       def merge_fn(strategy, v, value):
-        value = strategy.extended.reduce_to(
-            ds_reduce_util.ReduceOp.MEAN, value, v)
-        return strategy.extended.update(v, update_fn, args=(value,))
+        value = strategy.extended.reduce_to(ds_reduce_util.ReduceOp.MEAN, value,
+                                            v)
+        update_delta = update_delta_fn(v, value)
+        return strategy.extended.update(v, update_fn, args=(update_delta,))
 
       return replica_context.merge_call(merge_fn, args=(variable, value))
     else:
       strategy = distribution_strategy_context.get_cross_replica_context()
-      return strategy.extended.update(variable, update_fn, args=(value,))
+      update_delta = update_delta_fn(variable, value)
+      return strategy.extended.update(variable, update_fn, args=(update_delta,))
 
 
 def weighted_moving_average(value,
@@ -124,15 +130,15 @@
   Args:
     value: A numeric `Tensor`.
     decay: A float `Tensor` or float value.  The moving average decay.
-    weight:  `Tensor` that keeps the current value of a weight.
-      Shape should be able to multiply `value`.
+    weight:  `Tensor` that keeps the current value of a weight. Shape should be
+      able to multiply `value`.
     truediv:  Boolean, if `True`, dividing by `moving_average(weight)` is
       floating point division.  If `False`, use division implied by dtypes.
     collections:  List of graph collections keys to add the internal variables
-      `value * weight` and `weight` to.
-      Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
-    name: Optional name of the returned operation.
-      Defaults to "WeightedMovingAvg".
+      `value * weight` and `weight` to. Defaults to
+      `[GraphKeys.GLOBAL_VARIABLES]`.
+    name: Optional name of the returned operation. Defaults to
+      "WeightedMovingAvg".
 
   Returns:
     An Operation that updates and returns the weighted moving average.
@@ -203,50 +209,59 @@
     tensor will also update the shadow variables appropriately.
   """
   with variable_scope.variable_scope(
-      unbiased_var.name[:-len(":0")], values=[unbiased_var,
-                                              value, decay]) as scope:
-    with ops.colocate_with(unbiased_var):
-      with ops.init_scope():
-        biased_initializer = init_ops.zeros_initializer(
-            dtype=unbiased_var.dtype)(unbiased_var.get_shape())
-        local_step_initializer = init_ops.zeros_initializer()
-      def _maybe_get_unique(name):
-        """Get name for a unique variable, if not `reuse=True`."""
-        if variable_scope.get_variable_scope().reuse:
-          return name
-        vs_vars = [x.op.name for x in
-                   variable_scope.get_variable_scope().global_variables()]
-        full_name = variable_scope.get_variable_scope().name + "/" + name
-        if full_name not in vs_vars: return name
-        idx = 1
-        while full_name + ("_%d" % idx) in vs_vars:
-          idx += 1
-        return name + ("_%d" % idx)
+      unbiased_var.name[:-len(":0")], values=[unbiased_var, value,
+                                              decay]) as scope:
+    with ops.init_scope():
+      biased_initializer = init_ops.zeros_initializer()
+      local_step_initializer = init_ops.zeros_initializer()
+
+    def _maybe_get_unique(name):
+      """Get name for a unique variable, if not `reuse=True`."""
+      if variable_scope.get_variable_scope().reuse:
+        return name
+      vs_vars = [
+          x.op.name
+          for x in variable_scope.get_variable_scope().global_variables()
+      ]
+      full_name = variable_scope.get_variable_scope().name + "/" + name
+      if full_name not in vs_vars:
+        return name
+      idx = 1
+      while full_name + ("_%d" % idx) in vs_vars:
+        idx += 1
+      return name + ("_%d" % idx)
+
+    strategy = distribution_strategy_context.get_strategy()
+    with strategy.extended.colocate_vars_with(unbiased_var):
       biased_var = variable_scope.get_variable(
-          _maybe_get_unique("biased"), initializer=biased_initializer,
-          trainable=False)
+          _maybe_get_unique("biased"),
+          initializer=biased_initializer,
+          shape=unbiased_var.get_shape(),
+          dtype=unbiased_var.dtype,
+          trainable=False,
+          aggregation=variable_scope.VariableAggregation.MEAN)
       local_step = variable_scope.get_variable(
           _maybe_get_unique("local_step"),
           shape=[],
           dtype=unbiased_var.dtype,
           initializer=local_step_initializer,
-          trainable=False)
+          trainable=False,
+          aggregation=variable_scope.VariableAggregation.MEAN)
 
-      # Get an update ops for both shadow variables.
-      update_biased = state_ops.assign_sub(biased_var,
-                                           (biased_var - value) * decay,
-                                           name=scope.name)
-      update_local_step = local_step.assign_add(1)
+    # Get an update ops for both shadow variables.
+    update_biased = state_ops.assign_sub(
+        biased_var, (biased_var - value) * decay, name=scope.name)
+    update_local_step = local_step.assign_add(1)
 
-      # Compute the value of the delta to update the unbiased EMA. Make sure to
-      # use the new values of the biased variable and the local step.
-      with ops.control_dependencies([update_biased, update_local_step]):
-        # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
-        unbiased_ema_delta = (unbiased_var - biased_var.read_value() /
-                              (1 - math_ops.pow(
-                                  1.0 - decay, local_step.read_value())))
+    # Compute the value of the delta to update the unbiased EMA. Make sure to
+    # use the new values of the biased variable and the local step.
+    with ops.control_dependencies([update_biased, update_local_step]):
+      # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
+      unbiased_ema_delta = (
+          unbiased_var - biased_var.read_value() /
+          (1 - math_ops.pow(1.0 - decay, local_step.read_value())))
 
-      return unbiased_ema_delta
+    return unbiased_ema_delta
 
 
 @tf_export("train.ExponentialMovingAverage")
@@ -315,7 +330,7 @@
      for a given variable.
   *  Build a model normally but load the checkpoint files to evaluate by using
      the shadow variable names.  For this use the `average_name()` method.  See
-     the `tf.train.Saver` for more
+     the `tf.compat.v1.train.Saver` for more
      information on restoring saved variables.
 
   Example of restoring the shadow variable values:
@@ -324,13 +339,17 @@
   # Create a Saver that loads variables from their saved shadow values.
   shadow_var0_name = ema.average_name(var0)
   shadow_var1_name = ema.average_name(var1)
-  saver = tf.train.Saver({shadow_var0_name: var0, shadow_var1_name: var1})
+  saver = tf.compat.v1.train.Saver(
+      {shadow_var0_name: var0, shadow_var1_name: var1})
   saver.restore(...checkpoint filename...)
   # var0 and var1 now hold the moving average values
   ```
   """
 
-  def __init__(self, decay, num_updates=None, zero_debias=False,
+  def __init__(self,
+               decay,
+               num_updates=None,
+               zero_debias=False,
                name="ExponentialMovingAverage"):
     """Creates a new ExponentialMovingAverage object.
 
@@ -376,7 +395,7 @@
 
     shadow variables are created with `trainable=False` and added to the
     `GraphKeys.ALL_VARIABLES` collection.  They will be returned by calls to
-    `tf.global_variables()`.
+    `tf.compat.v1.global_variables()`.
 
     Returns an op that updates all shadow variables from the current value of
     their associated variables.
@@ -386,8 +405,8 @@
     be called in a loop.
 
     Args:
-      var_list: A list of Variable or Tensor objects. The variables
-        and Tensors must be of types bfloat16, float16, float32, or float64.
+      var_list: A list of Variable or Tensor objects. The variables and Tensors
+        must be of types bfloat16, float16, float32, or float64.
 
     Returns:
       An Operation that updates the moving averages.
@@ -417,10 +436,11 @@
         # tensors, we rely on the existing device allocation mechanism.
         with ops.init_scope():
           if isinstance(var, variables.Variable):
-            avg = slot_creator.create_slot(var,
-                                           var.initialized_value(),
-                                           self.name,
-                                           colocate_with_primary=True)
+            avg = slot_creator.create_slot(
+                var,
+                var.initialized_value(),
+                self.name,
+                colocate_with_primary=True)
             # NOTE(mrry): We only add `tf.Variable` objects to the
             # `MOVING_AVERAGE_VARIABLES` collection.
             ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
@@ -428,9 +448,9 @@
             avg = slot_creator.create_zeros_slot(
                 var,
                 self.name,
-                colocate_with_primary=(var.op.type in ["Variable",
-                                                       "VariableV2",
-                                                       "VarHandleOp"]))
+                colocate_with_primary=(var.op.type in [
+                    "Variable", "VariableV2", "VarHandleOp"
+                ]))
             if self._zero_debias:
               zero_debias_true.add(avg)
         self._averages[var] = avg
@@ -438,16 +458,16 @@
     with ops.name_scope(self.name) as scope:
       decay = ops.convert_to_tensor(self._decay, name="decay")
       if self._num_updates is not None:
-        num_updates = math_ops.cast(self._num_updates,
-                                    dtypes.float32,
-                                    name="num_updates")
+        num_updates = math_ops.cast(
+            self._num_updates, dtypes.float32, name="num_updates")
         decay = math_ops.minimum(decay,
                                  (1.0 + num_updates) / (10.0 + num_updates))
       updates = []
       for var in var_list:
         zero_debias = self._averages[var] in zero_debias_true
-        updates.append(assign_moving_average(
-            self._averages[var], var, decay, zero_debias=zero_debias))
+        updates.append(
+            assign_moving_average(
+                self._averages[var], var, decay, zero_debias=zero_debias))
       return control_flow_ops.group(*updates, name=scope)
 
   def average(self, var):
@@ -472,7 +492,7 @@
     To restore variables, you have to know the name of the shadow variables.
     That name and the original variable can then be passed to a `Saver()` object
     to restore the variable from the moving average value with:
-      `saver = tf.train.Saver({ema.average_name(var): var})`
+      `saver = tf.compat.v1.train.Saver({ema.average_name(var): var})`
 
     `average_name()` can be called whether or not `apply()` has been called.
 
@@ -499,7 +519,7 @@
 
     ```python
       variables_to_restore = ema.variables_to_restore()
-      saver = tf.train.Saver(variables_to_restore)
+      saver = tf.compat.v1.train.Saver(variables_to_restore)
     ```
 
     Below is an example of such mapping:
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index ac9d4c8..dd41495 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -434,14 +434,14 @@
 
   Raises:
     ValueError: if `sess` is None and there isn't any default session.
-    TypeError: if `sess` is not a `tf.Session` object.
+    TypeError: if `sess` is not a `tf.compat.v1.Session` object.
 
   Returns:
     A list of threads.
 
   Raises:
     RuntimeError: If called with eager execution enabled.
-    ValueError: If called without a default `tf.Session` registered.
+    ValueError: If called without a default `tf.compat.v1.Session` registered.
 
   @compatibility(eager)
   Not compatible with eager execution. To ingest data under eager execution,
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 215fc39..26eafaa 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -56,7 +56,6 @@
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
-
 # TODO(allenl): Remove these aliases once all users are migrated off.
 get_checkpoint_state = checkpoint_management.get_checkpoint_state
 update_checkpoint_state = checkpoint_management.update_checkpoint_state
@@ -174,13 +173,11 @@
     tensors = []
     for spec in saveable.specs:
       tensors.append(
-          io_ops.restore_v2(
-              filename_tensor,
-              [spec.name],
-              [spec.slice_spec],
-              [spec.dtype])[0])
+          io_ops.restore_v2(filename_tensor, [spec.name], [spec.slice_spec],
+                            [spec.dtype])[0])
 
     return tensors
+
   # pylint: enable=unused-argument
 
   def sharded_filename(self, filename_tensor, shard, num_shards):
@@ -217,8 +214,8 @@
     from each device.
 
     Args:
-      checkpoint_prefix: scalar String Tensor.  Interpreted *NOT AS A
-        FILENAME*, but as a prefix of a V2 checkpoint;
+      checkpoint_prefix: scalar String Tensor.  Interpreted *NOT AS A FILENAME*,
+        but as a prefix of a V2 checkpoint;
       per_device: A list of (device, BaseSaverBuilder.VarToSave) pairs, as
         returned by _GroupByDevices().
 
@@ -319,8 +316,8 @@
       saveables: A list of SaveableObject objects.
       restore_sequentially: True if we want to restore variables sequentially
         within a shard.
-      reshape: True if we want to reshape loaded tensors to the shape of
-        the corresponding variable.
+      reshape: True if we want to reshape loaded tensors to the shape of the
+        corresponding variable.
       preferred_shard: Shard to open first when loading a sharded file.
       name: Name for the returned op.
 
@@ -361,12 +358,12 @@
 
     Args:
       filename_tensor: Tensor for the path of the file to load.
-      per_device: A list of (device, SaveableObject) pairs, as
-        returned by _GroupByDevices().
+      per_device: A list of (device, SaveableObject) pairs, as returned by
+        _GroupByDevices().
       restore_sequentially: True if we want to restore variables sequentially
         within a shard.
-      reshape: True if we want to reshape loaded tensors to the shape of
-        the corresponding variable.
+      reshape: True if we want to reshape loaded tensors to the shape of the
+        corresponding variable.
 
     Returns:
       An Operation that restores the variables.
@@ -424,14 +421,13 @@
 
     Args:
       names_to_saveables: A dictionary mapping name to a Variable or
-        SaveableObject. Each name will be associated with the
-        corresponding variable in the checkpoint.
-      reshape: If True, allow restoring parameters from a checkpoint
-        that where the parameters have a different shape.  This is
-        only needed when you try to restore from a Dist-Belief checkpoint,
-        and only some times.
-      sharded: If True, shard the checkpoints, one per device that has
-        Variable nodes.
+        SaveableObject. Each name will be associated with the corresponding
+        variable in the checkpoint.
+      reshape: If True, allow restoring parameters from a checkpoint where the
+        parameters have a different shape.  This is only needed when you try to
+        restore from a Dist-Belief checkpoint, and only sometimes.
+      sharded: If True, shard the checkpoints, one per device that has Variable
+        nodes.
       max_to_keep: Maximum number of checkpoints to keep.  As new checkpoints
         are created, old ones are deleted.  If None or 0, no checkpoints are
         deleted from the filesystem but only the last one is kept in the
@@ -597,8 +593,8 @@
     if len(savers) > 1:
       raise RuntimeError(
           "More than one item in collection {}. "
-          "Please indicate which one to use by passing it to the constructor.".
-          format(collection_key))
+          "Please indicate which one to use by passing it to the constructor."
+          .format(collection_key))
     return savers[0]
   saver = Saver(sharded=True, allow_empty=True)
   if saver is not None:
@@ -662,9 +658,9 @@
   ```python
   ...
   # Create a saver.
-  saver = tf.train.Saver(...variables...)
+  saver = tf.compat.v1.train.Saver(...variables...)
   # Launch the graph and train, saving the model every 1,000 steps.
-  sess = tf.Session()
+  sess = tf.compat.v1.Session()
   for step in xrange(1000000):
       sess.run(..training_op..)
       if step % 1000 == 0:
@@ -717,13 +713,13 @@
     v2 = tf.Variable(..., name='v2')
 
     # Pass the variables as a dict:
-    saver = tf.train.Saver({'v1': v1, 'v2': v2})
+    saver = tf.compat.v1.train.Saver({'v1': v1, 'v2': v2})
 
     # Or pass them as a list.
-    saver = tf.train.Saver([v1, v2])
+    saver = tf.compat.v1.train.Saver([v1, v2])
     # Passing a list is equivalent to passing a dict with the variable op names
     # as keys:
-    saver = tf.train.Saver({v.op.name: v for v in [v1, v2]})
+    saver = tf.compat.v1.train.Saver({v.op.name: v for v in [v1, v2]})
     ```
 
     The optional `reshape` argument, if `True`, allows restoring a variable from
@@ -738,35 +734,33 @@
       var_list: A list of `Variable`/`SaveableObject`, or a dictionary mapping
         names to `SaveableObject`s. If `None`, defaults to the list of all
         saveable objects.
-      reshape: If `True`, allows restoring parameters from a checkpoint
-        where the variables have a different shape.
+      reshape: If `True`, allows restoring parameters from a checkpoint where
+        the variables have a different shape.
       sharded: If `True`, shard the checkpoints, one per device.
-      max_to_keep: Maximum number of recent checkpoints to keep.
-        Defaults to 5.
-      keep_checkpoint_every_n_hours: How often to keep checkpoints.
-        Defaults to 10,000 hours.
+      max_to_keep: Maximum number of recent checkpoints to keep. Defaults to 5.
+      keep_checkpoint_every_n_hours: How often to keep checkpoints. Defaults to
+        10,000 hours.
       name: String.  Optional name to use as a prefix when adding operations.
       restore_sequentially: A `Bool`, which if true, causes restore of different
         variables to happen sequentially within each device.  This can lower
         memory usage when restoring very large models.
       saver_def: Optional `SaverDef` proto to use instead of running the
-        builder. This is only useful for specialty code that wants to recreate
-        a `Saver` object for a previously built `Graph` that had a `Saver`.
-        The `saver_def` proto should be the one returned by the
-        `as_saver_def()` call of the `Saver` that was created for that `Graph`.
+        builder. This is only useful for specialty code that wants to recreate a
+        `Saver` object for a previously built `Graph` that had a `Saver`. The
+        `saver_def` proto should be the one returned by the `as_saver_def()`
+        call of the `Saver` that was created for that `Graph`.
       builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
         Defaults to `BulkSaverBuilder()`.
       defer_build: If `True`, defer adding the save and restore ops to the
         `build()` call. In that case `build()` should be called before
         finalizing the graph or using the saver.
-      allow_empty: If `False` (default) raise an error if there are no
-        variables in the graph. Otherwise, construct the saver anyway and make
-        it a no-op.
+      allow_empty: If `False` (default) raise an error if there are no variables
+        in the graph. Otherwise, construct the saver anyway and make it a no-op.
       write_version: controls what format to use when saving checkpoints.  It
         also affects certain filepath matching logic.  The V2 format is the
-        recommended choice: it is much more optimized than V1 in terms of
-        memory required and latency incurred during restore.  Regardless of
-        this flag, the Saver is able to restore from both V2 and V1 checkpoints.
+        recommended choice: it is much more optimized than V1 in terms of memory
+        required and latency incurred during restore.  Regardless of this
+        flag, the Saver is able to restore from both V2 and V1 checkpoints.
       pad_step_number: if True, pads the global step number in the checkpoint
         filepaths to some fixed width (8 by default).  This is turned off by
         default.
@@ -877,7 +871,8 @@
           name=self._name,
           restore_sequentially=self._restore_sequentially,
           filename=checkpoint_path,
-          build_save=build_save, build_restore=build_restore)
+          build_save=build_save,
+          build_restore=build_restore)
     elif self.saver_def and self._name:
       # Since self._name is used as a name_scope by builder(), we are
       # overloading the use of this field to represent the "import_scope" as
@@ -997,8 +992,8 @@
         saver_def.filename_tensor_name, export_scope)
     saver_def.save_tensor_name = ops.strip_name_scope(
         saver_def.save_tensor_name, export_scope)
-    saver_def.restore_op_name = ops.strip_name_scope(
-        saver_def.restore_op_name, export_scope)
+    saver_def.restore_op_name = ops.strip_name_scope(saver_def.restore_op_name,
+                                                     export_scope)
     return saver_def
 
   @staticmethod
@@ -1066,8 +1061,12 @@
     Args:
       checkpoint_paths: a list of checkpoint paths.
     """
-    mtimes = checkpoint_management.get_checkpoint_mtimes(checkpoint_paths)
-    self.set_last_checkpoints_with_time(list(zip(checkpoint_paths, mtimes)))
+    checkpoints_with_mtimes = []
+    for checkpoint_path in checkpoint_paths:
+      mtime = checkpoint_management.get_checkpoint_mtimes([checkpoint_path])
+      if mtime:
+        checkpoints_with_mtimes.append((checkpoint_path, mtime[0]))
+    self.set_last_checkpoints_with_time(checkpoints_with_mtimes)
 
   def save(self,
            sess,
@@ -1092,14 +1091,13 @@
     Args:
       sess: A Session to use to save the variables.
       save_path: String.  Prefix of filenames created for the checkpoint.
-      global_step: If provided the global step number is appended to
-        `save_path` to create the checkpoint filenames. The optional argument
-        can be a `Tensor`, a `Tensor` name or an integer.
+      global_step: If provided the global step number is appended to `save_path`
+        to create the checkpoint filenames. The optional argument can be a
+        `Tensor`, a `Tensor` name or an integer.
       latest_filename: Optional name for the protocol buffer file that will
-        contains the list of most recent checkpoints.  That file,
-        kept in the same directory as the checkpoint files, is automatically
-        managed by the saver to keep track of recent checkpoints.  Defaults to
-        'checkpoint'.
+        contain the list of most recent checkpoints.  That file, kept in the
+        same directory as the checkpoint files, is automatically managed by the
+        saver to keep track of recent checkpoints.  Defaults to 'checkpoint'.
       meta_graph_suffix: Suffix for `MetaGraphDef` file. Defaults to 'meta'.
       write_meta_graph: `Boolean` indicating whether or not to write the meta
         graph file.
@@ -1107,7 +1105,8 @@
         `CheckpointStateProto`.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        [Stripping Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
       save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
         which in the same directory of save_path and with `_debug` added before
         the file extension. This is only enabled when `write_meta_graph` is
@@ -1151,8 +1150,7 @@
         checkpoint_file = "%s-%s" % (save_path, "{:08d}".format(global_step))
     else:
       checkpoint_file = save_path
-      if os.path.basename(
-          save_path) == latest_filename and not self._sharded:
+      if os.path.basename(save_path) == latest_filename and not self._sharded:
         # Guard against collision between data file and checkpoint state file.
         raise ValueError(
             "'latest_filename' collides with 'save_path': '%s' and '%s'" %
@@ -1197,7 +1195,8 @@
       if not context.executing_eagerly():
         with sess.graph.as_default():
           self.export_meta_graph(
-              meta_graph_filename, strip_default_attrs=strip_default_attrs,
+              meta_graph_filename,
+              strip_default_attrs=strip_default_attrs,
               save_debug_info=save_debug_info)
 
     if self._is_empty:
@@ -1225,11 +1224,12 @@
       clear_devices: Whether or not to clear the device field for an `Operation`
         or `Tensor` during export.
       clear_extraneous_savers: Remove any Saver-related information from the
-        graph (both Save/Restore ops and SaverDefs) that are not associated
-        with this Saver.
+        graph (both Save/Restore ops and SaverDefs) that are not associated with
+        this Saver.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        [Stripping Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
       save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
         which in the same directory of filename and with `_debug` added before
         the file extension.
@@ -1274,8 +1274,8 @@
       raise ValueError("Can't load save_path when it is None.")
 
     if not checkpoint_management.checkpoint_exists(compat.as_text(save_path)):
-      raise ValueError("The passed save_path is not a valid checkpoint: "
-                       + compat.as_text(save_path))
+      raise ValueError("The passed save_path is not a valid checkpoint: " +
+                       compat.as_text(save_path))
 
     logging.info("Restoring parameters from %s", compat.as_text(save_path))
     try:
@@ -1330,13 +1330,15 @@
       key: One of the GraphKeys or user-defined string.
       export_scope: Optional `string`. Name scope to remove.
     """
-    meta_graph.add_collection_def(meta_graph_def, key,
-                                  export_scope=export_scope)
+    meta_graph.add_collection_def(
+        meta_graph_def, key, export_scope=export_scope)
 
 
 @tf_export(v1=["train.import_meta_graph"])
-def import_meta_graph(meta_graph_or_file, clear_devices=False,
-                      import_scope=None, **kwargs):
+def import_meta_graph(meta_graph_or_file,
+                      clear_devices=False,
+                      import_scope=None,
+                      **kwargs):
   """Recreates a Graph saved in a `MetaGraphDef` proto.
 
   This function takes a `MetaGraphDef` protocol buffer as input. If
@@ -1358,10 +1360,10 @@
   ```Python
   ...
   # Create a saver.
-  saver = tf.train.Saver(...variables...)
+  saver = tf.compat.v1.train.Saver(...variables...)
   # Remember the training_op we want to run by adding it to a collection.
-  tf.add_to_collection('train_op', train_op)
-  sess = tf.Session()
+  tf.compat.v1.add_to_collection('train_op', train_op)
+  sess = tf.compat.v1.Session()
   for step in xrange(1000000):
       sess.run(train_op)
       if step % 1000 == 0:
@@ -1374,12 +1376,13 @@
   the model from scratch.
 
   ```Python
-  with tf.Session() as sess:
-    new_saver = tf.train.import_meta_graph('my-save-dir/my-model-10000.meta')
+  with tf.compat.v1.Session() as sess:
+    new_saver = tf.compat.v1.train.import_meta_graph(
+        'my-save-dir/my-model-10000.meta')
     new_saver.restore(sess, 'my-save-dir/my-model-10000')
-    # tf.get_collection() returns a list. In this example we only want the
-    # first one.
-    train_op = tf.get_collection('train_op')[0]
+    # tf.compat.v1.get_collection() returns a list. In this example we only want
+    # the first one.
+    train_op = tf.compat.v1.get_collection('train_op')[0]
     for step in xrange(1000000):
       sess.run(train_op)
   ```
@@ -1393,14 +1396,14 @@
 
   ```Python
   # Saving contents and operations.
-  v1 = tf.placeholder(tf.float32, name="v1")
-  v2 = tf.placeholder(tf.float32, name="v2")
+  v1 = tf.compat.v1.placeholder(tf.float32, name="v1")
+  v2 = tf.compat.v1.placeholder(tf.float32, name="v2")
   v3 = tf.mul(v1, v2)
   vx = tf.Variable(10.0, name="vx")
   v4 = tf.add(v3, vx, name="v4")
-  saver = tf.train.Saver([vx])
-  sess = tf.Session()
-  sess.run(tf.initialize_all_variables())
+  saver = tf.compat.v1.train.Saver([vx])
+  sess = tf.compat.v1.Session()
+  sess.run(tf.compat.v1.global_variables_initializer())
   sess.run(vx.assign(tf.add(vx, vx)))
   result = sess.run(v4, feed_dict={v1:12.0, v2:3.3})
   print(result)
@@ -1411,8 +1414,8 @@
 
   ```Python
   # Restoring variables and running operations.
-  saver = tf.train.import_meta_graph("./model_ex1.meta")
-  sess = tf.Session()
+  saver = tf.compat.v1.train.import_meta_graph("./model_ex1.meta")
+  sess = tf.compat.v1.Session()
   saver.restore(sess, "./model_ex1")
   result = sess.run("v4:0", feed_dict={"v1:0": 12.0, "v2:0": 3.3})
   print(result)
@@ -1441,13 +1444,16 @@
   execution is enabled.
   @end_compatibility
   """  # pylint: disable=g-doc-exception
-  return _import_meta_graph_with_return_elements(
-      meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
+  return _import_meta_graph_with_return_elements(meta_graph_or_file,
+                                                 clear_devices, import_scope,
+                                                 **kwargs)[0]
 
 
-def _import_meta_graph_with_return_elements(
-    meta_graph_or_file, clear_devices=False, import_scope=None,
-    return_elements=None, **kwargs):
+def _import_meta_graph_with_return_elements(meta_graph_or_file,
+                                            clear_devices=False,
+                                            import_scope=None,
+                                            return_elements=None,
+                                            **kwargs):
   """Import MetaGraph, and return both a saver and returned elements."""
   if context.executing_eagerly():
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
@@ -1466,13 +1472,13 @@
           return_elements=return_elements,
           **kwargs))
 
-  saver = _create_saver_from_imported_meta_graph(
-      meta_graph_def, import_scope, imported_vars)
+  saver = _create_saver_from_imported_meta_graph(meta_graph_def, import_scope,
+                                                 imported_vars)
   return saver, imported_return_elements
 
 
-def _create_saver_from_imported_meta_graph(
-    meta_graph_def, import_scope, imported_vars):
+def _create_saver_from_imported_meta_graph(meta_graph_def, import_scope,
+                                           imported_vars):
   """Return a saver for restoring variable values to an imported MetaGraph."""
   if meta_graph_def.HasField("saver_def"):
     # Infer the scope that is prepended by `import_scoped_meta_graph`.
@@ -1510,7 +1516,9 @@
                       save_debug_info=False,
                       **kwargs):
   # pylint: disable=line-too-long
-  """Returns `MetaGraphDef` proto. Optionally writes it to filename.
+  """Returns `MetaGraphDef` proto.
+
+  Optionally writes it to filename.
 
   This function exports the graph, saver, and collection objects into
   `MetaGraphDef` protocol buffer with the intention of it being imported
@@ -1518,29 +1526,29 @@
   a subgraph.
 
   Args:
-    filename: Optional filename including the path for writing the
-      generated `MetaGraphDef` protocol buffer.
+    filename: Optional filename including the path for writing the generated
+      `MetaGraphDef` protocol buffer.
     meta_info_def: `MetaInfoDef` protocol buffer.
     graph_def: `GraphDef` protocol buffer.
     saver_def: `SaverDef` protocol buffer.
     collection_list: List of string keys to collect.
     as_text: If `True`, writes the `MetaGraphDef` as an ASCII proto.
     graph: The `Graph` to export. If `None`, use the default graph.
-    export_scope: Optional `string`. Name scope under which to extract
-      the subgraph. The scope name will be striped from the node definitions
-      for easy import later into new name scopes. If `None`, the whole graph
-      is exported. graph_def and export_scope cannot both be specified.
+    export_scope: Optional `string`. Name scope under which to extract the
+      subgraph. The scope name will be stripped from the node definitions for
+      easy import later into new name scopes. If `None`, the whole graph is
+      exported. graph_def and export_scope cannot both be specified.
     clear_devices: Whether or not to clear the device field for an `Operation`
       or `Tensor` during export.
-    clear_extraneous_savers: Remove any Saver-related information from the
-        graph (both Save/Restore ops and SaverDefs) that are not associated
-        with the provided SaverDef.
+    clear_extraneous_savers: Remove any Saver-related information from the graph
+      (both Save/Restore ops and SaverDefs) that are not associated with the
+      provided SaverDef.
     strip_default_attrs: Boolean. If `True`, default-valued attributes will be
       removed from the NodeDefs. For a detailed guide, see
       [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
     save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
-      which in the same directory of filename and with `_debug` added before
-      the file extend.
+      which is in the same directory as filename and with `_debug` added before
+      the file extension.
     **kwargs: Optional keyed arguments.
 
   Returns:
@@ -1603,10 +1611,8 @@
     Dictionary mapping tensor names to checkpoint keys.
   """
   reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
-  object_graph_string = reader.get_tensor(
-      trackable.OBJECT_GRAPH_PROTO_KEY)
-  object_graph_proto = (
-      trackable_object_graph_pb2.TrackableObjectGraph())
+  object_graph_string = reader.get_tensor(trackable.OBJECT_GRAPH_PROTO_KEY)
+  object_graph_proto = (trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   names_to_keys = {}
   for node in object_graph_proto.nodes:
@@ -1615,9 +1621,11 @@
   return names_to_keys
 
 
-def saver_from_object_based_checkpoint(
-    checkpoint_path, var_list=None, builder=None, names_to_keys=None,
-    cached_saver=None):
+def saver_from_object_based_checkpoint(checkpoint_path,
+                                       var_list=None,
+                                       builder=None,
+                                       names_to_keys=None,
+                                       cached_saver=None):
   """Return a `Saver` which reads from an object-based checkpoint.
 
   This function validates that all variables in the variables list are remapped
@@ -1659,8 +1667,8 @@
     try:
       names_to_keys = object_graph_key_mapping(checkpoint_path)
     except errors.NotFoundError:
-      raise ValueError("Checkpoint in %s not an object-based checkpoint."
-                       % checkpoint_path)
+      raise ValueError("Checkpoint in %s not an object-based checkpoint." %
+                       checkpoint_path)
   if var_list is None:
     var_list = variables._all_saveable_objects()  # pylint: disable=protected-access
   if builder is None:
@@ -1677,7 +1685,8 @@
     extra_names = previous_names - current_names
     intersecting_names = previous_names.intersection(current_names)
     raise errors.NotFoundError(
-        None, None,
+        None,
+        None,
         message=(
             "\n\nExisting variables not in the checkpoint: %s\n\n"
             "Variables names when this checkpoint was written which don't "
@@ -1695,9 +1704,9 @@
             "existed, and if variable names have changed you may need to "
             "make this a dictionary with the old names as keys. If you're "
             "using an Estimator, you'll need to return a tf.train.Saver "
-            "inside a tf.train.Scaffold from your model_fn.")
-        % (", ".join(sorted(missing_names)), ", ".join(sorted(extra_names)),
-           len(intersecting_names)))
+            "inside a tf.train.Scaffold from your model_fn.") %
+        (", ".join(sorted(missing_names)), ", ".join(
+            sorted(extra_names)), len(intersecting_names)))
   for saveable in saveables:
     for spec in saveable.specs:
       spec.name = names_to_keys[spec.name]
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 89e64a5..99492bc 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -1475,6 +1475,67 @@
           gfile.Exists(checkpoint_management.meta_graph_filename(s1)))
 
 
+class RecoverLastCheckpointsTest(test.TestCase):
+
+  def _get_test_dir(self, dirname):
+    test_dir = os.path.join(self.get_temp_dir(), dirname)
+    gfile.MakeDirs(test_dir)
+    return test_dir
+
+  def assertCheckpointState(self, model_checkpoint_path,
+                            all_model_checkpoint_paths, save_dir):
+    checkpoint_state = checkpoint_management.get_checkpoint_state(save_dir)
+    self.assertEqual(checkpoint_state.model_checkpoint_path,
+                     model_checkpoint_path)
+    self.assertEqual(checkpoint_state.all_model_checkpoint_paths,
+                     all_model_checkpoint_paths)
+
+  def test_recover_last_checkpoints(self):
+    with context.eager_mode():
+      save_dir = self._get_test_dir("recover_last_checkpoints")
+
+      v = variable_scope.variable(10.0, name="v")
+      save = saver_module.Saver({"v": v}, max_to_keep=10)
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual([], save.last_checkpoints)
+
+      s1 = save.save(None, os.path.join(save_dir, "ckpt-1"))
+      s2 = save.save(None, os.path.join(save_dir, "ckpt-2"))
+      s3 = save.save(None, os.path.join(save_dir, "ckpt-3"))
+      self.assertEqual([s1, s2, s3], save.last_checkpoints)
+      self.assertTrue(checkpoint_management.checkpoint_exists(s1))
+      self.assertTrue(checkpoint_management.checkpoint_exists(s2))
+      self.assertTrue(checkpoint_management.checkpoint_exists(s3))
+      self.assertCheckpointState(
+          model_checkpoint_path=s3,
+          all_model_checkpoint_paths=[s1, s2, s3],
+          save_dir=save_dir)
+
+      # Create another saver and recover last checkpoints.
+      save2 = saver_module.Saver({"v": v}, max_to_keep=10)
+      self.assertEqual([], save2.last_checkpoints)
+      save2.recover_last_checkpoints([s1, s2, s3])
+      self.assertEqual([s1, s2, s3], save2.last_checkpoints)
+
+      # Remove a checkpoint and check that last checkpoints are
+      # restored correctly.
+      for fname in gfile.Glob("{}*".format(s1)):
+        gfile.Remove(fname)
+      self.assertFalse(checkpoint_management.checkpoint_exists(s1))
+
+      # Create another saver and recover last checkpoints. The removed
+      # checkpoint would be correctly omitted.
+      save3 = saver_module.Saver({"v": v}, max_to_keep=10)
+      self.assertEqual([], save3.last_checkpoints)
+      save3.recover_last_checkpoints([s1, s2, s3])
+      self.assertEqual([s2, s3], save3.last_checkpoints)
+      s4 = save3.save(None, os.path.join(save_dir, "ckpt-4"))
+      self.assertCheckpointState(
+          model_checkpoint_path=s4,
+          all_model_checkpoint_paths=[s2, s3, s4],
+          save_dir=save_dir)
+
+
 class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index c5ca2ac..bd9c2382 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -32,21 +32,19 @@
   """Creates a `tf.train.ServerDef` protocol buffer.
 
   Args:
-    server_or_cluster_def: A `tf.train.ServerDef` or
-      `tf.train.ClusterDef` protocol buffer, or a
-      `tf.train.ClusterSpec` object, describing the server to be
-      defined and/or the cluster of which it is a member.
-    job_name: (Optional.) Specifies the name of the job of which the server
-      is a member. Defaults to the value in `server_or_cluster_def`, if
-      specified.
+    server_or_cluster_def: A `tf.train.ServerDef` or `tf.train.ClusterDef`
+      protocol buffer, or a `tf.train.ClusterSpec` object, describing the server
+      to be defined and/or the cluster of which it is a member.
+    job_name: (Optional.) Specifies the name of the job of which the server is a
+      member. Defaults to the value in `server_or_cluster_def`, if specified.
     task_index: (Optional.) Specifies the task index of the server in its job.
       Defaults to the value in `server_or_cluster_def`, if specified. Otherwise
       defaults to 0 if the server's job has only one task.
     protocol: (Optional.) Specifies the protocol to be used by the server.
-      Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the value
-      in `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`.
-    config: (Options.) A `tf.ConfigProto` that specifies default configuration
-      options for all sessions that run on this server.
+      Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the value in
+      `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`.
+    config: (Optional.) A `tf.compat.v1.ConfigProto` that specifies default
+      configuration options for all sessions that run on this server.
 
   Returns:
     A `tf.train.ServerDef`.
@@ -88,7 +86,9 @@
 
     server_def = tensorflow_server_pb2.ServerDef(
         cluster=cluster_spec.as_cluster_def(),
-        job_name=job_name, task_index=task_index, protocol=protocol)
+        job_name=job_name,
+        task_index=task_index,
+        protocol=protocol)
     if config is not None:
       server_def.default_session_config.MergeFrom(config)
   return server_def
@@ -99,8 +99,8 @@
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
-  A `tf.train.Server` instance encapsulates a set of devices and a
-  `tf.Session` target that
+  A `tf.distribute.Server` instance encapsulates a set of devices and a
+  `tf.compat.v1.Session` target that
   can participate in distributed training. A server belongs to a
   cluster (specified by a `tf.train.ClusterSpec`), and
   corresponds to a particular task in a named job. The server can
@@ -120,31 +120,30 @@
     override any information provided in `server_or_cluster_def`.
 
     Args:
-      server_or_cluster_def: A `tf.train.ServerDef` or
-        `tf.train.ClusterDef` protocol buffer, or a
-        `tf.train.ClusterSpec` object, describing the server to be
-        created and/or the cluster of which it is a member.
-      job_name: (Optional.) Specifies the name of the job of which the server
-        is a member. Defaults to the value in `server_or_cluster_def`, if
+      server_or_cluster_def: A `tf.train.ServerDef` or `tf.train.ClusterDef`
+        protocol buffer, or a `tf.train.ClusterSpec` object, describing the
+        server to be created and/or the cluster of which it is a member.
+      job_name: (Optional.) Specifies the name of the job of which the server is
+        a member. Defaults to the value in `server_or_cluster_def`, if
         specified.
-      task_index: (Optional.) Specifies the task index of the server in its
-        job. Defaults to the value in `server_or_cluster_def`, if specified.
+      task_index: (Optional.) Specifies the task index of the server in its job.
+        Defaults to the value in `server_or_cluster_def`, if specified.
         Otherwise defaults to 0 if the server's job has only one task.
       protocol: (Optional.) Specifies the protocol to be used by the server.
-        Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the
-        value in `server_or_cluster_def`, if specified. Otherwise defaults to
+        Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the value
+        in `server_or_cluster_def`, if specified. Otherwise defaults to
         `"grpc"`.
-      config: (Options.) A `tf.ConfigProto` that specifies default
+      config: (Optional.) A `tf.compat.v1.ConfigProto` that specifies default
         configuration options for all sessions that run on this server.
-      start: (Optional.) Boolean, indicating whether to start the server
-        after creating it. Defaults to `True`.
+      start: (Optional.) Boolean, indicating whether to start the server after
+        creating it. Defaults to `True`.
 
     Raises:
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         creating the TensorFlow server.
     """
-    self._server_def = _make_server_def(server_or_cluster_def,
-                                        job_name, task_index, protocol, config)
+    self._server_def = _make_server_def(server_or_cluster_def, job_name,
+                                        task_index, protocol, config)
     self._server = c_api.TF_NewServer(self._server_def.SerializeToString())
     if start:
       self.start()
@@ -195,15 +194,15 @@
 
   @property
   def target(self):
-    """Returns the target for a `tf.Session` to connect to this server.
+    """Returns the target for a `tf.compat.v1.Session` to connect to this server.
 
     To create a
-    `tf.Session` that
+    `tf.compat.v1.Session` that
     connects to this server, use the following snippet:
 
     ```python
-    server = tf.train.Server(...)
-    with tf.Session(server.target):
+    server = tf.distribute.Server(...)
+    with tf.compat.v1.Session(server.target):
       # ...
     ```
 
@@ -217,22 +216,24 @@
     """Creates a new single-process cluster running on the local host.
 
     This method is a convenience wrapper for creating a
-    `tf.train.Server` with a `tf.train.ServerDef` that specifies a
+    `tf.distribute.Server` with a `tf.train.ServerDef` that specifies a
     single-process cluster containing a single task in a job called
     `"local"`.
 
     Args:
-      config: (Options.) A `tf.ConfigProto` that specifies default
+      config: (Optional.) A `tf.compat.v1.ConfigProto` that specifies default
         configuration options for all sessions that run on this server.
       start: (Optional.) Boolean, indicating whether to start the server after
         creating it. Defaults to `True`.
 
     Returns:
-      A local `tf.train.Server`.
+      A local `tf.distribute.Server`.
     """
     # Specifying port 0 means that the OS will choose a free port for the
     # server.
-    return Server({"local": ["localhost:0"]}, protocol="grpc", config=config,
+    return Server({"local": ["localhost:0"]},
+                  protocol="grpc",
+                  config=config,
                   start=start)
 
 
@@ -242,7 +243,7 @@
 
   A `tf.train.ClusterSpec` represents the set of processes that
   participate in a distributed TensorFlow computation. Every
-  `tf.train.Server` is constructed in a particular cluster.
+  `tf.distribute.Server` is constructed in a particular cluster.
 
   To create a cluster with two jobs and five tasks, you specify the
   mapping from job names to lists of network addresses (typically
@@ -272,10 +273,9 @@
     """Creates a `ClusterSpec`.
 
     Args:
-      cluster: A dictionary mapping one or more job names to (i) a
-        list of network addresses, or (ii) a dictionary mapping integer
-        task indices to network addresses; or a `tf.train.ClusterDef`
-        protocol buffer.
+      cluster: A dictionary mapping one or more job names to (i) a list of
+        network addresses, or (ii) a dictionary mapping integer task indices to
+        network addresses; or a `tf.train.ClusterDef` protocol buffer.
 
     Raises:
       TypeError: If `cluster` is not a dictionary mapping strings to lists
@@ -298,14 +298,16 @@
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
         self._cluster_spec[job_def.name] = {
-            i: t for i, t in job_def.tasks.items()}
+            i: t for i, t in job_def.tasks.items()
+        }
     elif isinstance(cluster, ClusterSpec):
       self._cluster_def = cluster_pb2.ClusterDef()
       self._cluster_def.MergeFrom(cluster.as_cluster_def())
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
         self._cluster_spec[job_def.name] = {
-            i: t for i, t in job_def.tasks.items()}
+            i: t for i, t in job_def.tasks.items()
+        }
     else:
       raise TypeError("`cluster` must be a dictionary mapping one or more "
                       "job names to lists of network addresses, or a "
@@ -326,7 +328,8 @@
   def __str__(self):
     key_values = self.as_dict()
     string_items = [
-        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)]
+        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)
+    ]
     return "ClusterSpec({" + ", ".join(string_items) + "})"
 
   def as_dict(self):
@@ -427,8 +430,8 @@
     try:
       return job[task_index]
     except KeyError:
-      raise ValueError("No task with index %r in job %r"
-                       % (task_index, job_name))
+      raise ValueError("No task with index %r in job %r" %
+                       (task_index, job_name))
 
   def job_tasks(self, job_name):
     """Returns a mapping from task ID to address in the given job.
@@ -482,6 +485,6 @@
         try:
           task_address = compat.as_bytes(task_address)
         except TypeError:
-          raise TypeError(
-              "Task address %r must be bytes or unicode" % task_address)
+          raise TypeError("Task address %r must be bytes or unicode" %
+                          task_address)
         job_def.tasks[i] = task_address
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index db45d80..ea9f70b 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -107,7 +107,7 @@
     self.assertAllEqual(2.0, sess.run(v1))
 
   def _useRPCConfig(self):
-    """Return a `tf.ConfigProto` that ensures we use the RPC stack for tests.
+    """Return a `tf.compat.v1.ConfigProto` that ensures we use the RPC stack for tests.
 
     This configuration ensures that we continue to exercise the gRPC
     stack when testing, rather than using the in-process optimization,
@@ -115,7 +115,7 @@
     master in the same process.
 
     Returns:
-      A `tf.ConfigProto`.
+      A `tf.compat.v1.ConfigProto`.
     """
     return config_pb2.ConfigProto(rpc_options=config_pb2.RPCOptions(
         use_rpc_for_inprocess_master=True))
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index c9a0c56..1ceddf7 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -174,8 +174,8 @@
       self.assertFalse(initialized)
       sess.run(v.initializer)
       self.assertEquals(1, sess.run(v))
-      saver.save(sess,
-                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
+      saver.save(sess, os.path.join(checkpoint_dir,
+                                    "recover_session_checkpoint"))
     self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
     self._test_recovered_variable(
         checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
@@ -202,9 +202,9 @@
   def testInitWithNoneLocalInitOpError(self):
     # Creating a SessionManager with a None local_init_op but
     # non-None ready_for_local_init_op raises ValueError
-    with self.assertRaisesRegexp(ValueError,
-                                 "If you pass a ready_for_local_init_op "
-                                 "you must also pass a local_init_op "):
+    with self.assertRaisesRegexp(
+        ValueError, "If you pass a ready_for_local_init_op "
+        "you must also pass a local_init_op "):
       session_manager.SessionManager(
           ready_for_local_init_op=variables.report_uninitialized_variables(
               variables.global_variables()),
@@ -231,8 +231,8 @@
       self.assertFalse(initialized)
       sess.run(v.initializer)
       self.assertEquals(1, sess.run(v))
-      saver.save(sess,
-                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
+      saver.save(sess, os.path.join(checkpoint_dir,
+                                    "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
       v = variables.VariableV1(2, name="v")
@@ -266,7 +266,7 @@
 
   @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
-    # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
+    # We use ready_for_local_init_op=report_uninitialized_variables(),
     # which causes recover_session to not run local_init_op, and to return
     # initialized=False
 
@@ -290,8 +290,8 @@
       self.assertFalse(initialized)
       sess.run(v.initializer)
       self.assertEquals(1, sess.run(v))
-      saver.save(sess,
-                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
+      saver.save(sess, os.path.join(checkpoint_dir,
+                                    "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
       v = variables.VariableV1(2, name="v")
@@ -780,8 +780,8 @@
       self.assertFalse(initialized)
       sess.run(v.initializer)
       self.assertEquals(1, sess.run(v))
-      saver.save(sess,
-                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
+      saver.save(sess, os.path.join(checkpoint_dir,
+                                    "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
       v = variables.VariableV1(2, name="v")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 886ca46..e598bc2 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -68,7 +68,7 @@
 
 Above user code leads to following execution:
   call hooks.begin()
-  sess = tf.Session()
+  sess = tf.compat.v1.Session()
   call hooks.after_create_session()
   while not stop is requested:
     call hooks.before_run()
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 0868cfd..094f5f5 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -184,7 +184,7 @@
     dtype = primary.dtype
   slot_shape = primary.get_shape()
   if slot_shape.is_fully_defined():
-    initializer = init_ops.zeros_initializer(dtype)
+    initializer = init_ops.zeros_initializer()
     return create_slot_with_initializer(
         primary, initializer, slot_shape, dtype, name,
         colocate_with_primary=colocate_with_primary)
diff --git a/tensorflow/python/training/summary_io.py b/tensorflow/python/training/summary_io.py
index 970c67f..39760ab 100644
--- a/tensorflow/python/training/summary_io.py
+++ b/tensorflow/python/training/summary_io.py
@@ -56,9 +56,9 @@
     ```python
     ...create a graph...
     # Launch the graph in a session.
-    sess = tf.Session()
+    sess = tf.compat.v1.Session()
     # Create a summary writer, add the 'graph' to the event file.
-    writer = tf.summary.FileWriter(<some-directory>, sess.graph)
+    writer = tf.compat.v1.summary.FileWriter(<some-directory>, sess.graph)
     ```
 
     The other arguments to the constructor control the asynchronous writes to
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index de60dd4..91960cb 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -45,7 +45,7 @@
   """A training helper that checkpoints models and computes summaries.
 
   This class is deprecated. Please use
-  `tf.train.MonitoredTrainingSession` instead.
+  `tf.compat.v1.train.MonitoredTrainingSession` instead.
 
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
@@ -97,7 +97,7 @@
   # or job_def.name, or job_def.tasks. It's entirely up to the end user.
   # But there can be only one *chief*.
   is_chief = (server_def.task_index == 0)
-  server = tf.train.Server(server_def)
+  server = tf.distribute.Server(server_def)
 
   with tf.Graph().as_default():
     ...add operations to the graph...
@@ -140,7 +140,7 @@
   * Specifying `'grpc://hostname:port'` requests a session that uses
     the RPC interface to a specific host, and also allows the in-process
     master to access remote tensorflow workers. Often, it is
-    appropriate to pass `server.target` (for some `tf.train.Server`
+    appropriate to pass `server.target` (for some `tf.distribute.Server`
     named `server).
 
   #### Advanced use
@@ -237,17 +237,16 @@
       ready_op: 1-D string `Tensor`.  This tensor is evaluated by supervisors in
         `prepare_or_wait_for_session()` to check if the model is ready to use.
         The model is considered ready if it returns an empty array.  Defaults to
-        the tensor returned from `tf.report_uninitialized_variables()`  If
-        `None`, the model is not checked for readiness.
+        the tensor returned from `tf.compat.v1.report_uninitialized_variables()`
+        If `None`, the model is not checked for readiness.
       ready_for_local_init_op: 1-D string `Tensor`.  This tensor is evaluated by
         supervisors in `prepare_or_wait_for_session()` to check if the model is
-        ready to run the local_init_op.
-        The model is considered ready if it returns an empty array. Defaults to
-        `None`. If `None`, the model is not checked for readiness before running
-        local_init_op.
-      is_chief: If True, create a chief supervisor in charge of initializing
-        and restoring the model.  If False, create a supervisor that relies
-        on a chief supervisor for inits and restore.
+        ready to run the local_init_op. The model is considered ready if it
+        returns an empty array. Defaults to `None`. If `None`, the model is not
+        checked for readiness before running local_init_op.
+      is_chief: If True, create a chief supervisor in charge of initializing and
+        restoring the model.  If False, create a supervisor that relies on a
+        chief supervisor for inits and restore.
       init_op: `Operation`.  Used by chief supervisors to initialize the model
         when it can not be recovered.  Defaults to an `Operation` that
         initializes all global variables.  If `None`, no initialization is done
@@ -255,20 +254,19 @@
       init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
         This feed dictionary will be used when `init_op` is evaluated.
       local_init_op: `Operation`. Used by all supervisors to run initializations
-        that should run for every new supervisor instance. By default these
-        are table initializers and initializers for local variables.
-        If `None`, no further per supervisor-instance initialization is
-        done automatically.
+        that should run for every new supervisor instance. By default these are
+        table initializers and initializers for local variables. If `None`, no
+        further per supervisor-instance initialization is done automatically.
       logdir: A string.  Optional path to a directory where to checkpoint the
-        model and log events for the visualizer.  Used by chief supervisors.
-        The directory will be created if it does not exist.
-      summary_op: An `Operation` that returns a Summary for the event logs.
-        Used by chief supervisors if a `logdir` was specified.  Defaults to the
+        model and log events for the visualizer.  Used by chief supervisors. The
+        directory will be created if it does not exist.
+      summary_op: An `Operation` that returns a Summary for the event logs. Used
+        by chief supervisors if a `logdir` was specified.  Defaults to the
         operation returned from summary.merge_all().  If `None`, summaries are
         not computed automatically.
       saver: A Saver object.  Used by chief supervisors if a `logdir` was
-        specified.  Defaults to the saved returned by Saver().
-        If `None`, the model is not saved automatically.
+        specified.  Defaults to the saver returned by Saver(). If `None`, the
+        model is not saved automatically.
       global_step: An integer Tensor of size 1 that counts steps.  The value
         from 'global_step' is used in summaries and checkpoint filenames.
         Default to the op named 'global_step' in the graph if it exists, is of
@@ -280,20 +278,20 @@
         disable summaries.
       save_model_secs: Number of seconds between the creation of model
         checkpoints.  Defaults to 600 seconds.  Pass 0 to disable checkpoints.
-      recovery_wait_secs: Number of seconds between checks that the model
-        is ready.  Used by supervisors when waiting for a chief supervisor
-        to initialize or restore the model.  Defaults to 30 seconds.
+      recovery_wait_secs: Number of seconds between checks that the model is
+        ready.  Used by supervisors when waiting for a chief supervisor to
+        initialize or restore the model.  Defaults to 30 seconds.
       stop_grace_secs: Grace period, in seconds, given to running threads to
         stop when `stop()` is called.  Defaults to 120 seconds.
       checkpoint_basename: The basename for checkpoint saving.
       session_manager: `SessionManager`, which manages Session creation and
         recovery. If it is `None`, a default `SessionManager` will be created
         with the set of arguments passed in for backwards compatibility.
-      summary_writer: `SummaryWriter` to use or `USE_DEFAULT`.  Can be `None`
-        to indicate that no summaries should be written.
-      init_fn: Optional callable used to initialize the model. Called
-        after the optional `init_op` is called.  The callable must accept one
-        argument, the session being initialized.
+      summary_writer: `SummaryWriter` to use or `USE_DEFAULT`.  Can be `None` to
+        indicate that no summaries should be written.
+      init_fn: Optional callable used to initialize the model. Called after the
+        optional `init_op` is called.  The callable must accept one argument,
+        the session being initialized.
       local_init_run_options: RunOptions to be passed as the SessionManager
         local_init_run_options parameter.
 
@@ -397,12 +395,11 @@
     """Initializes ready_op.
 
     Args:
-      ready_op: `Tensor` to check if the model is initialized.
-        If it's set to USE_DEFAULT, creates an op that checks all
-        the variables are initialized.
+      ready_op: `Tensor` to check if the model is initialized. If it's set to
+        USE_DEFAULT, creates an op that checks all the variables are
+        initialized.
       ready_for_local_init_op: `Tensor` to check if the model is ready to run
-        local_init_op.
-        If it's set to USE_DEFAULT, creates an op that checks all
+        local_init_op. If it's set to USE_DEFAULT, creates an op that checks all
         the global variables are initialized.
     """
     if ready_op is Supervisor.USE_DEFAULT:
@@ -440,9 +437,9 @@
 
     Args:
       local_init_op: `Operation` run for every new supervisor instance. If set
-      to USE_DEFAULT, use the first op from the GraphKeys.LOCAL_INIT_OP
-      collection. If the collection is empty, create an op that initializes
-      all local variables and all tables.
+        to USE_DEFAULT, use the first op from the GraphKeys.LOCAL_INIT_OP
+        collection. If the collection is empty, create an op that initializes
+        all local variables and all tables.
     """
     if local_init_op is Supervisor.USE_DEFAULT:
       local_init_op = self._get_first_op_from_collection(
@@ -461,8 +458,8 @@
     """Initializes saver.
 
     Args:
-      saver: A `Saver` object. If set to USE_DEFAULT, create one that
-        saves all the variables.
+      saver: A `Saver` object. If set to USE_DEFAULT, create one that saves all
+        the variables.
     """
     if saver is Supervisor.USE_DEFAULT:
       saver = self._get_first_op_from_collection(ops.GraphKeys.SAVERS)
@@ -475,8 +472,8 @@
     """Initializes summary_op.
 
     Args:
-      summary_op: An Operation that returns a Summary for the event logs.
-        If set to USE_DEFAULT, create an op that merges all the summaries.
+      summary_op: An Operation that returns a Summary for the event logs. If set
+        to USE_DEFAULT, create an op that merges all the summaries.
     """
     if summary_op is Supervisor.USE_DEFAULT:
       summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP)
@@ -490,8 +487,8 @@
     """Initializes global_step.
 
     Args:
-      global_step: An integer Tensor of size 1 that counts steps. If
-        set to USE_DEFAULT, creates global_step tensor.
+      global_step: An integer Tensor of size 1 that counts steps. If set to
+        USE_DEFAULT, creates global_step tensor.
     """
     if global_step is Supervisor.USE_DEFAULT:
       global_step = self._get_first_op_from_collection(
@@ -630,8 +627,9 @@
     """Writes graph_def to `logdir` and adds it to summary if applicable."""
     assert self._is_chief
     if self._logdir:
-      training_util.write_graph(self._graph.as_graph_def(add_shapes=True),
-                                self._logdir, "graph.pbtxt")
+      training_util.write_graph(
+          self._graph.as_graph_def(add_shapes=True), self._logdir,
+          "graph.pbtxt")
     if self._summary_writer and not self._graph_added_to_summary:
       self._summary_writer.add_graph(self._graph)
       self._summary_writer.add_meta_graph(self._meta_graph_def)
@@ -675,8 +673,7 @@
       # if there is no step value.
       current_step = training_util.global_step(sess, self._global_step)
       self._summary_writer.add_session_log(
-          SessionLog(status=SessionLog.START),
-          current_step)
+          SessionLog(status=SessionLog.START), current_step)
 
     threads = []
     if self._save_summaries_secs and self._summary_writer:
@@ -690,7 +687,9 @@
       t.start()
     return threads
 
-  def prepare_or_wait_for_session(self, master="", config=None,
+  def prepare_or_wait_for_session(self,
+                                  master="",
+                                  config=None,
                                   wait_for_checkpoint=False,
                                   max_wait_secs=7200,
                                   start_standard_services=True):
@@ -702,10 +701,10 @@
     manager to start the standard services.
 
     Args:
-      master: name of the TensorFlow master to use.  See the `tf.Session`
-        constructor for how this is interpreted.
-      config: Optional ConfigProto proto used to configure the session,
-        which is passed as-is to create the session.
+      master: name of the TensorFlow master to use.  See the
+        `tf.compat.v1.Session` constructor for how this is interpreted.
+      config: Optional ConfigProto proto used to configure the session, which is
+        passed as-is to create the session.
       wait_for_checkpoint: Whether we should wait for the availability of a
         checkpoint before creating Session. Defaults to False.
       max_wait_secs: Maximum time to wait for the session to become available.
@@ -724,18 +723,22 @@
 
     if self._is_chief:
       sess = self._session_manager.prepare_session(
-          master, init_op=self.init_op, saver=self.saver,
-          checkpoint_dir=self._logdir, wait_for_checkpoint=wait_for_checkpoint,
-          max_wait_secs=max_wait_secs, config=config,
-          init_feed_dict=self._init_feed_dict, init_fn=self._init_fn)
+          master,
+          init_op=self.init_op,
+          saver=self.saver,
+          checkpoint_dir=self._logdir,
+          wait_for_checkpoint=wait_for_checkpoint,
+          max_wait_secs=max_wait_secs,
+          config=config,
+          init_feed_dict=self._init_feed_dict,
+          init_fn=self._init_fn)
       self._write_graph()
       if start_standard_services:
         logging.info("Starting standard services.")
         self.start_standard_services(sess)
     else:
-      sess = self._session_manager.wait_for_session(master,
-                                                    config=config,
-                                                    max_wait_secs=max_wait_secs)
+      sess = self._session_manager.wait_for_session(
+          master, config=config, max_wait_secs=max_wait_secs)
     if start_standard_services:
       logging.info("Starting queue runners.")
       self.start_queue_runners(sess)
@@ -772,8 +775,8 @@
       queue_runners = self._graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS)
     threads = []
     for qr in queue_runners:
-      threads.extend(qr.create_threads(sess, coord=self._coord, daemon=True,
-                                       start=True))
+      threads.extend(
+          qr.create_threads(sess, coord=self._coord, daemon=True, start=True))
     return threads
 
   def loop(self, timer_interval_secs, target, args=None, kwargs=None):
@@ -795,8 +798,12 @@
     Returns:
       The started thread.
     """
-    looper = coordinator.LooperThread(self._coord, timer_interval_secs,
-                                      target=target, args=args, kwargs=kwargs)
+    looper = coordinator.LooperThread(
+        self._coord,
+        timer_interval_secs,
+        target=target,
+        args=args,
+        kwargs=kwargs)
     looper.start()
     return looper
 
@@ -812,13 +819,13 @@
       threads: Optional list of threads to join with the coordinator.  If
         `None`, defaults to the threads running the standard services, the
         threads started for `QueueRunners`, and the threads started by the
-        `loop()` method.  To wait on additional threads, pass the
-        list in this parameter.
+        `loop()` method.  To wait on additional threads, pass the list in this
+        parameter.
       close_summary_writer: Whether to close the `summary_writer`.  Defaults to
         `True` if the summary writer was created by the supervisor, `False`
         otherwise.
-      ignore_live_threads: If `True` ignores threads that remain running after
-        a grace period when joining threads via the coordinator, instead of
+      ignore_live_threads: If `True` ignores threads that remain running after a
+        grace period when joining threads via the coordinator, instead of
         raising a RuntimeError.
     """
     self._coord.request_stop()
@@ -926,7 +933,9 @@
 
   # pylint: disable=g-doc-return-or-yield,broad-except
   @contextlib.contextmanager
-  def managed_session(self, master="", config=None,
+  def managed_session(self,
+                      master="",
+                      config=None,
                       start_standard_services=True,
                       close_summary_writer=True):
     """Returns a context manager for a managed session.
@@ -940,7 +949,7 @@
 
     ```python
     def train():
-      sv = tf.train.Supervisor(...)
+      sv = tf.compat.v1.train.Supervisor(...)
       with sv.managed_session(<master>) as sess:
         for step in xrange(..):
           if sv.should_stop():
@@ -973,14 +982,14 @@
     the training loop and are considered normal termination.
 
     Args:
-      master: name of the TensorFlow master to use.  See the `tf.Session`
-        constructor for how this is interpreted.
-      config: Optional `ConfigProto` proto used to configure the session.
-        Passed as-is to create the session.
-      start_standard_services: Whether to start the standard services,
-        such as checkpoint, summary and step counter.
-      close_summary_writer: Whether to close the summary writer when
-        closing the session.  Defaults to True.
+      master: name of the TensorFlow master to use.  See the
+        `tf.compat.v1.Session` constructor for how this is interpreted.
+      config: Optional `ConfigProto` proto used to configure the session. Passed
+        as-is to create the session.
+      start_standard_services: Whether to start the standard services, such as
+        checkpoint, summary and step counter.
+      close_summary_writer: Whether to close the summary writer when closing the
+        session.  Defaults to True.
 
     Returns:
       A context manager that yields a `Session` restored from the latest
@@ -989,7 +998,8 @@
     """
     try:
       sess = self.prepare_or_wait_for_session(
-          master=master, config=config,
+          master=master,
+          config=config,
           start_standard_services=start_standard_services)
       yield sess
     except Exception as e:
@@ -1011,6 +1021,7 @@
         except Exception:
           # Silently ignore exceptions raised by close().
           pass
+
   # pylint: enable=g-doc-return-or-yield,broad-except
 
 
@@ -1030,8 +1041,8 @@
 
   def run_loop(self):
     if self._sv.global_step is not None:
-      summary_strs, global_step = self._sess.run([self._sv.summary_op,
-                                                  self._sv.global_step])
+      summary_strs, global_step = self._sess.run(
+          [self._sv.summary_op, self._sv.global_step])
     else:
       summary_strs = self._sess.run(self._sv.summary_op)
       global_step = None
@@ -1063,8 +1074,7 @@
 
   def start_loop(self):
     self._last_time = time.time()
-    self._last_step = training_util.global_step(
-        self._sess, self._step_counter)
+    self._last_step = training_util.global_step(self._sess, self._step_counter)
 
   def run_loop(self):
     # Count the steps.
@@ -1080,12 +1090,13 @@
       steps_per_sec = added_steps / elapsed_time
     else:
       steps_per_sec = float("inf")
-    summary = Summary(value=[Summary.Value(tag=self._summary_tag,
-                                           simple_value=steps_per_sec)])
+    summary = Summary(value=[
+        Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
+    ])
     if self._sv.summary_writer:
       self._sv.summary_writer.add_summary(summary, current_step)
-    logging.log_first_n(logging.INFO, "%s: %g", 10,
-                        self._summary_tag, steps_per_sec)
+    logging.log_first_n(logging.INFO, "%s: %g", 10, self._summary_tag,
+                        steps_per_sec)
 
 
 class SVTimerCheckpointThread(coordinator.LooperThread):
@@ -1104,13 +1115,13 @@
 
   def run_loop(self):
     logging.info("Saving checkpoint to path %s", self._sv.save_path)
-    self._sv.saver.save(self._sess, self._sv.save_path,
-                        global_step=self._sv.global_step)
+    self._sv.saver.save(
+        self._sess, self._sv.save_path, global_step=self._sv.global_step)
     if self._sv.summary_writer and self._sv.global_step is not None:
       current_step = training_util.global_step(self._sess, self._sv.global_step)
       self._sv.summary_writer.add_session_log(
-          SessionLog(status=SessionLog.CHECKPOINT,
-                     checkpoint_path=self._sv.save_path),
+          SessionLog(
+              status=SessionLog.CHECKPOINT, checkpoint_path=self._sv.save_path),
           current_step)
 
 
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 21e9a99..3b2d862 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -110,7 +110,7 @@
   # Note that if you want to have 2 backup replicas, you can change
   # total_num_replicas=52 and make sure this number matches how many physical
   # replicas you started in your job.
-  opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
+  opt = tf.compat.v1.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
                                  total_num_replicas=50)
 
   # Some models have startup_delays to help stabilize the model but when using
diff --git a/tensorflow/python/training/tracking/BUILD b/tensorflow/python/training/tracking/BUILD
index 4efecbc..bf73e25 100644
--- a/tensorflow/python/training/tracking/BUILD
+++ b/tensorflow/python/training/tracking/BUILD
@@ -12,6 +12,10 @@
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load(
+    "//tensorflow/tools/test:performance.bzl",
+    "tf_py_logged_benchmark",
+)
 load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 py_library(
@@ -143,17 +147,21 @@
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:lib",
+        "//tensorflow/python:platform",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/keras:backend",
         "//tensorflow/python/training/saving:functional_saver",
         "//tensorflow/python/training/saving:saveable_object_util",
+        "@six_archive//:six",
     ],
 )
 
@@ -169,12 +177,14 @@
         "@six_archive//:six",
         "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:saver",
@@ -182,12 +192,12 @@
         "//tensorflow/python:state_ops",
         "//tensorflow/python:template",
         "//tensorflow/python:training_util",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:backend",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
         "//tensorflow/python:variables",
@@ -279,3 +289,18 @@
         "//tensorflow/python:framework_test_lib",
     ],
 )
+
+tf_py_test(
+    name = "benchmarks_test",
+    srcs = ["benchmarks_test.py"],
+    additional_deps = [
+        ":util",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "benchmarks",
+    target = "//tensorflow/python/training/tracking:benchmarks_test",
+)
diff --git a/tensorflow/python/training/tracking/base.py b/tensorflow/python/training/tracking/base.py
index b069721..279d2db 100644
--- a/tensorflow/python/training/tracking/base.py
+++ b/tensorflow/python/training/tracking/base.py
@@ -19,9 +19,6 @@
 
 import abc
 import collections
-import functools
-import json
-import weakref
 
 import six
 
@@ -35,14 +32,11 @@
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.util import nest
-from tensorflow.python.util import serialization
 from tensorflow.python.util import tf_decorator
 
-
 # Key where the object graph proto is saved in a TensorBundle
 OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 
-
 # A key indicating a variable's value in an object's checkpointed Tensors
 # (Trackable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
@@ -74,8 +68,7 @@
   """
 
   def __init__(self, checkpoint_position, shape=None):
-    self.wrapped_value = checkpoint_position.value_tensors()[
-        VARIABLE_VALUE_KEY]
+    self.wrapped_value = checkpoint_position.value_tensors()[VARIABLE_VALUE_KEY]
     if shape:
       # We need to set the static shape information on the initializer if
       # possible so we don't get a variable with an unknown shape.
@@ -97,8 +90,8 @@
   """Embeds a tensor in a checkpoint with no restore ops."""
 
   def __init__(self, tensor, name, dtype=None, device=None):
-    spec = saveable_object.SaveSpec(tensor, "", name, dtype=dtype,
-                                    device=device)
+    spec = saveable_object.SaveSpec(
+        tensor, "", name, dtype=dtype, device=device)
     super(NoRestoreSaveable, self).__init__(tensor, [spec], name)
 
   def restore(self, restored_tensors, restored_shapes):
@@ -123,7 +116,8 @@
     """Create a new `SaveableObject` which freezes current state as a constant.
 
     Used when executing eagerly to embed the current state as a constant, or
-    when creating a static tf.train.Saver with the frozen current Python state.
+    when creating a static tf.compat.v1.train.Saver with the frozen current
+    Python state.
 
     Returns:
       A `SaveableObject` which is not a `PythonStateSaveable` instance (i.e. has
@@ -140,24 +134,26 @@
 
     Args:
       name: The checkpoint key to write to.
-      state_callback: A function taking no arguments which returns a
-        string. This function is run every time a checkpoint is written.
+      state_callback: A function taking no arguments which returns a string.
+        This function is run every time a checkpoint is written.
       restore_callback: A function taking a Python string, used to restore
         state. Optional; defaults to doing nothing, in which case it is ignored
         by status assertions such as assert_consumed().
     """
     self._has_trivial_state_callback = (restore_callback is None)
+
     def _state_callback_wrapper():
       with ops.init_scope():
         return state_callback()
+
     self._state_callback = _state_callback_wrapper
     self._restore_callback = restore_callback
     with ops.device("/cpu:0"):
       self._save_string = constant_op.constant("", dtype=dtypes.string)
     spec = saveable_object.SaveSpec(
         self._save_string, "", name, dtype=dtypes.string)
-    super(PythonStringStateSaveable, self).__init__(
-        self._save_string, [spec], name)
+    super(PythonStringStateSaveable, self).__init__(self._save_string, [spec],
+                                                    name)
 
   @property
   def optional_restore(self):
@@ -170,8 +166,10 @@
 
   def freeze(self):
     """Create a frozen `SaveableObject` which saves the current state."""
+
     def _constant_state():
       return constant_op.constant(self._state_callback(), dtype=dtypes.string)
+
     return NoRestoreSaveable(
         tensor=_constant_state,
         dtype=dtypes.string,
@@ -217,6 +215,7 @@
 
     Args:
       trackable: The object to record a correspondence for.
+
     Returns:
       True if this is a new assignment, False if this object has already been
       mapped to a checkpointed `Object` proto.
@@ -226,6 +225,7 @@
     checkpoint = self.checkpoint
     checkpoint.all_python_objects.add(trackable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
+    checkpoint.matched_proto_ids.add(self._proto_id)
     if current_assignment is None:
       checkpoint.object_by_proto_id[self._proto_id] = trackable
       for deferred_slot_restoration in (
@@ -263,21 +263,21 @@
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
       if current_assignment is not trackable:
-        logging.warning(
-            ("Inconsistent references when loading the checkpoint into this "
-             "object graph. Either the Trackable object references in the "
-             "Python program have changed in an incompatible way, or the "
-             "checkpoint was generated in an incompatible program.\n\nTwo "
-             "checkpoint references resolved to different objects (%s and %s).")
-            % (current_assignment, trackable))
+        logging.warning((
+            "Inconsistent references when loading the checkpoint into this "
+            "object graph. Either the Trackable object references in the "
+            "Python program have changed in an incompatible way, or the "
+            "checkpoint was generated in an incompatible program.\n\nTwo "
+            "checkpoint references resolved to different objects (%s and %s)."),
+                        current_assignment, trackable)
       return False  # Not a new assignment
 
   def is_simple_variable(self):
     """Determine whether this value is restorable with a Tensor initializer."""
     attributes = self.object_proto.attributes
-    return (len(attributes) == 1
-            and attributes[0].name == VARIABLE_VALUE_KEY
-            and not self.object_proto.children)
+    return (len(attributes) == 1 and
+            attributes[0].name == VARIABLE_VALUE_KEY and
+            not self.object_proto.children)
 
   def value_tensors(self):
     """Create value `Tensor`s for this object's attributes.
@@ -335,8 +335,9 @@
         # If we've already created and cached a SaveableObject for this
         # attribute, we can re-use it to avoid re-creating some ops when graph
         # building.
-        saveable_list = saveables_cache.get(
-            self.trackable, {}).get(serialized_tensor.name, (None,))
+        saveable_list = saveables_cache.get(self.trackable,
+                                            {}).get(serialized_tensor.name,
+                                                    (None,))
         if len(saveable_list) == 1:
           # Almost every attribute will have exactly one SaveableObject.
           saveable, = saveable_list
@@ -363,15 +364,15 @@
           # checkpoint was loaded.
           if not serialized_tensor.optional_restore:
             self._checkpoint.unused_attributes.setdefault(
-                self.trackable, []).append(serialized_tensor.name)
+                self._proto_id, []).append(serialized_tensor.name)
           continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
         else:
           saveable = saveable_factory
         if saveables_cache is not None:
-          saveables_cache.setdefault(
-              self.trackable, {})[serialized_tensor.name] = [saveable]
+          saveables_cache.setdefault(self.trackable,
+                                     {})[serialized_tensor.name] = [saveable]
       if isinstance(saveable, PythonStateSaveable):
         python_saveables.append(saveable)
       else:
@@ -388,11 +389,10 @@
       A list of operations when graph building, or an empty list when executing
       eagerly.
     """
-    (restore_ops,
-     tensor_saveables,
+    (restore_ops, tensor_saveables,
      python_saveables) = self._gather_ops_or_named_saveables()
-    restore_ops.extend(self._checkpoint.restore_saveables(
-        tensor_saveables, python_saveables))
+    restore_ops.extend(
+        self._checkpoint.restore_saveables(tensor_saveables, python_saveables))
     return restore_ops
 
   @property
@@ -416,13 +416,11 @@
 
 
 _DeferredSlotVariableRestoration = collections.namedtuple(
-    "_DeferredSlotVariableRestoration",
-    [
+    "_DeferredSlotVariableRestoration", [
         "original_variable",
         "slot_variable_id",
         "slot_name",
-    ]
-)
+    ])
 
 _SlotVariableRestoration = collections.namedtuple(
     "_SlotVariableRestoration",
@@ -446,6 +444,7 @@
 
   Args:
     method: The method to decorate.
+
   Returns:
     A decorated method which sets and un-sets automatic dependency tracking for
     the object the method is called on (not thread safe).
@@ -595,16 +594,21 @@
 
     Args:
       name: The local name of the dependency.
+
     Returns:
       A `Trackable` object, or `None` if no dependency by this name was
       found.
     """
     return self._self_unconditional_dependency_names.get(name, None)
 
-  def _add_variable_with_custom_getter(
-      self, name, shape=None, dtype=dtypes.float32,
-      initializer=None, getter=None, overwrite=False,
-      **kwargs_for_getter):
+  def _add_variable_with_custom_getter(self,
+                                       name,
+                                       shape=None,
+                                       dtype=dtypes.float32,
+                                       initializer=None,
+                                       getter=None,
+                                       overwrite=False,
+                                       **kwargs_for_getter):
     """Restore-on-create for a variable be saved with this `Trackable`.
 
     If the user has requested that this object or another `Trackable` which
@@ -640,11 +644,9 @@
             name=name, shape=shape)
       else:
         checkpoint_initializer = None
-      if (checkpoint_initializer is not None
-          and not (
-              isinstance(initializer, CheckpointInitialValue)
-              and (initializer.restore_uid
-                   > checkpoint_initializer.restore_uid))):
+      if (checkpoint_initializer is not None and
+          not (isinstance(initializer, CheckpointInitialValue) and
+               (initializer.restore_uid > checkpoint_initializer.restore_uid))):
         # If multiple Trackable objects are "creating" the same variable
         # via the magic of custom getters, the one with the highest restore UID
         # (the one called last) has to make the final initializer. If another
@@ -654,7 +656,10 @@
         initializer = checkpoint_initializer
         shape = None
     new_variable = getter(
-        name=name, shape=shape, dtype=dtype, initializer=initializer,
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
         **kwargs_for_getter)
 
     # If we set an initializer and the variable processed it, tracking will not
@@ -662,8 +667,7 @@
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
     if not overwrite or isinstance(new_variable, Trackable):
-      return self._track_trackable(new_variable, name=name,
-                                   overwrite=overwrite)
+      return self._track_trackable(new_variable, name=name, overwrite=overwrite)
     else:
       # TODO(allenl): Some variable types are not yet supported. Remove this
       # fallback once all get_variable() return types are Trackable.
@@ -681,6 +685,7 @@
       name: The object-local name of the dependency holding the variable's
         value.
       shape: The shape of the variable being loaded into.
+
     Returns:
       An callable for use as a variable's initializer/initial_value, or None if
       one should not be set (either because there was no variable with this name
@@ -718,8 +723,8 @@
 
     Args:
       trackable: A `Trackable` which this object depends on.
-      name: A local name for `trackable`, used for loading checkpoints into
-        the correct objects.
+      name: A local name for `trackable`, used for loading checkpoints into the
+        correct objects.
       overwrite: Boolean, whether silently replacing dependencies is OK. Used
         for __setattr__, where throwing an error on attribute reassignment would
         be inappropriate.
@@ -734,13 +739,11 @@
     """
     self._maybe_initialize_trackable()
     if not isinstance(trackable, Trackable):
-      raise TypeError(
-          ("Trackable._track_trackable() passed type %s, not a "
-           "Trackable.") % (type(trackable),))
+      raise TypeError(("Trackable._track_trackable() passed type %s, not a "
+                       "Trackable.") % (type(trackable),))
     new_reference = TrackableReference(name=name, ref=trackable)
     current_object = self._lookup_dependency(name)
-    if (current_object is not None
-        and current_object is not trackable):
+    if (current_object is not None and current_object is not trackable):
       if not overwrite:
         raise ValueError(
             ("Called Trackable._track_trackable() with name='%s', "
@@ -755,8 +758,7 @@
               index] = new_reference
     elif current_object is None:
       self._self_unconditional_checkpoint_dependencies.append(new_reference)
-      self._handle_deferred_dependencies(
-          name=name, trackable=trackable)
+      self._handle_deferred_dependencies(name=name, trackable=trackable)
     self._self_unconditional_dependency_names[name] = trackable
     return trackable
 
@@ -780,8 +782,7 @@
     Args:
       name: The name of the dependency within this object (`self`), used to
         match `trackable` with values saved in a checkpoint.
-      trackable: The Trackable object to restore (inheriting from
-        `Trackable`).
+      trackable: The Trackable object to restore (inheriting from `Trackable`).
     """
     self._maybe_initialize_trackable()
     trackable._maybe_initialize_trackable()  # pylint: disable=protected-access
@@ -809,15 +810,15 @@
     restore_ops = []
     while visit_queue:
       current_position = visit_queue.popleft()
-      restore_ops.extend(nest.flatten(
-          current_position.trackable  # pylint: disable=protected-access
-          ._single_restoration_from_checkpoint_position(
-              checkpoint_position=current_position,
-              visit_queue=visit_queue)))
+      restore_ops.extend(
+          nest.flatten(current_position.trackable  # pylint: disable=protected-access
+                       ._single_restoration_from_checkpoint_position(
+                           checkpoint_position=current_position,
+                           visit_queue=visit_queue)))
     return restore_ops
 
-  def _single_restoration_from_checkpoint_position(
-      self, checkpoint_position, visit_queue):
+  def _single_restoration_from_checkpoint_position(self, checkpoint_position,
+                                                   visit_queue):
     """Restore this object, and either queue its dependencies or defer them."""
     self._maybe_initialize_trackable()
     checkpoint = checkpoint_position.checkpoint
@@ -831,14 +832,13 @@
       restore_ops = ()
     for child in checkpoint_position.object_proto.children:
       child_position = CheckpointPosition(
-          checkpoint=checkpoint,
-          proto_id=child.node_id)
+          checkpoint=checkpoint, proto_id=child.node_id)
       local_object = self._lookup_dependency(child.local_name)
       if local_object is None:
         # We don't yet have a dependency registered with this name. Save it
         # in case we do.
-        self._deferred_dependencies.setdefault(child.local_name, []).append(
-            child_position)
+        self._deferred_dependencies.setdefault(child.local_name,
+                                               []).append(child_position)
       else:
         if child_position.bind_object(trackable=local_object):
           # This object's correspondence is new, so dependencies need to be
@@ -853,7 +853,8 @@
 
     Keys in the returned dictionary are local to this object and in a separate
     namespace from dependencies. Values may either be `SaveableObject` factories
-    or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
+    or variables easily converted to `SaveableObject`s (as in
+    `tf.compat.v1.train.Saver`'s
     `var_list` constructor argument).
 
     `SaveableObjects` have a name set, which Trackable needs to generate
@@ -861,7 +862,8 @@
     should return a dictionary of callables which take `name` arguments and
     return `SaveableObjects` with that name.
 
-    If this object may also be passed to the global-name-based `tf.train.Saver`,
+    If this object may also be passed to the global-name-based
+    `tf.compat.v1.train.Saver`,
     the returned callables should have a default value for their name argument
     (i.e. be callable with no arguments).
 
@@ -877,30 +879,7 @@
        lambda name="global_name_for_this_object":
        SaveableObject(name=name, ...)}
     """
-    if not hasattr(self, "get_config"):
-      return {}
-    try:
-      self.get_config()
-    except NotImplementedError:
-      return {}
-    weak_self = weakref.ref(self)
-    def _state_callback():
-      """Serializes `self.get_config()` for saving."""
-      dereferenced_self = weak_self()
-      if dereferenced_self:
-        try:
-          return json.dumps(
-              dereferenced_self,
-              default=serialization.get_json_type,
-              sort_keys=True).encode("utf8")
-        except TypeError:
-          # Even if get_config worked objects may have produced garbage.
-          return ""
-      else:
-        return ""
-    return {OBJECT_CONFIG_JSON_KEY: functools.partial(
-        PythonStringStateSaveable,
-        state_callback=_state_callback)}
+    return {}
 
   def _list_functions_for_serialization(self):
     """Lists the functions of this trackable to serialize.
diff --git a/tensorflow/python/training/tracking/benchmarks_test.py b/tensorflow/python/training/tracking/benchmarks_test.py
new file mode 100644
index 0000000..a3cec89c
--- /dev/null
+++ b/tensorflow/python/training/tracking/benchmarks_test.py
@@ -0,0 +1,118 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for checkpoint-related APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.module import module
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import util
+
+
+class _TrivialSaveable(saveable_object.SaveableObject):
+  """SaveableObject that saves `array_ops.ones([])`; its restore is a no-op."""
+  def __init__(self, name):
+    op = lambda: array_ops.ones([])
+    super(_TrivialSaveable, self).__init__(
+        op=op,
+        specs=[saveable_object.SaveSpec(
+            op, "", name, dtype=dtypes.float32, device="CPU:0")],
+        name=name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    return control_flow_ops.no_op()
+
+
+class _TrivialRestore(base.Trackable):
+  """Trackable whose only saveable factory is `_TrivialSaveable`."""
+  def _gather_saveables_for_checkpoint(self):
+    return {base.VARIABLE_VALUE_KEY: _TrivialSaveable}
+
+
+class _LazyTrivialObjects(module.Module):
+  """Module with five objects made up front and five more on first call."""
+  def __init__(self):
+    self.existing = [_TrivialRestore() for _ in range(5)]
+    self.lazy = []
+
+  def __call__(self):
+    if not self.lazy:
+      self.lazy.extend(_TrivialRestore() for _ in range(5))
+    return
+
+
+def _save_checkpoint():
+  original_checkpoint = util.Checkpoint(m=_LazyTrivialObjects())
+  original_checkpoint.m()  # Populate `lazy` before writing the checkpoint.
+  return original_checkpoint.write(os.path.join(test.get_temp_dir(), "ckpt"))
+
+
+class SavingBenchmarks(test.Benchmark):
+  """Benchmarks checkpoint restore against a no-restore baseline."""
+  def _run(self, func, num_iters, execution_mode=None):
+    func()  # Untimed warm-up call.
+    start = time.time()
+    for _ in range(num_iters):  # `range` exists on both Python 2 and 3.
+      func()
+    end = time.time()
+    mean_us = (end - start) * 1e6 / num_iters  # Mean microseconds per call.
+    self.report_benchmark(
+        iters=num_iters,
+        wall_time=mean_us,
+        extras={"examples_per_sec": num_iters / (end - start)})
+
+  def benchmark_baseline_no_restore(self):
+    """Times building the checkpoint objects and calling the module only."""
+    def _create_and_call():
+      checkpoint = util.Checkpoint(m=_LazyTrivialObjects())
+      checkpoint.m()
+
+    self._run(_create_and_call, 3)
+
+  def benchmark_batch_restore(self):
+    checkpoint_path = _save_checkpoint()
+
+    def _create_and_call():
+      checkpoint = util.Checkpoint(m=_LazyTrivialObjects())
+      checkpoint.m()
+      checkpoint.restore(checkpoint_path)  # Restore after `lazy` is filled.
+
+    self._run(_create_and_call, 3)
+
+  def benchmark_restore_on_create(self):
+    checkpoint_path = _save_checkpoint()
+
+    def _create_and_call():
+      checkpoint = util.Checkpoint(m=_LazyTrivialObjects())
+      checkpoint.restore(checkpoint_path)  # Restore before `lazy` is filled.
+      checkpoint.m()
+
+    self._run(_create_and_call, 3)
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/training/tracking/data_structures.py b/tensorflow/python/training/tracking/data_structures.py
index 73df687..1695e44 100644
--- a/tensorflow/python/training/tracking/data_structures.py
+++ b/tensorflow/python/training/tracking/data_structures.py
@@ -665,7 +665,6 @@
     wrapt.ObjectProxy.__init__(self, wrapped_dict)
     TrackableDataStructure.__init__(self)
     self._self_non_string_key = False
-    self._self_non_append_mutation = False
     self._self_external_modification = False
     self.__wrapped__.update(
         {key: self._track_value(
@@ -690,14 +689,12 @@
   # pylint: disable=protected-access
   def __copy__(self):
     copied = _DictWrapper(copy.copy(self.__wrapped__))
-    copied._self_non_append_mutation = self._self_non_append_mutation
     copied._self_external_modification = self._self_external_modification
     copied._self_non_string_key = self._self_non_string_key
     return copied
 
   def __deepcopy__(self, memo):
     copied = _DictWrapper(copy.deepcopy(self.__wrapped__, memo))
-    copied._self_non_append_mutation = self._self_non_append_mutation
     copied._self_external_modification = self._self_external_modification
     copied._self_non_string_key = self._self_non_string_key
     return copied
@@ -725,15 +722,6 @@
           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
           "object; it will be automatically un-wrapped and subsequently "
           "ignored." % (self,))
-    if self._self_non_append_mutation:
-      raise ValueError(
-          "Unable to save the object %s (a dictionary wrapper constructed "
-          "automatically on attribute assignment). A key mapping to a "
-          "trackable object was overwritten or deleted, which would "
-          "cause problems for restoration.\n\nIf you don't need this "
-          "dictionary checkpointed, wrap it in a "
-          "tf.contrib.checkpoint.NoDependency object; it will be automatically "
-          "un-wrapped and subsequently ignored." % (self,))
     if self._self_external_modification:
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
@@ -752,7 +740,6 @@
   def _dirty(self):
     """Check if there has already been a mutation which prevents saving."""
     return (self._self_external_modification
-            or self._self_non_append_mutation
             or self._self_non_string_key)
 
   def _check_self_external_modification(self):
@@ -800,39 +787,20 @@
     self._maybe_initialize_trackable()
     no_dep = isinstance(value, NoDependency)
     if isinstance(key, six.string_types):
-      existing_dependency = self._lookup_dependency(key)
       value = self._track_value(value, name=key)
     else:
       value = _wrap_or_unwrap(value)
-      existing_dependency = None
       if not no_dep and isinstance(value, base.Trackable):
         # Non-string keys are OK as long as we have no reason to add a
         # dependency on the value (either because the value is not
         # trackable, or because it was wrapped in a NoDependency object).
         self._self_non_string_key = True
-    if key in self.__wrapped__:
-      previous_value = self.__wrapped__[key]
-      if previous_value is not value:
-        if ((not no_dep and isinstance(value, base.Trackable))
-            # We don't want to just check that the existing object is
-            # trackable, since it may have been wrapped in a NoDependency
-            # object.
-            or existing_dependency is not None):
-          # A trackable object was replaced under the same key; this means
-          # that restoring would be error-prone, so we'll throw an exception on
-          # save.
-          self._self_non_append_mutation = True
     self.__wrapped__[key] = value
 
     self._update_snapshot()
 
   def __delitem__(self, key):
     self._check_self_external_modification()
-    existing_value = self[key]
-    if isinstance(existing_value, base.Trackable):
-      # Deleting tracked trackable values means restoring is problematic,
-      # so we'll throw an exception on save.
-      self._self_non_append_mutation = True
     del self.__wrapped__[key]
     self._update_snapshot()
 
diff --git a/tensorflow/python/training/tracking/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py
index e05c682..42d75df 100644
--- a/tensorflow/python/training/tracking/data_structures_test.py
+++ b/tensorflow/python/training/tracking/data_structures_test.py
@@ -27,7 +27,9 @@
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.layers import normalization
@@ -109,6 +111,73 @@
     self.assertIn(v, model.variables)
     self.assertIn(v, model.trainable_variables)
     self.assertNotIn(v, model.non_trainable_variables)
+    self.assertIn(model.layer_list[0].trainable_weights[0],
+                  model.trainable_weights)
+
+  def testSubModelTracking(self):
+    model = training.Model()
+    model.v = variables.Variable(1.)
+    self.assertIn(model.v, model.trainable_weights)
+    model2 = training.Model()
+    model2.m = [model]
+    self.assertIn(model.v, model2.trainable_weights)
+
+  def testSubSequentialTracking(self):
+
+    class _Subclassed(training.Model):
+
+      def __init__(self, wrapped):
+        super(_Subclassed, self).__init__()
+        self._wrapped = wrapped
+
+      def call(self, x):
+        return self._wrapped(x)
+
+    model = sequential.Sequential()
+    layer = core.Dense(1)
+    model.add(layer)
+    model2 = _Subclassed(model)
+    model2(array_ops.ones([1, 2]))
+    model2.m = [model]
+    self.assertIn(layer.kernel, model2.trainable_weights)
+
+  def testLayerTrackedThroughSequential(self):
+    class AttrDict(dict):
+
+      def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+    def ffnet(layer_sizes, name):
+      ff = sequential.Sequential(name=name)
+      for i, width in enumerate(layer_sizes):
+        ff.add(core.Dense(
+            width,
+            activation=("relu" if i < len(layer_sizes)-1 else None)))
+      return ff
+
+    class MyModel2(training.Model):
+
+      def __init__(self, config, name="my_model_2"):
+        super(MyModel2, self).__init__(name=name)
+        self._num_tokens = config.num_tokens
+
+        # list of sub-models
+        self._ffnet = [ffnet(config.module_layers + (self._num_tokens,), "ff")]
+
+      def null_input(self):
+        return array_ops.zeros([1, self._num_tokens], dtype=dtypes.float32)
+
+      def call(self, input_, module_index=None):
+        return self._ffnet[0](input_)
+
+    m2 = MyModel2(AttrDict(
+        num_tokens=5,
+        module_layers=(50, 30)))
+
+    # Construct
+    m2(m2.null_input())
+    self.assertLen(m2.trainable_variables, 6)
 
   def testJSONSerialization(self):
     obj = tracking.AutoTrackable()
@@ -594,15 +663,6 @@
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testDelNoSave(self):
-    model = training.Model()
-    model.d = {}
-    model.d["a"] = []
-    del model.d["a"]
-    save_path = os.path.join(self.get_temp_dir(), "ckpt")
-    with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
-      model.save_weights(save_path)
-
   def testPopNoSave(self):
     model = training.Model()
     model.d = {}
@@ -621,14 +681,13 @@
     with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"):
       model.save_weights(save_path)
 
-  def testOverwriteNoSave(self):
+  def testOverwriteCanStillSave(self):
     model = training.Model()
     model.d = {}
     model.d["a"] = {}
     model.d["a"] = {}
     save_path = os.path.join(self.get_temp_dir(), "ckpt")
-    with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
-      model.save_weights(save_path)
+    model.save_weights(save_path)
 
   def testIter(self):
     model = training.Model()
diff --git a/tensorflow/python/training/tracking/layer_utils.py b/tensorflow/python/training/tracking/layer_utils.py
index 66f8e3a..3568df6 100644
--- a/tensorflow/python/training/tracking/layer_utils.py
+++ b/tensorflow/python/training/tracking/layer_utils.py
@@ -33,9 +33,10 @@
 def has_weights(obj):
   """Implicit check for Layer-like objects."""
   # TODO(b/110718070): Replace with isinstance(obj, base_layer.Layer).
-  return (hasattr(obj, "trainable_weights")
-          and hasattr(obj, "non_trainable_weights")
-          and not isinstance(obj, type))
+  obj_type = type(obj)  # Attributes are looked up on the class, not on `obj`.
+  return (not isinstance(obj, type)
+          and hasattr(obj_type, "trainable_weights")
+          and hasattr(obj_type, "non_trainable_weights"))
 
 
 def filter_empty_layer_containers(layer_list):
diff --git a/tensorflow/python/training/tracking/tracking.py b/tensorflow/python/training/tracking/tracking.py
index fa243b6..c61b697d 100644
--- a/tensorflow/python/training/tracking/tracking.py
+++ b/tensorflow/python/training/tracking/tracking.py
@@ -150,15 +150,27 @@
     _RESOURCE_TRACKER_STACK = old
 
 
-class TrackableResource(base.Trackable):
-  """Base class for all resources that need to be tracked."""
+class CapturableResource(base.Trackable):
+  """Holds a Tensor which a tf.function can capture.
 
-  def __init__(self):
-    global _RESOURCE_TRACKER_STACK
-    for resource_tracker in _RESOURCE_TRACKER_STACK:
-      resource_tracker.add_resource(self)
+  `CapturableResource`s are discovered by traversing the graph of object
+  attributes, e.g. during `tf.saved_model.save`. They are excluded from the
+  scope-based tracking of `TrackableResource`; generally things that require
+  initialization should inherit from `TrackableResource` instead of
+  `CapturableResource` directly.
+  """
 
+  def __init__(self, device=""):
+    """Initialize the `CapturableResource`.
+
+    Args:
+      device: A string indicating a required placement for this resource,
+        e.g. "CPU" if this resource must be created on a CPU device. A blank
+        device allows the user to place resource creation, so generally this
+        should be blank unless the resource only makes sense on one device.
+    """
     self._resource_handle = None
+    self._resource_device = device
 
   def _create_resource(self):
     """A function that creates a resource handle."""
@@ -173,7 +185,8 @@
   def resource_handle(self):
     """Returns the resource handle associated with this Resource."""
     if self._resource_handle is None:
-      self._resource_handle = self._create_resource()
+      with ops.device(self._resource_device):
+        self._resource_handle = self._create_resource()
     return self._resource_handle
 
   def _list_functions_for_serialization(self):
@@ -193,6 +206,24 @@
     }
 
 
+class TrackableResource(CapturableResource):
+  """A CapturableResource that also registers with active resource trackers."""
+
+  def __init__(self, device=""):
+    """Registers this resource with every tracker currently on the stack.
+
+    Args:
+      device: A string naming the device this resource must be created on,
+        e.g. "CPU". Leave blank (the default) so that the user controls
+        placement, unless the resource only makes sense on one device.
+    """
+    # Registration runs before the base initializer, so trackers receive the
+    # object before `_resource_handle` and `_resource_device` are assigned.
+    for tracker in _RESOURCE_TRACKER_STACK:
+      tracker.add_resource(self)
+    super(TrackableResource, self).__init__(device=device)
+
+
 class TrackableAsset(base.Trackable):
   """Base class for asset files which need to be tracked."""
 
@@ -201,7 +232,7 @@
     # The init_scope prevents functions from capturing `path` in an
     # initialization graph, since it is transient and should not end up in a
     # serialized function body.
-    with ops.init_scope():
+    with ops.init_scope(), ops.device("CPU"):
       self._path = ops.internal_convert_to_tensor(path, dtype=dtypes.string,
                                                   name="asset_path")
 
diff --git a/tensorflow/python/training/tracking/util.py b/tensorflow/python/training/tracking/util.py
index 551e08a..a76f3b1 100644
--- a/tensorflow/python/training/tracking/util.py
+++ b/tensorflow/python/training/tracking/util.py
@@ -18,9 +18,12 @@
 from __future__ import print_function
 
 import abc
+import collections
 import os
 import weakref
 
+import six
+
 from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
@@ -38,6 +41,7 @@
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as v1_saver_lib
 from tensorflow.python.training.saving import functional_saver
@@ -49,10 +53,61 @@
 from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import lazy_loader
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
+# Loaded lazily due to a circular dependency.
+keras_backend = lazy_loader.LazyLoader(
+    "keras_backend", globals(),
+    "tensorflow.python.keras.backend")
+
+
+class _ObjectGraphProtoPrettyPrinter(object):
+  """Lazily traverses an object graph proto to pretty print names.
+
+  If no calls to `node_names` are made this object has no performance
+  overhead. On the other hand, it will only traverse the object graph once, so
+  repeated naming is cheap after the first.
+  """
+
+  def __init__(self, object_graph_proto):
+    self._object_graph_proto = object_graph_proto
+    self._node_name_cache = None
+
+  @property
+  def node_names(self):
+    """Lazily creates a mapping from node id to a dotted "path.from.root"."""
+    if self._node_name_cache is not None:
+      return self._node_name_cache
+    # Node ids are ints, so key by value: identity-keyed lookups can miss.
+    path_to_root = {0: ("(root)",)}
+    to_visit = collections.deque([0])
+    while to_visit:
+      node_id = to_visit.popleft()
+      obj = self._object_graph_proto.nodes[node_id]
+      for child in obj.children:
+        if child.node_id not in path_to_root:
+          path_to_root[child.node_id] = (
+              path_to_root[node_id] + (child.local_name,))
+          to_visit.append(child.node_id)
+
+    node_names = {}
+    for node_id, path in path_to_root.items():
+      node_names[node_id] = ".".join(path)
+
+    for node_id, node in enumerate(self._object_graph_proto.nodes):
+      for slot_reference in node.slot_variables:
+        node_names[slot_reference.slot_variable_node_id] = (
+            "{}'s state '{}' for {}".format(
+                node_names[node_id],
+                slot_reference.slot_name,
+                node_names[slot_reference.original_variable_node_id]))
+    self._node_name_cache = node_names
+    return node_names
+
+
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
@@ -61,8 +116,8 @@
     """Specify the checkpoint being loaded.
 
     Args:
-      object_graph_proto: The TrackableObjectGraph protocol buffer
-        associated with this checkpoint.
+      object_graph_proto: The TrackableObjectGraph protocol buffer associated
+        with this checkpoint.
       save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
       save_path_tensor: A string `Tensor` which contains or will be fed the save
@@ -76,9 +131,9 @@
     """
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
-    # Maps from objects to lists of attributes which were in the checkpoint but
-    # not loaded into any object, for error checking.
-    self.unused_attributes = weakref.WeakKeyDictionary()
+    # Maps from proto ids to lists of attributes which were in the checkpoint
+    # but not loaded into any object, for error checking.
+    self.unused_attributes = {}
     # Dictionary mapping from an id in the protocol buffer flat array to
     # Trackable Python objects. This mapping may be deferred if a
     # checkpoint is restored before all dependencies have been tracked. Uses
@@ -86,6 +141,7 @@
     # (as objects with deferred dependencies will generally have references to
     # this object).
     self.object_by_proto_id = weakref.WeakValueDictionary()
+    self.matched_proto_ids = set()
     # A set of all Python objects we've seen as dependencies, even if we didn't
     # use them (for example because of inconsistent references when
     # loading). Used to make status assertions fail when loading checkpoints
@@ -113,6 +169,9 @@
     # deferred_slot_restorations if the optimizer hasn't been created when that
     # happens.
     self.slot_restorations = {}
+    # Controls whether errors are printed in __del__ if some objects did not
+    # match.
+    self.expect_partial = False
     for node_index, node in enumerate(self.object_graph_proto.nodes):
       for slot_reference in node.slot_variables:
         # `node` refers to an `Optimizer`, since only these have slot variables.
@@ -142,12 +201,14 @@
     """
     restore_ops = []
     # Eagerly run restorations for Python state.
-    reader = pywrap_tensorflow.NewCheckpointReader(
-        self.save_path_string)
+    reader = None
     for saveable in python_saveables:
+      if reader is None:
+        # Lazily create the NewCheckpointReader, since this requires file access
+        # and we may not have any Python saveables.
+        reader = pywrap_tensorflow.NewCheckpointReader(self.save_path_string)
       spec_names = [spec.name for spec in saveable.specs]
-      saveable.python_restore(
-          [reader.get_tensor(name) for name in spec_names])
+      saveable.python_restore([reader.get_tensor(name) for name in spec_names])
 
     # If we have new SaveableObjects, extract and cache restore ops.
     if tensor_saveables:
@@ -167,6 +228,36 @@
           self.restore_ops_by_name[name] = restore_op
     return restore_ops
 
+  def __del__(self):
+    """Warns at destruction about checkpoint values that were never used.
+
+    Does nothing once `expect_partial` has been set on this coordinator.
+    """
+    if self.expect_partial:
+      return
+    # The logging module may have been unloaded when __del__ is called.
+    log_fn = print if logging is None else logging.warning
+    pretty_printer = _ObjectGraphProtoPrettyPrinter(self.object_graph_proto)
+    unmatched_ids = [
+        node_id for node_id in range(len(self.object_graph_proto.nodes))
+        if node_id not in self.matched_proto_ids]
+    for node_id in unmatched_ids:
+      log_fn("Unresolved object in checkpoint: {}"
+             .format(pretty_printer.node_names[node_id]))
+    for node_id, attribute_name in self.unused_attributes.items():
+      log_fn(("Unused attribute in object {}: {}"
+              .format(pretty_printer.node_names[node_id], attribute_name)))
+    if unmatched_ids or self.unused_attributes:
+      log_fn(
+          "A checkpoint was restored (e.g. tf.train.Checkpoint.restore or "
+          "tf.keras.Model.load_weights) but not all checkpointed values were "
+          "used. See above for specific issues. Use expect_partial() on the "
+          "load status object, e.g. "
+          "tf.train.Checkpoint.restore(...).expect_partial(), to silence these "
+          "warnings, or use assert_consumed() to make the check explicit. See "
+          "https://www.tensorflow.org/alpha/guide/checkpoints#loading_mechanics"
+          " for details.")
+
 
 class _NameBasedRestoreCoordinator(object):
   """Keeps the status of a name-based checkpoint restore."""
@@ -205,14 +296,13 @@
           # whether it's optional to restore it. If it's optional we don't need
           # to make assertions fail.
           if not saveable_factory("").optional_restore:
-            self.unused_attributes.setdefault(trackable, []).append(
-                attribute_name)
+            self.unused_attributes.setdefault(trackable,
+                                              []).append(attribute_name)
           continue
       else:
         saveable = saveable_factory
       names_to_saveables = saveable_object_util.op_list_to_dict(
-          [saveable],
-          convert_variable_to_tensor=False)
+          [saveable], convert_variable_to_tensor=False)
       for name, op in names_to_saveables.items():
         for saveable_object in saveable_object_util.saveable_objects_for_op(
             op=op, name=name):
@@ -224,8 +314,7 @@
     # run_restore_ops/initialize_or_restore on the status object for name-based
     # checkpoints.
     assert context.executing_eagerly()
-    for saveable in self.globally_named_object_attributes(
-        trackable):
+    for saveable in self.globally_named_object_attributes(trackable):
       restored_tensors = []
       tensor_missing = False
       for spec in saveable.specs:
@@ -248,14 +337,18 @@
         # Ignores values missing from the checkpoint, as with object-based
         # restore. Status assertions can be used to check exact matches,
         # although it's unlikely to ever happen for name-based checkpoints.
-        saveable.restore(restored_tensors=restored_tensors,
-                         restored_shapes=None)
+        saveable.restore(
+            restored_tensors=restored_tensors, restored_shapes=None)
 
 
 # TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
 # or consolidating the implementation with get_variable.
-def _default_getter(name, shape, dtype, initializer=None,
-                    partition_info=None, **kwargs):
+def _default_getter(name,
+                    shape,
+                    dtype,
+                    initializer=None,
+                    partition_info=None,
+                    **kwargs):
   """A pared-down version of get_variable which does not reuse variables."""
   dtype = dtypes.as_dtype(dtype)
   shape_object = tensor_shape.as_shape(shape)
@@ -263,7 +356,9 @@
     if initializer is None:
       initializer, initializing_from_value = (
           variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
-              name=name, shape=shape_object, dtype=dtype))
+              name=name,
+              shape=shape_object,
+              dtype=dtype))
     else:
       initializing_from_value = not callable(initializer)
     # Same logic as get_variable
@@ -276,24 +371,33 @@
       # Instantiate initializer if provided initializer is a type object.
       if isinstance(initializer, type(init_ops.Initializer)):
         initializer = initializer(dtype=dtype)
+
       def initial_value():
         return initializer(
             shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+
     return variables.VariableV1(
         initial_value=initial_value,
         name=name,
         dtype=variable_dtype,
         use_resource=True,
-        **kwargs
-    )
+        **kwargs)
 
 
-def add_variable(trackable, name, shape=None, dtype=dtypes.float32,
-                 initializer=None, trainable=True):
+def add_variable(trackable,
+                 name,
+                 shape=None,
+                 dtype=dtypes.float32,
+                 initializer=None,
+                 trainable=True):
   """Add a variable to a Trackable with no scope influence."""
   return trackable._add_variable_with_custom_getter(  # pylint: disable=protected-access
-      name=name, shape=shape, dtype=dtype,
-      initializer=initializer, getter=_default_getter, trainable=trainable)
+      name=name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      getter=_default_getter,
+      trainable=trainable)
 
 
 def object_metadata(save_path):
@@ -313,6 +417,7 @@
   Args:
     save_path: The path to the checkpoint, as returned by `save` or
       `tf.train.latest_checkpoint`.
+
   Returns:
     A parsed `tf.contrib.checkpoint.TrackableObjectGraph` protocol buffer.
   Raises:
@@ -320,16 +425,14 @@
   """
   reader = pywrap_tensorflow.NewCheckpointReader(save_path)
   try:
-    object_graph_string = reader.get_tensor(
-        base.OBJECT_GRAPH_PROTO_KEY)
+    object_graph_string = reader.get_tensor(base.OBJECT_GRAPH_PROTO_KEY)
   except errors_impl.NotFoundError:
     raise ValueError(
         ('The specified checkpoint "%s" does not appear to be object-based (it '
          'is missing the key "%s"). Likely it was created with a name-based '
-         'saver and does not contain an object dependency graph.') % (
-             save_path, base.OBJECT_GRAPH_PROTO_KEY))
-  object_graph_proto = (
-      trackable_object_graph_pb2.TrackableObjectGraph())
+         "saver and does not contain an object dependency graph.") %
+        (save_path, base.OBJECT_GRAPH_PROTO_KEY))
+  object_graph_proto = (trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   return object_graph_proto
 
@@ -343,8 +446,8 @@
   (i.e. if they would be saved with a checkpoint).
 
   Args:
-    root_trackable: A `Trackable` object whose dependencies should be
-      flattened.
+    root_trackable: A `Trackable` object whose dependencies should be flattened.
+
   Returns:
     A flat list of objects.
   """
@@ -362,12 +465,16 @@
 
   Args:
     root_trackable: A `Trackable` object to gather initializers for.
+
   Returns:
     A list of initialization ops.
   """
   trackable_objects = list_objects(root_trackable)
-  return [c.initializer for c in trackable_objects
-          if hasattr(c, "initializer") and c.initializer is not None]
+  return [
+      c.initializer
+      for c in trackable_objects
+      if hasattr(c, "initializer") and c.initializer is not None
+  ]
 
 
 @tf_contextlib.contextmanager
@@ -380,7 +487,7 @@
   object to add dependencies on variables created in a block of code which is
   not aware of object-based saving (and instead uses variable names
   heavily). This is how `Template` objects add dependencies on variables and
-  sub-`Template`s. Where possible, use `tf.make_template` directly.
+  sub-`Template`s. Where possible, use `tf.compat.v1.make_template` directly.
 
   Args:
     template: The `Template` object to register dependencies with.
@@ -390,8 +497,11 @@
   """
   name_prefix = template.variable_scope.name
 
-  def _trackable_custom_creator(next_creator, name, initial_value,
-                                trackable_parent=None, **kwargs):
+  def _trackable_custom_creator(next_creator,
+                                name,
+                                initial_value,
+                                trackable_parent=None,
+                                **kwargs):
     """A variable creation hook which adds Trackable dependencies.
 
     Set for example during a `Template`'s first wrapped function
@@ -415,21 +525,20 @@
       initial_value: See `variable_scope.variable_creator_scope`. Taken
         explicitly so the argument can be re-named and used with
         `Trackable._add_variable_with_custom_getter`.
-      trackable_parent: If not None, a more deeply nested trackable
-        object and its name prefix which were passed to `capture_dependencies`
-        to add a dependency on (rather than depending on the variable directly).
+      trackable_parent: If not None, a more deeply nested trackable object and
+        its name prefix which were passed to `capture_dependencies` to add a
+        dependency on (rather than depending on the variable directly).
       **kwargs: Passed through to the next creator.
 
     Returns:
       The output of `next_creator`: the fetched/created variable object.
     """
+
     def _call_next_creator_renaming_initializer(initializer, **inner_kwargs):
       inner_kwargs.pop("name")  # Ignored; this is the scope-stripped name which
       # we don't want to propagate.
-      return next_creator(
-          initial_value=initializer,
-          name=name,
-          **inner_kwargs)
+      return next_creator(initial_value=initializer, name=name, **inner_kwargs)
+
     if name is not None and name.startswith(name_prefix):
       scope_stripped_name = name[len(name_prefix) + 1:]
       if not trackable_parent:
@@ -450,8 +559,10 @@
             name=parent_name_prefix[len(name_prefix) + 1:],
             overwrite=True)
     return next_creator(
-        name=name, initial_value=initial_value,
-        trackable_parent=(template, name_prefix), **kwargs)
+        name=name,
+        initial_value=initial_value,
+        trackable_parent=(template, name_prefix),
+        **kwargs)
 
   with variable_scope.variable_creator_scope(_trackable_custom_creator):
     yield
@@ -485,21 +596,24 @@
     """Runs restore ops from the checkpoint, or initializes variables."""
     pass
 
+  def expect_partial(self):
+    """Silence incomplete-restore warnings. Here this just returns `self`."""
+    return self
+
 
 def streaming_restore(status, session=None):
   """When graph building, runs restore ops as soon as they come in.
 
   Args:
-    status: A _LoadStatus objects from an object-based saver's
-      restore(). Streaming restore from name-based checkpoints is not currently
-      supported.
+    status: A _LoadStatus object from an object-based saver's restore().
+      Streaming restore from name-based checkpoints is not currently supported.
     session: A session to run new restore ops in.
   """
   if context.executing_eagerly():
     # Streaming restore is the default/only behavior when executing eagerly.
     return
   if session is None:
-    session = ops.get_default_session()
+    session = keras_backend.get_session()
   if isinstance(status, NameBasedSaverStatus):
     raise NotImplementedError(
         "Streaming restore not supported from name-based checkpoints. File a "
@@ -545,21 +659,31 @@
         or if there are any checkpointed values which have not been matched to
         Python objects.
     """
+    pretty_printer = _ObjectGraphProtoPrettyPrinter(
+        self._checkpoint.object_graph_proto)
     self.assert_existing_objects_matched()
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
       trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
       if trackable is None:
-        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
+        raise AssertionError("Unresolved object in checkpoint {}: {}"
+                             .format(pretty_printer.node_names[node_id], node))
     if self._checkpoint.slot_restorations:
       # Sanity check; this collection should be clear if everything has been
       # restored.
-      raise AssertionError("Unresolved slot restorations: %s" % (
-          self._checkpoint.slot_restorations,))
+      raise AssertionError("Unresolved slot restorations: %s" %
+                           (self._checkpoint.slot_restorations,))
     if self._checkpoint.unused_attributes:
+      unused_attribute_messages = []
+      for node_id, attribute in six.iteritems(
+          self._checkpoint.unused_attributes):
+        obj = self._checkpoint.object_by_proto_id[node_id]
+        unused_attribute_messages.append(
+            "{} ({}): {}"
+            .format(pretty_printer.node_names[node_id], obj, attribute))
       raise AssertionError(
           ("Unused attributes in these objects (the attributes exist in the "
-           "checkpoint but not in the objects): %s") % (
-               list(self._checkpoint.unused_attributes.items()),))
+           "checkpoint but were not restored):\n{}")
+          .format("\n".join(unused_attribute_messages)))
     return self
 
   def assert_existing_objects_matched(self):
@@ -581,10 +705,10 @@
     """
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
       trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if (trackable is not None
-          and trackable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
-        raise AssertionError(
-            "Object not assigned a value from checkpoint: %s" % (node,))
+      if (trackable is not None and
+          trackable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
+        raise AssertionError("Object not assigned a value from checkpoint: %s" %
+                             (node,))
     for trackable_object in self._graph_view.list_objects():
       # Remove data structures that do not contain any variables from
       # restoration checks.
@@ -594,14 +718,14 @@
         continue
       self._checkpoint.all_python_objects.add(trackable_object)
     unused_python_objects = (
-        object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects)
-        - object_identity.ObjectIdentitySet(
+        object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects) -
+        object_identity.ObjectIdentitySet(
             self._checkpoint.object_by_proto_id.values()))
     if unused_python_objects:
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
-           "due to changes in the Python program: %s")
-          % (list(unused_python_objects),))
+           "due to changes in the Python program: %s") %
+          (list(unused_python_objects),))
     return self
 
   def assert_nontrivial_match(self):
@@ -610,8 +734,7 @@
       self._checkpoint.all_python_objects.add(trackable_object)
     if len(self._checkpoint.object_by_proto_id) <= 1:
       unused_python_objects = (
-          object_identity.ObjectIdentitySet(
-              self._checkpoint.all_python_objects)
+          object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects)
           - object_identity.ObjectIdentitySet(
               self._checkpoint.object_by_proto_id.values()))
       if unused_python_objects:
@@ -622,8 +745,8 @@
              "checkpointed value: %s") % (list(unused_python_objects),))
       else:
         raise AssertionError(
-            "Nothing to load. No dependencies have been added to %s yet." % (
-                self._graph_view.root,))
+            "Nothing to load. No dependencies have been added to %s yet." %
+            (self._graph_view.root,))
     return self
 
   def run_restore_ops(self, session=None):
@@ -631,7 +754,7 @@
     if context.executing_eagerly():
       return  # Run eagerly
     if session is None:
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
     session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
 
   def initialize_or_restore(self, session=None):
@@ -652,7 +775,7 @@
     if context.executing_eagerly():
       return  # Initialization and restoration ops are run eagerly
     if session is None:
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
     all_objects = self._graph_view.list_objects()
     already_initialized_objects = object_identity.ObjectIdentitySet(
         self._checkpoint.object_by_proto_id.values())
@@ -665,6 +788,11 @@
     self.run_restore_ops(session=session)
     session.run(initializers_for_non_restored_variables)
 
+  def expect_partial(self):
+    """Flag the checkpoint as expected-partial to silence restore warnings."""
+    self._checkpoint.expect_partial = True
+    return self
+
 
 class InitializationOnlyStatus(_LoadStatus):
   """Returned from `Saver.restore` when no checkpoint has been specified.
@@ -725,7 +853,7 @@
     if context.executing_eagerly():
       return  # run eagerly
     if session is None:
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
     trackable_objects = self._graph_view.list_objects()
     initializers = [
         c.initializer for c in trackable_objects
@@ -760,8 +888,8 @@
     unused_attributes = dict(self._checkpoint.unused_attributes)
     if unused_attributes:
       raise AssertionError(
-          "Some objects had attributes which were not restored: %s"
-          % (unused_attributes,))
+          "Some objects had attributes which were not restored: %s" %
+          (unused_attributes,))
     for trackable in self._graph_view.list_objects():
       # pylint: disable=protected-access
       trackable._maybe_initialize_trackable()
@@ -799,16 +927,15 @@
         continue
       # pylint: enable=protected-access
       saveable_objects.extend(
-          self._checkpoint.globally_named_object_attributes(
-              trackable))
+          self._checkpoint.globally_named_object_attributes(trackable))
     return saveable_objects
 
   def run_restore_ops(self, session=None):
-    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
+    """Load the name-based checkpoint using a new `tf.compat.v1.train.Saver`."""
     if context.executing_eagerly():
       return  # Nothing to do, variables are restored on creation.
     if session is None:
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
     with ops.device("/cpu:0"):
       saveables = self._gather_saveable_objects()
       v1_saver_lib.Saver(saveables).restore(
@@ -840,7 +967,8 @@
   """Saves and restores a `Trackable` object and its dependencies.
 
   See `Trackable` for details of dependency management. `Saver` wraps
-  `tf.train.Saver` for saving, including extra information about the graph of
+  `tf.compat.v1.train.Saver` for saving, including extra information about the
+  graph of
   dependencies between Python objects. When restoring, it uses this information
   about the save-time dependency graph to more robustly match objects with their
   checkpointed values. When executing eagerly, it supports restoring variables
@@ -851,7 +979,8 @@
   checkpoint was written. To avoid breaking existing checkpoints when modifying
   a class, dependency names (the names of attributes to which `Trackable`
   objects are assigned) may not change. These names are local to objects, in
-  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
+  contrast to the `Variable.name`-based save/restore from
+  `tf.compat.v1.train.Saver`, and
   so allow additional program transformations.
   """
 
@@ -877,8 +1006,7 @@
     self._restore_op_cache = {}
     self._graph_view = graph_view
 
-  def _gather_saveables(
-      self, object_graph_tensor=None):
+  def _gather_saveables(self, object_graph_tensor=None):
     """Wraps _serialize_object_graph to include the object graph proto."""
     (named_saveable_objects, graph_proto,
      feed_additions) = self._graph_view.serialize_object_graph()
@@ -892,14 +1020,12 @@
     assert base.OBJECT_GRAPH_PROTO_KEY not in named_saveable_objects
     named_saveable_objects.append(
         base.NoRestoreSaveable(
-            tensor=object_graph_tensor,
-            name=base.OBJECT_GRAPH_PROTO_KEY))
+            tensor=object_graph_tensor, name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def _save_cached_when_graph_building(
-      self,
-      file_prefix,
-      object_graph_tensor=None):
+  def _save_cached_when_graph_building(self,
+                                       file_prefix,
+                                       object_graph_tensor=None):
     """Create or retrieve save ops.
 
     Args:
@@ -921,8 +1047,7 @@
         # save() is called so they pick up new Tensors passed to their
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
-        or context.executing_eagerly()
-        or ops.inside_function()):
+        or context.executing_eagerly() or ops.inside_function()):
       saver = functional_saver.MultiDeviceSaver(named_saveable_objects)
       save_op = saver.save(file_prefix)
       with ops.device("/cpu:0"):
@@ -954,8 +1079,8 @@
       The full path to the checkpoint.
     """
     feed_dict = {}
-    use_session = (not context.executing_eagerly()
-                   and not ops.inside_function())
+    use_session = (not context.executing_eagerly() and
+                   not ops.inside_function())
     if checkpoint_number:
       file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
     if use_session:
@@ -976,14 +1101,13 @@
 
     file_io.recursive_create_dir(os.path.dirname(file_prefix))
     save_path, new_feed_additions = self._save_cached_when_graph_building(
-        file_prefix=file_prefix_tensor,
-        object_graph_tensor=object_graph_tensor)
+        file_prefix=file_prefix_tensor, object_graph_tensor=object_graph_tensor)
     if new_feed_additions:
       feed_dict.update(new_feed_additions)
     if not use_session:
       session = None
     elif session is None:
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
 
     if session:
       return session.run(save_path, feed_dict=feed_dict)
@@ -1024,7 +1148,7 @@
     If the checkpoint has not been consumed completely, then the list of restore
     ops will grow as more objects are added to the dependency graph.
 
-    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    Name-based `tf.compat.v1.train.Saver` checkpoints can be loaded using this
     method. There is no deferred loading, and names are used to match
     variables. No restore ops are created/run until `run_restore_ops()` or
     `initialize_or_restore()` are called on the returned status object, even
@@ -1035,9 +1159,9 @@
       save_path: The path to the checkpoint, as returned by `save` or
         `tf.train.latest_checkpoint`. If None (as when there is no latest
         checkpoint for `tf.train.latest_checkpoint` to return), returns an
-        object which may run initializers for objects in the dependency
-        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
-        names are used to match variables.
+        object which may run initializers for objects in the dependency graph.
+        If the checkpoint was written by the name-based
+        `tf.compat.v1.train.Saver`, names are used to match variables.
 
     Returns:
       A load status object, which can be used to make assertions about the
@@ -1057,8 +1181,7 @@
     else:
       dtype_map = reader.get_variable_to_dtype_map()
     try:
-      object_graph_string = reader.get_tensor(
-          base.OBJECT_GRAPH_PROTO_KEY)
+      object_graph_string = reader.get_tensor(base.OBJECT_GRAPH_PROTO_KEY)
     except errors_impl.NotFoundError:
       # The object graph proto does not exist in this checkpoint. Try the
       # name-based compatibility mode.
@@ -1069,8 +1192,7 @@
           # pylint: disable=protected-access
           existing_trackable._maybe_initialize_trackable()
           existing_trackable._name_based_restores.add(restore_coordinator)
-          existing_trackable._name_based_attribute_restore(
-              restore_coordinator)
+          existing_trackable._name_based_attribute_restore(restore_coordinator)
           # pylint: enable=protected-access
       return NameBasedSaverStatus(
           restore_coordinator, graph_view=self._graph_view)
@@ -1085,8 +1207,7 @@
       with ops.device("/cpu:0"):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
-    object_graph_proto = (
-        trackable_object_graph_pb2.TrackableObjectGraph())
+    object_graph_proto = (trackable_object_graph_pb2.TrackableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
     checkpoint = _CheckpointRestoreCoordinator(
         object_graph_proto=object_graph_proto,
@@ -1094,8 +1215,8 @@
         save_path_tensor=file_prefix_tensor,
         restore_op_cache=self._restore_op_cache,
         graph_view=self._graph_view)
-    base.CheckpointPosition(checkpoint=checkpoint, proto_id=0).restore(
-        self._graph_view.root)
+    base.CheckpointPosition(
+        checkpoint=checkpoint, proto_id=0).restore(self._graph_view.root)
     load_status = CheckpointLoadStatus(
         checkpoint,
         graph_view=self._graph_view,
@@ -1104,7 +1225,7 @@
 
 
 def frozen_saver(root_trackable):
-  """Creates a static `tf.train.Saver` from a trackable object.
+  """Creates a static `tf.compat.v1.train.Saver` from a trackable object.
 
   The returned `Saver` saves object-based checkpoints, but these checkpoints
   will no longer reflect structural changes to the object graph, only changes to
@@ -1135,9 +1256,9 @@
     saveables_cache = None
   else:
     saveables_cache = object_identity.ObjectIdentityWeakKeyDictionary()
-  return TrackableSaver(graph_view_lib.ObjectGraphView(
-      weakref.ref(obj),
-      saveables_cache=saveables_cache))
+  return TrackableSaver(
+      graph_view_lib.ObjectGraphView(
+          weakref.ref(obj), saveables_cache=saveables_cache))
 
 
 # Mentions graph building / Sessions. The v2 version is below.
@@ -1146,7 +1267,7 @@
   """Groups trackable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
-  that contain trackable state, such as `tf.train.Optimizer`
+  that contain trackable state, such as `tf.compat.v1.train.Optimizer`
   implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
   `tf.keras.Model` implementations. It saves these values with a checkpoint, and
   maintains a `save_counter` for numbering checkpoints.
@@ -1164,7 +1285,7 @@
   status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
   train_op = optimizer.minimize( ... )
   status.assert_consumed()  # Optional sanity checks.
-  with tf.Session() as session:
+  with tf.compat.v1.Session() as session:
     # Use the Session to restore variables, or initialize them if
     # tf.train.latest_checkpoint returned None.
     status.initialize_or_restore(session)
@@ -1179,7 +1300,7 @@
   import tensorflow as tf
   import os
 
-  tf.enable_eager_execution()
+  tf.compat.v1.enable_eager_execution()
 
   checkpoint_directory = "/tmp/training_checkpoints"
   checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -1193,13 +1314,14 @@
   ```
 
   `Checkpoint.save` and `Checkpoint.restore` write and read object-based
-  checkpoints, in contrast to `tf.train.Saver` which writes and reads
+  checkpoints, in contrast to `tf.compat.v1.train.Saver` which writes and reads
   `variable.name` based checkpoints. Object-based checkpointing saves a graph of
   dependencies between Python objects (`Layer`s, `Optimizer`s, `Variable`s,
   etc.) with named edges, and this graph is used to match variables when
   restoring a checkpoint. It can be more robust to changes in the Python
   program, and helps to support restore-on-create for variables when executing
-  eagerly. Prefer `tf.train.Checkpoint` over `tf.train.Saver` for new code.
+  eagerly. Prefer `tf.train.Checkpoint` over `tf.compat.v1.train.Saver` for new
+  code.
 
   `Checkpoint` objects have dependencies on the objects passed as keyword
   arguments to their constructors, and each dependency is given a name that is
@@ -1233,6 +1355,16 @@
   as a single checkpoint. This avoids copying all variables to one worker, but
   does require that all workers see a common filesystem.
 
+  While `tf.keras.Model.save_weights` and `tf.train.Checkpoint.save` save in the
+  same format, note that the root of the resulting checkpoint is the object the
+  save method is attached to. This means saving a `tf.keras.Model` using
+  `save_weights` and loading into a `tf.train.Checkpoint` with a `Model`
+  attached (or vice versa) will not match the `Model`'s variables. See the
+  [guide to training
+  checkpoints](https://www.tensorflow.org/alpha/guide/checkpoints) for
+  details. Prefer `tf.train.Checkpoint` over `tf.keras.Model.save_weights` for
+  training checkpoints.
+
   Attributes:
     save_counter: Incremented when `save()` is called. Used to number
       checkpoints.
@@ -1244,6 +1376,7 @@
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
         saved with the checkpoint. Values must be trackable objects.
+
     Raises:
       ValueError: If objects in `kwargs` are not trackable.
     """
@@ -1269,8 +1402,12 @@
         # add_variable creates a dependency named "save_counter"; NoDependency
         # prevents creating a second dependency named "_save_counter".
         self._save_counter = data_structures.NoDependency(
-            add_variable(self, name="save_counter", initializer=0,
-                         dtype=dtypes.int64, trainable=False))
+            add_variable(
+                self,
+                name="save_counter",
+                initializer=0,
+                dtype=dtypes.int64,
+                trainable=False))
 
   def write(self, file_prefix, session=None):
     """Writes a training checkpoint.
@@ -1294,9 +1431,7 @@
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    output = self._saver.save(
-        file_prefix=file_prefix,
-        session=session)
+    output = self._saver.save(file_prefix=file_prefix, session=session)
     if tensor_util.is_tensor(output):
       if context.executing_eagerly():
         return compat.as_str(output.numpy())
@@ -1355,7 +1490,7 @@
             "update metadata. tf.train.latest_checkpoint and related APIs will "
             "not see this checkpoint.")
       if session is None:
-        session = ops.get_default_session()
+        session = keras_backend.get_session()
       if self._save_counter is None:
         # When graph building, if this is a new save counter variable then it
         # needs to be initialized before assign_add. This is only an issue if
@@ -1370,8 +1505,8 @@
       checkpoint_number = session.run(self._save_assign_op)
     else:
       checkpoint_number = assign_op.numpy()
-    file_path = self.write("%s-%d" % (file_prefix, checkpoint_number),
-                           session=session)
+    file_path = self.write(
+        "%s-%d" % (file_prefix, checkpoint_number), session=session)
     checkpoint_management.update_checkpoint_state_internal(
         save_dir=os.path.dirname(file_prefix),
         model_checkpoint_path=file_path,
@@ -1417,7 +1552,7 @@
     If the checkpoint has not been consumed completely, then the list of restore
     ops will grow as more objects are added to the dependency graph.
 
-    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    Name-based `tf.compat.v1.train.Saver` checkpoints can be loaded using this
     method. Names are used to match variables. No restore ops are created/run
     until `run_restore_ops()` or `initialize_or_restore()` are called on the
     returned status object when graph building, but there is restore-on-creation
@@ -1428,9 +1563,9 @@
       save_path: The path to the checkpoint, as returned by `save` or
         `tf.train.latest_checkpoint`. If None (as when there is no latest
         checkpoint for `tf.train.latest_checkpoint` to return), returns an
-        object which may run initializers for objects in the dependency
-        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
-        names are used to match variables.
+        object which may run initializers for objects in the dependency graph.
+        If the checkpoint was written by the name-based
+        `tf.compat.v1.train.Saver`, names are used to match variables.
 
     Returns:
       A load status object, which can be used to make assertions about the
@@ -1453,7 +1588,8 @@
           built, and so has not created any variables, will pass this assertion
           but fail `assert_consumed`. Useful when loading part of a larger
           checkpoint into a new Python program, e.g. a training checkpoint with
-          a `tf.train.Optimizer` was saved but only the state required for
+          a `tf.compat.v1.train.Optimizer` was saved but only the state required
+          for
           inference is being loaded. This method returns the status object, and
           so may be chained with `initialize_or_restore` or `run_restore_ops`.
 
@@ -1463,6 +1599,11 @@
           checkpoint which haven't been created in Python and some Python
           objects may not have a checkpointed value.
 
+      * `expect_partial()`: Silence warnings about incomplete checkpoint
+          restores. Warnings are otherwise printed for unused parts of the
+          checkpoint file or object when the `Checkpoint` object is deleted
+          (often at program shutdown).
+
       * `initialize_or_restore(session=None)`:
           When graph building, runs variable initializers if `save_path` is
           `None`, but otherwise runs restore operations. If no `session` is
@@ -1488,7 +1629,7 @@
   """Groups trackable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
-  that contain trackable state, such as `tf.train.Optimizer`
+  that contain trackable state, such as `tf.keras.optimizers.Optimizer`
   implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
   `tf.keras.Model` implementations. It saves these values with a checkpoint, and
   maintains a `save_counter` for numbering checkpoints.
@@ -1511,7 +1652,8 @@
   ```
 
   `Checkpoint.save` and `Checkpoint.restore` write and read object-based
-  checkpoints, in contrast to TensorFlow 1.x's `tf.train.Saver` which writes and
+  checkpoints, in contrast to TensorFlow 1.x's `tf.compat.v1.train.Saver` which
+  writes and
   reads `variable.name` based checkpoints. Object-based checkpointing saves a
   graph of dependencies between Python objects (`Layer`s, `Optimizer`s,
   `Variable`s, etc.) with named edges, and this graph is used to match variables
@@ -1550,6 +1692,16 @@
   as a single checkpoint. This avoids copying all variables to one worker, but
   does require that all workers see a common filesystem.
 
+  While `tf.keras.Model.save_weights` and `tf.train.Checkpoint.save` save in the
+  same format, note that the root of the resulting checkpoint is the object the
+  save method is attached to. This means saving a `tf.keras.Model` using
+  `save_weights` and loading into a `tf.train.Checkpoint` with a `Model`
+  attached (or vice versa) will not match the `Model`'s variables. See the
+  [guide to training
+  checkpoints](https://www.tensorflow.org/alpha/guide/checkpoints) for
+  details. Prefer `tf.train.Checkpoint` over `tf.keras.Model.save_weights` for
+  training checkpoints.
+
   Attributes:
     save_counter: Incremented when `save()` is called. Used to number
       checkpoints.
@@ -1561,6 +1713,7 @@
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
         saved with the checkpoint. Values must be trackable objects.
+
     Raises:
       ValueError: If objects in `kwargs` are not trackable.
     """
@@ -1586,8 +1739,12 @@
         # add_variable creates a dependency named "save_counter"; NoDependency
         # prevents creating a second dependency named "_save_counter".
         self._save_counter = data_structures.NoDependency(
-            add_variable(self, name="save_counter", initializer=0,
-                         dtype=dtypes.int64, trainable=False))
+            add_variable(
+                self,
+                name="save_counter",
+                initializer=0,
+                dtype=dtypes.int64,
+                trainable=False))
 
   def write(self, file_prefix):
     """Writes a training checkpoint.
@@ -1608,8 +1765,7 @@
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    output = self._saver.save(
-        file_prefix=file_prefix)
+    output = self._saver.save(file_prefix=file_prefix)
     if tensor_util.is_tensor(output):
       if context.executing_eagerly():
         return compat.as_str(output.numpy())
@@ -1664,7 +1820,7 @@
             "tf.train.Checkpoint.write(), a lower-level API which does not "
             "update metadata. tf.train.latest_checkpoint and related APIs will "
             "not see this checkpoint.")
-      session = ops.get_default_session()
+      session = keras_backend.get_session()
       if self._save_counter is None:
         # When graph building, if this is a new save counter variable then it
         # needs to be initialized before assign_add. This is only an issue if
@@ -1711,7 +1867,8 @@
     were not found in the checkpoint, or if any checkpointed values do not have
     a matching Python object.
 
-    Name-based `tf.train.Saver` checkpoints from TensorFlow 1.x can be loaded
+    Name-based `tf.compat.v1.train.Saver` checkpoints from TensorFlow 1.x can be
+    loaded
     using this method. Names are used to match variables. Re-encode name-based
     checkpoints using `tf.train.Checkpoint.save` as soon as possible.
 
@@ -1719,9 +1876,9 @@
       save_path: The path to the checkpoint, as returned by `save` or
         `tf.train.latest_checkpoint`. If None (as when there is no latest
         checkpoint for `tf.train.latest_checkpoint` to return), returns an
-        object which may run initializers for objects in the dependency
-        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
-        names are used to match variables.
+        object which may run initializers for objects in the dependency graph.
+        If the checkpoint was written by the name-based
+        `tf.compat.v1.train.Saver`, names are used to match variables.
 
     Returns:
       A load status object, which can be used to make assertions about the
@@ -1744,7 +1901,8 @@
           built, and so has not created any variables, will pass this assertion
           but fail `assert_consumed`. Useful when loading part of a larger
           checkpoint into a new Python program, e.g. a training checkpoint with
-          a `tf.train.Optimizer` was saved but only the state required for
+          a `tf.keras.optimizers.Optimizer` was saved but only the state
+          required for
           inference is being loaded. This method returns the status object, and
           so may be chained with other assertions.
 
@@ -1753,6 +1911,11 @@
           sanity checking in library code where objects may exist in the
           checkpoint which haven't been created in Python and some Python
           objects may not have a checkpointed value.
+
+      * `expect_partial()`: Silence warnings about incomplete checkpoint
+          restores. Warnings are otherwise printed for unused parts of the
+          checkpoint file or object when the `Checkpoint` object is deleted
+          (often at program shutdown).
     """
     status = self._saver.restore(save_path=save_path)
     # Create the save counter now so it gets initialized with other variables
diff --git a/tensorflow/python/training/tracking/util_test.py b/tensorflow/python/training/tracking/util_test.py
index df2f463..a08fe86 100644
--- a/tensorflow/python/training/tracking/util_test.py
+++ b/tensorflow/python/training/tracking/util_test.py
@@ -17,21 +17,20 @@
 from __future__ import print_function
 
 import functools
-import json
 import os
+import weakref
 
 from absl.testing import parameterized
 import six
 
-from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import input_layer
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
@@ -43,6 +42,8 @@
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
@@ -87,6 +88,17 @@
     model.l1 = layer_one
     self.assertEqual([layer_one, layer_two], model.layers)
 
+  def testSaveWithOnlyKerasSession(self):
+
+    with ops.Graph().as_default():
+      inp = input_layer.Input([1])
+      dense = core.Dense(1)(inp)
+      model = training.Model(inp, dense)
+      model.compile(optimizer="sgd", loss="mse")
+      model.fit([1.], [2.])
+      checkpoint = trackable_utils.Checkpoint(model=model)
+      checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testAddVariable(self):
     obj = NonLayerTrackable()
@@ -312,12 +324,6 @@
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
-    expected_checkpoint_names.append(
-        "optimizer/.ATTRIBUTES/OBJECT_CONFIG_JSON")
-    # The Dense layers also save get_config() JSON
-    expected_checkpoint_names.extend(
-        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
-         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
     named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
@@ -618,6 +624,84 @@
         self.fail("%s should have suffix %s" % (path, expected_suffix))
       self.evaluate(step.assign_add(2))
 
+  def testPartialRestoreWarningObject(self):
+    with context.eager_mode():
+      optimizer = adam.Adam(0.0)
+      original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.),
+                                                 v2=variables_lib.Variable(3.),
+                                                 optimizer=optimizer)
+      # Create a slot variable to save
+      optimizer.minimize(original_root.v1.read_value, [original_root.v1])
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      save_path = original_root.save(prefix)
+      partial_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(0.))
+      weak_partial_root = weakref.ref(partial_root)
+      weak_v1 = weakref.ref(partial_root.v1)
+      partial_root.restore(save_path)
+      self.assertEqual(2., partial_root.v1.numpy())
+      with test.mock.patch.object(logging, "warning") as mock_log:
+        del partial_root
+        self.assertIsNone(weak_partial_root())
+        self.assertIsNone(weak_v1())
+        messages = str(mock_log.call_args_list)
+      self.assertIn("(root).v2'", messages)
+      self.assertIn("(root).optimizer's state 'm' for (root).v1", messages)
+      self.assertNotIn("(root).v1'", messages)
+      self.assertIn("expect_partial()", messages)
+
+  def testPartialRestoreWarningAttribute(self):
+    with context.eager_mode():
+      original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.),
+                                                 v2=variables_lib.Variable(3.))
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      save_path = original_root.save(prefix)
+      partial_root = trackable_utils.Checkpoint(v1=base.Trackable(),
+                                                v2=variables_lib.Variable(0.))
+      weak_partial_root = weakref.ref(partial_root)
+      with test.mock.patch.object(logging, "warning") as mock_log:
+        # Note: Unlike in testPartialRestoreWarningObject, the warning actually
+        # prints immediately here, since all of the objects have been created
+        # and there's no deferred restoration sitting around.
+        partial_root.restore(save_path)
+        self.assertEqual(3., partial_root.v2.numpy())
+        del partial_root
+        self.assertIsNone(weak_partial_root())
+        messages = str(mock_log.call_args_list)
+      self.assertIn("(root).v1", messages)
+      self.assertNotIn("(root).v2", messages)
+      self.assertIn("expect_partial()", messages)
+
+  def testAttributeException(self):
+    with context.eager_mode():
+      original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.),
+                                                 v2=variables_lib.Variable(3.))
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      save_path = original_root.save(prefix)
+      partial_root = trackable_utils.Checkpoint(v1=base.Trackable(),
+                                                v2=variables_lib.Variable(0.))
+      status = partial_root.restore(save_path)
+      with self.assertRaisesRegexp(
+          AssertionError,
+          r"Unused attributes(.|\n)*\(root\).v1"):
+        status.assert_consumed()
+
+  def testSilencePartialWarning(self):
+    with context.eager_mode():
+      original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.),
+                                                 v2=variables_lib.Variable(3.))
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      save_path = original_root.save(prefix)
+      partial_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(0.))
+      weak_partial_root = weakref.ref(partial_root)
+      weak_v1 = weakref.ref(partial_root.v1)
+      partial_root.restore(save_path).expect_partial()
+      self.assertEqual(2., partial_root.v1.numpy())
+      with test.mock.patch.object(logging, "warning") as mock_log:
+        del partial_root
+        self.assertIsNone(weak_partial_root())
+        self.assertIsNone(weak_v1())
+        self.assertEmpty(mock_log.call_args_list)
+
   # pylint: disable=cell-var-from-loop
   @test_util.run_in_graph_and_eager_modes
   @test_util.run_v1_only("b/120545219")
@@ -1528,47 +1612,6 @@
         self._check_sentinels(root)
 
 
-class PythonMetadataTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testSaveLoad(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dense = core.Dense(1)
-    checkpoint = trackable_utils.Checkpoint(dense=dense)
-    dense(constant_op.constant([[1.]]))
-    checkpoint.restore(None).initialize_or_restore()
-    save_path = checkpoint.save(checkpoint_prefix)
-
-    def _get_dense_node_from_object_graph(object_graph_proto):
-      root_node = object_graph_proto.nodes[0]
-      for child in root_node.children:
-        if child.local_name == "dense":
-          break
-      else:
-        raise AssertionError(
-            "Expected a 'dense' dependency of root, didn't find one.")
-      dense_node = object_graph_proto.nodes[child.node_id]  # pylint: disable=undefined-loop-variable
-      self.assertEqual(1, len(dense_node.attributes))
-      reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-      layer_json = reader.get_tensor(dense_node.attributes[0].checkpoint_key)
-      return json.loads(layer_json.decode("utf-8"))
-
-    layer_data = _get_dense_node_from_object_graph(
-        trackable_utils.object_metadata(save_path))
-    self.assertEqual("Dense", layer_data["class_name"])
-    self.assertEqual(1, layer_data["config"]["units"])
-
-    # Check that no new ops are added to the graph the second time we save.
-    ops.get_default_graph().finalize()
-
-    dense.units = 42
-    save_path = checkpoint.save(checkpoint_prefix)
-    layer_data = _get_dense_node_from_object_graph(
-        trackable_utils.object_metadata(save_path))
-    self.assertEqual("Dense", layer_data["class_name"])
-    self.assertEqual(42, layer_data["config"]["units"])
-
-
 if __name__ == "__main__":
+  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
index e00131a..c36790b 100644
--- a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
+++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
@@ -125,10 +125,6 @@
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
-    # The Dense layers also save get_config() JSON
-    expected_checkpoint_names.extend(
-        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
-         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
     named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 86f1b4d..47070f5 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Utility functions for training."""
 from __future__ import absolute_import
 from __future__ import division
@@ -34,7 +33,6 @@
 # collection keys.
 GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 
-
 # TODO(drpng): remove this after legacy uses are resolved.
 write_graph = graph_io.write_graph
 
@@ -47,11 +45,12 @@
   # Create a variable to hold the global_step.
   global_step_tensor = tf.Variable(10, trainable=False, name='global_step')
   # Create a session.
-  sess = tf.Session()
+  sess = tf.compat.v1.Session()
   # Initialize the variable
   sess.run(global_step_tensor.initializer)
   # Get the variable value.
-  print('global_step: %s' % tf.train.global_step(sess, global_step_tensor))
+  print('global_step: %s' %
+        tf.compat.v1.train.global_step(sess, global_step_tensor))
 
   global_step: 10
   ```
@@ -109,8 +108,8 @@
   """Create global step tensor in graph.
 
   Args:
-    graph: The graph in which to create the global step tensor. If missing,
-      use default graph.
+    graph: The graph in which to create the global step tensor. If missing, use
+      default graph.
 
   Returns:
     Global step tensor.
@@ -130,8 +129,9 @@
           initializer=init_ops.zeros_initializer(),
           trainable=False,
           aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA,
-          collections=[ops.GraphKeys.GLOBAL_VARIABLES,
-                       ops.GraphKeys.GLOBAL_STEP])
+          collections=[
+              ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP
+          ])
   # Create in proper graph and base name_scope.
   with graph.as_default() as g, g.name_scope(None):
     return variable_scope.get_variable(
@@ -141,8 +141,7 @@
         initializer=init_ops.zeros_initializer(),
         trainable=False,
         aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES,
-                     ops.GraphKeys.GLOBAL_STEP])
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
 
 
 @tf_export(v1=['train.get_or_create_global_step'])
@@ -173,9 +172,8 @@
   if not (isinstance(global_step_tensor, variables.Variable) or
           isinstance(global_step_tensor, ops.Tensor) or
           resource_variable_ops.is_resource_variable(global_step_tensor)):
-    raise TypeError(
-        'Existing "global_step" must be a Variable or Tensor: %s.' %
-        global_step_tensor)
+    raise TypeError('Existing "global_step" must be a Variable or Tensor: %s.' %
+                    global_step_tensor)
 
   if not global_step_tensor.dtype.base_dtype.is_integer:
     raise TypeError('Existing "global_step" does not have integer type: %s' %
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index 3154248..d73d437 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -49,8 +49,8 @@
   VocabInfo to warm-start.
 
   Attributes:
-    new_vocab: [Required] A path to the new vocabulary file (used with the
-      model to be trained).
+    new_vocab: [Required] A path to the new vocabulary file (used with the model
+      to be trained).
     new_vocab_size: [Required] An integer indicating how many entries of the new
       vocabulary will used in training.
     num_oov_buckets: [Required] An integer indicating how many OOV buckets are
@@ -76,7 +76,7 @@
           num_oov_buckets=1,
           old_vocab='pretrained_embeddings_vocab',
           old_vocab_size=10000,
-          backup_initializer=tf.truncated_normal_initializer(
+          backup_initializer=tf.compat.v1.truncated_normal_initializer(
               mean=0.0, stddev=(1 / math.sqrt(embedding_dim))),
           axis=0)
 
@@ -86,7 +86,7 @@
           num_oov_buckets=0,  # No OOV for classes.
           old_vocab='old_class_vocab',
           old_vocab_size=8,
-          backup_initializer=tf.glorot_uniform_initializer(),
+          backup_initializer=tf.compat.v1.glorot_uniform_initializer(),
           axis=1)
 
       softmax_output_layer_bias_vocab_info = tf.VocabInfo(
@@ -95,7 +95,7 @@
           num_oov_buckets=0,  # No OOV for classes.
           old_vocab='old_class_vocab',
           old_vocab_size=8,
-          backup_initializer=tf.zeros_initializer(),
+          backup_initializer=tf.compat.v1.zeros_initializer(),
           axis=0)
 
       Currently, only axis=0 and axis=1 are supported.
@@ -255,8 +255,7 @@
     partition_info = None
     if slice_info:
       partition_info = variable_scope._PartitionInfo(
-          full_shape=slice_info.full_shape,
-          var_offset=slice_info.var_offset)
+          full_shape=slice_info.full_shape, var_offset=slice_info.var_offset)
 
     if axis == 0:
       new_row_vocab_size = current_vocab_size
@@ -301,6 +300,8 @@
     new_init_val = ops.convert_to_tensor(
         init(shape=v_shape, partition_info=partition_info))
     v._initializer_op = state_ops.assign(v, new_init_val)
+
+
 # pylint: enable=protected-access
 
 
@@ -314,12 +315,12 @@
     vars_to_warm_start: One of the following:
 
       - A regular expression (string) that captures which variables to
-        warm-start (see tf.get_collection).  This expression will only consider
-        variables in the TRAINABLE_VARIABLES collection.
-      - A list of Variables to warm-start.
+        warm-start (see tf.compat.v1.get_collection).  This expression will
+        only consider variables in the TRAINABLE_VARIABLES collection.
       - A list of strings, each representing a full variable name to warm-start.
-      - `None`, in which case only variables specified in
-        `var_name_to_vocab_info` will be warm-started.
+        These will consider variables in the GLOBAL_VARIABLES collection.
+      - A list of Variables to warm-start.
+      - `None`, in which case all variables in TRAINABLE_VARIABLES will be used.
   Returns:
     A dictionary mapping variable names (strings) to lists of Variables.
   Raises:
@@ -329,15 +330,15 @@
   if isinstance(vars_to_warm_start, str) or vars_to_warm_start is None:
     # Both vars_to_warm_start = '.*' and vars_to_warm_start = None will match
     # everything (in TRAINABLE_VARIABLES) here.
+    logging.info("Warm-starting variables only in TRAINABLE_VARIABLES.")
     list_of_vars = ops.get_collection(
-        ops.GraphKeys.TRAINABLE_VARIABLES,
-        scope=vars_to_warm_start)
+        ops.GraphKeys.TRAINABLE_VARIABLES, scope=vars_to_warm_start)
   elif isinstance(vars_to_warm_start, list):
     if all(isinstance(v, str) for v in vars_to_warm_start):
       list_of_vars = []
       for v in vars_to_warm_start:
-        list_of_vars += ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                           scope=v)
+        list_of_vars += ops.get_collection(
+            ops.GraphKeys.GLOBAL_VARIABLES, scope=v)
     elif all(checkpoint_utils._is_variable(v) for v in vars_to_warm_start):  # pylint: disable=protected-access
       list_of_vars = vars_to_warm_start
     else:
@@ -377,17 +378,17 @@
     vars_to_warm_start: [Optional] One of the following:
 
       - A regular expression (string) that captures which variables to
-        warm-start (see tf.get_collection).  This expression will only consider
-        variables in the TRAINABLE_VARIABLES collection -- if you need to
-        warm-start non_TRAINABLE vars (such as optimizer accumulators or batch
-        norm statistics), please use the below option.
+        warm-start (see tf.compat.v1.get_collection).  This expression will only
+        consider variables in the TRAINABLE_VARIABLES collection -- if you need
+        to warm-start non_TRAINABLE vars (such as optimizer accumulators or
+        batch norm statistics), please use the below option.
+      - A list of strings, each a regex scope provided to
+        tf.compat.v1.get_collection with GLOBAL_VARIABLES (please see
+        tf.compat.v1.get_collection).  For backwards compatibility reasons,
+        this is separate from the single-string argument type.
       - A list of Variables to warm-start.  If you do not have access to the
-        `Variable` objects at the call site, please use the below option.
-      - A list of strings, each a regex scope provided to tf.get_collection with
-        GLOBAL_VARIABLES (please see tf.get_collection).  For backwards
-        compatibility reasons, this is separate from the single-string argument
-        type.
-      - `None`, in which case only variables specified in
+        `Variable` objects at the call site, please use the above option.
+      - `None`, in which case only TRAINABLE variables specified in
         `var_name_to_vocab_info` will be warm-started.
 
       Defaults to `'.*'`, which warm-starts all variables in the
@@ -404,6 +405,7 @@
       effect on the set of variables that is warm-started, and only controls
       name mapping (use `vars_to_warm_start` for controlling what variables to
       warm-start).
+
   Raises:
     ValueError: If the WarmStartSettings contains prev_var_name or VocabInfo
       configuration for variable names that are not used.  This is to ensure
@@ -416,6 +418,7 @@
     var_name_to_prev_var_name = {}
   logging.info("Warm-starting from: %s", (ckpt_to_initialize_from,))
   grouped_variables = _get_grouped_variables(vars_to_warm_start)
+  warmstarted_count = 0
 
   # Keep track of which var_names in var_name_to_prev_var_name and
   # var_name_to_vocab_info have been used.  Err on the safer side by throwing an
@@ -434,6 +437,7 @@
     vocab_info = var_name_to_vocab_info.get(var_name)
     if vocab_info:
       vocab_info_used.add(var_name)
+      warmstarted_count += 1
       logging.debug(
           "Warm-starting variable: {}; current_vocab: {} current_vocab_size: {}"
           " prev_vocab: {} prev_vocab_size: {} current_oov: {} prev_tensor: {}"
@@ -458,6 +462,7 @@
       # For the special value of vars_to_warm_start = None,
       # we only warm-start variables with explicitly specified vocabularies.
       if vars_to_warm_start:
+        warmstarted_count += 1
         logging.debug("Warm-starting variable: {}; prev_var_name: {}".format(
             var_name, prev_var_name or "Unchanged"))
         # Because we use a default empty list in grouped_variables, single
@@ -473,6 +478,8 @@
       var_name_to_prev_var_name.keys()) - prev_var_name_used
   vocab_info_not_used = set(var_name_to_vocab_info.keys()) - vocab_info_used
 
+  logging.info("Warm-started %d variables.", warmstarted_count)
+
   if prev_var_name_not_used:
     raise ValueError(
         "You provided the following variables in "
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 908cb75..4f31498 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -40,10 +40,12 @@
 
 
 def as_bytes(bytes_or_text, encoding='utf-8'):
-  """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
+  """Converts `bytearray`, `bytes`, or unicode python input types to `bytes`.
+
+  Uses utf-8 encoding for text by default.
 
   Args:
-    bytes_or_text: A `bytes`, `str`, or `unicode` object.
+    bytes_or_text: A `bytearray`, `bytes`, `str`, or `unicode` object.
     encoding: A string indicating the charset for encoding unicode.
 
   Returns:
@@ -52,7 +54,9 @@
   Raises:
     TypeError: If `bytes_or_text` is not a binary or unicode string.
   """
-  if isinstance(bytes_or_text, _six.text_type):
+  if isinstance(bytes_or_text, bytearray):
+    return bytes(bytes_or_text)
+  elif isinstance(bytes_or_text, _six.text_type):
     return bytes_or_text.encode(encoding)
   elif isinstance(bytes_or_text, bytes):
     return bytes_or_text
@@ -62,7 +66,10 @@
 
 
 def as_text(bytes_or_text, encoding='utf-8'):
-  """Returns the given argument as a unicode string.
+  """Converts any string-like python input types to unicode.
+
+  Returns the input as a unicode string. Uses utf-8 encoding for text
+  by default.
 
   Args:
     bytes_or_text: A `bytes`, `str`, or `unicode` object.
@@ -95,7 +102,10 @@
 
 @tf_export('compat.as_str_any')
 def as_str_any(value):
-  """Converts to `str` as `str(value)`, but use `as_str` for `bytes`.
+  """Converts input to `str` type.
+
+  Uses `str(value)`, except for `bytes` typed inputs, which are converted
+  using `as_str`.
 
   Args:
     value: A object that can be converted to `str`.
@@ -111,7 +121,10 @@
 
 @tf_export('compat.path_to_str')
 def path_to_str(path):
-  """Returns the file system path representation of a `PathLike` object, else as it is.
+  """Converts input which is a `PathLike` object to `str` type.
+
+  Converts from any python constant representation of a `PathLike` object to
+  a string. If the input is not a `PathLike` object, simply returns the input.
 
   Args:
     path: An object that can be converted to path representation.
@@ -125,7 +138,7 @@
 
   Examples:
   ```python3
-  >>> tf.compat.path_to_str('C:\XYZ\tensorflow\./.././tensorflow')
+  >>> tf.compat.path_to_str('C:\\XYZ\\tensorflow\\./.././tensorflow')
   'C:\\XYZ\\tensorflow\\./.././tensorflow' # Windows OS
   >>> tf.compat.path_to_str(Path('C:\XYZ\tensorflow\./.././tensorflow'))
   'C:\\XYZ\\tensorflow\\..\\tensorflow' # Windows OS
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 5f93bb9..29f6632 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -616,15 +616,28 @@
   _PRINT_DEPRECATION_WARNINGS = print_deprecation_warnings
 
 
-class _HiddenTfApiAttribute(object):
-  pass
+class HiddenTfApiAttribute(property):
+  """Hides a class attribute from the public API.
 
-# Attributes in public classes can be hidden from the API by having an '_'
-# in front of the name (e.g. ClassName._variables). This doesn't work when
-# attributes or methods are inherited from a parent class. To hide inherited
-# attributes, set their values to be `deprecation.HIDDEN_ATTRIBUTE`.
-# For example, this is used in V2 Estimator to hide the deprecated
-# export_savedmodel method:
-#   class EstimatorV2(Estimator):
-#     export_savedmodel = deprecation.HIDDEN_ATTRIBUTE
-HIDDEN_ATTRIBUTE = _HiddenTfApiAttribute()
+  Attributes in public classes can be hidden from the API by having an '_' in
+  front of the name (e.g. ClassName._variables). This doesn't work when
+  attributes or methods are inherited from a parent class. To hide inherited
+  attributes, assign them `deprecation.hide_attribute_from_api(message)`.
+  For example, this is used in V2 Estimator to hide the deprecated
+  export_savedmodel method:
+    class EstimatorV2(Estimator):
+       export_savedmodel = deprecation.hide_attribute_from_api('...')
+  """
+
+  def __init__(self, deprecation_message):
+
+    def raise_error(unused_self):
+      raise AttributeError(deprecation_message)
+
+    super(HiddenTfApiAttribute, self).__init__(raise_error)
+
+
+hide_attribute_from_api = HiddenTfApiAttribute  # pylint: disable=invalid-name
+
+# TODO(kathywu): Remove once cl/246395236 is submitted.
+HIDDEN_ATTRIBUTE = HiddenTfApiAttribute('This attribute has been deprecated.')
diff --git a/tensorflow/python/util/deprecation_wrapper.py b/tensorflow/python/util/deprecation_wrapper.py
index 7be1013..3de0028 100644
--- a/tensorflow/python/util/deprecation_wrapper.py
+++ b/tensorflow/python/util/deprecation_wrapper.py
@@ -83,7 +83,9 @@
 class DeprecationWrapper(types.ModuleType):
   """Wrapper for TensorFlow modules to support deprecation messages."""
 
-  def __init__(self, wrapped, module_name):  # pylint: disable=super-on-old-class
+  # TODO(annarev): remove unused_depr_to_canonical once estimator stops
+  # passing it in the next nightly build.
+  def __init__(self, wrapped, module_name, unused_depr_to_canonical=None):  # pylint: disable=super-on-old-class
     # Prefix all local attributes with _dw_ so that we can
     # handle them differently in attribute access methods.
     self._dw_wrapped_module = wrapped
diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py
index a9499f8..e237c84 100644
--- a/tensorflow/python/util/lazy_loader.py
+++ b/tensorflow/python/util/lazy_loader.py
@@ -21,6 +21,7 @@
 
 import importlib
 import types
+from tensorflow.python.platform import tf_logging as logging
 
 
 class LazyLoader(types.ModuleType):
@@ -46,7 +47,7 @@
 
     # Emit a warning if one was specified
     if self._warning:
-      print(self._warning)
+      logging.warning(self._warning)
       # Make sure to only warn once.
       self._warning = None
 
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 628e2d6..5bb548b 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -51,7 +51,7 @@
     "type {shallow_type}, while shallow structure has type {input_type}.")
 
 _INPUT_TREE_SMALLER_THAN_SHALLOW_TREE = (
-    "The input_tree has fewer elements than the input_tree. Input structure "
+    "The input_tree has fewer elements than the shallow_tree. Input structure "
     "has length {input_size}, while shallow structure has length "
     "{shallow_size}.")
 
@@ -110,8 +110,8 @@
   """Converts the sequence `args` to the same type as `instance`.
 
   Args:
-    instance: an instance of `tuple`, `list`, `namedtuple`, `dict`, or
-        `collections.OrderedDict`.
+    instance: an instance of `tuple`, `list`, `namedtuple`, `dict`,
+        `collections.OrderedDict`, or `composite_tensor.CompositeTensor`.
     args: elements to be converted to the `instance` type.
 
   Returns:
@@ -128,7 +128,11 @@
   elif _is_namedtuple(instance) or _is_attrs(instance):
     return type(instance)(*args)
   elif _is_composite_tensor(instance):
-    return instance._from_components(args)  # pylint: disable=protected-access
+    assert len(args) == 1
+    metadata = instance._component_metadata()  # pylint: disable=protected-access
+    return type(instance)._from_components(args[0], metadata)  # pylint: disable=protected-access
+  elif isinstance(instance, _six.moves.range):
+    return _sequence_like(list(instance), args)
   else:
     # Not a namedtuple
     return type(instance)(args)
@@ -169,8 +173,7 @@
     for field in iterable._fields:
       yield field, getattr(iterable, field)
   elif _is_composite_tensor(iterable):
-    for item in enumerate(iterable._to_components()):  # pylint: disable=protected-access
-      yield item
+    yield type(iterable).__name__, iterable._to_components()  # pylint: disable=protected-access
   else:
     for item in enumerate(iterable):
       yield item
@@ -683,15 +686,16 @@
             input_type=type(input_tree),
             shallow_type=type(shallow_tree)))
 
-    while _is_composite_tensor(shallow_tree):
-      shallow_tree = shallow_tree._to_components()  # pylint: disable=protected-access
-    while _is_composite_tensor(input_tree):
-      input_tree = input_tree._to_components()  # pylint: disable=protected-access
-
-    if len(input_tree) < len(shallow_tree):
-      raise ValueError(_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
-          input_size=len(input_tree),
-          shallow_size=len(shallow_tree)))
+    if _is_composite_tensor(shallow_tree):
+      if not _is_composite_tensor(input_tree):
+        raise TypeError("If shallow structure is a CompositeTensor, input "
+                        "must also be a CompositeTensor.  Input has type: %s." %
+                        type(input_tree))
+    else:
+      if len(input_tree) < len(shallow_tree):
+        raise ValueError(
+            _INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+                input_size=len(input_tree), shallow_size=len(shallow_tree)))
 
     if isinstance(shallow_tree, _collections.Mapping):
       absent_keys = set(shallow_tree) - set(input_tree)
diff --git a/tensorflow/python/util/port.i b/tensorflow/python/util/port.i
index 2f73073..64681a9 100644
--- a/tensorflow/python/util/port.i
+++ b/tensorflow/python/util/port.i
@@ -22,7 +22,8 @@
 %ignoreall
 %unignore tensorflow;
 %unignore tensorflow::IsGoogleCudaEnabled;
-%unignore tensorflow::CudaSupportsHalfMatMulAndConv;
+%unignore tensorflow::IsBuiltWithROCm;
+%unignore tensorflow::GpuSupportsHalfMatMulAndConv;
 %unignore tensorflow::IsMklEnabled;
 %include "tensorflow/core/util/port.h"
 %unignoreall
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 538820d..103f99c 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -399,22 +399,3 @@
 def stack(context=1):
   """TFDecorator-aware replacement for inspect.stack."""
   return _inspect.stack(context)[1:]
-
-
-def getsource_no_unwrap(obj):
-  """Return source code for an object. Does not unwrap TFDecorators.
-
-  The source code is returned literally, including indentation for functions not
-  at the top level. This function is analogous to inspect.getsource, with one
-  key difference - it doesn't unwrap decorators. For simplicity, support for
-  some Python object types is dropped (tracebacks, frames, code objects).
-
-  Args:
-      obj: a class, method, or function object.
-
-  Returns:
-      source code as a string
-
-  """
-  lines, lnum = _inspect.findsource(obj)
-  return ''.join(_inspect.getblock(lines[lnum:]))
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 7c030d6..68ee867 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -741,73 +741,6 @@
         'c': 'goodbye'
     }, tf_inspect.getcallargs(decorated, 4, c='goodbye'))
 
-  def testGetSourceNoUnwrapHandlesPlainDecorator(self):
-    def dec(f):
-      def wrapper(*args, **kwargs):
-        return f(*args, **kwargs)
-      return wrapper
-
-    @dec
-    def f():
-      return 1
-
-    source = tf_inspect.getsource_no_unwrap(f)
-    self.assertNotIn('dec', source)
-    self.assertIn('wrapper', source)
-    self.assertNotIn('return 1', source)
-
-  def testGetSourceNoUnwrapHandlesFunctoolsDecorator(self):
-    def dec(f):
-      @functools.wraps(f)
-      def wrapper(*args, **kwargs):
-        return f(*args, **kwargs)
-      return wrapper
-
-    @dec
-    def f():
-      return 1
-
-    source = tf_inspect.getsource_no_unwrap(f)
-    self.assertNotIn('dec', source)
-    self.assertIn('wrapper', source)
-    self.assertNotIn('return 1', source)
-
-  def testGetSourceNoUnwrapHandlesPlainDecoratorFactory(self):
-    def dec_factory():
-      def dec(f):
-        def wrapper(*args, **kwargs):
-          return f(*args, **kwargs)
-        return wrapper
-      return dec
-
-    @dec_factory()
-    def f():
-      return 1
-
-    source = tf_inspect.getsource_no_unwrap(f)
-    self.assertNotIn('factory', source)
-    self.assertNotIn('dec', source)
-    self.assertIn('wrapper', source)
-    self.assertNotIn('return 1', source)
-
-  def testGetSourceNoUnwrapHandlesFunctoolsDecoratorFactory(self):
-    def dec_factory():
-      def dec(f):
-        @functools.wraps(f)
-        def wrapper(*args, **kwargs):
-          return f(*args, **kwargs)
-        return wrapper
-      return dec
-
-    @dec_factory()
-    def f():
-      return 1
-
-    source = tf_inspect.getsource_no_unwrap(f)
-    self.assertNotIn('factory', source)
-    self.assertNotIn('dec', source)
-    self.assertIn('wrapper', source)
-    self.assertNotIn('return 1', source)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index db364c0..a86bb29 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -420,17 +420,15 @@
   Py_ssize_t index_;
 };
 
-// Just return itself as a single item.
-class SparseTensorValueIterator : public ValueIterator {
+// Iterator that just returns a single python object.
+class SingleValueIterator : public ValueIterator {
  public:
-  explicit SparseTensorValueIterator(PyObject* tensor) : tensor_(tensor) {
-    Py_INCREF(tensor);
-  }
+  explicit SingleValueIterator(PyObject* x) : x_(x) { Py_INCREF(x); }
 
-  Safe_PyObjectPtr next() override { return std::move(tensor_); }
+  Safe_PyObjectPtr next() override { return std::move(x_); }
 
  private:
-  Safe_PyObjectPtr tensor_;
+  Safe_PyObjectPtr x_;
 };
 
 // Returns nullptr (to raise an exception) when next() is called.  Caller
@@ -538,7 +536,7 @@
   } else if (IsAttrsHelper(nested)) {
     return absl::make_unique<AttrsValueIterator>(nested);
   } else if (IsSparseTensorValueType(nested)) {
-    return absl::make_unique<SparseTensorValueIterator>(nested);
+    return absl::make_unique<SingleValueIterator>(nested);
   } else {
     return absl::make_unique<SequenceValueIterator>(nested);
   }
@@ -552,6 +550,9 @@
     if (PyErr_Occurred() || nested == nullptr) {
       return absl::make_unique<ErrorValueIterator>();
     }
+    ValueIteratorPtr result = absl::make_unique<SingleValueIterator>(nested);
+    Py_DECREF(nested);  // ValueIterator took ownership
+    return result;
   }
   return GetValueIterator(nested);
 }
@@ -624,7 +625,8 @@
     PyObject* o1, PyObject* o2, bool check_types, string* error_msg,
     bool* is_type_error,
     const std::function<int(PyObject*)>& is_sequence_helper,
-    const std::function<ValueIteratorPtr(PyObject*)>& value_iterator_getter) {
+    const std::function<ValueIteratorPtr(PyObject*)>& value_iterator_getter,
+    bool check_composite_tensor_metadata) {
   DCHECK(error_msg);
   DCHECK(is_type_error);
   const bool is_seq1 = is_sequence_helper(o1);
@@ -730,6 +732,29 @@
     }
   }
 
+  if (check_composite_tensor_metadata && IsCompositeTensor(o1)) {
+    if (!IsCompositeTensor(o2)) return false;
+    static char _to_component_metadata[] = "_component_metadata";
+    Safe_PyObjectPtr m1(
+        PyObject_CallMethod(o1, _to_component_metadata, nullptr));
+    if (PyErr_Occurred() || m1 == nullptr) return false;
+    Safe_PyObjectPtr m2(
+        PyObject_CallMethod(o2, _to_component_metadata, nullptr));
+    if (PyErr_Occurred() || m2 == nullptr) {
+      return false;
+    }
+    if (PyObject_RichCompareBool(m1.get(), m2.get(), Py_NE)) {
+      *is_type_error = false;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two CompositeTensors have different metadata. "
+          "First CompositeTensor ",
+          PyObjectToString(o1), " has metadata ", PyObjectToString(m1.get()),
+          ", while second structure ", PyObjectToString(o2), " has metadata ",
+          PyObjectToString(m2.get()));
+      return false;
+    }
+  }
+
   ValueIteratorPtr iter1 = value_iterator_getter(o1);
   ValueIteratorPtr iter2 = value_iterator_getter(o2);
 
@@ -744,7 +769,8 @@
       }
       bool no_internal_errors = AssertSameStructureHelper(
           v1.get(), v2.get(), check_types, error_msg, is_type_error,
-          is_sequence_helper, value_iterator_getter);
+          is_sequence_helper, value_iterator_getter,
+          check_composite_tensor_metadata);
       Py_LeaveRecursiveCall();
       if (!no_internal_errors) return false;
       if (!error_msg->empty()) return true;
@@ -916,10 +942,12 @@
       expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
   const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
       expand_composites ? GetValueIteratorForComposite : GetValueIterator;
+  const bool check_composite_tensor_metadata = expand_composites;
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            is_sequence_helper, get_value_iterator);
+                            is_sequence_helper, get_value_iterator,
+                            check_composite_tensor_metadata);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
@@ -943,7 +971,7 @@
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceForDataHelper, GetValueIterator);
+                            IsSequenceForDataHelper, GetValueIterator, false);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
diff --git a/tensorflow/security/advisory/tfsa-2019-001.md b/tensorflow/security/advisory/tfsa-2019-001.md
new file mode 100644
index 0000000..65125d6
--- /dev/null
+++ b/tensorflow/security/advisory/tfsa-2019-001.md
@@ -0,0 +1,35 @@
+## TFSA-2019-001: Null Pointer Dereference Error in Decoding GIF Files
+
+### CVE Number
+
+CVE-2019-9635
+
+### Issue Description
+
+Certain invalid GIF files can produce a null pointer dereference when reading
+from the color map of a frame if the color map is missing.
+
+### Impact
+
+A maliciously crafted GIF file could cause a denial of service attack for
+TensorFlow by making it crash.
+
+### Vulnerable Versions
+
+TensorFlow 1.0.0, 1.0.1, 1.1.0, 1.2.0, 1.2.1, 1.3.0, 1.3.1, 1.4.0, 1.4.1, 1.5.0,
+1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.9.0, 1.10.0, 1.10.1, 1.11.0, 1.12.0
+
+### Mitigation
+
+We have patched the vulnerability in GitHub commit
+[e41cb124](https://github.com/tensorflow/tensorflow/commit/e41cb124cd0b325821af85cdacd9d8a12e206418).
+
+If users are decoding untrusted GIF files in TensorFlow, we encourage users to
+apply the patch to upgrade the version of TensorFlow they are currently using.
+
+Additionally, we have released TensorFlow version 1.12.2 to mitigate this
+vulnerability. Versions 1.13.0 and later were released using the patched commit.
+
+### Credits
+
+This issue was discovered by Yakun Zhang and Zheng Huang of Baidu Security Lab.
diff --git a/tensorflow/security/index.md b/tensorflow/security/index.md
index 0f17615..e28f8ff 100644
--- a/tensorflow/security/index.md
+++ b/tensorflow/security/index.md
@@ -8,6 +8,7 @@
 
 | Advisory Number | Type               | Versions affected | Reported by           | Additional Information      |
 |-----------------|--------------------|:-----------------:|-----------------------|-----------------------------|
+| [TFSA-2019-001](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2019-001.md)   | Null Pointer Dereference Error in Decoding GIF Files | <= 1.12 | Baidu Security Lab |  |
 | [TFSA-2018-006](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-006.md)   | Crafted Configuration File results in Invalid Memory Access | <= 1.7 | Blade Team of Tencent |  |
 | [TFSA-2018-005](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-005.md)   | Old Snappy Library Usage Resulting in Memcpy Parameter Overlap | <= 1.7 | Blade Team of Tencent |  |
 | [TFSA-2018-004](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-004.md)   | Checkpoint Meta File Out-of-Bounds Read | <= 1.7 | Blade Team of Tencent |  |
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index b1f5675..84d0780 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -93,6 +93,7 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -140,6 +141,7 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -152,6 +154,7 @@
     deps = [
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -192,7 +195,6 @@
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -200,6 +202,7 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -232,6 +235,7 @@
         "//tensorflow/stream_executor/platform",
         "//third_party/eigen3",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -260,6 +264,7 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -296,6 +301,7 @@
         ":stream_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -311,6 +317,8 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -398,6 +406,8 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -415,6 +425,7 @@
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
     ],
 )
@@ -438,8 +449,10 @@
         ":plugin",
         ":stream_executor_headers",
         "//tensorflow/stream_executor/lib",
-        "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -475,7 +488,9 @@
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/hash",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -503,10 +518,8 @@
         ":launch_dim",
         ":plugin_registry",
         ":stream_executor_headers",
-        "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -524,16 +537,15 @@
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":allocator_stats",
         ":dnn_proto_cc",
         ":platform",
         ":stream_executor_headers",
         ":stream_executor_internal",
-        "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -559,6 +571,8 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
     ],
 )
@@ -608,6 +622,7 @@
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -651,6 +666,27 @@
     ],
 )
 
+cc_library(
+    name = "device_memory_allocator",
+    srcs = [
+        "device_memory_allocator.cc",
+        "owning_device_memory.cc",
+    ],
+    hdrs = [
+        "device_memory_allocator.h",
+        "owning_device_memory.h",
+    ],
+    deps = [
+        ":platform",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 tf_cc_test(
     name = "stream_test",
     size = "small",
diff --git a/tensorflow/stream_executor/allocator_stats.h b/tensorflow/stream_executor/allocator_stats.h
index 786ceb0..62edfff 100644
--- a/tensorflow/stream_executor/allocator_stats.h
+++ b/tensorflow/stream_executor/allocator_stats.h
@@ -36,11 +36,20 @@
   // is known.
   absl::optional<int64> bytes_limit;
 
+  // Stack related memory usage.
+  int64 bytes_reserved;       // Number of bytes reserved on the stack.
+  int64 peak_bytes_reserved;  // The peak number of bytes reserved on the stack.
+  // The upper limit on the number of bytes of reservable memory on the stack,
+  // if such a limit is known.
+  absl::optional<int64> bytes_reservable_limit;
+
   AllocatorStats()
       : num_allocs(0),
         bytes_in_use(0),
         peak_bytes_in_use(0),
-        largest_alloc_size(0) {}
+        largest_alloc_size(0),
+        bytes_reserved(0),
+        peak_bytes_reserved(0) {}
 
   string DebugString() const;
 };
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index 53521fb..0dee518 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -58,7 +58,13 @@
             "//tensorflow/stream_executor/lib",
             "//tensorflow/stream_executor/platform",
         ],
-    ) + tf_additional_cuda_platform_deps(),
+    ) + tf_additional_cuda_platform_deps() + [
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+    ],
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
 )
 
@@ -72,7 +78,7 @@
         "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-    ]),
+    ]) + ["@com_google_absl//absl/strings:str_format"],
 )
 
 cc_library(
@@ -107,7 +113,12 @@
         "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub"],
         "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub"],
         "//conditions:default": ["//tensorflow/core:cuda"],
-    }) + ["@com_google_absl//absl/strings:str_format"],
+    }) + [
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+    ],
 )
 
 cc_library(
@@ -118,7 +129,7 @@
         "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub.cc"],
         "//conditions:default": [],
     }),
-    textual_hdrs = ["cuda_runtime_10_0.inc"],
+    textual_hdrs = glob(["cuda_runtime_*.inc"]),
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:using_cuda_nvcc_with_dynamic_build": [
@@ -187,12 +198,22 @@
     ]),
 )
 
+alias(
+    name = "cublas_lib",
+    actual = if_static(
+        "@local_config_cuda//cuda:cublas",
+        ":cublas_stub",
+    ),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "cublas_plugin",
     srcs = if_cuda_is_configured(["cuda_blas.cc"]),
     hdrs = if_cuda_is_configured(["cuda_blas.h"]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
+        ":cublas_lib",
         ":cuda_activation",
         ":cuda_gpu_executor",
         ":cuda_platform_id",
@@ -212,10 +233,10 @@
         "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-    ] + if_static(
-        ["@local_config_cuda//cuda:cublas"],
-        [":cublas_stub"],
-    )),
+    ]) + [
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+    ],
     alwayslink = True,
 )
 
@@ -230,6 +251,15 @@
     ]),
 )
 
+alias(
+    name = "cufft_lib",
+    actual = if_static(
+        "@local_config_cuda//cuda:cufft",
+        ":cufft_stub",
+    ),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "cufft_plugin",
     srcs = if_cuda_is_configured(["cuda_fft.cc"]),
@@ -241,6 +271,7 @@
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_helpers",
+        ":cufft_lib",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:fft",
@@ -250,10 +281,7 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + if_static(
-        ["@local_config_cuda//cuda:cufft"],
-        [":cufft_stub"],
-    )),
+    ]),
     alwayslink = True,
 )
 
@@ -268,6 +296,15 @@
     ]),
 )
 
+alias(
+    name = "cudnn_lib",
+    actual = if_static(
+        "@local_config_cuda//cuda:cudnn",
+        ":cudnn_stub",
+    ),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "cudnn_plugin",
     srcs = if_cuda_is_configured(["cuda_dnn.cc"]),
@@ -282,6 +319,7 @@
         ":cuda_stream",
         ":cuda_timer",
         ":cudnn_version",
+        ":cudnn_lib",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "@local_config_cuda//cuda:cuda_headers",
@@ -295,10 +333,7 @@
         "//tensorflow/stream_executor:temporary_device_memory",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-    ]) + tf_additional_cudnn_plugin_deps() + if_cuda_is_configured(if_static(
-        ["@local_config_cuda//cuda:cudnn"],
-        [":cudnn_stub"],
-    )),
+    ]) + tf_additional_cudnn_plugin_deps() + ["@com_google_absl//absl/synchronization"],
     alwayslink = True,
 )
 
@@ -313,6 +348,15 @@
     ]),
 )
 
+alias(
+    name = "curand_lib",
+    actual = if_static(
+        "@local_config_cuda//cuda:curand",
+        ":curand_stub",
+    ),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "curand_plugin",
     srcs = if_cuda_is_configured(["cuda_rng.cc"]),
@@ -323,6 +367,7 @@
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_helpers",
+        ":curand_lib",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:plugin_registry",
@@ -332,10 +377,7 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + if_static(
-        ["@local_config_cuda//cuda:curand"],
-        [":curand_stub"],
-    )),
+    ]),
     alwayslink = True,
 )
 
@@ -464,7 +506,7 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ]),
+    ]) + ["@com_google_absl//absl/strings:str_format"],
     alwayslink = True,
 )
 
@@ -473,7 +515,7 @@
     srcs = ["cudnn_version.cc"],
     hdrs = ["cudnn_version.h"],
     deps = [
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index b4e8bee..661d845 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -44,12 +44,13 @@
 #define EIGEN_HAS_CUDA_FP16
 #endif
 
-#include "third_party/eigen3/Eigen/Core"
-
 #include <assert.h>
+
 #include <complex>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -62,7 +63,6 @@
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -402,7 +402,7 @@
 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
                                   bool use_tensor_op_math, Args... args) {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   CHECK(blas_ != nullptr);
   if (!SetStream(stream)) {
@@ -1564,9 +1564,9 @@
     const DeviceMemory<Eigen::half> &b, int ldb, float beta,
     DeviceMemory<Eigen::half> *c, int ldc) {
 #if CUDA_VERSION >= 7050
-  VLOG(1) << port::Printf(
-      "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
-      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+  VLOG(1) << absl::StrFormat(
+      "doing cuBLAS SGEMM: at=%d bt=%d m=%u n=%u "
+      "k=%u alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
       "c=%p ldc=%d",
       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
@@ -1624,9 +1624,9 @@
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &b, int ldb, float beta,
                           DeviceMemory<float> *c, int ldc) {
-  VLOG(1) << port::Printf(
-      "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
-      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+  VLOG(1) << absl::StrFormat(
+      "doing cuBLAS SGEMM: at=%d bt=%d m=%u n=%u "
+      "k=%u alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
       "c=%p ldc=%d",
       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 63d0305..d0cd15f 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -20,9 +20,9 @@
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_or_device_scalar.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -142,8 +142,8 @@
                                    const T &beta, DeviceMemory<T> *y, int incy,
                                    blas::ProfileResult *output_profile_result);
 
-  // mutex that guards the cuBLAS handle for this device.
-  mutex mu_;
+  // Guards the cuBLAS handle for this device.
+  absl::Mutex mu_;
 
   // GpuExecutor which instantiated this CUDABlas.
   // Immutable post-initialization.
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index e58ebee..e8ff7ca 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -41,19 +41,21 @@
 
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
 namespace stream_executor {
 namespace cuda {
 
 string DriverVersionToString(DriverVersion version) {
-  return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
+  return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
+                         std::get<2>(version));
 }
 
 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
@@ -65,13 +67,14 @@
 }
 
 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
-  std::vector<string> pieces = port::Split(value, '.');
+  std::vector<string> pieces = absl::StrSplit(value, '.');
   if (pieces.size() < 2 || pieces.size() > 4) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
-        port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form "
-                     "for driver version; got \"%s\"",
-                     value.c_str()));
+        absl::StrFormat(
+            "expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form "
+            "for driver version; got \"%s\"",
+            value.c_str()));
   }
 
   int major;
@@ -80,23 +83,23 @@
   if (!port::safe_strto32(pieces[0], &major)) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
-        port::Printf("could not parse major version number \"%s\" as an "
-                     "integer from string \"%s\"",
-                     pieces[0].c_str(), value.c_str()));
+        absl::StrFormat("could not parse major version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[0], value));
   }
   if (!port::safe_strto32(pieces[1], &minor)) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
-        port::Printf("could not parse minor version number \"%s\" as an "
-                     "integer from string \"%s\"",
-                     pieces[1].c_str(), value.c_str()));
+        absl::StrFormat("could not parse minor version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[1].c_str(), value.c_str()));
   }
   if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
-        port::Printf("could not parse patch version number \"%s\" as an "
-                     "integer from string \"%s\"",
-                     pieces[2].c_str(), value.c_str()));
+        absl::StrFormat("could not parse patch version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[2], value));
   }
 
   DriverVersion result{major, minor, patch};
@@ -177,7 +180,7 @@
     string library_path = value == nullptr ? "" : value;
     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
 
-    std::vector<string> pieces = port::Split(library_path, ':');
+    std::vector<string> pieces = absl::StrSplit(library_path, ':');
     for (const auto &piece : pieces) {
       if (piece.empty()) {
         continue;
@@ -263,9 +266,9 @@
       }
       string dso_version = dot + strlen(so_suffix);
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
-      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
-      *result = cuda::StringToDriverVersion(stripped_dso_version);
+      *result = cuda::StringToDriverVersion(string(stripped_dso_version));
       return 1;
     }
     return 0;
@@ -295,9 +298,8 @@
   size_t space_index = version_and_rest.find(" ");
   auto kernel_version = version_and_rest.substr(0, space_index);
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
-  auto stripped_kernel_version =
-      port::StripSuffixString(kernel_version, ".ld64");
-  return cuda::StringToDriverVersion(stripped_kernel_version);
+  auto stripped_kernel_version = absl::StripSuffix(kernel_version, ".ld64");
+  return cuda::StringToDriverVersion(string(stripped_kernel_version));
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index c0cc00c..12da285 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -22,7 +22,6 @@
 #include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -134,8 +133,8 @@
  public:
   // Takes ownership of the executor context and the lock to access cuDNN
   // using handle.
-  CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
-              cudnnHandle_t handle)
+  CudnnHandle(gpu::ScopedActivateExecutorContext context,
+              std::unique_ptr<absl::MutexLock> lock, cudnnHandle_t handle)
       : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
   // Returns cuDNN handle. To be passed directly to cuDNN APIs, don't keep
@@ -144,7 +143,7 @@
 
  private:
   gpu::ScopedActivateExecutorContext context_;
-  mutex_lock lock_;
+  std::unique_ptr<absl::MutexLock> lock_;
   cudnnHandle_t handle_;  // Not owned.
 };
 
@@ -161,7 +160,7 @@
   explicit CudnnAccess(cudnnHandle_t handle) : handle_(handle) {}
 
   ~CudnnAccess() {
-    mutex_lock lock(mutex_);
+    absl::MutexLock lock(&mutex_);
     cudnnDestroy(handle_);
   }
 
@@ -182,7 +181,8 @@
   // therefore a bad idea (performance wise) to call any cuDNN APIs that
   // enqueue work in the stream.
   CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
-    mutex_lock lock(mutex_);
+    auto lock = absl::make_unique<absl::MutexLock>(&mutex_);
+    mutex_.AssertHeld();
     gpu::ScopedActivateExecutorContext context(executor);
     CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
     const auto status = cudnnSetStream(handle_, cu_stream);
@@ -192,7 +192,7 @@
 
  private:
   // Guards the enqueueing of cuDNN operations via the handle_ below.
-  mutex mutex_;
+  absl::Mutex mutex_;
 
   // cuDNN library handle.
   cudnnHandle_t handle_ GUARDED_BY(mutex_);  // Owned.
@@ -307,7 +307,7 @@
     CudnnVersion loaded_version;
     TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
     if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
-      const tensorflow::string error = absl::StrCat(
+      const string error = absl::StrCat(
           "Loaded runtime CuDNN library: ", loaded_version.ToString(),
           " but source was compiled with: ", source_version.ToString(),
           ".  CuDNN library major and minor version needs to match or have "
@@ -2556,7 +2556,9 @@
   // precision is set.
   // Set it temporary to false s.t. no error is raised when using fp16 inputs,
   // fp32 math precision.
-  static constexpr bool kDefaultFlag = false;
+  //
+  // cuDNN == 7.5.0 is verified to have this fixed.
+  static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7500;
 };
 
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 3a49469..80fc1ae 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -22,7 +22,6 @@
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index dc4f9ad..f41f64f 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -17,28 +17,29 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+
 #include <map>
 #include <set>
 #include <utility>
 
 #include "absl/base/casts.h"
+#include "absl/base/const_init.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/debugging/leak_check.h"
+#include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
-#include "tensorflow/stream_executor/lib/notification.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
@@ -65,19 +66,19 @@
  public:
   // Returns whether context is a member of the live set.
   static bool Has(CUcontext context) {
-    tf_shared_lock lock(mu_);
+    absl::ReaderMutexLock lock(&mu_);
     return Live()->find(context) != Live()->end();
   }
 
   // Adds context to the live set, or returns it if it's already present.
   static GpuContext* Add(CUcontext context) {
     CHECK(context != nullptr);
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     auto insert_result = Live()->insert(std::make_pair(context, nullptr));
     auto it = insert_result.first;
     if (insert_result.second) {
       // context was not present in the map.  Add it.
-      it->second = MakeUnique<GpuContext>(context, next_id_++);
+      it->second = absl::make_unique<GpuContext>(context, next_id_++);
     }
     return it->second.get();
   }
@@ -85,7 +86,7 @@
   // Removes context from the live set.
   static void Remove(CUcontext context) {
     CHECK(context != nullptr);
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     auto it = Live()->find(context);
     CHECK(it != Live()->end()) << context;
     Live()->erase(it);
@@ -100,11 +101,11 @@
   }
 
   // Lock that guards access-to/mutation-of the live set.
-  static mutex mu_;
+  static absl::Mutex mu_;
   static int64 next_id_;
 };
 
-/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
+/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
 /* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
 
 // Formats CUresult to output prettified values into a log stream.
@@ -140,15 +141,9 @@
 // stack-limited threads (such as those spawned by a default-argument
 // thread::ThreadPool on some platforms), we run certain routines in this pool
 // and wait for completion.
-static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
-static port::ThreadPool* InitializeDriverExecutor() {
-  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
-                              "cuda_driver", 1);
-}
-
 port::ThreadPool* GetDriverExecutor() {
-  mutex_lock lock(driver_executor_threadpool_mu);
-  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
+  static port::ThreadPool* thread_pool = new port::ThreadPool(
+      port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
   return thread_pool;
 }
 
@@ -177,6 +172,9 @@
   cudaPointerAttributes attributes;
   cudaError_t err =
       cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
+  CHECK(err == cudaSuccess || err == cudaErrorInvalidValue)
+      << "Unexpected CUDA error: " << cudaGetErrorString(err);
+
   // If we failed, reset cuda error status to avoid poisoning cuda streams.
   if (err != cudaSuccess) cudaGetLastError();
   bool points_to_host_memory = (err == cudaErrorInvalidValue ||
@@ -328,17 +326,10 @@
 /* static */ port::Status GpuDriver::Init() {
   // Cached return value from calling InternalInit(), as cuInit need only be
   // called once, but GpuDriver::Init may be called many times.
-  static port::Status init_retval;
-  static bool set = false;
-  static mutex* init_mu = new mutex;
-
-  mutex_lock lock(*init_mu);
-  if (!set) {
-    init_retval = InternalInit();
-    set = true;
-  }
-
-  return init_retval;
+  static port::Status* init_retval = [] {
+    return new port::Status(InternalInit());
+  }();
+  return *init_retval;
 }
 
 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
@@ -587,7 +578,7 @@
 /* static */ bool GpuDriver::LoadPtx(GpuContext* context,
                                      const char* ptx_contents,
                                      CUmodule* module) {
-  port::Notification notification;
+  absl::Notification notification;
   bool ret = true;
   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                  &notification]() {
@@ -956,13 +947,13 @@
     case CUDA_ERROR_NOT_INITIALIZED:
       return port::Status(
           port::error::FAILED_PRECONDITION,
-          port::Printf("error destroying CUDA event in context %p: %s", context,
-                       ToString(res).c_str()));
+          absl::StrFormat("error destroying CUDA event in context %p: %s",
+                          context, ToString(res)));
     default:
       return port::Status(
           port::error::INTERNAL,
-          port::Printf("error destroying CUDA event in context %p: %s", context,
-                       ToString(res).c_str()));
+          absl::StrFormat("error destroying CUDA event in context %p: %s",
+                          context, ToString(res)));
   }
 }
 
@@ -978,13 +969,13 @@
     case CUDA_ERROR_NOT_INITIALIZED:
       return port::Status(
           port::error::FAILED_PRECONDITION,
-          port::Printf("error recording CUDA event on stream %p: %s", stream,
-                       ToString(res).c_str()));
+          absl::StrFormat("error recording CUDA event on stream %p: %s", stream,
+                          ToString(res)));
     default:
       return port::Status(
           port::error::INVALID_ARGUMENT,
-          port::Printf("error recording CUDA event on stream %p: %s", stream,
-                       ToString(res).c_str()));
+          absl::StrFormat("error recording CUDA event on stream %p: %s", stream,
+                          ToString(res)));
   }
 }
 
@@ -995,7 +986,7 @@
   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to query event: %s", ToString(res).c_str()));
+        absl::StrFormat("failed to query event: %s", ToString(res)));
   }
 
   return res;
@@ -1088,11 +1079,10 @@
   }
   CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
-    return port::InternalError(
-        port::Printf("failed to synchronous memcpy from device to host: %s; "
-                     "host dst: %p; GPU src: %p; size: %llu=0x%llx",
-                     ToString(res).c_str(), host_dst,
-                     absl::bit_cast<void*>(gpu_src), size, size));
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %u=0x%x",
+        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
           << host_dst;
@@ -1110,11 +1100,10 @@
   }
   CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
-    return port::InternalError(port::Printf(
+    return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
-        " host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
-        size));
+        " host src: %p; size: %u=0x%x",
+        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size));
   }
   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
   return port::Status::OK();
@@ -1131,10 +1120,10 @@
   }
   CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
-    return port::InternalError(port::Printf(
+    return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
-        "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        "GPU src: %p; size: %u=0x%x",
+        ToString(res), absl::bit_cast<void*>(gpu_dst),
         absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
@@ -1153,11 +1142,10 @@
   }
   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
+    LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
-        "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
-        size);
+        "GPU src: %p; size: %u=0x%x",
+        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
@@ -1178,11 +1166,10 @@
   }
   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
+    LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
-        "host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
-        size);
+        "host src: %p; size: %u=0x%x",
+        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
@@ -1202,18 +1189,17 @@
   }
   CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
+    LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to device: %s"
         "; GPU dst: %p on %s %s"
         "; GPU src: %p on %s %s"
-        "; can access? %s; size: %llu=0x%llx",
-        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
-        CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
-        CUDAPointerToDeviceString(gpu_dst).c_str(),
-        absl::bit_cast<void*>(gpu_src),
-        CUDAPointerToMemorySpaceString(gpu_src).c_str(),
-        CUDAPointerToDeviceString(gpu_src).c_str(),
-        CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
+        "; can access? %s; size: %u=0x%x",
+        ToString(result), absl::bit_cast<void*>(gpu_dst),
+        CUDAPointerToMemorySpaceString(gpu_dst),
+        CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
+        CUDAPointerToMemorySpaceString(gpu_src),
+        CUDAPointerToDeviceString(gpu_src),
+        CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);
 
     return false;
   }
@@ -1221,9 +1207,9 @@
   return true;
 }
 
-/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
-                                                 CUevent* result,
-                                                 EventFlags flags) {
+/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
+                                               CUevent* result,
+                                               EventFlags flags) {
   int cuflags;
   switch (flags) {
     case EventFlags::kDefault:
@@ -1317,14 +1303,14 @@
     // below).
     return port::Status(
         port::error::NOT_FOUND,
-        port::Printf("not a device pointer %p; %s",
-                     reinterpret_cast<void*>(dptr), ToString(result).c_str()));
+        absl::StrFormat("not a device pointer %p; %s",
+                        reinterpret_cast<void*>(dptr), ToString(result)));
   }
 
   return port::Status(
       port::error::INTERNAL,
-      port::Printf("failed to get pointer into for device pointer %p; %s",
-                   reinterpret_cast<void*>(dptr), ToString(result).c_str()));
+      absl::StrFormat("failed to get pointer into for device pointer %p; %s",
+                      reinterpret_cast<void*>(dptr), ToString(result)));
 }
 
 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
@@ -1348,9 +1334,9 @@
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf(
+        absl::StrFormat(
             "failed to get compute capability major for device: %s; %d",
-            ToString(res).c_str(), device));
+            ToString(res), device));
   }
 
   res = cuDeviceGetAttribute(
@@ -1358,9 +1344,9 @@
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf(
+        absl::StrFormat(
             "failed to get compute capability minor for device: %s; %d",
-            ToString(res).c_str(), device));
+            ToString(res), device));
   }
 
   return port::Status::OK();
@@ -1488,8 +1474,8 @@
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to get device attribute %d for device %d: %s",
-                     attribute, device, ToString(res).c_str()));
+        absl::StrFormat("failed to get device attribute %d for device %d: %s",
+                        attribute, device, ToString(res)));
   }
   return val;
 }
@@ -1592,8 +1578,8 @@
       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to enable peer access from %p to %p: %s", from, to,
-                     ToString(result).c_str()));
+        absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
+                        to, ToString(result)));
   }
 
   return port::Status::OK();
@@ -1610,8 +1596,8 @@
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
-                     ToString(result).c_str()));
+        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
+                        kernel, ToString(result)));
   }
 
   return max_blocks;
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index ca63abf..cdd5ae7 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -24,7 +24,9 @@
 #else
 #include <unistd.h>
 #endif
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
@@ -40,10 +42,7 @@
 #include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -210,9 +209,9 @@
   if (strip_exe) {
     // The exe is the last component of the path, so remove one component.
     string ret = exe_path;
-    std::vector<string> components = port::Split(exe_path, '/');
+    std::vector<string> components = absl::StrSplit(exe_path, '/');
     components.pop_back();
-    return port::Join(components, "/");
+    return absl::StrJoin(components, "/");
   }
   return exe_path;
 }
@@ -273,7 +272,7 @@
   VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
 
   if (spec.has_cuda_cubin_in_memory()) {
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     kernelname = &spec.cuda_cubin_in_memory().kernelname();
     const char *cubin = spec.cuda_cubin_in_memory().bytes();
     if (!LoadModuleFromCuBin(cubin, &module)) {
@@ -296,7 +295,7 @@
       return false;
     }
 
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (!LoadModuleFromPtx(ptx, &module)) {
       return false;
     }
@@ -344,7 +343,7 @@
 void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
-  mutex_lock lock{in_memory_modules_mu_};
+  absl::MutexLock lock{&in_memory_modules_mu_};
   auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
   if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
     VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
@@ -363,7 +362,7 @@
   // ModuleHandle::id().
   CUmodule cu_module;
   if (spec.has_cuda_cubin_in_memory()) {
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (!LoadModuleFromCuBin(
             reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
             &cu_module)) {
@@ -381,7 +380,7 @@
       return false;
     }
 
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
       return false;
     }
@@ -395,7 +394,7 @@
 
 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
   const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
-  mutex_lock lock{in_memory_modules_mu_};
+  absl::MutexLock lock{&in_memory_modules_mu_};
   return UnloadGpuBinary(gpu_binary);
 }
 
@@ -429,7 +428,7 @@
   // whether we've done an occupancy check on this kernel before isn't free
   // (because we have to synchronize), so we only do this at -v 2+.
   if (VLOG_IS_ON(2)) {
-    mutex_lock lock(launched_kernels_mu_);
+    absl::MutexLock lock(&launched_kernels_mu_);
     if (!launched_kernels_.count(cufunc)) {
       VlogOccupancyInfo(kernel, thread_dims, block_dims);
       // TODO(rspringer): Remove elements from launched_kernels_...if we ever
@@ -712,8 +711,8 @@
   } else {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf("error recording waiting for CUDA event on stream %p",
-                     stream));
+        absl::StrFormat("error recording waiting for CUDA event on stream %p",
+                        stream));
   }
 }
 
@@ -893,7 +892,7 @@
   };
 
   {  // give limited scope to mutex_lock
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (static_cast<bool>(module_handle)) {
       auto it = gpu_binary_to_module_.find(module_handle.id());
       CHECK(it != gpu_binary_to_module_.end());
@@ -911,13 +910,13 @@
   return false;
 }
 
-bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
+bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
   // we use BlockDims to express the dimensions of blocks within a grid
   // (as opposed to ThreadDim which expresses the dimensions of threads
   // within a block).
   int x, y, z;
-  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
     return false;
   }
 
@@ -982,7 +981,7 @@
   }
 
   string filename =
-      port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str());
+      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);
 
   // We have to use fopen/fread here so that the device properties can be
   // populated before InitGoogle procedure has been completed (at which point we
@@ -1022,71 +1021,84 @@
 #endif
 }
 
-DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+GpuExecutor::CreateDeviceDescription(int device_ordinal) {
+  GpuDeviceHandle device;
+  auto status = GpuDriver::GetDevice(device_ordinal, &device);
+  if (!status.ok()) {
+    return status;
+  }
+
+  int cc_major;
+  int cc_minor;
+  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
+  if (!status.ok()) {
+    return status;
+  }
+
   internal::DeviceDescriptionBuilder builder;
 
   {
     int driver_version = 0;
     (void)GpuDriver::GetDriverVersion(&driver_version);
-    string augmented_driver_version = port::Printf(
+    string augmented_driver_version = absl::StrFormat(
         "%d (%s)", driver_version,
-        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
-            .c_str());
+        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
     builder.set_driver_version(augmented_driver_version);
   }
 
   {
-    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
+    string pci_bus_id = GpuDriver::GetPCIBusID(device);
 
     // Lower the hex characters to match sysfs.
-    pci_bus_id = port::Lowercase(pci_bus_id);
+    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
     builder.set_pci_bus_id(pci_bus_id);
 
     // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
-    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
     builder.set_numa_node(numa_node);
   }
 
   {
     builder.set_threads_per_block_limit(
         GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                                      device_)
+                                      device)
             .ValueOrDie());
 
     ThreadDim thread_dim_limit;
     thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                              .ValueOrDie();
     thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                              .ValueOrDie();
     thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                              .ValueOrDie();
     builder.set_thread_dim_limit(thread_dim_limit);
 
     int clock_rate =
-        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
             .ValueOrDie();
     builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
   }
 
   {
     bool ecc_enabled = false;
-    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
+    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
     builder.set_ecc_enabled(ecc_enabled);
   }
 
   {
     uint64 device_memory_size = -1;
-    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
     builder.set_device_memory_size(device_memory_size);
   }
 
   port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
-      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
+      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device);
   port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
-      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
+      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device);
   if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
     // Times 2 because HBM is DDR memory; it gets two data bits per each data
     // lane.
@@ -1097,94 +1109,47 @@
 
   {
     BlockDim block_dim_limit;
-    FillBlockDimLimit(&block_dim_limit);
+    FillBlockDimLimit(device, &block_dim_limit);
     builder.set_block_dim_limit(block_dim_limit);
   }
 
   {
     string device_name;
-    (void)GpuDriver::GetDeviceName(device_, &device_name);
+    (void)GpuDriver::GetDeviceName(device, &device_name);
     builder.set_name(device_name);
   }
 
   builder.set_platform_version(
-      absl::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
+      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));
 
   // TODO(leary) should be a way to query this from the driver, but this is
   // unlikely to change for us any time soon.
   builder.set_device_address_bits(64);
 
   builder.set_device_vendor("NVIDIA Corporation");
-  builder.set_cuda_compute_capability(cc_major_, cc_minor_);
+  builder.set_cuda_compute_capability(cc_major, cc_minor);
   builder.set_shared_memory_per_core(
-      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
   builder.set_shared_memory_per_block(
-      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
   builder.set_core_count(
-      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
+      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
   builder.set_threads_per_core_limit(
-      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
   builder.set_registers_per_block_limit(
-      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
   builder.set_threads_per_warp(
-      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
+      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
   builder.set_registers_per_core_limit(
       GpuDriver::GetDeviceAttribute(
-          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
+          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
           .ValueOrDie());
 
-  // We are loading a dummy ptx kernel to set the device description's
-  // blocks_per_core_limit by calling the CUDA occupancy calculator.  This
-  // value is currently required XLA GPU's CalculateLaunchDimensions()
-  const char* blank_ptx = R"(
-.version 6.0
-.target sm_30
-.address_size 64
-
-        // .globl       testkernel
-.visible .entry testkernel()
-{
-        ret;
-})";
-  const char* kernel_name = "testkernel";
-
-  CUmodule blank_module;
-  CUfunction blank_function;
-  int bpc = -1;
-  bool ptx_success =
-      cuda::CUDADriver::LoadPtx(context_, blank_ptx, &blank_module);
-  if (ptx_success) {
-    ptx_success = cuda::CUDADriver::GetModuleFunction(
-        context_, blank_module, kernel_name, &blank_function);
-    if (ptx_success) {
-      CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
-          &bpc, blank_function, 1, 1);
-      if (result != CUDA_SUCCESS) {
-        bpc = -1;
-        ptx_success = false;
-      }
-    }
-    cuda::CUDADriver::UnloadModule(context_, blank_module);
-  }
-  if (!ptx_success) {
-    LOG(ERROR) << "Failed to calculate max blocks per SM using dummy kernel.";
-  }
-  builder.set_blocks_per_core_limit(bpc);
-
-  auto built = builder.Build();
-  return built.release();
+  return builder.Build();
 }
 
 }  // namespace gpu
 
-void initialize_cuda_gpu_executor() {
-  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
-    return new gpu::GpuExecutor{config};
-  };
-}
-
 }  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
-  stream_executor::initialize_cuda_gpu_executor();
-});
+REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 54aba01..8ca5185 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -15,14 +15,16 @@
 
 #include "tensorflow/stream_executor/cuda/cuda_platform.h"
 
+#include "absl/base/const_init.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
 namespace gpu {
@@ -74,30 +76,25 @@
 void CudaPlatform::InspectNumaNodes() {
   // To get NUMA node information, we need to create all executors, so we can
   // examine their device descriptions to see their bus assignments.
-  static bool initialized = false;
-  static mutex numa_mutex(LINKER_INITIALIZED);
-  mutex_lock lock(numa_mutex);
-  if (initialized) {
-    return;
-  }
-
-  StreamExecutorConfig config;
-  for (int i = 0; i < VisibleDeviceCount(); i++) {
-    config.ordinal = i;
-    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
-    if (i == 0) {
-      // NUMA nodes may not start at 0, so set the minimum node  based on the
-      // first executor we see.
-      min_numa_node_ = exec->GetDeviceDescription().numa_node();
-      limit_numa_node_ = min_numa_node_ + 1;
-    } else {
-      min_numa_node_ =
-          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
-      limit_numa_node_ = std::max(limit_numa_node_,
-                                  exec->GetDeviceDescription().numa_node() + 1);
+  static std::once_flag once;
+  std::call_once(once, [&] {
+    StreamExecutorConfig config;
+    for (int i = 0; i < VisibleDeviceCount(); i++) {
+      config.ordinal = i;
+      StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+      if (i == 0) {
+        // NUMA nodes may not start at 0, so set the minimum node based on the
+        // first executor we see.
+        min_numa_node_ = exec->GetDeviceDescription().numa_node();
+        limit_numa_node_ = min_numa_node_ + 1;
+      } else {
+        min_numa_node_ =
+            std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
+        limit_numa_node_ = std::max(
+            limit_numa_node_, exec->GetDeviceDescription().numa_node() + 1);
+      }
     }
-  }
-  initialized = true;
+  });
 }
 
 int CudaPlatform::BusCount() {
@@ -126,7 +123,7 @@
 
   return port::Status(
       port::error::NOT_FOUND,
-      port::Printf("Executor for bus %d not found.", bus_ordinal));
+      absl::StrFormat("Executor for bus %d not found.", bus_ordinal));
 }
 
 Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
@@ -143,6 +140,11 @@
 
 const string& CudaPlatform::Name() const { return name_; }
 
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+CudaPlatform::DescriptionForDevice(int ordinal) const {
+  return GpuExecutor::CreateDeviceDescription(ordinal);
+}
+
 port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDevice(int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -168,15 +170,15 @@
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<GpuExecutor>(config.plugin_config));
+  auto executor = absl::make_unique<StreamExecutor>(
+      this, absl::make_unique<GpuExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf(
+        absl::StrFormat(
             "failed initializing StreamExecutor for CUDA device ordinal %d: %s",
-            config.ordinal, init_status.ToString().c_str()));
+            config.ordinal, init_status.ToString()));
   }
 
   return std::move(executor);
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index b21e979..b452fd3 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -24,7 +24,6 @@
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -66,6 +65,9 @@
 
   const string& Name() const override;
 
+  port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+
   port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
 
   port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 36eef0a..b86c312 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -71,7 +71,7 @@
 }
 
 bool GpuRng::Init() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   CHECK(rng_ == nullptr);
 
   cuda::ScopedActivateExecutorContext sac(parent_);
@@ -106,7 +106,7 @@
 
 template <typename T>
 bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
 
@@ -164,7 +164,7 @@
                                             ElemT stddev,
                                             DeviceMemory<ElemT>* v,
                                             FuncT func) {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   if (!SetStream(stream)) {
     return false;
@@ -197,7 +197,7 @@
 }
 
 bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   CHECK(rng_ != nullptr);
 
   if (!CheckSeed(seed, seed_bytes)) {
diff --git a/tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc b/tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc
new file mode 100644
index 0000000..d55a94f
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc
@@ -0,0 +1,1843 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceReset");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceSynchronize(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceSynchronize");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
+                                                         size_t value) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceSetLimit");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(limit, value);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetLimit");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pValue, limit);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetCacheConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pCacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *, int *);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaDeviceGetStreamPriorityRange");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(leastPriority, greatestPriority);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceSetCacheConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(cacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetSharedMemConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pConfig);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceSetSharedMemConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(config);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *, const char *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetByPCIBusId");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(device, pciBusId);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
+                                                            int len,
+                                                            int device) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(char *, int, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetPCIBusId");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pciBusId, len, device);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaIpcGetEventHandle");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(handle, event);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaIpcOpenEventHandle");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event, handle);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaIpcGetMemHandle");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(handle, devPtr);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
+    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaIpcOpenMemHandle");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr, handle, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(void *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaIpcCloseMemHandle");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadExit");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaThreadSynchronize(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadSynchronize");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadSetLimit");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(limit, value);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadGetLimit");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pValue, limit);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadGetCacheConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pCacheConfig);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaThreadSetCacheConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(cacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetLastError(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetLastError");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaPeekAtLastError(void) {
+  using EntryFn = cudaError_t(CUDARTAPI *)();
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaPeekAtLastError");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn();
+}
+
+extern __host__ __cudart_builtin__ const char *CUDARTAPI
+cudaGetErrorName(cudaError_t error) {
+  using EntryFn = const char *(CUDARTAPI *)(cudaError_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetErrorName");
+  if (!entry_fn) return "cudaGetErrorName symbol not found.";  // fallback
+  return entry_fn(error);
+}
+
+extern __host__ __cudart_builtin__ const char *CUDARTAPI
+cudaGetErrorString(cudaError_t error) {
+  using EntryFn = const char *(CUDARTAPI *)(cudaError_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetErrorString");
+  if (!entry_fn) return "cudaGetErrorString symbol not found.";  // fallback
+  return entry_fn(error);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDeviceCount(int *count) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetDeviceCount");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(count);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetDeviceProperties");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(prop, device);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetAttribute");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(value, attr, device);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
+                          int srcDevice, int dstDevice) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDeviceGetP2PAttribute");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(value, attr, srcDevice, dstDevice);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaChooseDevice");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(device, prop);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaSetDevice");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(device);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDevice(int *device) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetDevice");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(device);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
+                                                          int len) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaSetValidDevices");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(device_arr, len);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaSetDeviceFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(unsigned int *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaGetDeviceFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamCreate");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pStream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamCreateWithFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pStream, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
+                             int priority) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamCreateWithPriority");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(pStream, flags, priority);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamGetPriority");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(hStream, priority);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamGetFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(hStream, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamDestroy(cudaStream_t stream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamDestroy");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
+    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamWaitEvent");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, event, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
+                      void *userData, unsigned int flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
+                                           void *, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamAddCallback");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, callback, userData, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamSynchronize(cudaStream_t stream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamSynchronize");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamQuery");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
+                         size_t length __dv(0),
+                         unsigned int flags __dv(cudaMemAttachSingle)) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamAttachMemAsync");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, devPtr, length, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamBeginCapture");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, mode);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaThreadExchangeStreamCaptureMode");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(mode);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamEndCapture");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, pGraph);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
+    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamIsCapturing");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, pCaptureStatus);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
+    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
+    unsigned long long *pId) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(
+      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaStreamGetCaptureInfo");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, pCaptureStatus, pId);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventCreate");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventCreateWithFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventRecord");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventQuery");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventSynchronize");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventDestroy(cudaEvent_t event) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventDestroy");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(event);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
+                                                           cudaEvent_t start,
+                                                           cudaEvent_t end) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaEventElapsedTime");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(ms, start, end);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
+    cudaExternalMemory_t *extMem_out,
+    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(
+      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaImportExternalMemory");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extMem_out, memHandleDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
+    void **devPtr, cudaExternalMemory_t extMem,
+    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
+                               const struct cudaExternalMemoryBufferDesc *);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaExternalMemoryGetMappedBuffer");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr, extMem, bufferDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
+    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
+    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(
+      cudaMipmappedArray_t *, cudaExternalMemory_t,
+      const struct cudaExternalMemoryMipmappedArrayDesc *);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaExternalMemoryGetMappedMipmappedArray");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(mipmap, extMem, mipmapDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDestroyExternalMemory");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extMem);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
+    cudaExternalSemaphore_t *extSem_out,
+    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
+                               const struct cudaExternalSemaphoreHandleDesc *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaImportExternalSemaphore");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extSem_out, semHandleDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
+    const cudaExternalSemaphore_t *extSemArray,
+    const struct cudaExternalSemaphoreSignalParams *paramsArray,
+    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
+                               const struct cudaExternalSemaphoreSignalParams *,
+                               unsigned int, cudaStream_t);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaSignalExternalSemaphoresAsync");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extSemArray, paramsArray, numExtSems, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
+    const cudaExternalSemaphore_t *extSemArray,
+    const struct cudaExternalSemaphoreWaitParams *paramsArray,
+    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
+                               const struct cudaExternalSemaphoreWaitParams *,
+                               unsigned int, cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaWaitExternalSemaphoresAsync");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extSemArray, paramsArray, numExtSems, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaDestroyExternalSemaphore");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(extSem);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
+                 size_t sharedMem, cudaStream_t stream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
+                                           size_t, cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaLaunchKernel");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(func, gridDim, blockDim, args, sharedMem, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
+    const void *func, dim3 gridDim, dim3 blockDim, void **args,
+    size_t sharedMem, cudaStream_t stream) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
+                                           size_t, cudaStream_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaLaunchCooperativeKernel");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(func, gridDim, blockDim, args, sharedMem, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
+    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
+    unsigned int flags __dv(0)) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
+                                           unsigned int, unsigned int);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaLaunchCooperativeKernelMultiDevice");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(launchParamsList, numDevices, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaFuncSetCacheConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(func, cacheConfig);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaFuncSetSharedMemConfig");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(func, config);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaFuncGetAttributes");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(attr, func);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaFuncSetAttribute");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(func, attr, value);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaSetDoubleForDevice(double *d) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(double *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaSetDoubleForDevice");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(d);
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaSetDoubleForHost(double *d) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(double *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaSetDoubleForHost");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(d);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
+                                                         cudaHostFn_t fn,
+                                                         void *userData) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaLaunchHostFunc");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(stream, fn, userData);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
+                                              int blockSize,
+                                              size_t dynamicSMemSize) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
+  static auto entry_fn =
+      LoadSymbol<EntryFn>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(numBlocks, func, blockSize, dynamicSMemSize);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
+                                                       const void *func,
+                                                       int blockSize,
+                                                       size_t dynamicSMemSize,
+                                                       unsigned int flags) {
+  using EntryFn =
+      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>(
+      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(numBlocks, func, blockSize, dynamicSMemSize, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
+    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaMallocManaged");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr, size, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMalloc(void **devPtr, size_t size) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(void **, size_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaMalloc");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr, size);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(void **, size_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaMallocHost");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(ptr, size);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
+                                                      size_t *pitch,
+                                                      size_t width,
+                                                      size_t height) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaMallocPitch");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(devPtr, pitch, width, height);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
+    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
+    size_t height __dv(0), unsigned int flags __dv(0)) {
+  using EntryFn = cudaError_t(CUDARTAPI *)(cudaArray_t *,
+                                           const struct cudaChannelFormatDesc *,
+                                           size_t, size_t, unsigned int);
+  static auto entry_fn = LoadSymbol<EntryFn>("cudaMallocArray");
+  if (!entry_fn) return GetSymbolNotFoundError();  // symbol not resolved
+  return entry_fn(array, desc, width, height, flags);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFree(void *devPtr) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ptr);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(array);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mipmappedArray);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
+                                                    unsigned int flags) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHost, size, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
+                                                       unsigned int flags) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ptr, size, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ptr);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pDevice, pHost, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
+                                                       void *pHost) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pFlags, pHost);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pitchedDevPtr, extent);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
+                  struct cudaExtent extent, unsigned int flags __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
+                                           const struct cudaChannelFormatDesc *,
+                                           struct cudaExtent, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(array, desc, extent, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
+    cudaMipmappedArray_t *mipmappedArray,
+    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
+    unsigned int numLevels, unsigned int flags __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
+      struct cudaExtent, unsigned int, unsigned int);
+  static auto impl = LoadSymbol<Fn>("cudaMallocMipmappedArray");
+  return impl ? impl(mipmappedArray, desc, extent, numLevels, flags)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
+    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
+    unsigned int level) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
+  static auto impl = LoadSymbol<Fn>("cudaGetMipmappedArrayLevel");
+  return impl ? impl(levelArray, mipmappedArray, level)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
+    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
+    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
+                                           cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
+                                                     size_t *total) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(free, total);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
+                 unsigned int *flags, cudaArray_t array) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
+                                           struct cudaExtent *, unsigned int *,
+                                           cudaArray_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(desc, extent, flags, array);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
+                                                 size_t count,
+                                                 enum cudaMemcpyKind kind) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
+                                           enum cudaMemcpyKind);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, src, count, kind);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
+                                                     const void *src,
+                                                     int srcDevice,
+                                                     size_t count) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, dstDevice, src, srcDevice, count);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
+                                                   const void *src,
+                                                   size_t spitch, size_t width,
+                                                   size_t height,
+                                                   enum cudaMemcpyKind kind) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
+                                      size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2D");  // one-time lookup
+  return impl ? impl(dst, dpitch, src, spitch, width, height, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DToArray");  // one-time lookup
+  return impl ? impl(dst, wOffset, hOffset, src, spitch, width, height, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
+    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
+    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DFromArray");
+  return impl ? impl(dst, dpitch, src, wOffset, hOffset, width, height, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
+    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
+    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
+    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
+                                      cudaArray_const_t, size_t, size_t,
+                                      size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DArrayToArray");
+  return impl ? impl(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
+                     width, height, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
+    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
+    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
+  using Fn = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
+                                      size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyToSymbol");  // one-time lookup
+  return impl ? impl(symbol, src, count, offset, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
+    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
+    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
+                                      enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyFromSymbol");  // one-time lookup
+  return impl ? impl(dst, symbol, count, offset, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemcpyAsync(void *dst, const void *src, size_t count,
+                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
+                                           enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, src, count, kind, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
+                    size_t count, cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
+                                      size_t, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyPeerAsync");  // one-time lookup
+  return impl ? impl(dst, dstDevice, src, srcDevice, count, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
+    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
+    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DAsync");  // one-time lookup
+  return impl ? impl(dst, dpitch, src, spitch, width, height, kind, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
+    cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
+                                      const void *, size_t, size_t, size_t,
+                                      enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DToArrayAsync");
+  return impl ? impl(dst, wOffset, hOffset, src, spitch, width, height, kind,
+                     stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
+    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
+    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
+    cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
+                                      size_t, size_t, size_t, size_t,
+                                      enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpy2DFromArrayAsync");
+  return impl ? impl(dst, dpitch, src, wOffset, hOffset, width, height, kind,
+                     stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
+    const void *symbol, const void *src, size_t count, size_t offset,
+    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
+                               enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyToSymbolAsync");
+  return impl ? impl(symbol, src, count, offset, kind, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
+    void *dst, const void *symbol, size_t count, size_t offset,
+    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
+                                      enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyFromSymbolAsync");
+  return impl ? impl(dst, symbol, count, offset, kind, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
+                                                 size_t count) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, value, count);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
+                                                   int value, size_t width,
+                                                   size_t height) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemset2D");  // one-time lookup
+  return impl ? impl(devPtr, pitch, value, width, height)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
+    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pitchedDevPtr, value, extent);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
+    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, value, count, stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
+                  size_t height, cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
+                                      cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemset2DAsync");  // one-time lookup
+  return impl ? impl(devPtr, pitch, value, width, height, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
+                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
+  using Fn = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
+                                      struct cudaExtent, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemset3DAsync");  // one-time lookup
+  return impl ? impl(pitchedDevPtr, value, extent, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
+                                                           const void *symbol) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
+                                                        const void *symbol) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(size, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
+                     cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, dstDevice, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
+              int device) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
+                                           enum cudaMemoryAdvise, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, advice, device);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
+    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
+    const void *devPtr, size_t count) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemRangeGetAttribute");
+  return impl ? impl(data, dataSize, attribute, devPtr, count)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
+    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
+    size_t numAttributes, const void *devPtr, size_t count) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
+                               size_t, const void *, size_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemRangeGetAttributes");
+  return impl ? impl(data, dataSizes, attributes, numAttributes, devPtr, count)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
+                  const void *src, size_t count, enum cudaMemcpyKind kind) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyToArray");  // one-time lookup
+  return impl ? impl(dst, wOffset, hOffset, src, count, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
+                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
+                                      size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyFromArray");  // one-time lookup
+  return impl ? impl(dst, src, wOffset, hOffset, count, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
+    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
+    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
+    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyArrayToArray");
+  return impl ? impl(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
+                     count, kind)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyToArrayAsync");
+  return impl ? impl(dst, wOffset, hOffset, src, count, kind, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
+cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
+                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
+                         cudaStream_t stream __dv(0)) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto impl = LoadSymbol<Fn>("cudaMemcpyFromArrayAsync");
+  return impl ? impl(dst, src, wOffset, hOffset, count, kind, stream)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
+    struct cudaPointerAttributes *attributes, const void *ptr) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(attributes, ptr);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *, int, int);
+  static auto impl = LoadSymbol<Fn>("cudaDeviceCanAccessPeer");
+  return impl ? impl(canAccessPeer, device, peerDevice)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerDevice, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceDisablePeerAccess(int peerDevice) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerDevice);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
+    cudaGraphicsResource_t resource, unsigned int flags) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
+    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
+    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, stream);
+}
+
+// Stub for cudaGraphicsResourceGetMappedPointer: forwards devPtr, size and
+// resource to the LoadSymbol-resolved pointer, or errors if unresolved.
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
+    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
+  static auto impl = LoadSymbol<Fn>("cudaGraphicsResourceGetMappedPointer");
+  return impl ? impl(devPtr, size, resource) : GetSymbolNotFoundError();
+}
+
+// cudaGraphicsSubResourceGetMappedArray: LoadSymbol forwarding stub.
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
+    cudaArray_t *array, cudaGraphicsResource_t resource,
+    unsigned int arrayIndex, unsigned int mipLevel) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
+  static auto impl = LoadSymbol<Fn>("cudaGraphicsSubResourceGetMappedArray");
+  return impl ? impl(array, resource, arrayIndex, mipLevel)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+// cudaGraphicsResourceGetMappedMipmappedArray: LoadSymbol forwarding stub.
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphicsResourceGetMappedMipmappedArray(
+    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
+  using Fn =
+      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
+  static auto impl =
+      LoadSymbol<Fn>("cudaGraphicsResourceGetMappedMipmappedArray");
+  return impl ? impl(mipmappedArray, resource) : GetSymbolNotFoundError();
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
+    size_t *offset, const struct textureReference *texref, const void *devPtr,
+    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      size_t *, const struct textureReference *, const void *,
+      const struct cudaChannelFormatDesc *, size_t);
+  static auto impl = LoadSymbol<Fn>("cudaBindTexture");  // one-time lookup
+  return impl ? impl(offset, texref, devPtr, desc, size)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
+                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
+                  size_t width, size_t height, size_t pitch) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      size_t *, const struct textureReference *, const void *,
+      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
+  static auto impl = LoadSymbol<Fn>("cudaBindTexture2D");  // one-time lookup
+  return impl ? impl(offset, texref, devPtr, desc, width, height, pitch)
+              : GetSymbolNotFoundError();  // LoadSymbol yielded no pointer
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
+    const struct textureReference *texref, cudaArray_const_t array,
+    const struct cudaChannelFormatDesc *desc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      const struct textureReference *, cudaArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref, array, desc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaBindTextureToMipmappedArray(const struct textureReference *texref,
+                                cudaMipmappedArray_const_t mipmappedArray,
+                                const struct cudaChannelFormatDesc *desc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      const struct textureReference *, cudaMipmappedArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref, mipmappedArray, desc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaUnbindTexture(const struct textureReference *texref) {
+  using Fn = cudaError_t(CUDARTAPI *)(const struct textureReference *);
+  static auto fn = LoadSymbol<Fn>("cudaUnbindTexture");
+  if (fn) return fn(texref);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
+    size_t *offset, const struct textureReference *texref) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      size_t *, const struct textureReference *);
+  static auto fn = LoadSymbol<Fn>("cudaGetTextureAlignmentOffset");
+  if (fn) return fn(offset, texref);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
+    const struct textureReference **texref, const void *symbol) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      const struct textureReference **, const void *);
+  static auto fn = LoadSymbol<Fn>("cudaGetTextureReference");
+  if (fn) return fn(texref, symbol);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
+    const struct surfaceReference *surfref, cudaArray_const_t array,
+    const struct cudaChannelFormatDesc *desc) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      const struct surfaceReference *, cudaArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto fn = LoadSymbol<Fn>("cudaBindSurfaceToArray");
+  if (fn) return fn(surfref, array, desc);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
+    const struct surfaceReference **surfref, const void *symbol) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      const struct surfaceReference **, const void *);
+  static auto fn = LoadSymbol<Fn>("cudaGetSurfaceReference");
+  if (fn) return fn(surfref, symbol);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
+    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      struct cudaChannelFormatDesc *, cudaArray_const_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetChannelDesc");
+  if (fn) return fn(desc, array);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
+    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
+    const struct cudaTextureDesc *pTexDesc,
+    const struct cudaResourceViewDesc *pResViewDesc) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaTextureObject_t *, const struct cudaResourceDesc *,
+      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
+  static auto fn = LoadSymbol<Fn>("cudaCreateTextureObject");
+  if (fn) return fn(pTexObject, pResDesc, pTexDesc, pResViewDesc);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroyTextureObject(cudaTextureObject_t texObject) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaDestroyTextureObject");
+  if (fn) return fn(texObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
+    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
+  // Looks up cudaGetTextureObjectResourceDesc once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(
+      struct cudaResourceDesc *, cudaTextureObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetTextureObjectResourceDesc");
+  if (fn) return fn(pResDesc, texObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
+    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      struct cudaTextureDesc *, cudaTextureObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetTextureObjectTextureDesc");
+  if (fn) return fn(pTexDesc, texObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
+    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
+  // Looks up cudaGetTextureObjectResourceViewDesc once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(
+      struct cudaResourceViewDesc *, cudaTextureObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetTextureObjectResourceViewDesc");
+  if (fn) return fn(pResViewDesc, texObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
+    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaSurfaceObject_t *, const struct cudaResourceDesc *);
+  static auto fn = LoadSymbol<Fn>("cudaCreateSurfaceObject");
+  if (fn) return fn(pSurfObject, pResDesc);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaDestroySurfaceObject");
+  if (fn) return fn(surfObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
+    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
+  // Looks up cudaGetSurfaceObjectResourceDesc once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(
+      struct cudaResourceDesc *, cudaSurfaceObject_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetSurfaceObjectResourceDesc");
+  if (fn) return fn(pResDesc, surfObject);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *);
+  static auto fn = LoadSymbol<Fn>("cudaDriverGetVersion");
+  if (fn) return fn(driverVersion);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaRuntimeGetVersion(int *runtimeVersion) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *);
+  static auto fn = LoadSymbol<Fn>("cudaRuntimeGetVersion");
+  if (fn) return fn(runtimeVersion);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
+                                                      unsigned int flags) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
+  static auto fn = LoadSymbol<Fn>("cudaGraphCreate");
+  if (fn) return fn(pGraph, flags);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
+    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies, size_t numDependencies,
+    const struct cudaKernelNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
+      const struct cudaKernelNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddKernelNode");
+  if (fn)
+    return fn(pGraphNode, graph, pDependencies, numDependencies, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
+    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, struct cudaKernelNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphKernelNodeGetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
+    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, const struct cudaKernelNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphKernelNodeSetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
+    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies, size_t numDependencies,
+    const struct cudaMemcpy3DParms *pCopyParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
+      const struct cudaMemcpy3DParms *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddMemcpyNode");
+  if (fn)
+    return fn(pGraphNode, graph, pDependencies, numDependencies, pCopyParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
+    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, struct cudaMemcpy3DParms *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphMemcpyNodeGetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
+    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, const struct cudaMemcpy3DParms *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphMemcpyNodeSetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
+    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies, size_t numDependencies,
+    const struct cudaMemsetParams *pMemsetParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
+      const struct cudaMemsetParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddMemsetNode");
+  if (fn)
+    return fn(pGraphNode, graph, pDependencies, numDependencies, pMemsetParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
+    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, struct cudaMemsetParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphMemsetNodeGetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
+    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, const struct cudaMemsetParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphMemsetNodeSetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
+    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies, size_t numDependencies,
+    const struct cudaHostNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
+      const struct cudaHostNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddHostNode");
+  if (fn)
+    return fn(pGraphNode, graph, pDependencies, numDependencies, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
+    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, struct cudaHostNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphHostNodeGetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
+    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, const struct cudaHostNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphHostNodeSetParams");
+  if (fn) return fn(node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+                           const cudaGraphNode_t *pDependencies,
+                           size_t numDependencies, cudaGraph_t childGraph) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
+      cudaGraph_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddChildGraphNode");
+  if (fn)
+    return fn(pGraphNode, graph, pDependencies, numDependencies, childGraph);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphChildGraphNodeGetGraph");
+  if (fn) return fn(node, pGraph);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
+    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddEmptyNode");
+  if (fn) return fn(pGraphNode, graph, pDependencies, numDependencies);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphClone");
+  if (fn) return fn(pGraphClone, originalGraph);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
+                         cudaGraph_t clonedGraph) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphNodeFindInClone");
+  if (fn) return fn(pNode, originalNode, clonedGraph);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, enum cudaGraphNodeType *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphNodeGetType");
+  if (fn) return fn(node, pType);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
+                                                        cudaGraphNode_t *nodes,
+                                                        size_t *numNodes) {
+  // Looks up cudaGraphGetNodes once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphGetNodes");
+  if (fn) return fn(graph, nodes, numNodes);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
+    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
+  // Looks up cudaGraphGetRootNodes once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphGetRootNodes");
+  if (fn) return fn(graph, pRootNodes, pNumRootNodes);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
+                                                        cudaGraphNode_t *from,
+                                                        cudaGraphNode_t *to,
+                                                        size_t *numEdges) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraph_t, cudaGraphNode_t *, cudaGraphNode_t *, size_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphGetEdges");
+  if (fn) return fn(graph, from, to, numEdges);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
+    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
+    size_t *pNumDependencies) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, cudaGraphNode_t *, size_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphNodeGetDependencies");
+  if (fn) return fn(node, pDependencies, pNumDependencies);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
+    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
+    size_t *pNumDependentNodes) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphNode_t, cudaGraphNode_t *, size_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphNodeGetDependentNodes");
+  if (fn) return fn(node, pDependentNodes, pNumDependentNodes);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
+                         const cudaGraphNode_t *to, size_t numDependencies) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraph_t, const cudaGraphNode_t *, const cudaGraphNode_t *, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphAddDependencies");
+  if (fn) return fn(graph, from, to, numDependencies);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
+                            const cudaGraphNode_t *to, size_t numDependencies) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraph_t, const cudaGraphNode_t *, const cudaGraphNode_t *, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphRemoveDependencies");
+  if (fn) return fn(graph, from, to, numDependencies);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphDestroyNode(cudaGraphNode_t node) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphDestroyNode");
+  if (fn) return fn(node);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
+    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
+    char *pLogBuffer, size_t bufferSize) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphExec_t *, cudaGraph_t, cudaGraphNode_t *, char *, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphInstantiate");
+  if (fn) return fn(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
+    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
+    const struct cudaKernelNodeParams *pNodeParams) {
+  // Looks up cudaGraphExecKernelNodeSetParams once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(
+      cudaGraphExec_t, cudaGraphNode_t, const struct cudaKernelNodeParams *);
+  static auto fn = LoadSymbol<Fn>("cudaGraphExecKernelNodeSetParams");
+  if (fn) return fn(hGraphExec, node, pNodeParams);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
+                                                      cudaStream_t stream) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphLaunch");
+  if (fn) return fn(graphExec, stream);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphExecDestroy");
+  if (fn) return fn(graphExec);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaGraph_t);
+  static auto fn = LoadSymbol<Fn>("cudaGraphDestroy");
+  if (fn) return fn(graph);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
+    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
+  using Fn = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
+  static auto fn = LoadSymbol<Fn>("cudaGetExportTable");
+  if (fn) return fn(ppExportTable, pExportTableId);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc b/tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc
new file mode 100644
index 0000000..3acb286
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc
@@ -0,0 +1,1410 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaDeviceReset");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceSynchronize(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaDeviceSynchronize");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
+                                                         size_t value) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceSetLimit");
+  if (fn) return fn(limit, value);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
+  using Fn = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetLimit");
+  if (fn) return fn(pValue, limit);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetCacheConfig");
+  if (fn) return fn(pCacheConfig);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
+  // Looks up cudaDeviceGetStreamPriorityRange once, then relays the call.
+  using Fn = cudaError_t(CUDARTAPI *)(int *, int *);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetStreamPriorityRange");
+  if (fn) return fn(leastPriority, greatestPriority);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceSetCacheConfig");
+  if (fn) return fn(cacheConfig);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetSharedMemConfig");
+  if (fn) return fn(pConfig);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceSetSharedMemConfig");
+  if (fn) return fn(config);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *, const char *);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetByPCIBusId");
+  if (fn) return fn(device, pciBusId);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
+                                                            int len,
+                                                            int device) {
+  using Fn = cudaError_t(CUDARTAPI *)(char *, int, int);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetPCIBusId");
+  if (fn) return fn(pciBusId, len, device);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
+  static auto fn = LoadSymbol<Fn>("cudaIpcGetEventHandle");
+  if (fn) return fn(handle, event);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
+  static auto fn = LoadSymbol<Fn>("cudaIpcOpenEventHandle");
+  if (fn) return fn(event, handle);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
+  using Fn = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
+  static auto fn = LoadSymbol<Fn>("cudaIpcGetMemHandle");
+  if (fn) return fn(handle, devPtr);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
+    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
+  using Fn = cudaError_t(CUDARTAPI *)(
+      void **, cudaIpcMemHandle_t, unsigned int);
+  static auto fn = LoadSymbol<Fn>("cudaIpcOpenMemHandle");
+  if (fn) return fn(devPtr, handle, flags);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
+  using Fn = cudaError_t(CUDARTAPI *)(void *);
+  static auto fn = LoadSymbol<Fn>("cudaIpcCloseMemHandle");
+  if (fn) return fn(devPtr);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaThreadExit");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaThreadSynchronize");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit,
+                                                         size_t value) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto fn = LoadSymbol<Fn>("cudaThreadSetLimit");
+  if (fn) return fn(limit, value);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue,
+                                                         enum cudaLimit limit) {
+  using Fn = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto fn = LoadSymbol<Fn>("cudaThreadGetLimit");
+  if (fn) return fn(pValue, limit);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
+  static auto fn = LoadSymbol<Fn>("cudaThreadGetCacheConfig");
+  if (fn) return fn(pCacheConfig);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using Fn = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
+  static auto fn = LoadSymbol<Fn>("cudaThreadSetCacheConfig");
+  if (fn) return fn(cacheConfig);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetLastError(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaGetLastError");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaPeekAtLastError(void) {
+  using Fn = cudaError_t(CUDARTAPI *)();
+  static auto fn = LoadSymbol<Fn>("cudaPeekAtLastError");
+  if (fn) return fn();
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ const char *CUDARTAPI
+cudaGetErrorName(cudaError_t error) {
+  using Fn = const char *(CUDARTAPI *)(cudaError_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetErrorName");
+  if (fn) return fn(error);
+  return "cudaGetErrorName symbol not found.";  // fixed text, no error code
+}
+
+extern __host__ __cudart_builtin__ const char *CUDARTAPI
+cudaGetErrorString(cudaError_t error) {
+  using Fn = const char *(CUDARTAPI *)(cudaError_t);
+  static auto fn = LoadSymbol<Fn>("cudaGetErrorString");
+  if (fn) return fn(error);
+  return "cudaGetErrorString symbol not found.";  // fixed text, no error code
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDeviceCount(int *count) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *);
+  static auto fn = LoadSymbol<Fn>("cudaGetDeviceCount");
+  if (fn) return fn(count);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
+  using Fn = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
+  static auto fn = LoadSymbol<Fn>("cudaGetDeviceProperties");
+  if (fn) return fn(prop, device);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
+  using Fn = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
+  static auto fn = LoadSymbol<Fn>("cudaDeviceGetAttribute");
+  if (fn) return fn(value, attr, device);
+  return GetSymbolNotFoundError();  // symbol lookup came back null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
+                          int srcDevice, int dstDevice) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
+  static auto api = LoadSymbol<ApiFunc>("cudaDeviceGetP2PAttribute");
+  if (api) return api(value, attr, srcDevice, dstDevice);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
+  static auto api = LoadSymbol<ApiFunc>("cudaChooseDevice");
+  if (api) return api(device, prop);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(int);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetDevice");
+  if (api) return api(device);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaGetDevice(int *device) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(int *);
+  static auto api = LoadSymbol<ApiFunc>("cudaGetDevice");
+  if (api) return api(device);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
+                                                          int len) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(int *, int);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetValidDevices");
+  if (api) return api(device_arr, len);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetDeviceFlags");
+  if (api) return api(flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(unsigned int *);
+  static auto api = LoadSymbol<ApiFunc>("cudaGetDeviceFlags");
+  if (api) return api(flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t *);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamCreate");
+  if (api) return api(pStream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamCreateWithFlags");
+  if (api) return api(pStream, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
+                             int priority) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamCreateWithPriority");
+  if (api) return api(pStream, flags, priority);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamGetPriority");
+  if (api) return api(hStream, priority);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamGetFlags");
+  if (api) return api(hStream, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamDestroy(cudaStream_t stream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamDestroy");
+  if (api) return api(stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
+    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamWaitEvent");
+  if (api) return api(stream, event, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
+                      void *userData, unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
+                                           void *, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamAddCallback");
+  if (api) return api(stream, callback, userData, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaStreamSynchronize(cudaStream_t stream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamSynchronize");
+  if (api) return api(stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamQuery");
+  if (api) return api(stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
+                         size_t length __dv(0),
+                         unsigned int flags __dv(cudaMemAttachSingle)) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaStreamAttachMemAsync");
+  if (api) return api(stream, devPtr, length, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventCreate");
+  if (api) return api(event);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventCreateWithFlags");
+  if (api) return api(event, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventRecord");
+  if (api) return api(event, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventQuery");
+  if (api) return api(event);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventSynchronize");
+  if (api) return api(event);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaEventDestroy(cudaEvent_t event) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaEvent_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventDestroy");
+  if (api) return api(event);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
+                                                           cudaEvent_t start,
+                                                           cudaEvent_t end) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaEventElapsedTime");
+  if (api) return api(ms, start, end);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
+                 size_t sharedMem, cudaStream_t stream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
+                                           size_t, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaLaunchKernel");
+  if (api) return api(func, gridDim, blockDim, args, sharedMem, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
+    const void *func, dim3 gridDim, dim3 blockDim, void **args,
+    size_t sharedMem, cudaStream_t stream) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
+                                           size_t, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaLaunchCooperativeKernel");
+  if (api) return api(func, gridDim, blockDim, args, sharedMem, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
+    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
+    unsigned int flags __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
+                                           unsigned int, unsigned int);
+  static auto api =
+      LoadSymbol<ApiFunc>("cudaLaunchCooperativeKernelMultiDevice");
+  if (api) return api(launchParamsList, numDevices, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
+  static auto api = LoadSymbol<ApiFunc>("cudaFuncSetCacheConfig");
+  if (api) return api(func, cacheConfig);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
+  static auto api = LoadSymbol<ApiFunc>("cudaFuncSetSharedMemConfig");
+  if (api) return api(func, config);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaFuncGetAttributes");
+  if (api) return api(attr, func);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
+  static auto api = LoadSymbol<ApiFunc>("cudaFuncSetAttribute");
+  if (api) return api(func, attr, value);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(double *);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetDoubleForDevice");
+  if (api) return api(d);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(double *);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetDoubleForHost");
+  if (api) return api(d);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
+                                              int blockSize,
+                                              size_t dynamicSMemSize) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
+  static auto api =
+      LoadSymbol<ApiFunc>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
+  if (api) return api(numBlocks, func, blockSize, dynamicSMemSize);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
+                                                       const void *func,
+                                                       int blockSize,
+                                                       size_t dynamicSMemSize,
+                                                       unsigned int flags) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>(
+      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
+  if (api) return api(numBlocks, func, blockSize, dynamicSMemSize, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0),
+                  cudaStream_t stream __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(dim3, dim3, size_t, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaConfigureCall");
+  if (api) return api(gridDim, blockDim, sharedMem, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
+                                                        size_t size,
+                                                        size_t offset) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *, size_t, size_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaSetupArgument");
+  if (api) return api(arg, size, offset);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaLaunch");
+  if (api) return api(func);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
+    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaMallocManaged");
+  if (api) return api(devPtr, size, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMalloc(void **devPtr, size_t size) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, size_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMalloc");
+  if (api) return api(devPtr, size);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, size_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMallocHost");
+  if (api) return api(ptr, size);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
+                                                      size_t *pitch,
+                                                      size_t width,
+                                                      size_t height) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMallocPitch");
+  if (api) return api(devPtr, pitch, width, height);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
+    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
+    size_t height __dv(0), unsigned int flags __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaArray_t *,
+                                           const struct cudaChannelFormatDesc *,
+                                           size_t, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaMallocArray");
+  if (api) return api(array, desc, width, height, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaFree(void *devPtr) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaFree");
+  if (api) return api(devPtr);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaFreeHost");
+  if (api) return api(ptr);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaArray_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaFreeArray");
+  if (api) return api(array);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaFreeMipmappedArray");
+  if (api) return api(mipmappedArray);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
+                                                    unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaHostAlloc");
+  if (api) return api(pHost, size, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
+                                                       unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaHostRegister");
+  if (api) return api(ptr, size, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaHostUnregister");
+  if (api) return api(ptr);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaHostGetDevicePointer");
+  if (api) return api(pDevice, pHost, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
+                                                       void *pHost) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
+  static auto api = LoadSymbol<ApiFunc>("cudaHostGetFlags");
+  if (api) return api(pFlags, pHost);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
+  static auto api = LoadSymbol<ApiFunc>("cudaMalloc3D");
+  if (api) return api(pitchedDevPtr, extent);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
+                  struct cudaExtent extent, unsigned int flags __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaArray_t *,
+                                           const struct cudaChannelFormatDesc *,
+                                           struct cudaExtent, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaMalloc3DArray");
+  if (api) return api(array, desc, extent, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
+    cudaMipmappedArray_t *mipmappedArray,
+    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
+    unsigned int numLevels, unsigned int flags __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(
+      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
+      struct cudaExtent, unsigned int, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaMallocMipmappedArray");
+  if (api) return api(mipmappedArray, desc, extent, numLevels, flags);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
+    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
+    unsigned int level) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(
+      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
+  static auto api = LoadSymbol<ApiFunc>("cudaGetMipmappedArrayLevel");
+  if (api) return api(levelArray, mipmappedArray, level);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy3D");
+  if (api) return api(p);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy3DPeer");
+  if (api) return api(p);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
+    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy3DAsync");
+  if (api) return api(p, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
+    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
+                                           cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy3DPeerAsync");
+  if (api) return api(p, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
+                                                     size_t *total) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemGetInfo");
+  if (api) return api(free, total);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
+                 unsigned int *flags, cudaArray_t array) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
+                                           struct cudaExtent *, unsigned int *,
+                                           cudaArray_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaArrayGetInfo");
+  if (api) return api(desc, extent, flags, array);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
+                                                 size_t count,
+                                                 enum cudaMemcpyKind kind) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
+                                           enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy");
+  if (api) return api(dst, src, count, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
+                                                     const void *src,
+                                                     int srcDevice,
+                                                     size_t count) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyPeer");
+  if (api) return api(dst, dstDevice, src, srcDevice, count);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
+                  const void *src, size_t count, enum cudaMemcpyKind kind) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(
+      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyToArray");
+  if (api) return api(dst, wOffset, hOffset, src, count, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
+                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
+                                           size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyFromArray");
+  if (api) return api(dst, src, wOffset, hOffset, count, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
+    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
+    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
+    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyArrayToArray");
+  if (!api) return GetSymbolNotFoundError();  // LoadSymbol returned null
+  return api(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, count,
+             kind);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
+                                                   const void *src,
+                                                   size_t spitch, size_t width,
+                                                   size_t height,
+                                                   enum cudaMemcpyKind kind) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
+                                           size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy2D");
+  if (api) return api(dst, dpitch, src, spitch, width, height, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy2DToArray");
+  if (api) return api(dst, wOffset, hOffset, src, spitch, width, height, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
+    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
+    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
+  using ApiFunc =
+      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
+                               size_t, size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy2DFromArray");
+  if (api) return api(dst, dpitch, src, wOffset, hOffset, width, height, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
+    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
+    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
+    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
+                                           cudaArray_const_t, size_t, size_t,
+                                           size_t, size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpy2DArrayToArray");
+  if (!api) return GetSymbolNotFoundError();  // LoadSymbol returned null
+  return api(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width,
+             height, kind);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
+    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
+    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
+                                           size_t, enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyToSymbol");
+  if (api) return api(symbol, src, count, offset, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
+    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
+    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
+                                           enum cudaMemcpyKind);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyFromSymbol");
+  if (api) return api(dst, symbol, count, offset, kind);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemcpyAsync(void *dst, const void *src, size_t count,
+                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
+                                           enum cudaMemcpyKind, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyAsync");
+  if (api) return api(dst, src, count, kind, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
+                    size_t count, cudaStream_t stream __dv(0)) {
+  using ApiFunc = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
+                                           size_t, cudaStream_t);
+  static auto api = LoadSymbol<ApiFunc>("cudaMemcpyPeerAsync");
+  if (api) return api(dst, dstDevice, src, srcDevice, count, stream);
+  return GetSymbolNotFoundError();  // LoadSymbol returned null
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(
+    void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset,
+    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
+    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
+    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
+                               size_t, enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
+    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
+    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
+    cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
+                                           const void *, size_t, size_t, size_t,
+                                           enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
+                  stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
+    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
+    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
+    cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
+                                           size_t, size_t, size_t, size_t,
+                                           enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
+                  stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
+    const void *symbol, const void *src, size_t count, size_t offset,
+    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
+                               enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(symbol, src, count, offset, kind, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
+    void *dst, const void *symbol, size_t count, size_t offset,
+    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
+                                           enum cudaMemcpyKind, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, symbol, count, offset, kind, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
+                                                 size_t count) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, value, count);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
+                                                   int value, size_t width,
+                                                   size_t height) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, pitch, value, width, height);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
+    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pitchedDevPtr, value, extent);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
+    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, value, count, stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
+                  size_t height, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
+                                           cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, pitch, value, width, height, stream);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
+                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
+                                           struct cudaExtent, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pitchedDevPtr, value, extent, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
+                                                           const void *symbol) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
+                                                        const void *symbol) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(size, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
+                     cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, dstDevice, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
+              int device) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
+                                           enum cudaMemoryAdvise, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, advice, device);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
+    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
+    const void *devPtr, size_t count) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(data, dataSize, attribute, devPtr, count);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
+    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
+    size_t numAttributes, const void *devPtr, size_t count) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
+                               size_t, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
+    struct cudaPointerAttributes *attributes, const void *ptr) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(attributes, ptr);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(canAccessPeer, device, peerDevice);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerDevice, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceDisablePeerAccess(int peerDevice) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerDevice);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
+    cudaGraphicsResource_t resource, unsigned int flags) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource, flags);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
+    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
+    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, stream);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
+    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, size, resource);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
+    cudaArray_t *array, cudaGraphicsResource_t resource,
+    unsigned int arrayIndex, unsigned int mipLevel) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(array, resource, arrayIndex, mipLevel);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaGraphicsResourceGetMappedMipmappedArray(
+    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mipmappedArray, resource);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
+    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
+                                           cudaArray_const_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(desc, array);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
+    size_t *offset, const struct textureReference *texref, const void *devPtr,
+    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      size_t *, const struct textureReference *, const void *,
+      const struct cudaChannelFormatDesc *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(offset, texref, devPtr, desc, size);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
+                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
+                  size_t width, size_t height, size_t pitch) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      size_t *, const struct textureReference *, const void *,
+      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
+    const struct textureReference *texref, cudaArray_const_t array,
+    const struct cudaChannelFormatDesc *desc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      const struct textureReference *, cudaArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref, array, desc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaBindTextureToMipmappedArray(const struct textureReference *texref,
+                                cudaMipmappedArray_const_t mipmappedArray,
+                                const struct cudaChannelFormatDesc *desc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      const struct textureReference *, cudaMipmappedArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref, mipmappedArray, desc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaUnbindTexture(const struct textureReference *texref) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
+    size_t *offset, const struct textureReference *texref) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(offset, texref);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
+    const struct textureReference **texref, const void *symbol) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texref, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
+    const struct surfaceReference *surfref, cudaArray_const_t array,
+    const struct cudaChannelFormatDesc *desc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      const struct surfaceReference *, cudaArray_const_t,
+      const struct cudaChannelFormatDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(surfref, array, desc);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
+    const struct surfaceReference **surfref, const void *symbol) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(surfref, symbol);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
+    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
+    const struct cudaTextureDesc *pTexDesc,
+    const struct cudaResourceViewDesc *pResViewDesc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(
+      cudaTextureObject_t *, const struct cudaResourceDesc *,
+      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroyTextureObject(cudaTextureObject_t texObject) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
+    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResDesc, texObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
+    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexDesc, texObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
+    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
+                                           cudaTextureObject_t);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResViewDesc, texObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
+    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
+                                           const struct cudaResourceDesc *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pSurfObject, pResDesc);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(surfObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
+    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
+  using FuncPtr =
+      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResDesc, surfObject);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(driverVersion);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaRuntimeGetVersion(int *runtimeVersion) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(runtimeVersion);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
+    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ppExportTable, pExportTableId);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc
index 4d77b76..660cf8c 100644
--- a/tensorflow/stream_executor/cuda/cudart_stub.cc
+++ b/tensorflow/stream_executor/cuda/cudart_stub.cc
@@ -45,7 +45,15 @@
 
 #define __dv(v)
 #define __CUDA_DEPRECATED
+
+// New runtime symbols arrived in CUDA 10.0 and again in 10.1; pick the match.
+#if CUDART_VERSION <= 9020
+#include "tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc"
+#elif CUDART_VERSION < 10010
 #include "tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc"
+#else
+#include "tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc"
+#endif
 #undef __dv
 #undef __CUDA_DEPRECATED
 
@@ -112,4 +120,13 @@
   if (!func_ptr) return 0;
   return func_ptr(gridDim, blockDim, sharedMem, stream);
 }
+
+#if CUDART_VERSION >= 10010
+extern void CUDARTAPI __cudaRegisterFatBinaryEnd(void **fatCubinHandle) {
+  using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle);
+  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterFatBinaryEnd");
+  if (!func_ptr) return;
+  func_ptr(fatCubinHandle);
+}
+#endif
 }  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 4607a9b..c741c41 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -18,7 +18,7 @@
 
 #include <string>
 
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "absl/strings/str_cat.h"
 
 namespace stream_executor {
 namespace gpu {
@@ -29,9 +29,8 @@
   CudnnVersion(int major, int minor, int patch)
       : major_version(major), minor_version(minor), patch_level(patch) {}
 
-  tensorflow::string ToString() const {
-    return tensorflow::strings::StrCat(major_version, ".", minor_version, ".",
-                                       patch_level);
+  std::string ToString() const {
+    return absl::StrCat(major_version, ".", minor_version, ".", patch_level);
   }
 
   int major_version;
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 3247665..9038c04 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -37,7 +37,6 @@
                         kUninitializedUint64),
       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
-      blocks_per_core_limit_(kUninitializedUint64),
       threads_per_core_limit_(kUninitializedUint64),
       threads_per_block_limit_(kUninitializedUint64),
       threads_per_warp_(kUninitializedUint64),
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 356b605..db14b51 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -78,10 +78,6 @@
   // legitimate kernel launch request.
   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
 
-  // Returns the maximum number of simultaneously resident blocks
-  // on a multiprocessor.
-  int64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
-
   // Returns the limit on the total number of threads that can be launched in a
   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
   // This limit affects what constitutes a legitimate kernel launch request.
@@ -183,8 +179,6 @@
   ThreadDim thread_dim_limit_;
   BlockDim block_dim_limit_;
 
-  int64 blocks_per_core_limit_;
-
   int64 threads_per_core_limit_;
   int64 threads_per_block_limit_;
   int64 threads_per_warp_;
@@ -251,10 +245,6 @@
     device_description_->block_dim_limit_ = value;
   }
 
-  void set_blocks_per_core_limit(int64 value) {
-    device_description_->blocks_per_core_limit_ = value;
-  }
-
   void set_threads_per_core_limit(int64 value) {
     device_description_->threads_per_core_limit_ = value;
   }
diff --git a/tensorflow/stream_executor/device_memory_allocator.cc b/tensorflow/stream_executor/device_memory_allocator.cc
new file mode 100644
index 0000000..e925b7b
--- /dev/null
+++ b/tensorflow/stream_executor/device_memory_allocator.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+
+namespace stream_executor {
+
+StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
+    const Platform* platform,
+    absl::Span<StreamExecutor* const> stream_executors)
+    : DeviceMemoryAllocator(platform),
+      stream_executors_(stream_executors.begin(), stream_executors.end()) {}
+
+// Allocates `size` bytes on `device_ordinal` through that device's executor.
+port::StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {  // flag unused
+  auto exec_or = GetStreamExecutor(device_ordinal);
+  TF_RETURN_IF_ERROR(exec_or.status());
+  DeviceMemoryBase result = exec_or.ValueOrDie()->AllocateArray<uint8>(size);
+  if (size > 0 && result == nullptr) {  // a null result is fine for size 0
+    // Pre-format: errors::* appends its arguments, it does not expand % verbs.
+    return tensorflow::errors::ResourceExhausted(absl::StrFormat(
+        "Failed to allocate request for %s (%uB) on device ordinal %d",
+        tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal));
+  }
+  VLOG(3) << absl::StreamFormat(
+      "Allocated %s (%uB) on device ordinal %d: %p",
+      tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal,
+      result.opaque());
+  return OwningDeviceMemory(result, device_ordinal, this);
+}
+
+port::Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
+                                                       DeviceMemoryBase mem) {
+  if (!mem.is_null()) {
+    port::StatusOr<StreamExecutor*> stream_executor_or =
+        GetStreamExecutor(device_ordinal);
+    TF_RETURN_IF_ERROR(stream_executor_or.status());
+    VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
+                                  mem.opaque(), device_ordinal);
+    stream_executor_or.ValueOrDie()->Deallocate(&mem);
+  }
+  return port::Status::OK();
+}
+
+port::StatusOr<StreamExecutor*>
+StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
+  if (device_ordinal < 0) {
+    return tensorflow::errors::InvalidArgument(
+        "device ordinal value (%d) must be non-negative", device_ordinal);
+  }
+  if (device_ordinal >= stream_executors_.size()) {
+    return tensorflow::errors::InvalidArgument(
+        "device ordinal value (%d) >= number of devices (%u)", device_ordinal,
+        stream_executors_.size());
+  }
+  if (stream_executors_[device_ordinal] == nullptr) {
+    return tensorflow::errors::NotFound(
+        absl::StrFormat("Device %s:%d present but not supported",
+                        platform()->Name(), device_ordinal));
+  }
+  return stream_executors_[device_ordinal];
+}
+
+bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const {
+  return false;
+}
+
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/stream_executor/device_memory_allocator.h
similarity index 74%
rename from tensorflow/compiler/xla/service/device_memory_allocator.h
rename to tensorflow/stream_executor/device_memory_allocator.h
index a2308ee..0d911e2 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/stream_executor/device_memory_allocator.h
@@ -19,13 +19,13 @@
 #include <vector>
 
 #include "absl/types/span.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
+#include "tensorflow/stream_executor/platform.h"
 
-namespace xla {
+namespace stream_executor {
 
 // Interface for device memory allocators used within the XLA service. An
 // allocator is responsible for allocating memory on all devices of a particular
@@ -34,7 +34,7 @@
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const se::Platform* platform)
+  explicit DeviceMemoryAllocator(const Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -47,23 +47,23 @@
   // fails, the allocation should return immediately without retrying.  An
   // example use case is optional scratch spaces where a failure has only
   // performance impact.
-  virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                                bool retry_on_failure) = 0;
+  virtual port::StatusOr<OwningDeviceMemory> Allocate(
+      int device_ordinal, uint64 size, bool retry_on_failure) = 0;
 
   // Two-arg version of Allocate(), which sets retry-on-failure to true.
   //
   // (We don't simply use a default argument on the virtual Allocate function
   // because default args on virtual functions are disallowed by the Google
   // style guide.)
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
     return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
   }
 
   // Must be a nop for null pointers.
-  virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0;
+  virtual port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  const se::Platform* platform() const { return platform_; }
+  const Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
@@ -71,7 +71,7 @@
 
  protected:
   friend class OwningDeviceMemory;
-  const se::Platform* platform_;
+  const Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -79,28 +79,28 @@
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const se::Platform* platform,
-      absl::Span<se::StreamExecutor* const> stream_executors);
+      const Platform* platform,
+      absl::Span<StreamExecutor* const> stream_executors);
 
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) override;
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                              bool retry_on_failure) override;
 
   // Pull in two-arg overload that sets retry_on_failure to true.
   using DeviceMemoryAllocator::Allocate;
 
-  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
+  port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
  private:
-  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
+  port::StatusOr<StreamExecutor*> GetStreamExecutor(int device_ordinal);
 
   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<se::StreamExecutor*> stream_executors_;
+  std::vector<StreamExecutor*> stream_executors_;
 };
 
-}  // namespace xla
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DEVICE_MEMORY_ALLOCATOR_H_
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index fcc3db9..f8d1405 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -15,15 +15,16 @@
 
 #include "tensorflow/stream_executor/dnn.h"
 
+#include "absl/hash/hash.h"
 #include "absl/strings/str_cat.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "absl/strings/str_format.h"
 
 namespace stream_executor {
 namespace dnn {
 
 uint64 AlgorithmDesc::hash() const {
-  return ::tensorflow::Hash64Combine(algo_id(), tensor_ops_enabled());
+  auto p = std::make_pair(algo_id(), tensor_ops_enabled());
+  return absl::Hash<decltype(p)>()(p);
 }
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -283,13 +284,13 @@
 string BatchDescriptor::ToString() const {
   string spatial;
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
+    absl::StrAppendFormat(&spatial, "%d ", spatial_size()[i]);
   }
-  return port::Printf(
-      "{count: %lld feature_map_count: %lld spatial: %s "
+  return absl::StrFormat(
+      "{count: %d feature_map_count: %d spatial: %s "
       "value_min: %f value_max: %f layout: %s}",
-      count(), feature_map_count(), spatial.c_str(), value_min_, value_max_,
-      DataLayoutString(layout()).c_str());
+      count(), feature_map_count(), spatial, value_min_, value_max_,
+      DataLayoutString(layout()));
 }
 
 string BatchDescriptor::ToShortString() const {
@@ -301,7 +302,7 @@
 
   string spatial = "s";
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
+    absl::StrAppendFormat(&spatial, "%d ", spatial_size()[i]);
   }
 
   string suffix;
@@ -394,13 +395,13 @@
 }
 
 string FilterDescriptor::ToString() const {
-  string desc = port::Printf(
-      "{output_feature_map_count: %lld input_feature_map_count: %lld "
+  string desc = absl::StrFormat(
+      "{output_feature_map_count: %d input_feature_map_count: %d "
       "layout: %s shape: ",
       output_feature_map_count(), input_feature_map_count(),
-      FilterLayoutString(layout()).c_str());
+      FilterLayoutString(layout()));
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&desc, "%lld ", input_filter_dims()[i]);
+    absl::StrAppendFormat(&desc, "%d ", input_filter_dims()[i]);
   }
   absl::StrAppend(&desc, "}");
 
@@ -416,7 +417,7 @@
 
   string spatial = "s";
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&spatial, "%lld ", input_filter_dims()[i]);
+    absl::StrAppendFormat(&spatial, "%d ", input_filter_dims()[i]);
   }
 
   switch (layout()) {
@@ -470,29 +471,28 @@
   string strides;
   string dilations;
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&padding, "%lld ", this->padding()[i]);
-    port::Appendf(&strides, "%lld ", this->strides()[i]);
-    port::Appendf(&dilations, "%lld ", this->dilations()[i]);
+    absl::StrAppendFormat(&padding, "%d ", this->padding()[i]);
+    absl::StrAppendFormat(&strides, "%d ", this->strides()[i]);
+    absl::StrAppendFormat(&dilations, "%d ", this->dilations()[i]);
   }
 
-  return port::Printf(
+  return absl::StrFormat(
       "{zero_padding: %s pad_alignment: %s filter_strides: %s dilation_rates: "
       "%s}",
-      padding.c_str(), PadAlignmentString(pad_alignment()).c_str(),
-      strides.c_str(), dilations.c_str());
+      padding, PadAlignmentString(pad_alignment()), strides, dilations);
 }
 
 string ConvolutionDescriptor::ToShortString() const {
   string desc;
   for (int i = 0; i < ndims(); i++) {
-    if (i > 0) port::Appendf(&desc, "_");
-    port::Appendf(&desc, "p%d:%lld", i, padding()[i]);
+    if (i > 0) absl::StrAppend(&desc, "_");
+    absl::StrAppendFormat(&desc, "p%d:%d", i, padding()[i]);
   }
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&desc, "_s%d:%lld", i, strides()[i]);
+    absl::StrAppendFormat(&desc, "_s%d:%d", i, strides()[i]);
   }
   for (int i = 0; i < ndims(); i++) {
-    port::Appendf(&desc, "_d%d:%lld", i, dilations()[i]);
+    absl::StrAppendFormat(&desc, "_d%d:%d", i, dilations()[i]);
   }
   return desc;
 }
@@ -524,25 +524,24 @@
 
   string window, strides, padding;
   for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&window, "%lld ", window_[i]);
-    port::Appendf(&strides, "%lld ", strides_[i]);
-    port::Appendf(&padding, "%lld", padding_[i]);
+    absl::StrAppendFormat(&window, "%d ", window_[i]);
+    absl::StrAppendFormat(&strides, "%d ", strides_[i]);
+    absl::StrAppendFormat(&padding, "%d", padding_[i]);
   }
 
   const char* propagate_string = propagate_nans_ ? "Yes" : "No";
 
-  return port::Printf(
+  return absl::StrFormat(
       "{mode: %s window: %s strides: %s padding: %s propagate NaNs: %s}",
-      mode_string, window.c_str(), strides.c_str(), padding.c_str(),
-      propagate_string);
+      mode_string, window, strides, padding, propagate_string);
 }
 
 string PoolingDescriptor::ToShortString() const {
   string window, strides, padding;
   for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&window, "_w%d:%lld", i, window_[i]);
-    port::Appendf(&strides, "_s%d:%lld", i, strides_[i]);
-    port::Appendf(&padding, "_p%d:%lld", i, padding_[i]);
+    absl::StrAppendFormat(&window, "_w%d:%d", i, window_[i]);
+    absl::StrAppendFormat(&strides, "_s%d:%d", i, strides_[i]);
+    absl::StrAppendFormat(&padding, "_p%d:%d", i, padding_[i]);
   }
   return absl::StrCat(mode_ == dnn::PoolingMode::kMaximum ? "max" : "avg",
                       window, strides, padding,
@@ -569,7 +568,7 @@
 }
 
 string NormalizeDescriptor::ToString() const {
-  return port::Printf(
+  return absl::StrFormat(
       "{bias: %f range: %d alpha: %f beta: %f wrap_around: %d "
       "segment_size: %d}",
       bias_, range_, alpha_, beta_, wrap_around_, segment_size_);
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 34de151..38baef6 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -26,10 +26,10 @@
 #include <limits>
 #include <memory>
 #include <tuple>
+#include <type_traits>
 
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
-#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.pb.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -70,19 +70,34 @@
   return SetDim(absl::MakeSpan(*data), dim, value);
 }
 
-// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
-// open-source. Wrapper function that gives an int64 array slice view of a
-// repeated int64 protobuf field.
-inline absl::Span<const int64> AsInt64Slice(
-    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
-  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
-                                 v.size());
+// int64 is not the same type as tensorflow::protobuf_int64 in open-source. This
+// wrapper function gives an int64 array slice view of a repeated int64 protobuf
+// field.
+//
+// T should be a protobuf RepeatedField.
+template <typename T>
+inline absl::Span<const int64> AsInt64Slice(const T& repeated_field) {
+  using data_ty =
+      typename std::remove_reference<decltype(*repeated_field.data())>::type;
+  static_assert(std::is_integral<data_ty>::value &&
+                    std::is_signed<data_ty>::value && sizeof(data_ty) == 8,
+                "repeated_field.data() must return a pointer to a signed "
+                "64-bit integer type.");
+  return absl::Span<const int64>(
+      reinterpret_cast<const int64*>(repeated_field.data()),
+      repeated_field.size());
 }
-
-inline absl::Span<int64> AsInt64Slice(
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
-  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
-                           v->size());
+template <typename T>
+inline absl::Span<int64> AsInt64Slice(T* repeated_field) {
+  using data_ty =
+      typename std::remove_reference<decltype(*repeated_field->data())>::type;
+  static_assert(std::is_integral<data_ty>::value &&
+                    std::is_signed<data_ty>::value && sizeof(data_ty) == 8,
+                "repeated_field->data() must return a pointer to a signed "
+                "64-bit integer type.");
+  return absl::Span<int64>(
+      reinterpret_cast<int64*>(repeated_field->mutable_data()),
+      repeated_field->size());
 }
 
 // Returns a string representation of the given data layout.
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index 188137b..fb6bda9 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -71,6 +71,7 @@
   FORWARD = 1;
   BACKWARD_FILTER = 2;
   BACKWARD_DATA = 3;
+  FORWARD_BIAS_ACTIVATION = 4;
 }
 
 // Generic tensor representation.
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index 0b3ad7e..2dd0303 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -15,7 +15,8 @@
 
 #include "tensorflow/stream_executor/executor_cache.h"
 
-#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
 
 namespace stream_executor {
 
@@ -32,7 +33,7 @@
 
   Entry* entry = nullptr;
   {
-    mutex_lock lock{mutex_};
+    absl::MutexLock lock{&mutex_};
     entry = &cache_[config.ordinal];
     // Release the map lock; the address of 'entry' is stable because
     // std::map guarantees reference stability.
@@ -41,7 +42,7 @@
   // Acquire the per-Entry mutex without holding the map mutex. Initializing
   // an Executor may be expensive, so we want to allow concurrent
   // initialization of different entries.
-  mutex_lock lock{entry->configurations_mutex};
+  absl::MutexLock lock{&entry->configurations_mutex};
   for (const auto& iter : entry->configurations) {
     if (iter.first.plugin_config == config.plugin_config &&
         iter.first.device_options == config.device_options) {
@@ -66,21 +67,23 @@
     const StreamExecutorConfig& config) {
   Entry* entry = nullptr;
   {
-    tf_shared_lock lock{mutex_};
+    absl::ReaderMutexLock lock{&mutex_};
     auto it = cache_.find(config.ordinal);
     if (it != cache_.end()) {
       entry = &it->second;
     } else {
-      return port::Status(port::error::NOT_FOUND,
-                          port::Printf("No executors registered for ordinal %d",
-                                       config.ordinal));
+      return port::Status(
+          port::error::NOT_FOUND,
+          absl::StrFormat("No executors registered for ordinal %d",
+                          config.ordinal));
     }
   }
-  tf_shared_lock lock{entry->configurations_mutex};
+  absl::ReaderMutexLock lock{&entry->configurations_mutex};
   if (entry->configurations.empty()) {
     return port::Status(
         port::error::NOT_FOUND,
-        port::Printf("No executors registered for ordinal %d", config.ordinal));
+        absl::StrFormat("No executors registered for ordinal %d",
+                        config.ordinal));
   }
   for (const auto& iter : entry->configurations) {
     if (iter.first.plugin_config == config.plugin_config &&
@@ -94,12 +97,12 @@
 }
 
 void ExecutorCache::DestroyAllExecutors() {
-  mutex_lock lock{mutex_};
+  absl::MutexLock lock{&mutex_};
   cache_.clear();
 }
 
 ExecutorCache::Entry::~Entry() {
-  mutex_lock lock{configurations_mutex};
+  absl::MutexLock lock{&configurations_mutex};
   configurations.clear();
 }
 
diff --git a/tensorflow/stream_executor/executor_cache.h b/tensorflow/stream_executor/executor_cache.h
index bbeeaed..0100f6d 100644
--- a/tensorflow/stream_executor/executor_cache.h
+++ b/tensorflow/stream_executor/executor_cache.h
@@ -19,9 +19,9 @@
 #include <functional>
 #include <map>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 namespace stream_executor {
@@ -54,11 +54,11 @@
   struct Entry {
     ~Entry();
 
-    // Mutex that locks the contents of each entry. The 'mutex_' of the
+    // Mutex that guards the contents of each entry. The 'mutex_' of the
     // ExecutorCache class protects both the 'cache_' and the existence of each
     // Entry, but not the Entry's contents. 'configurations_mutex' protects the
     // contents of the entry after 'mutex_' has been dropped.
-    mutex configurations_mutex;
+    absl::Mutex configurations_mutex;
 
     // Vector of cached {config, executor} pairs.
     std::vector<
@@ -69,7 +69,7 @@
   // Maps ordinal number to a list of cached executors for that ordinal.
   // We key off of ordinal (instead of just looking up all fields in the
   // StreamExecutorConfig) for a slight improvement in lookup time.
-  mutex mutex_;
+  absl::Mutex mutex_;
   std::map<int, Entry> cache_ GUARDED_BY(mutex_);
 
   SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
index e681238..9ac895d 100644
--- a/tensorflow/stream_executor/gpu/BUILD
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -11,7 +11,10 @@
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 
 package(
-    default_visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    default_visibility = [
+        "//tensorflow/compiler/xla/service/gpu:__subpackages__",
+        "//tensorflow/stream_executor:__subpackages__",
+    ],
 )
 
 # Filegroup used to collect source files for the dependency check.
@@ -98,6 +101,7 @@
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -126,6 +130,7 @@
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:rng",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
index a5ef48d..73cc24f 100644
--- a/tensorflow/stream_executor/gpu/gpu_driver.h
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -90,8 +90,8 @@
   // Creates a new event associated with the given context.
   // result is an outparam owned by the caller and must not be null.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
-                                  EventFlags flags);
+  static port::Status InitEvent(GpuContext* context, GpuEventHandle* result,
+                                EventFlags flags);
 
   // Destroys *event and turns it into a nullptr. event may not be null, but
   // *event may be, via cuEventDestroy
diff --git a/tensorflow/stream_executor/gpu/gpu_event.cc b/tensorflow/stream_executor/gpu/gpu_event.cc
index a523958..abd8937 100644
--- a/tensorflow/stream_executor/gpu/gpu_event.cc
+++ b/tensorflow/stream_executor/gpu/gpu_event.cc
@@ -28,8 +28,8 @@
 GpuEvent::~GpuEvent() {}
 
 port::Status GpuEvent::Init() {
-  return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
-                                GpuDriver::EventFlags::kDisableTiming);
+  return GpuDriver::InitEvent(parent_->gpu_context(), &gpu_event_,
+                              GpuDriver::EventFlags::kDisableTiming);
 }
 
 port::Status GpuEvent::Destroy() {
diff --git a/tensorflow/stream_executor/gpu/gpu_executor.h b/tensorflow/stream_executor/gpu/gpu_executor.h
index 1b5151a..2149f13 100644
--- a/tensorflow/stream_executor/gpu/gpu_executor.h
+++ b/tensorflow/stream_executor/gpu/gpu_executor.h
@@ -26,12 +26,12 @@
 #include <unordered_map>
 
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/gpu/gpu_kernel.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -198,12 +198,13 @@
   bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
                  void** mem, size_t* bytes) override;
 
-  DeviceDescription* PopulateDeviceDescription() const override;
+  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+      const override {
+    return CreateDeviceDescription(device_ordinal_);
+  }
 
-  // Populates the block_dim_limit by querying the device driver API. If an
-  // error occurs at any point while asking the driver for block dim limits, it
-  // will be only partially populated as a result, and an error will be logged.
-  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
+  static port::StatusOr<std::unique_ptr<DeviceDescription>>
+  CreateDeviceDescription(int device_ordinal);
 
   bool SupportsBlas() const override;
 
@@ -286,7 +287,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
   // Guards the on-disk-module mapping.
-  mutex disk_modules_mu_;
+  absl::Mutex disk_modules_mu_;
 
   // Mapping from filename to GPUModuleHandle, if it was already retrieved.
   // Multiple GPUFunctionHandle are usually obtained from a single
@@ -295,7 +296,7 @@
   std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
 
   // Guards the in-memory-module mapping.
-  mutex in_memory_modules_mu_;
+  absl::Mutex in_memory_modules_mu_;
 
   std::map<const char*, GpuModuleHandle> in_memory_modules_
       GUARDED_BY(in_memory_modules_mu_);
@@ -308,7 +309,7 @@
       gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
 
   // Guards the launched kernel set.
-  mutex launched_kernels_mu_;
+  absl::Mutex launched_kernels_mu_;
 
   // Keeps track of the set of launched kernels. Currently used to suppress the
   // occupancy check on subsequent launches.
diff --git a/tensorflow/stream_executor/gpu/gpu_rng.h b/tensorflow/stream_executor/gpu/gpu_rng.h
index d4bf1e1..f256e54 100644
--- a/tensorflow/stream_executor/gpu/gpu_rng.h
+++ b/tensorflow/stream_executor/gpu/gpu_rng.h
@@ -16,14 +16,13 @@
 #ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/rng.h"
 
-#include "tensorflow/stream_executor/gpu/gpu_types.h"
-
 namespace stream_executor {
 
 class Stream;
@@ -83,8 +82,8 @@
   // with random number generation.
   bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  // mutex that guards the gpu rng library handle for this device.
-  mutex mu_;
+  // Guards the gpu rng library handle for this device.
+  absl::Mutex mu_;
 
   // GpuExecutor which instantiated this GpuRng.
   // Immutable post-initialization.
diff --git a/tensorflow/stream_executor/gpu/gpu_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc
index f435003..887522c 100644
--- a/tensorflow/stream_executor/gpu/gpu_stream.cc
+++ b/tensorflow/stream_executor/gpu/gpu_stream.cc
@@ -26,8 +26,8 @@
   if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
     return false;
   }
-  return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
-                                GpuDriver::EventFlags::kDisableTiming)
+  return GpuDriver::InitEvent(parent_->gpu_context(), &completed_event_,
+                              GpuDriver::EventFlags::kDisableTiming)
       .ok();
 }
 
diff --git a/tensorflow/stream_executor/gpu/gpu_timer.cc b/tensorflow/stream_executor/gpu/gpu_timer.cc
index cc4b50d..3294590 100644
--- a/tensorflow/stream_executor/gpu/gpu_timer.cc
+++ b/tensorflow/stream_executor/gpu/gpu_timer.cc
@@ -26,15 +26,15 @@
 bool GpuTimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
   GpuContext* context = parent_->gpu_context();
-  port::Status status = GpuDriver::CreateEvent(context, &start_event_,
-                                               GpuDriver::EventFlags::kDefault);
+  port::Status status = GpuDriver::InitEvent(context, &start_event_,
+                                             GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return false;
   }
 
-  status = GpuDriver::CreateEvent(context, &stop_event_,
-                                  GpuDriver::EventFlags::kDefault);
+  status = GpuDriver::InitEvent(context, &stop_event_,
+                                GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
     status = GpuDriver::DestroyEvent(context, &start_event_);
diff --git a/tensorflow/stream_executor/host/BUILD b/tensorflow/stream_executor/host/BUILD
index 127452a..6ad06bb 100644
--- a/tensorflow/stream_executor/host/BUILD
+++ b/tensorflow/stream_executor/host/BUILD
@@ -4,6 +4,7 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 package_group(
     name = "friends",
@@ -51,6 +52,8 @@
         "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
     ],
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
 )
@@ -64,8 +67,10 @@
         "host_stream.h",
     ],
     deps = [
+        "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor:kernel",
         "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -108,3 +113,18 @@
     ],
     alwayslink = True,
 )
+
+tf_cc_test(
+    name = "host_stream_test",
+    srcs = ["host_stream_test.cc"],
+    deps = [
+        ":host_platform",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor:stream",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 318f0dc..ad25dd4 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -182,7 +182,8 @@
   return port::Status::OK();
 }
 
-DeviceDescription *HostExecutor::PopulateDeviceDescription() const {
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+HostExecutor::CreateDeviceDescription(int device_ordinal) {
   internal::DeviceDescriptionBuilder builder;
 
   builder.set_device_address_bits(64);
@@ -195,8 +196,7 @@
       tensorflow::profile_utils::CpuUtils::GetCycleCounterFrequency());
   builder.set_clock_rate_ghz(cycle_counter_frequency / 1e9);
 
-  auto built = builder.Build();
-  return built.release();
+  return builder.Build();
 }
 
 bool HostExecutor::SupportsBlas() const {
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 5422029..a1dbb9f 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -61,7 +61,7 @@
   }
 
   void *Allocate(uint64 size) override;
-  void *GetSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
+  void *GetSubBuffer(DeviceMemoryBase *parent, uint64 offset_bytes,
                      uint64 size_bytes) override;
   void Deallocate(DeviceMemoryBase *mem) override;
 
@@ -77,7 +77,7 @@
   bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
               uint64 size) override;
   bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
-                            const DeviceMemoryBase &host_src,
+                            const DeviceMemoryBase &gpu_src,
                             uint64 size) override;
 
   bool MemZero(Stream *stream, DeviceMemoryBase *location,
@@ -147,7 +147,13 @@
     return false;
   }
 
-  DeviceDescription *PopulateDeviceDescription() const override;
+  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+      const override {
+    return CreateDeviceDescription(0);
+  }
+
+  static port::StatusOr<std::unique_ptr<DeviceDescription>>
+  CreateDeviceDescription(int device_ordinal);
 
   port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
     return port::Status::OK();
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index d16cca8..6250de8 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -17,14 +17,13 @@
 
 #include <thread>
 
+#include "absl/strings/str_format.h"
 #include "tensorflow/stream_executor/host/host_gpu_executor.h"
 #include "tensorflow/stream_executor/host/host_platform_id.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
 namespace host {
@@ -41,6 +40,11 @@
 
 const string& HostPlatform::Name() const { return name_; }
 
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+HostPlatform::DescriptionForDevice(int ordinal) const {
+  return HostExecutor::CreateDeviceDescription(ordinal);
+}
+
 port::StatusOr<StreamExecutor*> HostPlatform::ExecutorForDevice(int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -66,13 +70,13 @@
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<HostExecutor>(config.plugin_config));
+  auto executor = absl::make_unique<StreamExecutor>(
+      this, absl::make_unique<HostExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status(
         port::error::INTERNAL,
-        port::Printf(
+        absl::StrFormat(
             "failed initializing StreamExecutor for device ordinal %d: %s",
             config.ordinal, init_status.ToString().c_str()));
   }
diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h
index c6f46a2..1a5c276 100644
--- a/tensorflow/stream_executor/host/host_platform.h
+++ b/tensorflow/stream_executor/host/host_platform.h
@@ -27,7 +27,6 @@
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
@@ -52,6 +51,9 @@
 
   const string& Name() const override;
 
+  port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+
   port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
 
   port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc
index bfbfb56..413edc6 100644
--- a/tensorflow/stream_executor/host/host_stream.cc
+++ b/tensorflow/stream_executor/host/host_stream.cc
@@ -17,49 +17,62 @@
 // the HostExecutor implementation.
 #include "tensorflow/stream_executor/host/host_stream.h"
 
+#include "absl/synchronization/notification.h"
+#include "tensorflow/core/platform/denormal.h"
+#include "tensorflow/core/platform/setround.h"
+
 namespace stream_executor {
 namespace host {
 
 HostStream::HostStream()
-    : host_executor_(new port::ThreadPool(port::Env::Default(),
-                                          port::ThreadOptions(),
-                                          "host_executor", kExecutorThreads)) {}
+    : thread_(port::Env::Default()->StartThread(
+          port::ThreadOptions(), "host_executor", [this]() { WorkLoop(); })) {}
 
-HostStream::~HostStream() {}
-
-bool HostStream::EnqueueTask(std::function<void()> task) {
-  struct NotifiedTask {
-    HostStream* stream;
-    std::function<void()> task;
-
-    void operator()() {
-      task();
-      // Destroy the task before unblocking its waiters, as BlockHostUntilDone()
-      // should guarantee that all tasks are destroyed.
-      task = std::function<void()>();
-      {
-        mutex_lock lock(stream->mu_);
-        --stream->pending_tasks_;
-      }
-      stream->completion_condition_.notify_all();
-    }
-  };
-
+HostStream::~HostStream() {
   {
-    mutex_lock lock(mu_);
-    ++pending_tasks_;
+    absl::MutexLock lock(&mu_);
+    work_queue_.push(nullptr);  // Null task tells WorkLoop() to return.
   }
-  host_executor_->Schedule(NotifiedTask{this, std::move(task)});
+  // thread_'s destructor blocks until the thread finishes running.
+  thread_.reset();
+}
+
+bool HostStream::EnqueueTask(std::function<void()> fn) {
+  CHECK(fn != nullptr);  // A null task is reserved as the shutdown signal.
+  absl::MutexLock lock(&mu_);
+  work_queue_.push(std::move(fn));
   return true;
 }
 
-void HostStream::BlockUntilDone() {
-  mutex_lock lock(mu_);
-  while (pending_tasks_ != 0) {
-    completion_condition_.wait(lock);
+bool HostStream::WorkAvailable() { return !work_queue_.empty(); }
+
+void HostStream::WorkLoop() {
+  // Give this thread the same denormal-flushing and rounding mode as the
+  // default TF ThreadPool threads.
+  // TODO(phawkins, jlebar): it's not clear this is the best place to set this.
+  tensorflow::port::ScopedFlushDenormal flush;
+  tensorflow::port::ScopedSetRound round(FE_TONEAREST);
+  while (true) {
+    std::function<void()> task;
+    {
+      absl::MutexLock lock(&mu_);
+      mu_.Await(absl::Condition(this, &HostStream::WorkAvailable));
+      task = std::move(work_queue_.front());
+      work_queue_.pop();
+    }
+    if (task == nullptr) {  // Null sentinel queued by ~HostStream().
+      return;
+    }
+    task();
   }
 }
 
+void HostStream::BlockUntilDone() {
+  absl::Notification drained;  // Fired by a marker task at the queue tail.
+  EnqueueTask([&drained]() { drained.Notify(); });
+  drained.WaitForNotification();
+}
+
 }  // namespace host
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h
index be88f07..1bf37ea 100644
--- a/tensorflow/stream_executor/host/host_stream.h
+++ b/tensorflow/stream_executor/host/host_stream.h
@@ -20,7 +20,9 @@
 
 #include <functional>
 #include <memory>
+#include <queue>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
@@ -40,14 +42,12 @@
   void BlockUntilDone();
 
  private:
-  // Use only one thread and own task queue to preserve FIFO ordering
-  // for the operations enqueued by any given stream.
-  static const int kExecutorThreads = 1;
-  std::unique_ptr<port::ThreadPool> host_executor_;
+  bool WorkAvailable() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  void WorkLoop();
 
-  mutex mu_;
-  int pending_tasks_ GUARDED_BY(mu_) = 0;
-  condition_variable completion_condition_;
+  absl::Mutex mu_;
+  std::queue<std::function<void()>> work_queue_ GUARDED_BY(mu_);
+  std::unique_ptr<port::Thread> thread_;
 };
 
 }  // namespace host
diff --git a/tensorflow/stream_executor/host/host_stream_test.cc b/tensorflow/stream_executor/host/host_stream_test.cc
new file mode 100644
index 0000000..bcc7817
--- /dev/null
+++ b/tensorflow/stream_executor/host/host_stream_test.cc
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace se = stream_executor;
+
+TEST(HostStream, EnforcesFIFOOrder) {
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
+  se::StreamExecutor* executor = platform->ExecutorForDevice(0).ValueOrDie();
+  se::Stream stream(executor);
+  stream.Init();
+
+  absl::Mutex mu;
+  int expected = 0;
+  bool ok = true;
+  for (int i = 0; i < 2000; ++i) {
+    stream.ThenDoHostCallback([i, &mu, &expected, &ok]() {
+      absl::MutexLock lock(&mu);
+      // Callback i must be the i-th to run; any other order breaks FIFO.
+      ok = ok && (expected == i);
+      ++expected;
+    });
+  }
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+  absl::MutexLock lock(&mu);
+  EXPECT_TRUE(ok);
+  EXPECT_EQ(expected, 2000);  // Every callback must have run by now.
+}
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 240e955..2aee961 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -19,13 +19,12 @@
 
 #include "tensorflow/stream_executor/kernel.h"
 
-#include "tensorflow/stream_executor/platform/port.h"
-
 #include "absl/strings/string_view.h"
-#include "tensorflow/core/lib/strings/str_util.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/stream_executor/lib/demangle.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace stream_executor {
@@ -91,16 +90,12 @@
   return implementation_->GetPreferredCacheConfig();
 }
 
-// Prefix stub functions emitted by the CUDA splitter.
-static const char *kStubPrefix = "__device_stub_";
-
 void KernelBase::set_name(absl::string_view name) {
   name_ = string(name);
-  absl::string_view stubless_name = name;
-  if (tensorflow::str_util::StartsWith(name, kStubPrefix)) {
-    stubless_name.remove_prefix(strlen(kStubPrefix));
-  }
-  demangled_name_ = port::Demangle(stubless_name.data());
+
+  // Strip the CUDA splitter's __device_stub_ prefix; name_ is NUL-terminated.
+  demangled_name_ =
+      port::Demangle(absl::StripPrefix(name_, "__device_stub_").data());
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index 2e090af..d7e0020 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -94,7 +94,7 @@
     return nullptr;
   }
 
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   auto ptx = ptx_by_compute_capability_.begin()->second;
   // Check if there is an entry in decompressed ptx table.
@@ -128,7 +128,7 @@
     return nullptr;
   }
 
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   // Check if there is an entry in decompressed ptx table.
   auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
index 04b2eab..7199f60 100644
--- a/tensorflow/stream_executor/kernel_spec.h
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -47,13 +47,13 @@
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
 
 #include <stddef.h>
+
 #include <map>
 #include <memory>
-#include "tensorflow/stream_executor/platform/port.h"
 
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -206,7 +206,7 @@
   // Stores all decompressed ptx strings, with original ptx string as keys.
   // It is marked as mutable for lazy decompression.
   mutable std::map<const char *, string> decompressed_ptx_;
-  mutable mutex mu_;
+  mutable absl::Mutex mu_;
 
   // Defines the minimum compute capability possible. Used when PTX has no
   // compute capability specified (in the single-PTX constructor).
diff --git a/tensorflow/stream_executor/lib/BUILD b/tensorflow/stream_executor/lib/BUILD
index 133ff2b..edf4958 100644
--- a/tensorflow/stream_executor/lib/BUILD
+++ b/tensorflow/stream_executor/lib/BUILD
@@ -32,9 +32,9 @@
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:ptr_util",
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -50,13 +50,3 @@
         "//tensorflow/core:test_main",
     ],
 )
-
-cc_library(
-    name = "utility_headers",
-    hdrs = [
-        "ptr_util.h",
-    ],
-    deps = [
-        "//tensorflow/core:ptr_util",
-    ],
-)
diff --git a/tensorflow/stream_executor/lib/human_readable.h b/tensorflow/stream_executor/lib/human_readable.h
index 893865f..5e5525e 100644
--- a/tensorflow/stream_executor/lib/human_readable.h
+++ b/tensorflow/stream_executor/lib/human_readable.h
@@ -17,9 +17,10 @@
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
 
 #include <assert.h>
+
 #include <limits>
 
-#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -38,7 +39,7 @@
     // Special case for bytes.
     if (num_bytes < 1024LL) {
       // No fractions for bytes.
-      return port::Printf("%s%lldB", neg_str, num_bytes);
+      return absl::StrFormat("%s%dB", neg_str, num_bytes);
     }
 
     static const char units[] = "KMGTPE";  // int64 only goes up to E.
@@ -49,8 +50,10 @@
       assert(unit < units + sizeof(units));
     }
 
-    return port::Printf(((*unit == 'K') ? "%s%.1f%c" : "%s%.2f%c"), neg_str,
-                        num_bytes / 1024.0, *unit);
+    if (*unit == 'K') {
+      return absl::StrFormat("%s%.1f%c", neg_str, num_bytes / 1024.0, *unit);
+    }
+    return absl::StrFormat("%s%.2f%c", neg_str, num_bytes / 1024.0, *unit);
   }
 
  private:
diff --git a/tensorflow/stream_executor/lib/notification.h b/tensorflow/stream_executor/lib/notification.h
deleted file mode 100644
index 472d8c9..0000000
--- a/tensorflow/stream_executor/lib/notification.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
-
-#include "tensorflow/core/platform/notification.h"
-
-namespace stream_executor {
-namespace port {
-
-using tensorflow::Notification;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
index 76a623c..902331b 100644
--- a/tensorflow/stream_executor/lib/path.h
+++ b/tensorflow/stream_executor/lib/path.h
@@ -17,14 +17,11 @@
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
 
 #include "absl/strings/string_view.h"
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
 namespace port {
 
-using tensorflow::io::Dirname;
-
 namespace internal {
 // TODO(rspringer): Move to cc/implementation file.
 // Not part of the public API.
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index 72d71e6..1b85a76 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -21,8 +21,10 @@
 #include <WinSock2.h>
 #pragma comment(lib, "Ws2_32.lib")
 #else
+#include <errno.h>
 #include <unistd.h>
 #endif
+
 #include <memory>
 
 namespace stream_executor {
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
deleted file mode 100644
index 8f9f420..0000000
--- a/tensorflow/stream_executor/lib/ptr_util.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
-
-#include <memory>
-#include "tensorflow/core/util/ptr_util.h"
-
-namespace stream_executor {
-using tensorflow::MakeUnique;
-using tensorflow::WrapUnique;
-}  // namespace stream_executor
-
-namespace perftools {
-namespace gputools {
-
-// Temporarily pull stream_executor into perftools::gputools while we migrate
-// code to the new namespace.  TODO(jlebar): Remove this once we've completed
-// the migration.
-using namespace stream_executor;  // NOLINT[build/namespaces]
-
-}  // namespace gputools
-}  // namespace perftools
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
deleted file mode 100644
index e99dfa8..0000000
--- a/tensorflow/stream_executor/lib/str_util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-
-namespace stream_executor {
-namespace port {
-
-using tensorflow::str_util::Join;
-using tensorflow::str_util::Split;
-
-// Returns a copy of the input string 'str' with the given 'suffix'
-// removed. If the suffix doesn't match, returns a copy of the original string.
-inline string StripSuffixString(absl::string_view str,
-                                absl::string_view suffix) {
-  if (tensorflow::str_util::EndsWith(str, suffix)) {
-    str.remove_suffix(suffix.size());
-  }
-  return string(str);
-}
-
-using tensorflow::str_util::Lowercase;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/stringprintf.h b/tensorflow/stream_executor/lib/stringprintf.h
deleted file mode 100644
index 2f65ed9..0000000
--- a/tensorflow/stream_executor/lib/stringprintf.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
-
-#include "tensorflow/core/lib/strings/stringprintf.h"
-
-namespace stream_executor {
-namespace port {
-
-using tensorflow::strings::Printf;
-using tensorflow::strings::Appendf;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
diff --git a/tensorflow/stream_executor/lib/threadpool.h b/tensorflow/stream_executor/lib/threadpool.h
index 220068a..b0986e4 100644
--- a/tensorflow/stream_executor/lib/threadpool.h
+++ b/tensorflow/stream_executor/lib/threadpool.h
@@ -18,12 +18,12 @@
 
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/stream_executor/lib/env.h"
-#include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/thread_options.h"
 
 namespace stream_executor {
 namespace port {
 
+using tensorflow::Thread;
 using tensorflow::thread::ThreadPool;
 
 }  // namespace port
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index bbb5607..434efa9 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -17,12 +17,12 @@
 
 #include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
 namespace {
@@ -71,7 +71,7 @@
 port::Status MultiPlatformManagerImpl::RegisterPlatform(
     std::unique_ptr<Platform> platform) {
   CHECK(platform != nullptr);
-  string key = port::Lowercase(platform->Name());
+  string key = absl::AsciiStrToLower(platform->Name());
   absl::MutexLock lock(&mu_);
   if (name_map_.find(key) != name_map_.end()) {
     return port::Status(port::error::INTERNAL,
@@ -140,7 +140,7 @@
   if (platform->Initialized()) {
     return port::Status(
         port::error::FAILED_PRECONDITION,
-        port::Printf("platform with id 0x%p is already initialized", id));
+        absl::StrFormat("platform with id %p is already initialized", id));
   }
 
   SE_RETURN_IF_ERROR(platform->Initialize(options));
@@ -170,7 +170,7 @@
 
 port::StatusOr<Platform*> MultiPlatformManagerImpl::LookupByNameLocked(
     absl::string_view target) {
-  auto it = name_map_.find(port::Lowercase(target));
+  auto it = name_map_.find(absl::AsciiStrToLower(target));
   if (it == name_map_.end()) {
     return port::Status(
         port::error::NOT_FOUND,
@@ -186,7 +186,7 @@
   if (it == id_map_.end()) {
     return port::Status(
         port::error::NOT_FOUND,
-        port::Printf("could not find registered platform with id: 0x%p", id));
+        absl::StrFormat("could not find registered platform with id: %p", id));
   }
   return it->second;
 }
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.cc b/tensorflow/stream_executor/owning_device_memory.cc
similarity index 82%
rename from tensorflow/compiler/xla/service/owning_device_memory.cc
rename to tensorflow/stream_executor/owning_device_memory.cc
index c115bc0..8b92ccf 100644
--- a/tensorflow/compiler/xla/service/owning_device_memory.cc
+++ b/tensorflow/stream_executor/owning_device_memory.cc
@@ -13,11 +13,11 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
-namespace xla {
+namespace stream_executor {
 
 void OwningDeviceMemory::Free() {
   CHECK(allocator_ != nullptr)
@@ -29,7 +29,7 @@
   }
 
   allocator_ = nullptr;
-  mem_ = se::DeviceMemoryBase();
+  mem_ = DeviceMemoryBase();
 }
 
-}  // namespace xla
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.h b/tensorflow/stream_executor/owning_device_memory.h
similarity index 88%
rename from tensorflow/compiler/xla/service/owning_device_memory.h
rename to tensorflow/stream_executor/owning_device_memory.h
index 4be9bd8..46946c4 100644
--- a/tensorflow/compiler/xla/service/owning_device_memory.h
+++ b/tensorflow/stream_executor/owning_device_memory.h
@@ -16,12 +16,10 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
 
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace xla {
+namespace stream_executor {
 
 // Break circular dependency between this file and device_memory_allocator.h.
 class DeviceMemoryAllocator;
@@ -43,7 +41,7 @@
  public:
   OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {}
 
-  explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal,
+  explicit OwningDeviceMemory(DeviceMemoryBase mem, int device_ordinal,
                               DeviceMemoryAllocator* allocator)
       : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) {
     CHECK(allocator != nullptr) << "allocator cannot be null.";
@@ -53,7 +51,7 @@
       : mem_(other.mem_),
         device_ordinal_(other.device_ordinal_),
         allocator_(other.allocator_) {
-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
   }
 
@@ -65,7 +63,7 @@
     device_ordinal_ = other.device_ordinal_;
     allocator_ = other.allocator_;
 
-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
     return *this;
   }
@@ -100,25 +98,25 @@
   // !is_null() is sufficient but not necessary to imply `this` is active.
   bool is_null() const { return mem_.is_null(); }
 
-  se::DeviceMemoryBase AsDeviceMemoryBase() const {
+  DeviceMemoryBase AsDeviceMemoryBase() const {
     // This const_cast is necessary because DeviceMemoryBase's constructor
     // doesn't accept a const void*.  This isn't ideal, but it's better than the
     // alternative of making a AsDeviceMemoryBase non-const member function.
     //
     // This is safe (i.e. not UB) because the casted pointer is derived from a
     // non-const pointer, namely mem_.opaque().
-    return se::DeviceMemoryBase(const_cast<void*>(opaque()), size());
+    return DeviceMemoryBase(const_cast<void*>(opaque()), size());
   }
 
   // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates
   // this object.  Precondition: `this` is active.
-  TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() {
+  TF_MUST_USE_RESULT DeviceMemoryBase Forget() {
     CHECK(allocator_ != nullptr)
         << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, "
            "or Free()'ed) instance.";
     allocator_ = nullptr;
-    se::DeviceMemoryBase mem(mem_);
-    mem_ = se::DeviceMemoryBase();
+    DeviceMemoryBase mem(mem_);
+    mem_ = DeviceMemoryBase();
     return mem;
   }
 
@@ -127,11 +125,11 @@
   void Free();
 
  private:
-  se::DeviceMemoryBase mem_;
+  DeviceMemoryBase mem_;
   int device_ordinal_;
   DeviceMemoryAllocator* allocator_;  // Null if this object is inactive.
 };
 
-}  // namespace xla
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 2c2cd77..aefb94f 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -21,6 +21,7 @@
 
 #include <map>
 
+#include "tensorflow/stream_executor/device_description.h"
 #include "tensorflow/stream_executor/device_options.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
@@ -32,6 +33,7 @@
 namespace stream_executor {
 
 class StreamExecutor;
+class DeviceDescription;
 
 // Describes the platform for a StreamExecutor instantiation to act upon.
 //
@@ -133,6 +135,15 @@
   virtual port::Status Initialize(
       const std::map<string, string>& platform_options);
 
+  // Returns a populated DeviceDescription for the device at the given ordinal.
+  // This should not require device initialization. Note that not all platforms
+  // may support acquiring the DeviceDescription indirectly.
+  //
+  // Alternatively callers may call GetDeviceDescription() on the StreamExecutor
+  // which returns a cached instance specific to the initialized StreamExecutor.
+  virtual port::StatusOr<std::unique_ptr<DeviceDescription>>
+  DescriptionForDevice(int ordinal) const = 0;
+
   // Returns a device with the given ordinal on this platform with a default
   // plugin configuration or, if none can be found with the given ordinal or
   // there is an error in opening a context to communicate with the device, an
diff --git a/tensorflow/stream_executor/platform/BUILD b/tensorflow/stream_executor/platform/BUILD
index 702b2cd..ee093b4 100644
--- a/tensorflow/stream_executor/platform/BUILD
+++ b/tensorflow/stream_executor/platform/BUILD
@@ -16,7 +16,6 @@
     name = "platform",
     textual_hdrs = [
         "logging.h",
-        "mutex.h",
         "platform.h",
         "port.h",
         "thread_annotations.h",
diff --git a/tensorflow/stream_executor/platform/default/BUILD b/tensorflow/stream_executor/platform/default/BUILD
index f1ae7d8..1be09d8 100644
--- a/tensorflow/stream_executor/platform/default/BUILD
+++ b/tensorflow/stream_executor/platform/default/BUILD
@@ -6,7 +6,6 @@
     name = "platform",
     textual_hdrs = [
         "initialize.h",
-        "mutex.h",
     ],
     deps = ["//tensorflow/core:lib"],
 )
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index d4ba67f..80d71e2 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -19,7 +19,6 @@
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "cuda/cuda_config.h"
-#include "tensorflow/core/platform/load_library.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/path.h"
@@ -84,11 +83,11 @@
 }
 
 port::StatusOr<void*> GetCusolverDsoHandle() {
-  return GetDsoHandle("cusolver", GetCudaVersion());
+  return GetDsoHandle("cusolver", GetCudaLibVersion());
 }
 
 port::StatusOr<void*> GetCusparseDsoHandle() {
-  return GetDsoHandle("cusparse", GetCudaVersion());
+  return GetDsoHandle("cusparse", GetCudaLibVersion());
 }
 
 port::StatusOr<void*> GetCurandDsoHandle() {
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
index 4c86822..d8d0af0 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -20,13 +20,13 @@
 #define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
 
 #include <vector>
-#include "tensorflow/stream_executor/platform/port.h"
 
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
 namespace internal {
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
deleted file mode 100644
index 2f8f063..0000000
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
-#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
-
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace stream_executor {
-
-#undef mutex_lock
-#undef tf_shared_lock
-
-using tensorflow::ConditionResult;
-using tensorflow::WaitForMilliseconds;
-using tensorflow::condition_variable;
-using tensorflow::mutex;
-using tensorflow::mutex_lock;
-using tensorflow::tf_shared_lock;
-
-#define mutex_lock(x) static_assert(0, "mutex_lock_decl_missing_var_name");
-#define tf_shared_lock(x) \
-  static_assert(0, "tf_shared_lock_decl_missing_var_name");
-
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
diff --git a/tensorflow/stream_executor/platform/mutex.h b/tensorflow/stream_executor/platform/mutex.h
deleted file mode 100644
index fa6c8c0..0000000
--- a/tensorflow/stream_executor/platform/mutex.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
-#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
-
-#include "tensorflow/stream_executor/platform/platform.h"
-
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/mutex.h"
-#else
-#include "tensorflow/stream_executor/platform/default/mutex.h"
-#endif
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 57ad965..26c14bd 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -19,7 +19,6 @@
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
 
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace stream_executor {
@@ -38,9 +37,6 @@
 using std::string;
 #endif
 
-using tensorflow::LinkerInitialized;
-using tensorflow::LINKER_INITIALIZED;
-
 #define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc
index c53685c..1e6a2d4 100644
--- a/tensorflow/stream_executor/plugin_registry.cc
+++ b/tensorflow/stream_executor/plugin_registry.cc
@@ -15,8 +15,11 @@
 
 #include "tensorflow/stream_executor/plugin_registry.h"
 
+#include "absl/base/const_init.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
 namespace stream_executor {
@@ -43,9 +46,9 @@
 PluginRegistry::DefaultFactories::DefaultFactories() :
     blas(kNullPlugin), dnn(kNullPlugin), fft(kNullPlugin), rng(kNullPlugin) { }
 
-static mutex& GetPluginRegistryMutex() {
-  static mutex* mu = new mutex;
-  return *mu;
+static absl::Mutex& GetPluginRegistryMutex() {
+  static absl::Mutex mu(absl::kConstInit);
+  return mu;
 }
 
 /* static */ PluginRegistry* PluginRegistry::instance_ = nullptr;
@@ -53,7 +56,7 @@
 PluginRegistry::PluginRegistry() {}
 
 /* static */ PluginRegistry* PluginRegistry::Instance() {
-  mutex_lock lock{GetPluginRegistryMutex()};
+  absl::MutexLock lock{&GetPluginRegistryMutex()};
   if (instance_ == nullptr) {
     instance_ = new PluginRegistry();
   }
@@ -69,14 +72,14 @@
 port::Status PluginRegistry::RegisterFactoryInternal(
     PluginId plugin_id, const string& plugin_name, FACTORY_TYPE factory,
     std::map<PluginId, FACTORY_TYPE>* factories) {
-  mutex_lock lock{GetPluginRegistryMutex()};
+  absl::MutexLock lock{&GetPluginRegistryMutex()};
 
   if (factories->find(plugin_id) != factories->end()) {
     return port::Status(
         port::error::ALREADY_EXISTS,
-        port::Printf("Attempting to register factory for plugin %s when "
-                     "one has already been registered",
-                     plugin_name.c_str()));
+        absl::StrFormat("Attempting to register factory for plugin %s when "
+                        "one has already been registered",
+                        plugin_name));
   }
 
   (*factories)[plugin_id] = factory;
@@ -94,7 +97,7 @@
     if (iter == generic_factories.end()) {
       return port::Status(
           port::error::NOT_FOUND,
-          port::Printf("Plugin ID %p not registered.", plugin_id));
+          absl::StrFormat("Plugin ID %p not registered.", plugin_id));
     }
   }
 
@@ -233,8 +236,8 @@
     auto iter = platform_id_by_kind_.find(platform_kind);                     \
     if (iter == platform_id_by_kind_.end()) {                                 \
       return port::Status(port::error::FAILED_PRECONDITION,                   \
-                          port::Printf("Platform kind %d not registered.",    \
-                                       static_cast<int>(platform_kind)));     \
+                          absl::StrFormat("Platform kind %d not registered.", \
+                                          static_cast<int>(platform_kind)));  \
     }                                                                         \
     return GetFactory<PluginRegistry::FACTORY_TYPE>(iter->second, plugin_id); \
   }
diff --git a/tensorflow/stream_executor/plugin_registry.h b/tensorflow/stream_executor/plugin_registry.h
index 3065b5c..e032116 100644
--- a/tensorflow/stream_executor/plugin_registry.h
+++ b/tensorflow/stream_executor/plugin_registry.h
@@ -25,7 +25,6 @@
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/rng.h"
 
diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
index 902d8f9..c64bf4d 100644
--- a/tensorflow/stream_executor/rocm/BUILD
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -138,6 +138,7 @@
         ":rocm_driver",
         ":rocm_gpu_executor",
         ":rocm_platform_id",
+        "@com_google_absl//absl/memory",
         "//tensorflow/stream_executor",  # buildcleaner: keep
         "//tensorflow/stream_executor:executor_cache",
         "//tensorflow/stream_executor:multi_platform_manager",
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.cc b/tensorflow/stream_executor/rocm/rocm_blas.cc
index 82b9666..f3003e0 100644
--- a/tensorflow/stream_executor/rocm/rocm_blas.cc
+++ b/tensorflow/stream_executor/rocm/rocm_blas.cc
@@ -18,12 +18,13 @@
 #include "tensorflow/stream_executor/rocm/rocm_blas.h"
 
 #define EIGEN_USE_GPU
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
 #include <assert.h>
+
 #include <complex>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/gpu/gpu_activation.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
@@ -34,7 +35,6 @@
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -382,7 +382,7 @@
 bool ROCMBlas::DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
                                   Args... args) {
-  mutex_lock lock{mu_};
+  absl::MutexLock lock{&mu_};
 
   CHECK(blas_ != nullptr);
   if (!SetStream(stream)) {
@@ -1462,8 +1462,8 @@
                           float alpha, const DeviceMemory<Eigen::half> &a,
                           int lda, const DeviceMemory<Eigen::half> &b, int ldb,
                           float beta, DeviceMemory<Eigen::half> *c, int ldc) {
-  VLOG(1) << port::Printf(
-      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+  VLOG(1) << absl::StreamFormat(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%u n=%u "
       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
       "c=%p ldc=%d",
       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
@@ -1507,8 +1507,8 @@
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &b, int ldb, float beta,
                           DeviceMemory<float> *c, int ldc) {
-  VLOG(1) << port::Printf(
-      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+  VLOG(1) << absl::StreamFormat(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%u n=%u "
       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
       "c=%p ldc=%d",
       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.h b/tensorflow/stream_executor/rocm/rocm_blas.h
index 75c6848..a974331 100644
--- a/tensorflow/stream_executor/rocm/rocm_blas.h
+++ b/tensorflow/stream_executor/rocm/rocm_blas.h
@@ -20,8 +20,8 @@
 #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/blas.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -141,7 +141,7 @@
                                    blas::ProfileResult *output_profile_result);
 
   // mutex that guards the rocBLAS handle for this device.
-  mutex mu_;
+  absl::Mutex mu_;
 
   // GpuExecutor which instantiated this ROCMBlas.
   // Immutable post-initialization.
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
index 812974a..71bbdcc 100644
--- a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
@@ -13,8 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
-#include <dirent.h>
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
 
+#include <dirent.h>
 #include <limits.h>
 #include <link.h>
 #include <stddef.h>
@@ -23,6 +24,7 @@
 #include <string.h>
 #include <sys/sysmacros.h>
 #include <unistd.h>
+
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -30,14 +32,13 @@
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
 
 namespace stream_executor {
 namespace rocm {
@@ -56,7 +57,7 @@
 }
 
 port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
-  std::vector<string> pieces = port::Split(value, '.');
+  std::vector<string> pieces = absl::StrSplit(value, '.');
   if (pieces.size() != 2 && pieces.size() != 3) {
     return port::Status{port::error::INVALID_ARGUMENT,
                         absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
@@ -121,7 +122,7 @@
     string library_path = value == nullptr ? "" : value;
     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
 
-    std::vector<string> pieces = port::Split(library_path, ':');
+    std::vector<string> pieces = absl::StrSplit(library_path, ':');
     for (const auto& piece : pieces) {
       if (piece.empty()) {
         continue;
@@ -179,9 +180,9 @@
       }
       string dso_version = dot + strlen(so_suffix);
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
-      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
-      *result = rocm::StringToDriverVersion(stripped_dso_version);
+      *result = rocm::StringToDriverVersion(string(stripped_dso_version));
       return 1;
     }
     return 0;
@@ -209,9 +210,8 @@
   size_t space_index = version_and_rest.find(" ");
   auto kernel_version = version_and_rest.substr(0, space_index);
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
-  auto stripped_kernel_version =
-      port::StripSuffixString(kernel_version, ".ld64");
-  return rocm::StringToDriverVersion(stripped_kernel_version);
+  auto stripped_kernel_version = absl::StripSuffix(kernel_version, ".ld64");
+  return rocm::StringToDriverVersion(string(stripped_kernel_version));
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc
index e1a2e45..25a6881 100644
--- a/tensorflow/stream_executor/rocm/rocm_dnn.cc
+++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc
@@ -98,8 +98,8 @@
  public:
   // Takes ownership of the executor context and the lock to access MIOpen
   // using handle.
-  MIOpenHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
-               miopenHandle_t handle)
+  MIOpenHandle(gpu::ScopedActivateExecutorContext context,
+               std::unique_ptr<absl::MutexLock> lock, miopenHandle_t handle)
       : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
   // Returns MIOpen handle. To be passed directly to MIOpen APIs, don't keep
@@ -108,7 +108,7 @@
 
  private:
   gpu::ScopedActivateExecutorContext context_;
-  mutex_lock lock_;
+  std::unique_ptr<absl::MutexLock> lock_;
   miopenHandle_t handle_;  // Not owned.
 };
 
@@ -318,7 +318,7 @@
                            miopenFusionPlanDescriptor_t* fusion_plan,
                            miopenFusionDirection_t fusion_direction,
                            miopenTensorDescriptor_t input_descriptor) {
-    mutex_lock lock{cached_plans_mutex};
+    absl::MutexLock lock{&cached_plans_mutex};
 
     bool found_cached_plan = false;
 
@@ -342,7 +342,7 @@
 
   // Need to figure out the right place to call this routine.
   static void Clear() {
-    mutex_lock lock{cached_plans_mutex};
+    absl::MutexLock lock{&cached_plans_mutex};
 
     for (auto it : cached_plans) {
       auto status = wrap::miopenDestroyFusionPlan(it.second);
@@ -359,19 +359,19 @@
 
   // Is the Fusion plan corresponding to this hash unsupported.
   static bool IsUnsupportedFusionPlan(uint64 hash) {
-    mutex_lock lock{cached_plans_mutex};
+    absl::MutexLock lock{&cached_plans_mutex};
     return unsupported_plans.count(hash) > 0;
   }
 
   // Mark the given hash value as corresponding to an unsupported fusion plan.
   static void MarkFusionPlanUnsupported(uint64 hash) {
-    mutex_lock lock{cached_plans_mutex};
+    absl::MutexLock lock{&cached_plans_mutex};
     unsupported_plans.insert(hash);
   }
 
  private:
   // Mutex to guard access to all data within this class.
-  static mutex cached_plans_mutex;
+  static absl::Mutex cached_plans_mutex;
 
   // Map of hash-value to MIOpen Fusion plan descriptors.
   // Need to be able share this across more than one stream and hence static.
@@ -382,7 +382,7 @@
   static std::set<uint64> unsupported_plans;
 };
 
-mutex CachedFusionPlans::cached_plans_mutex;
+absl::Mutex CachedFusionPlans::cached_plans_mutex;
 std::map<uint64, miopenFusionPlanDescriptor_t> CachedFusionPlans::cached_plans;
 std::set<uint64> CachedFusionPlans::unsupported_plans;
 
@@ -449,7 +449,7 @@
   explicit MIOpenAccess(miopenHandle_t handle) : handle_(handle) {}
 
   ~MIOpenAccess() {
-    mutex_lock lock(mutex_);
+    absl::MutexLock lock(&mutex_);
     wrap::miopenDestroy(handle_);
   }
 
@@ -468,7 +468,8 @@
   // therefore a bad idea (performance wise) to call any MIOpen APIs that
   // enqueue work in the stream.
   MIOpenHandle GetHandle(GpuExecutor* executor, Stream* stream) {
-    mutex_lock lock(mutex_);
+    auto lock = absl::make_unique<absl::MutexLock>(&mutex_);
+    mutex_.AssertHeld();
     gpu::ScopedActivateExecutorContext context(executor);
     hipStream_t hip_stream = stream ? AsGpuStreamValue(stream) : nullptr;
     auto status = wrap::miopenSetStream(handle_, hip_stream);
@@ -478,7 +479,7 @@
 
  private:
   // Guards the enqueueing of MIOpen operations via the handle_ below.
-  mutex mutex_;
+  absl::Mutex mutex_;
 
   // MIOpen library handle.
   miopenHandle_t handle_ GUARDED_BY(mutex_);  // Owned.
diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.h b/tensorflow/stream_executor/rocm/rocm_dnn.h
index 643f9b3..125acf7 100644
--- a/tensorflow/stream_executor/rocm/rocm_dnn.h
+++ b/tensorflow/stream_executor/rocm/rocm_dnn.h
@@ -19,9 +19,9 @@
 #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
 #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
index 1c44e9f..7cd35ff 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver.cc
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -15,6 +15,7 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+
 #include <map>
 #include <set>
 #include <utility>
@@ -23,18 +24,17 @@
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
-#include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
 
@@ -115,15 +115,9 @@
 // stack-limited threads (such as those spawned by a default-argument
 // thread::ThreadPool on some platforms), we run certain routines in this pool
 // and wait for completion.
-static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
-static port::ThreadPool* InitializeDriverExecutor() {
-  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
-                              "rocm_driver", 1);
-}
-
 port::ThreadPool* GetDriverExecutor() {
-  mutex_lock lock(driver_executor_threadpool_mu);
-  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
+  static port::ThreadPool* thread_pool = new port::ThreadPool(
+      port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
   return thread_pool;
 }
 
@@ -311,17 +305,10 @@
 /* static */ port::Status GpuDriver::Init() {
   // Cached return value from calling InternalInit(), as hipInit need only be
   // called once, but GpuDriver::Init may be called many times.
-  static port::Status init_retval;
-  static bool set = false;
-  static mutex* init_mu = new mutex;
-
-  mutex_lock lock(*init_mu);
-  if (!set) {
-    init_retval = InternalInit();
-    set = true;
-  }
-
-  return init_retval;
+  static port::Status* init_retval = [] {
+    return new port::Status(InternalInit());
+  }();
+  return *init_retval;
 }
 
 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
@@ -472,7 +459,7 @@
 /* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
                                        const char* hsaco_contents,
                                        hipModule_t* module) {
-  port::Notification notification;
+  absl::Notification notification;
   bool ret = true;
   GetDriverExecutor()->Schedule(
       [context, hsaco_contents, module, &ret, &notification]() {
@@ -1008,9 +995,9 @@
   return true;
 }
 
-/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
-                                                 GpuEventHandle* event,
-                                                 EventFlags flags) {
+/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
+                                               GpuEventHandle* event,
+                                               EventFlags flags) {
   int hipflags;
   switch (flags) {
     case EventFlags::kDefault:
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
index 2fe3632..61eb190 100644
--- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -16,8 +16,10 @@
 #include <unistd.h>
 
 #include "absl/base/casts.h"
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/gpu/gpu_event.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
@@ -31,10 +33,7 @@
 #include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@@ -115,7 +114,7 @@
 }
 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
   const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
-  mutex_lock lock{in_memory_modules_mu_};
+  absl::MutexLock lock{&in_memory_modules_mu_};
   return UnloadGpuBinary(gpu_binary);
 }
 
@@ -211,9 +210,9 @@
   if (strip_exe) {
     // The exe is the last component of the path, so remove one component.
     string ret = exe_path;
-    std::vector<string> components = port::Split(exe_path, '/');
+    std::vector<string> components = absl::StrSplit(exe_path, '/');
     components.pop_back();
-    return port::Join(components, "/");
+    return absl::StrJoin(components, "/");
   }
   return exe_path;
 }
@@ -237,7 +236,7 @@
     kernelname = &spec.cuda_cubin_in_memory().kernelname();
 
     const char* hsaco = spec.cuda_cubin_in_memory().bytes();
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     module = in_memory_modules_[hsaco];
 
     if (module == nullptr) {
@@ -295,7 +294,7 @@
   // whether we've done an occupancy check on this kernel before isn't free
   // (because we have to synchronize), so we only do this at -v 2+.
   if (VLOG_IS_ON(2)) {
-    mutex_lock lock(launched_kernels_mu_);
+    absl::MutexLock lock(&launched_kernels_mu_);
     if (!launched_kernels_.count(hipfunc)) {
       VlogOccupancyInfo(kernel, thread_dims, block_dims);
       // TODO(rspringer): Remove elements from launched_kernels_...if we ever
@@ -367,7 +366,7 @@
   hipModule_t hip_module = nullptr;
   // TODO(ROCm): Need  generic term instead of cubin/cuda/ptx
   if (spec.has_cuda_cubin_in_memory()) {
-    mutex_lock lock{in_memory_modules_mu_};
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (!LoadModuleFromHsaco(
             reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
             &hip_module)) {
@@ -766,8 +765,8 @@
 bool GpuExecutor::GetSymbol(const string& symbol_name,
                             ModuleHandle module_handle, void** mem,
                             size_t* bytes) {
-  {  // give limited scope to mutex_lock
-    mutex_lock lock{disk_modules_mu_};
+  {  // give limited scope to lock
+    absl::MutexLock lock{&disk_modules_mu_};
     for (auto& it : disk_modules_) {
       if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
                                      reinterpret_cast<hipDeviceptr_t*>(mem),
@@ -777,8 +776,8 @@
     }
   }
 
-  {  // give limited scope to mutex_lock
-    mutex_lock lock{in_memory_modules_mu_};
+  {  // give limited scope to lock
+    absl::MutexLock lock{&in_memory_modules_mu_};
     for (auto& it : in_memory_modules_) {
       if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
                                      reinterpret_cast<hipDeviceptr_t*>(mem),
@@ -788,8 +787,8 @@
     }
   }
 
-  {  // give limited scope to mutex_lock
-    mutex_lock lock{in_memory_modules_mu_};
+  {  // give limited scope to lock
+    absl::MutexLock lock{&in_memory_modules_mu_};
     if (static_cast<bool>(module_handle)) {
       auto it = gpu_binary_to_module_.find(module_handle.id());
       CHECK(it != gpu_binary_to_module_.end());
@@ -813,13 +812,13 @@
   return false;
 }
 
-bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
+bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
   // we use BlockDims to express the dimensions of blocks within a grid
   // (as opposed to ThreadDim which expresses the dimensions of threads
   // within a block).
   int x, y, z;
-  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
     return false;
   }
 
@@ -869,7 +868,20 @@
   return 1;
 }
 
-DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+GpuExecutor::CreateDeviceDescription(int device_ordinal) {
+  GpuDeviceHandle device;
+  auto status = GpuDriver::GetDevice(device_ordinal, &device);
+  if (!status.ok()) {
+    return status;
+  }
+
+  int version;
+  status = GpuDriver::GetGpuISAVersion(&version, device);
+  if (!status.ok()) {
+    return status;
+  }
+
   internal::DeviceDescriptionBuilder builder;
 
   {
@@ -883,19 +895,19 @@
   }
 
   {
-    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
+    string pci_bus_id = GpuDriver::GetPCIBusID(device);
 
     // Lower the hex characters to match sysfs.
-    pci_bus_id = port::Lowercase(pci_bus_id);
+    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
     builder.set_pci_bus_id(pci_bus_id);
 
     // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
-    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
     builder.set_numa_node(numa_node);
   }
 
   hipDeviceProp_t prop;
-  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
+  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
     builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
 
     ThreadDim thread_dim_limit;
@@ -910,65 +922,56 @@
 
   {
     bool ecc_enabled = false;
-    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
+    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
     builder.set_ecc_enabled(ecc_enabled);
   }
 
   {
     uint64 device_memory_size = -1;
-    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
     builder.set_device_memory_size(device_memory_size);
   }
 
   {
     BlockDim block_dim_limit;
-    FillBlockDimLimit(&block_dim_limit);
+    FillBlockDimLimit(device, &block_dim_limit);
     builder.set_block_dim_limit(block_dim_limit);
   }
 
   {
     string device_name;
-    (void)GpuDriver::GetDeviceName(device_, &device_name);
+    (void)GpuDriver::GetDeviceName(device, &device_name);
     builder.set_name(device_name);
   }
 
   builder.set_platform_version(
-      absl::StrCat("AMDGPU ISA version: gfx", version_));
+      absl::StrCat("AMDGPU ISA version: gfx", version));
 
   // TODO(leary) should be a way to query this from the driver, but this is
   // unlikely to change for us any time soon.
   builder.set_device_address_bits(64);
 
   builder.set_device_vendor("Advanced Micro Devices, Inc");
-  builder.set_rocm_amdgpu_isa_version(version_);
+  builder.set_rocm_amdgpu_isa_version(version);
   builder.set_shared_memory_per_core(
-      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
   builder.set_shared_memory_per_block(
-      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
   builder.set_core_count(
-      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
+      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
   builder.set_threads_per_core_limit(
-      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
   builder.set_registers_per_block_limit(
-      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
   builder.set_threads_per_warp(
-      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
+      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
   builder.set_registers_per_core_limit(64 * 1024);
 
-  auto built = builder.Build();
-  return built.release();
+  return builder.Build();
 }
 
 }  // namespace gpu
 
-void initialize_rocm_gpu_executor() {
-  *internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
-    return new gpu::GpuExecutor{config};
-  };
-}
-
 }  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
-  stream_executor::initialize_rocm_gpu_executor();
-});
+REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.cc b/tensorflow/stream_executor/rocm/rocm_platform.cc
index ce09165..156ec8d 100644
--- a/tensorflow/stream_executor/rocm/rocm_platform.cc
+++ b/tensorflow/stream_executor/rocm/rocm_platform.cc
@@ -15,14 +15,13 @@
 
 #include "tensorflow/stream_executor/rocm/rocm_platform.h"
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 
 namespace stream_executor {
@@ -39,30 +38,25 @@
 void ROCmPlatform::InspectNumaNodes() {
   // To get NUMA node information, we need to create all executors, so we can
   // examine their device descriptions to see their bus assignments.
-  static bool initialized = false;
-  static mutex numa_mutex(LINKER_INITIALIZED);
-  mutex_lock lock(numa_mutex);
-  if (initialized) {
-    return;
-  }
-
-  StreamExecutorConfig config;
-  for (int i = 0; i < VisibleDeviceCount(); i++) {
-    config.ordinal = i;
-    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
-    if (i == 0) {
-      // NUMA nodes may not start at 0, so set the minimum node  based on the
-      // first executor we see.
-      min_numa_node_ = exec->GetDeviceDescription().numa_node();
-      limit_numa_node_ = min_numa_node_ + 1;
-    } else {
-      min_numa_node_ =
-          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
-      limit_numa_node_ = std::max(limit_numa_node_,
-                                  exec->GetDeviceDescription().numa_node() + 1);
+  static std::once_flag once;  // static: a local flag would rescan every call
+  std::call_once(once, [&] {
+    StreamExecutorConfig config;
+    for (int i = 0; i < VisibleDeviceCount(); i++) {
+      config.ordinal = i;
+      StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+      if (i == 0) {
+        // NUMA nodes may not start at 0, so set the minimum node based on the
+        // first executor we see.
+        min_numa_node_ = exec->GetDeviceDescription().numa_node();
+        limit_numa_node_ = min_numa_node_ + 1;
+      } else {
+        min_numa_node_ =
+            std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
+        limit_numa_node_ = std::max(
+            limit_numa_node_, exec->GetDeviceDescription().numa_node() + 1);
+      }
     }
-  }
-  initialized = true;
+  });
 }
 
 int ROCmPlatform::BusCount() {
@@ -109,6 +103,11 @@
 
 const string& ROCmPlatform::Name() const { return name_; }
 
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+ROCmPlatform::DescriptionForDevice(int ordinal) const {
+  return GpuExecutor::CreateDeviceDescription(ordinal);
+}
+
 port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -134,8 +133,8 @@
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<GpuExecutor>(config.plugin_config));
+  auto executor = absl::make_unique<StreamExecutor>(
+      this, absl::make_unique<GpuExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.h b/tensorflow/stream_executor/rocm/rocm_platform.h
index d498e5f..c4838ee 100644
--- a/tensorflow/stream_executor/rocm/rocm_platform.h
+++ b/tensorflow/stream_executor/rocm/rocm_platform.h
@@ -19,11 +19,11 @@
 #include <memory>
 #include <vector>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/executor_cache.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -64,6 +64,9 @@
 
   const string& Name() const override;
 
+  port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+
   port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
 
   port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
@@ -87,7 +90,7 @@
   string name_;
 
   // mutex that guards internal state.
-  mutable mutex mu_;
+  mutable absl::Mutex mu_;
 
   // Cache of created executors.
   ExecutorCache executor_cache_;
diff --git a/tensorflow/stream_executor/rocm/rocm_rng.cc b/tensorflow/stream_executor/rocm/rocm_rng.cc
index 99bfc49..38f4f8b 100644
--- a/tensorflow/stream_executor/rocm/rocm_rng.cc
+++ b/tensorflow/stream_executor/rocm/rocm_rng.cc
@@ -126,7 +126,7 @@
 }
 
 bool GpuRng::Init() {
-  mutex_lock lock{mu_};
+  absl::MutexLock lock{&mu_};
   CHECK(rng_ == nullptr);
 
   hiprandStatus_t ret =
@@ -161,7 +161,7 @@
 
 template <typename T>
 bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
-  mutex_lock lock{mu_};
+  absl::MutexLock lock{&mu_};
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
 
@@ -220,7 +220,7 @@
                                             ElemT stddev,
                                             DeviceMemory<ElemT>* v,
                                             FuncT func) {
-  mutex_lock lock{mu_};
+  absl::MutexLock lock{&mu_};
 
   if (!SetStream(stream)) {
     return false;
@@ -252,7 +252,7 @@
 }
 
 bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
-  mutex_lock lock{mu_};
+  absl::MutexLock lock{&mu_};
   CHECK(rng_ != nullptr);
 
   if (!CheckSeed(seed, seed_bytes)) {
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 2577d38..8f30219 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -290,7 +290,7 @@
 Stream &Stream::Init() {
   VLOG_CALL();
 
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   CHECK_EQ(false, allocated_)
       << "stream appears to already have been initialized";
   CHECK(!ok_) << "stream should be in !ok() state pre-initialization";
@@ -1736,7 +1736,7 @@
 }
 
 Stream *Stream::GetOrCreateSubStream() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   // Look for the first reusable sub_stream that is ok, dropping !ok sub_streams
   // we encounter along the way.
@@ -1783,7 +1783,7 @@
 }
 
 void Stream::ReturnSubStream(Stream *sub_stream) {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
 
   // Look for the sub-stream.
   for (int64 index = 0; index < sub_streams_.size(); ++index) {
@@ -5285,7 +5285,7 @@
     return;
   }
   LOG(ERROR) << status;
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   ok_ = false;
 }
 
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 3e67d55..cdd3464 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -25,6 +25,7 @@
 #include <functional>
 #include <memory>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/device_memory.h"
@@ -35,7 +36,6 @@
 #include "tensorflow/stream_executor/kernel.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
@@ -1964,7 +1964,7 @@
   friend class ocl::CLBlas;    // for parent_.
 
   bool InErrorState() const LOCKS_EXCLUDED(mu_) {
-    tf_shared_lock lock(mu_);
+    absl::ReaderMutexLock lock(&mu_);
     return !ok_;
   }
 
@@ -1974,7 +1974,7 @@
     if (operation_retcode) {
       return;
     }
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     ok_ = false;
   }
 
@@ -1998,7 +1998,7 @@
 
   // mutex that guards the allocation / error state flags.
   // Mutable so that it can be obtained via const reader lock.
-  mutable mutex mu_;
+  mutable absl::Mutex mu_;
 
   // Whether Init() was successfully called to allocate this stream on the
   // underlying platform. It simply flips from 0 to 1 with a sanity check.
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 46afede..affecaa 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -18,31 +18,6 @@
 namespace stream_executor {
 namespace internal {
 
-// -- CUDA
-
-StreamExecutorFactory* MakeCUDAExecutorImplementation() {
-  static StreamExecutorFactory instance;
-  return &instance;
-}
-
-// -- ROCm
-
-StreamExecutorFactory* MakeROCMExecutorImplementation() {
-  static StreamExecutorFactory instance;
-  return &instance;
-}
-
-// -- OpenCL
-
-StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
-  static StreamExecutorFactory instance;
-  return &instance;
-}
-
-// -- Host
-
-StreamExecutorFactory MakeHostExecutorImplementation;
-
 // The default implementation just calls the other HostCallback method.
 // It should make all existing code that uses a void() callback still work.
 bool StreamExecutorInterface::HostCallback(Stream* stream,
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 4f9b65e..4619fe1 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -288,7 +288,8 @@
 
   // Creates a new DeviceDescription object. Ownership is transferred to the
   // caller.
-  virtual DeviceDescription *PopulateDeviceDescription() const = 0;
+  virtual port::StatusOr<std::unique_ptr<DeviceDescription>>
+  CreateDeviceDescription() const = 0;
 
   // Attempts to register the provided TraceListener with the device-specific
   // Executor implementation. When this is called, the PIMPL interface has
@@ -383,21 +384,6 @@
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
 };
 
-using StreamExecutorFactory =
-    std::function<StreamExecutorInterface *(const PluginConfig &)>;
-using EventFactory = std::function<EventInterface *(StreamExecutor *)>;
-using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
-using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
-using KernelFactory = std::function<KernelInterface*()>;
-
-StreamExecutorFactory *MakeCUDAExecutorImplementation();
-
-StreamExecutorFactory *MakeROCMExecutorImplementation();
-
-StreamExecutorFactory *MakeOpenCLExecutorImplementation();
-
-extern StreamExecutorFactory MakeHostExecutorImplementation;
-
 
 }  // namespace internal
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 2870c38..97d77c0 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -20,18 +20,19 @@
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 #include <atomic>
+#include <memory>
 #include <utility>
 
+#include "absl/base/const_init.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/notification.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/rng.h"
@@ -55,42 +56,11 @@
 // Make sure the executor is done with its work; we know (because this isn't
 // publicly visible) that all enqueued work is quick.
 void BlockOnThreadExecutor(port::ThreadPool *executor) {
-  port::Notification n;
+  absl::Notification n;
   executor->Schedule([&n]() { n.Notify(); });
   n.WaitForNotification();
 }
 
-internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
-    PlatformKind platform_kind, const PluginConfig &plugin_config) {
-  // Note: we use this factory-assignment-in-switch pattern instead of just
-  // invoking the callable in case linkage is messed up -- instead of invoking a
-  // nullptr std::function (due to failed registration) we give a nice
-  // LOG(FATAL) message.
-  internal::StreamExecutorFactory factory;
-  switch (platform_kind) {
-    case PlatformKind::kCuda:
-      factory = *internal::MakeCUDAExecutorImplementation();
-      break;
-    case PlatformKind::kROCm:
-      factory = *internal::MakeROCMExecutorImplementation();
-      break;
-    case PlatformKind::kOpenCL:
-      factory = *internal::MakeOpenCLExecutorImplementation();
-      break;
-    case PlatformKind::kHost:
-      factory = internal::MakeHostExecutorImplementation;
-      break;
-    default:
-      factory = nullptr;
-  }
-  if (factory == nullptr) {
-    LOG(FATAL)
-        << "cannot create StreamExecutor implementation for platform kind: "
-        << PlatformKindString(platform_kind);
-  }
-  return factory(plugin_config);
-}
-
 std::atomic_int_fast64_t correlation_id_generator(0);
 
 }  // namespace
@@ -123,7 +93,7 @@
   void Trace(CallbackT callback, TraceArgsT... args) {
     {
       // Instance tracers held in a block to limit the lock lifetime.
-      tf_shared_lock lock{stream_exec_->mu_};
+      absl::ReaderMutexLock lock{&stream_exec_->mu_};
       for (TraceListener *listener : stream_exec_->listeners_) {
         (listener->*callback)(correlation_id_,
                               std::forward<TraceArgsT>(args)...);
@@ -152,21 +122,7 @@
   auto tracer = MakeScopedTracer(this, &LOC ## Begin,               \
                                  &LOC ## Complete, ## __VA_ARGS__);
 
-/* static */ mutex StreamExecutor::static_mu_{LINKER_INITIALIZED};
-
-StreamExecutor::StreamExecutor(PlatformKind platform_kind,
-                               const PluginConfig &plugin_config)
-    : platform_(nullptr),
-      implementation_(StreamExecutorImplementationFromPlatformKind(
-          platform_kind, plugin_config)),
-      platform_kind_(platform_kind),
-      device_ordinal_(-1),
-      background_threads_(new port::ThreadPool(
-          port::Env::Default(), "stream_executor", kNumBackgroundThreads)),
-      live_stream_count_(0),
-      tracing_enabled_(false) {
-  CheckPlatformKindIsValid(platform_kind);
-}
+/* static */ absl::Mutex StreamExecutor::static_mu_{absl::kConstInit};
 
 // Get per-device memory limit in bytes. Returns 0 if
 // TF_PER_DEVICE_MEMORY_LIMIT_MB environment variable is not set.
@@ -189,13 +145,14 @@
       tracing_enabled_(false),
       mem_alloc_bytes_(0),
       memory_limit_bytes_(GetMemoryLimitBytes()) {
-  if (port::Lowercase(platform_->Name()) == "cuda") {
+  string name = absl::AsciiStrToLower(platform_->Name());
+  if (name == "cuda") {
     platform_kind_ = PlatformKind::kCuda;
-  } else if (port::Lowercase(platform_->Name()) == "rocm") {
+  } else if (name == "rocm") {
     platform_kind_ = PlatformKind::kROCm;
-  } else if (port::Lowercase(platform_->Name()) == "opencl") {
+  } else if (name == "opencl") {
     platform_kind_ = PlatformKind::kOpenCL;
-  } else if (port::Lowercase(platform_->Name()) == "host") {
+  } else if (name == "host") {
     platform_kind_ = PlatformKind::kHost;
   } else {
     platform_kind_ = PlatformKind::kInvalid;
@@ -214,7 +171,7 @@
   if (FLAGS_check_device_leaks) {
     for (auto it : mem_allocs_) {
       LOG(INFO) << "Memory alloced at executor exit: addr: "
-                << port::Printf("%p", it.first)
+                << absl::StrFormat("%p", it.first)
                 << ", bytes: " << it.second.bytes << ", trace: \n"
                 << it.second.stack_trace;
     }
@@ -261,7 +218,7 @@
 }
 
 void StreamExecutor::GetMemAllocs(std::map<void *, AllocRecord> *records_out) {
-  tf_shared_lock lock(mu_);
+  absl::ReaderMutexLock lock(&mu_);
   *records_out = mem_allocs_;
 }
 
@@ -282,7 +239,7 @@
   if (config != SharedMemoryConfig::kDefault &&
       config != SharedMemoryConfig::kFourByte &&
       config != SharedMemoryConfig::kEightByte) {
-    string error_msg = port::Printf(
+    string error_msg = absl::StrFormat(
         "Invalid shared memory config specified: %d", static_cast<int>(config));
     LOG(ERROR) << error_msg;
     return port::Status(port::error::INVALID_ARGUMENT, error_msg);
@@ -291,12 +248,12 @@
 }
 
 const DeviceDescription &StreamExecutor::GetDeviceDescription() const {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   if (device_description_ != nullptr) {
     return *device_description_;
   }
 
-  device_description_.reset(PopulateDeviceDescription());
+  device_description_ = CreateDeviceDescription();
   return *device_description_;
 }
 
@@ -437,7 +394,7 @@
 }
 
 dnn::DnnSupport *StreamExecutor::AsDnn() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   if (dnn_ != nullptr) {
     return dnn_.get();
   }
@@ -447,7 +404,7 @@
 }
 
 blas::BlasSupport *StreamExecutor::AsBlas() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   if (blas_ != nullptr) {
     return blas_.get();
   }
@@ -457,7 +414,7 @@
 }
 
 fft::FftSupport *StreamExecutor::AsFft() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   if (fft_ != nullptr) {
     return fft_.get();
   }
@@ -467,7 +424,7 @@
 }
 
 rng::RngSupport *StreamExecutor::AsRng() {
-  mutex_lock lock(mu_);
+  absl::MutexLock lock(&mu_);
   if (rng_ != nullptr) {
     return rng_.get();
   }
@@ -677,12 +634,12 @@
 
   result = implementation_->SynchronousMemcpy(host_dst, device_src, size);
   if (!result.ok()) {
-    result = port::Status(port::error::INTERNAL,
-                          port::Printf("failed to synchronously memcpy "
-                                       "device-to-host: device %p to host %p "
-                                       "size %lld: %s",
-                                       device_src.opaque(), host_dst, size,
-                                       result.ToString().c_str()));
+    result = port::Status(
+        port::error::INTERNAL,
+        absl::StrFormat("failed to synchronously memcpy device-to-host: device "
+                        "%p to host %p size %d: %s",
+                        device_src.opaque(), host_dst, size,
+                        result.ToString()));
   }
 
   return result;
@@ -702,10 +659,10 @@
   if (!result.ok()) {
     result = port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to synchronously memcpy host-to-device: host "
-                     "%p to device %p size %lld: %s",
-                     host_src, device_dst->opaque(), size,
-                     result.ToString().c_str()));
+        absl::StrFormat("failed to synchronously memcpy host-to-device: host "
+                        "%p to device %p size %d: %s",
+                        host_src, device_dst->opaque(), size,
+                        result.ToString()));
   }
 
   return result;
@@ -809,8 +766,10 @@
   return implementation_->StopTimer(stream, timer);
 }
 
-DeviceDescription *StreamExecutor::PopulateDeviceDescription() const {
-  return implementation_->PopulateDeviceDescription();
+std::unique_ptr<DeviceDescription> StreamExecutor::CreateDeviceDescription()
+    const {
+  auto desc_status = implementation_->CreateDeviceDescription();
+  return desc_status.ConsumeValueOrDie();
 }
 
 bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
@@ -823,7 +782,7 @@
 
 void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
   if (FLAGS_check_device_leaks && opaque != nullptr && bytes != 0) {
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     mem_allocs_[opaque] = AllocRecord{
         bytes, ""};
     mem_alloc_bytes_ += bytes;
@@ -832,10 +791,9 @@
 
 void StreamExecutor::EraseAllocRecord(void *opaque) {
   if (FLAGS_check_device_leaks && opaque != nullptr) {
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     if (mem_allocs_.find(opaque) == mem_allocs_.end()) {
-      LOG(ERROR) << "Deallocating unknown pointer: "
-                 << port::Printf("0x%p", opaque);
+      LOG(ERROR) << "Deallocating unknown pointer: " << opaque;
     } else {
       mem_alloc_bytes_ -= mem_allocs_[opaque].bytes;
       mem_allocs_.erase(opaque);
@@ -847,7 +805,7 @@
 
 void StreamExecutor::RegisterTraceListener(TraceListener *listener) {
   {
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     if (listeners_.find(listener) != listeners_.end()) {
       LOG(INFO) << "Attempt to register already-registered listener, "
                 << listener;
@@ -861,7 +819,7 @@
 
 bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
   {
-    mutex_lock lock(mu_);
+    absl::MutexLock lock(&mu_);
     if (listeners_.find(listener) == listeners_.end()) {
       LOG(INFO) << "Attempt to unregister unknown listener, " << listener;
       return false;
@@ -882,7 +840,7 @@
   if (tracing_enabled_) {
     {
       // instance tracers held in a block to limit the lock lifetime.
-      tf_shared_lock lock(mu_);
+      absl::ReaderMutexLock lock(&mu_);
       for (TraceListener *listener : listeners_) {
         (listener->*trace_call)(std::forward<ArgsT>(args)...);
       }
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 6f88df8..4c3aeda 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -23,13 +23,13 @@
 #include <vector>
 
 #include "absl/base/macros.h"
+#include "absl/synchronization/mutex.h"
 #include "absl/types/optional.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/rng.h"
@@ -70,9 +70,6 @@
 // StreamExecutor interface should not be invoked from a signal handler.
 class StreamExecutor {
  public:
-  explicit StreamExecutor(PlatformKind kind,
-                          const PluginConfig &plugin_config = PluginConfig());
-
   StreamExecutor(
       const Platform *platform,
       std::unique_ptr<internal::StreamExecutorInterface> implementation);
@@ -213,7 +210,7 @@
   // Memory allocated in this manner (or allocated and registered with
   // HostMemoryRegister() is required for use in asynchronous memcpy operations,
   // such as Stream::ThenMemcpy.
-  void *HostMemoryAllocate(uint64 bytes);
+  void *HostMemoryAllocate(uint64 size);
 
   // Deallocates a region of host memory allocated by HostMemoryAllocate().
   void HostMemoryDeallocate(void *location);
@@ -572,23 +569,23 @@
   // Requests the current status of the event from the underlying platform.
   Event::Status PollForEventStatus(Event *event);
 
-  // Allocates stream resources on the underlying platform for subject and
-  // initializes its internals.
-  bool AllocateStream(Stream *subject);
+  // Allocates stream resources on the underlying platform and initializes its
+  // internals.
+  bool AllocateStream(Stream *stream);
 
   // Deallocates stream resources on the underlying platform.
-  void DeallocateStream(Stream *subject);
+  void DeallocateStream(Stream *stream);
 
   // Causes dependent to not begin execution until other has finished its
   // last-enqueued work.
   bool CreateStreamDependency(Stream *dependent, Stream *other);
 
-  // Allocates timer resources on the underlying platform for subject and
-  // initializes its internals.
-  bool AllocateTimer(Timer *subject);
+  // Allocates timer resources on the underlying platform and initializes its
+  // internals.
+  bool AllocateTimer(Timer *timer);
 
   // Deallocates timer resources on the underlying platform.
-  void DeallocateTimer(Timer *subject);
+  void DeallocateTimer(Timer *timer);
 
   // Records a start event for an interval timer.
   bool StartTimer(Stream *stream, Timer *timer);
@@ -598,7 +595,7 @@
 
   // Allocates a new metadata object, appropriately populated, on the heap, with
   // ownership transfer to caller.
-  DeviceDescription *PopulateDeviceDescription() const;
+  std::unique_ptr<DeviceDescription> CreateDeviceDescription() const;
 
   // Adds a task to the port::ThreadPool work queue. These tasks must be
   // fire-and-forget and have no external data or timing dependencies; their
@@ -610,7 +607,7 @@
   // Adds an AllocRecord for 'opaque' of size 'bytes' to the record map, for
   // leak checking. NULL buffer pointers and buffer sizes of 0 will not be
   // tracked.
-  void CreateAllocRecord(void *opaque, uint64 size);
+  void CreateAllocRecord(void *opaque, uint64 bytes);
 
   // Removes the AllocRecord keyed by 'opaque' from the record map. NULL
   // pointers will not be erased (as they're not tracked, per above).
@@ -622,13 +619,13 @@
   void SubmitTrace(TraceCallT trace_call, ArgsT&&... args);
 
   // Reader/writer lock for class-static StreamExecutor members.
-  static mutex static_mu_;
+  static absl::Mutex static_mu_;
 
   // Reader/writer lock for mutable data structures on this StreamExecutor.
   //
   // Mutable so that caching functions (like DeviceDescription, AsBlas, etc.)
   // can acquire the lock on their first (mutating) call as well.
-  mutable mutex mu_;
+  mutable absl::Mutex mu_;
 
   // Reference to the platform that created this executor.
   const Platform *platform_;
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
index cd6a3cd..080e020 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.cc
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -16,8 +16,7 @@
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
@@ -26,7 +25,7 @@
 namespace internal {
 
 void TemporaryMemoryManager::ForceDeallocateAll() {
-  mutex_lock lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   VLOG(1) << "force-deallocating " << records_.size() << " remaining records";
   for (auto it = records_.begin(); it != records_.end(); ++it) {
     DeviceMemoryBase device_memory = it->first;
@@ -36,7 +35,7 @@
 
 void TemporaryMemoryManager::MarkFinalized(
     const DeviceMemoryBase& device_memory, uint64 generation, bool must_exist) {
-  mutex_lock lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   auto it = records_.find(device_memory);
   if (it == records_.end()) {
     if (must_exist) {
@@ -49,7 +48,7 @@
 }
 
 void TemporaryMemoryManager::DeallocateFinalizedTemporaries() {
-  mutex_lock lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   int deallocated_count = 0;
   for (auto it = records_.begin(); it != records_.end();) {
     if (it->second.finalized) {
@@ -66,7 +65,7 @@
 
 bool TemporaryMemoryManager::IsFinalized(const DeviceMemoryBase& device_memory,
                                          uint64 allocation_generation) const {
-  mutex_lock lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   auto it = records_.find(device_memory);
   if (it == records_.end()) {
     return true;  // If there's no record present it's vacuously finalized.
@@ -82,7 +81,7 @@
 
 bool TemporaryMemoryManager::HasAllocated(const DeviceMemoryBase& device_memory,
                                           uint64 generation) const {
-  mutex_lock lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   auto it = records_.find(device_memory);
   if (it == records_.end()) {
     return false;
@@ -107,16 +106,16 @@
   // Add the record before instantiating the device memory instance so we can
   // check the allocation invariant at TemporaryDeviceMemory construction time.
   {
-    mutex_lock lock(mutex_);
+    absl::MutexLock lock(&mutex_);
     generation = ++generation_;
     DCHECK(records_.find(device_memory) == records_.end());
     records_[device_memory] = {generation,
                                /*finalized=*/false};
   }
 
-  VLOG(1) << port::Printf(
-      "stream %p allocated temporary device memory at %p (size %llu) in "
-      "generation %llu",
+  VLOG(1) << absl::StreamFormat(
+      "stream %p allocated temporary device memory at %p (size %u) in "
+      "generation %u",
       stream_, device_memory.opaque(), byte_size, generation);
   std::unique_ptr<TemporaryDeviceMemoryBase> result(
       new TemporaryDeviceMemoryBase(stream_, device_memory, generation));
diff --git a/tensorflow/stream_executor/temporary_memory_manager.h b/tensorflow/stream_executor/temporary_memory_manager.h
index faf1338..12d4d4b 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.h
+++ b/tensorflow/stream_executor/temporary_memory_manager.h
@@ -24,10 +24,10 @@
 #include <map>
 #include <memory>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
@@ -108,7 +108,7 @@
       uint64 element_count, uint64 element_size);
 
   // Mutex to guard temporary record state.
-  mutable mutex mutex_;
+  mutable absl::Mutex mutex_;
 
   // Mapping from device memory to the current (live) temporary memory record.
   //
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 50d69c3..3f4bcab 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -224,12 +224,6 @@
         "//conditions:default": otherwise,
     })
 
-def if_not_windows_cuda(a):
-    return select({
-        clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
-        "//conditions:default": a,
-    })
-
 def if_linux_x86_64(a):
     return select({
         clean_dep("//tensorflow:linux_x86_64"): a,
@@ -2033,7 +2027,8 @@
         flaky = 0,
         xla_enable_strict_auto_jit = False,
         xla_enabled = False,
-        grpc_enabled = False):
+        grpc_enabled = False,
+        **kwargs):
     """Create one or more python tests with extra tensorflow dependencies."""
     xla_test_true_list = []
 
@@ -2063,6 +2058,7 @@
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
         ] + additional_deps + xla_test_true_list,
+        **kwargs
     )
 
 register_extension_info(
@@ -2431,3 +2427,33 @@
         restricted_to = restricted_to,
         compatible_with = compatible_with,
     )
+
+def if_cuda_or_rocm(if_true, if_false = []):
+    """Shorthand for select()'ing whether to build for either CUDA or ROCm.
+
+    Returns a select statement which evaluates to
+       if_true if we're building with either CUDA or ROCm enabled.
+       if_false, otherwise.
+
+    Sometimes a target has additional CUDA or ROCm specific dependencies.
+    The `if_cuda` / `if_rocm` functions are used to specify these additional
+    dependencies (e.g. see the `//tensorflow/core/kernels:bias_op` target).
+
+    If the same additional dependency is needed for both CUDA and ROCm
+    (e.g. the `reduction_ops` dependency for the `bias_op` target above),
+    then specifying that dependency in both `if_cuda` and `if_rocm` will
+    result in both those functions returning a select statement, which contains
+    the same dependency, which then leads to a duplicate dependency bazel error.
+
+    In order to work around this error, any additional dependency that is common
+    to both the CUDA and ROCm platforms should be specified using this function.
+    Doing so will eliminate the cause of the bazel error (i.e. the same
+    dependency showing up in two different select statements).
+
+    """
+    return select({
+        "@local_config_cuda//cuda:using_nvcc": if_true,
+        "@local_config_cuda//cuda:using_clang": if_true,
+        "@local_config_rocm//rocm:using_hipcc": if_true,
+        "//conditions:default": if_false,
+    })
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index 8a9b8bb..da5b5e0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -50,6 +50,12 @@
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
+    field {
+      name: "share_cluster_devices_in_session"
+      number: 10
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
     reserved_range {
       start: 2
       end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index 4e3960c..a961fca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -173,6 +173,12 @@
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "share_cluster_devices_in_session"
+        number: 10
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index 6c528dd..79c33f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -91,8 +91,20 @@
         type: TYPE_BOOL
       }
       field {
-        name: "pending_cap"
-        number: 6
+        name: "kernel_tracker_max_interval"
+        number: 7
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
+      field {
+        name: "kernel_tracker_max_bytes"
+        number: 8
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
+      field {
+        name: "kernel_tracker_max_pending"
+        number: 9
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
index feb831f..972e7d1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
@@ -37,7 +37,7 @@
   }
   member_method {
     name: "bounding_shape"
-    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "consumers"
@@ -45,43 +45,43 @@
   }
   member_method {
     name: "from_nested_row_lengths"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_nested_row_splits"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_nested_value_rowids"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "from_row_lengths"
-    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_limits"
-    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_splits"
-    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_starts"
-    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_sparse"
-    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'st_input\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "from_tensor"
-    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "from_value_rowids"
-    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "nested_row_lengths"
@@ -89,7 +89,7 @@
   }
   member_method {
     name: "nrows"
-    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "row_lengths"
@@ -124,6 +124,10 @@
     argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "with_row_splits_dtype"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "with_values"
     argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
index 1f04d02..d283fb8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
@@ -18,10 +18,6 @@
     mtype: "<enum \'Feature\'>"
   }
   member {
-    name: "ERROR_REWRITING"
-    mtype: "<enum \'Feature\'>"
-  }
-  member {
     name: "LISTS"
     mtype: "<enum \'Feature\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index 7c8298a..94ffbca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -3,6 +3,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -37,6 +38,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index b3d775f..0ed2d44 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
index 70e3b67..a40c032 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -8,7 +8,7 @@
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_numa_aware"
+    name: "experimental_distribute"
     mtype: "<type \'property\'>"
   }
   member {
@@ -16,6 +16,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "experimental_slack"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 9ba5e9f..60f7e1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 6e364b9..d335061 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 8010c8b..3943195 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt
new file mode 100644
index 0000000..5909fc7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DistributeOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.distribute_options.DistributeOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "auto_shard"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
index 7149dbb..005e7a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -52,6 +52,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "parallel_batch"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "shuffle_and_repeat_fusion"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt
new file mode 100644
index 0000000..97303fb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.RaggedTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.RaggedTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'ragged_rank\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 5005de6..6221aaa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 933f9e8..d190330 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -39,6 +40,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 1442189..1b1e67d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -17,6 +17,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "DistributeOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "INFINITE_CARDINALITY"
     mtype: "<type \'int\'>"
   }
@@ -41,6 +45,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "RaggedTensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.tpu.-t-p-u-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.tpu.-t-p-u-estimator.pbtxt
index 883f1f2..1a00f88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.tpu.-t-p-u-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.tpu.-t-p-u-estimator.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'use_tpu\', \'train_batch_size\', \'eval_batch_size\', \'predict_batch_size\', \'batch_axis\', \'eval_on_tpu\', \'export_to_tpu\', \'export_to_cpu\', \'warm_start_from\', \'experimental_export_device_assignment\', \'experimental_embedding_config_spec\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\', \'True\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'use_tpu\', \'train_batch_size\', \'eval_batch_size\', \'predict_batch_size\', \'batch_axis\', \'eval_on_tpu\', \'export_to_tpu\', \'export_to_cpu\', \'warm_start_from\', \'experimental_export_device_assignment\', \'embedding_config_spec\', \'export_saved_model_api_version\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\', \'True\', \'None\', \'False\', \'None\', \'ExportSavedModelApiVersion.V1\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt
index a69679b..cf9be7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt
@@ -11,10 +11,6 @@
     name: "name"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "seekable"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
@@ -48,6 +44,10 @@
     argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
+    name: "seekable"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "size"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt
index 503da52..53753fb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt
@@ -11,10 +11,6 @@
     name: "name"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "seekable"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
@@ -48,6 +44,10 @@
     argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
+    name: "seekable"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "size"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt
index 43e2925..e58c4bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt
@@ -11,10 +11,6 @@
     name: "name"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "seekable"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
@@ -48,6 +44,10 @@
     argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
+    name: "seekable"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "size"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 7b8e00e..ca30692 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -62,7 +62,7 @@
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], "
   }
   member_method {
     name: "decode_jpeg"
@@ -246,11 +246,11 @@
   }
   member_method {
     name: "ssim"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'filter_size\', \'filter_sigma\', \'k1\', \'k2\'], varargs=None, keywords=None, defaults=[\'11\', \'1.5\', \'0.01\', \'0.03\'], "
   }
   member_method {
     name: "ssim_multiscale"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\', \'filter_size\', \'filter_sigma\', \'k1\', \'k2\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\', \'11\', \'1.5\', \'0.01\', \'0.03\'], "
   }
   member_method {
     name: "total_variation"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
index cf6f5e0..c7b1782 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
@@ -11,10 +11,6 @@
     name: "name"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "seekable"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
@@ -48,6 +44,10 @@
     argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
+    name: "seekable"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "size"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
index cd64c3e..73f765f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
@@ -74,7 +74,7 @@
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], "
   }
   member_method {
     name: "decode_jpeg"
@@ -94,7 +94,7 @@
   }
   member_method {
     name: "decode_raw"
-    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+    argspec: "args=[\'input_bytes\', \'out_type\', \'little_endian\', \'name\', \'bytes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "deserialize_many_sparse"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 7de6822..ab09474 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -104,6 +104,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 8560973..9d9afe3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -105,6 +105,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 73b6eeb..af0da4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
index 11081ed..78d5475 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
index 2512c30..416a309 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
@@ -87,6 +87,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 26187f3..f277bfb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index ed43cf3..e880978 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 8b09832..407eb2f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
new file mode 100644
index 0000000..d429857
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -0,0 +1,206 @@
+path: "tensorflow.keras.layers.AdditiveAttention"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.dense_attention.AdditiveAttention\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.dense_attention.BaseDenseAttention\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'use_scale\'], varargs=None, keywords=kwargs, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index b9abcb4..3e1801a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
index d72f31f..52d4488 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 3f4d9a6..3730402 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index a1666f9..e47e21e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 7e71d7b..2ffc509 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index 2b54b0a..e993e45 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 2392a96..4c27bb2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8747160..b1148dc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 27f8b33..55ab4e5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index 36e7ed9..89b5f4f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index bdae01b..d29e7cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -84,6 +84,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
@@ -105,7 +109,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'layer\', \'merge_mode\', \'weights\'], varargs=None, keywords=kwargs, defaults=[\'concat\', \'None\'], "
+    argspec: "args=[\'self\', \'layer\', \'merge_mode\', \'weights\', \'backward_layer\'], varargs=None, keywords=kwargs, defaults=[\'concat\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index f061166..aa1a763 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 792573a..2f618c2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -161,6 +161,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 0caff0d..c3d6060 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 0cf6376..134140d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 137afe4..d4fca49 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b973c19..195e105 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index f3c86ec..037105c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index d19703c..53fa432 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4c6a4fa..73b0c70 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index 6247a83..064ddf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 284b385..80eb98b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 8135ced..6f4126f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 01037ab..703838b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index bbf91ca..3239169 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index ef48b2b..6545f77 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 4822a69..969fd8c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -89,6 +89,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index ba9c428..1aa8523 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -89,6 +89,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
index de0b35f..55e8780 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index c9dfb59..205652a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 32cba18..9435078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index afaf648..e6f0947 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index 59b0404..bd71d50 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index f284946..1ace5ec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index 3190565..1407e65 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index 299169f..72a89d1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 532e286..67defe6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index c276949..8e288cb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -144,6 +144,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index fb7431f..7b99615 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 9bad46e..26a5914 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 1e9030d..773d98f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index d7148b0..318d3be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index a8350af..0899553 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index a067299..c3c0c03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 0d0ad66..55c5870 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 45438bd..30fcd8e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 0bf8efa..6f3dc3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index b4446b1..1819793 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index ccd2ee4..dd9cb0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index f4aad9e..380c6a4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 6a58f7b..fca2eb2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 1a76d09..59b2d6d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index b64ca63..6f2277f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 544c68d..58228d3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 446c51c..e8c3cf3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -140,6 +140,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 2dc143c..9c388af 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt
index e4d70a6..8e149c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 23ce714..097a4c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -78,6 +78,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 8bca987..4d09984 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fcf0b9c..292e821 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 6491126..465cc1b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 4535ddc..9eba022 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index b1ca635..08636cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index d47303a..377c7ce 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index ba14ef3..43c3b4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index d30dc3b..54debff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index d330797..6733e4a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 9db6af6..c7ed48d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index cec1596..8a24322 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index f89e27f..2a9c04b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index e1c450b..8605b7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 4583764..31668c5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 0d9553f..244f156 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index c5790b3..e258727 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index e4aae27..7d659b5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index bd6548f..02ae681 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index 731341b..cf08b25 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index db493c5..84bd0df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index 26b96ab..b87fb3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index ade9dbf..2b8aba8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 3b0cd02..6162b9f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index f795267..e6ce21c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3be8360..f6ccb28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -132,6 +132,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 21169cf..9a2c62a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 790682e..23f29f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 400159f..6d6bb87 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 41c02d5..3d2ca03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 1407d95..6127668 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -87,6 +87,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 5679b63..5ea0e25 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index ea7cb15..a0457e0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index d0bf532..8504231 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index b5e994e..5edc9f5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index a30d438..2282e9a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 50152d7..425e736 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index d97e427..acfe1d4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index a7e49b8..e8854a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index cbf69a5..e76f6d6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 0bcba0f..0a6cb86 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
index cac14a3..cabe216 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
@@ -17,6 +17,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "AdditiveAttention"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "AlphaDropout"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 1242eec..9ccd6bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index cf3c2de..919bedc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
index fa374af..d298a55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'categorical_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
index aa14c44..661375b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'auto\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
index a4c25ee..0c82d90 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
index 1fa8ffa..678c57e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'auto\', \'huber_loss\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
index d950c78..30d68f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'kullback_leibler_divergence\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
index fbbd531..9310f07 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'logcosh\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
index 04a9cc9..54316ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
@@ -4,7 +4,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 9da6b59..711ca17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7c3ae9b..cb4e2ae 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
index 2126ac6..e9a3a4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 6ef9610..572a106 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
index 61c90c3..86cf812 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'poisson\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
index c13f9f9..5cd2722 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
index fabe4c7..453e321 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'squared_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
index 840f019..91dcbd8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index 9270466..24854ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 1cbaa41..1cb9a7f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
index 84f6159..ae0cc85 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 819d56c..20567ba 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
index ddcaaab..4512fc8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 61e5f31..8246b68 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
index 9bb4fac..0afbb70 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index 6d109a4..29690d0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index 28e6856..a1fbded 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
index 3e7651b..6a5fe85 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
index a683124..965098d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
index d399050..49a620d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index 10f3aaa..8ac0943 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index 2cc27e7..3dd2cfc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
index 545fb62..350bf48 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
index 0f2c4ac..d96d503 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index 05af94c..41b2610 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index bffb2ab..b72ed24 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
index 8419527..3a82535 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -88,6 +88,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index 021df9e..22a91dc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
index 2486468..ffe189f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
index 1347230..2041c56 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index 307ce9c..637f129 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index 1614323..5008b2e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
index 4fd4e52..5f47075 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 0b2be40..c03826e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index ddcbd80..c3fe4ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
index df3d6ef..dac7863 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
index 23431e4..345cae9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 388267b..757db17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 7c23fd5..76a5473 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
index d815fe5..704ab64 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
index 4a7edf5..4faa79d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index c8bb427..17249aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index e73842c..8570935 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index 56f85ae..62949ea 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -104,6 +104,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 21018c6..ac70d55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -105,6 +105,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
index 4163555..c85e88a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -14,7 +14,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'SGD\'], "
   }
   member_method {
     name: "add_slot"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index fe469b4..fe33366 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 16b87f9..dccf58e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index aed4197..9c155b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index dcca23e..5c8b409 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index 6ee991e..c30de1ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 3e9f60e..7661674 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -91,6 +91,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index a09c8d8..5e6b54c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index e86bf9f..3bba8ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -91,6 +91,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 12cc0f7..75a44aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 85944f6..e852844 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -89,6 +89,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 74a5c1c..bddf795 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -89,6 +89,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 92e196f..f48d3ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -89,6 +89,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 905b5c1..e712e36 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -87,6 +87,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index e551ec7..9c9328b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index 8ff72d5..7b3ebab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 64cad47..77bc2d1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index a653b84..ee90918 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -91,6 +91,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index 1f2dbf2..a684efd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -91,6 +91,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-householder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-householder.pbtxt
new file mode 100644
index 0000000..46aed53
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-householder.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorHouseholder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_householder.LinearOperatorHouseholder\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reflection_axis"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reflection_axis\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'LinearOperatorHouseholder\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-toeplitz.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-toeplitz.pbtxt
new file mode 100644
index 0000000..db125db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-toeplitz.pbtxt
@@ -0,0 +1,154 @@
+path: "tensorflow.linalg.LinearOperatorToeplitz"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_toeplitz.LinearOperatorToeplitz\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "col"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'col\', \'row\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'LinearOperatorToeplitz\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index da44ebb..a5b3123 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -37,6 +37,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "LinearOperatorHouseholder"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
@@ -61,6 +65,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "LinearOperatorToeplitz"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "LinearOperatorZeros"
     mtype: "<type \'type\'>"
   }
@@ -209,7 +217,11 @@
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
   member_method {
+    name: "tridiagonal_matmul"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'None\'], "
+  }
+  member_method {
     name: "tridiagonal_solve"
-    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\', \'partial_pivoting\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
index 68c651a..c3199b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
@@ -9,4 +9,8 @@
     name: "TFLITE_BUILTINS"
     mtype: "<enum \'OpsSet\'>"
   }
+  member {
+    name: "TFLITE_BUILTINS_INT8"
+    mtype: "<enum \'OpsSet\'>"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
index fedb5ee..0c9a9e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
@@ -2,6 +2,10 @@
 tf_class {
   is_instance: "<enum \'Optimize\'>"
   member {
+    name: "DEFAULT"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
     name: "OPTIMIZE_FOR_LATENCY"
     mtype: "<enum \'Optimize\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt
index 791031c..0aea893 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.lite.TFLiteConverter"
 tf_class {
   is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverter\'>"
+  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverterBase\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
index f6a5006..8e1d054 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
index 5d437bb..612228a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
index 354a708..e4250ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
@@ -8,4 +8,8 @@
     name: "convert_op_hints_to_stubs"
     argspec: "args=[\'session\', \'graph_def\', \'write_callback\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'<function <lambda> instance>\'], "
   }
+  member_method {
+    name: "get_potentially_supported_ops"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt
index e89846b..5a23453 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt
@@ -5,6 +5,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.InitializableLookupTableBase\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt
index d839fa1..25ad8cd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticVocabularyTable\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
index 2308185..9367498 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
@@ -3,6 +3,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.DenseHashTable\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index 6fea38d..1fd765a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -153,6 +153,10 @@
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "floormod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "greater"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -261,6 +265,10 @@
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "mod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index cfc6400..815c075 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 3ac302e..326c0a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 9daccc5..282c089 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -99,6 +99,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index bd36c2f..ddc5943 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -99,6 +99,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index f863d61..4bb92bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index ac9e102..8ad90e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -98,6 +98,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index ca9e2f7..c4a1b59 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -97,6 +97,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 75c82ca..3a5513a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -96,6 +96,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index c118f5d..32a3129 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -99,6 +99,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 256ed1e..091cc04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -1062,7 +1062,7 @@
   }
   member_method {
     name: "decode_raw"
-    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+    argspec: "args=[\'input_bytes\', \'out_type\', \'little_endian\', \'name\', \'bytes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "delete_session_tensor"
@@ -1253,6 +1253,10 @@
     argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "fingerprint"
+    argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'farmhash64\', \'None\'], "
+  }
+  member_method {
     name: "fixed_size_partitioner"
     argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
@@ -2417,6 +2421,10 @@
     argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
   }
   member_method {
+    name: "vectorized_map"
+    argspec: "args=[\'fn\', \'elems\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "verify_tensor_all_finite"
     argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
index 06449de..5d74f3b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
@@ -6,11 +6,11 @@
   }
   member_method {
     name: "constant"
-    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "constant_value"
-    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'int64\'], "
   }
   member_method {
     name: "map_flat_values"
@@ -22,14 +22,14 @@
   }
   member_method {
     name: "range"
-    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "row_splits_to_segment_ids"
-    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'splits\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_ids_to_row_splits"
-    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt
index 98b3e82..5e3f772 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt
@@ -9,6 +9,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "key"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "state"
     mtype: "<type \'property\'>"
   }
@@ -33,6 +37,10 @@
     argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "split"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
index 5baa82e..12e6689 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
@@ -294,7 +294,7 @@
   }
   member_method {
     name: "BatchDatasetV2"
-    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'parallel_copy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "BatchFFT"
@@ -853,6 +853,10 @@
     argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
   }
   member_method {
+    name: "DecodePaddedRaw"
+    argspec: "args=[\'input_bytes\', \'fixed_length\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
     name: "DecodePng"
     argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
   }
@@ -1329,6 +1333,10 @@
     argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "Fingerprint"
+    argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "FixedLengthRecordDataset"
     argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -2306,7 +2314,7 @@
   }
   member_method {
     name: "PaddedBatchDatasetV2"
-    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'parallel_copy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "PaddingFIFOQueue"
@@ -2386,7 +2394,7 @@
   }
   member_method {
     name: "PrefetchDataset"
-    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'slack_period\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
   }
   member_method {
     name: "Prelinearize"
@@ -2406,7 +2414,7 @@
   }
   member_method {
     name: "PrintV2"
-    argspec: "args=[\'input\', \'output_stream\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'None\'], "
+    argspec: "args=[\'input\', \'output_stream\', \'end\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'\\n\', \'None\'], "
   }
   member_method {
     name: "PriorityQueue"
@@ -2658,13 +2666,21 @@
   }
   member_method {
     name: "RaggedRange"
-    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RaggedTensorFromVariant"
+    argspec: "args=[\'encoded_ragged\', \'input_ragged_rank\', \'output_ragged_rank\', \'Tvalues\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "RaggedTensorToSparse"
     argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "RaggedTensorToVariant"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'batched_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "RandomCrop"
     argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
   }
@@ -3193,6 +3209,10 @@
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "RngSkip"
+    argspec: "args=[\'resource\', \'algorithm\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "Roll"
     argspec: "args=[\'input\', \'shift\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -3382,7 +3402,7 @@
   }
   member_method {
     name: "ShardDataset"
-    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'require_non_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "ShardedFilename"
@@ -3441,6 +3461,10 @@
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "SnapshotDataset"
+    argspec: "args=[\'input_dataset\', \'path\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "Softmax"
     argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -3853,6 +3877,10 @@
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
+    name: "StringLower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "StringSplit"
     argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
@@ -3881,6 +3909,10 @@
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
+    name: "StringUpper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "Sub"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -4257,8 +4289,12 @@
     argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "TridiagonalMatMul"
+    argspec: "args=[\'superdiag\', \'maindiag\', \'subdiag\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "TridiagonalSolve"
-    argspec: "args=[\'diagonals\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'diagonals\', \'rhs\', \'partial_pivoting\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "TruncateDiv"
@@ -4282,11 +4318,11 @@
   }
   member_method {
     name: "UnicodeDecode"
-    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "UnicodeDecodeWithOffsets"
-    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "UnicodeEncode"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 8a326ed..0fa45e4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -21,6 +21,10 @@
     argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
   member_method {
+    name: "lower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "reduce_join"
     argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
   }
@@ -88,4 +92,8 @@
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "upper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
index 3879645..ebd1700 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
@@ -29,6 +29,10 @@
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
   member_method {
+    name: "all_v2_summary_ops"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "audio"
     argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
index feb831f..972e7d1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
@@ -37,7 +37,7 @@
   }
   member_method {
     name: "bounding_shape"
-    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "consumers"
@@ -45,43 +45,43 @@
   }
   member_method {
     name: "from_nested_row_lengths"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_nested_row_splits"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_nested_value_rowids"
-    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "from_row_lengths"
-    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_limits"
-    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_splits"
-    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_row_starts"
-    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
   member_method {
     name: "from_sparse"
-    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cls\', \'st_input\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "from_tensor"
-    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "from_value_rowids"
-    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\', \'validate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "nested_row_lengths"
@@ -89,7 +89,7 @@
   }
   member_method {
     name: "nrows"
-    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "row_lengths"
@@ -124,6 +124,10 @@
     argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "with_row_splits_dtype"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "with_values"
     argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
index 1f04d02..d283fb8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
@@ -18,10 +18,6 @@
     mtype: "<enum \'Feature\'>"
   }
   member {
-    name: "ERROR_REWRITING"
-    mtype: "<enum \'Feature\'>"
-  }
-  member {
     name: "LISTS"
     mtype: "<enum \'Feature\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index f888034..bb56967 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -2,6 +2,7 @@
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -24,6 +25,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index a256d6f..597c5bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -26,6 +27,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
index 70e3b67..a40c032 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -8,7 +8,7 @@
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_numa_aware"
+    name: "experimental_distribute"
     mtype: "<type \'property\'>"
   }
   member {
@@ -16,6 +16,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "experimental_slack"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 4c34c31..c24bac5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -3,6 +3,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -25,6 +26,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index ed5546d..8946cec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -26,6 +27,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 09d5bf0..2365c62 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -26,6 +27,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt
new file mode 100644
index 0000000..5909fc7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DistributeOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.distribute_options.DistributeOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "auto_shard"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
index 7149dbb..005e7a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -52,6 +52,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "parallel_batch"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "shuffle_and_repeat_fusion"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt
new file mode 100644
index 0000000..97303fb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-ragged-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.RaggedTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.RaggedTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'ragged_rank\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index b76676c..af008c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -26,6 +27,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 0e7b719..34370ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -26,6 +27,10 @@
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "enumerate"
+    argspec: "args=[\'self\', \'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 90ff169..b393680 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -17,6 +17,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "DistributeOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "INFINITE_CARDINALITY"
     mtype: "<type \'int\'>"
   }
@@ -41,6 +45,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "RaggedTensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index d13d8ae..f51c46b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -62,7 +62,7 @@
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], "
   }
   member_method {
     name: "decode_jpeg"
@@ -218,11 +218,11 @@
   }
   member_method {
     name: "ssim"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'filter_size\', \'filter_sigma\', \'k1\', \'k2\'], varargs=None, keywords=None, defaults=[\'11\', \'1.5\', \'0.01\', \'0.03\'], "
   }
   member_method {
     name: "ssim_multiscale"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\', \'filter_size\', \'filter_sigma\', \'k1\', \'k2\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\', \'11\', \'1.5\', \'0.01\', \'0.03\'], "
   }
   member_method {
     name: "total_variation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
index cf6f5e0..c7b1782 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
@@ -11,10 +11,6 @@
     name: "name"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "seekable"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
@@ -48,6 +44,10 @@
     argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
+    name: "seekable"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "size"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index fc4c341..865f0f8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -54,7 +54,7 @@
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], "
   }
   member_method {
     name: "decode_jpeg"
@@ -74,7 +74,7 @@
   }
   member_method {
     name: "decode_raw"
-    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+    argspec: "args=[\'input_bytes\', \'out_type\', \'little_endian\', \'fixed_length\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "deserialize_many_sparse"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 7de6822..ab09474 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -104,6 +104,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 8560973..9d9afe3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -105,6 +105,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.name_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.name_scope.pbtxt
deleted file mode 100644
index a2b98b1..0000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.name_scope.pbtxt
+++ /dev/null
@@ -1,13 +0,0 @@
-path: "tensorflow.keras.backend.name_scope"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.ops.name_scope\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index 81844f6..b7cfb94 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.keras.backend"
 tf_module {
-  member {
-    name: "name_scope"
-    mtype: "<type \'type\'>"
-  }
   member_method {
     name: "abs"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
@@ -309,6 +305,10 @@
     argspec: "args=[\'x\', \'value\', \'momentum\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "name_scope"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "ndim"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 6a00e0a..24385e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\', \'profile_batch\'], varargs=None, keywords=kwargs, defaults=[\'logs\', \'0\', \'True\', \'False\', \'epoch\', \'2\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\', \'profile_batch\', \'embeddings_freq\', \'embeddings_metadata\'], varargs=None, keywords=kwargs, defaults=[\'logs\', \'0\', \'True\', \'False\', \'epoch\', \'2\', \'0\', \'None\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 73b6eeb..af0da4d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
index 11081ed..78d5475 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
index 2512c30..416a309 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
@@ -87,6 +87,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 26187f3..f277bfb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index ed43cf3..e880978 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 8b09832..407eb2f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
new file mode 100644
index 0000000..d429857
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -0,0 +1,206 @@
+path: "tensorflow.keras.layers.AdditiveAttention"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.dense_attention.AdditiveAttention\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.dense_attention.BaseDenseAttention\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'use_scale\'], varargs=None, keywords=kwargs, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index b9abcb4..3e1801a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
index d72f31f..52d4488 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 3f4d9a6..3730402 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index a1666f9..e47e21e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 7e71d7b..2ffc509 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index 2b54b0a..e993e45 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 2392a96..4c27bb2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8747160..b1148dc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 27f8b33..55ab4e5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index 3727d46..cbbfd50 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index bdae01b..d29e7cf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -84,6 +84,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
@@ -105,7 +109,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'layer\', \'merge_mode\', \'weights\'], varargs=None, keywords=kwargs, defaults=[\'concat\', \'None\'], "
+    argspec: "args=[\'self\', \'layer\', \'merge_mode\', \'weights\', \'backward_layer\'], varargs=None, keywords=kwargs, defaults=[\'concat\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index f061166..aa1a763 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 792573a..2f618c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -161,6 +161,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 0caff0d..c3d6060 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 0cf6376..134140d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 137afe4..d4fca49 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b973c19..195e105 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index f3c86ec..037105c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index d19703c..53fa432 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4c6a4fa..73b0c70 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index 6247a83..064ddf8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 284b385..80eb98b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 8135ced..6f4126f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 01037ab..703838b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index bbf91ca..3239169 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index ef48b2b..6545f77 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index de0b35f..55e8780 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index c9dfb59..205652a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 32cba18..9435078 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index afaf648..e6f0947 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index 59b0404..bd71d50 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index f284946..1ace5ec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index 3190565..1407e65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index 299169f..72a89d1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 092fb55..f7aa2fd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index 27f5ef1..019dd04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -146,6 +146,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index fb7431f..7b99615 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 9bad46e..26a5914 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 1e9030d..773d98f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index d7148b0..318d3be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index a8350af..0899553 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index a067299..c3c0c03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 0d0ad66..55c5870 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 45438bd..30fcd8e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 0bf8efa..6f3dc3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index b4446b1..1819793 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index ccd2ee4..dd9cb0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index f4aad9e..380c6a4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 6a58f7b..fca2eb2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 1a76d09..59b2d6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index b64ca63..6f2277f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 70e06a6..52b41d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index ec9e19b..f7851d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -142,6 +142,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 2dc143c..9c388af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt
index e4d70a6..8e149c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 23ce714..097a4c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -78,6 +78,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 8bca987..4d09984 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fcf0b9c..292e821 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 6491126..465cc1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 4535ddc..9eba022 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index b1ca635..08636cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index d47303a..377c7ce 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index ba14ef3..43c3b4d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index d30dc3b..54debff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index d330797..6733e4a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 9db6af6..c7ed48d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index cec1596..8a24322 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index f89e27f..2a9c04b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index e1c450b..8605b7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 4583764..31668c5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 0d9553f..244f156 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index c5790b3..e258727 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index e4aae27..7d659b5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index bd6548f..02ae681 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index 731341b..cf08b25 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index db493c5..84bd0df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index 26b96ab..b87fb3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index ade9dbf..2b8aba8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 3b0cd02..6162b9f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index f795267..e6ce21c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3be8360..f6ccb28 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -132,6 +132,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 21169cf..9a2c62a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 790682e..23f29f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 400159f..6d6bb87 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 41c02d5..3d2ca03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 1407d95..6127668 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -87,6 +87,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 5679b63..5ea0e25 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index ea7cb15..a0457e0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index d0bf532..8504231 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index b5e994e..5edc9f5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index a30d438..2282e9a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 50152d7..425e736 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index d97e427..acfe1d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index a7e49b8..e8854a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index cbf69a5..e76f6d6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 0bcba0f..0a6cb86 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index ff95366..b8bd6d1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -17,6 +17,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "AdditiveAttention"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "AlphaDropout"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 1242eec..9ccd6bb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index cf3c2de..919bedc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
index fa374af..d298a55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'categorical_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
index aa14c44..661375b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'auto\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
index a4c25ee..0c82d90 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
index 1fa8ffa..678c57e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'auto\', \'huber_loss\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
index d950c78..30d68f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'kullback_leibler_divergence\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
index fbbd531..9310f07 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'logcosh\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
index 04a9cc9..54316ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
@@ -4,7 +4,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 9da6b59..711ca17 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7c3ae9b..cb4e2ae 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
index 2126ac6..e9a3a4d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 6ef9610..572a106 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
index 61c90c3..86cf812 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'poisson\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
index 76f6c39..7981f94 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -3,6 +3,10 @@
   is_instance: "<class \'tensorflow.python.ops.losses.loss_reduction.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
+    name: "AUTO"
+    mtype: "<type \'str\'>"
+  }
+  member {
     name: "NONE"
     mtype: "<type \'str\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
index c13f9f9..5cd2722 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
index fabe4c7..453e321 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'squared_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
index 840f019..91dcbd8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index 9270466..24854ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 1cbaa41..1cb9a7f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
index 84f6159..ae0cc85 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 819d56c..20567ba 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
index ddcaaab..4512fc8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 61e5f31..8246b68 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
index 9bb4fac..0afbb70 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index 6d109a4..29690d0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index 28e6856..a1fbded 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
index 3e7651b..6a5fe85 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
index a683124..965098d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
index d399050..49a620d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index 10f3aaa..8ac0943 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index 2cc27e7..3dd2cfc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
index 545fb62..350bf48 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
index 0f2c4ac..d96d503 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index 05af94c..41b2610 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index bffb2ab..b72ed24 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
index 8419527..3a82535 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -88,6 +88,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index 021df9e..22a91dc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
index 2486468..ffe189f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
index 1347230..2041c56 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index 307ce9c..637f129 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index 1614323..5008b2e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
index 4fd4e52..5f47075 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 0b2be40..c03826e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index ddcbd80..c3fe4ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
index df3d6ef..dac7863 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
index 23431e4..345cae9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 388267b..757db17 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 7c23fd5..76a5473 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
index d815fe5..704ab64 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
index 4a7edf5..4faa79d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index c8bb427..17249aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index e73842c..8570935 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index 56f85ae..62949ea 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -104,6 +104,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 21018c6..ac70d55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -105,6 +105,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index 4163555..c85e88a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -14,7 +14,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'SGD\'], "
   }
   member_method {
     name: "add_slot"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-householder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-householder.pbtxt
new file mode 100644
index 0000000..46aed53
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-householder.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorHouseholder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_householder.LinearOperatorHouseholder\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reflection_axis"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reflection_axis\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'LinearOperatorHouseholder\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-toeplitz.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-toeplitz.pbtxt
new file mode 100644
index 0000000..db125db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-toeplitz.pbtxt
@@ -0,0 +1,154 @@
+path: "tensorflow.linalg.LinearOperatorToeplitz"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_toeplitz.LinearOperatorToeplitz\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "col"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'col\', \'row\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'LinearOperatorToeplitz\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 14bd28e..d5ab294 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -37,6 +37,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "LinearOperatorHouseholder"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
@@ -61,6 +65,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "LinearOperatorToeplitz"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "LinearOperatorZeros"
     mtype: "<type \'type\'>"
   }
@@ -205,7 +213,11 @@
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
   member_method {
+    name: "tridiagonal_matmul"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'None\'], "
+  }
+  member_method {
     name: "tridiagonal_solve"
-    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\', \'partial_pivoting\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
index 68c651a..c3199b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
@@ -9,4 +9,8 @@
     name: "TFLITE_BUILTINS"
     mtype: "<enum \'OpsSet\'>"
   }
+  member {
+    name: "TFLITE_BUILTINS_INT8"
+    mtype: "<enum \'OpsSet\'>"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
index fedb5ee..0c9a9e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
@@ -2,6 +2,10 @@
 tf_class {
   is_instance: "<enum \'Optimize\'>"
   member {
+    name: "DEFAULT"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
     name: "OPTIMIZE_FOR_LATENCY"
     mtype: "<enum \'Optimize\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
index d856442..63a6667 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.lite.TFLiteConverter"
 tf_class {
   is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverterV2\'>"
+  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverterBase\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt
index fbc59f4..0139ed6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt
@@ -4,6 +4,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.InitializableLookupTableBase\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt
index 0656983..fa12eea 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt
@@ -3,6 +3,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticVocabularyTable\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
index 2308185..9367498 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
@@ -3,6 +3,7 @@
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.DenseHashTable\'>"
   is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.CapturableResource\'>"
   is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
index 1d180a9..5de8296 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
index 3937dfa..a715bec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'auto\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
index d2a064d..b9985c8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'categorical_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
index 7829f0f..2ccac4d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'auto\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
index 155154c..bbee2fe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
index 5052c19..c8d2d54 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'auto\', \'huber_loss\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
index b739c05..21930e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'kullback_leibler_divergence\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
index 557cc21..44d1f89 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'logcosh\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
index 2bcc6f8..0cabe5d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
@@ -4,7 +4,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
index a33db29..7fd0d0f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
index 4c79a5e..e9a692b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
index b99e194..3d3e40c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
index e018273..3626470 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
index b6603cb..6d531e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'poisson\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 8b14a80..5f901d0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -3,6 +3,10 @@
   is_instance: "<class \'tensorflow.python.ops.losses.loss_reduction.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
+    name: "AUTO"
+    mtype: "<type \'str\'>"
+  }
+  member {
     name: "NONE"
     mtype: "<type \'str\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
index 5e3ce6f..60709a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'auto\', \'None\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
index b5e3757..4be04f5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
@@ -6,7 +6,7 @@
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'squared_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index c2c5bb5..3ec5c65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -153,6 +153,10 @@
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "floormod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "greater"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -261,6 +265,10 @@
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "mod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
index 2242558..afa9598 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
index 1858961..44425c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
index eeffba3..711cc5e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
index 9cb7245..cf1053c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
index c6aa5a3..3beb5da 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
index c2ec5e0..06d86d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
index e727df0..d4ef1c9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
index 22e7969..735dfe8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
index 5f709fa..e58ad3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
index 3a458d8..30b9d0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
index e6b1502..765f9b3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
index db4a315..d12afc6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
index ae0a2ad..afd1d57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
index 8db5a38..f0d8fd8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
index 0581fb2..bf16a6c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
index 7c85919..004825b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
index d078882..8b656e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
index 26e2fc7..39b589e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
index 4f40183..f0503f6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
index e3b1527..5fc12fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
@@ -88,6 +88,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
index 7be9dc1..88901af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
index 5b13db0..6827926 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
@@ -79,6 +79,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
index 3d206f1..ccb453c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
index 4b2456a..04c3ded 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
index 0cd7170..b81e4c5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
@@ -80,6 +80,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
index 27c4143..99037be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
@@ -82,6 +82,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
index e1dca78..49c0951 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
index 10af944..d08920f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
index 2b102c3..ab13993 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
index 95ec2ee..af18823 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
index eb467f2..e32214c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
index b373aab..ecd99a4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
index bfc7e92..5ad2008 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
index 525f161..97c11cb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
@@ -83,6 +83,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
index 6cbb051..4499145 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
index 6425d8a..c38d8ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
@@ -81,6 +81,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
index cbc4b47..b8fd91bbf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt
index 9ca22c4..f7a8668 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt
index f159724..f8854a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt
@@ -90,6 +90,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
index 381f727..8a24dcf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
@@ -14,7 +14,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'SGD\'], "
   }
   member_method {
     name: "add_slot"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 15d7fdc..656d026 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -605,18 +605,14 @@
     argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "fingerprint"
+    argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'farmhash64\', \'None\'], "
+  }
+  member_method {
     name: "floor"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "floor_div"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "floormod"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
     name: "foldl"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
   }
@@ -757,10 +753,6 @@
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "mod"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -777,10 +769,6 @@
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "no_regularizer"
-    argspec: "args=[\'_\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "nondifferentiable_batch_function"
     argspec: "args=[\'num_batch_threads\', \'max_batch_size\', \'batch_timeout_micros\', \'allowed_batch_sizes\', \'max_enqueued_batches\', \'autograph\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\'], "
   }
@@ -1073,6 +1061,10 @@
     argspec: "args=[\'variable_creator\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "vectorized_map"
+    argspec: "args=[\'fn\', \'elems\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "where"
     argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
index 5fde488..e9398da 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
@@ -2,7 +2,7 @@
 tf_module {
   member_method {
     name: "constant"
-    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "map_flat_values"
@@ -10,14 +10,14 @@
   }
   member_method {
     name: "range"
-    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\', \'row_splits_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "row_splits_to_segment_ids"
-    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'splits\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_ids_to_row_splits"
-    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt
index 98b3e82..5e3f772 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt
@@ -9,6 +9,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "key"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "state"
     mtype: "<type \'property\'>"
   }
@@ -33,6 +37,10 @@
     argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "split"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
index 5baa82e..12e6689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
@@ -294,7 +294,7 @@
   }
   member_method {
     name: "BatchDatasetV2"
-    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'parallel_copy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "BatchFFT"
@@ -853,6 +853,10 @@
     argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
   }
   member_method {
+    name: "DecodePaddedRaw"
+    argspec: "args=[\'input_bytes\', \'fixed_length\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
     name: "DecodePng"
     argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
   }
@@ -1329,6 +1333,10 @@
     argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "Fingerprint"
+    argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "FixedLengthRecordDataset"
     argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -2306,7 +2314,7 @@
   }
   member_method {
     name: "PaddedBatchDatasetV2"
-    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'parallel_copy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "PaddingFIFOQueue"
@@ -2386,7 +2394,7 @@
   }
   member_method {
     name: "PrefetchDataset"
-    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'slack_period\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
   }
   member_method {
     name: "Prelinearize"
@@ -2406,7 +2414,7 @@
   }
   member_method {
     name: "PrintV2"
-    argspec: "args=[\'input\', \'output_stream\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'None\'], "
+    argspec: "args=[\'input\', \'output_stream\', \'end\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'\\n\', \'None\'], "
   }
   member_method {
     name: "PriorityQueue"
@@ -2658,13 +2666,21 @@
   }
   member_method {
     name: "RaggedRange"
-    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RaggedTensorFromVariant"
+    argspec: "args=[\'encoded_ragged\', \'input_ragged_rank\', \'output_ragged_rank\', \'Tvalues\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "RaggedTensorToSparse"
     argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "RaggedTensorToVariant"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'batched_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "RandomCrop"
     argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
   }
@@ -3193,6 +3209,10 @@
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "RngSkip"
+    argspec: "args=[\'resource\', \'algorithm\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "Roll"
     argspec: "args=[\'input\', \'shift\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -3382,7 +3402,7 @@
   }
   member_method {
     name: "ShardDataset"
-    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'require_non_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "ShardedFilename"
@@ -3441,6 +3461,10 @@
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "SnapshotDataset"
+    argspec: "args=[\'input_dataset\', \'path\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "Softmax"
     argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -3853,6 +3877,10 @@
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
+    name: "StringLower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "StringSplit"
     argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
@@ -3881,6 +3909,10 @@
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
+    name: "StringUpper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "Sub"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -4257,8 +4289,12 @@
     argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "TridiagonalMatMul"
+    argspec: "args=[\'superdiag\', \'maindiag\', \'subdiag\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "TridiagonalSolve"
-    argspec: "args=[\'diagonals\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'diagonals\', \'rhs\', \'partial_pivoting\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "TruncateDiv"
@@ -4282,11 +4318,11 @@
   }
   member_method {
     name: "UnicodeDecode"
-    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "UnicodeDecodeWithOffsets"
-    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "UnicodeEncode"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
deleted file mode 100644
index cd97716..0000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.sparse.SparseConditionalAccumulator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.SparseConditionalAccumulator\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\', \'MEAN\'], "
-  }
-  member_method {
-    name: "apply_grad"
-    argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "apply_indexed_slices_grad"
-    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_indexed_slices_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index b8bd2c0..cbb8439 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -1,10 +1,6 @@
 path: "tensorflow.sparse"
 tf_module {
   member {
-    name: "SparseConditionalAccumulator"
-    mtype: "<type \'type\'>"
-  }
-  member {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index b959c7c..592da35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -21,6 +21,10 @@
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
+    name: "lower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
     name: "reduce_join"
     argspec: "args=[\'inputs\', \'axis\', \'keepdims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\'], "
   }
@@ -88,4 +92,8 @@
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "upper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 7b275c5..88c73d6 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -185,8 +185,8 @@
     def _AddMember(member_name, member_obj, proto):
       """Add the child object to the object being constructed."""
       _, member_obj = tf_decorator.unwrap(member_obj)
-      if (_SkipMember(parent, member_name)
-          or member_obj == deprecation.HIDDEN_ATTRIBUTE):
+      if (_SkipMember(parent, member_name) or
+          isinstance(member_obj, deprecation.HiddenTfApiAttribute)):
         return
       if member_name == '__init__' or not member_name.startswith('_'):
         if tf_inspect.isroutine(member_obj):
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index a2440ea..7f7748c 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -1,5 +1,10 @@
 # TensorFlow API backwards compatibility tests.
 
+load(
+    "//tensorflow:tensorflow.bzl",
+    "py_test",
+)
+
 package(
     default_visibility = ["//tensorflow/tools/api:__subpackages__"],
 )
@@ -22,8 +27,12 @@
         "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt",
         "//tensorflow/tools/api/tests:README.txt",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
-    tags = ["no_rocm"],
+    tags = [
+        "no_pip",
+        "no_rocm",
+    ],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -39,6 +48,7 @@
 py_test(
     name = "deprecation_test",
     srcs = ["deprecation_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 7ed92dc..b152937 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -357,17 +357,24 @@
 
   @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
-    api_version = 2 if '_api.v2' in tf.__name__ else 1
+    api_version = 2 if '_api.v2' in tf.bitwise.__name__ else 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
+    omit_golden_symbols_map = {}
+    if api_version == 2 and FLAGS.only_test_core_api:
+      # In TF 2.0 these summary symbols are imported from TensorBoard.
+      omit_golden_symbols_map['tensorflow.summary'] = [
+          'audio', 'histogram', 'image', 'scalar', 'text']
+
     self._checkBackwardsCompatibility(
         tf,
         golden_file_pattern,
         api_version,
         # Skip compat.v1 and compat.v2 since they are validated
         # in separate tests.
-        additional_private_map={'tf.compat': ['v1', 'v2']})
+        additional_private_map={'tf.compat': ['v1', 'v2']},
+        omit_golden_symbols_map=omit_golden_symbols_map)
 
     # Also check that V1 API has contrib
     self.assertTrue(
diff --git a/tensorflow/tools/ci_build/Dockerfile.custom_op_centos b/tensorflow/tools/ci_build/Dockerfile.custom_op_centos
new file mode 100644
index 0000000..b978d4b
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.custom_op_centos
@@ -0,0 +1,24 @@
+FROM quay.io/aicoe/manylinux2010_x86_64:latest
+
+LABEL maintainer="Amit Patankar <amitpatankar@google.com>"
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_yum_packages.sh
+
+# Enable devtoolset-7, python27 and rh-python35 in the docker image.
+ENV PATH="/opt/rh/rh-python35/root/usr/bin:/opt/rh/python27/root/usr/bin:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
+    LD_LIBRARY_PATH="/opt/rh/rh-python35/root/usr/lib64:/opt/rh/python27/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/opt/rh/devtoolset-7/root/usr/lib64/dyninst:/opt/rh/devtoolset-7/root/usr/lib/dyninst:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib" \
+    PCP_DIR="/opt/rh/devtoolset-7/root" \
+    PERL5LIB="/opt/rh/devtoolset-7/root//usr/lib64/perl5/vendor_perl:/opt/rh/devtoolset-7/root/usr/lib/perl5:/opt/rh/devtoolset-7/root//usr/share/perl5/vendor_perl" \
+    PKG_CONFIG_PATH="/opt/rh/rh-python35/root/usr/lib64/pkgconfig/:/opt/rh/python27/root/usr/lib64/pkgconfig/"
+
+RUN /install/install_centos_pip_packages.sh
+RUN /install/install_bazel_from_source.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier.sh
+RUN /install/install_golang.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
+
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu-centos6 b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu-centos6
new file mode 100644
index 0000000..0a56400
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu-centos6
@@ -0,0 +1,42 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cpu-centos6 \
+#       --tag "gcr.io/tensorflow-testing/nosla-centos6" .
+# $ docker push gcr.io/tensorflow-testing/nosla-centos6
+
+FROM quay.io/aicoe/manylinux2010_x86_64:latest
+LABEL maintainer="Amit Patankar <amitpatankar@google.com>"
+
+# Install packages required to build tensorflow.
+RUN yum install -y centos-release-scl && \
+    yum install -y \
+      devtoolset-7 \
+      java-1.8.0-openjdk-devel \
+      patch \
+      python27 \
+      rh-python35 \
+      wget && \
+    yum clean all -y
+
+# Enable devtoolset-7, python27, and rh-python35 in the docker image.
+ENV PATH="/opt/rh/rh-python35/root/usr/bin:/opt/rh/python27/root/usr/bin:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
+    LD_LIBRARY_PATH="/opt/rh/rh-python35/root/usr/lib64:/opt/rh/python27/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/opt/rh/devtoolset-7/root/usr/lib64/dyninst:/opt/rh/devtoolset-7/root/usr/lib/dyninst:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib" \
+    PCP_DIR="/opt/rh/devtoolset-7/root" \
+    PERL5LIB="/opt/rh/devtoolset-7/root//usr/lib64/perl5/vendor_perl:/opt/rh/devtoolset-7/root/usr/lib/perl5:/opt/rh/devtoolset-7/root//usr/share/perl5/vendor_perl" \
+    PKG_CONFIG_PATH="/opt/rh/rh-python35/root/usr/lib64/pkgconfig/:/opt/rh/python27/root/usr/lib64/pkgconfig/"
+
+# Install pip packages needed to build tensorflow.
+COPY install/*.sh /install/
+RUN bash install/install_centos_pip_packages.sh
+
+# Install a /usr/bin/python3 link.
+# centos by default does not provide links, and instead relies on paths into
+# /opt/ to switch to alternative configurations. For bazel remote builds,
+# the python path between the local machine running bazel and the remote setup
+# must be the same.
+RUN update-alternatives --install /usr/bin/python3 python3 /opt/rh/rh-python35/root/usr/bin/python3 0
+
+# Install a ubuntu-compatible openjdk link so that ubuntu JAVA_HOME works
+# for this image.
+# TODO(klimek): Figure out a way to specify a different remote java path from
+# the local one.
+RUN ln -s /usr/lib/jvm/java /usr/lib/jvm/java-8-openjdk-amd64
diff --git a/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh
new file mode 100755
index 0000000..ab6c19b
--- /dev/null
+++ b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+set -x
+
+cd bazel_pip
+virtualenv --system-site-packages --python=python .env
+source .env/bin/activate
+pip --version
+pip install portpicker
+pip install *.whl
+
+# Use default configuration
+yes "" | python configure.py
+
+PIP_TEST_ROOT=pip_test_root
+mkdir -p ${PIP_TEST_ROOT}
+ln -s $(pwd)/tensorflow ${PIP_TEST_ROOT}/tensorflow
+bazel test --define=no_tensorflow_py_deps=true \
+      --test_lang_filters=py \
+      --build_tests_only \
+      -k \
+      --test_tag_filters=-no_oss,-oss_serial,-no_pip,-nopip \
+      --test_size_filters=small,medium \
+      --test_timeout 300,450,1200,3600 \
+      --test_output=errors \
+      -- //${PIP_TEST_ROOT}/tensorflow/python/... \
+      -//${PIP_TEST_ROOT}/tensorflow/python/keras:training_eager_test \
+      -//${PIP_TEST_ROOT}/tensorflow/python/keras:base_layer_test \
+      -//${PIP_TEST_ROOT}/tensorflow/python/distribute:distribute_lib_test \
+      -//${PIP_TEST_ROOT}/tensorflow/python:virtual_gpu_test \
+      -//${PIP_TEST_ROOT}/tensorflow/python:virtual_gpu_test_gpu
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 5fba43c..30d5efb 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -51,8 +51,8 @@
   rm -rf ${DIR}
 
   TARBALL_SUFFIX="${1}"
-  BAZEL_OPTS="-c opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
-  export CC_OPT_FLAGS='-mavx'
+  BAZEL_OPTS="--config=opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+  export CC_OPT_FLAGS="-mavx -msse4.2"
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
     export TF_NEED_ROCM=0
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index 1cc5aed..0c8c506 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -40,6 +40,8 @@
   ADDUSER_OPTS="--force-badname"
 fi
 
+apt-get install -y sudo  # -y: answer yes to prompts so the install never waits for input
+
 getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
 getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
     --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index f78bab3..c288ea4 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -440,7 +440,6 @@
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/delegates/gpu/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/examples/android/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/java/demo/app/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/schema/..."
   BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
@@ -541,12 +540,48 @@
   python file_name_test.py
 }
 
-do_libtensorflow_framework_not_depend_on_cuda_check() {
-  bazel build --action_env=TF_NEED_CUDA=1 --define framework_shared_object=true --config=cuda --nobuild_tests_only tensorflow/core/platform/default/build_config:libtensorflow_cuda_check_deps
+# Check that TARGET does not depend on DISALLOWED_DEP.
+_check_no_deps() {
+  TARGET="$1"
+  DISALLOWED_DEP="$2"
+
+  TMP_FILE="$(mktemp)_tmp.log"
+  echo "Checking ${TARGET} does not depend on ${DISALLOWED_DEP} ..."
+  bazel cquery "somepath(${TARGET}, ${DISALLOWED_DEP})" --keep_going> "${TMP_FILE}" 2>&1
+  if cat "${TMP_FILE}" | grep "Empty query results"; then
+      echo "Success."
+  else
+      cat "${TMP_FILE}"
+      echo
+      echo "ERROR: Found path from ${TARGET} to disallowed dependency ${DISALLOWED_DEP}."
+      echo "See above for path."
+      rm "${TMP_FILE}"
+      exit 1
+  fi
+  rm "${TMP_FILE}"
+}
+
+do_pip_no_cuda_deps_check() {
+  DISALLOWED_CUDA_DEPS=("@local_config_cuda//cuda:cudart"
+        "@local_config_cuda//cuda:cublas"
+        "@local_config_cuda//cuda:cuda_driver"
+        "@local_config_cuda//cuda:cudnn"
+        "@local_config_cuda//cuda:curand"
+        "@local_config_cuda//cuda:cusolver"
+        "@local_config_cuda//cuda:cusparse")
+  for cuda_dep in "${DISALLOWED_CUDA_DEPS[@]}"
+  do
+   _check_no_deps "//tensorflow/tools/pip_package:build_pip_package" "${cuda_dep}"
+   RESULT=$?
+
+   if [[ ${RESULT} != "0" ]]; then
+    exit 1
+   fi
+  done
 }
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test" "do_libtensorflow_framework_not_depend_on_cuda_check")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases" "Check gpu libtensorflow_framework.so does not depend on cuda shared libraries.")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test" "do_pip_no_cuda_deps_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases" "Check gpu pip package does not depend on cuda shared libraries.")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS=""
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 59f418f..75de245 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -35,6 +35,6 @@
 
 curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip
 unzip bazel-$BAZEL_VERSION-dist.zip
-bash ./compile.sh
+env EXTRA_BAZEL_ARGS="--host_javabase=@local_jdk//:jdk" bash ./compile.sh
 cp output/bazel /usr/local/bin/
 rm -rf /bazel
diff --git a/tensorflow/tools/ci_build/install/install_centos_pip_packages.sh b/tensorflow/tools/ci_build/install/install_centos_pip_packages.sh
new file mode 100755
index 0000000..f9004b5
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_centos_pip_packages.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+pip2 install -U pip==18.1
+pip3 install -U pip==18.1
+
+pip2 install wheel==0.31.1
+pip3 install wheel==0.31.1
+
+# Install last working version of setuptools. This must happen before we install
+# absl-py, which uses install_requires notation introduced in setuptools 20.5.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
+pip2 install virtualenv
+pip3 install virtualenv
+
+# Install six.
+pip2 install --upgrade six==1.12.0
+pip3 install --upgrade six==1.12.0
+
+# Install absl-py.
+pip2 install --upgrade absl-py
+pip3 install --upgrade absl-py
+
+# Install werkzeug.
+pip2 install --upgrade werkzeug==0.11.10
+pip3 install --upgrade werkzeug==0.11.10
+
+# Install bleach. html5lib will be picked up as a dependency.
+pip2 install --upgrade bleach==2.0.0
+pip3 install --upgrade bleach==2.0.0
+
+# Install markdown.
+pip2 install --upgrade markdown==2.6.8
+pip3 install --upgrade markdown==2.6.8
+
+# Install protobuf.
+pip2 install --upgrade protobuf==3.6.1
+pip3 install --upgrade protobuf==3.6.1
+
+pip2 install --upgrade numpy==1.14.5
+pip3 install --upgrade numpy==1.14.5
+
+pip2 install scipy==1.1.0
+pip3 install scipy==1.1.0
+
+pip2 install scikit-learn==0.18.1
+pip3 install scikit-learn==0.18.1
+
+# pandas required by `inflow`
+pip2 install pandas==0.19.2
+pip3 install pandas==0.19.2
+
+# Benchmark tests require the following:
+pip2 install psutil
+pip3 install psutil
+pip2 install py-cpuinfo
+pip3 install py-cpuinfo
+
+# pylint tests require the following:
+pip2 install pylint==1.6.4
+pip3 install pylint==1.6.4
+
+# pycodestyle tests require the following:
+pip2 install pycodestyle
+pip3 install pycodestyle
+
+# tf.mock require the following for python2:
+pip2 install mock
+
+pip2 install portpicker
+pip3 install portpicker
+
+# TensorFlow Serving integration tests require the following:
+pip2 install grpcio
+pip3 install grpcio
+
+# Eager-to-graph execution needs astor, gast and termcolor:
+pip2 install --upgrade astor
+pip3 install --upgrade astor
+pip2 install --upgrade gast
+pip3 install --upgrade gast
+pip2 install --upgrade termcolor
+pip3 install --upgrade termcolor
+
+# Keras
+pip2 install keras_applications==1.0.6 --no-deps
+pip3 install keras_applications==1.0.6 --no-deps
+pip2 install keras_preprocessing==1.0.5 --no-deps
+pip3 install keras_preprocessing==1.0.5 --no-deps
+pip2 install --upgrade h5py==2.8.0
+pip3 install --upgrade h5py==2.8.0
+
+# Estimator
+pip2 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 131950d..b5da21c 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -128,5 +128,9 @@
 pip3 install --upgrade h5py==2.8.0
 
 # Estimator
-pip2 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
-pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+pip2 install tf-estimator-nightly --no-deps
+pip3 install tf-estimator-nightly --no-deps
+
+# Argparse
+pip2 install --upgrade argparse
+pip3 install --upgrade argparse
diff --git a/tensorflow/tools/ci_build/install/install_yum_packages.sh b/tensorflow/tools/ci_build/install/install_yum_packages.sh
new file mode 100755
index 0000000..665409a
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_yum_packages.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+
+set -e
+
+yum install -y  epel-release \
+                centos-release-scl \
+                sudo
+
+yum install -y  atlas-devel \
+                bzip2-devel \
+                curl-devel \
+                devtoolset-7 \
+                expat-devel \
+                gdbm-devel \
+                gettext-devel \
+                java-1.8.0-openjdk \
+                java-1.8.0-openjdk-devel \
+                libffi-devel \
+                libtool \
+                libuuid-devel \
+                ncurses-devel \
+                openssl-devel \
+                patch \
+                patchelf \
+                perl-core \
+                python27 \
+                readline-devel \
+                rh-python35 \
+                rh-python36 \
+                sqlite-devel \
+                wget \
+                xz-devel \
+                zlib-devel
\ No newline at end of file
diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl
new file mode 100755
index 0000000..151cc5c
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl
@@ -0,0 +1,47 @@
+ARG ROOT_CONTAINER_TAG=devel
+ARG ROOT_CONTAINER=tensorflow/tensorflow
+
+FROM ${ROOT_CONTAINER}:${ROOT_CONTAINER_TAG}
+
+LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
+
+# These parameters can be overridden
+ARG PYTHON="python"
+ARG WHL_DIR="/tmp/pip"
+ARG PIP="pip"
+ARG TARGET_PLATFORM="haswell"
+
+# Download and build TensorFlow from the latest sources found in the root container
+# make sure that if they pass in a tag, that it is loaded or we'll get an error
+WORKDIR /
+
+COPY tensorflow/ /tensorflow/
+
+WORKDIR /tensorflow
+
+RUN yes "" | ${PYTHON} configure.py
+
+ENV CI_BUILD_PYTHON ${PYTHON}
+
+# This script detects the version of gcc in the container, sets the appropriate
+# compiler flags based on parameters
+ADD set-build-env.py .
+RUN ${PYTHON} set-build-env.py -p ${TARGET_PLATFORM} -f /root/.mkl.bazelrc --disable-v2
+
+# Pull the compiler flags we just wrote into root user's .bazelrc file
+RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc
+
+RUN bazel --bazelrc=/root/.bazelrc build -c opt \
+    tensorflow/tools/pip_package:build_pip_package && \
+    bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \
+    ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \
+    rm -rf /root/.cache
+    # Clean up Bazel cache when done.
+
+# TensorBoard
+EXPOSE 6006
+# IPython
+EXPOSE 8888
+
+WORKDIR /root
+
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
index b497326..5ef8221 100755
--- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
+++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
@@ -16,6 +16,10 @@
 # Build a whl and container with Intel(R) MKL support
 # Usage: build-dev-container.sh
 
+DEBUG=1
+DOCKER_BINARY="docker"
+TMP_DIR=$(pwd)
+
 # Helper function to traverse directories up until given file is found.
 function upsearch () {
   test / == "$PWD" && return || \
@@ -23,81 +27,248 @@
       cd .. && upsearch "$1"
 }
 
+# Print the message in $1 when DEBUG is 1; quoted so spacing and glob characters survive.
+function debug() {
+  if [[ ${DEBUG} == 1 ]] ; then
+    echo "$1"
+  fi
+}
+
+# Print all arguments as one message (callers may split it across several) and exit 1.
+function die() {
+  echo "$*"
+  exit 1
+}
+
 # Set up WORKSPACE.
 WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
 
-TF_DOCKER_BUILD_DEVEL_BRANCH=${TF_DOCKER_BUILD_DEVEL_BRANCH:-master}
-TF_DOCKER_BUILD_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME:-intel-mkl/tensorflow}
+ROOT_CONTAINER=${ROOT_CONTAINER:-tensorflow/tensorflow}
+TF_ROOT_CONTAINER_TAG=${ROOT_CONTAINER_TAG:-devel}
+TF_BUILD_VERSION=${TF_DOCKER_BUILD_DEVEL_BRANCH:-master}
+TF_REPO=${TF_REPO:-https://github.com/tensorflow/tensorflow}
+FINAL_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME:-intel-mkl/tensorflow}
 TF_DOCKER_BUILD_VERSION=${TF_DOCKER_BUILD_VERSION:-nightly}
+BUILD_AVX_CONTAINERS=${BUILD_AVX_CONTAINERS:-no}
+BUILD_AVX2_CONTAINERS=${BUILD_AVX2_CONTAINERS:-no}
+BUILD_SKX_CONTAINERS=${BUILD_SKX_CONTAINERS:-no}
+BUILD_CLX_CONTAINERS=${BUILD_CLX_CONTAINERS:-no}
+CONTAINER_PORT=${TF_DOCKER_BUILD_PORT:-8888}
+BUILD_TF_V2_CONTAINERS=${BUILD_TF_V2_CONTAINERS:-no}
+ENABLE_SECURE_BUILD=${ENABLE_SECURE_BUILD:-no}
 
-echo "TF_DOCKER_BUILD_DEVEL_BRANCH=${TF_DOCKER_BUILD_DEVEL_BRANCH}"
-echo "TF_DOCKER_BUILD_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME}"
-echo "TF_DOCKER_BUILD_VERSION=${TF_DOCKER_BUILD_VERSION}"
+debug "ROOT_CONTAINER=${ROOT_CONTAINER}"
+debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}"
+debug "TF_BUILD_VERSION=${TF_BUILD_VERSION}"
+debug "FINAL_IMAGE_NAME=${FINAL_IMAGE_NAME}"
+debug "TF_DOCKER_BUILD_VERSION=${TF_DOCKER_BUILD_VERSION}"
+debug "BUILD_AVX_CONTAINERS=${BUILD_AVX_CONTAINERS}"
+debug "BUILD_AVX2_CONTAINERS=${BUILD_AVX2_CONTAINERS}"
+debug "BUILD_SKX_CONTAINERS=${BUILD_SKX_CONTAINERS}"
+debug "BUILD_CLX_CONTAINERS=${BUILD_CLX_CONTAINERS}"
+debug "BUILD_TF_V2_CONTAINERS=${BUILD_TF_V2_CONTAINERS}"
+debug "ENABLE_SECURE_BUILD=${ENABLE_SECURE_BUILD}"
+debug "TMP_DIR=${TMP_DIR}"
 
-# Build containers for AVX
-# Include the instructions for sandybridge and later, but tune for ivybridge
-TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=sandybridge --copt=-mtune=ivybridge --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+function build_container()
+{
+  if [[ $# -lt 2 ]]; then
+    die "Usage: build_container <TEMP_IMAGE_NAME> <TF_DOCKER_BUILD_ARGS>."
+  fi
+  TEMP_IMAGE_NAME=${1}
+  debug "TEMP_IMAGE_NAME=${TEMP_IMAGE_NAME}"
+  shift
+  TF_DOCKER_BUILD_ARGS=("${@}")
 
-# build the python 2 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh 
+  # Add the proxy info build args
+  TF_DOCKER_BUILD_ARGS+=("--build-arg http_proxy=${http_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg https_proxy=${https_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg socks_proxy=${socks_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg no_proxy=${no_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg HTTP_PROXY=${http_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg SOCKS_PROXY=${socks_proxy}")
+  TF_DOCKER_BUILD_ARGS+=("--build-arg NO_PROXY=${no_proxy}")
 
-# build the python 3 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
-  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+  #Add --config=v2 build arg for TF v2
+  if [[ ${BUILD_TF_V2_CONTAINERS} == "no" ]]; then
+    TF_DOCKER_BUILD_ARGS+=("--build-arg CONFIG_V2_DISABLE=--disable-v2")
+  fi
 
-# build the python3.6 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
-  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+  #Add build arg for Secure Build
+  if [[ ${ENABLE_SECURE_BUILD} == "yes" ]]; then
+    TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_SECURE_BUILD=--secure-build")
+  fi
 
+  # Perform docker build
+  debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}"
+  CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ."
+  debug "CMD=${CMD}"
+  ${CMD}
 
-# Build containers for AVX2
-# Include the instructions for haswell and later, but tune for broadwell
-TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=haswell --copt=-mtune=broadwell --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+  if [[ $? == "0" ]]; then
+    debug "${DOCKER_BINARY} build of ${TEMP_IMAGE_NAME} succeeded"
+  else
+    die "FAIL: ${DOCKER_BINARY} build of ${TEMP_IMAGE_NAME} failed"
+  fi
+}
 
-# build the python 2 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh 
+# Runs a container from <TEMP_IMAGE_NAME>, checks that MKL is enabled, stops it.
+function test_container()
+{
+  if [[ "$#" != "1" ]]; then
+    die "Usage: ${FUNCNAME} <TEMP_IMAGE_NAME>"
+  fi
 
-# build the python 3 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
-  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+  TEMP_IMAGE_NAME=${1}
 
-# build the python3.6 container and whl
-TF_DOCKER_BUILD_TYPE="MKL" \
-  TF_DOCKER_BUILD_IS_DEVEL="YES" \
-  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
-  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
-  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
-  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
-  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
-  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+  # Make sure that there are no other containers of the same image running
+  if "${DOCKER_BINARY}" ps | grep -q "${TEMP_IMAGE_NAME}"; then
+    die "ERROR: It appears that there are docker containers of the image "\
+  "${TEMP_IMAGE_NAME} running. Please stop them before proceeding"
+  fi
+
+  # Start a docker container from the newly-built docker image
+  DOCKER_RUN_LOG="${TMP_DIR}/docker_run.log"
+  debug "  Log file is at: ${DOCKER_RUN_LOG}"
+  debug "Running docker container from image ${TEMP_IMAGE_NAME}..."
+  # Run directly: redirections stored inside a string are passed as arguments.
+  RUN_ARGS=(run --rm -d -p "${CONTAINER_PORT}:${CONTAINER_PORT}" "${TEMP_IMAGE_NAME}" tail -f /dev/null)
+  debug "RUN_CMD=${DOCKER_BINARY} ${RUN_ARGS[*]} > ${DOCKER_RUN_LOG} 2>&1"
+  "${DOCKER_BINARY}" "${RUN_ARGS[@]}" > "${DOCKER_RUN_LOG}" 2>&1
+
+  # Get the container ID
+  CONTAINER_ID=""
+  while [[ -z ${CONTAINER_ID} ]]; do
+    sleep 1
+    debug "Polling for container ID..."
+    CONTAINER_ID=$("${DOCKER_BINARY}" ps | grep "${TEMP_IMAGE_NAME}" | awk '{print $1}')
+  done
+
+  debug "ID of the running docker container: ${CONTAINER_ID}"
+  debug "Performing basic sanity checks on the running container..."
+  TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import pywrap_tensorflow; print(pywrap_tensorflow.IsMklEnabled())'")
+  debug "Running test command: ${TEST_CMD}"
+
+  # Stop the container before reporting: die (assumed to exit) would leak it
+  sleep 1
+  "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID}
+  if [ "${TEST_CMD}" = "True" ] ; then
+      echo "PASS: MKL enabled test in ${TEMP_IMAGE_NAME}"
+  else
+      die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}"
+  fi
+}
+
+# Clones <REPO_URL> into ./tensorflow and checks out <BRANCH/TAG/COMMIT-ID>.
+function checkout_tensorflow()
+{
+  if [[ "$#" != "2" ]]; then
+    die "Usage: ${FUNCNAME} <REPO_URL> <BRANCH/TAG/COMMIT-ID>"
+  fi
+
+  TF_REPO="${1}"
+  TF_BUILD_VERSION="${2}"
+  TENSORFLOW_DIR="tensorflow"
+
+  debug "Checking out ${TF_REPO}:${TF_BUILD_VERSION} into ${TENSORFLOW_DIR}"
+
+  # Clean any existing tensorflow sources
+  rm -rf "${TENSORFLOW_DIR}"
+
+  # Let's make this simple for now; we can be more fancy later
+  git clone "${TF_REPO}" "${TENSORFLOW_DIR}" || \
+    die "Unable to clone ${TF_REPO} into ${TENSORFLOW_DIR}"
+  # Check out in a subshell so the caller's working directory never changes
+  if ! (cd "${TENSORFLOW_DIR}" && git checkout "${TF_BUILD_VERSION}"); then
+    die "Unable to find ${TF_BUILD_VERSION} on ${TF_REPO}"
+  fi
+}
+
+# Tags the image <TEMP_IMAGE_NAME> (arg 1) with the final name <FINAL_IMG> (arg 2).
+function tag_container()
+{
+  TEMP_IMAGE_NAME="${1}"
+  FINAL_IMG="${2}"
+
+  # Last field of the first "Version" line printed by `<docker> version`
+  DOCKER_VER=$("${DOCKER_BINARY}" version | grep Version | head -1 | awk '{print $NF}')
+  [[ -n "${DOCKER_VER}" ]] || \
+    die "ERROR: Failed to determine ${DOCKER_BINARY} version"
+  DOCKER_MAJOR_VER=$(cut -d. -f 1 <<< "${DOCKER_VER}")
+  DOCKER_MINOR_VER=$(cut -d. -f 2 <<< "${DOCKER_VER}")
+
+  # --force is passed to `tag` only when major <= 1 and minor <= 9
+  FORCE_TAG=""
+  if [[ "${DOCKER_MAJOR_VER}" -le 1 && "${DOCKER_MINOR_VER}" -le 9 ]]; then
+    FORCE_TAG="--force"
+  fi
+
+  if ! "${DOCKER_BINARY}" tag ${FORCE_TAG} "${TEMP_IMAGE_NAME}" "${FINAL_IMG}"; then
+    die "Failed to tag intermediate docker image ${TEMP_IMAGE_NAME} as ${FINAL_IMG}"
+  fi
+  debug "Successfully tagged docker image: ${FINAL_IMG}"
+}
+
+PYTHON_VERSIONS=("python" "python3")
+PLATFORMS=()
+if [[ ${BUILD_AVX_CONTAINERS} == "yes" ]]; then
+  PLATFORMS+=("sandybridge")
+fi
+
+if [[ ${BUILD_AVX2_CONTAINERS} == "yes" ]]; then
+  PLATFORMS+=("haswell")
+fi
+
+if [[ ${BUILD_SKX_CONTAINERS} == "yes" ]]; then
+  PLATFORMS+=("skylake")
+fi
+
+if [[ ${BUILD_CLX_CONTAINERS} == "yes" ]]; then
+  PLATFORMS+=("icelake")
+fi
+
+# Checking out sources needs to be done only once
+checkout_tensorflow "${TF_REPO}" "${TF_BUILD_VERSION}"
+
+for PLATFORM in "${PLATFORMS[@]}"
+do
+  for PYTHON in "${PYTHON_VERSIONS[@]}"
+  do
+    # Clear the build args array
+    TF_DOCKER_BUILD_ARGS=("--build-arg TARGET_PLATFORM=${PLATFORM}")
+    TF_DOCKER_BUILD_ARGS+=("--build-arg ROOT_CONTAINER=${ROOT_CONTAINER}")
+    FINAL_TAG="${TF_DOCKER_BUILD_VERSION}"
+    ROOT_CONTAINER_TAG="${TF_ROOT_CONTAINER_TAG}"
+
+      if [[ ${PLATFORM} == "haswell" ]]; then
+        FINAL_TAG="${FINAL_TAG}-avx2"
+      fi
+
+      if [[ ${PLATFORM} == "skylake" ]]; then
+        FINAL_TAG="${FINAL_TAG}-avx512"
+      fi
+
+      if [[ ${PLATFORM} == "icelake" ]]; then
+        FINAL_TAG="${FINAL_TAG}-avx512-VNNI"
+      fi
+
+      # Add -devel-mkl to the image tag
+      FINAL_TAG="${FINAL_TAG}-devel-mkl"
+      if [[ "${PYTHON}" == "python3" ]]; then
+        TF_DOCKER_BUILD_ARGS+=("--build-arg WHL_DIR=/tmp/pip3")
+        TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
+        FINAL_TAG="${FINAL_TAG}-py3"
+        ROOT_CONTAINER_TAG="${ROOT_CONTAINER_TAG}-py3"
+      fi
+
+      TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${PYTHON}")
+      TF_DOCKER_BUILD_ARGS+=("--build-arg ROOT_CONTAINER_TAG=${ROOT_CONTAINER_TAG}")
+
+      # Intermediate image name with tag
+      TEMP_IMAGE_NAME="${USER}/tensorflow:${FINAL_TAG}"
+      build_container "${TEMP_IMAGE_NAME}" "${TF_DOCKER_BUILD_ARGS[@]}"
+      test_container "${TEMP_IMAGE_NAME}"
+      tag_container "${TEMP_IMAGE_NAME}" "${FINAL_IMAGE_NAME}:${FINAL_TAG}"
+  done
+done
 
diff --git a/tensorflow/tools/ci_build/linux/mkl/set-build-env.py b/tensorflow/tools/ci_build/linux/mkl/set-build-env.py
new file mode 100755
index 0000000..ba19c61
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/mkl/set-build-env.py
@@ -0,0 +1,225 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Configure build environment for certain Intel platforms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import subprocess
+
+NEHALEM_CPU_INSTRUCTIONS = [
+    "MMX", "SSE", "SSE2", "SSE3", "SSSE3", "SSE4.1", "SSE4.2", "POPCNT"
+]
+
+SANDYBRIDGE_CPU_INSTRUCTIONS = NEHALEM_CPU_INSTRUCTIONS[:]
+SANDYBRIDGE_CPU_INSTRUCTIONS.extend(["AVX", "AES", "PCLMUL"])
+
+HASWELL_CPU_INSTRUCTIONS = SANDYBRIDGE_CPU_INSTRUCTIONS[:]
+HASWELL_CPU_INSTRUCTIONS.extend(
+    ["FSGSBASE", "RDRND", "FMA", "BMI", "BMI2", "F16C", "MOVBE", "AVX2"])
+
+SKYLAKE_CPU_INSTRUCTIONS = HASWELL_CPU_INSTRUCTIONS[:]
+SKYLAKE_CPU_INSTRUCTIONS.extend([
+    "PKU", "RDSEED", "ADCX", "PREFETCHW", "CLFLUSHOPT", "XSAVEC", "XSAVES",
+    "AVX512F", "CLWB", "AVX512VL", "AVX512BW", "AVX512DQ", "AVX512CD"
+])
+
+ICELAKE_CPU_INSTRUCTIONS = SKYLAKE_CPU_INSTRUCTIONS[:]
+ICELAKE_CPU_INSTRUCTIONS.extend([
+    "AVX512VBMI", "AVX512IFMA", "SHA", "CLWB", "UMIP", "RDPID", "GFNI",
+    "AVX512VBMI2", "AVX512VPOPCNTDQ", "AVX512BITALG", "AVX512VNNI",
+    "VPCLMULQDQ", "VAES"
+])
+
+BASIC_BUILD_OPTS = ["--cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0", "--copt=-O3"]
+
+SECURE_BUILD_OPTS = [
+    "--copt=-Wformat", "--copt=-Wformat-security", "--copt=-fstack-protector",
+    "--copt=-fPIC", "--copt=-fpic", "--linkopt=-znoexecstack",
+    "--linkopt=-zrelro", "--linkopt=-znow", "--linkopt=-fstack-protector"
+]
+
+
+class BuildEnvSetter(object):
+  """Prepares the proper environment settings for various Intel platforms."""
+  default_platform_ = "haswell"
+  PLATFORMS = {
+      "nehalem": {
+          "min_gcc_major_version": "4",
+          "min_gcc_minor_version": "8",
+          "flags": NEHALEM_CPU_INSTRUCTIONS
+      },
+      "sandybridge": {
+          "min_gcc_major_version": "4",
+          "min_gcc_minor_version": "8",
+          "flags": SANDYBRIDGE_CPU_INSTRUCTIONS
+      },
+      "haswell": {
+          "min_gcc_major_version": "4",
+          "min_gcc_minor_version": "8",
+          "flags": HASWELL_CPU_INSTRUCTIONS
+      },
+      "skylake": {
+          "min_gcc_major_version": "6",
+          "min_gcc_minor_version": "0",
+          "flags": SKYLAKE_CPU_INSTRUCTIONS
+      },
+      "icelake": {
+          "min_gcc_major_version": "8",
+          "min_gcc_minor_version": "0",
+          "flags": ICELAKE_CPU_INSTRUCTIONS
+      }
+  }
+
+  def __init__(self):
+    self.args = None
+    self.bazel_flags_ = "build "
+    self.go()
+
+  def gcc_version_ok(self, min_gcc_major_version, min_gcc_minor_version):
+    """Make sure the GCC version installed on the machine is acceptable.
+
+    Args:
+      min_gcc_major_version: Minimum acceptable major version (str or int).
+      min_gcc_minor_version: Minimum acceptable minor version (str or int).
+
+    Returns:
+      True if gcc is new enough; False if too old or a subprocess call fails.
+
+    Raises:
+      ValueError: If the located gcc is missing or not executable.
+    """
+    gcc_path_cmd = "command -v gcc"
+    try:
+      print("gcc_path_cmd = {}".format(gcc_path_cmd))
+      gcc_path = subprocess.check_output(gcc_path_cmd, shell=True,
+                                         stderr=subprocess.STDOUT).strip()
+      # handle python2 vs 3 (bytes vs str type)
+      if isinstance(gcc_path, bytes):
+        gcc_path = gcc_path.decode("utf-8")
+      print("gcc located here: {}".format(gcc_path))
+      if not os.access(gcc_path, os.F_OK | os.X_OK):
+        raise ValueError(
+            "{} does not exist or is not executable.".format(gcc_path))
+      gcc_output = subprocess.check_output([gcc_path, "-dumpversion"],
+                                           stderr=subprocess.STDOUT)
+      if isinstance(gcc_output, bytes):
+        gcc_output = gcc_output.decode("utf-8")
+      print("gcc version: {}".format(gcc_output))
+      # Compare as ints ("10" < "4" as strings); a missing minor counts as 0.
+      gcc_info = gcc_output.strip().split(".") + ["0"]
+      found = (int(gcc_info[0]), int(gcc_info[1]))
+      required = (int(min_gcc_major_version), int(min_gcc_minor_version))
+      if found < required:
+        print("Your version of GCC is too old: {}.{}; it must be at least "
+              "{}.{}".format(found[0], found[1], required[0], required[1]))
+        return False
+      self._debug("gcc version OK: {}.{}".format(found[0], found[1]))
+      return True
+    except subprocess.CalledProcessError as e:
+      print("Problem getting gcc info: {}".format(e))
+      return False
+
+  def parse_args(self):
+    """Set up argument parser, and parse CLI args."""
+    arg_parser = argparse.ArgumentParser(
+        description="Parse the arguments for the "
+        "TensorFlow build environment "
+        " setter")
+    arg_parser.add_argument(
+        "--disable-mkl",
+        dest="disable_mkl",
+        help="Turn off MKL. By default the compiler flag "
+        "--config=mkl is enabled.",
+        action="store_true")
+    arg_parser.add_argument(
+        "--disable-v2",
+        dest="disable_v2",
+        help="Don't build TensorFlow v2. By default the "
+        " compiler flag --config=v2 is enabled.",
+        action="store_true")
+    arg_parser.add_argument(
+        "-s",
+        "--secure-build",
+        dest="secure_build",
+        help="Enable secure build flags.",
+        action="store_true")
+    arg_parser.add_argument(
+        "-p",
+        "--platform",
+        choices=self.PLATFORMS.keys(),
+        help="The target platform.",
+        dest="target_platform",
+        default=self.default_platform_)
+    arg_parser.add_argument(
+        "-f",
+        "--bazelrc-file",
+        dest="bazelrc_file",
+        help="The full path to the bazelrc file into which "
+        "the build command will be written. The path "
+        "will be relative to the container "
+        " environment.",
+        required=True)
+
+    self.args = arg_parser.parse_args()
+
+  def validate_args(self):
+    if os.path.exists(self.args.bazelrc_file):
+      if os.path.isfile(self.args.bazelrc_file):
+        self._debug("The file {} exists and will be deleted.".format(
+            self.args.bazelrc_file))
+      elif os.path.isdir(self.args.bazelrc_file):
+        raise ValueError("{} is not a valid file name".format(
+            self.args.bazelrc_file))
+    return True
+
+  def set_build_args(self):
+    """Appends the selected Bazel flags, each followed by a space."""
+    opts = list(BASIC_BUILD_OPTS)
+    if self.args.secure_build:
+      opts.extend(SECURE_BUILD_OPTS)
+    isa_list = self.PLATFORMS.get(self.args.target_platform)["flags"]
+    opts.extend("--copt=-m{}".format(isa.lower()) for isa in isa_list)
+    if not self.args.disable_mkl:
+      opts.append("--config=mkl")
+    if not self.args.disable_v2:
+      opts.append("--config=v2")
+    # Every flag, including the last one, is terminated by a single space.
+    self.bazel_flags_ += "".join("{} ".format(opt) for opt in opts)
+
+  def write_build_args(self):
+    self._debug("Writing build flags: {}".format(self.bazel_flags_))
+    with open(self.args.bazelrc_file, "w") as f:
+      f.write(self.bazel_flags_)
+
+  def _debug(self, msg):
+    print(msg)
+
+  def go(self):
+    self.parse_args()
+    target_platform = self.PLATFORMS.get(self.args.target_platform)
+    if self.validate_args() and \
+      self.gcc_version_ok(target_platform["min_gcc_major_version"],
+                          target_platform["min_gcc_minor_version"]):
+      self.set_build_args()
+      self.write_build_args()
+    else:
+      print("Error.")
+
+
+env_setter = BuildEnvSetter()
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index c167a17..203c438 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -36,15 +36,7 @@
 SETUP_PY = "%s/tools/pip_package/setup.py" % TF_SRC_DIR
 README_MD = "./README.md"
 TENSORFLOW_BZL = "%s/tensorflow.bzl" % TF_SRC_DIR
-DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel" % TF_SRC_DIR
-GPU_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-gpu" % TF_SRC_DIR
-CPU_MKL_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-mkl" % TF_SRC_DIR
-RELEVANT_FILES = [TF_SRC_DIR,
-                  VERSION_H,
-                  SETUP_PY,
-                  README_MD,
-                  DEVEL_DOCKERFILE,
-                  GPU_DEVEL_DOCKERFILE]
+RELEVANT_FILES = [TF_SRC_DIR, VERSION_H, SETUP_PY, README_MD]
 
 # Version type parameters.
 NIGHTLY_VERSION = 1
@@ -238,24 +230,6 @@
   return False
 
 
-def update_dockerfiles(old_version, new_version):
-  """Update dockerfiles if there was a major change."""
-  if major_minor_change(old_version, new_version):
-    old_r_major_minor = "r%s.%s" % (old_version.major, old_version.minor)
-    r_major_minor = "r%s.%s" % (new_version.major, new_version.minor)
-
-    print("Detected Major.Minor change.")
-    print("Updating pattern %s to %s in additional files"
-          % (old_r_major_minor, r_major_minor))
-
-    # Update dockerfiles
-    replace_string_in_line(old_r_major_minor, r_major_minor, DEVEL_DOCKERFILE)
-    replace_string_in_line(old_r_major_minor, r_major_minor,
-                           GPU_DEVEL_DOCKERFILE)
-    replace_string_in_line(old_r_major_minor, r_major_minor,
-                           CPU_MKL_DEVEL_DOCKERFILE)
-
-
 def check_for_lingering_string(lingering_string):
   """Check for given lingering strings."""
   formatted_string = lingering_string.replace(".", r"\.")
@@ -333,7 +307,6 @@
   update_version_h(old_version, new_version)
   update_setup_dot_py(old_version, new_version)
   update_readme(old_version, new_version)
-  update_dockerfiles(old_version, new_version)
   update_tensorflow_bzl(old_version, new_version)
 
   # Print transition details.
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index 8c01d15..05fc81b 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -21,6 +21,7 @@
 py_test(
     name = "public_api_test",
     srcs = ["public_api_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":public_api",
@@ -38,6 +39,7 @@
 py_test(
     name = "traverse_test",
     srcs = ["traverse_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":test_module1",
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 6db6669..fb0c7ef 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -2,6 +2,7 @@
     "//tensorflow:tensorflow.bzl",
     "tf_copts",  # @unused
     "tf_cc_test",  # @unused
+    "py_test",
 )
 
 licenses(["notice"])  # Apache 2.0
@@ -54,6 +55,9 @@
     name = "tf_upgrade_test",
     srcs = ["tf_upgrade_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
     deps = [
         ":tf_upgrade_lib",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index ca6dd5a..01db9a9 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -27,8 +27,13 @@
 tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded --copyotherfiles False
 ```
 
-*Note: `tf_upgrade_v2` is installed automatically as a script by the pip install 
-after TensorFlow 1.12.
+*Note:* `tf_upgrade_v2` is installed automatically as a script by the pip install
+after TensorFlow 1.12.
+
+You may want to retain revision history, especially when preparing a CL:
+```
+g4 integrate --retroactive coolcode/... coolcode-upgraded/...
+```
 
 ## Report
 
diff --git a/tensorflow/tools/compatibility/all_renames_v2.py b/tensorflow/tools/compatibility/all_renames_v2.py
index bbe9cbd..a68c452 100644
--- a/tensorflow/tools/compatibility/all_renames_v2.py
+++ b/tensorflow/tools/compatibility/all_renames_v2.py
@@ -159,6 +159,8 @@
         "tf.CriticalSection",
     "tf.contrib.framework.is_tensor":
         "tf.is_tensor",
+    "tf.contrib.framework.load_variable":
+        "tf.train.load_variable",
     "tf.contrib.framework.nest.assert_same_structure":
         "tf.nest.assert_same_structure",
     "tf.contrib.framework.nest.flatten":
@@ -201,6 +203,8 @@
         "tf.sort",
     "tf.contrib.framework.argsort":
         "tf.argsort",
+    "tf.contrib.summary.all_summary_ops":
+        "tf.compat.v1.summary.all_v2_summary_ops",
     "tf.contrib.summary.always_record_summaries":
         "tf.compat.v2.summary.record_if",
     "tf.contrib.summary.audio":
diff --git a/tensorflow/tools/compatibility/ipynb.py b/tensorflow/tools/compatibility/ipynb.py
index d37a1ab..fed5f0f 100644
--- a/tensorflow/tools/compatibility/ipynb.py
+++ b/tensorflow/tools/compatibility/ipynb.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-"""A module to support operation on ipynb files"""
+"""A module to support operations on ipynb files"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +21,7 @@
 import collections
 import copy
 import json
+import re
 import shutil
 import tempfile
 
@@ -62,8 +63,45 @@
   return files_processed, report_text, errors
 
 
+def skip_magic(code_line, magic_list):
+  """Checks if the cell has magic, that is not Python-based.
+
+  Args:
+      code_line: A line of Python code
+      magic_list: A list of jupyter "magic" exceptions
+
+  Returns:
+    True if the line is a jupyter "magic" line rather than a Python line,
+    i.e. it starts with one of the prefixes in `magic_list`; False otherwise.
+
+  >>> skip_magic('!ls -laF', ['%', '!', '?'])
+  True
+  """
+
+  # Generator form of the prefix scan; stops at the first matching prefix.
+  is_magic = any(code_line.startswith(magic) for magic in magic_list)
+
+  return is_magic
+
+
+def check_line_split(code_line):
+  r"""Checks if a line was split with `\`.
+
+  Args:
+      code_line: A line of Python code
+
+  Returns:
+    True if the line ends with `\` (plus optional whitespace) and a newline
+
+  >>> check_line_split("!gcloud ml-engine models create ${MODEL} \\\n")
+  True
+  """
+
+  return bool(re.search(r"\\\s*\n$", code_line))
+
+
 def _get_code(input_file):
-  """Load the ipynb file and return a list of CodeLines."""
+  """Loads the ipynb file and returns a list of CodeLines."""
 
   raw_code = []
 
@@ -75,15 +113,21 @@
     if is_python(cell):
       cell_lines = cell["source"]
 
+      is_line_split = False
       for line_idx, code_line in enumerate(cell_lines):
 
         # Sometimes, jupyter has more than python code
         # Idea is to comment these lines, for upgrade time
-        if code_line.startswith("%") or code_line.startswith("!") \
-            or code_line.startswith("?"):
+        if skip_magic(code_line, ["%", "!", "?"]) or is_line_split:
           # Found a special character, need to "encode"
           code_line = "###!!!" + code_line
 
+          # if this cell ends with `\` -> skip the next line
+          is_line_split = check_line_split(code_line)
+
+        if is_line_split:
+          is_line_split = check_line_split(code_line)
+
         # Sometimes, people leave \n at the end of cell
         # in order to migrate only related things, and make the diff
         # the smallest -> here is another hack
@@ -102,7 +146,7 @@
 
 
 def _update_notebook(original_notebook, original_raw_lines, updated_code_lines):
-  """Update notebook, once migration is done."""
+  """Updates notebook, once migration is done."""
 
   new_notebook = copy.deepcopy(original_notebook)
 
@@ -113,7 +157,7 @@
 
   code_cell_idx = 0
   for cell in new_notebook["cells"]:
-    if cell["cell_type"] != "code":
+    if not is_python(cell):
       continue
 
     applicable_lines = [
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index fc11a9b..2019258 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -116,7 +116,7 @@
     'tf.SessionLog':
         'tf.compat.v1.SessionLog',
     'tf.SparseConditionalAccumulator':
-        'tf.sparse.SparseConditionalAccumulator',
+        'tf.compat.v1.SparseConditionalAccumulator',
     'tf.SparseFeature':
         'tf.io.SparseFeature',
     'tf.SparseTensorValue':
@@ -383,6 +383,10 @@
         'tf.signal.fft3d',
     'tf.fixed_size_partitioner':
         'tf.compat.v1.fixed_size_partitioner',
+    'tf.floor_div':
+        'tf.math.floordiv',
+    'tf.floormod':
+        'tf.math.floormod',
     'tf.floordiv':
         'tf.math.floordiv',
     'tf.get_collection':
@@ -605,6 +609,8 @@
         'tf.compat.v1.lite.constants.TFLITE',
     'tf.lite.experimental.convert_op_hints_to_stubs':
         'tf.compat.v1.lite.experimental.convert_op_hints_to_stubs',
+    'tf.lite.experimental.get_potentially_supported_ops':
+        'tf.compat.v1.lite.experimental.get_potentially_supported_ops',
     'tf.lite.experimental.nn.TFLiteLSTMCell':
         'tf.compat.v1.lite.experimental.nn.TFLiteLSTMCell',
     'tf.lite.experimental.nn.TfLiteRNNCell':
@@ -807,6 +813,8 @@
         'tf.compat.v1.metrics.true_positives_at_thresholds',
     'tf.min_max_variable_partitioner':
         'tf.compat.v1.min_max_variable_partitioner',
+    'tf.mod':
+        'tf.math.mod',
     'tf.model_variables':
         'tf.compat.v1.model_variables',
     'tf.moving_average_variables':
@@ -879,6 +887,8 @@
         'tf.random.uniform_candidate_sampler',
     'tf.nn.xw_plus_b':
         'tf.compat.v1.nn.xw_plus_b',
+    'tf.no_regularizer':
+        'tf.compat.v1.no_regularizer',
     'tf.op_scope':
         'tf.compat.v1.op_scope',
     'tf.parse_single_sequence_example':
@@ -1129,6 +1139,8 @@
         'tf.sets.union',
     'tf.space_to_depth':
         'tf.compat.v1.space_to_depth',
+    'tf.sparse.SparseConditionalAccumulator':
+        'tf.compat.v1.sparse.SparseConditionalAccumulator',
     'tf.sparse.matmul':
         'tf.sparse.sparse_dense_matmul',
     'tf.sparse.merge':
@@ -1235,6 +1247,8 @@
         'tf.compat.v1.summary.SummaryDescription',
     'tf.summary.TaggedRunMetadata':
         'tf.compat.v1.summary.TaggedRunMetadata',
+    'tf.summary.all_v2_summary_ops':
+        'tf.compat.v1.summary.all_v2_summary_ops',
     'tf.summary.audio':
         'tf.compat.v1.summary.audio',
     'tf.summary.get_summary_description':
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 2a35d9f..e446f67 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -465,6 +465,15 @@
         "tf.nn.weighted_cross_entropy_with_logits": {
             "targets": "labels",
         },
+        "tf.decode_raw": {
+            "bytes": "input_bytes",
+        },
+        "tf.io.decode_raw": {
+            "bytes": "input_bytes",
+        },
+        "tf.contrib.framework.load_variable": {
+            "checkpoint_dir": "ckpt_dir_or_file",
+        }
     }
 
     # Mapping from function to the new name of the function
@@ -663,6 +672,16 @@
         " they may already have been correct)."
     )
 
+    contrib_layers_layer_norm_comment = (
+        ast_edits.WARNING,
+        "(Manual edit required) `tf.contrib.layers.layer_norm` has been "
+        "deprecated, and its implementation has been integrated with "
+        "`tf.keras.layers.LayerNormalization` in TensorFlow 2.0. "
+        "Note that, the default value of `epsilon` is changed to `1e-3` in the "
+        "new API from `1e-12`, and this may introduce numerical differences. "
+        "Please check the new API and use that instead."
+    )
+
     initializers_no_dtype_comment = (
         ast_edits.INFO,
         "Initializers no longer have the "
@@ -873,6 +892,10 @@
             assert_rank_comment,
         "tf.assert_rank_in":
             assert_rank_comment,
+        "tf.contrib.layers.layer_norm":
+            contrib_layers_layer_norm_comment,
+        "tf.contrib.summary.all_summary_ops":
+            contrib_summary_comment,
         "tf.contrib.summary.audio":
             contrib_summary_comment,
         "tf.contrib.summary.create_file_writer":
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 8eb0b83..f02482a 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -1528,6 +1528,10 @@
     _, _, errors, _ = self._upgrade("tf.flags.FLAGS")
     self.assertIn("tf.flags has been removed", errors[0])
 
+  def test_contrib_layers_layer_norm_deprecation(self):
+    _, report, _, _ = self._upgrade("tf.contrib.layers.layer_norm")
+    self.assertIn("`tf.contrib.layers.layer_norm` has been deprecated", report)
+
   def test_contrib_rnn_deprecation(self):
     _, report, _, _ = self._upgrade("tf.contrib.rnn")
     self.assertIn("tf.contrib.rnn.* has been deprecated", report)
@@ -1712,6 +1716,12 @@
     expected_error = "replaced by a call to tf.compat.v2.summary.record_if()"
     self.assertIn(expected_error, errors[0])
 
+  def test_contrib_summary_all_summary_ops(self):
+    text = "tf.contrib.summary.all_summary_ops()"
+    expected = "tf.compat.v1.summary.all_v2_summary_ops()"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
   def test_contrib_summary_full_example(self):
     deindent = lambda n, s: "\n".join(line[n:] for line in s.split("\n"))
     text = deindent(4, """
@@ -1955,6 +1965,25 @@
     self.assertEqual(expected, new_text)
     self.assertIn("tf.contrib.distribute.* have been migrated", report)
 
+  def test_decode_raw(self):
+    text = "tf.io.decode_raw(bytes=[1,2,3], output_dtype=tf.int32)"
+    expected_text = (
+        "tf.io.decode_raw(input_bytes=[1,2,3], output_dtype=tf.int32)")
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_load_variable(self):
+    text = "tf.contrib.framework.load_variable('a')"
+    expected_text = (
+        "tf.train.load_variable('a')")
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+    text = "tf.contrib.framework.load_variable(checkpoint_dir='a')"
+    expected_text = (
+        "tf.train.load_variable(ckpt_dir_or_file='a')")
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
index 593603c..1758e0e 100644
--- a/tensorflow/tools/compatibility/update/BUILD
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -5,6 +5,7 @@
 py_binary(
     name = "generate_v2_renames_map",
     srcs = ["generate_v2_renames_map.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
@@ -19,6 +20,7 @@
 py_binary(
     name = "generate_v2_reorders_map",
     srcs = ["generate_v2_reorders_map.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/tools/docker/BUILD b/tensorflow/tools/docker/BUILD
deleted file mode 100644
index 849ba49..0000000
--- a/tensorflow/tools/docker/BUILD
+++ /dev/null
@@ -1,15 +0,0 @@
-# Description:
-# Various tools and rules related to the TensorFlow docker container.
-
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "simple_console",
-    srcs = ["simple_console.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
deleted file mode 100644
index 6676de0..0000000
--- a/tensorflow/tools/docker/Dockerfile
+++ /dev/null
@@ -1,73 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Craig Citro <craigcitro@google.com>"
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng-dev \
-        libzmq3-dev \
-        pkg-config \
-        python \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-RUN pip --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        numpy \
-        pandas \
-        scipy \
-        sklearn \
-        && \
-    python -m ipykernel.kernelspec
-
-# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-# These lines will be edited automatically by parameterized_docker_build.sh. #
-# COPY _PIP_FILE_ /
-# RUN pip --no-cache-dir install /_PIP_FILE_
-# RUN rm -f /_PIP_FILE_
-
-# Install TensorFlow CPU version from central repo
-RUN pip --no-cache-dir install \
-    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
-# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-
-# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Copy sample notebooks.
-COPY notebooks /notebooks
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR "/notebooks"
-
-CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
deleted file mode 100644
index c26fa01..0000000
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ /dev/null
@@ -1,108 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Craig Citro <craigcitro@google.com>"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-RUN pip --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        mock \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        && \
-    python -m ipykernel.kernelspec
-
-# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to work around sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-# Install the bazel release pinned by BAZEL_VERSION.
-ENV BAZEL_VERSION 0.20.0
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    cd / && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /tensorflow
-RUN git clone --branch=r1.13 --depth=1 https://github.com/tensorflow/tensorflow.git .
-
-# TODO(craigcitro): Don't install the pip package, since it makes it
-# more difficult to experiment with local changes. Instead, just add
-# the built directory to the path.
-
-ENV CI_BUILD_PYTHON python
-
-RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-        # For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
-        # For ivy-bridge or sandy-bridge
-        # --copt=-march="ivybridge" \
-        # for haswell, broadwell, or skylake
-        # --copt=-march="haswell" \
-        tensorflow/tools/pip_package:build_pip_package && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
-    pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
-    rm -rf /tmp/pip && \
-    rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR /root
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
deleted file mode 100644
index f745018..0000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ /dev/null
@@ -1,125 +0,0 @@
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-LABEL maintainer="Craig Citro <craigcitro@google.com>"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-# Install the TensorRT (libnvinfer) packages; -y keeps apt-get non-interactive during the build.
-RUN apt-get update && \
-        apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install -y libnvinfer4=4.1.2-1+cuda9.0 libnvinfer-dev=4.1.2-1+cuda9.0
-
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-RUN pip --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        mock \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        && \
-    python -m ipykernel.kernelspec
-
-# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to work around sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-# Install the bazel release pinned by BAZEL_VERSION.
-ENV BAZEL_VERSION 0.20.0
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    cd / && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /tensorflow
-RUN git clone --branch=r1.13 --depth=1 https://github.com/tensorflow/tensorflow.git .
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON python
-ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
-ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
-ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=9.0
-ENV TF_CUDNN_VERSION=7
-
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
-    tensorflow/tools/ci_build/builds/configured GPU \
-    bazel build -c opt --copt=-mavx --config=cuda \
-	--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-        tensorflow/tools/pip_package:build_pip_package && \
-    rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
-    pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
-    rm -rf /tmp/pip && \
-    rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
deleted file mode 100755
index 32aa00b..0000000
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ /dev/null
@@ -1,130 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
-
-# These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.13
-ARG PYTHON="python"
-ARG PYTHON3_DEV=""
-ARG WHL_DIR="/tmp/pip"
-ARG PIP="pip"
-
-RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
-        ${PYTHON} \
-        ${PYTHON}-dev \
-        ${PYTHON}-pip \
-        ${PYTHON}-setuptools \
-        ${PYTHON}-wheel \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng-dev \
-        libssl-dev \
-        libzmq3-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-RUN ${PIP} --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        mock \
-        numpy \
-        pandas \
-        scipy \
-        sklearn \
-        && \
-    ${PYTHON} -m ipykernel.kernelspec
-
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to work around sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-# Install the bazel release pinned by BAZEL_VERSION.
-ENV BAZEL_VERSION 0.20.0
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    cd / && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /tensorflow
-
-# Download and build TensorFlow.
-# Support both tags and branches: a TF_BUILD_VERSION starting with "v" is checked out as a tag, anything else is cloned as a branch.
-RUN export TAG_PREFIX="v" && \
-    echo ${TF_BUILD_VERSION} | grep -q ^${TAG_PREFIX}; \
-    if [ $? -eq 0 ]; then \
-        git clone --depth=1 https://github.com/tensorflow/tensorflow.git . && \
-        git fetch --tags && \
-        git checkout ${TF_BUILD_VERSION}; \
-   else \
-        git clone --depth=1 --branch=${TF_BUILD_VERSION} https://github.com/tensorflow/tensorflow.git . ; \
-    fi
-
-RUN yes "" | ${PYTHON} configure.py
-RUN cp .bazelrc /root/.bazelrc
-
-ENV CI_BUILD_PYTHON ${PYTHON}
-
-# Set bazel build parameters in .bazelrc in parameterized_docker_build.sh
-# Use --copt=-march values to get optimized builds appropriate for the hardware
-#   platform of your choice.
-# For ivy-bridge or sandy-bridge
-# --copt=-march="avx" \
-# For haswell, broadwell, or skylake
-# --copt=-march="avx2" \
-COPY .bazelrc /root/.mkl.bazelrc
-RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc
-
-RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel --bazelrc=/root/.bazelrc build -c opt \
-    tensorflow/tools/pip_package:build_pip_package && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \
-    ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \
-    rm -rf /root/.cache
-# Clean up Bazel cache when done.
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR /root
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
deleted file mode 100755
index 2114091..0000000
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ /dev/null
@@ -1,168 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Cong Xu <cong.xu@intel.com>"
-
-# These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.13
-ARG PYTHON="python"
-ARG PYTHON3_DEV=""
-ARG WHL_DIR="/tmp/pip"
-ARG PIP="pip"
-
-
-RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
-        ${PYTHON} \
-        ${PYTHON}-dev \
-        ${PYTHON}-pip \
-        ${PYTHON}-setuptools \
-        ${PYTHON}-wheel \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libnuma-dev \
-        libpng-dev \
-        libzmq3-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        openssh-client \
-        openssh-server \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        wget \
-        zip \
-        zlib1g-dev \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-RUN ${PIP} --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        mock \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        && \
-    ${PYTHON} -m ipykernel.kernelspec
-
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to work around sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-# Install the bazel release pinned by BAZEL_VERSION.
-ENV BAZEL_VERSION 0.20.0
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    cd / && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /tensorflow
-
-# Download and build TensorFlow.
-# Support both tags and branches: a TF_BUILD_VERSION starting with "v" is checked out as a tag, anything else is cloned as a branch.
-RUN export TAG_PREFIX="v" && \
-    echo ${TF_BUILD_VERSION} | grep -q ^${TAG_PREFIX}; \
-    if [ $? -eq 0 ]; then \
-        git clone --depth=1 https://github.com/tensorflow/tensorflow.git . && \
-        git fetch --tags && \
-        git checkout ${TF_BUILD_VERSION}; \
-   else \
-        git clone --depth=1 --branch=${TF_BUILD_VERSION} https://github.com/tensorflow/tensorflow.git . ; \
-    fi
-
-RUN yes "" | ${PYTHON} configure.py
-RUN cp .bazelrc /root/.bazelrc
-
-ENV CI_BUILD_PYTHON ${PYTHON}
-
-# Set bazel build parameters in .bazelrc in parameterized_docker_build.sh
-# Use --copt=-march values to get optimized builds appropriate for the hardware
-#   platform of your choice.
-# For ivy-bridge or sandy-bridge
-# --copt=-march="avx" \
-# For haswell, broadwell, or skylake
-# --copt=-march="avx2" \
-COPY .bazelrc /root/.mkl.bazelrc
-RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc
-
-RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel --bazelrc=/root/.bazelrc build -c opt \
-    tensorflow/tools/pip_package:build_pip_package && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \
-    ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \
-    rm -rf /root/.cache
-# Clean up Bazel cache when done.
-
-WORKDIR /root
-
-# Install Open MPI
-RUN mkdir /tmp/openmpi && \
-    cd /tmp/openmpi && \
-    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
-    tar zxf openmpi-3.0.0.tar.gz && \
-    cd openmpi-3.0.0 && \
-    ./configure --enable-orterun-prefix-by-default && \
-    make -j $(nproc) all && \
-    make install && \
-    ldconfig && \
-    rm -rf /tmp/openmpi
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
-    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
-    chmod a+x /usr/local/bin/mpirun
-
-# Configure OpenMPI with good defaults (exclude the lo and docker0 interfaces from TCP):
-RUN echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
-
-# Install Horovod
-RUN ${PIP} install --no-cache-dir horovod
-
-# Create the sshd runtime directory; OpenSSH lets MPI communicate between containers
-RUN mkdir -p /var/run/sshd
-
-# Allow OpenSSH to talk to containers without asking for confirmation
-RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
-    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
-    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR /root
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
deleted file mode 100644
index 7dc92a8..0000000
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ /dev/null
@@ -1,89 +0,0 @@
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-LABEL maintainer="Craig Citro <craigcitro@google.com>"
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        curl \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install the TensorRT runtime (libnvinfer); -y keeps apt-get non-interactive during the build.
-RUN apt-get update && apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install -y libnvinfer4=4.1.2-1+cuda9.0
-
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-RUN pip --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        numpy \
-        pandas \
-        scipy \
-        sklearn \
-        && \
-    python -m ipykernel.kernelspec
-
-# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-# These lines will be edited automatically by parameterized_docker_build.sh. #
-# COPY _PIP_FILE_ /
-# RUN pip --no-cache-dir install /_PIP_FILE_
-# RUN rm -f /_PIP_FILE_
-
-# Install TensorFlow GPU version.
-RUN pip --no-cache-dir install \
-    http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
-# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-
-# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Copy sample notebooks.
-COPY notebooks /notebooks
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# For CUDA profiling, TensorFlow requires CUPTI.
-ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR "/notebooks"
-
-CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
deleted file mode 100755
index 3f7729b..0000000
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ /dev/null
@@ -1,75 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
-
-# This parameter MUST be set by parameterized_docker_build.sh
-ARG TF_WHL_URL
-
-# Optional parameters
-ARG TF_BUILD_VERSION=r1.13
-ARG PYTHON="python"
-ARG PYTHON_DEV="python-dev"
-ARG PIP="pip"
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
-        ${PYTHON} \
-        ${PYTHON}-dev \
-        ${PYTHON}-pip \
-        ${PYTHON}-setuptools \
-        ${PYTHON}-wheel \
-        build-essential \
-        curl \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-RUN ${PIP} --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        numpy \
-        pandas \
-        scipy \
-        sklearn \
-        && \
-    ${PYTHON} -m ipykernel.kernelspec
-
-
-COPY ${TF_WHL_URL} /
-RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
-    rm -rf /${TF_WHL_URL}
-
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Copy sample notebooks.
-COPY notebooks /notebooks
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR "/notebooks"
-
-CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
deleted file mode 100755
index b0afd63..0000000
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ /dev/null
@@ -1,113 +0,0 @@
-FROM ubuntu:18.04
-
-LABEL maintainer="Cong Xu <cong.xu@intel.com>"
-
-# This parameter MUST be set by parameterized_docker_build.sh
-ARG TF_WHL_URL
-
-# Optional parameters
-ARG TF_BUILD_VERSION=r1.13
-ARG PYTHON="python"
-ARG PYTHON_DEV="python-dev"
-ARG PIP="pip"
-
-# Pick up some TF dependencies (plus wget and OpenSSH, which the Open MPI and ssh_config steps below rely on)
-RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
-        ${PYTHON} \
-        ${PYTHON}-dev \
-        ${PYTHON}-pip \
-        ${PYTHON}-setuptools \
-        ${PYTHON}-wheel \
-        build-essential \
-        curl \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libnuma-dev \
-        libpng-dev \
-        libzmq3-dev \
-        openssh-client \
-        openssh-server \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        wget \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-RUN ${PIP} --no-cache-dir install \
-        Pillow \
-        h5py \
-        ipykernel \
-        jupyter \
-        keras_applications \
-        keras_preprocessing \
-        matplotlib \
-        numpy \
-        pandas \
-        scipy \
-        sklearn \
-        && \
-    ${PYTHON} -m ipykernel.kernelspec
-
-
-COPY ${TF_WHL_URL} /
-RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
-    rm -rf /${TF_WHL_URL}
-
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Copy sample notebooks.
-COPY notebooks /notebooks
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-WORKDIR /root
-
-# Install Open MPI
-RUN mkdir /tmp/openmpi && \
-    cd /tmp/openmpi && \
-    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
-    tar zxf openmpi-3.0.0.tar.gz && \
-    cd openmpi-3.0.0 && \
-    ./configure --enable-orterun-prefix-by-default && \
-    make -j $(nproc) all && \
-    make install && \
-    ldconfig && \
-    rm -rf /tmp/openmpi
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
-    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
-    chmod a+x /usr/local/bin/mpirun
-
-# Configure OpenMPI with good defaults (exclude the lo and docker0 interfaces from TCP):
-RUN echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
-
-# Install Horovod
-RUN ${PIP} install --no-cache-dir horovod
-
-# Create the sshd runtime directory; OpenSSH lets MPI communicate between containers
-RUN mkdir -p /var/run/sshd
-
-# Allow OpenSSH to talk to containers without asking for confirmation
-RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
-    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
-    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-WORKDIR "/notebooks"
-
-CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/LICENSE b/tensorflow/tools/docker/LICENSE
deleted file mode 100644
index dea770e..0000000
--- a/tensorflow/tools/docker/LICENSE
+++ /dev/null
@@ -1,13 +0,0 @@
-Copyright 2018 The TensorFlow Authors.  All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
deleted file mode 100644
index 176094c..0000000
--- a/tensorflow/tools/docker/README.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# WARNING: THESE IMAGES ARE DEPRECATED.
-
-TensorFlow's Dockerfiles are now located in
-[`tensorflow/tools/dockerfiles/`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles).
-However, these Dockerfiles are still used to build
-[TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow)
-while the internal infrastructure for the newer Dockerfiles is being developed.
-
-This directory will eventually be removed.
-
-# Using TensorFlow via Docker
-
-This directory contains `Dockerfile`s to make it easy to get up and running with
-TensorFlow via [Docker](http://www.docker.com/).
-
-## Installing Docker
-
-General installation instructions are
-[on the Docker site](https://docs.docker.com/installation/), but we give some
-quick links here:
-
-* [OSX](https://www.docker.com/products/docker#/mac)
-* [Ubuntu](https://docs.docker.com/engine/installation/linux/ubuntulinux/)
-
-## Which containers exist?
-
-We currently maintain two Docker container images:
-
-* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
-
-* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
-  and support for NVidia CUDA
-
-Note: We store all our containers on 
-[Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
-
-
-## Running the container
-
-Run non-GPU container using
-
-    $ docker run -it -p 8888:8888 tensorflow/tensorflow
-
-For GPU support install NVidia drivers (ideally latest) and
-[nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using
-
-    $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
-
-
-Note: If you have a problem running nvidia-docker, you may try the old method
-we used, but it is not recommended. If you find a bug in nvidia-docker, please report
-it there and try using nvidia-docker as described above.
-
-    $ # The old, not recommended way to run docker with gpu support:
-    $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
-    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu
-
-
-## More containers
-
-See all available [tags](https://hub.docker.com/r/tensorflow/tensorflow/tags/)
-for additional containers, such as release candidates or nightly builds.
-
-
-## Rebuilding the containers
-
-Building TensorFlow Docker containers should be done through the
-[parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh)
-script. The raw Dockerfiles should not be used directly as they contain strings
-to be replaced by the script during the build.
-
-Attempting to run [parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh)
-from a binary Docker image, for example `tensorflow/tensorflow:latest`, will
-not work. The script must be run from a developer Docker image because, unlike
-a binary image, it contains not only the compiled package but also the
-TensorFlow source code. Please select the appropriate developer image tag of
-`tensorflow/tensorflow` from the [list of tags](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
-
-The minimal command line to start a container from that image will then be:
-```docker run -it tensorflow/tensorflow:"right_tag"```
-
-If you would like to start a jupyter notebook on your docker container, make sure
-to map the port 8888 of your docker container by adding -p 8888:8888 to the above
-command.
-
-To use the script, specify the container type (`CPU` vs. `GPU`), the desired
-Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
-is to be built (`NO` vs. `YES`). In addition, you need to specify the central
-location from where the pip package of TensorFlow will be downloaded.
-
-For example, to build a CPU-only non-developer Docker image for Python 2, using
-TensorFlow's nightly pip package:
-
-``` bash
-export TF_DOCKER_BUILD_IS_DEVEL=NO
-export TF_DOCKER_BUILD_TYPE=CPU
-export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2
-
-pip download --no-deps tf-nightly
-
-export TF_DOCKER_BUILD_CENTRAL_PIP=$(ls tf_nightly*.whl)
-export TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL=1
-
-tensorflow/tools/docker/parameterized_docker_build.sh
-```
-
-If successful, the image will be tagged as `${USER}/tensorflow:latest` by default.
-
-Rebuilding GPU images requires [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py
deleted file mode 100644
index 4449e35..0000000
--- a/tensorflow/tools/docker/jupyter_notebook_config.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-import os
-from IPython.lib import passwd
-
-c = c  # pylint:disable=undefined-variable
-c.NotebookApp.ip = '0.0.0.0'  # https://github.com/jupyter/notebook/issues/3946
-c.NotebookApp.port = int(os.getenv('PORT', 8888))
-c.NotebookApp.open_browser = False
-
-# sets a password if PASSWORD is set in the environment
-if 'PASSWORD' in os.environ:
-  password = os.environ['PASSWORD']
-  if password:
-    c.NotebookApp.password = passwd(password)
-  else:
-    c.NotebookApp.password = ''
-    c.NotebookApp.token = ''
-  del os.environ['PASSWORD']
diff --git a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb b/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb
deleted file mode 100644
index 8fa871e..0000000
--- a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb
+++ /dev/null
@@ -1,683 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "a3bskVXPvchm"
-      },
-      "source": [
-        "# Hello, TensorFlow\n",
-        "## A beginner-level, getting started, basic introduction to TensorFlow"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rb5rSpcZvYbX"
-      },
-      "source": [
-        "TensorFlow is a general-purpose system for graph-based computation. A typical use is machine learning. In this notebook, we'll introduce the basic concepts of TensorFlow using some simple examples.\n",
-        "\n",
-        "TensorFlow gets its name from [tensors](https://en.wikipedia.org/wiki/Tensor), which are arrays of arbitrary dimensionality. A vector is a 1-d array and is known as a 1st-order tensor. A matrix is a 2-d array and a 2nd-order tensor. The \"flow\" part of the name refers to computation flowing through a graph. Training and inference in a neural network, for example, involves the propagation of matrix computations through many nodes in a computational graph.\n",
-        "\n",
-        "When you think of doing things in TensorFlow, you might want to think of creating tensors (like matrices), adding operations (that output other tensors), and then executing the computation (running the computational graph). In particular, it's important to realize that when you add an operation on tensors, it doesn't execute immediately. Rather, TensorFlow waits for you to define all the operations you want to perform. Then, TensorFlow optimizes the computation graph, deciding how to execute the computation, before generating the data. Because of this, a tensor in TensorFlow isn't so much holding the data as a placeholder for holding the data, waiting for the data to arrive when a computation is executed."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "E8FhiMivhcYB"
-      },
-      "source": [
-        "## Adding two vectors in TensorFlow\n",
-        "\n",
-        "Let's start with something that should be simple. Let's add two length four vectors (two 1st-order tensors):\n",
-        "\n",
-        "$\\begin{bmatrix} 1. \u0026 1. \u0026 1. \u0026 1.\\end{bmatrix} + \\begin{bmatrix} 2. \u0026 2. \u0026 2. \u0026 2.\\end{bmatrix} = \\begin{bmatrix} 3. \u0026 3. \u0026 3. \u0026 3.\\end{bmatrix}$"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 2922,
-          "status": "ok",
-          "timestamp": 1474675631337,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "2iv3XQ6k3eF1",
-        "outputId": "7dbded62-91bc-4e38-9f25-53375c4c8dd8"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "result:  [ 3.  3.  3.  3.]\n"
-          ]
-        }
-      ],
-      "source": [
-        "from __future__ import print_function\n",
-        "\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "with tf.Session():\n",
-        "    input1 = tf.constant([1.0, 1.0, 1.0, 1.0])\n",
-        "    input2 = tf.constant([2.0, 2.0, 2.0, 2.0])\n",
-        "    output = tf.add(input1, input2)\n",
-        "    result = output.eval()\n",
-        "    print(\"result: \", result)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "dqLV5GXT3wLy"
-      },
-      "source": [
-        "What we're doing is creating two vectors, [1.0, 1.0, 1.0, 1.0] and [2.0, 2.0, 2.0, 2.0], and then adding them. Here's equivalent code in raw Python and using numpy:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 214,
-          "status": "ok",
-          "timestamp": 1474675631563,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "7DzDJ7sW79ao",
-        "outputId": "588b573b-95d2-4587-849e-af6f3ec1303e"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[3.0, 3.0, 3.0, 3.0]\n"
-          ]
-        }
-      ],
-      "source": [
-        "print([x + y for x, y in zip([1.0] * 4, [2.0] * 4)])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 340,
-          "status": "ok",
-          "timestamp": 1474675631948,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "MDWJf0lHAF4E",
-        "outputId": "bee09475-24dd-4331-fc46-692a07dae101"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[ 1.  1.  1.  1.] + [ 2.  2.  2.  2.] = [ 3.  3.  3.  3.]\n"
-          ]
-        }
-      ],
-      "source": [
-        "import numpy as np\n",
-        "x, y = np.full(4, 1.0), np.full(4, 2.0)\n",
-        "print(\"{} + {} = {}\".format(x, y, x + y))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "I52jQOyO8vAn"
-      },
-      "source": [
-        "## Details of adding two vectors in TensorFlow\n",
-        "\n",
-        "The example above of adding two vectors involves a lot more than it seems, so let's look at it in more depth.\n",
-        "\n",
-        "\u003e`import tensorflow as tf`\n",
-        "\n",
-        "This import brings TensorFlow's public API into our IPython runtime environment.\n",
-        "\n",
-        "\u003e`with tf.Session():`\n",
-        "\n",
-        "When you run an operation in TensorFlow, you need to do it in the context of a `Session`. A session holds the computation graph, which contains the tensors and the operations. When you create tensors and operations, they are not executed immediately, but wait for other operations and tensors to be added to the graph, only executing when finally requested to produce the results of the session. Deferring the execution like this provides additional opportunities for parallelism and optimization, as TensorFlow can decide how to combine operations and where to run them after TensorFlow knows about all the operations. \n",
-        "\n",
-        "\u003e\u003e`input1 = tf.constant([1.0, 1.0, 1.0, 1.0])`\n",
-        "\n",
-        "\u003e\u003e`input2 = tf.constant([2.0, 2.0, 2.0, 2.0])`\n",
-        "\n",
-        "The next two lines create tensors using a convenience function called `constant`, which is similar to numpy's `array` and numpy's `full`. If you look at the code for `constant`, you can see the details of what it is doing to create the tensor. In summary, it creates a tensor of the necessary shape and applies the constant operator to it to fill it with the provided values. The values to `constant` can be Python or numpy arrays. `constant` can take an optional shape parameter, which works similarly to numpy's `fill` if provided, and an optional name parameter, which can be used to put a more human-readable label on the operation in the TensorFlow operation graph.\n",
-        "\n",
-        "\u003e\u003e`output = tf.add(input1, input2)`\n",
-        "\n",
-        "You might think `add` just adds the two vectors now, but it doesn't quite do that. What it does is put the `add` operation into the computational graph. The results of the addition aren't available yet. They've been put in the computation graph, but the computation graph hasn't been executed yet.\n",
-        "\n",
-        "\u003e\u003e`result = output.eval()`\n",
-        "\n",
-        "\u003e\u003e`print result`\n",
-        "\n",
-        "`eval()` is also slightly more complicated than it looks. Yes, it does get the value of the vector (tensor) that results from the addition. It returns this as a numpy array, which can then be printed. But, it's important to realize it also runs the computation graph at this point, because we demanded the output from the operation node of the graph; to produce that, it had to run the computation graph. So, this is the point where the addition is actually performed, not when `add` was called, as `add` just put the addition operation into the TensorFlow computation graph."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "H_5_2YY3ySr2"
-      },
-      "source": [
-        "## Multiple operations\n",
-        "\n",
-        "To use TensorFlow, you add operations on tensors that produce tensors to the computation graph, then execute that graph to run all those operations and calculate the values of all the tensors in the graph.\n",
-        "\n",
-        "Here's a simple example with two operations:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 1203,
-          "status": "ok",
-          "timestamp": 1474675633108,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "-kQmn3U_yXX8",
-        "outputId": "8ba14a4d-b0cd-4b90-8b95-790e77d35e70"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[ 6.  6.  6.  6.]\n"
-          ]
-        }
-      ],
-      "source": [
-        "import tensorflow as tf\n",
-        "\n",
-        "with tf.Session():\n",
-        "    input1 = tf.constant(1.0, shape=[4])\n",
-        "    input2 = tf.constant(2.0, shape=[4])\n",
-        "    input3 = tf.constant(3.0, shape=[4])\n",
-        "    output = tf.add(tf.add(input1, input2), input3)\n",
-        "    result = output.eval()\n",
-        "    print(result)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Hod0zvsly8YT"
-      },
-      "source": [
-        "This version uses `constant` in a way similar to numpy's `fill`, specifying the optional shape and having the values copied out across it.\n",
-        "\n",
-        "The `add` operator supports operator overloading, so you could try writing it inline as `input1 + input2` instead as well as experimenting with other operators."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 350,
-          "status": "ok",
-          "timestamp": 1474675633468,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "yS2WElRfxz53",
-        "outputId": "2e3efae6-3990-447c-e05d-56a9d9701e87"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[ 3.  3.  3.  3.]\n"
-          ]
-        }
-      ],
-      "source": [
-        "with tf.Session():\n",
-        "    input1 = tf.constant(1.0, shape=[4])\n",
-        "    input2 = tf.constant(2.0, shape=[4])\n",
-        "    output = input1 + input2\n",
-        "    print(output.eval())"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "zszjoYUjkUNU"
-      },
-      "source": [
-        "##  Adding two matrices"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "EWNYBCB6kbri"
-      },
-      "source": [
-        "Next, let's do something very similar, adding two matrices:\n",
-        "\n",
-        "$\\begin{bmatrix}\n",
-        "  1. \u0026 1. \u0026 1. \\\\\n",
-        "  1. \u0026 1. \u0026 1. \\\\\n",
-        "\\end{bmatrix} + \n",
-        "\\begin{bmatrix}\n",
-        "  1. \u0026 2. \u0026 3. \\\\\n",
-        "  4. \u0026 5. \u0026 6. \\\\\n",
-        "\\end{bmatrix} = \n",
-        "\\begin{bmatrix}\n",
-        "  2. \u0026 3. \u0026 4. \\\\\n",
-        "  5. \u0026 6. \u0026 7. \\\\\n",
-        "\\end{bmatrix}$"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 1327,
-          "status": "ok",
-          "timestamp": 1474675634683,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "tmWcCxSilYkg",
-        "outputId": "8a135ccf-e706-457c-f4bc-2187039ffd92"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[ 2.  3.  4.]\n",
-            " [ 5.  6.  7.]]\n"
-          ]
-        }
-      ],
-      "source": [
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "\n",
-        "with tf.Session():\n",
-        "    input1 = tf.constant(1.0, shape=[2, 3])\n",
-        "    input2 = tf.constant(np.reshape(np.arange(1.0, 7.0, dtype=np.float32), (2, 3)))\n",
-        "    output = tf.add(input1, input2)\n",
-        "    print(output.eval())"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "JuU3Bmglq1vd"
-      },
-      "source": [
-        "Recall that you can pass numpy or Python arrays into `constant`.\n",
-        "\n",
-        "In this example, the matrix with values from 1 to 6 is created in numpy and passed into `constant`, but TensorFlow also has `range`, `reshape`, and `tofloat` operators. Doing this entirely within TensorFlow could be more efficient if this was a very large matrix.\n",
-        "\n",
-        "Try experimenting with this code a bit -- maybe modifying some of the values, using the numpy version, doing this using, adding another operation, or doing this using TensorFlow's `range` function."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "gnXnpnuLrflb"
-      },
-      "source": [
-        "##  Multiplying matrices"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Ho-QNSOorj0y"
-      },
-      "source": [
-        "Let's move on to matrix multiplication. This time, let's use a bit vector and some random values, which is a good step toward some of what we'll need to do for regression and neural networks."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 2353,
-          "status": "ok",
-          "timestamp": 1474675637053,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "uNqMaFR8sIY5",
-        "outputId": "b630554e-68b3-4904-c07d-f28a0a41bbd2"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Input:\n",
-            "[[ 1.  0.  0.  1.]]\n",
-            "Weights:\n",
-            "[[ 0.3949919  -0.83823347]\n",
-            " [ 0.25941893 -1.58861065]\n",
-            " [-1.11733329 -0.60435963]\n",
-            " [ 1.04782867  0.18336453]]\n",
-            "Output:\n",
-            "[[ 1.44282055 -0.65486896]]\n"
-          ]
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "\n",
-        "with tf.Session():\n",
-        "    input_features = tf.constant(np.reshape([1, 0, 0, 1], (1, 4)).astype(np.float32))\n",
-        "    weights = tf.constant(np.random.randn(4, 2).astype(np.float32))\n",
-        "    output = tf.matmul(input_features, weights)\n",
-        "    print(\"Input:\")\n",
-        "    print(input_features.eval())\n",
-        "    print(\"Weights:\")\n",
-        "    print(weights.eval())\n",
-        "    print(\"Output:\")\n",
-        "    print(output.eval())"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "JDAVTPhb22AP"
-      },
-      "source": [
-        "Above, we're taking a 1 x 4 vector [1 0 0 1] and multiplying it by a 4 by 2 matrix full of random values from a normal distribution (mean 0, stdev 1). The output is a 1 x 2 matrix.\n",
-        "\n",
-        "You might try modifying this example. Running the cell multiple times will generate new random weights and a new output. Or, change the input, e.g., to \\[0 0 0 1]), and run the cell again. Or, try initializing the weights using the TensorFlow op, e.g., `random_normal`, instead of using numpy to generate the random weights.\n",
-        "\n",
-        "What we have here is the basics of a simple neural network already. If we are reading in the input features, along with some expected output, and change the weights based on the error with the output each time, that's a neural network."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "XhnBjAUILuy8"
-      },
-      "source": [
-        "## Use of variables\n",
-        "\n",
-        "Let's look at adding two small matrices in a loop, not by creating new tensors every time, but by updating the existing values and then re-running the computation graph on the new data. This happens a lot with machine learning models, where we change some parameters each time such as gradient descent on some weights and then perform the same computations over and over again."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 2561,
-          "status": "ok",
-          "timestamp": 1474675639610,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "vJ_AgZ8lLtRv",
-        "outputId": "b8f19c28-a9b4-4fb3-9e90-6e432bf300a7"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[ -7.29560852e-05   8.01583767e-01]] [[ -7.29560852e-05   8.01583767e-01]]\n",
-            "[[ 0.64477301 -0.03944111]] [[ 0.64470005  0.76214266]]\n",
-            "[[-0.07470274 -0.76814342]] [[ 0.56999731 -0.00600076]]\n",
-            "[[-0.34230471 -0.42372179]] [[ 0.2276926  -0.42972255]]\n",
-            "[[ 0.67873812  0.65932178]] [[ 0.90643072  0.22959924]]\n"
-          ]
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "\n",
-        "with tf.Session() as sess:\n",
-        "    # Set up two variables, total and weights, that we'll change repeatedly.\n",
-        "    total = tf.Variable(tf.zeros([1, 2]))\n",
-        "    weights = tf.Variable(tf.random_uniform([1,2]))\n",
-        "\n",
-        "    # Initialize the variables we defined above.\n",
-        "    tf.global_variables_initializer().run()\n",
-        "\n",
-        "    # This only adds the operators to the graph right now. The assignment\n",
-        "    # and addition operations are not performed yet.\n",
-        "    update_weights = tf.assign(weights, tf.random_uniform([1, 2], -1.0, 1.0))\n",
-        "    update_total = tf.assign(total, tf.add(total, weights))\n",
-        "  \n",
-        "    for _ in range(5):\n",
-        "        # Actually run the operation graph, so randomly generate weights and then\n",
-        "        # add them into the total. Order does matter here. We need to update\n",
-        "        # the weights before updating the total.\n",
-        "        sess.run(update_weights)\n",
-        "        sess.run(update_total)\n",
-        "    \n",
-        "        print(weights.eval(), total.eval())"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "kSYJr89aM_n0"
-      },
-      "source": [
-        "This is more complicated. At a high level, we create two variables and add operations over them, then, in a loop, repeatedly execute those operations. Let's walk through it step by step.\n",
-        "\n",
-        "Starting off, the code creates two variables, `total` and `weights`. `total` is initialized to \\[0, 0\\] and `weights` is initialized to random values between -1 and 1.\n",
-        "\n",
-        "Next, two assignment operators are added to the graph, one that updates weights with random values from [-1, 1], the other that updates the total with the new weights. Again, the operators are not executed here. In fact, this isn't even inside the loop. We won't execute these operations until the `eval` call inside the loop.\n",
-        "\n",
-        "Finally, in the for loop, we run each of the operators. In each iteration of the loop, this executes the operators we added earlier, first putting random values into the weights, then updating the totals with the new weights. This call uses `eval` on the session; the code also could have called `eval` on the operators (e.g. `update_weights.eval`).\n",
-        "\n",
-        "It can be a little hard to wrap your head around exactly what computation is done when. The important thing to remember is that computation is only performed on demand.\n",
-        "\n",
-        "Variables can be useful in cases where you have a large amount of computation and data that you want to use over and over again with just a minor change to the input each time. That happens quite a bit with neural networks, for example, where you just want to update the weights each time you go through the batches of input data, then run the same operations over again."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "fL3WfAbKzqr5"
-      },
-      "source": [
-        "## What's next?\n",
-        "\n",
-        "This has been a gentle introduction to TensorFlow, focused on what TensorFlow is and the very basics of doing anything in TensorFlow. If you'd like more, the next tutorial in the series is Getting Started with TensorFlow, also available in the [notebooks directory](../notebooks)."
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "default_view": {},
-      "name": "Untitled",
-      "provenance": [],
-      "version": "0.3.2",
-      "views": {}
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
diff --git a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb b/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
deleted file mode 100644
index b0963eb..0000000
--- a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
+++ /dev/null
@@ -1,863 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "6TuWv0Y0sY8n"
-      },
-      "source": [
-        "# Getting Started in TensorFlow\n",
-        "## A look at a very simple neural network in TensorFlow"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "u9J5e2mQsYsQ"
-      },
-      "source": [
-        "This is an introduction to working with TensorFlow. It works through an example of a very simple neural network, walking through the steps of setting up the input, adding operators, setting up gradient descent, and running the computation graph. \n",
-        "\n",
-        "This tutorial presumes some familiarity with the TensorFlow computational model, which is introduced in the [Hello, TensorFlow](../notebooks/1_hello_tensorflow.ipynb) notebook, also available in this bundle."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Dr2Sv0vD8rT-"
-      },
-      "source": [
-        "## A simple neural network\n",
-        "\n",
-        "Let's start with code. We're going to construct a very simple neural network computing a linear regression between two variables, y and x. The function it tries to compute is the best $w_1$ and $w_2$ it can find for the function $y = w_2 x + w_1$ for the data. The data we're going to give it is toy data, linear perturbed with random noise.\n",
-        "\n",
-        "This is what the network looks like:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 681,
-          "status": "ok",
-          "timestamp": 1474671827305,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "q09my4JYtKXw",
-        "outputId": "4938066b-231d-4078-e2dd-fd223eca7c9f"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJYAAABkCAYAAABkW8nwAAAO90lEQVR4Xu2dT5Dc1J3Hv+YQT8VJ\nZUhVdprLWs4FTSrGGv4ql9CuHBCH4GaTFCLZwnIcjOAy8l6Q/1SlU4XHcg6xJgtY2OOik2KxSGoT\nGWrXzYFC2T2MDAtWitRavmQ0e9k2SYGowom4hNRPtqA9TE+rW3/cPfPepcfup6f3fu/Tv9/T+/PV\npo8//vhjsMQsULAFNjGwCrYoKy6xAAOLgVCKBRhYpZiVFcrAYgyUYgEGVilmZYUysBgDpViAgVWK\nWVmhDCzGQCkWGEuwrly5gtf++zW887/vYOn/lnD5T5cT40x9ZQrb/nEbxDtFiHeI2LJlSylGY4X2\nt8BYgUVAvfzqy3i5/TI+vPLhmq37wpYv4AHpATxw3wMMsP4cFJ5jbMAiqA4eOYg/Lv8xMcL26e34\n+vTXk8+vbv1q8n/03TsX38EfLv4h+aRE380dmmNwFY7O2gWOBVgE1Y/2/yjxUls+vwXaY1oS7tZK\n3v94MJ8zceUvV0Dea+H4AoOrQrhGHqxuT0Xjp0P7D2HqH6Yymejyu5dx5PiRZBxGnmt+bj7TdSxT\nfgv0ASuAzglwmyE8pfbZu3VaEDkDdT+AweevzGolvPjvL+LMb84knmr+yHxmqNKyCK7ZQ7OJ5yIo\n+3m6clqx8UrNB1bso2W64FQN9cnijdcdAvNAQWGRPBcLicX3Ua8S84FVcj3PnjuLhRcWkgH63OG5\nXHc7+NTBZEBP47NvffNbucpiF/e3QCaw2g0NfNvES5c+wtQ9u2G0LCj8BLAiFEaeBU0zYJ9fxkfY\njKl7FZgtCzIHIA7QUmXov/g9LmMztt6rwLBMyFROj3TkZ0fgveXh4X96GN//zvf7t2aNHGlI7VlW\n0pYmRC+AKUwAsQu5thOuvIjQEjGBGJ7CQYptdOw6etc6VzXXzcUZwJrGseWt2P28DV2I4OgyDgQK\nFgMTYtQ1xqq10eDuR6j8Fi1NxGTkwpAfRos7h05bQscQIFgibEeHMBHCVhs4EBtY8lQQd6ulvbN7\n8e6f302mC7Z/bXsuo9NkKk1X9PZ+IUyeR0sN4GscYl8DPzOP5VuPYynQwMU+dL4O3wzRbpQQ93O1\nbvQuzgRWS0p/tQA6Nuqcilq7A5u3Px28T7qw7BB1VUHqhEKTB2+pCAIVHZVD3dPgujpE6peOBzes\nQRS5nr/+b//g24nF7JN27qkCGq/J++RknHXm5JlVeiKGr/MQPQMdV0ZkCRBbNUwEMYzQhRyZEHgH\nOv29ynPM6HXtja1Rf7B4AZ7RgZv+SuMAOj+NtrYEX3avfyqMfDi2DdcLEAQBvPOX8MGtR3Ex0MEF\nJiRxP373wWZsvaeBhixDVRrg1/jxlwEWPV3ap+xVrR57Cjgpht2xEDV4mLIFvqkiaoUwwzp4U4Hv\n9/awN7YrR+vuGcAS4ZsdtKV0VNEFVqMLrIkWJGEPPP4hKA0RgiCAc1XsdJQErGQ2Ig7hOQ5sx4Hz\n0u+wvHX2akjtMWCpNhQCiCicq+AcCx1Fh9B2IegcNN6B4Teg1z0EeknzKqPFRe7a9AeLm4ajXvzU\noJEDqUahMESrKxSqbQHbDBGLoXUNlBiuUsNOT8fFQEVsNdHmdOjStTgSGOCnLTQuBDBosLxKqnTw\nntw/glPnoHMS4E6iFVjgbBGcwUGMPAjtawP73GZf/wVkAutYtAvPezYUPoKjipBdGZ5vQOgavGte\nHbfsiXD09TZUIUbg6JD3vITlrU/iYthErPOYaQk44ZhocDF8U0HDqsEOHfQaC7/2X68lyzJVTjd0\nWiJu2XMem++7+tAxSd52+hguTe3GYtjq6V3XPyqDtbA/WLyAtqRg0rHhLceo3avCsk0kjqd7uoEL\n0FJkaC/9Hh/gS9ixS0dTCaD
KHVidNhoTNN2gQP/FedAmly/t2IWm2YK2xswqDbj3antzz5oToD/9\n15/i5smbcdo8vfaDQGiC37YfEyeW4KtcMu2g1HbCrp9Dx5Fw3ZCw04ZSb0Jse6CsLH1qgZFfK0zn\nn+hpznzKHGpJRzus4YJ/AX/78G94ofUC7r777pwMxAhdE6pyAK8u78CJJZ+BtcKiIw8Wea0DTx34\nZCH5oHYwM1y0TjhnziXbaWgB+4cP/RCPPfYYtm/fjpMnT+Kmm24aDrDYhdpoQdAbaMtNSB4Da6Uh\nRx4sqnB3SCTPNbtvtu9iMoU/Wg5Kt9p0h8DTp09j3759ePrpp/H4448PB1fylOtC5jTUGVifseFY\ngJXClXou+jcN6Gk2nj7JG1Gi7TG0Hkiz7OlGP/ru6OGjq46rnnjiCSwuLibe66677hocMAZWT5uN\nDVgpXGfbZ5OtybQNZq1EE6G0NXmXtGvNwbrv+4n3uu222wYPjwys9QFW2goKjbQ4Tdth6CAFeSpK\n5J3oQMUwhynS8PjMM89AVdVs3ouBtb7Aytbrw+WiMZfnednCIwOLgTUIZml43LFjB5577rnhnx4H\nuek6yztWY6yqbb+wsJBMTwwUHquu5Ijej4GVoWMoPJ4/fz7xXkM9PWa4x3rLwsDK2KMXLlxIvBeF\nR5qe2LRpU8YrN2Y2BtaA/U7hkaYnnn322exPjwPeYz1kZ2AN2YtpeCTvdeeddw5Zyvq9jIGVo28p\nPJL3ok2NLDxeb0gGVg6w0kvT8HjixIlkHJY1lauaE8GRangwsvD/noKqt+kzsLJSkCEfzdi/8cYb\nifdaKzxWoppDmxJ5FT54NH06YZShAQVmYWAVaEwqKg2PMzMzyfTEyqfHqlRzAoOH6OqwJnXoNQeB\nSWcjq0sMrJJsferUqSQsdofHylRzYg8aLyG0QtiTOvhGhFZglyKD0Mt8DKySwEqLpfD45ptvYn5+\nHr/+z19/sukwj2pOP72vyJXBy4BNME340Pg6AiNAu8IDkQysksGi4t9++2189wffxee++DkIO4Tc\nqjlrSw504Eg81FobYetq+KOwKDgagjVOnRdtBgZW0RZdpbw0BL73/nv4yZM/6bv7tVeVxkk1h4FV\nAVgbUTWHgVUBWGUcvCVV6EP/cuiztQ9NCNsMiIshrPSIeaK3oUNIlXQqaDMDqwIjlyEV0Fv6MoQl\nbENT/FTIhWSXOF2AF5jocei8cCswsAo36WcLLEPchO7yyr+9smrt6TQ3geQmcgcd2CQbIHoIDKGy\nuSwG1joEi06oU+jj3RAWR2HQgFiiTuxqJmRgVQBWGaGQDo78/OjPe9T+qpfSeBeeqIM3JPip4k8F\n7aVbMLAqMHSlg/dr7YkcCZxWg1Jz0G5UL7/EwKoArBuhmoNEbupBvPrRDhxf8qFVLFrCwKoArFQi\n4P3o/VwTpCmgdBi3r2oOIrQbNdwfGljytZ46r2U1n4FVlmW7yn3rrbfwvX/+XrKkMyPM5FLNIS2K\nbCrSNI8loKX48G6AxhIDq2SwaIcDgWWaJn71H78qRDWnlxbF1aaQxJILj6TRjRhm0L4hYrwMrJLA\nos1+BBXtyaLty5SKVs1Zverx1RB4dhIPPe/CVioeXF2rFAOrYLDIOxFQd9xxRwLVytSt90XfFaGa\nU3ATCimOgVWIGa8WkoY9AorA6pUIrqJVcwpsRiFFMbAKMONqYS9LsWWo5mS5bxV5GFg5rExhj8ZP\ndHBitbCXo+ixv5SBNWQXpmGPvNXtt98+ZCnr9zIG1oB9O2zYG/A2Y5+dgZWxC1nYy2goNt2Q3VA0\njqIDESzsZbcZ81hr2CoNe/T56KOPZrcqy8m2zazGAAt7+X8ZzGOtsCELe/mhohLGEqwyVFpY2CsG\nqLSUsQKrDJUWFvaKBWrswCpDpYWFvXKgKiYUxh5U/huwhd8idBqYRARX4bHTldd8Le8gTSpap
YWW\nX0is47qnveTdi02I6aFOejlAbSdcOT2fF8NTOEixDTqnV6Uk0CC2GpW8hYTCyFXA72yj8XoAAzoE\n+nsxgNnrZc8DtL7bU9HJlDwqLY9855FkbY8ktS3LWlGLECbPo6UG8DUOsa+Bn5nH8q3HsRRo4GIS\nL6vDN0O0e70SdoB2rfeshYBF71Juyzzu90TcF59FIC8WJvSVvgiT9nnPH5nP/K7CtOPonYWzh2aT\nF2Fu+usmvPjLF3us7cXwdR6iZ6DjyogsAWKrhokghhG6kCMTAu9Ap7+r1l0cQwoLAote4+ugwT+I\nsxO78XrQKkTkqzsEkqeily8Nk0il5cfHfowv3/xlLBxf6Pk2sNhTwEkx7I6FqMHDlC3wTRVRK4QZ\n1sGbCnxfrfxgwjBtvtHXFAZW7OsQZo7hEm7Fkxf8nm+mH6TBlau0RG00OBWcY6Gj6BDaLgSdDn46\nMPwG9Hr15/MGsdco5S0GrDiAIU7D5M/AgIo9gY6Lng4+5wi3jIOea59wieCQzgEnAe4kWoEFzhbB\nGRzEyIPQDmBWpaoxSpQMUZdCwCLh1OlmDWcCBzJsSNzDiIyL8LR8Ur1lHE2nPeZzh+d6mooENW7Z\ncx6b7zuHTlvCJB1Nnz6GS1O7sUhKxDl/LEP00Vhekh8sUjThNUyYAdxr59dCSwSvAWbg5Xq7exkq\nLfRO6TMnz/TurNAEv20/Jk4swaf2xC6U2k7Y9XPoOBIm6crYh6UoaLodABOoSU3YlpLbQ48lQT0q\nnR+sEq1RBlj0dGmfsnPVOtB51IMmfEdGLQ7RkkSYkps8VbJ01QIjDdaNCIVZwOi4DnxOgsRRXIzh\nazwakY3gmphsljLWe56RBqv6wfvg3R0HFqS6CcHxC5kQHrwGo3nFSIN1Q1RaBuinyDchSyYmDRct\nhWPLPF22G2mwuo+k55kgHUylJRtZoa1A0kI0bAdGPRnSszQuYFE90yUdepoznzKHWtLRDmsglZY8\ncHZTE7UVCGqEpmtDScZZLK20wEh7LKpst9YBKQUf1A5mhovWCefMuU9eM9JbWnEQMAIY/DQOXLr+\nmqmHXkfIdj18YpSRByuFa6+2F1f+cgXkuWb3zfZdN6Twt/DCQuKpsgmVDQIXy9vPAmMB1krPRf9e\nryot/TpsXL4fG7BSuNa7Ssu4gNOvnmMFVtqY9azS0q/DxuX7sQRrXIy7kevJwNrIvV9i2xlYJRp3\nIxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3Ixf9d0NIelzdt4X5\nAAAAAElFTkSuQmCC\n",
-            "text/plain": [
-              "\u003cIPython.core.display.Image at 0xa646e50\u003e"
-            ]
-          },
-          "execution_count": 1,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from __future__ import print_function\n",
-        "\n",
-        "from IPython.display import Image\n",
-        "import base64\n",
-        "Image(data=base64.decodestring(\"iVBORw0KGgoAAAANSUhEUgAAAJYAAABkCAYAAABkW8nwAAAO90lEQVR4Xu2dT5Dc1J3Hv+YQT8VJZUhVdprLWs4FTSrGGv4ql9CuHBCH4GaTFCLZwnIcjOAy8l6Q/1SlU4XHcg6xJgtY2OOik2KxSGoTGWrXzYFC2T2MDAtWitRavmQ0e9k2SYGowom4hNRPtqA9TE+rW3/cPfPepcfup6f3fu/Tv9/T+/PVpo8//vhjsMQsULAFNjGwCrYoKy6xAAOLgVCKBRhYpZiVFcrAYgyUYgEGVilmZYUysBgDpViAgVWKWVmhDCzGQCkWGEuwrly5gtf++zW887/vYOn/lnD5T5cT40x9ZQrb/nEbxDtFiHeI2LJlSylGY4X2t8BYgUVAvfzqy3i5/TI+vPLhmq37wpYv4AHpATxw3wMMsP4cFJ5jbMAiqA4eOYg/Lv8xMcL26e34+vTXk8+vbv1q8n/03TsX38EfLv4h+aRE380dmmNwFY7O2gWOBVgE1Y/2/yjxUls+vwXaY1oS7tZK3v94MJ8zceUvV0Dea+H4AoOrQrhGHqxuT0Xjp0P7D2HqH6Yymejyu5dx5PiRZBxGnmt+bj7TdSxTfgv0ASuAzglwmyE8pfbZu3VaEDkDdT+AweevzGolvPjvL+LMb84knmr+yHxmqNKyCK7ZQ7OJ5yIo+3m6clqx8UrNB1bso2W64FQN9cnijdcdAvNAQWGRPBcLicX3Ua8S84FVcj3PnjuLhRcWkgH63OG5XHc7+NTBZEBP47NvffNbucpiF/e3QCaw2g0NfNvES5c+wtQ9u2G0LCj8BLAiFEaeBU0zYJ9fxkfYjKl7FZgtCzIHIA7QUmXov/g9LmMztt6rwLBMyFROj3TkZ0fgveXh4X96GN//zvf7t2aNHGlI7VlW0pYmRC+AKUwAsQu5thOuvIjQEjGBGJ7CQYptdOw6etc6VzXXzcUZwJrGseWt2P28DV2I4OgyDgQKFgMTYtQ1xqq10eDuR6j8Fi1NxGTkwpAfRos7h05bQscQIFgibEeHMBHCVhs4EBtY8lQQd6ulvbN78e6f302mC7Z/bXsuo9NkKk1X9PZ+IUyeR0sN4GscYl8DPzOP5VuPYynQwMU+dL4O3wzRbpQQ93O1bvQuzgRWS0p/tQA6Nuqcilq7A5u3Px28T7qw7BB1VUHqhEKTB2+pCAIVHZVD3dPgujpE6peOBzesQRS5nr/+b//g24nF7JN27qkCGq/J++RknHXm5JlVeiKGr/MQPQMdV0ZkCRBbNUwEMYzQhRyZEHgHOv29ynPM6HXtja1Rf7B4AZ7RgZv+SuMAOj+NtrYEX3avfyqMfDi2DdcLEAQBvPOX8MGtR3Ex0MEFJiRxP373wWZsvaeBhixDVRrg1/jxlwEWPV3ap+xVrR57Cjgpht2xEDV4mLIFvqkiaoUwwzp4U4Hv9/awN7YrR+vuGcAS4ZsdtKV0VNEFVqMLrIkWJGEPPP4hKA0RgiCAc1XsdJQErGQ2Ig7hOQ5sx4Hz0u+wvHX2akjtMWCpNhQCiCicq+AcCx1Fh9B2IegcNN6B4Teg1z0EeknzKqPFRe7a9AeLm4ajXvzUoJEDqUahMESrKxSqbQHbDBGLoXUNlBiuUsNOT8fFQEVsNdHmdOjStTgSGOCnLTQuBDBosLxKqnTwntw/glPnoHMS4E6iFVjgbBGcwUGMPAjtawP73GZf/wVkAutYtAvPezYUPoKjipBdGZ5vQOgavGteHbfsiXD09TZUIUbg6JD3vITlrU/iYthErPOYaQk44ZhocDF8U0HDqsEOHfQaC7/2X68lyzJVTjd0WiJu2XMem++7+tAxSd52+hguTe3GYtjq6V3XPyqDtbA/WLyAtqRg0rHhLceo3avCsk0kjqd7uoEL0FJkaC/9Hh/gS9ixS0dTCaDKHVidNhoTNN2gQP/FedAmly/t2IWm2YK2x
swqDbj3antzz5oToD/915/i5smbcdo8vfaDQGiC37YfEyeW4KtcMu2g1HbCrp9Dx5Fw3ZCw04ZSb0Jse6CsLH1qgZFfK0znn+hpznzKHGpJRzus4YJ/AX/78G94ofUC7r777pwMxAhdE6pyAK8u78CJJZ+BtcKiIw8Wea0DTx34ZCH5oHYwM1y0TjhnziXbaWgB+4cP/RCPPfYYtm/fjpMnT+Kmm24aDrDYhdpoQdAbaMtNSB4Da6UhRx4sqnB3SCTPNbtvtu9iMoU/Wg5Kt9p0h8DTp09j3759ePrpp/H4448PB1fylOtC5jTUGVifseFYgJXClXou+jcN6Gk2nj7JG1Gi7TG0Hkiz7OlGP/ru6OGjq46rnnjiCSwuLibe66677hocMAZWT5uNDVgpXGfbZ5OtybQNZq1EE6G0NXmXtGvNwbrv+4n3uu222wYPjwys9QFW2goKjbQ4Tdth6CAFeSpK5J3oQMUwhynS8PjMM89AVdVs3ouBtb7Aytbrw+WiMZfnednCIwOLgTUIZml43LFjB5577rnhnx4Huek6yztWY6yqbb+wsJBMTwwUHquu5Ijej4GVoWMoPJ4/fz7xXkM9PWa4x3rLwsDK2KMXLlxIvBeFR5qe2LRpU8YrN2Y2BtaA/U7hkaYnnn322exPjwPeYz1kZ2AN2YtpeCTvdeeddw5Zyvq9jIGVo28pPJL3ok2NLDxeb0gGVg6w0kvT8HjixIlkHJY1lauaE8GRangwsvD/noKqt+kzsLJSkCEfzdi/8cYbifdaKzxWoppDmxJ5FT54NH06YZShAQVmYWAVaEwqKg2PMzMzyfTEyqfHqlRzAoOH6OqwJnXoNQeBSWcjq0sMrJJsferUqSQsdofHylRzYg8aLyG0QtiTOvhGhFZglyKD0Mt8DKySwEqLpfD45ptvYn5+Hr/+z19/sukwj2pOP72vyJXBy4BNME340Pg6AiNAu8IDkQysksGi4t9++2189wffxee++DkIO4TcqjlrSw504Eg81FobYetq+KOwKDgagjVOnRdtBgZW0RZdpbw0BL73/nv4yZM/6bv7tVeVxkk1h4FVAVgbUTWHgVUBWGUcvCVV6EP/cuiztQ9NCNsMiIshrPSIeaK3oUNIlXQqaDMDqwIjlyEV0Fv6MoQlbENT/FTIhWSXOF2AF5jocei8cCswsAo36WcLLEPchO7yyr+9smrt6TQ3geQmcgcd2CQbIHoIDKGyuSwG1joEi06oU+jj3RAWR2HQgFiiTuxqJmRgVQBWGaGQDo78/OjPe9T+qpfSeBeeqIM3JPip4k8F7aVbMLAqMHSlg/dr7YkcCZxWg1Jz0G5UL7/EwKoArBuhmoNEbupBvPrRDhxf8qFVLFrCwKoArFQi4P3o/VwTpCmgdBi3r2oOIrQbNdwfGljytZ46r2U1n4FVlmW7yn3rrbfwvX/+XrKkMyPM5FLNIS2KbCrSNI8loKX48G6AxhIDq2SwaIcDgWWaJn71H78qRDWnlxbF1aaQxJILj6TRjRhm0L4hYrwMrJLAos1+BBXtyaLty5SKVs1Zverx1RB4dhIPPe/CVioeXF2rFAOrYLDIOxFQd9xxRwLVytSt90XfFaGaU3ATCimOgVWIGa8WkoY9AorA6pUIrqJVcwpsRiFFMbAKMONqYS9LsWWo5mS5bxV5GFg5rExhj8ZPdHBitbCXo+ixv5SBNWQXpmGPvNXtt98+ZCnr9zIG1oB9O2zYG/A2Y5+dgZWxC1nYy2goNt2Q3VA0jqIDESzsZbcZ81hr2CoNe/T56KOPZrcqy8m2zazGAAt7+X8ZzGOtsCELe/mhohLGEqwyVFpY2CsGqLSUsQKrDJUWFvaKBWrswCpDpYWFvXKgKiYUxh5U/huwhd8idBqYRARX4bHTldd8Le8gTSpapYWWX0is47qnveTdi02I6aFOejlAbSdcOT2fF8NTOEixDTqnV6Uk0CC2GpW8hYTCyFXA72yj8XoAAzoE+nsxg
NnrZc8DtL7bU9HJlDwqLY9855FkbY8ktS3LWlGLECbPo6UG8DUOsa+Bn5nH8q3HsRRo4GISL6vDN0O0e70SdoB2rfeshYBF71Juyzzu90TcF59FIC8WJvSVvgiT9nnPH5nP/K7CtOPonYWzh2aTF2Fu+usmvPjLF3us7cXwdR6iZ6DjyogsAWKrhokghhG6kCMTAu9Ap7+r1l0cQwoLAote4+ugwT+IsxO78XrQKkTkqzsEkqeily8Nk0il5cfHfowv3/xlLBxf6Pk2sNhTwEkx7I6FqMHDlC3wTRVRK4QZ1sGbCnxfrfxgwjBtvtHXFAZW7OsQZo7hEm7Fkxf8nm+mH6TBlau0RG00OBWcY6Gj6BDaLgSdDn46MPwG9Hr15/MGsdco5S0GrDiAIU7D5M/AgIo9gY6Lng4+5wi3jIOea59wieCQzgEnAe4kWoEFzhbBGRzEyIPQDmBWpaoxSpQMUZdCwCLh1OlmDWcCBzJsSNzDiIyL8LR8Ur1lHE2nPeZzh+d6mooENW7Zcx6b7zuHTlvCJB1Nnz6GS1O7sUhKxDl/LEP00Vhekh8sUjThNUyYAdxr59dCSwSvAWbg5Xq7exkqLfRO6TMnz/TurNAEv20/Jk4swaf2xC6U2k7Y9XPoOBIm6crYh6UoaLodABOoSU3YlpLbQ48lQT0qnR+sEq1RBlj0dGmfsnPVOtB51IMmfEdGLQ7RkkSYkps8VbJ01QIjDdaNCIVZwOi4DnxOgsRRXIzhazwakY3gmphsljLWe56RBqv6wfvg3R0HFqS6CcHxC5kQHrwGo3nFSIN1Q1RaBuinyDchSyYmDRcthWPLPF22G2mwuo+k55kgHUylJRtZoa1A0kI0bAdGPRnSszQuYFE90yUdepoznzKHWtLRDmsglZY8cHZTE7UVCGqEpmtDScZZLK20wEh7LKpst9YBKQUf1A5mhovWCefMuU9eM9JbWnEQMAIY/DQOXLr+mqmHXkfIdj18YpSRByuFa6+2F1f+cgXkuWb3zfZdN6Twt/DCQuKpsgmVDQIXy9vPAmMB1krPRf9eryot/TpsXL4fG7BSuNa7Ssu4gNOvnmMFVtqY9azS0q/DxuX7sQRrXIy7kevJwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3Ixf9d0NIelzdt4X5AAAAAElFTkSuQmCC\".encode('utf-8')), embed=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "fBQq_R8B8rRf"
-      },
-      "source": [
-        "Here is the TensorFlow code for this simple neural network and the results of running this code:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 7741,
-          "status": "ok",
-          "timestamp": 1474671834967,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "Dy8pFefa_Ho_",
-        "outputId": "318456b0-f9de-4717-d9c7-956b5d390d05"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAAESCAYAAAAhTatLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xt8k+X9//FXjj2fDwkFBApCObSAICI6qSgoIF8QnNs8\nDPEAsxseBvKb7MA8O5U5nbOAIoqimydUTooWWkQEReUoyqml0JK0packTQ9J7t8fhXKmp6Q5fZ6P\n7QG9k9x5pynx0+u67s+lUhRFQQghhBBCuI3a2wGEEEIIIQKNFFhCCCGEEG4mBZYQQgghhJtJgSWE\nEEII4WZSYAkhhBBCuJkUWEIIIYQQbqZ1x0lef/113n//fVQqFb179+app55Cr9e749RCCOERc+fO\nJTc3l4SEBFasWAHAgw8+SEFBAQBVVVXExMSwfPlyABYuXMgHH3yARqPhz3/+M1deeaW3ogsh/EC7\nR7DMZjNvvvkmH374IStWrMDpdLJ69Wp3ZBNCCI+ZPHkyixcvPu3Y888/z/Lly1m+fDnXXXcdo0eP\nBuDAgQOsWbOG1atX88orr/DII48gLQSFEBfililCl8uF3W7H4XBQW1tLcnKyO04rhBAeM3ToUKKj\no897+5o1a5gwYQIAOTk5jBs3Dq1WS5cuXejWrRs7duzoqKhCCD/U7gLLYDAwbdo0MjMzueqqq4iK\nimLEiBHuyCaEEF6xdetWEhMT6dq1K9A4Ut+pU6em2w0GA2az2VvxhBB+oN0FVnV1NTk5Oaxfv54v\nv/ySmpqapvUMQgjhj1auXMkNN9zQ9PW5pgNVKlVHRhJC+Jl2F1ibNm2ia9euxMbGotFoGD16ND/8\n8MMFHyNrF4QQvsrpdPL5558zduzYpmNGo5GjR482fW0ymVq0FEI+64QIXu2+ijAlJYXt27dTV1eH\nXq9n8+bNpKenX/AxKpWK0lJLe5/aLZKSonwii6/kAMniyznA97L4s3MVQF999RWpqakYDIamY6NG\njWL27NnccccdmM1mCgsLycjIaPb8vvRZ11a+9PPWFv6eH/z/Nfh7fmjbZ127C6yMjAyuu+46Jk2a\nhFarpV+/ftx8883tPa0QQnjUrFmz2LJlC5WVlWRmZjJz5kymTJnCmjVrTpseBOjVqxdjx45l/Pjx\naLVa5s2bJ1OEQogLUileGsP2lWrWVyprX8kBksWXc4DvZREX5ivvVVv50s9bW/h7fvD/1+Dv+aFt\nn3XSyV0IIYQQws2kwBJCCCGEcDMpsIQQwkNWb8rHJVcSChGUpMASQggPyf5gB/nF1d6OIYTwAimw\nhBDCgyqt9d6OIITwAimwhBDCg6x2KbCECEZSYAkhhAdZahq8HUEI4QVSYAkhhAdJgSVEcJICSwgh\nPMgiU4RCBCUpsIQQwoNkBEuI4CQFlhBCeEioXoOlRkawhAhGUmAJIYSHREeGyAiWEEFKCiwhhPCQ\n6Ag9lpoGFOnmLkTQkQJLCCE8JCZCj8Pporbe6e0oQogOJgWWEEJ4SExkCAAWu0wTChFspMASQggP\niY7QA8hCdyGCkNbbAYQQLWO1WVmW+wY1umrCHdHcNvIOIiIivB1LXEDTCJYsdBci6EiBJYSfePnT\nF1la9AFaRywJ6kTq6+uZOelBb8cSFxAjI1hCBC2ZIhTCD9gabLyzdyXWeg2O2iTKyzvz2ZY93o4l\nmnFiBMsqI1hCBB23FFgWi4X77ruPsWPHMn78eLZv3+6O0wohgFpHLa/uWECdQyGkrhMhDZ1BUXCV\nJ3s7ml+bO3cuI0aMYMKECacdf/PNN7n++uuZMGECzz33XNPxhQsXMmbMGMaOHcvGjRtb9BzRkSdG\nsKTAEiLYuGWK8IknnmDkyJG8+OKLOBwOamtr3XFaIYJeg7OBJbtepdBSyKDQURz6vhtK5DHU1kSu\nSO/l7Xh+bfLkydx+++3MmTO
n6diWLVtYv349K1euRKvVUl5eDsCBAwdYs2YNq1evxmQyMW3aNNau\nXYtKpbrgc8REnFiDJVOEQgSbdo9gWa1Wtm7dypQpUwDQarVERka2O5gQwc7pcvLmntfZX7mP9MQM\nFvzuT0y6JIOrkycx8ZJ07r9vmLcj+rWhQ4cSHR192rF33nmHe+65B6228XfP+Ph4AHJychg3bhxa\nrZYuXbrQrVs3duzY0exzxJwYwZI2DUIEnXaPYB05coS4uDgefvhhfvrpJwYMGMCf//xnQkND3ZFP\niKCkKAr/+/ltdpftondcb27rNxWtWstDDw0nKSmK0lKLtyMGpIKCArZu3crzzz9PSEgI/+///T8G\nDBiA2Wxm0KBBTfczGAyYzeZmzxcWokWrUckIlhBBqN0jWA6Hgx9//JFbbrmF5cuXExoayqJFi9yR\nTYigpCgKy/e/z3fmrXSL7s4dA+5Gq5YLfjuC0+mkurqad999l4ceeoj7778f4Jxb3TQ3PXjiPlHh\nelmDJUQQaventtFoxGg0kp6eDsB1113Hq6++2uzjkpKi2vvUbuMrWXwlB0iWcwkNVTN//ncUF4eS\nkmJn9uyhREa2vw+VxWI77bxpk0x8f2wLvZK7M3vEbMJ14Wc9xle+J4HGaDQyZswYADIyMtBoNFRU\nVGA0Gjl69GjT/UwmE8nJLbvIIC4qlOIyq1+/Z/6cHfw/P/j/a/D3/G3R7gIrMTGRTp06kZ+fT48e\nPdi8eTM9e/Zs9nG+MsXhK9MtvpIDJMv5cjz22Ca2bctEpVJRUKDw6KO5PPTQ8Haf+9lnNzed97vK\ntSx3LOXa4f25rddd2Cqd2Dj99fvK9wT8/0PzzJGpa6+9lq+//ppLL72U/Px8GhoaiIuLY9SoUcye\nPZs77rgDs9lMYWEhGRkZLXqOML2a2nonRcWV6HUaT7wMj/Kln7e28Pf84P+vwd/zQ9s+69wy7/CX\nv/yF2bNn43A46Nq1K0899ZQ7TiuETzGZwpqmhVQqFSZTmFvOe/iIwiHl31QnbsUenU9KdTemD8wi\nSh/d/INFm82aNYstW7ZQWVlJZmYmM2fOZMqUKTz88MNMmDABnU7HP/7xDwB69erV1IZGq9Uyb968\nFk0RAkSFn2zVkBDjfwWWEKJt3FJgpaWl8cEHH7jjVEL4LIOhBpNJQaVSoSgKRqPdLec16ZZTnlaM\nPXwvuLTElYcQH5rglnOL85s/f/45jz/77LPnPD5jxgxmzJjR6ueJDNcBYLHXkxAjF/8IESxk5awQ\nLZSVlUF2di4mUxhGo5177013y3mTh1TjKtuGzqHBqOtExpAYt5xX+IZTR7CEEMFDCiwhWigyMsIt\na65OdbByP4ccB0lMDKdvfD+i9NF0sXd263MI74o6MYIlrRqECCpSYImAYbHYWLBgJyZTGAZDDVlZ\nGW65ys9TjlgOs3jnInp16k28NR6VXU2yM5lbR071djThRlFhMoIlRDCSAksEjAULdjZdjWcyKWRn\nu+cqv7a6UMFnrjGzaEc2dc56pmbcyaDkS7yWU3jWyREsKbCECCZu2exZCF/gqav82upEwWc2D2f7\n9qvJzt4JQHntMRZu/w+2Bhs39b5ZiqsAJ1OEQgQnKbBEwDAYapr6GrnzKr+2OlfBZ6mvZtH2l6mq\nq+KG1P9jeMoIr2YUnieL3IUITjJFKPzeiam4I0d0WK1v0q1bV7p2pV1X+Z05vTdv3hWtPseZbR0S\njBUs3L6GUnsZ13Ybw9UXXdPmfMJ/hIdqUatUWGXDZyGCihRYwu+duvYqMvJKunZt/9qrM9dzPffc\n1/z+94Mv+Jgzi7KpU3uxdGljW4dEYxW6X2zhqO0oV3S+kuu7j2tXPuE/1CoVkWFamSIUIshIgSX8\nnifWXp15zuLi5htEnlmULV3aWOg5XA4W71zI3opihhiGcmOvm1rcBVwEhqhwPRWWOm/HEEJ0I
Cmw\nhN/zRIf1M8+ZklLb7GPOLMoOH1FYsOol1lfmYHFa+L/+N/KrPrdIcRWEosJ1FJXZcDhdaDWy9FWI\nYCAFlvB7nuiwfuY5Z88egd3uuuBjzizKTPoPOVS/izJVKVHhMSgmFxq17EUXjCKPL3S32huIjQzx\nchohREeQAkv4PU90WD/znJGREdjtF94N/tSizGCsYZuxgCJ7KRH6KNLi0zhWfcytGYX/OLUXlhRY\nQgQHKbCEcJNTi7LPCtaQu6GCMG04afF9Uas0JOuTvZxQeEtUmPTCEiLYSIElhJtYbVbezlvKVsu3\nHKk7zCVdhpBkS8ZSbSFZL1vgBDPphSVE8JECSwg3eTtvKRuUPPKdB9CF6EmwJXL/hFnejiV8gHRz\nFyL4SIElhJvssu4k33kQrVpH3/j+2Gw2b0cSPkJGsIQIPnK9sBBu8FP5Hg7W7UeNmrT4voRpw2TN\nlWjSNIIl3dyFCBoygiVEO+VXHeT1Xa/SO6UvidYEXDWKrLkSpzk5giVThEIECymwhGgjq83Kyzkv\n8kXFZ+jUeh7JfJKhXS/1dizhgyLDGj9qZYpQiODhtilCl8vFjTfeyO9+9zt3nVIIn7Zo3X9YVbMC\nm76GhIREvt/5rbcjiVaYO3cuI0aMYMKECU3HXnrpJa666ipuvPFGbrzxRjZs2NB028KFCxkzZgxj\nx45l48aNrXoujVpNRKjsRyhEMHHbCNbSpUvp2bMnVqvVXacU4oLO3Fw5KyuDyMgIt93/Qipqy/m8\nYi0ObQM9YnqSFJ5MSWVJW1+K8ILJkydz++23M2fOnNOOT5s2jWnTpp127MCBA6xZs4bVq1djMpmY\nNm0aa9eubdW2R1HhehnBEiKIuGUEy2QykZeXxy9/+Ut3nE6IFjmxubLZPJzt268mO3unW+9/JqvN\nyqLVL/O3D+cy44M7cTlddI26CEOEEUVRZFG7nxk6dCjR0dFnHVcU5axjOTk5jBs3Dq1WS5cuXejW\nrRs7duxo1fNFheuw2Rtwuc4+vxAi8LilwHryySeZM2eObGIrOtSZmyubTGFuvf+Z3s5byq6QXXzt\n/IojqiP0DO3FKM21xFXGkWZPk0XtAWLZsmVMnDiRP//5z1gsjdsjmc1mOnXq1HQfg8GA2Wxu1Xmj\nwvUogLVWRrGECAbtniLMzc0lMTGRvn37smXLlhY/Likpqr1P7Ta+ksVXcoB/ZElNdVJermnaXDk1\n1XXB3K29/5mqNeUctOylTrHTNbYzvTQ9+PNt/6/Vr8cdfOn9CSS33HILv//971GpVDz//PM8/fTT\nPPHEE+cc1WrpL5Qn3quk+HAAdCE6v3v//C3vmfw9P/j/a/D3/G3R7gLr+++/Z926deTl5VFXV4fN\nZmPOnDk888wzF3xcaemFN87tKElJUT6RxVdygP9kmTq1DzU1X2AyhWE02pk6Nf2CuVt7/1PFJYSx\nq+JHyl2VJIYlkRJ6EZG1sV75Pvna+xNI4uPjm/5+8803N120YzQaOXr0aNNtJpOJ5OSWTQmfeK90\nx+cLCosqCdP4z2i/L/28tYW/5wf/fw3+nh/a9lnX7gLrj3/8I3/84x8B+Oabb3jttdeaLa6EcIdT\nN1f2xP1PcCkuFn+/mJjEWNLK0uhOKsZao0wJBoAzR6ZKS0tJSkoC4PPPP6d3794AjBo1itmzZ3PH\nHXdgNpspLCwkIyOjVc8VFSbd3IUIJtIHS4gLUBSF937+Lzsrv6d3QhrTr74XnUbn7VjCDWbNmsWW\nLVuorKwkMzOTmTNnsmXLFvbs2YNaraZz5848+uijAPTq1YuxY8cyfvx4tFot8+bNa/WaU9mPUIjg\n4tYCa9iwYQwbNsydpxTCLVrbosFqs7Is9w02VX9FaUMJ4weN5a5+06W4CiDz588/69iUKVPOe/8Z\nM2YwY8aMNj+f7EcoRHCRvQhFUGhti4a385ayzpXDfmUvd
aF1hB8LJ1Qb2kFpRSA6OYIlBZYQwUCm\nCEVQaG2Lhu8sWzniLCREE0q/hP5UNlR2REwRwJpGsOwyRShEMJARLBEUDIaapgXNiqJgNNrPe9+t\npm84XFeIVq0jLaEfOrUeQ6iho6KKABUZJiNYQgQTGcESQSErK4Ps7NymFg333pt+zvvtLNvB/35+\nh4yuA0m0JFJrrSNZn8zdN91NTY2rg1OLQKLTqgnVa2SRuxBBQgosERRa0qJhX8Ve3vrxdXRqLdMH\nZdE9pkfTbREREdTU+HcfF+F9UeE6GcESIkjIFKEQQEFVPkt2vQLAtAH3nFZcCeEuUeF6rPaGc3aG\nF0IEFimwRNA7ai1m8c6FNLgc3NbvDi6O6+3tSCJARYfrcboULHYZxRIi0EmBJYJaaU0pC3e8TI3D\nzq/6/Ib0xNZ15xaiNbokN/ZeO2SS6WYhAp2swRIBobWNRAEqaytYtONlLPUWbrx4CkON0iRXeFZq\npxgA8ourSU9N8HIaIYQnyQiWCAitbSRqrbewcMfLlNeWM7bHOK7sfFUHJRXBrEdKNAAHj1Z7OYkQ\nwtNkBEsEhJY2ErXarLyxfjFrKz6lQWlg2pB7uOaiMR0ZVQSxmAg9CdGhHCyuRlGUVu9nKITwHzKC\nJQJCSxqJHjx0kMtmDeSJHx7hu8Pf4lQ5KS8ok//IiQ7VIyUaq72B0qpab0cRQniQFFgiIGRlZTBo\nUC4Gw2YGDco9q5Go1WZlwlOjKe1XijPOSUOMgwNb91PaUOqlxCJYpXZqnCbML5ZpQiECmUwRioDQ\nXCPRt3JfpzKlEpVGBSpQOaBBX0+yPrkDUwoBqcfXYeUfreayfrIFkxCBSgosEfAURWFj1QZUWhWo\nVKgUUFwKUZYobh051dvxRJDpZohCrVJxUEawhAhoUmAJv9XS1gyrDq6g3FFOakpPircV06CrJ8oS\nxaqHvyAi4sKtHIRwtxC9hs5JERwyW3A4XWg1slJDiEAkBZbwWydaMzReNaiQnZ171jThusLPWX84\nh8tTryC2MpaqzCqS9cncOnKqFFfCa1JTojlcYqWo1EY3Y5S34wghPEAKLOG3mmvNsKloI6sOriQu\nJI4/DL6f2NA4b8QU4iypnaLJ21bMwaPVUmAJEaCkwBJ+y2CowWRq7CV0ZmuG781b+XDf+0TqIpk+\nMIvY0Lg2dXsXwhOaGo4WV3H14M5eTiOE8IR2T/6bTCZ++9vfMm7cOCZMmMDSpUvdkUuIZp2vNcPu\nsl2889MyQrUhzBiYRXJ445WCre32LgLf3LlzGTFiBBMmTDjrtsWLF5OWlkZlZWXTsccff5wxY8Yw\nceJE9uzZ0+bnTUmIIESvIf+o7EkoRKBq9wiWRqPh4Ycfpm/fvthsNiZPnswVV1xBz5493ZFPiPM6\ntTWD1Wbltc8Wsmb/aopDj9AlugvPXvcCKZEnRwda2u1dBI/Jkydz++23M2fOnNOOm0wmNm3aREpK\nStOxvLw8CgsLWbt2Ldu3b2fevHm8++67bXpetVpFD2MUPxdWYq9zEBYikwlCBJp2j2AlJSXRt29f\nACIiIujZsyclJSXtDiZEa7ydt5SPjnzI/vifsYXbsCk1bPwh77T7tKTbuwguQ4cOJTo6+qzjTz75\n5FlFV05ODpMmTQJg4MCBWCwWysrK2vzcPVKiUYAC2ZdQiIDk1uuDjxw5wk8//URGRoY7TyvEBVlt\nVj7Z9TH7a/dRV11HlKqxz1BJ/emFfnPd3oUAWLduHZ06daJPnz6nHS8pKcFoNDZ9bTAYMJvNbX6e\nEx3dZeNnIQKT28albTYb9913H3Pnzm3R5e9JSb5z5YyvZPGVHOBfWRate5HDCQVQq6DWqWkw1RPZ\nI4LU+ItOe2xoqJrw8BBCQ/WEhztJSopq1SL3M3NYLDbmz/+O4uJQUlLszJ49tMMWzfvS+xNIamtr\nWbBgAa+99tpZt50Y/
TxVS/axPN97daleC8t3UXSsxuffT1/P1xx/zw/+/xr8PX9buKXAcjgc3Hff\nfUycOJFrr722RY8pLfWNxZ1JSVE+kcVXcoD/ZDGXmvnjG39gk2MjREAnXSfsNbXobDpGh49l4pBf\nnfbYZ5/d3NQ3q6BA4dFHz+6b1Zoc7Tlfe/ja+xNICgsLKSoqYuLEiSiKgtlsZvLkybz33nsYDAZM\nJlPTfU0mE8nJzW+1dKH3KjZSz56CckpKqn1203Ff+nlrC3/PD/7/Gvw9P7Tts84tBdbcuXPp1asX\nU6fKtiOi48x6YyZb477FWeVEpVfhqHVy7cDrSLOnMX1c1ln3d/cid1k0HxhOHZnq3bs3X331VdPX\no0aNYvny5cTExHDNNdewbNkyxo0bx7Zt24iOjiYxMbFdz52aEsP3e0upsNQRHx3arnMJIXxLu9dg\nfffdd6xYsYLNmzczadIkbrzxRjZs2OCObEKcV62jll3aXTgVJ9FRMYSXRlBvaiDNnnbe/QXdvchd\nFs37v1mzZvHrX/+a/Px8MjMz+eCDD067/USPNYCRI0fSpUsXRo8ezd/+9jfmzZvX7ufv0anxt2LZ\nl1CIwNPuEawhQ4a0qx+MCGzubu5ptVl5M3cJn1espabWSki0nuiQaJQe0Odon3OOXJ2QlZVBdnYu\nJlMYRqO93Yvc3X0+0fHmz59/wdtzcnJO+/pvf/ubW58/NSUGgPyj1QxNa366UQjhP6T5ivColuwX\n2Bpv5b7OytoVVKrL6d4lldrtdiKSozCoknl26r8u+NhT+2a5g7vPJ4JPd2MUKmQES4hAJAWW8Ch3\nrlNSFIW8qlwqKScmJJY+8X1JiEzgL5P+7qa0QnSssBAtKUkR5B+tpsHhRKfVeDuSEMJN3NoHS4gz\nxcWVs3//l+ze/S379m0gLq68TedRFIXl+9+nylFJpC6K3nFpqFCRrJdpFeHf+nePp97h4ufDlc3f\nWQjhN6TAEh6lUmmA/kAfVKoBx79uvU8LVvNV0UZ+0WskE8ImklideMEF7UL4i/SeCQDsPNC2Xz6E\nEL5JpgiFR5WXx9CrV/xpX7fW+sIcvji0lqSwRH4/+H6i9GdvbSKEv+rdJZYQnYadB4/xGy72dhwh\nhJtIgSXapbmrBA2GGkwmpely99a2Mvjy0JesPPgJsSGxTB+YJcWVCDg6rZq+3eLYtr+M0ko7SbHS\nT02IQCBThKJdTlwlaDYPZ/v2q8nO3nna7e3Z/29byfcs27mMCF0E0wdmER+a4O74QviE9NTGUd6d\nB495OYkQwl1kBEu0S3NXCba1lcGeYz+ybM+bxEREcEe/GRjCDW7J6+6+XEK4Q3rqiXVYxxh1SRcv\npxFCuIMUWKJdLjQF2NZi5mDlft7YvRiNSs0fhv2BWJfRbXnd3ZdLCHdIjA2jU0I4eworpF2DEAFC\npghFu1xoCrC56cMzWW1W/vHJE8z4+C52HtrBzam3cHGCexf9yv6BwlelpyZQ3+Bi7+Eqb0cRQriB\njGCJdrnQFGBri5kX1/yTt46+T71DIUndma+2buC6wZluzdveRfdCeEp6agJrvz3MzoPH6N8jvvkH\nCCF8moxg+RGLxcazz25m1qztPPPM11itNm9HuqCWboZstVn514pneeWnJVTX1qCz9KP2WD8+2+L+\nPS7bs+heCE/q3TUWvU4tC92FCBAyguVH/G39UEs2QzaXmpny7A0cTi6kztmAxpWIo6ECSMFV7v4u\n7bJ/oPBVOq2avhfFsf3AMcoq7SRKuwYh/JoUWH7E39YPtaSYmf3GfRzqWkCD2olLo0LZYUfRuIiw\n9eOK9F4dlFQI35DeM4HtB46x8+AxrparCYXwazJF6EdaOuXmL+qcdezS7sTpVFCcOjTqMIhREWHq\nz8RL0rn/vmHejihEh2pq13BQts0Rwt9JgeVHAmX9kNVmJXvVv7nlnV9iq7WiVsLRWiL
BpiHc1IWR\nPX/DQw8Nl/5UokVWr16N1WoF4IUXXuCuu+5i165dXk7VNkmxYRjjw9lzqIIGh8vbcYQQ7SAFlh85\nMeU2f/5Avy5A3sp9nVV1KzCpi+nWJZXI7TGEFaQSv/MqLnF+SleZGRGtkJ2dTWRkJDt27GDjxo1M\nmjSJxx9/3Nux2iw9NYG6Bid7j1R6O4oQoh2kwBIdxmqzsnDVf1i89xUOlh0gUhdJn+Q0UpOH0T1/\nDsaSyQwe+J3fjswJ79BqG5eSfvXVV/zyl79kwoQJ1NXVeTlV26X3PL5tzgG5mlAIfyYFlugwy3Lf\n4DPHp9i0VlwaF6oaNbt3HiP/hwyOHcvEar0elUrjtyNzwjtUKhWrV69m9erVXH755QA0NDR4OVXb\n9TnermHbvrKmNZdCCP/jlgJrw4YNXH/99Vx33XUsWrTIHacUAWiLZTNm21GMiZ3oZuuOqkRN+foU\nlCP3U1cXi8WSyPr1/jvyILzjL3/5CytXruSmm26ia9euFBQUcNlll7XosXPnzmXEiBFMmDCh6dgL\nL7zA//3f/zFp0iTuuusuSktLm257/PHHGTNmDBMnTmTPHvf3aQPQaTUM6pVISaWdQ2aLR55DCOF5\n7S6wXC4Xjz32GIsXL2blypWsWrWKAwcOuCObCABWm5VFq19m+v+msd38A1pFR//kAfRPT+fG/pNJ\ntGeiIhzg+G/rvt08VfieSy65hJdffpmpU6cC0L17d/7617+26LGTJ09m8eLFpx27++67+eSTT/jo\no4/IzMzkpZdeAiAvL4/CwkLWrl3Lo48+yrx589z7Qk4xrG/j5ubf/FjisecQQnhWuwusHTt20K1b\nNzp37oxOp2P8+PHk5OS4I5sIAEs+f4U3Ta+TV7WeurA6DEcMGCxG0uxp3DpyKldfHUtMzAZCQrYQ\nE7OBzMyYC57P37rZC897+umnsVgsOBwObrnlFgYNGsTHH3/coscOHTqU6Ojo045FRJycorbb7ajV\njR+TOTk5TJo0CYCBAwdisVgoKytz06s4XXpqPGEhGr79yYxLpgmF8EvtLrDMZjOdOnVq+tpgMFBS\nIr91icbRq9d+eIX8uoPYnXbCQsNwaV38ZdLfmT4ui4iICB54YBhTpjgYNcrJlCkOHnjgwr2vWruB\ntAh8mzZtIioqio0bN2IwGPjss8947bXX2nXO559/nszMTFasWMF9990HQElJCUajsek+BoMBs9nc\nruc5H51WwyUXJ3Gsuo6DRdUeeQ4hhGe1u5N7WxdhJiVFtfep3cZXsvhKDnBPlvlrn6AiohzUoNNo\nqamzoVeH699cAAAgAElEQVRrTzt3UlIUzzxjvMBZTs9SWRlDSIjutK876vsWaO9PoPn2228ZPXo0\nBoOhaceDtnrwwQd58MEHWbRoEW+99RYzZ84852ddS56nre/VtcO789UuEzsKyrl8sHd7l/j7z5u/\n5wf/fw3+nr8t2l1gGY1GiouLm742m80kJze/h1xpqW8s3kxKivKJLL6SA9yTJb/qIGuKPyU8JBxH\nmROVSoW6XsPlGb9o1bnPzBITU0lBQQMqlQpFUYiNreqQ71ugvT/u4gsfmgkJCcybN48vv/yS6dOn\n43A4cDqdbjn3DTfcwIwZM5g5cyYGgwGTydR0m8lk8uhnXee4UCJCtXz5QxGTRnRHrW5f0dhWvvTz\n1hb+nh/8/zX4e35o22ddu6cI09PTKSwspKioiPr6elatWsU111zT3tMKP1ZkOcLinQsJ0YQypPsw\nekVdTEpEZ4ZEDuXO66a369yB0s1euM/8+fPp0aMH//znP4mJicFkMjFt2rQWP/7MkalDhw41/T0n\nJ4fU1FQArrnmGj766CMAtm3bRnR0NImJiW54Beem1agZmpZMla2enw9L01Eh/E27R7A0Gg1//etf\nufPOO1EUhZtuuomePXu6I5vwQyU1JSzakU2to44
/X/V3du3eTslFJSTrk7l15NTTFhC3RUs2kBbB\nJT4+nttuu438/Hz2799P9+7dmTx5coseO2vWLLZs2UJlZSWZmZnMnDmTvLw88vPzUavVpKSk8Mgj\njwAwcuRI8vLyGD16NGFhYTz11FOefFkADEtLJm9bMd/sMdO3W5zHn08I4T4qxUud7HxluNBXhi59\nJQe0PUtFbTkv/fAClXWV3NT7Zi5PucJrWdzNV3KA72Xxtp07d3Lfffeh1+tRFAWHw8G///1v+vfv\n7+1oQPs+61wuhT/+5ytcLoV//uEKtJqO7w3tSz9vbeHv+cH/X4O/54e2fda1ewRLBDerzcqCVf/h\n3e3vcCy8jOjwaP567SNuKa6EaIknnniCJ598sqmL++bNm3nsscf473//6+Vk7adWq7g0LZmc746w\n51AF6akJ3o4khGgh2SqnBaT30vm9nbeUt4pf5+hFRdQm2Kmqr+KTz5Z7O5YIIna7vam4Ahg+fDh2\nu92LidxrWN/GhfTf/OiZlhBCCM+QAqsFpPfS+R2tO0qlphKXoqBVadE6tZgV6YMmOk5YWBibN29u\n+vqbb74hLCzMi4ncq2fnGOKjQ/h+XykNDvdcHSmE8DyZImwBkymsqd+NSqXCZAqcD+/2cLgc5Ncd\nQAVoVGo0Ki0apwaDvvlL14Vwl7lz53L//fej1+uBxo2eX3zxRS+nch+1SsWwNAOfflPIroPlDO6d\n5O1IQogWkAKrBQyGGkwmpan3ktEYONMPbeVSXCzbs5SYxFhG1F3J/p/309DQQL/4fjw79V/ejieC\nSEZGBmvXriU/Px9FUejRowdjxowhNzfX29HcZli/ZD79ppBNu01SYAnhJ6TAaoGsrAyys3MxmcIw\nGu1B23vJYrHx4r+3sPFQDhVdthDT1cn1fcfz3G0voNPomj+BEB6i0+no3bt309deujjaY7oZouiS\nFMm2fWVUWuuIjQzxdiQhRDOkwGoB6b3U6Ll/5vLGj49T2/MwuFzEH0tCX6qT4kr4nPZuleNrVCoV\nVw9O4c21e/lyx1EmjOju7UhCiGZIgSVabGX+y9RedghFVw9AzW4rFT0rvJxKBKv9+/ef9zaHw9GB\nSTrG8P5G3l1/gA3bihg/vJvXts4RQrSMFFiiWVablbfzllKavA2XqhaVokathKKE1ZMsC9qFl0yf\nfv5tl0JCAm8KLSxEy/D+BvK2FbPz4DEG9vLcNj1CiPaTAks06+28pXzJBtQGF2odqCxqtKEuuqq7\ncOvIqd6OJ4LUunXrvB2hw2UO6kzetmJyfyiSAksIHycFVoCyWGwsWLATkykMg6GGrKwMIiPbtg/g\nbtsu8h0HSEnujK3AiqtGYYhuCM/+6V/t3ltQCNFy3YxR9OgUzY4DxyirspMYIy1jhPBVUmAFmBOF\n1WefWbBYYunRIx2TKZzs7Nw2LdTfV7GXA7X7UWnVDDD0J7JLJGn2NKaPy/JAeiFEczIHp7DkaDUb\nth9l8lWp3o4jhDgP6eQeYE50nT92LJPq6pEUFHzf5uaoBVX5LNn1Cj2SehKzM4P9K0I5siqUSUNv\n8kByIURLDOtrICxEy5fbi3E4Xd6OI4Q4DymwAsyJrvMhIU5UKhV1daFtao561FrM4p0LaXA5CNl9\nJTGH3iSlYgm6oy/yxhsHPZReCNGcEJ2GKwYYqbLVs21fmbfjCCHOQwqsAGMw1KAoCt27RxEdXUZC\nwn4GDcptVXPUElsJC3e8TI3Dzq/6/AaleIBsFSSED8kc3BmA9T8UeTmJEOJ8ZA1WgDm16/yQIXbu\nvXd0ixe3W21WFq9bwDrr5zQ0OPnjiDkMNQ5jneFr2SpICB+SkhhBn66x7DlUgam8BmN8uLcjCSHO\nICNYAeZE1/n58wfy0EPDW3Xl4JL1r/CJ/WMqNBVExUVzcG9jI8esrAwGDcolPn49NttbHD6s5Zln\nvsZqtXnqZQg
hmnFiFOuLrYe9nEQIcS5SYAkAah21fFGxllqHnS5RXegc2YWS+hLgZNHWtStERNxG\nefkv2L79arKzd3o5tRDBa0ifJBJjQtmw/ShV1jpvxxFCnEEKLEGDs4HFOxfRoDSQFJZMj9geAGd1\naT+xgB5kLZYQ3qbVqBk7vBsOp4vPvpVRLCF8TbsKrGeeeYaxY8cyceJEZs6cidVqdVcu0UEcLgdv\n7F7MwaoD3Jh+E9frx5FoSSTNnnZWl/YTC+gBWYsl/N7cuXMZMWIEEyZMaDp2oc+0hQsXMmbMGMaO\nHcvGjRu9EfksV6YbiY3Us/77Iqz2Bm/HEUKcol0F1pVXXsmqVav4+OOP6datGwsXLnRXLtEBXIqL\nd356iz3le0iLT2PawLv53fg/8OStTzJ9XNZZXdpPrMUyGDa3+spEIXzN5MmTWbx48WnHzveZtn//\nftasWcPq1at55ZVXeOSRR5p+2fAmnVbD9cMuoq7ByecyiiWET2lXgTVixAjU6sZTDBo0CJPJ5JZQ\nwvMUReGDve+yreQHesSkMrX/XWjVF76otD0L6IXwNUOHDiU6Ovq0Y+f7TFu3bh3jxo1Dq9XSpUsX\nunXrxo4dOzo887mMHNSZyDAdOd8doabW4e04Qojj3LYG6/333+eqq65y1+mEh606uILNR7+mc2Rn\n7kqfjl6j93YkIXzK+++/z8iRIwEwm8106tSp6TaDwYDZbPZWtNOE6DWMubQrNXUO1v9wxNtxhBDH\nNdsHa9q0aZSVnd0t+MEHH2TUqFEAZGdno9PpTlvL0JykpKhWxPQsX8nSUTk+3f8pm0s30D2xCw+N\neIiokLOf11e+J+A7WXwlB/hWlkB04jPthhtuADjndOCJCz6a0xHv1c1j0vjsm0I+33qEX1/Xl9AQ\n97Y49PefN3/PD/7/Gvw9f1s0+69wyZIlF7x9+fLl5OXlsXTp0lY9cWmppVX395SkpCifyNJROTYV\nbeSDfe8RFxLH7b3uprYaajn9eX3lewK+k8VXcoDvZQk05/pMMxqNHD16tOlrk8lEcnLyuR5+lo56\nr0Zd0oUVmwr4IGcvYy7t6rbz+tLPW1v4e37w/9fg7/mhbZ917Zoi3LBhA6+++irZ2dno9TLF5Ou+\nN2/lw33vE6mLZPrALGJD47wdSQivOnNk6nyfaaNGjWL16tXU19dz+PBhCgsLycjI6Oi4FzT60q6E\n6DR8uuUQDQ7ZBFoIb2vXOPLjjz9OQ0MDd955JwADBw7k73//uztyCTfbXbaLd35aRqg2hBkDs0gO\nb9lv30IEqlmzZrFlyxYqKyvJzMxk5syZLFy48Jyfab169WLs2LGMHz8erVbLvHnzWjxF2FEiw3Rc\nPbgzn35TyPofitw6iiWEaD2V4qVrjX1luNBXhi49meNA5T4W7chGrVIzPSOLHjGpXsvSWr6SxVdy\ngO9lERfWke+VpaaePy3cjAp4asZwosLbP7PgSz9vbeHv+cH/X4O/5wcvTBEK31dYfYjFOxehKAp3\n9L+72eJKCOG/osL1TLyiOzV1Dj7emO/tOEIENSmwApTVZuW5T57m7uVT2Z7/A1N63Eyf+DRvxxJC\neNioIV0wxIeT+0MxRaWyu4YQ3iIFlo+yWGw8++xmZs3azjPPfI3VamvV419Zl83Kmk+w6azEJySy\nfdcPHkra/qxCCPfRatT8alQvXIrCf3P2+UTHeSGCkRRYPmrBgp1s25aJ2Tyc7duvJjt7Z4seZ7VZ\neWHFfJYceJWSUjOdw7qSHGGgpL7E57IKITxjYM8E+veIZ3dBBdsPHPN2HCGCkhRYPspkCmu6Skml\nUmEyhbXoca+vf5UV9o9xhDvQRmmxHragKArJes9dNdjWrEIIz1CpVPx6VC/UKhX/y9mHwyltG4To\naFJg+SiDoaZpaF9RFIxGe7OPqXXU8nnFWuyOGvokpdFZ1QWlDtLsadw6cqpPZ
RVCeFbnpEgyB6dg\nrrCz7jvZQkeIjube/RREu1gsNhYs2InJFEZcXC39+q2lvDwGo9HOvfemX/CxDc4Glux6lQalnsSw\nZHrE9oRYSEtKY/q4LI/mzsrKIDs7F5MprEVZhRAdY+KVPdi828zHXxVwWX8jMRHSEFqIjiIFlg85\nsZapcZpNYdCgXObPH9js45wuJ2/ueZ39lfuYNGAyrqMuyqrKSNYne3Tk6oTIyAgeemi4x59HCNE6\nUeF6brwqlWWf72Xppz/xh8npPtcgVYhAJQWWD2nLWiZFUfjfz2+zu2wXveN6c2f6dLSXyNsqhGh0\n9SWd2fpTCT/sK+Pr3SZGDOjk7UhCBAVZg+VDWruWSVEUlu9/n+/MW+kW3Z07BtyNVi3FlRDiJLVK\nxZ3j+xKi17Ds832UV9d6O5IQQUEKLB+SlZXBoEG5GAybGTQot9m1TJ8WrOaroo2kRKZwd/oMQjQh\nHZRUCOFPkmLD+PWoXtjrHCxZ85P0xhKiA8hwhw9pzVqm9YU5fHFoLUlhiUzPuJdwXbiH0wkh/NlV\nA1P4bm8puw6Wk7etmMzBnb0dSYiAJiNYfmhz8SZWHvyE2JBYpg/MIkof7e1IQggfp1KpmDa2L+Eh\nWv63bj8lldJORQhPkhEsP2G1WXnts0WsPPAJ5pCjdIvpzovjFxAfmtChOU5tJWEw1JCVlUFkZESH\nZhBCtE1cVAi3ju7NKyt/5LWVP/LQLYPRqOX3bCE8Qf5l+Ym385by4ZH3OBh3gJpwOzaXjbXfrOnw\nHLItjhD+bXh/A0P6JLH3SBXv5x7wdhwhApYUWH7AarPy0a4POVh3gPraOqJ10SgqxaP7C56PbIsj\nhH87MVVojA/ns28O8/Vuk7cjCRGQpMDycVablbsW3Mbu0F04cKDSqbFb7ISoQj26v+D5uGNbHIvF\nxrPPbmbWrO0888zXWK02d8cUQlxAeKiWmVPSCQvR8Pqanzhksng7khABRwosH7cw5z/s0O9AF6Yl\nXBeBtkBDWHEY42Ju6JAu7WdqbSuJc5FpRiG8r1NCBPdM6I/D4eLfH+6g2lbv7UhCBBRZ5O6jrDYr\nr67L5vWDr1FbZyc6LIbI5Ej0MXp+Y7jN4/sLno87tsWRaUYhfMOgXolMuiqV5RsOkv3RLmb9ehBa\njfzeLYQ7uOVf0uLFi0lLS6OystIdpxPAa+sX8UnNxzjDHUQmRaM9oEVfGkK3Y929MnLlTu6YZhSi\nvebOncuIESOYMGFC07FPP/2UG264gb59+7J79+7T7r9w4ULGjBnD2LFj2bhxY0fH9ZgbLu/GkD5J\n/Hy4kne+2CdNSIVwk3YXWCaTiU2bNpGSkuKOPEHNarPy74+e58b543lxxz85WllMz9iL0ZTEYq+O\nIunwFTzz238REeHfbRHcMc0oRHtNnjyZxYsXn3asd+/evPTSS1x66aWnHT9w4ABr1qxh9erVvPLK\nKzzyyCMBU4ioVCruGt+XLkkRrP+hiOVf5ns7khABod0F1pNPPsmcOXPckSXovZ23lJWVn/BT3B7q\nQuqpa6ij8GA5iuVSlL0j2fvFb7nnnjy/XRR+YnH7vHn7URSFRx7pxUMPDZc+WsIrhg4dSnT06U16\nU1NT6d69+1nFU05ODuPGjUOr1dKlSxe6devGjh07OjKuR4XqtfzxV4NIjg1j5aYCVn1d4O1IQvi9\ndhVY69ato1OnTvTp08ddeYKaqc7E0YZiGlwNREVHEVEUQcNhDfXfROI6/BT19Zdx6NBEv10ULovb\nhb8ym8106tSp6WuDwYDZbPZiIveLjQxh9m8GER8dwgd5B/li62FvRxLCrzW7yH3atGmUlZWddfyB\nBx5g4cKFvPbaa03HWjNknpQU1eL7epovZHEpLkwcwaVyEqYLJSYkhoSuCUQfGcKP5bdRqwmnpqaC\n+voGcnJs/O1vao+O/LTme2Kx2Jg//zuKi
0NJSbEze/bQc2arrIwhJER32tcteR5feH/Ad3KAb2UJ\nBuf6bDtxoUZz/Om9SkqK4qmsK/nTfzby9hf7SIyPYHRSlF+9hnPx9/zg/6/B3/O3RbMF1pIlS855\nfO/evRQVFTFx4kQURcFsNjNlyhTee+89EhKa376ltNQ3+q4kJUV5PYuiKKwu/hBdVCgDbYOxm+2o\nHGoyL76aX910K/fszmPnzhQUJRmtVkNlZQyPPrqp3VfznU9rvyfPPruZbdsyUalUFBQoPPpo7jmz\nxcRUUlDQgEqlQlEUYmOrmn0eX3h/fCkH+F6WYGA0Gjl69GjT1yaTieTklvWh85X3qqV0wB9vHsg/\n3v6Bf7+7Da1WzYCLYr0dq8186d9LW/n7a/D3/NC2z7o2t2no3bs3X331VdPXo0aNYvny5cTExLT1\nlEFJURQ+3v8hW499TWpcT/6R+U9CtaGn3WfZsnFMmvQ5x471IiSklu7dL8Fk8p3ptZa2XcjKyiA7\nOxeTKQyj0S6L24XXXWjU/dTbRo0axezZs7njjjswm80UFhaSkZHRERG9onNSJLN+NYhn3vmBf779\nPTdf3YvrhnVt8aidEMKNfbBOjEqIlrHarLydt5TN1V9jqj/KmIxrubv/784qrqCx99SYMQa2bx/a\n9H32pdYGBkMNJpPSbDZ39NASwl1mzZrFli1bqKysJDMzk5kzZxITE8Njjz1GRUUFv/vd70hLS+PV\nV1+lV69ejB07lvHjx6PVapk3b17AFxvdjFH86dZLePGDHby7fj/l1bX8+pqLUasD+3UL4S4qxUtV\nka8MF3pr6HLR6pdZ71pHYXUBIZpQbk3+DXdd/fvz3t9qtZGdvfO00R9PrcFq7ffEk9l8ZWjZV3KA\n72URF+Yr71WbabX8dcFXFJXZGNI7iXsm9EOv03g7VYv50r+XtvL31+Dv+aGDpwhF21ltVt77+X8U\nRhwiRBVC+kUZVDdUX/Axvjz648vZhBDtkxQXxsO3XcJLH+7ku72lVP13GzOnpBMVrvd2NCF8muyJ\n4AXPrX2KIu0RXGoXoWFhHDIdwhBq8HYsIYQ4p/BQHQ/ePIjL+hnYX1TFI69/y/4jVd6OJYRPkwKr\ng/1UvocvqzYQHxfPRTXdiCyPINwUwd3X3+3taEIIcV46rZp7JvTjxl/0oMJSx9PLvmf15kO4ZO2t\nEOckU4QdKL/qIK/vepUwTRg9ElOJ6RKLoiik2dOIiIigpsa/56iFEIFNrVIx4Yoe9O4ay8JPdvN+\n7gF+OlTB3Tf0IzpCpgyFOJWMYHWQIssRFu9ciFNx8ffMJxjmuoy4yjjS7Gl+v3mzECK49Lkojr/f\nOYwBqfHsyi9n3pJv2HHgmLdjCeFTZASrA5TUlLBoRza1jjpu7Xs7gw1DGNr10uYfKIQQPio6XM8D\nvxzIZ98U8mHeQf713naG9U3mN9dcTExkiLfjCeF1UmB5WEVtOQu3/wdrg5Wbet/MYMMQb0cSQgi3\nUKtUjL2sG+k9Enjjs5/4Zk8JOw+W88vMnlw1KAV1gPcKE+JCZIrQg6z1FhbueJnKukrGp97A5SlX\neDuSEEK4XZfkSB6+bQi3j+kNKCz97Geefut7DhTLlYYieMkIlofUNNSwaEc2pTWlXN31GkZdNNrb\nkZplsdhYsGAnlZUxxMRUkpWV4dENpYUQgUOtUnH1JV0Y3DuJt7/Yx9afSnhi6XcMvjiRyVel0jkp\n0tsRhehQUmC5mdVmZWnua3xe/hl1Sh23Dv4t41MneDtWiyxYsJNt2zIJCdFRUNBAdva5N20WQojz\niY0MIWvSAH4urOCDvIP8sK+MbfvKuHyAkUlX9iAx9tx7lQoRaKTAcrM3c5ewsvYTqjSVxIcmYims\nRpXuH+sQWrppsxBCNKfPRXE8fNslbD9wjA/zDrJpl4ktP5q5tG8y1116Ed2Mss2SCGxSYLmRS3GR\nW7meK
lUlsaFx9Iq7mLKqMm/HarETmzYDPrehtBDC/6hUKgb1SiSjZwLf/Ghm1deH2LzbzObdZvp0\njWXMsK4M7JUoi+FFQJICy00UReG9n/+LxVlNVHg0vePSUKEiWZ/s7WgtlpWVQXZ2LpWVMcTGVnHv\nvenejiSECABqlYrh/Y1c1s/A7vxyPvv2MLvzy/n5cCVJsaFcmd6JEQM6kRAT6u2oQriNFFhuoCgK\nKw58xDemLWRePIqIYxFUVFWQrE/2qyaiJzZtDoSdz4UQvkelUjEgNYEBqQkcKbGyduthvvnRzPIv\n8/noy3zSusVxZXonLumdRIhe4+24QrSLFFhukFO4lrwjuRjCDWQNvo9InVwtI4QQF9IlOZI7x/Xl\nN9dczLc/lfDVzqPsOVTBnkMV6LVqBqQmcEnvRAb2SiQiVOftuEK0mhRYbWS1WXk7bynfWbZyuK6Q\nYT0uY8bALCmuhBCiFcJCtFw1MIWrBqZgLq/hq10mtv5Uwvd7S/l+bykatYo+F8UysGci/XrEk5IQ\n3nQxjhC+TAqsNno7bylfsoGDzv1o9TqiK2OICYlt6iVlMoVhMNRILykhhGghQ3w4k69KZfJVqRSX\n2ZqKrB8LKvixoAKA2Eg9/brH0697HL27xpIQHSoFl/BJUmC10W7bLvIdB9CotfRL6E+NrQY42Uuq\nsc2BIr2khBCiDVISI0hJjOCGEd0pr65ld0H58UKrnE27TGzaZQIgJlJPr84x9OocQ8/OMXRNjiRE\nJ+u3hPdJgdUG+yr2cqB2PyqtmrT4foRpw5uuFpReUkII4V7x0aH8IiOFX2Sk4FIUikpt/FhQzv6i\nKvYXVfHdz6V893MpACoVGOPD6WaIoqshkvSLk4nQqYmN1MtIl+hQ7S6w3nzzTZYtW4ZOp2PkyJHM\nnj3bHbl8VkFVPkt2vUKPpJ7kb3Cw36IjMSSUP825CTjZS0qlUkkvKSF83Ny5c8nNzSUhIYEVK1YA\nUFVVxYMPPkhRURFdunThX//6F1FRjU0xH3/8cTZs2EBYWBhPP/00ffv29Wb8oKRWqeiaHEnX5Eiu\no/Eq7mPVtewvquJgUTWFJVYOl1g4eqyGzT+aeW/9AQBC9Ro6JUSQkhCOMSGc5LhwkmJDSY4NI1wW\n0QsPaFeBtWXLFtavX8/KlSvRarWUl5e7K5dPKrYWsXjnQhpcDkJ2X0nMobuIPV5IvfFGLg89lNzU\nS8pkCsNotJ+3l9SZa7XmzZONoIXoaJMnT+b2229nzpw5TccWLVrE5Zdfzj333MOiRYtYuHAhs2fP\nJi8vj8LCQtauXcv27duZN28e7777rhfTC2icKUiMCSMxJozh/YwAuBSFsko7hWYrx2z1HCis4Oix\nGgrNFvKPVp91johQLYkxYcRHhxAfFdr4Z3QocVEhxEToiYnUE6qXCR/ROu36iXnnnXe455570Gob\nTxMfH++WUL7GarOyaN3LrC1fg1qlYe5Vf2P5JzHnnAo80UuqOWeu1Xruua/5/e8He/R1CCFON3To\nUIqKik47lpOTw1tvvQXAjTfeyG9/+1tmz55NTk4OkyZNAmDgwIFYLBbKyspITEzs8NziwtQqFclx\nx0epTunr53S5KK2sxXSshtJKO6WVdkqO/3n0mI1D5vP3/wvRaYiJ0BMVoSMqTE9kuI6o8Ma/R4Rp\niQzVER6qJSJMR0SojrAQDSE6jUxLBrF2FVgFBQVs3bqV559/npCQEObMmUN6euB1/168bgEraj6m\nXldHt+ge/PjjLgyGwe2aCjxzrVZxsXQwFsIXlJeXNxVNSUlJTSPzJSUlGI3GpvsZDAbMZrMUWH5E\no1ZjjA/HGB9+1m2KomCrdVBeXcux6lrKq+uotNZRZa2nylZPlbWOSls9ZcW1uBSlRc+nUkGYXktY\niJawEA2hei0heg2heg2hOg0heg16XWMhFqLToNep0Wsb/9Rp1OiOf11
hd2Cz1KLVNh5v/FOFRqNG\nq1GhVqmkkPNBzRZY06ZNo6zs7P30HnjgAZxOJ9XV1bz77rvs2LGDBx54gJycnBY9cVKS72z0eWoW\ni8XG/PnfUVwcSkqKnRn39SHXmoNT00DPhFQuirkIW3UV8+ZdwXPPfX38frXMnj2iVe0YUlOdlJdr\nmgq0lJRan/2eeJuvZPGVHOBbWYKFco7/qLbkP2qB8F75+2toTf4ezdzucinU1DY0FV3VtnqsNfVY\n7Q1Yahoa/17TQE2dA5u9AXudA1ttAxWWOuz1NbhcLSvOWkOlAq1Gffz/xwsv9SkFmPr48ePHNGoV\navXxr9Xqpr+r1Y3F2sk/OeuYStU4QqhSn/J3lQq1qvHfw4ljqEBF43FUJ/6kad9JlUrF8ZuAxsed\n9ndOnq/xaOOdT9zW9MLhlPOcfHzTY0677eTRU4+d+liAHinR9EiJafsbclyzBdaSJUvOe9t///tf\nxowZA0BGRgZqtZqKigri4uKafWJf2YrlzG1hnn12c9PU3YHCGnIa7qcuvoGkcCOJeiM19noucsRg\nt1TVsb4AABLXSURBVLtOm9Kz213Y7S1/TVOn9qGm5oumtVqzZ4/w2e+JN/lKFl/JAb6XJdAkJCQ0\nTf2VlpY2LX0wGAyYTKam+5lMJpKTm99r1Ffeq7bypZ+3tvBUfj2QFKknKVLf4scoioLD6cJe76Su\n3kltvZN6h5P6eid1Dhf1DU7qG1w0OJzUO1zUOxr/rtPrqK6upcHposHhwuE88X/llL+7cDoVnC4F\nh0vB6Ww8n71OweVScLpcOF0KTqeCS1Fo4SBcUEqMCeWZe0ecdqwtn3XtmiK89tpr+frrr7n00kvJ\nz8/H4XC0qLjyZSem7lyqBo50WoBiL2fGoFuxH66htKrUbfsLnrlWKzIyolUFmhDCPc4cmRo1ahQf\nfvgh06dPZ/ny5VxzzTUAXHPNNSxbtoxx48axbds2oqOjZXpQtIpKpUKn1aDTauDsWcrz8kSR6FIa\nCy9FaSzKXK6Tx07exvFiTMGlcPJP18kiTeHU+9E0Qqc03Q4x0WFUVNU0/ltToPEuJ2/n+HmO/+94\n8XeyCFQ4+e/05LGTNzb9C1ZOHj9XAXnWY0+cHE49Qtdk9+zI0q4Ca/LkycydO5cJEyag0+n4xz/+\n4ZZQ3mQw1FBsaqDIuBBb6D76h/bgtvSpqDPU3o4mhHCzWbNmsWXLFiorK8nMzGTmzJlMnz6d+++/\nnw8++ICUlBReeOEFAEaOHEleXh6jR48mLCyMp556ysvphWg7tUqFWtM4JebpJhX+PgraVirlXAsL\nOoCvfLPPfOOPmk3cvPBuSjTFJDoNvDd9MSnGlA7P4U2SxXdzgO9lERfmK+9VW/nSz1tb+Ht+8P/X\n4O/5wQtThIFGURQey/srzl4meuhj6ROfysrvP2L6uKwOyyB7GQohhBD+T+a9TrHq4Ap+qvmRCF0E\nfeL7olVrKakv6dAMJ/pjmc3D2b79arKzd3bo8wshhBCi/WQE67h1hZ+z/nAOibokDHGd0Kq1KIrS\ntMdgR5G9DIUQQgj/JyNYwKaijaw6uJK4kDj+NfY/pNenE1cZR5o9zS1XDLaGwVBzytUSspehEEII\n4Y+CfgRry5EtfLjvfSJ1kUwfmEVyeHKHrrk6U0v3MhRCCCGE7wrqAmt32S7+d2ApodoQZhwvrjrC\nuRayn7hCoaV7GQohhBDCdwXtFGFlbQVLf3wNrVrLXekzSIns3GHPLQvZhRBCiMAWtCNYek0I/RIG\nMGHA9cQrnu9zdSpZyC6EEEIEtqAdwQrXhTO1/530SezT4c8tC9mFEEKIwBa0I1jeJAvZhRBCiMAm\nBZYXyEJ2IYQQIrAF7RShEEIIIYSnSIElhBBCCOFmUmAJIYQQQriZFFhCCCGEEG4mBZYQQgghhJtJ\ngSWEEP+/vTuPieL84zj+Xg6NBWx
VLHgQa7XWE6kX9QhaRFEEgSiY2NYGRdvaggcpAkZtPFOw9Ijx\nIN7GauqBptFqFA+ichQv2hC0Ui2islQ5BEVg2ef3h5F4gAcuO7v5fV9/scPsPJ8nzDx8Z3Z2HiGE\nMDEpsIQQQgghTOy1Cqzc3FwmTZpEUFAQEydO5M8/ZU49IYT127JlCwEBAQQEBLB161YAysrKmDp1\nKr6+vkybNo3y8nKNUwohLNlrFVgJCQlERESwb98+IiIiiI+PN1UuIYTQxN9//83u3bvZs2cP+/bt\n48SJE/z7778kJSUxePBgDh8+jKenJ+vWrdM6qhDCgr1WgaXT6erO4srLy3FxcTFJKCGE0EpeXh4e\nHh40a9YMW1tbBgwYwJEjRzh27BjBwcEABAcHc/ToUY2TCiEs2WtNlRMbG0t4eDjfffcdSil27txp\nqlxCCKGJ9957jx9//JGysjKaNWtGamoqvXv35s6dOzg7OwPQtm1bSkpKNE4qhLBkLyywwsLCuH37\n9jPL58yZw5kzZ5g/fz4+Pj4cOnSIuLg4Nm3a1CRBhRDCHLp06cL06dMJCwvDwcGB7t27Y2trq3Us\nIYSV0SmlVGPfPGDAALKysupe9+/fn7Nnz5okmBBCWIIffvgBV1dXtm7dyrZt23B2dua///5jypQp\n/P7771rHE0JYqNe6B8vFxYXMzEwA0tLSeOedd0yRSQghNFVcXAzAzZs3OXLkCP7+/nh7e7N3714A\nkpOTGTlypJYRhRAW7rWuYJ07d46lS5diNBpp3rw5ixYtomfPnqbMJ4QQZvfxxx9TVlaGnZ0dsbGx\neHp6UlpayuzZs7l16xbt27fnp59+omXLllpHFUJYqNcqsIQQQgghxLPkSe5CCCGEECYmBZYQQggh\nhIlJgSWEEEIIYWKaFljbtm1jzJgxBAQEsHLlSi2jsGHDBrp3705paalmGeLj4xk7diyBgYFERERQ\nUVFh1vZTU1MZM2YMvr6+JCUlmbXtxxUWFjJlyhT8/PyemAtOS0ajkeDgYL744gvNMpSXlxMZGcnY\nsWMZN24cFy9e1CzL5s2b8ff3JyAggKioKKqrqzXLYqks5Xh6FXFxcQwZMoSAgIC6ZdY0B2NDY4e1\n9KG6upqQkBCCgoIICAhg1apVABQUFBAaGoqvry9z587FYDBonPTFnh4zrakP3t7ejB8/vm6eZWjk\nPqQ0kp6ersLCwlRNTY1SSqk7d+5oFUXdunVLTZ06VX300UeqpKREsxynT59WtbW1SimlEhIS1MqV\nK83Wdm1trfLx8VEFBQWqurpajR8/Xl25csVs7T+uqKhI5eTkKKWUqqioUKNHj9YsyyObNm1SUVFR\n6vPPP9csw7x589Tu3buVUkrV1NSo8vJyTXIUFhYqb29vVVVVpZRSatasWSo5OVmTLJbKko6nV/HH\nH3+onJwc5e/vX7csPj5eJSUlKaWUWrdunUpISNAq3gs1NHZYUx/u37+vlFLKYDCokJAQdeHCBTVr\n1ix18OBBpZRSCxcuVDt27NAy4kt5esy0pj54e3ur0tLSJ5Y1Zh/S7ArWjh07mD59OnZ2Dx8m37p1\na62isHz5cqKjozVr/5EhQ4ZgY/PwT+Lh4UFhYaHZ2s7OzqZTp0506NABe3t7xo0bR0pKitnaf1zb\ntm3p0aMHAA4ODnTp0oWioiJNssDDs+KTJ08SEhKiWYaKigqysrKYMGECAHZ2djg6OmqWx2g0UllZ\nicFg4MGDB7z99tuaZbFElnQ8vYoBAwY88+iJlJQUq5mDsb6xQ6/XW1UfWrRoATy8mmUwGNDpdGRk\nZODr6ws8zH/kyBEtI75QfWNmenq61fRBKYXRaHxiWWP2Ic0KrGvXrpGVlUVoaCiffvopf/75pyY5\njh07Rrt27Xj//fc1ab8hu3fvxsvLy2zt6fV62rVrV/faxcVF06LmkYKCAnJzc3F3d9csw6MCXKfT\
naZahoKCAVq1aERsbS3BwMAsWLODBgweaZHFxcSEsLIwRI0bg5eWFk5MTQ4YM0SSLpbLU46kxiouL\nrXIOxkdjR9++fa1qHkmj0UhQUBBDhw5l6NChuLm50bJly7qTb1dXV4vfl54eM0tKSnjzzTetpg86\nnY5p06YxYcIEdu3aBdCofei1Jnt+kYbmMZw9eza1tbXcvXuXX3/9lezsbGbPnt1kZ3jPy7Fu3To2\nbtxYt0w18WPBnje3o7e3NwBr1qzB3t7+ifsgmlpT97sx7t27R2RkJHFxcTg4OGiS4cSJEzg7O9Oj\nRw8yMjI0yQBgMBjIyclh4cKF9OnTh2XLlpGUlERkZKTZs9y9e5eUlBSOHz+Ok5MTkZGR/Pbbb2bd\nXy2dJR5P/0+eHju0PDl6VTY2Nuzbt4+Kigq++uor8vLynlnHkvtT35iplHrmmLDkPuzcuZO2bdtS\nXFzM1KlT6dy5c6PyNmmB9byJn3fu3Mno0aMBcHd3x8bGhpKSElq1amW2HJcvX+bGjRsEBgailEKv\n19dVrG3atDF5judleSQ5OZmTJ0+a/cZuV1dXbt68Wfdar9dr+rGPwWAgMjKSwMBAfHx8NMtx7tw5\njh07xsmTJ6mqquLevXtER0cTHx9v1hyurq64urrSp08fAHx9fVm/fr1ZMzxy5swZ3NzceOuttwAY\nNWoU58+flwLrMZZ2PL2ONm3acPv27bo5GLW8neNl1Dd2WFsfABwdHRk4cCAXL17k7t27GI1GbGxs\nKCwstOh9qb4xc/ny5ZSXl1tNH9q2bQs8vHXJx8eH7OzsRu1Dmn1E6OPjQ1paGgBXr17FYDA0SXH1\nPN26deP06dOkpKRw7NgxXFxcSE5ObrLi6kVSU1NZv349a9asoVmzZmZtu0+fPuTn53Pjxg2qq6s5\ncOCApnOtxcXF0bVrVz777DPNMgDMnTuXEydOkJKSQmJiIp6enmYvrgCcnZ1p164dV69eBR7ez9Cl\nSxez5wBo3749Fy9epKqqCqWUplkslaUdT6/i6SsN1jYHY31jh7X0obi4uO7baQ8ePCAtLY2uXbvi\n6enJoUOHAMvOD/WPmStXrrSaPlRWVnLv3j0A7t+/z6lTp+jWrVuj9iHNpsqpqakhLi6O3Nxc7O3t\niYmJYdCgQVpEqTNy5Ej27NlTd2ZubqNHj6ampqau/b59+/Ltt9+arf3U1FSWLVuGUoqJEycyY8YM\ns7X9uLNnz/LJJ5/QrVs3dDodOp2OOXPmmPWetPpkZmayceNG1q5dq0n7ubm5zJ8/H4PBgJubGytW\nrMDJyUmTLKtWreLAgQPY2dnRs2dPli5dir29vSZZLJWlHE+vIioqioyMDEpLS3F2diYiIgIfHx9m\nzZplFXMwNjR2uLu7W8U8kpcuXSImJgaj0YjRaMTPz48vv/yS69evM3fuXO7evUuPHj1ISEiwiuPt\n8THTWvpw/fp1vv76a3Q6HbW1tQQEBDBjxoxGzUUqcxEKIYQQQpiYPMldCCGEEMLEpMASQgghhDAx\nKbCEEEIIIUxMCiwhhBBCCBOTAksIIYQQwsSkwBJCCCGEMDEpsIQQQli00NBQgoODGTduHL169SI4\nOJjg4GDi4uJeeVvh4eFPPGW/IbGxsVy4cKExcV9JTk4Ohw8fbvJ2hPnJc7CEEEJYhRs3bjBx4sS6\nWUDq82g6Fmuxa9cu0tLSSExM1DqKMLEmnYtQCCGEaEppaWnEx8fTrVs3Ll26RFRUFMXFxWzfvh2D\nwQDwxEwhw4cPZ/PmzXTu3JnJkyfzwQcfcP78eYqKivD392f27NkATJ48mZkzZzJs2DC++eYbHB0d\nycvLQ6/X069fP1asWAFAYWEh0dHRlJSU4ObmRm1tLd7e3kyaNOmJnLdv3yYqKoqSkhIAhg0bRnh4\nOKtXr+b+/fsEBwfj6elJTEwM58+fJzExkcrKSgAiIyPx8vIiP
z+fyZMn4+fnR1paGjY2NixatIh+\n/frVu/3o6Oim/wOIBkmBJYQQwqpdvnyZJUuW0Lt3bwDKysoIDAwEIC8vj/DwcI4fP17ve/V6Pb/8\n8gvl5eX4+PgQEhJChw4dnlnvypUrbNq0CaPRSGBgIJmZmQwaNIjFixfj5eVFeHg4BQUFjB8/Hm9v\n72fev3//frp27cqCBQsAKC8vx8nJiZkzZ5Kens73339fl33x4sVs2LCB1q1bo9frCQ0N5eDBg8DD\nQs3Dw4O4uDjS0tKIiori6NGj9W5faEsKLCGEEFbt3XffrSuuAK5du8bPP/9MUVERtra2FBUVUVpa\nWu88s2PHjgXAycmJzp07k5+fX2+BNWrUKOzsHv7L7NmzJ/n5+QwaNIiMjAyWLl0KQMeOHRucU9fD\nw4Pt27fTokULBg4cyLBhw+pd7+zZsxQUFDBt2rS6ibdtbW25fv06b7zxBi1atMDPzw+AwYMHY2tr\ny7Vr1156+8J8pMASQghh1RwcHJ54PWfOHBYtWsTw4cMxGo24u7tTVVVV73ubN29e97ONjQ21tbWv\ntJ5Op3upjP3792fv3r2cOXOGPXv2sH79erZt2/bMekopevXqxebNm5/5XX5+/jPLjEYjOp3upbcv\nzMd67gQUQgjxf+9lvpdVUVFBx44dAdi5c2eDRZMpDBo0iL179wIPb8LPzMysd72CggIcHR3x8/Mj\nJiaGv/76CwBHR8cnPs7r168fV65cISsrq25ZdnZ23c+VlZV1Hxemp6cD0KlTpwa3L7QjV7CEEEJY\njZe5YhQXF8eMGTN48803GTFiBE5OTvW+/+ltNfS75623YMEC5s2bx/79++nYsSN9+/Z9or1H0tLS\n2Lp1K7a2tiilWLJkCQBDhw5ly5YtBAUF8eGHHxITE8Pq1atJSEigvLycmpoa3NzcWLt2LQDOzs5k\nZ2ezdu1adDodiYmJ2NraNrh9oR15TIMQQgjRSFVVVdjb22NjY4NeryckJITt27fj5uZm8rYefYvw\n1KlTJt+2MD25giWEEEI00j///ENsbCxKKYxGI3PmzGmS4kpYH7mCJYQQQghhYnKTuxBCCCGEiUmB\nJYQQQghhYlJgCSGEEEKYmBRYQgghhBAmJgWWEEIIIYSJSYElhBBCCGFi/wOdzCpA+NTPOgAAAABJ\nRU5ErkJggg==\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xa5aaf90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "%matplotlib inline\n",
-        "\n",
-        "# Set up the data with a noisy linear relationship between X and Y.\n",
-        "num_examples = 50\n",
-        "X = np.array([np.linspace(-2, 4, num_examples), np.linspace(-6, 6, num_examples)])\n",
-        "X += np.random.randn(2, num_examples)\n",
-        "x, y = X\n",
-        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
-        "\n",
-        "losses = []\n",
-        "training_steps = 50\n",
-        "learning_rate = 0.002\n",
-        "\n",
-        "with tf.Session() as sess:\n",
-        "    # Set up all the tensors, variables, and operations.\n",
-        "    input = tf.constant(bias_with_x)\n",
-        "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
-        "    weights = tf.Variable(tf.random_normal([2, 1], 0, 0.1))\n",
-        "\n",
-        "    tf.global_variables_initializer().run()\n",
-        "\n",
-        "    yhat = tf.matmul(input, weights)\n",
-        "    yerror = tf.subtract(yhat, target)\n",
-        "    loss = tf.nn.l2_loss(yerror)\n",
-        "  \n",
-        "    update_weights = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)\n",
-        "  \n",
-        "    for _ in range(training_steps):\n",
-        "        # Repeatedly run the operations, updating the TensorFlow variable.\n",
-        "        update_weights.run()\n",
-        "        losses.append(loss.eval())\n",
-        "\n",
-        "    # Training is done, get the final values for the graphs\n",
-        "    betas = weights.eval()\n",
-        "    yhat = yhat.eval()\n",
-        "\n",
-        "# Show the fit and the loss over time.\n",
-        "fig, (ax1, ax2) = plt.subplots(1, 2)\n",
-        "plt.subplots_adjust(wspace=.3)\n",
-        "fig.set_size_inches(10, 4)\n",
-        "ax1.scatter(x, y, alpha=.7)\n",
-        "ax1.scatter(x, np.transpose(yhat)[0], c=\"g\", alpha=.6)\n",
-        "line_x_range = (-4, 6)\n",
-        "ax1.plot(line_x_range, [betas[0] + a * betas[1] for a in line_x_range], \"g\", alpha=0.6)\n",
-        "ax2.plot(range(0, training_steps), losses)\n",
-        "ax2.set_ylabel(\"Loss\")\n",
-        "ax2.set_xlabel(\"Training steps\")\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "vNtkU8h18rOv"
-      },
-      "source": [
-        "In the remainder of this notebook, we'll go through this example in more detail."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "r6rsv-q5gnn-"
-      },
-      "source": [
-        "## From the beginning\n",
-        "\n",
-        "Let's walk through exactly what this is doing from the beginning. We'll start with what the data looks like, then we'll look at this neural network, what is executed when, what gradient descent is doing, and how it all works together."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "UgtkJKqAjuDj"
-      },
-      "source": [
-        "## The data\n",
-        "\n",
-        "This is a toy data set here. We have 50 (x,y) data points. At first, the data is perfectly linear."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "form",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 271,
-          "status": "ok",
-          "timestamp": 1474671835304,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "-uoBWol3klhA",
-        "outputId": "cc31ce5d-9b65-4ef6-8475-643be268569a"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQYAAAEDCAYAAADX+KqPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFTlJREFUeJzt3XtQVVXfB/Dv4aaOVIJcjxneIrFQZzSbUUd9EHUMDRTQ\nd3LSkVJ7HWREnVMes2w0SHzK6TKDt9SBmpyilIhynJCLqXjNSy8xjg7pKAIhkogXhLPfP0gCtsDB\nvfbZZ+/z/fwlPbB/y8t8n7X2Wue3TJIkSSAiasVN6wEQkfNhMBCRDIOBiGQYDEQkw2AgIhkGAxHJ\neIh4yO7du5GVlQWTyYTQ0FCkpqbCy8tLxKOJSAOKZwyVlZXIzMzE999/j5ycHDQ1NeGnn34SMTYi\n0oiQGYPNZsPdu3fh5uaGe/fuISAgQMRjiUgjioMhMDAQCxcuxKRJk9CrVy+MGzcOY8eOFTE2ItKI\n4qXErVu3kJeXh/z8fBw6dAh37txBTk6OiLERkUYUB8ORI0fQv39/9OnTB+7u7pgyZQp+++23Tn+G\nH88gcm6KlxJmsxlnz57F/fv34eXlheLiYoSHh3f6MyaTCX/9Vae0tN38/Z9gPZ3WM/LvTat69lAc\nDMOHD8e0adMQExMDDw8PDBs2DHPmzFH6WCLSkJBdicTERCQmJop4FBE5AZ58JCIZBgMRyTAYiEiG\nwUBEMgwGIpJhMBCRDIOBiGQYDEQkw2AgIhkGAxHJMBiISIbBQEQyDAYikmEwEJEMg4GIZBgMRCTD\nYCAiGQYDEckwGIhIRkgw1NXVISkpCdOnT0dUVBTOnj0r4rFEpBEhzWA/+OADTJw4EZ9++ikaGxtx\n7949EY8lIo0onjHcvn0bJ0+eRGxsLADAw8MD3t7eigdGRNpRPGO4evUqfHx8sHr1apSWluKFF17A\nmjVr0LNnTxHjIzKkmppavPVWPsrLfWA21yAtLQI+Pn20HlYLxTOGxsZGlJSU4NVXX8XevXvRs2dP\nbNu2TcTYiAzrrbfykZ39Gk6cmIns7PmwWPK1HlIbimcMQUFBCAoKarmWbtq0adixY0eXP2fvVVmi\nsJ5+6xnx91Ze7gPA9M9XJpSX+zj899kZxcHg5+eH4OBglJWVYeDAgSguLsbgwYO7/Dmj3w/Ievqr\npXa9h8uHy5efRFXV/wGYgeZwkGA233TI79Nhd1cCwDvvvINVq1ahsbER/fv3R2pqqojHEhnKw+VD\ncxhMhNmcin79wmE230Ra2n+0Hl4bQoJh6NCh+O6770Q8isiwLl9+Ev8uH3wQEDAMx4/PdOiMyF5C\ngoGI5FovHUJC/kZwcAPOnJHwcPkQEnJL6yF2iMFApJLWS4czZyRMn74F0dGZ/wTFLadbPrTGYCBS\nSdulgwnXrwfjwIHJWg7JbvwQFZFKQkL+BiD985VzLx3a44yBSKDW7xWCg+sxffoXuH7dz+mXDu0x\nGIgEav9eITo6UzfLh9a4lCASqP17heav9YczBiKF5Ccao6GHLcnOMBiIFHrUicaAgGG6e6/QGoOB\nSKFHnWjU43uF1viOgUghPW9LdoQzBqJuan/U2WodBUAfJxrtxWAg6qb2W5JAJrZvn6X1sITiUoKo\nm4yyJdkZzhiI7GDELcnOMBiI7GDELcnOMBiI7GDELcnOMBiIHkHPTVZEYDAQPYKem6yIICwYbDYb\nYmNjERgYiC1btoh6LJEm9NxkRQRh25UZGRl2tY0n0gMjnmbsDiHBUFFRgcLCQsTHx4t4HJEmampq\nMXfu15g6NQ8NDc1NVkaO3Ifo6EzDLx3aE7KUSElJgcViQV2d87XBJrKXUZqsiKA4GAoKCuDn54ew\nsDAcO3bM7p8z4rVjrKfvWlpdG+dMV9M9pDgYTp8+jYMHD6KwsBD3799HfX09LBYL0tLSOv05o1w7\nxnr6rqX1tXFa/N3ZQ3EwrFi
xAitWrAAAHD9+HDt37uwyFIicRfsTjU8/vRF+fkNdYkuyMzzHQC6t\n/YnG4ODnkZs7ScMROQehwTBmzBiMGTNG5COJVBUS8nebE40DB97WekhOgTMGcildNVlJT38FTU1a\nj1J7DAZyKV01WfH1dezLQGfFRi3kUlyhyYoInDGQ4blakxURGAxkeK7WZEUEBgMZnqs1WRGBwUCG\n4+pNVkRgMJDhuHqTFREYDGQ4rt5kRQRuV5LhuHqTFRE4YyBDaP1eITi4ucnK9et+XDo8JgYDGQKb\nrIjFpQQZAk80isUZA+kStyTVxWAgXeKWpLoYDKRL3JJUF98xkC5xS1JdnDGQbnBL0nEYDKQb3JJ0\nHMXBUFFRAYvFgurqari7uyM+Ph7z588XMTaiNrgl6TiKg8Hd3R2rV69GWFgY6uvrMXv2bIwbN473\nWJIQN27UYtGiH9hkxcEUB4O/vz/8/f0BAL1798bgwYNRVVXFYCAhli79mU1WNCD0HcPVq1dRWlqK\n4cOHi3wsubCyMm+wyYrjCQuG+vp6JCUlwWq1onfv3l1+v5HvWmQ9cQYOrMOJE/+eaAwNvat6baP+\nWXaHkGBobGxEUlISoqOjERkZadfPGPWuRdZTpv1R548+moT79/890bh+/X9U/b0a6c+yo3r2EBIM\nVqsVQ4YMwYIFC0Q8jlxY+y3JHj32tLn3gRxD8cnHU6dOIScnB8XFxYiJicGsWbNQVFQkYmzkgtpv\nSTa/YyBHUzxjGDVqFP744w8RYyEX1dm9D7xLUhs8+Uia6+zeB94lqQ0GA2mus3sfeJekNhgM5HBs\nsuL8GAzkcGyy4vwYDORwbLLi/NiohRyOTVacH2cM5BBssqIvDAZyCDZZ0RcuJcgh2GRFXzhjINV0\ndqKR7xWcG4OBVNPZiUa+V3BuDAZSTWcnGsm58R0DqYbbkvrFGQMJ0/6os9U6CgBPNOoRg4GEab8l\nCWSyyYpOcSlBwnBL0jg4YyBFuCVpTAwGUoRbksbEYCBFuCVpTEKCoaioCCkpKZAkCbGxsVi8eLGI\nx5ITqqmpRWLij7hwoRebrBiY4mCw2WxYv349du/ejYCAAMTFxWHy5Mm8os6g2GTFNSgOhnPnziEk\nJAT9+vUDAERFRSEvL4/BYFBssuIaFG9XVlZWIjg4uOXrwMBAVFVVKX0sOSmeZnQNimcMkiR1/U2P\nYPT7AY1U78aNWixd+jPKyrxhNjcgJiYD1675YuDA20hPfwW+vrxLUk/17KE4GIKCglBeXt7ydWVl\nJQICArr8OaPfD2ikeosW/dBqS1LCnDl7kJs7CQDQ1KTu36XR/iydoZ49FC8lwsPDceXKFVy7dg0N\nDQ3Izc3F5MlccxoJr41zPYpnDO7u7li7di0SEhIgSRLi4uL44lHnurr3gdfGGZ+QcwwTJkzAhAkT\nRDyKnEBXW5K8Ns74ePKRZLrakuS1ccbHT1eSDLckiTMGAsB7H6gtBgMB4L0P1BaXEgSATVaoLc4Y\nXBibrFBHGAwujE1WqCMMBhfGJivUEb5jcGHclqSOcMbgQnjvA9mLweBCeO8D2YtLCRfCLUmyF2cM\nBsctSXocDAaD45YkPQ4Gg8FxS5IeB4PBYLpqssLlA9mDwWAwvPeBRGAwGAzvfSARuF1pMDzNSCIo\nmjGkpaUhPz8fXl5eeOaZZ5Camgpvb3YQdrQbN2qxaNEPbLJCwiiaMYwfPx65ubnIzs5GSEgItm7d\nKmpc1A1Ll/6M7OzXcOZMDH7++X/h5eWFAwcmY/v2WfDx6aP18EiHFAXD2LFj4ebW/IiRI0eioqJC\nyKCoe5rveeCJRhJH2MvHrKwsREVFiXocdaH1tmR19R8AZoBbkiSKSeri8smFCxeiurpa9t+Tk5MR\nEREBAEhPT0dJSQk+++wzdUZJMnPnfo1vvvkfNIfBTTz99FYEBz//z32S0+HryyUEPb4uZwy7d
u3q\n9H/fu3cvCgsLkZGR0a3CRr8fUO16Fy70QusTjX5+Qw15n6QR/+60rmcPRe8YioqKsGPHDqSnp8PL\ny0vJo6ibuC1JalL0jmHDhg148OABEhISAAAjRozAunXrRIyL2umsyUpo6F2sX89tSRJHUTAcOHBA\n1DioC501WXH0dJSMjycfdYJNVsiR+FkJJ8YmK6QVBoMTY5MV0gqDwYmxyQpphcHgRNhkhZwFg8GJ\nsMkKOQsGgxNhkxVyFtyudCI8zUjOgjMGjbV+r8AmK+QsGAwaa/9eITo6k8sH0hyXEhrjiUZyRpwx\nOBi3JEkPGAwOxi1J0gMGg4NxS5L0gO8YHIxbkqQHnDE4ALckSW8YDA7ALUnSGy4lHIBbkqQ3nDGo\nhE1WSM+EBMMXX3yBTZs2obi4GH368D4DgE1WSN8UB0NFRQWOHDkCs9ksYjyGwSYrpGeK3zGkpKTA\nYrGIGIuhcFuS9EzRjOHgwYMIDg7Gc889J2o8utX6nUJo6J029z5w+UB602UwdHR35fLly7F161bs\n3Lmz5b91cQ1mG/ZelSWK2vUSE39sd+/DHuzbN1/Vmq0Z7c9Tq1quUM8ej3135YULF3Dt2jVER0dD\nkiRUVlYiNjYW3377Lfr27dtlYaPdD9j2LkkTLlzoxfsddVbLVerZ47GXEqGhoTh8+HDL1xEREdi7\ndy+eeuqpx32k7nBLkoxK2DkGk8nUraWEEXS0Jcm7JEnvhAVDXl6eqEfpRkdbkrxLkvSOJx+7gU1W\nyFUwGLqBTVbIVTAYuoFNVshV8NOV3cDTjOQqOGPoApuskCtiMHSBTVbIFXEp0QU2WSFXxBnDI/BE\nI7k6BsMjsMkKuToGwyOwyQq5Or5jeARuS5Kr44wB8qPObLJCro7BAPmWJJCJ7dtnaT0sIs1wKQFu\nSRK157IzBm5JEnXMZYOBW5JEHXPZYOCWJFHHXCYY2GSFyH4uEwxsskJkP8XBkJmZia+++gqenp6Y\nOHEiVq1aJWJcwrHJCpH9FAXDsWPHkJ+fjx9//BEeHh6oqakRNS7hQkL+5tKByE6KguHrr7/GokWL\n4OHR/BhfX18hgxLl4XuF8nIf+PmxyQqRvRQFw59//omTJ09i8+bN6NGjBywWC8LDw0WNTbG2W5Js\nskJkL0V3VzY1NeHWrVv45ptvcO7cOSxfvtzu+yUccV9febkPWr9XKC/3cdg9gUa//5B3V+q3nj0e\n++5KANizZw+mTp0KABg+fDjc3Nxw8+ZN+Pj4dFnYEReymM01aP6UZPOMwWy+6ZC6rnD/Ie+u1G89\neyhaSkRGRuLo0aN48cUXUVZWhsbGRrtCwVHS0iIAZKK83Adm802+VyCyk6JgmD17NqxWK2bOnAlP\nT09s3LhR1LiE8PHpg+3bZ/HKOKJuUhQMnp6e2LRpk6ixEJGT4MeuiUiGwUBEMgwGIpJhMBCRDIOB\niGQYDEQkw2AgIhkGAxHJMBiISIbBQEQyDAYikmEwEJEMg4GIZBgMRCTDYCAiGQYDEckwGIhIhsFA\nRDKKgqG0tBRz585FTEwM4uLicP78eVHjIiINKQqGTZs2YdmyZdi3bx+WLVuGtLQ0UeMiIg0pCgaT\nyYS6uubuy3V1dQgMDBQyKCLSlqIu0atXr8Ybb7yBjRs3QpIk7NmzR9S4iEhDj31FXXJyMo4cOYI1\na9YgMjIS+/fvh9Vq7fTmKiLSB5MkSdLj/vDo0aNx8uTJlq9HjRqFU6dOCRkYEWlH0TuGwMBAHD9+\nHABw9OhRDBgwQMSYiEhjimYMp0+fxoYNG2Cz2dCjRw+89957GDZsmMjxEZEGFAUDERkTTz4SkQyD\ngYhkGAxEJKPogJMSn3zyCfLy8uDm5oa+ffviww8/hL+/v2r10tLSkJ+fDy8vLzzzzDNITU2Ft7e3\nKrX279+Pzz//HJcuXUJWVhaef/55VeoUFRUhJSUFkiQhN
jYWixcvVqUOAFitVhQUFKBv377IyclR\nrc5DFRUVsFgsqK6uhru7O+Lj4zF//nzV6jU0NGDevHl48OABmpqaMG3aNCQmJqpW7yGbzYbY2FgE\nBgZiy5YtqtaKiIiAt7c33Nzc4OHhgaysrI6/WdLI7du3W36dkZEhvfvuu6rWO3z4sNTU1CRJkiRt\n2rRJ+u9//6tarUuXLkllZWXSa6+9Jv3++++q1GhqapIiIyOlq1evSg0NDdIrr7wiXbx4UZVakiRJ\nJ06ckEpKSqQZM2aoVqO1qqoqqaSkRJKk5n8rU6dOVfX3J0mSdOfOHUmSJKmxsVGKj4+Xzp49q2o9\nSZKkXbt2SStXrpSWLFmieq2IiAiptrbWru/VbCnRu3fvll/fvXsXbm7qDmXs2LEtNUaOHImKigrV\nag0aNAgDBgyApOKGz7lz5xASEoJ+/frB09MTUVFRyMvLU63e6NGj8eSTT6r2/Pb8/f0RFhYGoPnf\nyuDBg1FVVaVqzV69egFonj00NjaqWgtonhUVFhYiPj5e9VoAIEkSbDabXd+r2VICADZv3ozs7Gw8\n8cQTyMjIcFjdrKwsREVFOayeGiorKxEcHNzydWBgoGE/9n716lWUlpZi+PDhqtax2WyYPXs2rly5\ngnnz5qleLyUlBRaLpeWDiGozmUx4/fXXYTKZMHfuXMyZM6fD71U1GDr7nEVERASSk5ORnJyMbdu2\n4csvv8SyZctUrQcA6enp8PT0xMyZM1WvpSY1ZyPOpL6+HklJSbBarW1mmWpwc3PDvn37cPv2bSxd\nuhQXL17EkCFDVKlVUFAAPz8/hIWF4dixY6rUaG/Pnj3w9/dHTU0NFi5ciEGDBmH06NGP/F5Vg8He\nD1TNmDEDS5YsURwMXdXbu3cvCgsLhcxOtP6wWFBQEMrLy1u+rqysREBAgIYjEq+xsRFJSUmIjo5G\nZGSkw+p6e3tjzJgxOHTokGrBcPr0aRw8eBCFhYW4f/8+6uvrYbFYVO1p8vDlvq+vL6ZMmYLz5893\nGAyavWO4fPlyy6/z8vIwaNAgVesVFRVhx44dSE9Ph5eXl6q1WlPr/9nDw8Nx5coVXLt2DQ0NDcjN\nzcXkyZNVqfWQo2cpVqsVQ4YMwYIFC1SvVVNT0zKlv3fvHo4eParqv8kVK1agoKAAeXl5+Pjjj/HS\nSy+pGgp3795FfX09AODOnTv49ddf8eyzz3b4/Zq9Y/joo49QVlYGNzc3mM1mvP/++6rW27BhAx48\neICEhAQAwIgRI7Bu3TpVav3yyy9Yv349bt68iTfffBNDhw7Fjh07hNZwd3fH2rVrkZCQAEmSEBcX\nh8GDBwut0drKlStx7Ngx1NbWYtKkSVi2bBliY2NVq3fq1Cnk5OQgNDQUMTExMJlMSE5OxoQJE1Sp\n99dff+Htt9+GzWaDzWbDyy+/jIkTJ6pSSwvV1dVITEyEyWRCU1MTZs6cifHjx3f4/fysBBHJ8OQj\nEckwGIhIhsFARDIMBiKSYTAQkQyDgYhkGAxEJMNgICKZ/wfuExw3vDkyWAAAAABJRU5ErkJggg==\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xa5aa810\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "num_examples = 50\n",
-        "X = np.array([np.linspace(-2, 4, num_examples), np.linspace(-6, 6, num_examples)])\n",
-        "plt.figure(figsize=(4,4))\n",
-        "plt.scatter(X[0], X[1])\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "AId3xHBNlcnk"
-      },
-      "source": [
-        "Then we perturb it with noise:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "form",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 375,
-          "status": "ok",
-          "timestamp": 1474671835705,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "fXcGNNtjlX63",
-        "outputId": "455c3e70-a724-4e0a-d08e-9bf6bd1aa7e9"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQYAAAEDCAYAAADX+KqPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFp1JREFUeJzt3X9QVOX+B/D3CvjjijcWRX6Ud+PSUDpBztjtD+trSQih\nIqBmNU02cqPsjm46FQ6U1YwKtTb9UXfGNENHm0nTREKrYQYJmyzL394xYyqqiwReW0ggQBee7x8I\n4R7ZPXjOnuec3ffrP+rsOZ+wffs8z3l+2IQQAkREg4yQXQARmQ+DgYgUGAxEpMBgICIFBgMRKTAY\niEghXI+bbN26Fbt374bNZkNycjJKS0sxcuRIPW5NRBJobjE0Nzdj+/bt2LNnDyorK9HT04OPP/5Y\nj9qISBJdWgy9vb3o7OzEiBEj0NXVhYkTJ+pxWyKSRHMwxMbGYsmSJbjvvvswZswY3H333Zg+fboe\ntRGRJJq7EhcvXkR1dTVqamrw+eef448//kBlZaUetRGRJJqD4dChQ5g0aRKioqIQFhaGWbNm4fjx\n4z4/w+UZROamuSuRkJCAkydPoru7GyNHjsRXX32FlJQUn5+x2Wz43//atD5aVzEx41iTSmasizWp\nExMzTtV1moMhNTUVmZmZyM3NRXh4OKZMmYJFixZpvS0RSaTLW4lly5Zh2bJletyKiEyAMx+JSIHB\nQEQKDAYiUmAwEJECg4GIFBgMRKTAYCAiBQYDESkwGIhIgcFARAoMBiJSYDAQkQKDgYgUGAxEpMBg\nICIFBgMRKTAYiEhBlx2ciPTkdrdi1aoa/PzzX+Fw/A6XKw12e5TPaxsb7UhIcPu8ltRjMJDprFpV\ng4qKxwDYcOKEALAd77yT5/dawPe1pJ4uXYm2tjY4nU5kZWVhzpw5OHnypB63pRD1889/Rd8XHQBs\nV37Wfq3e3O5WFBSUIyOjGgUFe9DS0mrYswNNlxbDunXrcO+99+LNN9+Ex+NBV1eXHrelEOVw/H6l\npdDXCnA4Lupyrd6G07KxGs3B0N7ejiNHjuDVV1/tu2F4OCIjIzUXRqHL5UoDsP3KGMNFuFwz/V7b\nN8bQ4vNavclsrQSa5mBoaGiA3W5HUVERzp49i9tvvx0vvPACRo8erUd9ZAHDGSxUw26PUvU37+Dn\nJie3Yc2amYYOPMpsrQSa5mDweDw4c+YMXnrpJaSkpGDdunXYtGkTnE6nHvWRBchqUns/t7vb2Kb8\ncFo2VqM5GOLi4hAXFzdwLF1mZiY2b97s93Nqj8oyEmtSb3BdjY12DG5SNzbaDalb1nP7xcSMw969\ni/1eY0Wag2HChAmIj49HfX09EhMT8dVXXyEpKcnv58x4ph9rUse7roQEN/peFfY1qRMSWgypW9Zz\n1TLjn59hZ1cCwIsvvojnnnsOHo8HkyZNQmlpqR63JYuQ1aQe/Nzk5E6sWRM8TXnZbELSmfRmTFLW\npI4Z62JN6qhtMXCtBBEpMBiISIHBQEQKDAYKGT/88DOmTn0LDkc5pk59E/X1P8suybS4upJCxoIF\nH6GxsQiADZ2dAnl5pThxYrnsskyJLQYKGS0tN2HwhKi+n+laGAwUMuz2/6JvQhQACNjtDTLLMTV2\nJShklJfnIC+vFC0tN8Fub0B5+TzZJZkWg4FCRmKig2MKKrErQUQKDAYiUmAwEJECg4GIFBgMRKTA\nYCAiBQYDESkwGIhIgROcyC/v7eHLynIAhKm+3ujzJGU/PxgwGMgv723an356B/7977mqrzf6hCbZ\nzw8GunUlent7kZeXh6VLl+p1SzIJ7xOX6ut9nzQm+4Qm2c8PBroFw7Zt21RtG0/W43D8jsGrEhMT\n24d1vdEnNMl+fjDQpSvR1NSE2tpaLF26FFu2bNHjlmQi3tvDb9gwDz096q83+oQm2c8PBroEQ0lJ\nCQoLC9HWZq6tskkf3mdJRkf
73hZd7dmTgSL7+cFAczB89tlnmDBhAiZPnozDhw+r/pwZj+5iTeqZ\nsS7WpB/NB8688cYb+OijjxAWFobu7m50dHRg1qxZcLlcPj9nxoM4WJM6ZqyLNamjNqh0PYnq66+/\nRllZGd5++22/15rxF8aa1DFjXaxJHZ5ERUTXTdcJTnfddRfuuusuPW9JRBJw5iNJxynM5sNgIOk4\nhdl8OMZA0nEKs/kwGEg6TmE2H3YlyK/hLrseLk5hNh8GA/k13GXXQxlqkJFTmM2HwUB+DXfZ9VA4\nyGgdHGMgv4a77HooHGS0DrYYyK/hLrseisPx+5WWgg0cZDQ3BgP5Ndxl10PhIKN1MBjIMBxktA6O\nMRCRAoOBiBQYDESkwDGGEMUVjeQLgyFEcbIR+cKuRIjiZCPyhcEQoriikXxhVyJEcbIR+aI5GJqa\nmlBYWIgLFy4gLCwMDz74IBYvXqxHbRRAnGxEvmgOhrCwMBQVFWHy5Mno6OjA/Pnzcffdd/McyxDF\ntx3BQXMwxMTEICYmBgAwduxYJCUl4fz58wyGIOV2t2LZsn2oqxtzzS8+33YEB13HGBoaGnD27Fmk\npqbqeVsyEX9ffL7tCA66BUNHRwecTieKi4sxduxYv9eb8Uw/1uRfY6Mdg7/4jY32q2pMTv7jqqXV\nycmdhv03mO13BZizJjV0CQaPxwOn04mcnBykp6er+owZj+5iTf4lJLjR95qz74ufkNCC777778C4\nQnx8B7Ky3savv8bD4biINWtmGvLfYMbflVlrUkOXYCguLsYtt9yCxx9/XI/bkYm5XGkYNWrHlTGG\nvtechYVXdy9ycrajqup+2aWSBpqD4ejRo6isrERycjJyc3Nhs9mwcuVKzJgxQ4/6yGTs9ijs3PnI\nVX8Tclwh+GgOhmnTpuHbb7/VoxayKG7ZFnw485E04yzK4MNgIM04izL4MBhIgbMXicFACpy9SAyG\nEDZUy8DfW4bffmtFQcFHAW9R9Nf3ww9/gdv9HaKjb0ZSkoctGAMwGELYUC0Df28Z/vWvTwxpUQyu\nDxBobNyB//xnccCeR39iMISwoVoG/t4y9J1dGfh5C971AZEBfR79icEQwoZqGfh7y5CY2IZvvgn8\nvAXv+oD2gD6P/sRgCGHXO/9gw4bZ6O4O/LyF/vp+/PEv+O23OkRHO5CUtJ3zJAxgE0II/5fpz4yL\nSwJd03BfA16rJjO8SjTr4iDW5J+hi6hIHT1eA/JVIhmBu0QbSI/FRlywREZgMBhIjy3bue07GYFd\nCQPpsdiIC5bICBx8vMKsA0VmqwkwZ12sSR21g4/sShCRAoOBiBQYDESkwMFHCzPDZCcKTroEw8GD\nB1FSUgIhBBYsWIAnn3xSj9uSH5zsRIGiORh6e3uxZs0abN26FRMnTsTChQtx//3384g6A8ic7OTd\nWikqmobS0mOK1gtbNdakORhOnToFh8OBG2+8EQAwZ84cVFdXMxgMIHN3Zu/WyjfflKKxsQjerRe2\naqxJczA0NzcjPj5+4OfY2FicPn1a621JBZmTnbxbKy0tN+FarRdO4bYmzcFwvfOjzHimn9VqiokZ\nh717FxtYzZ+8z6gcP/4cGhqUZ1YaeZal1f78zExzMMTFxaGxsXHg5+bmZkycONHv58w4I4w1qRMT\nMw5r1vzfVXsyFBdno6Tkz5/7z6z0vi5QZ1ma8Xdl1prU0BwMKSkp+OWXX3Du3DnExMRg//79eOON\nN7TelkzuWrs8vfOOQ9V1ZH6agyEsLAyrV69Gfn4+hBBYuHAhBx6JLE6XeQwzZszgIbYmwdeDpAfO\nfAwyfD1IeuBaiSAT6NeDbncrHnrofWRkVKOgYA9aWlp1vT+ZA1sMQSbQk57YIgkNDIYgE+hJT5yw\nFBoYDEEm0K8HZU7DJuMwGGhYXK40jBq1A3V1Y7jnZBBjMNCw2O1R2LnzEdPN6CN98a0EESmwx
RBA\nnGxEVsVgCCC+2iOrYlcigPhqj6yKwRBAVjpOzu1uRUFBOWc0EgB2JQLKSsfJsdtDgzEYAshKexGw\n20ODsStBAKzV7aHAY4uBAFir20OBx2AgANbq9lDgsStBRAoMBiJS0NSVcLlcqKmpwciRI/G3v/0N\npaWliIyM1Ks2IpJEU4vhnnvuwf79+1FRUQGHw4GNGzfqVVfAeU/ocbs5oYeon6ZgmD59OkaM6LvF\n1KlT0dTUpEtRRuif0HPiRC4qKhbj6ac/kV0SkWnoNsawe/duS20h7z2hp76eXSCifn7HGJYsWYIL\nFy4o/vnKlSuRlpYGANiwYQMiIiKQnZ2t+sGyz/TzPlMxMbFdek3XYsaaAHPWxZr0YxPXeyrtFeXl\n5di5cye2bduGkSNHqv6c7B2AWlpaUVhYMzChp6xsHnp6wqTW5M2MZx8C5qyLNaljyNmVBw8exObN\nm/Hee+8NKxTMwHtCT3S0+f4QiWTRFAxr167F5cuXkZ+fDwC444478Morr+hRFxFJpCkYqqqq9KqD\niEyEMx+JSIHBQEQKXF1pEFk7RnOnaroeDAaDyNo6jVu20fVgV8IgsrZO835uVRW42Sv5xWDQSO3u\nyrK2TvN+bmdnBCoqFqOwsMaQ55M1sSuhkdqmuqyt0/qfW1UFdHZGAMgCN3slfxgMKvgawFPbRZC1\ndVr/cwsK9qCi4mHw+HpSg8Gggq9WgcPx+1WLscz6heNmrzQcDAYVfLUKrPKF42avNBwMBhV8tQr4\nhaNgxGDwoX9s4YcfwpGQUIrx45Px97//YdpWAZFeGAw+DB5bAAT+8Q/9JgdxRiKZGYPBh0BOSuKM\nRDIzTnDyIZCTkniILJkZWww+BPKNg1Vec1JoYjD4EMg3Dv5Cx+1uxbJl+1BXN4ZjEGS4oA0Gsw/u\n+QsdjkGQTEEbDFb/YnEMgmTSZfDx3XffxW233YbWVvMs5e37Iv0O4H0A+1Bb22SppcayVmMSATq0\nGJqamnDo0CEkJCToUY9u+gb3PgbwCAAbWlvnorDQOq0GlysNo0btuDLGYN6p1hScNLcYSkpKUFhY\nqEctunK50hAV1QWrNsft9ijs3PkIqqruxzvv5JlqfISCn6ZgOHDgAOLj43HrrbfqVY9u7PYo3Htv\nGNgcJxo+v0fUDXV25YoVK7Bx40aUlZUhMjISaWlp+PDDD2G32wNW7HC53a14+ulPUF8ficTEdmzY\nkIXoaP7NS+TPdZ9dWVdXhyVLlmD06NEQQqC5uRmxsbHYtWsXxo8f7/fz/o6DM/p1o1nPGTRbTYA5\n62JN6gT87Mrk5GR88cUXAz+npaWhvLwcN9xww/Xe8ipWf91IZGW6rZWw2WzQeHD2Vfgen0ge3YKh\nuroaUVH6NfX5Hp9IHtPOfLTKlmlEwci0wcAt04jk4X4MRKTAYCAiBQYDESkwGIhIwbSDj4OZfdMV\nomBjiWAI9VmQDEYymiWCIdRnQYZ6MJLxLDHGEOqzIEM9GMl4lmgxhPosSO+t5uPiLqCgoJxdCwoY\nSwSDllmQwdA/9w7GS5cuo6Lin2DXggLFEsGgRTD0z72DMSOjGuxaUCBZYoxBC+/+eVUVUFCwx1I7\nRnsL9TEXCrygbzF49887OyNQUfEwrNhy6BfqYy4UeEEfDP1foqoqoLMzAkAWrN785spTCrSg70r0\nf4kyMgSAhwFEgc1vIt+CvsXQj81vIvVCJhjY/CZST3NXYvv27XjggQeQnZ2N119/XY+aiEgyTS2G\nw4cPo6amBvv27UN4eDjcbrdedRGRRJpaDO+//z4KCgoQHt6XL9HR0boURURyaQqGn376CUeOHMGi\nRYvw2GOP4fTp03rVRUQS+e1K+Dq7sqenBxcvXsQHH3yAU6dOYcWKFaiurlb1YLVHZRmJNalnxrpY\nk378BsOWLVuG/Hc7duxARkYGACA1NRUjRoxAS0uLqoNtz
XimH2tSx4x1sSZ11AaVpq5Eeno6vvzy\nSwBAfX09PB6PqU67JqLro+mtxPz581FcXIzs7GxERETgtdde06suIpJIUzBERERg/fr1etVCRCYR\n9GsliGj4GAxEpMBgICIFBgMRKTAYiEghZJZdB4Ng2PGarIHBYCHBsOM1WQO7EhbCE6nIKAwGC+G2\n8WQUdiUshPtWklEYDBbCfSvJKOxKEJECg4GIFBgMRKTAYCAiBQYDESkwGIhIgcFARAqaguHs2bN4\n6KGHkJubi4ULF/JcCaIgoSkY1q9fj+XLl2Pv3r1Yvnw5XC6XXnURkUSagsFms6GtrW/f/La2NsTG\nxupSFBHJpWlKdFFREZ544gm89tprEEJgx44detVFRBJd9xF1K1euxKFDh/DCCy8gPT0dn376KYqL\ni32eXEVE1mATQgj/l13bnXfeiSNHjgz8PG3aNBw9elSXwohIHk1jDLGxsfj6668BAF9++SVuvvlm\nPWoiIsk0tRiOHTuGtWvXore3F6NGjcLLL7+MKVOm6FkfEUmgKRiIKDhx5iMRKTAYiEiBwUBECtKD\n4d1338Vtt92G1tZW2aXA5XIhKysLOTk5WL58Odrb26XVcvDgQTzwwAPIzMzEpk2bpNXRr6mpCYsX\nL8bs2bORnZ2Nbdu2yS5pQG9vL/Ly8rB06VLZpQxoa2uD0+lEVlYW5syZg5MnT8ouCVu3bsXcuXOR\nnZ2NZ599FpcuXRr6YiHRr7/+KvLz88XMmTNFS0uLzFKEEEJ88cUXoqenRwghxPr168Xrr78upY6e\nnh6Rnp4uGhoaxKVLl8S8efPE999/L6WWfufPnxdnzpwRQgjR3t4uMjIypNfUb8uWLeLZZ58VTz31\nlOxSBqxatUrs3r1bCCHE5cuXRVtbm9R6mpqaRFpamuju7hZCCPHMM8+I8vLyIa+X2mIoKSlBYWGh\nzBKuMn36dIwY0fcrmTp1KpqamqTUcerUKTgcDtx4442IiIjAnDlzUF1dLaWWfjExMZg8eTIAYOzY\nsUhKSsL58+el1gT0tWRqa2vx4IMPyi5lQHt7O44cOYIFCxYAAMLDwxEZGSm5qr6WVWdnJzweD7q6\nujBx4sQhr5UWDAcOHEB8fDxuvfVWWSX4tHv3bsyYMUPKs5ubmxEfHz/wc2xsrCm+hP0aGhpw9uxZ\npKamyi5l4C8Xm83m/2KDNDQ0wG63o6ioCHl5eVi9ejW6urqk1hQbG4slS5bgvvvuw4wZMzBu3DhM\nnz59yOsDeq7EUOssVqxYgY0bN6KsrGzgnwmDplP4WvuRlpYGANiwYQMiIiKQnZ1tSE3ejPpdXI+O\njg44nU4UFxdj7NixUmv57LPPMGHCBEyePBmHDx+WWstgHo8HZ86cwUsvvYSUlBSsW7cOmzZtgtPp\nlFbTxYsXUV1djZqaGowbNw5OpxOVlZVD/j8e0GAYakFVXV0dzp07h5ycHAgh0NzcjAULFmDXrl0Y\nP358IEvyu8irvLwctbW1UgfX4uLi0NjYOPBzc3Ozz2afUTweD5xOJ3JycpCeni67HBw7dgwHDhxA\nbW0turu70dHRgcLCQun7gsTFxSEuLg4pKSkAgMzMTGzevFlqTYcOHcKkSZMQFdV3OvqsWbNw/Pjx\nof/yM2Tkw4+ZM2eK1tZW2WWI2tpaMXv2bOF2u6XW4fF4BgYfu7u7TTH4KIQQzz//vCgpKZFdxjUd\nPnzYVIOPjz76qPjxxx+FEEK89dZbwuVySa3n5MmTYu7cuaKrq0v09vaKVatWiffee2/I601xRJ3N\nZjNF83nt2rW4fPky8vPzAQB33HEHXnnlFcPrCAsLw+rVq5Gfnw8hBBYuXIikpCTD6xjs6NGjqKys\nRHJyMnJzc2Gz2bBy5Upp4zBm9+KLL+K5556Dx+PBpEmTUFpaKrWe1NRUZGZmIjc3F+Hh4ZgyZQoW\nLVo05PVcK0FECtInO
BGR+TAYiEiBwUBECgwGIlJgMBCRAoOBiBQYDESkwGAgIoX/B0/dgNBk20eF\nAAAAAElFTkSuQmCC\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xbdce750\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "X += np.random.randn(2, num_examples)\n",
-        "plt.figure(figsize=(4,4))\n",
-        "plt.scatter(X[0], X[1])\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "3dc1cl5imNLM"
-      },
-      "source": [
-        "## What we want to do\n",
-        "\n",
-        "What we're trying to do is calculate the green line below:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "form",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 1150,
-          "status": "ok",
-          "timestamp": 1474671836784,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "P0m-3Mf8sQaA",
-        "outputId": "32a8a45d-ba64-4286-acf7-0883d9184693"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQYAAAEDCAYAAADX+KqPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XlYE+faP/BvCJsFlIDI4oKIoqAgde1BBUXUuqDi2uXo\nqVZa7VEEhSAo7oKAWrtcx2Nr1Z/2bfXUI1W7vfRFDG7gjrYuVEUUkEUhsi+B+f1BoUqEBDOZmZD7\nc139IzqZuYXy5Zknz9yPiGEYBoQQ8hwDvgsghAgPBQMhRAkFAyFECQUDIUQJBQMhRAkFAyFEiSEb\nJ9m/fz+OHDkCkUgEFxcXxMTEwNjYmI1TE0J4oPGIIT8/HwcPHsTRo0dx4sQJ1NXV4aeffmKjNkII\nT1gZMdTX16OyshIGBgaoqqpCly5d2DgtIYQnGgeDra0tFixYgNGjR6NDhw4YMWIEvLy82KiNEMIT\njW8lSkpKkJSUhOTkZJw+fRoVFRU4ceIEG7URQniicTCcO3cO3bt3h6WlJcRiMcaNG4erV6+2+h56\nPIMQYdP4VsLBwQHp6emorq6GsbExUlNT4e7u3up7RCIRCgtLNb00q2xsLKgmNQmxLqpJPTY2Fmod\np3EweHh4YMKECZg+fToMDQ3h5uaGOXPmaHpaQgiPWPlUYunSpVi6dCkbpyKECACtfCSEKKFgIIQo\noWAghCihYCCEKKFgIIQooWAghCihYCCEKKFgIIQooWAghCihYCCEKKFgIIQooWAghCihYCCEKKFg\nIIQooWAghCihYCCEKKFgIIQoYaWDEyFsKiqSIzw8GVlZHeHo+Axxcb6QSCxbPTY3VwIHh6JWjyXq\no2AgghMenoxjx+YBEOHaNQbAQXz5ZYDKY4HWjyXqY+VWorS0FEFBQZg4cSImT56M9PR0Nk5L9FRW\nVkc0/KADgOjP15ofy7aiIjkCAxMwfnwSAgOPorhYztm1tY2VEcOWLVvg4+ODTz/9FAqFAlVVVWyc\nlugpR8dnf44UGkYBjo4lrBzLtraMbHSNxsFQVlaGS5cuYevWrQ0nNDSEubm5xoUR/RUX5wvg4J9z\nDCWIixuj8tiGOYbiVo9lG5+jFW3TOBiys7MhkUgQERGB27dvY8CAAVi9ejVMTU3ZqI/ogLZMFqpD\nIrFU6zfv89d1cSnFpk1jOJ145HO0om0aB4NCocDNmzexdu1auLu7Y8uWLfjiiy8QFBTERn1EB/A1\npG5+3epqbofybRnZ6BqNg8HOzg52dnZN29JNmDABe/bsUfk+dbfK4hLVpL7n68rNleD5IXVuroST\nuvm6biMbGwt8//18lcfoIo2DoXPnzrC3t0dmZiacnJyQmpoKZ2dnle8T4p5+VJN6mtfl4FCEho8K\nG4bUDg7FnNTN13XVJcTvH2d7VwLAmjVrEBoaCoVCge7duyMmJoaN0xIdwdeQ+vnrurhUYtOm9jOU\n55uI4WlPeiEmKdWkHiHWRTWpR90RAz0rQQhRQsFACFFCwUAIUULBQPTGvXtZ8PT8DI6OCfD0/BSZ\nmVkv/P2Je8ew+Nf3UVL9jKcKhYOeriR6Y+bM48jNjQAgQmUlg4CAGFy7tgwAcPxuAj67uhNWplYQ\nG9CPBY0YiN4oLu6G5xdENbwGfrh3HJ9d3QlLE0vE++yEmZEZbzUKBQUD0RsSySM0LIgCAAYSSTZ+\nyfwJn1zZjk5/hkKPjo58ligYNGYieiMhYRoCAmJQXNwNEkk2Vu7uhB2XYtHRuCPifHagZycnvksU\nDAoGojecnByb5hSSshIRe2ELzI0tEOezA706qV7Gr0/oVoLoneSHSYi7GA0zI3PEeu+As2UfvksS\nHAoGoldSsk9h64VNMDXsgK3e29FH4sJ3SYJEwUD0xpmcFESnboCJ2BQxo7ahr1U/vksSLAoGohfO\n557FltT1MBIbI3pUPNys+/N
dkqBRMJB2L+1xKjaeXwuxyBBbRsZiQGd3vksSPAoG0q5dzEvDhnNr\nYCAywJaRsfCw8eS7JJ1AwUDarSv5l7Du7GoAwKYRMRjY5XWeK9IdtI6BtEvpBVcRdTYCDBhsHBGN\nQbZD+C5Jp1AwEJWat4ffu3caALHax3O9n+SZe2ew6L/LUVUjwqAns9Hbty9n124vKBiISs3btC9Z\ncgiffz5F7eO53KHptyc3sOi/wSiSWwG/xuFM1ihIy9vPDlFcYW2Oob6+HgEBAVi8eDFbpyQC0XzH\npczM1nca42uHpptPf0fk6TBU1TDA/8UAWd6cXr89YS0YDhw4oFbbeKJ7HB2f4fmnEp2cytp0PBc7\nNN0svImI06GorquC59MA4MFoTq/f3rByK5GXlweZTIbFixdj3759bJySCEjz9vC7dk1FXZ36x2u7\nnfwfxRlYfT4UVYpKrBoWBc+xgyEta587RHGFlWCIjo6GVCpFaamwWmUTdjTfS9LKqvW26OruPcmG\ne/I/EJ6yAlVMBaRDIzGmx1gAoDkFDWkcDKdOnULnzp3h6uqKtLQ0td8nxK27qCb1CaGuu0V3sfp8\nGKqYCqzzWYcpLi1PiPJFCF+nV6HxhjM7duzA8ePHIRaLUV1djfLycowbNw5xcXGtvk+IG3FQTeoR\nQl0PnmUiVBaMZ9VyrBgixfxhb/FeU3NC+Do1p25QsboT1YULF7B37178+9//VnmsEL9gVJN6+K7r\nYUkWVp4KgrxajuWDVmKK81Tea3oZodakDloSTXRKdukjhMmCIa+WY9nrwZjiPJXvktolVhc4DRs2\nDMOGDWPzlIQ0yS3LQZgsGEVVRfjIcxmm9qYJRm2hlY+Ed+osoc4rf4zQU8vxpPIJPhz4EQL6zOKp\nWv1AwUB4p2oJdX55HkJPLUdhZSEWuX+IWS5zeatVX9AcA+Fda0uoCyoKECpbjvyKfCwYsAhz+73D\nS436hoKB8K6lJdSFFYUIkwUjrzwP89zewzuu83irUd/QrQRRqa2PXbfVy5ZQP618ijBZMHLLcvCO\n6zzMc3uPtesR1SgYiEptfey6JS1NMjZfQl1cVYSwU8HIKcvGW/3ewXv934dIJGrlzIRtFAxEpbY+\ndt0Sdfo0yKuKESYLwaPSh5jlMgcLB3xAocADmmMgKrX1seuWqOrT8KxaDmlKCLJKHiCgzyx84PER\nhQJPaMRAVGrrY9ctcXR89udIQYTmfRJKa0oQnrICmc8yMbV3AJYMXEqhwCMKBqJSWx+7bklLfRpK\na0ogla3APfk9TOk1FUs9l1Mo8IyCgXDmZX0aymrLsColFHflf2Ci02QsGxRCoSAANMdAeFNeW47I\n02HIKL6D8T3fRPDgUBiI6H9JIaDvAuFFRW0FVp+W4tbTmxjrOA4rh4RTKAgIfScI5yoVlYg6uwq/\nP/0No7v7Qjo0kkJBYGiOQU/xtSlMlaIKUWcicL0wHd7dRmPVsDUUCgJEwaCn+NgUprquGmvPRiC9\n8CpGdB2FiOFREBuwt7SasIeiWk9xvSlMTV0NNpxbg6sFV/CGgxdWD18HQwP6vSRUFAx6istNYWrq\narDxfBQu5l3AMLvhiHpjA4zERlq7HtEcRbae4mpTmNq6WmxOXY+0x6kYYjcU67w2w1hsrJVrEfZo\nHAx5eXmQSqV48uQJxGIxZs+ejfnz57NRG9EiLjaFUdQrEJ22Eedzz+L1LoOw3msLhYKO0DgYxGIx\nIiIi4OrqivLycsyYMQMjRoygfSz1VOOnHQ+yzFD+t70wcX2EwXZDsXFEDEzEJnyXR9Sk8RyDjY0N\nXF1dAQBmZmZwdnZGQUGBxoURYSoqkmPu3G8xfnwSAgOPorhY/sLfh4cn49jxd5FudQV36/JRlmGF\nTSNjYGpoylPF5FWwOseQnZ2N27dvw8PDg83TEgFR9THngyxzYPQGoPf/AnkD0SlnPDoYduCtX
vJq\nWAuG8vJyBAUFITIyEmZmZiqPF+KeflSTarm5Ejz/MWdurqSpxnqmHgqv/wEUWUC+O/DzJ+gXcIKz\nf4PQvlaAMGtSByvBoFAoEBQUhGnTpsHPz0+t9whx6y6qSTUHhyI0fMzZ0FPBwaEYd+48gjQ8CRdM\nfkVFj8twqLSFdfZ49Jp0FJs2jeHk3yDEr5VQa1IHK8EQGRmJ3r174x//+AcbpyMCFhfnCxOTQ8jI\n6ND0MWeY9CSOP80F3O4A90diJDMK+38W3s7TRH0aB8Ply5dx4sQJuLi4YPr06RCJRAgJCYG3tzcb\n9RGBkUgscfjw202/CRmGwUWjk4Db78BTF+DHfyHX9STPVRJNaRwMgwcPxq1bt9iohegYhmGwK/1z\nVPRKAe4PB378F1BtodVVlIQbtPKRvBKGYfDl9V1I+OMI/ta3P0R3vfG43ymtrqIk3KFgIG3GMAz2\n/vYFvss4jO4WPbB99CeQ+FvxXRZhEQUDUaKqV8Puy7tx6PY36GreDfE+OyExpVBobygYiJLWFjEd\nvLkf39z5f7A3d0C8z05Yd7DmtVaiHRQMeqylkUFLvRq+uXUQB37fBzszWzDHJ+Ddj69rtftTY333\n7r2GoqI7sLLqCWdnBWfdpvQZBYMea2lk8LKNYQ7f/gb7ftsD29dsYZzojxP/XaL0Pm3WBzDIzT2E\n336br7Xrkb9QMOixlkYGzXs1jPxIjj039sOmgw3ifXZi0b8yXvo+bdcHmGv1euQv1MFJj7XUxamx\nV0Ni4lhMktbh67v7Yd3BGvE+O2Fv7gAnp9KXvk/b9QFlWr0e+QuNGPSYqi5Ox+8m4F/XPoPEVIJt\nPp+gq0U3AMCuXZNQXa397k+N9d2//xqePs2AlZUjnJ0P0joJDogYhmFUH8Y+IT5cou2a2tqy/WU1\ncdX2/Yd7x/HJle2wNLHE9tGfokdHx1br4hvVpB5OH6Ii6mGjZTsXbd9/yfwJn1zZjk4mloj32flC\nKBD9QHMMHGKjZbu2274nPvgZOy7FoqNxR8T57EDPTk6snp/oBgoGDrHRsl2bbd+TshKx7eJWmBtb\nIM5nB3p1or6d+opuJTjERst2bbV9T36YhLiL0TAzMkes9w44W/Zh5bxEN1EwcIiNlu3aaPuekn0K\nWy9sgqlhB2z13o4+EhdWz090D91K6LmzOacRnboBJmJTxIzahr5W/fguiQgABYMeO597FptT18FI\nbIzoUfFws+7Pd0lEICgY9FTa41RsPL8WYpEhtoyMxYDO7nyXRASE5hh02KsudrqYl4YN59bAQGSA\nzSO3wsPGk4NqiS5hJRhSUlIQHR0NhmEwc+ZMfPDBB2yclqjwKoudruRfwrqzqwEAm0bEwLPLIO0X\nSnSOxsFQX1+PTZs2Yf/+/ejSpQtmzZqFsWPH0t6VHGjrYqf0gquIOhsBBgw2jojGINshr3zt5qOV\niIjBiIm5ojR64WoJN2GXxsFw/fp1ODo6omvXrgCAyZMnIykpiYKBAy/rm9CSG4XpWH0mHPVMHdZ7\nRWOo3XCNrt18tHLxYgxycyPQfPTCxRJuwj6NgyE/Px/29vZNr21tbXHjxg1NT0vUoO5ip9+e3EDk\nGSkU9bVY57UZw+3f0PjazUcrxcXd8LLRi7aXcBPt0DgYXvXhTCHu6adrNdnYWOD77+e3+v4b+Tew\nLnUV6kUKbHszHmOc2Fkp6eJS8cJoxdo6B9nZf712camEjY2F0nGNf64Nuvb9EzKNg8HOzg65ublN\nr/Pz89GlSxeV7xPi46jtraY7RbchTQlBlaISkcPXYYD5EFb+jTY2Fti0adQLPRkiI/0RHf3X68Y9\nK5sfp629LNvj908bOHvs2t3dHQ8fPkROTg5sbGzw448/YseOHZqelmjoj+IMrEpZiSpFJVYNi4JP\nd3abm7xsafaXXyo/nq2NJdxE+zQOBrFYjKioKCxcuBAMw
2DWrFk08cize/I/EJ6yAuW1ZZAOi8SY\nHmP5LonoGFbWMXh7e9MmtgJxNesa3j20GOWKGngUTcFgP80+fSD6iZZEtyMPnmXi798uwZPSDqj8\nZRvSDu6AVJrMd1lEB1EwtBMPS7IQJgtGWV0VcDoSuDMN2vh4sKhIjrlzv8X48UkIDDyK4mI5q+cn\nwkDPSrQD2aWPECYLhrxajv7yCbh0q3Gyj/1W67RgST9QMOi43LIchMmCUVRVhCWeSzHGbxykcu21\ndqcFS/qBgkGH5ZU/Ruip5XhS+QQfDvwIM/rMBgCt/gZvyzJsorsoGHRUfnkeQk8tR2FlId53/wCz\nXOZyct24OF+YmBxCRkYHrW42Q/hFwaCDCioKECpbjvyKfCwYsAhv9XuXs2tLJJY4fPhtwa3oI+yi\nTyV0TGFFIcJkwcgrz8M8t/fwjus8vksi7RCNGLSI7V4ETyufIkwWjNyyHLzt+nfMc3uPvWIJeQ4F\ngxax+dFecVURwmTByCnLxty+b2NB/0UQiUSq30jIK6BbCS1i66M9eVUxwmQheFT6ELNc5uB99w8p\nFIhWUTBoERvbyT2rlkOaEoKskgcI6DMLH3h8pJVQKCqSIzAwgVY0EgB0K6FVmm4nV1JdgvCUFch8\nlompvQOwZOBSrY0UaEUjeR4FgxZp0ougrKYUK34Kxz35PUzu5Y9/egZp9faBVjSS59GthACV1ZZh\n1elQ3Cq8hTedJiFo0AoYiLT7rdLmLtpE99CIQWDKa8sReToMd4puY7rbVCxxC9F6KADa20Wb6CYK\nBgGpqK3A6tNS3Hp6E2N7+GGtz1o8fVLOybWpBRt5Ht1KCESlohJRZ1fh96e/YXR3X4QNjeRkpEDI\ny9D/eQJQpahC1JkIXC9Mh3e30Vg1bA3EBmK+yyJ6TKNbibi4OCQnJ8PY2Bg9evRATEwMzM3N2apN\nL1TXVWPt2QikF17FiK6jEDE8ikKB8E6jEcPIkSPx448/4tixY3B0dMTu3bvZqkvrmi/oKSrifkFP\nTV0NNpxbg6sFV/CGgxdWD18HQwOa9iH80ygYvLy8YGDQcApPT0/k5eWxUhQXGhf0XLs2HceOzceS\nJT9zev2auhpsPB+Fi3kXMMxuOKLe2AAjsRGnNRDSEtbmGI4cOaJTLeSbL+jJzOTuFqi2rhabU9cj\n7XEqhtgNxTqvzTAWG3N2fUJUUTluXbBgAZ48eaL05yEhIfD19QUA7Nq1C0ZGRvD391f7wnzv6dd8\nT0UnpzJOalLUKxCZtAkXC87Dy/ENfDzhY5gYmrR4PN9fp5YIsS6qiT0i5lV3pf1TQkICDh8+jAMH\nDsDYWP3fenx3ACoulkMqTW5a0LN371TU1Wl30q+uvg7RaRuRkn0KHjae2DIyFqaGpi0eL8S9DwFh\n1kU1qYeTvStTUlKwZ88efP31120KBSFovqDHykq738R6ph6xF7YgJfsU3Dt7YPPIra2GAiF80igY\nNm/ejNraWixcuBAAMHDgQKxfv56NutqVeqYecRejkfwoCW7W/bF5ZCw6GHbguyxCWqRRMCQmJrJV\nR7tVz9Rj+6VYJGX9in5WrogeFY/XjF7juyxCWkUrH7WonqnHzsvbkPjgF7hI+iLGexvMjMz4LosQ\nlSgYtIRhGHx25WP8nPkjelv2wVbvbTA3olWhRDfQMjstYBgGn1/7BD/cPw5nS2fE+exAbVk9AsMT\nWOsYrS62O1UT/UDBwDKGYbAr/XMcv5sAp05OiPXeAQvjjggMT+CldRq1bCOvgm4lWMQwDL68vgsJ\nfxyBY8eeiPXegU4mDb+d+Wqd1vy6iYmgZq9EJRoxaKhxqP4gywL1g4+ifmA6nCx7Id7nY0hMrZqO\n42sz2ObXraw0wrFjb4FGDqQ1FAwaahqqD9kNiG/B/r4xjoTsfCEUAP5apzVeNzERqKw0AjAR1OyV\nqELBoIbWJvCysjoCg
/Y0/FfSDVbXpsM60lrpHHy1Tmu8bmDg0T9HCrR9PVGNgkENrU7gvX4CMEoH\nSh2AE/9Gbz9hLvqiZq+kLSgY1NDSxOHh29+g9vWLsM00QOfr09HbL1GwP3DU7JW0BQWDGl42cXgk\n4zD23NgNO3M7fLPsM9hHOPBdJiGsoWBoRePcwr17hnBwiIG1tQt69aqAzz9LsTv9K1h3sMY2n09g\nb06hQNoXCoZWPD+3ADAYOvQg/MOBz65+BYmpBNt8PkFXi26vdG5akUiEjIKhFc3nFtJrfseDq7/C\n0sQS23w+QTeL7q98blqRSISMVj624oX9HPseQ7H7f9DRpBPifXaiR0dHjc5Nm8gSIaMRQysaP+K7\nXnUHxR7fop+TA+K9d6BnJyeNz83XSkhC1EHB0AqJxBJvrTFD1oWf0dW4O+J9PkYvy96snFvVuoKi\nIjmWLv0BGRkdaA6CcK7dBgMbk3vJD5MQdzEarxmZYeuo7XC27MNafarWFdAcBOFTuw0GTX+wUrJP\nYeuFTTA17ICt3tvhYtVXa7W+DM1BED6xMvn41VdfoV+/fpDLhfMob8MP0jMA3wL4ATJZntqPGp/N\nOY3o1A0wEZsiZtQ29LNy1WapL/XCxCfNQRCOaTxiyMvLw7lz5+DgIKxFPg2Tez8BeBuACHL5FEil\nqkcN53PPYnPqOhiJjRE9Kh5u1v05qbe5uDhfmJgc+nOOgZ5tINzSeMQQHR0NqVTKRi2siovzhaVl\nFdoyHE97nIqN59dCLDLElpGxGNDZXet1tkQiscThw28jMXEsvvwygCYeCac0CoaTJ0/C3t4effty\ne/+tDonEEj4+Yqg7HE/NTsWGc2tgIDLA5pFb4WHjyUmdhAjRK+9dGRwcjN27d2Pv3r1Nf9aW3e64\n2NNv795pWLLkEDIzzeHkVIZdu6bCykr5uhdyLmDF/66AkZEYO9/ciWFdh2m9NnUJde9DIdZFNbHn\nlfeuzMjIwIIFC2BqagqGYZCfnw9bW1t89913sLZWblTSnKrt4Lh6liC94Coiz0ghNhQhatgmDLUb\nzvo1XpUQ9z4EhFkX1aQere9d6eLigrNnzza99vX1RUJCAjp16vSqp3wBF5/j3yhMx+oz4ahn6rBj\n3Mfo22Egq+cnRFex9qyESCRq062EKtr+HP/3J78h8owUivparP3bJozsMZLV8xOiy1gLhqSkJFha\nsjfU1+bn+Lee3kTE6VDU1FVj9Rvr8TeHEaydm5D2QLArH7XVo/BO0W2sOr0S1XVViBy+DqO6+bBy\nXkLaE8EGgzZ6FP5RnIFVKStRpahE+LA18OlOi4YIeRm96cdwT/4HwlNWoLy2DGFDI+Dbw4/vkggR\nLL0IhvvP7kEqW4GymlKsHLoKfo4T+C6JEEFr98Hw4FkmpLIVKKkpQfDgMEzoOZHvkggRvHYdDA9L\nshAmC8azajmWD1qJSb2m8F0SITpBsJOPz3uVVZDZpY8QJguGvFqOpa8vxxTnqRxVS4ju04lgaOsq\nyNyyHITJglFUVYQlnksxrfcMzmrVBmo1T7imE8HQllWQeeWPEXpqOZ5UPsEHHkswo89sTmrUJmrz\nRrimE3MM6q6CzC/PQ+ip5SisLMT77h9gdt+3OKtRm6jNG+GaTowY1FkFWVBRgDBZMPIr8vFe//fx\nVr93uS9US5q3mreze4LAwAS6tSBaoxPBoGoVZGFFIcJkwXhc/hjz3N7Du27zm/6uPdyfNw/Gmppa\nHDv2PujWgmiLTgRDa55WPkWYLBi5ZTl42/XvmOf23gt/3x7uz5sH4/jxSaBbC6JNOjHH0JLiqiKE\nyYKRU5aNuX3fxoL+iyASiV44pvn9eWIiEBh4VO2O0UJEHaSJtunsiKGitgJhshA8Kn2ImS6z8b77\nh0qhACjfn1dWGuHYsbegiyOHRtp68pSQRjobDNmlj5BV8gAz+szGhx7/fGkoAH/9ECU
mApWVRgAm\nQteH39p48pSQ5+nsrYSLVV8kTPsBSzyXthgKwF8/ROPHMwDeAmAJGn4T0jqdHTEAgLmx+h14afhN\niPp0OhjagobfhKhP41uJgwcP4s0334S/vz+2bdvGRk2EEJ5pNGJIS0tDcnIyfvjhBxgaGqKoqIit\nugghPNJoxPDtt98iMDAQhoYN+WJlZcVKUYQQfmkUDA8ePMClS5cwZ84czJs3Dzdu3GCrLkIIjzTa\nu7Kurg4lJSX4z3/+g+vXryM4OBhJSUlqXViIe/pRTeoTYl1UE3tUBsO+ffta/LtDhw5h/PjxAAAP\nDw8YGBiguLgYEolE5YWFuKcf1aQeIdZFNalH3aDS6FbCz88P58+fBwBkZmZCoVCoFQqEEGHT6FOJ\nGTNmIDIyEv7+/jAyMkJsbCxbdRFCeKRRMBgZGSE+Pp6tWgghAqGzz0oQQrSHgoEQooSCgRCihIKB\nEKKEgoEQokRvHrtuD9pDx2uiGygYdEh76HhNdAPdSugQ2pGKcIWCQYdQ23jCFbqV0CHUt5JwhYJB\nh1DfSsIVupUghCihYCCEKKFgIIQooWAghCihYCCEKKFgIIQooWAghCjRKBhu376NuXPnYvr06Zg1\naxbtK0FIO6FRMMTHx2PZsmX4/vvvsWzZMsTFxbFVFyGERxoFg0gkQmlpQ9/80tJS2NraslIUIYRf\nGi2JjoiIwKJFixAbGwuGYXDo0CG26iKE8OiVt6gLCQnBuXPnsHr1avj5+eGXX35BZGRkqztXEUJ0\ng4hhGEb1YS83ZMgQXLp0qen14MGDcfnyZVYKI4TwR6M5BltbW1y4cAEAcP78efTs2ZONmgghPNNo\nxHDlyhVs3rwZ9fX1MDExwbp16+Dm5sZmfYQQHmgUDISQ9olWPhJClFAwEEKUUDAQQpTwHgxfffUV\n+vXrB7lczncpiIuLw8SJEzFt2jQsW7YMZWVlvNWSkpKCN998ExMmTMAXX3zBWx2N8vLyMH/+fEya\nNAn+/v44cOAA3yU1qa+vR0BAABYvXsx3KU1KS0sRFBSEiRMnYvLkyUhPT+e7JOzfvx9TpkyBv78/\nVq5ciZqampYPZnj0+PFjZuHChcyYMWOY4uJiPkthGIZhzp49y9TV1TEMwzDx8fHMtm3beKmjrq6O\n8fPzY7Kzs5mamhpm6tSpzN27d3mppVFBQQFz8+ZNhmEYpqysjBk/fjzvNTXat28fs3LlSubDDz/k\nu5Qm4eHhzJEjRxiGYZja2lqmtLSU13ry8vIYX19fprq6mmEYhlm+fDmTkJDQ4vG8jhiio6MhlUr5\nLOEFXl7UuJp4AAADF0lEQVReMDBo+JJ4enoiLy+PlzquX78OR0dHdO3aFUZGRpg8eTKSkpJ4qaWR\njY0NXF1dAQBmZmZwdnZGQUEBrzUBDSMZmUyG2bNn811Kk7KyMly6dAkzZ84EABgaGsLc3JznqhpG\nVpWVlVAoFKiqqkKXLl1aPJa3YDh58iTs7e3Rt29fvkpo1ZEjR+Dt7c3LtfPz82Fvb9/02tbWVhA/\nhI2ys7Nx+/ZteHh48F1K0y8XkUik+mCOZGdnQyKRICIiAgEBAYiKikJVVRWvNdna2mLBggUYPXo0\nvL29YWFhAS8vrxaP1+q+Ei09ZxEcHIzdu3dj7969TX/GcLScorVnP3x9fQEAu3btgpGREfz9/Tmp\nqTmuvhavory8HEFBQYiMjISZmRmvtZw6dQqdO3eGq6sr0tLSeK3leQqFAjdv3sTatWvh7u6OLVu2\n4IsvvkBQUBBvNZWUlCApKQnJycmwsLBAUFAQTpw40eL/41oNhpYeqMrIyEBOTg6mTZsGhmGQn5+P\nmTNn4rvvvoO1tbU2S1L5kFdCQgJkMhmvk2t2dnbIzc1tep2fn9/qsI8rCoUCQUFBmDZtGvz8/Pgu\nB1euXMHJkychk8lQXV2N8vJySKVS3vuC2NnZwc7
ODu7u7gCACRMmYM+ePbzWdO7cOXTv3h2Wlg27\no48bNw5Xr15t+ZcfJzMfKowZM4aRy+V8l8HIZDJm0qRJTFFREa91KBSKpsnH6upqQUw+MgzDhIWF\nMdHR0XyX8VJpaWmCmnx89913mfv37zMMwzCfffYZExcXx2s96enpzJQpU5iqqiqmvr6eCQ8PZ77+\n+usWjxfEFnUikUgQw+fNmzejtrYWCxcuBAAMHDgQ69ev57wOsViMqKgoLFy4EAzDYNasWXB2dua8\njuddvnwZJ06cgIuLC6ZPnw6RSISQkBDe5mGEbs2aNQgNDYVCoUD37t0RExPDaz0eHh6YMGECpk+f\nDkNDQ7i5uWHOnDktHk/PShBClPC+wIkQIjwUDIQQJRQMhBAlFAyEECUUDIQQJRQMhBAlFAyEECUU\nDIQQJf8fkI+nnJy5GlIAAAAASUVORK5CYII=\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xa5aa650\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "weights = np.polyfit(X[0], X[1], 1)\n",
-        "plt.figure(figsize=(4,4))\n",
-        "plt.scatter(X[0], X[1])\n",
-        "line_x_range = (-3, 5)\n",
-        "plt.plot(line_x_range, [weights[1] + a * weights[0] for a in line_x_range], \"g\", alpha=0.8)\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "VYUr2uPA9ah8"
-      },
-      "source": [
-        "Remember that our simple network looks like this:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "cellView": "form",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 898,
-          "status": "ok",
-          "timestamp": 1474671837740,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "gt8UuSQA9frA",
-        "outputId": "6eb7616b-25a9-4845-aeab-7472201c60f6"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJYAAABkCAYAAABkW8nwAAAO90lEQVR4Xu2dT5Dc1J3Hv+YQT8VJ\nZUhVdprLWs4FTSrGGv4ql9CuHBCH4GaTFCLZwnIcjOAy8l6Q/1SlU4XHcg6xJgtY2OOik2KxSGoT\nGWrXzYFC2T2MDAtWitRavmQ0e9k2SYGowom4hNRPtqA9TE+rW3/cPfPepcfup6f3fu/Tv9/T+/PV\npo8//vhjsMQsULAFNjGwCrYoKy6xAAOLgVCKBRhYpZiVFcrAYgyUYgEGVilmZYUysBgDpViAgVWK\nWVmhDCzGQCkWGEuwrly5gtf++zW887/vYOn/lnD5T5cT40x9ZQrb/nEbxDtFiHeI2LJlSylGY4X2\nt8BYgUVAvfzqy3i5/TI+vPLhmq37wpYv4AHpATxw3wMMsP4cFJ5jbMAiqA4eOYg/Lv8xMcL26e34\n+vTXk8+vbv1q8n/03TsX38EfLv4h+aRE380dmmNwFY7O2gWOBVgE1Y/2/yjxUls+vwXaY1oS7tZK\n3v94MJ8zceUvV0Dea+H4AoOrQrhGHqxuT0Xjp0P7D2HqH6Yymejyu5dx5PiRZBxGnmt+bj7TdSxT\nfgv0ASuAzglwmyE8pfbZu3VaEDkDdT+AweevzGolvPjvL+LMb84knmr+yHxmqNKyCK7ZQ7OJ5yIo\n+3m6clqx8UrNB1bso2W64FQN9cnijdcdAvNAQWGRPBcLicX3Ua8S84FVcj3PnjuLhRcWkgH63OG5\nXHc7+NTBZEBP47NvffNbucpiF/e3QCaw2g0NfNvES5c+wtQ9u2G0LCj8BLAiFEaeBU0zYJ9fxkfY\njKl7FZgtCzIHIA7QUmXov/g9LmMztt6rwLBMyFROj3TkZ0fgveXh4X96GN//zvf7t2aNHGlI7VlW\n0pYmRC+AKUwAsQu5thOuvIjQEjGBGJ7CQYptdOw6etc6VzXXzcUZwJrGseWt2P28DV2I4OgyDgQK\nFgMTYtQ1xqq10eDuR6j8Fi1NxGTkwpAfRos7h05bQscQIFgibEeHMBHCVhs4EBtY8lQQd6ulvbN7\n8e6f302mC7Z/bXsuo9NkKk1X9PZ+IUyeR0sN4GscYl8DPzOP5VuPYynQwMU+dL4O3wzRbpQQ93O1\nbvQuzgRWS0p/tQA6Nuqcilq7A5u3Px28T7qw7BB1VUHqhEKTB2+pCAIVHZVD3dPgujpE6peOBzes\nQRS5nr/+b//g24nF7JN27qkCGq/J++RknHXm5JlVeiKGr/MQPQMdV0ZkCRBbNUwEMYzQhRyZEHgH\nOv29ynPM6HXtja1Rf7B4AZ7RgZv+SuMAOj+NtrYEX3avfyqMfDi2DdcLEAQBvPOX8MGtR3Ex0MEF\nJiRxP373wWZsvaeBhixDVRrg1/jxlwEWPV3ap+xVrR57Cjgpht2xEDV4mLIFvqkiaoUwwzp4U4Hv\n9/awN7YrR+vuGcAS4ZsdtKV0VNEFVqMLrIkWJGEPPP4hKA0RgiCAc1XsdJQErGQ2Ig7hOQ5sx4Hz\n0u+wvHX2akjtMWCpNhQCiCicq+AcCx1Fh9B2IegcNN6B4Teg1z0EeknzKqPFRe7a9AeLm4ajXvzU\noJEDqUahMESrKxSqbQHbDBGLoXUNlBiuUsNOT8fFQEVsNdHmdOjStTgSGOCnLTQuBDBosLxKqnTw\nntw/glPnoHMS4E6iFVjgbBGcwUGMPAjtawP73GZf/wVkAutYtAvPezYUPoKjipBdGZ5vQOgavGte\nHbfsiXD09TZUIUbg6JD3vITlrU/iYthErPOYaQk44ZhocDF8U0HDqsEOHfQaC7/2X68lyzJVTjd0\nWiJu2XMem++7+tAxSd52+hguTe3GYtjq6V3XPyqDtbA/WLyAtqRg0rHhLceo3avCsk0kjqd7uoEL\n0FJkaC/9Hh/gS9ixS0dTCaD
KHVidNhoTNN2gQP/FedAmly/t2IWm2YK2xswqDbj3antzz5oToD/9\n15/i5smbcdo8vfaDQGiC37YfEyeW4KtcMu2g1HbCrp9Dx5Fw3ZCw04ZSb0Jse6CsLH1qgZFfK0zn\nn+hpznzKHGpJRzus4YJ/AX/78G94ofUC7r777pwMxAhdE6pyAK8u78CJJZ+BtcKiIw8Wea0DTx34\nZCH5oHYwM1y0TjhnziXbaWgB+4cP/RCPPfYYtm/fjpMnT+Kmm24aDrDYhdpoQdAbaMtNSB4Da6Uh\nRx4sqnB3SCTPNbtvtu9iMoU/Wg5Kt9p0h8DTp09j3759ePrpp/H4448PB1fylOtC5jTUGVifseFY\ngJXClXou+jcN6Gk2nj7JG1Gi7TG0Hkiz7OlGP/ru6OGjq46rnnjiCSwuLibe66677hocMAZWT5uN\nDVgpXGfbZ5OtybQNZq1EE6G0NXmXtGvNwbrv+4n3uu222wYPjwys9QFW2goKjbQ4Tdth6CAFeSpK\n5J3oQMUwhynS8PjMM89AVdVs3ouBtb7Aytbrw+WiMZfnednCIwOLgTUIZml43LFjB5577rnhnx4H\nuek6yztWY6yqbb+wsJBMTwwUHquu5Ijej4GVoWMoPJ4/fz7xXkM9PWa4x3rLwsDK2KMXLlxIvBeF\nR5qe2LRpU8YrN2Y2BtaA/U7hkaYnnn322exPjwPeYz1kZ2AN2YtpeCTvdeeddw5Zyvq9jIGVo28p\nPJL3ok2NLDxeb0gGVg6w0kvT8HjixIlkHJY1lauaE8GRangwsvD/noKqt+kzsLJSkCEfzdi/8cYb\nifdaKzxWoppDmxJ5FT54NH06YZShAQVmYWAVaEwqKg2PMzMzyfTEyqfHqlRzAoOH6OqwJnXoNQeB\nSWcjq0sMrJJsferUqSQsdofHylRzYg8aLyG0QtiTOvhGhFZglyKD0Mt8DKySwEqLpfD45ptvYn5+\nHr/+z19/sukwj2pOP72vyJXBy4BNME340Pg6AiNAu8IDkQysksGi4t9++2189wffxee++DkIO4Tc\nqjlrSw504Eg81FobYetq+KOwKDgagjVOnRdtBgZW0RZdpbw0BL73/nv4yZM/6bv7tVeVxkk1h4FV\nAVgbUTWHgVUBWGUcvCVV6EP/cuiztQ9NCNsMiIshrPSIeaK3oUNIlXQqaDMDqwIjlyEV0Fv6MoQl\nbENT/FTIhWSXOF2AF5jocei8cCswsAo36WcLLEPchO7yyr+9smrt6TQ3geQmcgcd2CQbIHoIDKGy\nuSwG1joEi06oU+jj3RAWR2HQgFiiTuxqJmRgVQBWGaGQDo78/OjPe9T+qpfSeBeeqIM3JPip4k8F\n7aVbMLAqMHSlg/dr7YkcCZxWg1Jz0G5UL7/EwKoArBuhmoNEbupBvPrRDhxf8qFVLFrCwKoArFQi\n4P3o/VwTpCmgdBi3r2oOIrQbNdwfGljytZ46r2U1n4FVlmW7yn3rrbfwvX/+XrKkMyPM5FLNIS2K\nbCrSNI8loKX48G6AxhIDq2SwaIcDgWWaJn71H78qRDWnlxbF1aaQxJILj6TRjRhm0L4hYrwMrJLA\nos1+BBXtyaLty5SKVs1Zverx1RB4dhIPPe/CVioeXF2rFAOrYLDIOxFQd9xxRwLVytSt90XfFaGa\nU3ATCimOgVWIGa8WkoY9AorA6pUIrqJVcwpsRiFFMbAKMONqYS9LsWWo5mS5bxV5GFg5rExhj8ZP\ndHBitbCXo+ixv5SBNWQXpmGPvNXtt98+ZCnr9zIG1oB9O2zYG/A2Y5+dgZWxC1nYy2goNt2Q3VA0\njqIDESzsZbcZ81hr2CoNe/T56KOPZrcqy8m2zazGAAt7+X8ZzGOtsCELe/mhohLGEqwyVFpY2CsG\nqLSUsQKrDJUWFvaKBWrswCpDpYWFvXKgKiYUxh5U/huwhd8idBqYRARX4bHTldd8Le8gTSpap
YWW\nX0is47qnveTdi02I6aFOejlAbSdcOT2fF8NTOEixDTqnV6Uk0CC2GpW8hYTCyFXA72yj8XoAAzoE\n+nsxgNnrZc8DtL7bU9HJlDwqLY9855FkbY8ktS3LWlGLECbPo6UG8DUOsa+Bn5nH8q3HsRRo4GIS\nL6vDN0O0e70SdoB2rfeshYBF71Juyzzu90TcF59FIC8WJvSVvgiT9nnPH5nP/K7CtOPonYWzh2aT\nF2Fu+usmvPjLF3us7cXwdR6iZ6DjyogsAWKrhokghhG6kCMTAu9Ap7+r1l0cQwoLAote4+ugwT+I\nsxO78XrQKkTkqzsEkqeily8Nk0il5cfHfowv3/xlLBxf6Pk2sNhTwEkx7I6FqMHDlC3wTRVRK4QZ\n1sGbCnxfrfxgwjBtvtHXFAZW7OsQZo7hEm7Fkxf8nm+mH6TBlau0RG00OBWcY6Gj6BDaLgSdDn46\nMPwG9Hr15/MGsdco5S0GrDiAIU7D5M/AgIo9gY6Lng4+5wi3jIOea59wieCQzgEnAe4kWoEFzhbB\nGRzEyIPQDmBWpaoxSpQMUZdCwCLh1OlmDWcCBzJsSNzDiIyL8LR8Ur1lHE2nPeZzh+d6mooENW7Z\ncx6b7zuHTlvCJB1Nnz6GS1O7sUhKxDl/LEP00Vhekh8sUjThNUyYAdxr59dCSwSvAWbg5Xq7exkq\nLfRO6TMnz/TurNAEv20/Jk4swaf2xC6U2k7Y9XPoOBIm6crYh6UoaLodABOoSU3YlpLbQ48lQT0q\nnR+sEq1RBlj0dGmfsnPVOtB51IMmfEdGLQ7RkkSYkps8VbJ01QIjDdaNCIVZwOi4DnxOgsRRXIzh\nazwakY3gmphsljLWe56RBqv6wfvg3R0HFqS6CcHxC5kQHrwGo3nFSIN1Q1RaBuinyDchSyYmDRct\nhWPLPF22G2mwuo+k55kgHUylJRtZoa1A0kI0bAdGPRnSszQuYFE90yUdepoznzKHWtLRDmsglZY8\ncHZTE7UVCGqEpmtDScZZLK20wEh7LKpst9YBKQUf1A5mhovWCefMuU9eM9JbWnEQMAIY/DQOXLr+\nmqmHXkfIdj18YpSRByuFa6+2F1f+cgXkuWb3zfZdN6Twt/DCQuKpsgmVDQIXy9vPAmMB1krPRf9e\nryot/TpsXL4fG7BSuNa7Ssu4gNOvnmMFVtqY9azS0q/DxuX7sQRrXIy7kevJwNrIvV9i2xlYJRp3\nIxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3Ixf9d0NIelzdt4X5\nAAAAAElFTkSuQmCC\n",
-            "text/plain": [
-              "\u003cIPython.core.display.Image at 0xbde1fd0\u003e"
-            ]
-          },
-          "execution_count": 6,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from IPython.display import Image\n",
-        "import base64\n",
-        "Image(data=base64.decodestring(\"iVBORw0KGgoAAAANSUhEUgAAAJYAAABkCAYAAABkW8nwAAAO90lEQVR4Xu2dT5Dc1J3Hv+YQT8VJZUhVdprLWs4FTSrGGv4ql9CuHBCH4GaTFCLZwnIcjOAy8l6Q/1SlU4XHcg6xJgtY2OOik2KxSGoTGWrXzYFC2T2MDAtWitRavmQ0e9k2SYGowom4hNRPtqA9TE+rW3/cPfPepcfup6f3fu/Tv9/T+/PVpo8//vhjsMQsULAFNjGwCrYoKy6xAAOLgVCKBRhYpZiVFcrAYgyUYgEGVilmZYUysBgDpViAgVWKWVmhDCzGQCkWGEuwrly5gtf++zW887/vYOn/lnD5T5cT40x9ZQrb/nEbxDtFiHeI2LJlSylGY4X2t8BYgUVAvfzqy3i5/TI+vPLhmq37wpYv4AHpATxw3wMMsP4cFJ5jbMAiqA4eOYg/Lv8xMcL26e34+vTXk8+vbv1q8n/03TsX38EfLv4h+aRE380dmmNwFY7O2gWOBVgE1Y/2/yjxUls+vwXaY1oS7tZK3v94MJ8zceUvV0Dea+H4AoOrQrhGHqxuT0Xjp0P7D2HqH6Yymejyu5dx5PiRZBxGnmt+bj7TdSxTfgv0ASuAzglwmyE8pfbZu3VaEDkDdT+AweevzGolvPjvL+LMb84knmr+yHxmqNKyCK7ZQ7OJ5yIo+3m6clqx8UrNB1bso2W64FQN9cnijdcdAvNAQWGRPBcLicX3Ua8S84FVcj3PnjuLhRcWkgH63OG5XHc7+NTBZEBP47NvffNbucpiF/e3QCaw2g0NfNvES5c+wtQ9u2G0LCj8BLAiFEaeBU0zYJ9fxkfYjKl7FZgtCzIHIA7QUmXov/g9LmMztt6rwLBMyFROj3TkZ0fgveXh4X96GN//zvf7t2aNHGlI7VlW0pYmRC+AKUwAsQu5thOuvIjQEjGBGJ7CQYptdOw6etc6VzXXzcUZwJrGseWt2P28DV2I4OgyDgQKFgMTYtQ1xqq10eDuR6j8Fi1NxGTkwpAfRos7h05bQscQIFgibEeHMBHCVhs4EBtY8lQQd6ulvbN78e6f302mC7Z/bXsuo9NkKk1X9PZ+IUyeR0sN4GscYl8DPzOP5VuPYynQwMU+dL4O3wzRbpQQ93O1bvQuzgRWS0p/tQA6Nuqcilq7A5u3Px28T7qw7BB1VUHqhEKTB2+pCAIVHZVD3dPgujpE6peOBzesQRS5nr/+b//g24nF7JN27qkCGq/J++RknHXm5JlVeiKGr/MQPQMdV0ZkCRBbNUwEMYzQhRyZEHgHOv29ynPM6HXtja1Rf7B4AZ7RgZv+SuMAOj+NtrYEX3avfyqMfDi2DdcLEAQBvPOX8MGtR3Ex0MEFJiRxP373wWZsvaeBhixDVRrg1/jxlwEWPV3ap+xVrR57Cjgpht2xEDV4mLIFvqkiaoUwwzp4U4Hv9/awN7YrR+vuGcAS4ZsdtKV0VNEFVqMLrIkWJGEPPP4hKA0RgiCAc1XsdJQErGQ2Ig7hOQ5sx4Hz0u+wvHX2akjtMWCpNhQCiCicq+AcCx1Fh9B2IegcNN6B4Teg1z0EeknzKqPFRe7a9AeLm4ajXvzUoJEDqUahMESrKxSqbQHbDBGLoXUNlBiuUsNOT8fFQEVsNdHmdOjStTgSGOCnLTQuBDBosLxKqnTwntw/glPnoHMS4E6iFVjgbBGcwUGMPAjtawP73GZf/wVkAutYtAvPezYUPoKjipBdGZ5vQOgavGteHbfsiXD09TZUIUbg6JD3vITlrU/iYthErPOYaQk44ZhocDF8U0HDqsEOHfQaC7/2X68lyzJVTjd0WiJu2XMem++7+tAxSd52+hguTe3GYtjq6V3XPyqDtbA/WLyAtqRg0rHhLceo3avCsk0kjqd7uoEL0FJkaC/9Hh/gS9ixS0dTCaDKHVidNhoTNN2gQP/FedAmly/t2IWm2YK2x
swqDbj3antzz5oToD/915/i5smbcdo8vfaDQGiC37YfEyeW4KtcMu2g1HbCrp9Dx5Fw3ZCw04ZSb0Jse6CsLH1qgZFfK0znn+hpznzKHGpJRzus4YJ/AX/78G94ofUC7r777pwMxAhdE6pyAK8u78CJJZ+BtcKiIw8Wea0DTx34ZCH5oHYwM1y0TjhnziXbaWgB+4cP/RCPPfYYtm/fjpMnT+Kmm24aDrDYhdpoQdAbaMtNSB4Da6UhRx4sqnB3SCTPNbtvtu9iMoU/Wg5Kt9p0h8DTp09j3759ePrpp/H4448PB1fylOtC5jTUGVifseFYgJXClXou+jcN6Gk2nj7JG1Gi7TG0Hkiz7OlGP/ru6OGjq46rnnjiCSwuLibe66677hocMAZWT5uNDVgpXGfbZ5OtybQNZq1EE6G0NXmXtGvNwbrv+4n3uu222wYPjwys9QFW2goKjbQ4Tdth6CAFeSpK5J3oQMUwhynS8PjMM89AVdVs3ouBtb7Aytbrw+WiMZfnednCIwOLgTUIZml43LFjB5577rnhnx4Huek6yztWY6yqbb+wsJBMTwwUHquu5Ijej4GVoWMoPJ4/fz7xXkM9PWa4x3rLwsDK2KMXLlxIvBeFR5qe2LRpU8YrN2Y2BtaA/U7hkaYnnn322exPjwPeYz1kZ2AN2YtpeCTvdeeddw5Zyvq9jIGVo28pPJL3ok2NLDxeb0gGVg6w0kvT8HjixIlkHJY1lauaE8GRangwsvD/noKqt+kzsLJSkCEfzdi/8cYbifdaKzxWoppDmxJ5FT54NH06YZShAQVmYWAVaEwqKg2PMzMzyfTEyqfHqlRzAoOH6OqwJnXoNQeBSWcjq0sMrJJsferUqSQsdofHylRzYg8aLyG0QtiTOvhGhFZglyKD0Mt8DKySwEqLpfD45ptvYn5+Hr/+z19/sukwj2pOP72vyJXBy4BNME340Pg6AiNAu8IDkQysksGi4t9++2189wffxee++DkIO4TcqjlrSw504Eg81FobYetq+KOwKDgagjVOnRdtBgZW0RZdpbw0BL73/nv4yZM/6bv7tVeVxkk1h4FVAVgbUTWHgVUBWGUcvCVV6EP/cuiztQ9NCNsMiIshrPSIeaK3oUNIlXQqaDMDqwIjlyEV0Fv6MoQlbENT/FTIhWSXOF2AF5jocei8cCswsAo36WcLLEPchO7yyr+9smrt6TQ3geQmcgcd2CQbIHoIDKGyuSwG1joEi06oU+jj3RAWR2HQgFiiTuxqJmRgVQBWGaGQDo78/OjPe9T+qpfSeBeeqIM3JPip4k8F7aVbMLAqMHSlg/dr7YkcCZxWg1Jz0G5UL7/EwKoArBuhmoNEbupBvPrRDhxf8qFVLFrCwKoArFQi4P3o/VwTpCmgdBi3r2oOIrQbNdwfGljytZ46r2U1n4FVlmW7yn3rrbfwvX/+XrKkMyPM5FLNIS2KbCrSNI8loKX48G6AxhIDq2SwaIcDgWWaJn71H78qRDWnlxbF1aaQxJILj6TRjRhm0L4hYrwMrJLAos1+BBXtyaLty5SKVs1Zverx1RB4dhIPPe/CVioeXF2rFAOrYLDIOxFQd9xxRwLVytSt90XfFaGaU3ATCimOgVWIGa8WkoY9AorA6pUIrqJVcwpsRiFFMbAKMONqYS9LsWWo5mS5bxV5GFg5rExhj8ZPdHBitbCXo+ixv5SBNWQXpmGPvNXtt98+ZCnr9zIG1oB9O2zYG/A2Y5+dgZWxC1nYy2goNt2Q3VA0jqIDESzsZbcZ81hr2CoNe/T56KOPZrcqy8m2zazGAAt7+X8ZzGOtsCELe/mhohLGEqwyVFpY2CsGqLSUsQKrDJUWFvaKBWrswCpDpYWFvXKgKiYUxh5U/huwhd8idBqYRARX4bHTldd8Le8gTSpapYWWX0is47qnveTdi02I6aFOejlAbSdcOT2fF8NTOEixDTqnV6Uk0CC2GpW8hYTCyFXA72yj8XoAAzoE+nsxg
NnrZc8DtL7bU9HJlDwqLY9855FkbY8ktS3LWlGLECbPo6UG8DUOsa+Bn5nH8q3HsRRo4GISL6vDN0O0e70SdoB2rfeshYBF71Juyzzu90TcF59FIC8WJvSVvgiT9nnPH5nP/K7CtOPonYWzh2aTF2Fu+usmvPjLF3us7cXwdR6iZ6DjyogsAWKrhokghhG6kCMTAu9Ap7+r1l0cQwoLAote4+ugwT+IsxO78XrQKkTkqzsEkqeily8Nk0il5cfHfowv3/xlLBxf6Pk2sNhTwEkx7I6FqMHDlC3wTRVRK4QZ1sGbCnxfrfxgwjBtvtHXFAZW7OsQZo7hEm7Fkxf8nm+mH6TBlau0RG00OBWcY6Gj6BDaLgSdDn46MPwG9Hr15/MGsdco5S0GrDiAIU7D5M/AgIo9gY6Lng4+5wi3jIOea59wieCQzgEnAe4kWoEFzhbBGRzEyIPQDmBWpaoxSpQMUZdCwCLh1OlmDWcCBzJsSNzDiIyL8LR8Ur1lHE2nPeZzh+d6mooENW7Zcx6b7zuHTlvCJB1Nnz6GS1O7sUhKxDl/LEP00Vhekh8sUjThNUyYAdxr59dCSwSvAWbg5Xq7exkqLfRO6TMnz/TurNAEv20/Jk4swaf2xC6U2k7Y9XPoOBIm6crYh6UoaLodABOoSU3YlpLbQ48lQT0qnR+sEq1RBlj0dGmfsnPVOtB51IMmfEdGLQ7RkkSYkps8VbJ01QIjDdaNCIVZwOi4DnxOgsRRXIzhazwakY3gmphsljLWe56RBqv6wfvg3R0HFqS6CcHxC5kQHrwGo3nFSIN1Q1RaBuinyDchSyYmDRcthWPLPF22G2mwuo+k55kgHUylJRtZoa1A0kI0bAdGPRnSszQuYFE90yUdepoznzKHWtLRDmsglZY8cHZTE7UVCGqEpmtDScZZLK20wEh7LKpst9YBKQUf1A5mhovWCefMuU9eM9JbWnEQMAIY/DQOXLr+mqmHXkfIdj18YpSRByuFa6+2F1f+cgXkuWb3zfZdN6Twt/DCQuKpsgmVDQIXy9vPAmMB1krPRf9eryot/TpsXL4fG7BSuNa7Ssu4gNOvnmMFVtqY9azS0q/DxuX7sQRrXIy7kevJwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3IxfNwNrIvV9i2xlYJRp3Ixf9d0NIelzdt4X5AAAAAElFTkSuQmCC\".encode('utf-8')), embed=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Ft95NDUZy4Rr"
-      },
-      "source": [
-        "That's equivalent to the function $\\hat{y} = w_2 x + w_1$. What we're trying to do is find the \"best\" weights $w_1$ and $w_2$. That will give us that green regression line above.\n",
-        "\n",
-        "What are the best weights? They're the weights that minimize the difference between our estimate $\\hat{y}$ and the actual y. Specifically, we want to minimize the sum of the squared errors, so minimize $\\sum{(\\hat{y} - y)^2}$, which is known as the *L2 loss*. So, the best weights are the weights that minimize the L2 loss."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "RHDGz_14vGNg"
-      },
-      "source": [
-        "## Gradient descent\n",
-        "\n",
-        "What gradient descent does is start with random weights for $\\hat{y} = w_2 x + w_1$ and gradually moves those weights toward better values.\n",
-        "\n",
-        "It does that by following the downward slope of the error curves. Imagine that the possible errors we could get with different weights as a landscape. From whatever weights we have, moving in some directions will increase the error, like going uphill, and some directions will decrease the error, like going downhill. We want to roll downhill, always moving the weights toward lower error.\n",
-        "\n",
-        "How does gradient descent know which way is downhill? It follows the partial derivatives of the L2 loss. The partial derivative is like a velocity, saying which way the error will change if we change the weight. We want to move in the direction of lower error. The partial derivative points the way.\n",
-        "\n",
-        "So, what gradient descent does is start with random weights and gradually walk those weights toward lower error, using the partial derivatives to know which direction to go."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "W7SgnPAWBX2M"
-      },
-      "source": [
-        "## The code again\n",
-        "\n",
-        "Let's go back to the code now, walking through it with many more comments in the code this time:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 548,
-          "status": "ok",
-          "timestamp": 1474671838303,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "4qtXAPGmBWUW",
-        "outputId": "841884bc-9a23-4627-cdec-8e7d4261a3c0"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl4AAAESCAYAAAAsU9sMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xl8E3X+P/DX5GzaND3TlNJSzlJA2ir1QuUUELRyiOiu\ni4ooKIoX6v7EXXCV1V2Q/equawFFEGR1FWVVqIJWW+QoiAot900pkPRuk6ZHjvn9URoLFHqmk+P1\nfMijZJKZz3ukHd585jPvtyCKoggiIiIicjuZ1AEQERER+QsmXkRERESdhIkXERERUSdh4kVERETU\nSZh4EREREXUSJl5EREREnaRDEq+5c+di8ODBSEtLc22rqKjAQw89hDFjxmD69Okwm80dMRQRkVsZ\njUbcf//9GDduHNLS0rBq1SoAV76mLViwAKNHj8b48eNx4MABqUInIi/QIYnXpEmTsHz58gu2LVu2\nDDfeeCM2btyI66+/HkuXLu2IoYiI3Eoul+PFF19ERkYGPv74Y6xZswbHjh277DUtOzsb+fn52LRp\nE1555RXMnz9f4jMgIk/WIYlXamoqdDrdBdsyMzMxceJEAMDEiRPx3XffdcRQRERupdfr0a9fPwBA\nUFAQevXqBZPJdMk1LTMzE0D9tW7ChAkAgOTkZJjNZhQXF0sTPBF5PLet8SotLUVkZCSA+gtZWVmZ\nu4YiInKLgoICHDx4EMnJySgpKbngmlZaWgoAKCwsRHR0tGsfg8EAk8kkSbxE5Pm4uJ6IqAlVVVV4\n8sknMXfuXAQFBUEQhCY/11TXtct9lojIbYlXRESEa7q9qKgI4eHhze7DtpFE5AnsdjuefPJJjB8/\nHrfeeiuAy1/TDAYDjEaja1+j0YioqKhmx+D1jsg/KTrqQBdfREaMGIHPP/8cM2bMwLp16zBy5Mhm\njyEIAoqKPOPpR70+mLF4aByA58TiKXEAnheLN5s7dy569+6NBx54wLXtcte0kSNHYs2aNRg3bhx2\n794NnU7nuiV5JZ50vWsLT/p+aytvPwdvjx/w/nNoy7WuQxKvOXPmYMeOHSgvL8ewYcMwe/ZszJgx\nA0899RQ+++wzxMTE4K233uqIoYiI3Ornn3/GV199hYSEBEyYMAGCIOCZZ57BI488gqeffvqSa9rQ\noUORnZ2NUaNGQaPR4PXXX2/RONYamztPg4g8lCB62Hy3p2S+npSFe0osnhIH4DmxeEocgOfFQlf2\nxeZjGNyv+VuSnsqTvt/aytvPwdvjB7z/HNpyrePieiIiCZRV1kgdAhFJgIkXEZEEqmrsUodARBJg\n4kVEJIGqaq7xIvJHTLyIiCTAxIvIPzHxIiKSABMvIv/ExIuISAIWJl5EfomJFxGRBKpYx4vILzHx\nIiKSAG81EvknJl5ERBKw2Z2w2R1Sh0FEnYyJF5EP+MW0C18d+krqMKiVrKzlReR3mHgReblfTT9j\nzYHVyCnIuaRZPXk2ay0TLyJ/w8SLyIsdLj2Ejw5+iAC5Go9d+xgEQZA6JGoFzngR+R8mXkReKr/y\nFFbuew+CIGD6wBmI1cVKHRK1Eme8iPwPEy8iL1RoLcR7eUtR57DhD/0fRM/Q3lKHRG3AGS8i/8PE\ni8jLVNSW493cdFTZqjA5YQoGRiZJHRK1EWe8iPwPEy8iL2K1WfFu7hKU1pRibI9xuCFmsNQhUTtY\nWUSVyO8w8SLyEjaHDSv2vodzVedwc9dbMLLbaKlDonbijBeR/2HiReQFnKITqw+sxPGKY0iJuhoT\net/FJxh9QDXXeBH5HSZeRB5OFEWsPfxf7Cveiz5hffC7xD8w6fIRnPEi8j9MvIg83DcnM7DjXA7i\nguPw4ICHoZAppA6JOgifaiTyP0y8iDzYljOb8d2pTdBrIvHwwJkIUARIHRJ1EIVcxhkvIj/ExIvI\nQ/1q+hn/O/I5glXBeCRpFrS
qYKlDog6k1Sg540Xkh5h4EXmghlZAarkKM5IeQ4QmQuqQqIMFaRSc\n8SLyQ0y8iDzMaXO+qxXQQwMfQYy2q9QhkRsEccaLyC8x8SLyIIXWQrybu6S+FVC/B9ArtI/UIZGb\nBAUoYXc4YbM7pA6FiDoREy8iD3FJKyB9stQhkRsFaZQA+GQjkb9h4kXkAaw2K97LW4rSmlLc1n0s\nWwH5AVfixXVeRH6FiReRxBpaAZ21nMVNXW/GrfFjpA6JOoGWM15EfomJF5GEnKITHx74gK2APMzc\nuXMxePBgpKWlubYdPHgQ99xzDyZMmIDJkycjNzfX9d6CBQswevRojB8/HgcOHGjRGJzxIvJPTLyI\nJCKKIj47/An2Fue5WgHJBP5IeoJJkyZh+fLlF2xbtGgRZs+ejf/973+YPXs2Fi1aBADIzs5Gfn4+\nNm3ahFdeeQXz589v0Rhc40Xkn3iVJ5LINyczkHNuO2K1sWwF5GFSU1Oh0+ku2CYIAsxmMwDAbDbD\nYDAAADIzMzFhwgQAQHJyMsxmM4qLi5sdIyiAM15E/ohXeiIJNLQCitRE4OEktgLyBi+++CIefvhh\n/P3vf4coivj4448BAIWFhYiOjnZ9zmAwwGQyITIy8orH+23Gy+a+oInI43DGi6iT7S78xdUKaEbS\n4whW6ZrfiST30Ucf4aWXXkJWVhZefPFFzJ07F0D9LeOLtWSdnpZrvIj8Eme8iDrR4dJD+M+B1VDL\nVXhk4KNsBeRF/ve//+FPf/oTAOC2225z/d5gMMBoNLo+ZzQaERUV1ezxGma8REEGvd47+3B6a9yN\nefs5eHv8gG+cQ2sw8SLqJBe3AuoaHCt1SHQFF89kGQwG7Ny5E9dddx22b9+O+Ph4AMDIkSOxZs0a\njBs3Drt374ZOp2v2NiPwW+JVWm5FUZG540/AzfT6YK+MuzFvPwdvjx/w/nNoS9LIxIuoExRZi/Be\n7lLUOWx4YMA0tgLycHPmzMGOHTtQXl6OYcOGYfbs2Xj11VexYMECOJ1OqNVqvPrqqwCAoUOHIjs7\nG6NGjYJGo8Hrr7/eojH4VCORf2LiReRGlioL3v9hGb4pzQAAPHfT/2MrIC+wePHiJrd//vnnTW6f\nN29eq8dQKWRQyAWu8SLyM1xcT+RGK394D19av0CFohza0GAcPXRY6pDIQwiCgEC1gjNeRH6GiReR\nm9gcNnxf/h2q7VUwBHVBbHAcCusKpQ6LPIgmQMkZLyI/w1uN5PHM5iosWZIHo1EDg8GKWbOSPP4p\nGKfoxJoDq1DjrEFYYAS663oAAKJUzT/tRv4jUK1ASUWN1GEQUSdi4kUeb8mSPOzePQyCIMBoFJGe\nnoWFC6Ob31EiDa2A8opzcVviOCiLlCipKEGUKgr3DX1A6vDIgwQGKGB3OGGzO6BUyKUOh4g6ARMv\n8nhGo8ZVkLI++dJIHNGVbTz5NXLObUdXbVfMSJnFqvR0WYHq+kuwtcaOEC0TLyJ/wDVe5PEMBqur\nppIoioiOrpY4osvbcmYzvj21EZGaCDyS9CiTLrqiwIDziRfXeRH5Dc54kcebNSsJ6elZMBo1iI6u\nxmOPDZQ6pCaxFRC1VuMZLyLyD0y8yONptUF4/vkbpA7jio6UHWYrIGo1zngR+R8mXkTtYKmy4O9f\n/hXflW2EWhaA10YuYisgajHOeBH5H67xImqHN9e/iQzrelhV1YiI0CNn91apQyIvouGMF5HfcfuM\n14gRI6DVaiGTyaBQKLB27Vp3D0nUpKbqgWm1QW06ltFUiMcX/xU/B60HdDZcF38NIgMjUVjOAqnU\ncoHqhn6NNokjIaLO4vbESxAErF69GiEhIe4eiuiKmqoH1ta1Y08s/ht2R+yB3SEAzkAcPnwOsdd3\nY4FUahWu8SLyP25PvERRhNPpdPcw5MdaOpPVUfXAbA4bDmt+gVNuRZA9EY6TtahzWpCYlMgCq
dQq\nDWu8qrnGi8hvdMqM1/Tp0yEIAu655x5MmTLF3UOSn3nrrZ1Yvz4YtbVyqFQK1NXtxJ/+NPySzxkM\nVhiNIgRBaHM9sIZWQEJANRS2CGjtfeEIdaJvjRIzxs3qiNMhP8IZLyL/4/bE6+OPP4Zer0dpaSmm\nTZuGnj17IjU19bKf96QefIzlUp4SB/BbLD/+aIHZnAZBEFBXJ+LHHz9qMs7582/CG29sx9mzAYiJ\nqcFzzw1u1Rovs8WMWf99Ansr9yIpvhtse7qhpK4O0ZpgfPD3P3vE/xtPiIFajk81Evkftydeer0e\nABAeHo5Ro0YhLy/violXUZHZ3SG1iF4fzFg8NA7gwljsdjVEUXRVt3c4Ai4b5+OPX+36fXW1E9XV\nLT+fZ9c+gZy6nQhSByE8XI8BQ3vhpQf+6BpL6v83nvbnQ81TKmRQyAXOeBH5EbeWk6iurkZVVRUA\nwGq1YsuWLejTp487hyQ/NHx4AHS6YqjV5dDpijF8uLrDx9h65kfstvyKAEUAEsP7QyFXorCOTzBS\n+wiCgEC1gjNeRH7ErTNexcXFeOKJJyAIAhwOB9LS0nDzzTe7c0jyQ08/PQhq9Z5GLYUGdejx9xT+\ninVHPkOIIhRdwrpCJVdBFEU+wUgdQhOg5IwXkR9xa+IVFxeHL774wp1DELm1pdCRssP4z8H6VkAL\nR/8fftj1HQqrChGliuITjNQhAtUKlFTUSB0GEXUStgwin9WegqmWKgveyfwnvi37xtUKqE9UAvqM\nS3Bz1ORvAgMUsDucsNkdUCrkUodDRG7GlkHksxoKpppMN2DPnuFIT89r8b7Lvn8HG6xftbsVkNlc\nhUWLcjBnzh4sXLgdFktVm45DvotPNhL5FyZe5LPaWjDVXFeJ78o2we60oYeuZ30roDYupG9P8kf+\nQaNmLS8if8LEi3yWwWB1lZhoacHUans1luWmAwC6auNgCIpu10L6jqqWT77LVUSVM15EfoGJF/ms\nWbOSkJKSBYMhBykpWXjssYFX/LzNYcOKve/hrOUs7up3D/Bjb/z6mQ0FGwIwIXVym2JoS/JH/iWQ\nM15EfoWL68lnteZpx4ZWQMfKjyJJnwxTRiKUZ2cj5nx7oQ8+yMKjjwZdslg/IECGRYtyLruAf9as\nJKSnZzUqdXHl5I/8D2e8iPwLEy/ye6Io4vMjnyKvOBe9Qnvj94lT8f/e33/JLcKG9Vr1r0Wkp2ch\nMFB9ybbGyZ47S12Qb+CMF5F/4a1G8nubTn2D7We3oau2K6Zd9TCUcmWTtwibWq919mwA13BRu/w2\n42WTOBIi6gxMvMjnXamkw7YzW7Dp5DeICIjAI0mPQqOoT5yaWh/WVDIWE1PNNVzULoFqJQDOeBH5\nC95qJI/WniKoDZq6Rfj88zcgt2g3Pj+yFlqlFjOSZyFYpXPt09QtwqbWa+n1wXjlFa7horbTnJ/x\nquYaLyK/wMSLPNrlkqbWaOoW4ZGyw1hzYBXUchVmJD2GSE1ks8dpKhnjGi7fNHfuXGRlZSEiIgJf\nffWVa/vq1auxZs0aKJVKDB06FM899xwAYOnSpfjss88gl8vx0ksvtaonLdd4EfkXJl7k0TqiDlZY\nWCm2bv0R1mon6qI+R9fYE5izvgK9YxLw8DWPomtwbEeHTV5u0qRJmDp1Kl544QXXth07duCHH37A\n+vXroVAoUFpaCgA4duwYvv76a2RkZMBoNGLatGnYtGmT6/u2OXyqkci/cI0XebSOqIMlCHIAA1Ad\nvgO1AwtxNuYASuTFCK7Uok8Yey/SpVJTU6HT6S7Y9tFHH+GRRx6BQlGfKIWHhwMAMjMzMW7cOCgU\nCsTGxiI+Ph65ubktHkulkEEuEzjjReQnmHiRR2ttEdSmmAqVUPT8EM6+n8MevQ02hwPdQ3pCKajc\nEDH5qpMnT2LXrl2YMmUKpk6dir179wIATCYTunTp4vqcw
WCAyWRq8XEFQUBggIIzXkR+grcayaN1\nxBqqM7L/4lzPPFiVh+FU26Cpi4AhMBpR1W1rA0T+yeFwoLKyEp988glyc3Px1FNPITMz0zUj21hL\nbzPq9cEAgOBAFay1dtdrb+Ft8TbF28/B2+MHfOMcWoOJF/k8e8xB2DWHITicUFQKCLSKSKxOxH1D\nH5A6NPIi0dHRGD16NAAgKSkJcrkcZWVliI6Oxrlz51yfMxqNiIpqWVJfVGQGAKiVMhSW2VyvvYFe\nH+xV8TbF28/B2+MHvP8c2pI08lYj+SxLlQVLNryNA7UH4KytRWigBt16dEWv0ATMGDcLQUGtK0tB\n/uXimaxbb70V27dvBwCcOHECNpsNYWFhGDFiBDIyMlBXV4fTp08jPz8fSUlJrRorUK2A3eGEze7o\nsPiJyDNxxos8VuMaXqGhFRAEB8rKwmEwWDF//k3N7r8m6wNstH0NuV4OuU0G+XEFQruEYVif4Z0Q\nPXmzOXPmYMeOHSgvL8ewYcMwe/Zs3HXXXXjxxReRlpYGpVKJv//97wCA3r17Y+zYsbj99tuhUCgw\nf/78Ft9qbKAJOF9EtcaOEK28w8+HiDwHEy/yWI1reG3dWgpgH3r3vgFGo4g33tiOxx+/+or77zTv\nQKHThJjQGChrVBDCZZjYexJvMVKzFi9e3OT2RYsWNbl95syZmDlzZpvHa1zLK0SrbvNxiMjzMfEi\nj9W4hldtrQJAAOx2O06dMuP06QpUVW2/bCX77We34lzdWajVAegfORBKmRKJ+kTMGDerk8+CqHms\n5UXkP7jGizxW4xpeKpUdanUNTp0yo7w8HA5HCPbsGY709LxL9sst2o3PDn+KlG7X4I7AO2EwG7iY\nnjwaq9cT+Q/OeJHHatwbsVevYvz8sxEHDxZCJqtGXNy4JivZHy07gjUHVkElV2JWymzEBsdJFD1R\ny+mC6mvKlZtrJY6EiNyNiRd5rMY1vBYtykFIyHRERpajoiIcp09vRffuN7oq2VuqLHgn85/4tmwj\n1DI1Xhu5kEkXeQ19aP0/IArLW9+ZgYi8C281kldoWO/VvXswQkNLIZdXuCrZm4pMGPPKMLx98C0c\nNR5GkEqLnN3bpA6ZqMUMYecTrzImXkS+jjNe5BUMBiuMRhEKhQI9e4Zi8OAQ11ONM5c8iPz4U3DK\nRAgCcHT3YQy8vnV1lIikFBqshkIu44wXkR9g4kVeofF6r+joajz33GBUVztRY6/BPsVeCIIAhUwO\npUwJq7waUSq2AyLvIRME6EMDUFhWDVEUW10HjIi8BxMv8goX92zUaoNgrirDir3vQS7IEazWwWlz\nwmG3I6pWzycYyesYwgJxrsSKqho7tBql1OEQkZsw8aLLalw5vqXV4juDpcqCD7Pfw1cFGTA7KnHf\n1fdjz85fUSgWwSBEYdELb7IdEHmdhgX2pjIrtJoQiaMhIndh4kWX1bhyfEurxXekixO/WbOSYCoq\nxO2v3AlzbBGgcqJPXG9ozBp88NxHnRYXkTtEnV9gX1RWjV4xTLyIfBUTL7qsxpXjBUHA2bMBnTr+\nxYlfenoW1h17GeUDiyGq7IBTQP5OE0pGl3RqXETuEMUnG4n8AstJ0GU1rhwviiJiYmo6dfzGiZ/D\nWYUfjq5HvnYvnEItIAIyMQA2pY0L6cknuBIvPtlI5NOYeNFlzZqVhJSULBgMOUhJycJzzw3q1PEb\nJ375zndwsueXsIdbAZkTqFQANhm0laFcSE8+IUIXAJkgcMaLyMfxViNdVlNPElZXmztt/IYSEqcL\nRGy3v4VaZTmUMjlsZwXIyoHuQj/8Z/4SLqQnn6CQyxARokZhmVXqUIjIjTjjRR5Lqw3C/Q/2wC7N\nE7D0KoJNboNMK0eYNgTDEgdj25Jv0L17d6nDJOowUWGBqLTaUM1m2UQ+i4kXeSxTkQnDX74RR0OP\nQlSIgADYy2xQ1Cowa
sAoqcMj6nBR50tKFHGdF5HPYuJFHuvZD55AWXwZIAcgBwS7ALlcgUGhqXh8\nwuNSh0fU4fhkI5HvY+JFHqlxKyBBJUB2Tg4UCehR3AP/fuxdrusin8QnG4l8HxMv8jh2px0r99W3\nAgrRhUJTHAClXIHIsgisfeFLJl3ksxpuNXLGi8h38alG8ihO0Yn/HFiNI2VH8OD107Fry04U6s63\nAnriTej1rNlFvkvvSrz4ZCORr2LiRR7BVGTCCx88jYOyg7DLbbhn0O/x6LVPQHkDmwWT/1Ap5QgL\nVnNxPZEPY+JFnaKpvota7W+3DP/4wTP4NfJXWO1VkAty5OXsgfJmJl3kf6JCNTh8uhw2uwNKhVzq\ncIiogzHxok7RVN/FRx8d6ErGclS5sIdYIJcpEBYQhmKxvv9icwkbka/Rh2lw6HQ5isprEBPJ73Ui\nX8PF9dQpLm64bTRqXMnYEYsaVXKgymKHrTIQZSW1iHCGA/gtYTOZbsCePcORnp4n5WkQuZ2BTzYS\n+TQmXtShzOYqLFqUgzlz9mDhwu2wWKoAXNpwOzq6GsdPVmGXeix+jrkD9lonkN0HsgNdodg2CL1q\npgFoOmEjaqmMjAxYLBYAwFtvvYXp06dj7969Ekd1ZVFhgQD4ZCORr2LiRR3qcjNUFzfcfuyxgdgl\nLEBl0s8QQ+vgDJNB7dDiBusWXK1YC0tlDICmEzailkpPT4dWq0Vubi62bNmCCRMmYMGCBVKHdUWu\n6vVMvIh8Etd4UYe63AzVxQ23S6pLUN2rAAqFDQp7MBwKG+SGOqDmwgSroVG20ahBdHQ1HntsYOef\nFHkthaL+Erd161bcfffdSEtLw/vvvy9xVFfWUFLCVM6SEkS+iIkXdSiDwQqjUYQgCJedobLUmfFu\n7jtQy9SICFYgSBkEh8MB2XElDIacCxKsixM2otYQBAEZGRnIyMjAO++8AwCw2WwSR3VlgQEKaDVK\nzngR+Si332rcvHkzbrvtNowZMwbLli1z93AksaZuKTZWY6/Bu7lLUFRdjBeGvoiUoquhOxOKRFM/\nfPHqKixenIznn7+BTy5Sh/jTn/6E9evXY/LkyYiLi8PJkydx/fXXN7vf3LlzMXjwYKSlpV3y3vLl\ny5GYmIjy8nLXtgULFmD06NEYP348Dhw40O64DWEaFFfUwOF0tvtYRORZ3Drj5XQ68eqrr2LlypWI\niorC5MmTMXLkSPTq1cudw5KELjdDZamyYHXWCnxXtgnVzmr8LuUPmDLw97gn5T4JoiR/cc0117hm\nugCge/fu+POf/9zsfpMmTcLUqVPxwgsvXLDdaDRi27ZtiImJcW3Lzs5Gfn4+Nm3ahD179mD+/Pn4\n5JNP2hW3PkyDY2crUVpZ67r1SES+wa0zXrm5uYiPj0fXrl2hVCpx++23IzMz051Dkof6MGslMmo3\nwCg7B5lWhqrTFtdaMCJ3+dvf/gaz2Qy73Y7f//73SElJwRdffNHsfqmpqdDpdJdsf+211y5JxjIz\nMzFhwgQAQHJyMsxmM4qLi9sVN3s2EvkutyZeJpMJXbp0cb02GAwoLCx055DkgURRxNaKLSitKUaw\nSoc+YX1RbGvfX0xELbFt2zYEBwdjy5YtMBgM2LhxY5sX13///ffo0qUL+vbte8H2wsJCREdHu14b\nDAaYTKZ2xW1wlZTgAnsiX+PWW40NZQBaQ68PdkMkbcNYLtWWODKOZKBSKINOHYzk6CTIBTl6yru1\n+5y8+f+Ju3hSLJ7kp59+wqhRo2AwGNo001pTU4MlS5Y0mbQ1dZ1r6RiX+/NK6FH/AIC51uHRf6ae\nHFtLefs5eHv8gG+cQ2u4NfGKjo7G2bNnXa9NJhOioqKuuE9RkdmdIbWYXh/MWFoZR1PtffZW7sGn\nh9fi6thUhJWHw1xoRpQqCuOH3tOuc/KW/yedydNi8QQRERGYP38+fvzxR8yYMQN2ux0
Oh6PVx8nP\nz8eZM2cwfvx4iKIIk8mESZMm4dNPP4XBYIDRaHR91mg0Nnuda3C5Py+lUJ/MnTpb4TF/phfzpO+3\ntvL2c/D2+AHvP4e2XOvcmngNHDjQdcHS6/XYsGED/vGPf7hzSOpEFydaNpsN+/ePdvVjfHnpe8Cg\nXQhSBmHW1U8hKrBlfxkRdZTFixfjyy+/xMSJExESEoKCggJMmzatRfs2nslKSEjA1q1bXa9HjBiB\ndevWISQkBCNHjsSaNWswbtw47N69GzqdDpGRke2KO1ijRIBKzrZBRD7IrYmXXC7Hn//8Zzz00EMQ\nRRGTJ0/mE40+5OLG1ybTV4jUV6FAeB9VoQewu2oHxjmvw8MDZzLpIkmEh4fjD3/4A06cOIGjR4+i\ne/fumDRpUrP7zZkzBzt27EB5eTmGDRuG2bNn46677nK931CnDgCGDh2K7OxsjBo1ChqNBq+//nq7\n4xYEAVFhGhhL6js38EEUIt/h9gKqQ4YMwZAhQ9w9DEng4ir1omhBAd5HZZ+fYNbmQR1gRkhlCLrp\n4iWOlPxVXl4ennzySahUKoiiCLvdjn/9618YMGDAFfdbvHjxFd+/+OnsefPmtTvWi0WFBSLfZEG5\npQ5hweoOPz4RSYOV66nNLq5Sf9PNanxVuRJlIachlwFXx13b4mM1tT6MRVSpvf7617/itddew403\n3ggAyMnJwauvvoqPP/5Y4sia1zUyCLsAnDxXibBgvdThEFEHYZNsarOLq9TrknPh6FkMtc6J0DAt\nyivKEaVq2S3GyzXXJmqP6upqV9IFADfccAOqq71j3VTfuFAAwKHT5c18koi8CRMvarOGKvV/eaUX\nAvvtwIf5q2BT2hBRpUd4RRgCjUG4b+gDLTrW5ZprE7WHRqNBTk6O6/XOnTuh0XjH91bPGB0UcgEH\n88ukDoWIOhBvNVK7rc5agfU1X6JOXgu5Wo5wTTj6dRuAxOpEBAVd/nZh49uLR48eQkDAVVAqtZdt\nrk3UWnPnzsVTTz0FlUoFoL5B9j//+U+Jo2oZlVKOnl10OFJQAWuNDYEBSqlDIqIOwMSL2sxUZMIL\nHzyNLTVb4NTY0UvfG8pyFVArIFGf2OxsV+OnIgMCBqG29mPExvZBdHT1Jc21idoiKSkJmzZtwokT\nJyCKInr06IHRo0cjKytL6tBaJKFbGA4XVOBwQQVSerevRAUReQYmXtRmL3zwNH6O+Bm2qlqIChGl\nJ0oxbMhIJFYnYsa4Wc3u3/j2olKpRGxsHyxenOzusMnPKJVKJCQkuF63paOGVBK7hWL9NuBQfhkT\nLyIfwTV510I9AAAgAElEQVRe1CaWKgt2Wnai3FoGQZQhqE4LW40didXNz3Q1MBisrr8EeXuROos3\n1cTq1TUEcpmAQ/lcYE/kKzjjRW3y2sa/oCbYCshFKFRyCDUCBkUMatFMV4NZs5KQnp4Fo1HD24vU\noY4ePXrZ9+x2eydG0j5qpRw9uuhw7GwFqmvt0Kh5ySbydvwpplbLK85FTuV2xER1RdWJKtQJddBV\n6rDohTdbdZyGpyKJOtqMGTMu+55a7V3FSPt2C8XRMxU4UlCBpF4RUodDRO3ExMvPtLdQ6fHyo/hw\n/0po5UHoHdUHwXE6iKKIxOpE6PVsC0Se4fvvv5c6hA7Tt1soNmw/hUP5ZUy8iHwA13j5mfYUKi2o\nLMD7e9+FKIp4ZcTrSHVci7DysFat6yKi1undNQQyQWAhVSIfwRkvP9PaQqWWKgve37gM3x3dhHOB\nZxChjcS84QvQS9cX3+6rQ5lRA5XBCpHtOIncIkClQPcuwTh5zoyaOjsCVLxsE3kz/gT7mYv7Kzb3\nJOF/sldhfcGXOBVxEiKcUDkDsHffHmzdJ3fV4DIaRaSnZ3Xaei2zuQr//vevOH5cxr6O5Bf6xoXi\n+NlKHD1Tgat68HYjkTfjrUY/c3F/xSs9SWipsuD
rvA04bD2I6gorNIIGapkKhXWFkrb4WbIkD7t2\n3cK+juQ3+nYLAwCWlSDyAZzx8mJtWSjfmicJV2etwKmIk3DUOGGHEyWHqyGG2BASGgJVK2fOOhL7\nOpK/6RMbAkFg4kXkCzjj5cXas1C+OaIoIrs8C2qNGupaHWRFEZCXhkC7+35U7k+5YOasX79NqK2t\nxpw5e7Bw4XZYLFUdFkdTWHiV/I1GrUC8IRgnzlWi1uaQOhwiagfOeHkxd8z8WKosWJP1AbZW/ogj\nxYcQ3iUCQl0iqp0GhFQPRKzycZSW5Fwwc7ZoUU6nrveaNSsJH3zwI44fl7kKr7a3TAaRp+vbLRQn\njWYcO1OB/t3DpQ6HiNqIiZcXa+1C+Zb4T/YqfO/MRIGYD50+BNH50SirDEL5yf7opnoYDvul43T2\nrT+tNggvvzwERUVm17bOTv6IOlvfbmHYuPM0DuaXM/Ei8mJMvLxYR7fcMRWZ8M7Of6E0ogRKqDCi\n562ID+yOpx99HunpeSgv34fQ0IpLxnFHAthaXPdFvi4hNgQCgMP5ZVKHQkTtwMTLi3V0y53HVk1H\nSXAJnEonFHInfj7yE65NuM41jl4ffMEsUwNP6LnoCckfkTsFBigRZ9Di+LlK9m0k8mL8ySVYqiz4\n53f/wM/2XYBcRKApEDKlDCgTcN8jzVek94Sei56Q/BG52zV99Mg3WfDL4SLcNLCL1OEQURsw8SKk\nZ/4L31g3QBYog6AGlDVKhMSHoq+qL4KCvGOBuickf0TudsMAA/635QS27zMy8SLyUiwn4edKa0qQ\nWfYtHKIDqV2ug8IUjKqzTmCzHi9Pfk3q8IiokaiwQPTqqsOBk2UoM9dKHQ4RtQETLz9mqTNjWW46\nZIIM3XTdUXYmANq6OxBXPhc9LVn49FOj1CES0UUGD4iGCGDHfpPUoRBRGzDx8lM19hq8l7cURdYi\nPJw6E8NlI+A8Gg7tsf6IFafzyUAiD3VtPwPkMgE5+/gPIyJvxDVefsZSZcHqrBX4ruxbVDutuDf5\nPkzsNxlCfwHle7djT+lwCDI+GUjkqbQaJQb2jMDuo8UoKLIgVq+VOiQiagUmXh7ObK7Cv//9K44f\nl3VIRfY1WR8go3YDSmXFCNWEwVpQBUtPK5YsyUNBgRIWy2rEx8chLg58MpDIQ914VTR2Hy1Gzj4T\nJg9j4kXkTXir0cMtWZKHXbtu6ZB+jGaLGWuOrMbh4oOwWqvRQ9cTxbZiV8/H0tJboNVORVwc8Pzz\nN7DlDpGHSu4VAY1ajpz9RjjP9y0lIu/AxMvDdWRF9r988ycUy4sgyAG1Ro1jZ48iShXFqu9EF5k7\ndy4GDx6MtLQ017aFCxdi7NixGD9+PGbPng2LxeJ6b+nSpRg9ejTGjh2LLVu2uD0+lVKOQX2jUFpZ\niyOny90+HhF1HCZeHs5gsEI8/y/a9qy72nEuB79YdkEfEYVuVd0RWKJBoDEI9w19oMPGIPIVkyZN\nwvLlyy/YdvPNN2PDhg344osvEB8fj6VLlwIAjh49iq+//hoZGRl499138Ze//MX18+RONw6IBgBs\n5yJ7Iq/CxMvDzZqVhNTUH2Ew5CAlJatN6672Fufh00MfQ6fQob9+AAYmJSNl4CCMGXgbgoKCMGtW\nElJSsto1BpEvSU1NhU6nu2Db4MGDIZPVXzJTUlJgNNYnPN9//z3GjRsHhUKB2NhYxMfHIzc31+0x\n9u0WirBgNX46WASb3eH28YioY3BxvYfTaoPw8stDmuyR2BLHy4/iw/0roZQr8Ldb/4HNv/yAwqpC\nRKmicN/QB1xjsOo7UcutXbsWd9xxBwDAZDIhJSXF9Z7BYIDJ5P4aWzJBwA39Dfh6Rz72HC1BamKU\n28ckovZj4uVDzOYqLFmSB6NRA4PBikkP6rHy8LtwiE5Mv2oGEsP7IXFcP6nDJPJq6enpUCqVrsSr\nqduKDWsmm6P
XB7crlnG39MLXO/Lx85FijL2lV7uO1Rbtjd8TePs5eHv8gG+cQ2sw8fIhDU8nCoKA\n/OJiZP/nSSSmaPH7fn9AYjgTLqL2WrduHbKzs7Fq1SrXtujoaJw7d8712mg0IiqqZbNPbZ3JbhCk\nENDNoMVP+03Yf6QQ+tDOezBGrw9ud/xS8/Zz8Pb4Ae8/h7YkjVzj5UOMRg0cziocFxZid9wknDDv\nx+iut2GQ4VqpQyPyOhfPZG3evBnvvfce0tPToVKpXNtHjBiBjIwM1NXV4fTp08jPz0dSUlKnxTnm\num5wiiI2/XS608YkorbjjJcPCQsvxtdnp6IyYRdEmRMGTTROHTkB9JE6MiLvMmfOHOzYsQPl5eUY\nNmwYZs+ejaVLl8Jms+Ghhx4CACQnJ+Pll19G7969MXbsWNx+++1QKBSYP39+i281doRrE6PwefYx\n/LjnLO68qTuCA1XN70REkmHi5UOC+/+Kas12iMpaKKFEpEOBwrrCFu9/8Rqx9lbJJ/JWixcvvmTb\nXXfdddnPz5w5EzNnznRnSJelkMsw+rpu+Oi7I/j+lzMYf3MPSeIgopbhrUYfIYoitlu3Q6OVIUQb\niJjwSNgEG6JULX/SqWGNWEdUySeizjMkKQZBAQpk/lyA2jqWliDyZEy8fIAoivjy2DpUOirQNaQr\nusm6QV2nRrwt3lUyoiVYwZ7IO6lVcoy4JhaWahu25J1rfgcikgxvNXoxS5UF/8lehZ2VO3C27gyu\n73YDIiyRKFeX19fpmvIAgoJafqvQYLDCaBQhCEKLKtjz1iSR5xg5KBbf7MzHxp35GHZ1DOQy/rua\nyBMx8fJi/8lehc1iNk44j0GlViHUHIon0p5u8/FmzUrCm29+jaysCgBBqK1Vw2Kpumwy1bh8hdEo\nIj09i4VYiSSiC1Lh5qQu+OGXM/jpYCFu6B8tdUhE1AQmXl5sX9U+nLAfh0KmRL+IATA3atrbFlpt\nENRqDQyGsRAEAQcOiHjzzU1Qq1UwGjXo2dOBBx7o60rEeGuSyLOMuTYOWb+ewTc5+bi+n6FTn64k\nopbhXLSXOl5xDMdrjkAGAX3D+yFArmnVQvrLuTiZysqqdS2437VryAUL7tlcm8izRIUFIrVvFPIL\nLdh3slTqcIioCUy8vNA5y1msyHsXvbskYLRmLLpZuyGxOrFVC+kv5+JkCqi67KwWm2sTeZ6xN3QD\nAHy19WST7YyISFq81ehlSmtK8G7eEljt1fjDwAc6vCr9rFlJSE/PgtGoQXR0NWprQ3DgQNML7tlc\nm8jzdI/WIaV3JHYfLcYvh4sxqK9e6pCIqBEmXl7EYrNgWW46KmorcGevCW5pBXRxMmWxVLkSsZ49\nnXjgAc5qEXm6u4f3Qt7xEnyadRTJvSOgkPPmBpGnYOLlJWodtVieuxRF1iIMjxuJoXHDO2XcxomY\ntzczJfIXXSKCMCylKzJ/KcD3v5zB6GvjpA6JiM5z2z+D3n77bQwZMgQTJ07ExIkTsXnzZncN5dNM\nRSaMnzceN/5zEFZtWYG+QYm4vWea1GERkYe78+bu0KgV+GrrCViqbVKHQ0TnuXX+edq0aVi3bh3W\nrVuHIUOGuHMon2SpsuDuhXfiO+E7FCsLYQ2yYmv2Zj4iTkTNCg5UIW1wd1TV2LF+20mpwyGi89ya\nePGJmvZZvnEZjgWfRA3qYIMDYrWIQrFI6rCIyEuMHBSLyJAAZP5cAFOZVepwiAhuTrzWrFmD8ePH\n46WXXoLZzLVBDczmKixalIM5c/Zg4cLtsFiqmvzcp3k/wCGXQxTlgCMQVdW1MAhXrtXV0mMTke9T\nKmSYPKwXHE4Ra384JnU4RARAENsxLTVt2jQUFxdfsv2ZZ55BSkoKwsLCIAgC/u///g9FRUV47bXX\n2hWsr3j55c3YtesWV4mG1NQf8fLLF96K3XZ6Gya+MRNVCifs5yrhVNZBa9Li4
KrtiIq6fPLVkmMT\nkWfojIdVRFHEax/+jGNnKvH/7rsGCXGhHXJcX3jYxtvPwdvjB7z/HPT64Fbv066nGlesWNGiz02Z\nMgWPPvpoiz7rKX8A7vxmOH5cBpvNccHrxmPtK96LlfveR0xgT1TuvhoydR1klkiMHdgbgqC5YlzN\nHbs9POkHxFNi8ZQ4AM+LhTyDIAi4d0Qf/HX1z1i18RDmP3gtlAqWlyCSitt++oqKfluL9O233yIh\nIcFdQ3mdK7XaOVFxHKv3r4BCJsebv5uLuwbehOFREzD+moF46snr2nVsIvJPvbqGYPjVXXG2uApf\nbTshdThEfs1tdbwWLVqEAwcOQCaToWvXrnjllVfcNZTXubg6/GOPDYSlyoIlmW9jU9k3UMqUeHX4\n39C/S3/0f751MxlNHZuIaPKwXsg9VoKM7fm4JkGP7tE6qUMi8ktuS7wWLlzorkN7vaZa7bz51SKs\nr/4SNmUdeob2xq7cHbgmdlCHHJuISKNW4MFxiVj88W68v+EA5j14LSvaE0mAP3UewGKz4LuyTbA5\n6tBN1x1RgQYU1hV2+Dh84pHIvw3oHo4hyTEoKKpibS8iiTDxklhDKyCbw47aoiDk5wVg924jQhDS\n4WMtWZKH3buHwWS6AXv2DEd6el6Hj0FEnm3K8N4IC1Zjw/ZTyDd5xsMYRP6EiZeEHE4HVu17H/nm\nfESUDIY65z7YjiaidsetqNyf0uHjGY0aV9V7QRBgNGo6fAwi8myBAQo8ODYRDqeI9zccgN3hlDok\nIr/CJtkSsFRZsCbrA2RV/IAKeznu6D8eznM3QSa/CagFIAdKS3I6fFyDwQqjUXTV+OITj0T+aWDP\nCNw0MBpb84z4YssJ3DW0l9QhEfkNznhJYE3WB9hk34h8nIRD4wBMIroYat1eBmLWrCSkpGTBYMhB\nSkoWn3gk8mO/G9kHUaEabNh+CruPXFoIm4jcgzNeEthl/glG51loFIHoG9EPpZWleLoTykDwiUci\nahAYoMSsiVfhr6t/xrvr92P+tGsRFcrlB0TuxsSrk+08twNn6gqgUquQGNEfCkGBKFUUkyIi6nTd\nDMGYOrov3s84gHc+z8PcqYOgUsqlDovIp/FWYyfaV7wXnx7+GMndUnCH5k5Em6ORWJ2I+4Y+IHVo\nROSnbk7qgiHJMcgvtODDbw9LHQ6Rz+OMVydpaAUkF2SYmfI4uof0kDokIiIAwH2j+uCU0YwtuefQ\nu2sIhiTHSB0Skc/ijFcnMFadw/t5y+AQnXhgwHQmXUQebu7cuRg8eDDS0tJc2yoqKvDQQw9hzJgx\nmD59Oszm32pgLViwAKNHj8b48eNx4MABKUJuF6VCjlkTr0JQgAIfbjqMY2cqpA6JyGcx8XKzsppS\nLMtNh9VejXv6/g79IvpLHRIRNWPSpElYvnz5BduWLVuGG2+8ERs3bsT111+PpUuXAgCys7ORn5+P\nTZs24ZVXXsH8+fOlCLnd9KEazLhzABxOJ95am4tzJexsQeQOTLzcyGKzYFluOipqK3BHzzuRGn2d\n1CERUQukpqZCp7uwiXRmZiYmTpwIAJg4cSIyMzNd2ydMmAAASE5OhtlsRnGxd5ZnGNgzAg/clghL\ntQ3/+O8elJlrpQ6JyOcw8XKTWkct3s9bhkJrIYbFjcDwbiOlDomI2qG0tBSRkZEAAL1ej9LSUgBA\nYWEhoqOjXZ8zGAwwmUySxNgRhiTHYOItPVBSWYP/+2QPrDV2qUMi8ilMvNygoRXQqcpTSDVcizt6\n3il1SETkJg2FjxtraM3lre4Y3B3Dr+mKgiIL/vVZLmx2h9QhEfkMPtXYwURRxH8P/QcHSw+iX3g/\nTOn7u3ZdhM3mKvz737/i+HEZDAYrZs1KglYb1IERE1FLREREoLi4GJGRkSgqKkJ4eDiA+hkuo9Ho\n+pzRaERUVFSLjqnXB7sl1o7w1O8Godbux
Lbcc1i16Qie/8MgyOUX/lvdk+NvKW8/B2+PH/CNc2gN\nJl4dSBRFfHXsf/jZtAvxuu6YOmAa5LL2FSNcsiQP+/ffCpvNAaNRRHp6FgutEnWCi2eyRowYgc8/\n/xwzZszAunXrMHJk/fKBkSNHYs2aNRg3bhx2794NnU7nuiXZnKIic/MfktADoxNQUlaNrblnUbvc\nhhl3DoDifPKl1wd7fPzN8fZz8Pb4Ae8/h7YkjbzV2IGyTn+P7IIsGAINmD5wBtRydbuPaTRqXDNm\ngiDAaGRLDyJ3mzNnDu69916cOHECw4YNw2effYYZM2Zg27ZtGDNmDLZv344ZM2YAAIYOHYrY2FiM\nGjUK8+bN89qnGpuiVMjx5OQkJMSFYtehIrz9eR7qbLztSNQenPHqID8Zd2D98S8Rqg7FjKTHEKTs\nmNuBBoMVpaXubZ5NRBdavHhxk9tXrlzZ5PZ58+a5MRppadQKPDMlGf/+PA+5x0rw5qd78OTkJKnD\nIvJanPHqAPuK9+KTQx8jUKHBjKTHEBoQ1mHHnjUrCampP8JgyEFKSpZbmmcTEV2JWinH7LuScE2C\nHgfzy7H4v7thqbZJHRaRV+KMVzs1bgU0feBMGIKim9+pFbTaILz88hCvvgdORN5PqZDhsQkDsHzD\nAeTsM2HuO1vw+ISrEK4LkDo0Iq/CGa92aNwK6P4BD7EVEBH5NLlMhodv749hV3fFibOVeOWDXTha\nwPZCRK3BxKuNGrcCmtL3XvSPGCB1SEREbieTCZg6OgEzJgyExWrDwo9+wY+5Z6UOi8hrMPFqgypb\n1QWtgK6Nvl7qkIiIOo0gCEi7pSeeuScZaqUcKzIO4uPMI3A4nVKHRuTxmHi1Uq2jFsvzlp5vBTSc\nrYCIyG8N6B6OP92fii4Rgdj002n2dyRqASZereBwOrB63wqcqjyFQYZU3NFzvNQhERFJyhAeiJem\npiKldyQOnCrDvOU7sOtgodRhEXksJl4tJIoiPjn0EQ6UHkC/8H64p+/vvb4fGxFRRwgMUGD2XQMx\ndXQCbHYn3vnfXizfsB/VtWywTXQxlpNoofXHv8Au00+I18V3SCsgIiJfIggChl8Ti8T4MCz7cj+2\n5hlx+HQ5pt/eHwlxoVKHR+QxOOPVAj/kZyLr9A+ICozC9IEzO6QVEBGRL+oSEYSX7h+EcTfEo7i8\nBn9b8wuWb9iPSmud1KEReQTOeDVjl3En1h//EiHqkA5tBURE5KsUchkmD+uFlD6RWL3xELbmGbH7\nSDEmDe2FockxkMm4TIP8F2e8rmB/yT7899BHrlZAYQHhUodEROQ1encNwbwHU/G7W/vA4RSxeuMh\nLFi1C0cKyqUOjUgynPG6jONlx7Fq3/uQCzI8NHAGooO6SB0SEZHXkctkGJUah2sTo/DJ90eRs9+E\n1z/8BUm9IjBpSE90MwRLHSJRp2Li1QRTlRHLD74Dh+jEgwOmo0dIT6lDIiLyaqFaNWbcOQDDr+mK\nz7KPI/dYCXKPleC6flGYcEtPRIcHSh0iUadg4nURURSxYt97sDqtuDvhXgyIvErqkIiIfEaf2FD8\n8fdXY9/JUnyWfRw7DxRi18EipCbqMea6bujRRSd1iERuxcTrIoIgYGBkMhJiuqNPwECpwyEi8jmC\nIOCqHhEY0D0cvxwuxpdbT2DngULsPFCIhNgQjLmuG5L7RELGWonkg5h4NeH2nmnQ64NRVGSWOhQi\nIp8lCAIG9dXjmoT6qvff7MzH3uOlOFyQh6gwDYYmx2DwVdEI0bKED/kOJl5ERCQpQRDQv3s4+ncP\nx5kiCzb9dBrb95nwadYxfJZ9HMm9I3BLUgwG9gqHXMaH8cm7MfEiIiKP0VWvxbRx/TBlRG/k7DPh\nx9yz+PVIMX49UgxdkAqD+upxXWIU+sSGsh4YeSUmXkRE5HGCApQYOSgWIwfF4pTRjB9zz2LngUL8\n8MsZ/
PDLGVcSlpqgR5+4UCjknAkj78DEi4iIPFp8dDDio/vi3pF9cCi/HD8dLMQvh4tcSZhaJUf/\n+DAM7BWBpJ4RCNcFSB0y0WUx8SIiIq+gkMswoEc4BvQIx9QxCTiYX449R4qRd7zEdTsSALpEBKJv\nXCgSuoWib1wYwoK5OJ88BxMvIiLyOnKZDAO6h2NA9/pWbqYyK/KOlSDveCkOF5Qja/dZZO0+CwCI\nCtOgT9cQ9IjRoUcXHeKitLw1SZJh4kVERF7PEBYIQ2ogbk2Ng8PpxCmjBYdOl+FQfjmOFJRj614j\ntu41AqifOYs3aJHQPRyRwWrERWkRqw9CgIp/JZL78buMiIh8ilwmQ88YHXrG6DD2+ng4nSLOlVpx\n4mwlTpyrxPFzlThpNOPY2coL9osK1aBLRCC6RAS5vkZHBCIoQAGBxVypgzDxIiIinyaTCegaGYSu\nkUG4OakLAMBmd6DaAeQeKsTpQgtOF5pRUFSFPcdKsOdYyQX7a9QKRIVqoA8NgD5MA32IBuG6AITr\n1IjQBUCj5l+l1HL8biEiIr+jVMgR0yUYOrX8gu1max2MpVacK7HCWGKFsdSKovJqnCupwilT091M\nNGo5woMDEKJVIVSr/u1rkAq6QBWCg1TQBSoRpFGyDRIx8SIiImoQHKhCcKAKfWJDL9guiiIqqupQ\nWFaN4opqlFbWorSyBqXmWpRU1KDMXIszxVVXPLZMEBCkUUCrqU/CtAFKBGkUCApQIlCtQGDA+V9q\nJTRqOTRqBQLUCmhUcgSoFFDIBd7y9AHtSry++eYbvP322zh27BjWrl2LAQMGuN5bunQpPvvsM8jl\ncrz00ku4+eab2x0sEZHUVq5cibVr10IQBCQkJOD1119HYWEhnn32WVRUVGDAgAFYuHAhFAr+u9aX\nCIKAUK0aoVo1EuJCm/xMnc2B8qo6VFhqUW6pQ2VV/S+ztQ6VVhsqrXWwWG2wVNtgKq2GUxRbFYNc\nJkCtlEOtkiNAJXfNoKmUMqgVcqiUcqiUMigVMqgUcqgUMiiV9b9XKmRQyAUoFfL6r3IZFA2/FAIU\nchnkMsG1TS4TIJcL9V9lMsjlAmfrOki7rgwJCQl4++23MW/evAu2Hzt2DF9//TUyMjJgNBoxbdo0\nbNq0iZk6EXk1k8mE1atX4+uvv4ZKpcLTTz+NDRs2IDs7G9OmTcPYsWMxf/58rF27Fvfee6/U4VIn\nUynliArVICpU0+xnnaKImlo7LNU2WGvtsNac/3X+99W1dlTX2VFT6zj/1Y4amwO1dQ7U1Dlgqbah\npLIWdTZHJ5xZPUGof3ChPhkTIGv0VSZcuE0QBMhk9bN8De/LhPr1dsL53wsyAQFqJWx1dtd2QYDr\nfVmj1wLqv0JA/b7nt6HhM2i8DRDQsG/978//d8F7Ded0/hNonKK4juXaB9AFqXBLcky7E9B2JV49\ne/YEUD8F21hmZibGjRsHhUKB2NhYxMfHIzc3F8nJye0ZjohIck6nE9XV1ZDJZKipqUFUVBR27NiB\nf/zjHwCAiRMn4l//+hcTL7oimSAgMECJwABlm4+h1wfDVFgJm92JOpsDtTYH6mxO2Oz1v+rsDtTZ\nnbCff21z/Pae3dHwS4TdUf+ewyHC4XDC4RRd7znFRtucIhwOEU6nCIfz/HvO+vdFUUSd3QmnCDgc\n9V+dogjx/OecTrR6hs8TXdUjAhEh7euM4Ja5cJPJhJSUFNdrg8EAk8nkjqGIiDqNwWDAtGnTMGzY\nMGg0Gtx0003o378/dDodZLL6gpzR0dEoLCyUOFLyFzLh/O1HpRzBUgfTAqJ4YSIWEaFFUZH5/Pbz\nyZpY/7mGr06nCBGo/yWKQOPP1R8UovhbYteQ3zV+LaL+w/Wfr3/d8LnGk0eNj4mGMc+/F6xRtjvp\nAlqQeE2bNg3FxcWXbH/mmWcwYsSIJve5eAYMAG8zEpHXq6ysRGZmJn7
44QcEBwfjqaeewubNmy/5\nHK93RE0TBAFyQUBD4wCNWuF35TiaPdsVK1a0+qDR0dE4d+6c67XRaERUVFSL9tXrPSdnZyyX8pQ4\nAM+JxVPiADwrFl+0bds2xMXFITS0fnH1rbfeil9//RWVlZVwOp2QyWRee71rC2+PH/D+c/D2+AHf\nOIfW6LBmVY1nuUaMGIGMjAzU1dXh9OnTyM/PR1JSUkcNRUQkiZiYGOzZswe1tbUQRRE5OTno06cP\nrr/+enzzzTcAgHXr1mHkyJESR0pEnkoQm7ov2ELfffcdXn31VZSVlUGn0yExMRHvvfcegPpyEmvX\nroVCoWA5CSLyGW+//TY2bNgAhUKB/v37Y8GCBTAajXj22WdRWVmJfv36YdGiRVAq275omoh8V7sS\nLyIiIiJquQ671UhEREREV8bEi4iIiKiTMPEiIiIi6iQemXitXr0at912G9LS0vDGG29IHQ6WL1+O\nxMRElJeXSzL+woULMXbsWIwfPx6zZ8+GxWLp9Bg2b96M2267DWPGjMGyZcs6fXygvizJ/fffj3Hj\nxshCFA0AAAwvSURBVCEtLQ2rVq2SJI7GnE4nJk6ciEcffVTSOMxmM5588kmMHTsWt99+O/bs2SNJ\nHCtXrsQdd9yBtLQ0zJkzB3V1dZLE4ck84WeptebOnYvBgwcjLS3Nta2iogIPPfQQxowZg+nTp8Ns\nNksY4ZVd7trhTedQV1eHu+++GxMmTEBaWhrefvttAEBBQQGmTJmCMWPG4Nlnn4Xdbpc40iu7+Jrp\nbfGPGDECd955JyZMmIDJkycDaMP3kehhcnJyxGnTpok2m00URVEsKSmRNJ5z586JDz30kDh8+HCx\nrKxMkhi2bt0qOhwOURRFcdGiReIbb7zRqeM7HA7x1ltvFQsKCsS6ujrxzjvvFI8ePdqpMYiiKBYW\nFor79+8XRVEULRaLOHr0aEniaGzFihXinDlzxJkzZ0oaxx//+Edx7dq1oiiKos1mE81mc6fHYDQa\nxREjRoi1tbWiKIriU089Ja5bt67T4/BknvKz1Fo//fSTuH//fvGOO+5wbVu4cKG4bNkyURRFcenS\npeKiRYukCq9Zl7t2eNM5iKIoWq1WURRF0W63i3fffbe4e/du8amnnhIzMjJEURTFefPmiR999JGU\nITbr4mumt8U/YsQIsby8/IJtrf0+8rgZr48++giPPPIIFIr62q7h4eGSxvPaa6/hhRdekDSGwYMH\nu9qRpKSkwGg0dur4ubm5iI+PR9euXaFUKnH77bcjMzOzU2MAAL1ej379+gEAgoKC0KtXL0lbsxiN\nRmRnZ+Puu++WLAYAsFgs2LVrF+666y4AgEKhgFarlSSWhj6Gdrvd1ceQfuMpP0utlZqaCp1Od8G2\nzMxMTJw4EUB9f8rvvvtOitBapKlrh8lk8qpzAACNpr75dl1dHex2OwRBwI4dOzBmzBgA9efw7bff\nShniFTV1zczJyfGa+IGGFkbOC7a19vvI4xKvkydPYteuXZgyZQqmTp2KvLw8yWL5/vvv0aVLF/Tt\n21eyGC62du1aDBkypFPHNJlM6NKli+u1wWCQvBddQUEBDh48KGlh3oakXOr2MAUFBQgLC8OLL76I\niRMn4s9//jNqamo6PY7GfQyHDBmC4OBgDB48uNPj8GSe+LPUVqWlpYiMjARQn9iUlZVJHFHLNFw7\nkpOTUVJS4lXn4HQ6MWHCBNx000246aabEBcX51V9Qi++ZpaVlSEkJMRr4gfqWx5Nnz4dd911Fz79\n9FMAaPX3kSQNki7X//Hpp5+Gw+FAZWUlPvnkE+Tm5uLpp592678IrxTL0qVL8f7777u2iW4sedaS\nnpjp6elQKpUXrLPoDO4877aoqqrCk08+iblz5yLo/7d3/yF11X8cx5/33tawKVFemS0lLBFq80fG\n3CJxYXcbc2sm7AdIEJQIWdnskt3
dWIuZCVZC/bFkWFsbDiHUjSgINreJ605pWRLixtzGVatrQw3d\nzLze2x/DS87rmn3nuffu+3r8dX98zufzPnjOx/c953Pve8mSkMRw8uRJrFYrjz76KO3t7SGJYZrX\n66W7u5t33nmH1NRUKisr2bdvH6WlpYbGcWMdw9LSUr766ivDj9dwFm7n0v+bG+eOUH9omi+z2cyR\nI0cYGxvjlVdeobe3d1abcN2nYHOm3++fdU6Ea/zTGhoaiIuLY2hoiBdffJGkpKR5xxySxOtm9R8b\nGhpYt24dAGlpaZjNZoaHh7nvvvsMjeX8+fMMDAyQn5+P3+/H4/EEMtzY2FjD4pjW3NzMqVOnQrKg\nPD4+nl9++SXw3OPxhOwWktfrpbS0lPz8fGw2W0hiAPjhhx9oaWnh1KlTTExMcPXqVcrLy6murjY8\nlvj4eOLj40lNTQVg/fr1gQoSRrqxjuHatWvp7OxU4vUP4XQu/a9iY2O5cuUKVquV33//PeTLQv5N\nsLkj0vZhWnR0NCtXruSnn376z3VCjRZsznz//fcZHR2NiPinxcXFAdeXQdlsNrq6uuZ9HIXdrUab\nzYbL5QLg0qVLeL3eBUu6biYlJYXTp09z/PhxWlpaWLp0Kc3NzQuSdP2b1tZW6urq+PTTT7n77rsN\nHz81NRW3283AwAB//fUXX3/9dchq0TmdTpKTk3nhhRdCMv60N954g5MnT3L8+HFqampYtWpVSJIu\nAKvVygMPPMClS5eA62smHnnkEcPjCFbHMBRxhLNwOpfm68YrE7m5uTQ1NQGRUZ8y2NwRSfswNDQU\n+Lbcn3/+icvlIjk5OWLqhAabMz/88MOIiR9gfHycq1evAnDt2jXa2tpISUmZ93EUdiWDJicncTqd\n9PT0sGjRIhwOB1lZWaEOi2eeeYbGxsbAp3kjrVu3jsnJycDY6enpvPvuu4bG0NraSmVlJX6/ny1b\ntlBcXGzo+ABnz57l+eefJyUlBZPJhMlkoqyszPA1bzfq6Ojg888/p7a2NmQx9PT08Pbbb+P1eklM\nTKSqqoqYmBjD4whWx1A1C2cKh3Npvux2O+3t7YyMjGC1Wnnttdew2Wy8/vrr/PrrryxbtoyPP/54\n1gL8cDHX3JGWlsaOHTsiYh/OnTuHw+HA5/Ph8/nIy8vj5Zdfpq+vL+LqhP5zzoyk+Pv6+nj11Vcx\nmUxMTU3x7LPPUlxczMjIyLyOo7BLvERERETuVGF3q1FERETkTqXES0RERMQgSrxEREREDKLES0RE\nRMQgSrxEREREDKLES0RERMQgSrxERCRibdu2jYKCAjZu3Mjy5cspKCigoKAAp9M5776KiopmVBaY\ny86dO/nxxx//S7jz0t3dzbfffrvg44ix9DteIiIS8QYGBtiyZUug8kkw06VpIsWXX36Jy+WipqYm\n1KHIbRSSWo0iIiILzeVyUV1dTUpKCufOncNutzM0NER9fT1erxdgRnWUNWvWcODAAZKSkigsLOTx\nxx+ns7OTwcFBNm3axI4dOwAoLCykpKSE7Oxs3nzzTaKjo+nt7cXj8ZCZmUlVVRUAv/32G+Xl5QwP\nD5OYmMjU1BS5ubls3759RpxXrlzBbrczPDwMQHZ2NkVFRezdu5dr165RUFDAqlWrcDgcdHZ2UlNT\nw/j4OAClpaXk5OTgdrspLCwkLy8Pl8uF2Wxm9+7dZGZmBu2/vLx84f8AEpQSLxERuWOdP3+eiooK\nVqxYAcAff/xBfn4+AL29vRQVFXHixImg23o8Hg4fPszo6Cg2m42tW7fy4IMPzmp34cIF9u/fj8/n\nIz8/n46ODrKystizZw85OTkUFRXR39/P5s2byc3NnbX90aNHSU5OZteuXQCMjo4SExNDSUkJZ86c\n4aOPPgrEvmfPHj777DPuv/9+PB4P27Zt45tvvgGuJ3AZGRk4nU5cLhd2u51jx44F7V9CR4mXiIjc\
nsR5++OFA0gVw+fJlPvnkEwYHB7FYLAwODjIyMhK0Du+GDRsAiImJISkpCbfbHTTxWrt2LXfddf3f\n6WOPPYbb7SYrK4v29nbee+89ABISEuasO5yRkUF9fT1RUVGsXLmS7OzsoO3Onj1Lf38/L730UqBo\nucVioa+vj3vuuYeoqCjy8vIAePLJJ7FYLFy+fPmW+xdjKPESEZE71pIlS2Y8LysrY/fu3axZswaf\nz0daWhoTExNBt128eHHgsdlsZmpqal7tTCbTLcX4xBNP0NTUxHfffUdjYyN1dXUcOnRoVju/38/y\n5cs5cODArPfcbves13w+HyaT6Zb7F2NEzipDERGRm7iV74qNjY2RkJAAQENDw5zJ1O2QlZVFU1MT\ncH3xf0dHR9B2/f39REdHk5eXh8Ph4OeffwYgOjp6xm3BzMxMLly4wPfffx94raurK/B4fHw8cNvx\nzJkzADz00ENz9i+hoSteIiJyR7iVK0xOp5Pi4mLuvfdenn76aWJiYoJuf2Nfc713s3a7du3irbfe\n4ujRoyQkJJCenj5jvGkul4uDBw9isVjw+/1UVFQA8NRTT/HFF1/w3HPPsXr1ahwOB3v37uWDDz5g\ndHSUyclJEhMTqa2tBcBqtdLV1UVtbS0mk4mamhosFsuc/Uto6OckREREFsDExASLFi3CbDbj8XjY\nunUr9fX1JCYm3vaxpr/V2NbWdtv7lttLV7xEREQWwMWLF9m5cyd+vx+fz0dZWdmCJF0SWXTFS0RE\nRMQgWlwvIiIiYhAlXiIiIiIGUeIlIiIiYhAlXiIiIiIGUeIlIiIiYhAlXiIiIiIG+RunOLafdYsI\nRAAAAABJRU5ErkJggg==\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xc447410\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "# Set up the data with a noisy linear relationship between X and Y.\n",
-        "num_examples = 50\n",
-        "X = np.array([np.linspace(-2, 4, num_examples), np.linspace(-6, 6, num_examples)])\n",
-        "# Add random noise (gaussian, mean 0, stdev 1)\n",
-        "X += np.random.randn(2, num_examples)\n",
-        "# Split into x and y\n",
-        "x, y = X\n",
-        "# Add the bias node which always has a value of 1\n",
-        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
-        "\n",
-        "# Keep track of the loss at each iteration so we can chart it later\n",
-        "losses = []\n",
-        "# How many iterations to run our training\n",
-        "training_steps = 50\n",
-        "# The learning rate. Also known as the step size. This changes how far\n",
-        "# we move down the gradient toward lower error at each step. Jumps that are\n",
-        "# too large risk inaccuracy; jumps that are too small slow the learning.\n",
-        "learning_rate = 0.002\n",
-        "\n",
-        "# In TensorFlow, we need to run everything in the context of a session.\n",
-        "with tf.Session() as sess:\n",
-        "    # Set up all the tensors.\n",
-        "    # Our input layer is the x value and the bias node.\n",
-        "    input = tf.constant(bias_with_x)\n",
-        "    # Our target is the y values. They need to be massaged to the right shape.\n",
-        "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
-        "    # Weights are a variable. They change every time through the loop.\n",
-        "    # Weights are initialized to random values (gaussian, mean 0, stdev 0.1)\n",
-        "    weights = tf.Variable(tf.random_normal([2, 1], 0, 0.1))\n",
-        "\n",
-        "    # Initialize all the variables defined above.\n",
-        "    tf.global_variables_initializer().run()\n",
-        "\n",
-        "    # Set up all operations that will run in the loop.\n",
-        "    # For all x values, generate our estimate on all y given our current\n",
-        "    # weights. So, this is computing y = w2 * x + w1 * bias\n",
-        "    yhat = tf.matmul(input, weights)\n",
-        "    # Compute the error, which is just the difference between our \n",
-        "    # estimate of y and what y actually is.\n",
-        "    yerror = tf.subtract(yhat, target)\n",
-        "    # We are going to minimize the L2 loss. The L2 loss is the sum of the\n",
-        "    # squared error for all our estimates of y. This penalizes large errors\n",
-        "    # a lot, but small errors only a little.\n",
-        "    loss = tf.nn.l2_loss(yerror)\n",
-        "\n",
-        "    # Perform gradient descent. \n",
-        "    # This essentially just updates weights, like weights -= grads * learning_rate\n",
-        "    # using the partial derivative of the loss with respect to the\n",
-        "    # weights. It's the direction we want to go to move toward lower error.\n",
-        "    update_weights = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)\n",
-        "\n",
-        "    # At this point, we've defined all our tensors and run our initialization\n",
-        "    # operations. We've also set up the operations that will repeatedly be run\n",
-        "    # inside the training loop. All the training loop is going to do is \n",
-        "    # repeatedly call run, inducing the gradient descent operation, which has the effect of\n",
-        "    # repeatedly changing weights by a small amount in the direction (the\n",
-        "    # partial derivative or gradient) that will reduce the error (the L2 loss).\n",
-        "    for _ in range(training_steps):\n",
-        "        # Repeatedly run the operations, updating the TensorFlow variable.\n",
-        "        sess.run(update_weights)\n",
-        "\n",
-        "        # Here, we're keeping a history of the losses to plot later\n",
-        "        # so we can see the change in loss as training progresses.\n",
-        "        losses.append(loss.eval())\n",
-        "\n",
-        "    # Training is done, get the final values for the charts\n",
-        "    betas = weights.eval()\n",
-        "    yhat = yhat.eval()\n",
-        "\n",
-        "# Show the results.\n",
-        "fig, (ax1, ax2) = plt.subplots(1, 2)\n",
-        "plt.subplots_adjust(wspace=.3)\n",
-        "fig.set_size_inches(10, 4)\n",
-        "ax1.scatter(x, y, alpha=.7)\n",
-        "ax1.scatter(x, np.transpose(yhat)[0], c=\"g\", alpha=.6)\n",
-        "line_x_range = (-4, 6)\n",
-        "ax1.plot(line_x_range, [betas[0] + a * betas[1] for a in line_x_range], \"g\", alpha=0.6)\n",
-        "ax2.plot(range(0, training_steps), losses)\n",
-        "ax2.set_ylabel(\"Loss\")\n",
-        "ax2.set_xlabel(\"Training steps\")\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "lSWT9YsLP1de"
-      },
-      "source": [
-        "This version of the code has a lot more comments at each step. Read through the code and the comments.\n",
-        "\n",
-        "The core piece is the loop, which contains a single `run` call. `run` executes the operations necessary for the `GradientDescentOptimizer` operation. That includes several other operations, all of which are also executed each time through the loop. The `GradientDescentOptimizer` execution has a side effect of assigning to `weights`, so the variable `weights` changes each time through the loop.\n",
-        "\n",
-        "The result is that, in each iteration of the loop, the code processes the entire input data set, generates all the estimates $\\hat{y}$ for each $x$ given the current weights $w_i$, finds all the errors and L2 losses $(\\hat{y} - y)^2$, and then changes the weights $w_i$ by a small amount in the direction that will reduce the L2 loss.\n",
-        "\n",
-        "After many iterations of the loop, the amount we are changing the weights gets smaller and smaller, and the loss gets smaller and smaller, as we narrow in on near optimal values for the weights. By the end of the loop, we should be near the lowest possible values for the L2 loss, and near the best possible weights we could have."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "dFOk7ERATLk2"
-      },
-      "source": [
-        "## The details\n",
-        "\n",
-        "This code works, but there are still a few black boxes that are worth diving into here. `l2_loss`? `GradientDescentOptimizer`? What exactly are those doing?\n",
-        "\n",
-        "One way to understand exactly what those are doing is to do the same thing without using those functions. Here is equivalent code that calculates the gradients (derivatives) and the L2 loss (half the sum of squared errors), and performs the `GradientDescentOptimizer` update, from scratch without using those functions."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": "both",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": [
-            {
-              "item_id": 1
-            }
-          ]
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 4219,
-          "status": "ok",
-          "timestamp": 1474671842604,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "_geHN4sPTeRk",
-        "outputId": "3ee8e5e5-0db0-4e6b-ef7e-f9e530fa7bee"
-      },
-      "outputs": [
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl4AAAESCAYAAAAsU9sMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xl4VOX1wPHvnTWTfZ8AgSCbQSCgUBFEWUVBkcWli3UB\nFBXFpVh/FVuhatWCtrW1ZVHccKuiqAgIGiSAsgiVhFV2wjaThGwzk22W+/sjMAYIEpKZzEzmfJ7H\nh+TOnfueC0k8ed/3nqOoqqoihBBCCCH8ThPoAIQQQgghwoUkXkIIIYQQzUQSLyGEEEKIZiKJlxBC\nCCFEM5HESwghhBCimUjiJYQQQgjRTHySeE2bNo3+/fszatQo77GysjImTJjAtddey8SJE7HZbL4Y\nSggh/MpisXDHHXcwcuRIRo0axYIFCwCYOXMmI0aMYPTo0UyZMgW73e59z9y5cxk+fDgjRoxg7dq1\ngQpdCBECfJJ4jRs3jvnz5592bN68efTr14/ly5fTt29f5s6d64uhhBDCr7RaLU888QRLly7lgw8+\n4J133mHfvn0MGDCAJUuW8Nlnn5GRkeH9mbZ3716WLVvG0qVLefXVV/nzn/+MlEcUQpyLTxKvPn36\nEBsbe9qx7Oxsxo4dC8DYsWP5+uuvfTGUEEL4VUpKCl27dgUgKiqKjh07UlBQQP/+/dFoan9k9urV\nC4vFAsDKlSsZOXIkOp2O9PR0MjIyyMvLC1j8Qojg5rc9XsXFxSQnJwO1P8hKSkr8NZQQQvjFkSNH\n2LVrF1lZWacdX7hwIQMHDgTAarXSqlUr72tmsxmr1dqscQohQodsrhdCiHo4HA4eeughpk2bRlRU\nlPf47Nmz0ev13HDDDQD1LisqitJscQohQovfEq+kpCSKiooAKCwsJDEx8bzvkX0RQohg4HK5eOih\nhxg9ejTDhg3zHl+0aBE5OTm89NJL3mNpaWkcP37c+7nFYiE1NfW8Y8jPOyHCk85XFzrzh8iQIUP4\n5JNPmDRpEosWLWLo0KHnvYaiKBQWBsfTjykpMRJLkMYBwRNLsMQBwRdLKJs2bRqdOnXizjvv9B5b\nvXo1r732Gu+88w4Gg8F7fMiQITz22GPcddddWK1W8vPzz1qarE8w/bxrjGD6emusUL+HUI8fQv8e\nGvOzzieJ19SpU9mwYQOlpaUMGjSIKVOmMGnSJB5++GE+/vhjWrduzcsvv+yLoYQQwq82b97M4sWL\n6dKlC2PGjEFRFB555BH+8pe/4HQ6mTBhAgA9e/ZkxowZdOrUiREjRnD99dej0+mYPn26LDUKIc5J\nUYNsvjtYMt9gysKDJZZgiQOCJ5ZgiQOCLxZxfsHy79UYwfT11lihfg+hHj+E/j005medbK4XQggh\nhGgmkngJIYQQQjQTSbyEEEIIIZqJJF5CCCGEEM1EEi8hhBBCiGYiiZcQQgghRDORxEsIIYQQoplI\n4iWEEEII0Uwk8RJCCCGEaCY+69UohAiczdbvqSm20y9xcKBDEUII8TNkxkuIEPeDdTPv7XyHDUc2\nnNWsXgghRHCRxEuIEPZj8S7e3/UOJl0E9//ifmnOLIQQQU4SLyFCVH75Id7aPh9FUZjQ/R7SY9MD\nHZIQQojzkMRLiBBUUFHAa1vnUuN28ttL7qJDfKdAhySEEKIBJPESIsSUVZfyat5sHE4HN1/8S3ok\nZwU6JCGEEA0kiZcQIaTCWcG8vNkUVxUz4qKRXNGqX6BDEkIIcQEk8RIiRDjdTl7f9ioWh4UBba5i\naLvhgQ5JCCHEBZLES4gQ4FE9LNj5JgfK9tMr9VLGdLpJnmAUQogQJImXEEFOVVU++vEDthdto0tC\nF36d+VtJulqAnP8dCXQIQogAkMRLiCC37MASNlo20DamLXd2m4hOIw0nWoJ8qy3QIQghAkASLyGC\n2JojOWTnf0WKKZm7e9xLhC4
i0CEJH7FX1AQ6BCFEAEjiJUSQ+sG6mc/2LiLGEMM9WZOJNsQEOiTh\nQ45KV6BDEEIEgCReQgSh3cU/8v6udzBqDUzKup8kU1KgQxI+5qhyBjoEIUQASOIlRJDJLz/Em9tf\nQ1EUJvaYROvoNoEOKaxYLBbuuOMORo4cyahRo3j77bcBKCsrY8KECVx77bVMnDgRm+2nPVrPPvss\nw4cPZ/To0ezcubNB48hSoxDhSRIvIYKItAIKPK1WyxNPPMHSpUv54IMPePfdd9m3bx/z5s2jX79+\nLF++nL59+zJ37lwAcnJyyM/PZ8WKFTz99NNMnz69QePIjJcQ4UkSLyGCxGmtgLrcKq2AAiQlJYWu\nXbsCEBUVRceOHbFarWRnZzN27FgAxo4dS3Z2NgDZ2dmMGTMGgJ49e2Kz2SgqKjrvOI5KSbyECEeS\neAkRBCqcFbyaN+enVkCt+wc6JAEcOXKEXbt20bNnT06cOEFycjJQm5wVFxcDUFBQQFpamvc9ZrMZ\nq9V63mvbKyTxEiIcSUEgIQLM6XbyxrbXOO44zpVtBkgroCDhcDh46KGHmDZtGlFRUecsWquq6lnH\nGlLgtsblIS4+EoNe2+RYAyUlJfSftA31ewj1+KFl3MOFkMRLiADyqB7e2fkW+8v2SSugIOJyuXjo\noYcYPXo0w4YNAyApKYmioiKSk5MpLCwkMTERqJ3hslgs3vdaLBZSU1MbNE7+kRLioo2+v4FmkJIS\nQ2FhaBeBDfV7CPX4IfTvoTFJoyw1ChEgqqqycPd/2Va0lc4Jnfl15m/RKPItGQymTZtGp06duPPO\nO73HhgwZwieffALAokWLGDp0KABDhw7l008/BWDLli3ExsZ6lyTPp6JaankJEW5kxkuIAPny4FI2\nHF9PenQ6d3W7W1oBBYnNmzezePFiunTpwpgxY1AUhUcffZR77rmHRx55hI8//pjWrVvz8ssvAzBw\n4EBycnK45pprMJlMPP/88w0eq6JKEi8hwo38pBciANYeXc3Xh1aQYkrmnqz7pBVQEOndu/c5a3G9\n+eab9R5/6qmnGjWWQxIvIcKOrGsI0cy2FPyPT/d8Iq2ABBXV8mSjEOFGEi8hmtHu4h95b+cCaQUk\nAFlqFCIcSeIlRDM5bMv3tgKa0OMeaQUkJPESIgxJ4iVEMyisKOS1vJOtgLreScf4zoEOSQQBeapR\niPAjm+uF8CO7w87r38zjy+KlAPx+wBP0SOkZ4KhEsKiQfo1ChB2Z8RLCj9785jU+r/iMMl0p0fEx\n7Nn1Y6BDEkFElhqFCD+SeAnhJ063k5WlX1PpcmCOakV6TFsKagoCHZYIIrLUKET4kaVGEfRsNgdz\n5mzFYjFhNlcweXJW0Pf28qge3t35NlWeKhIjk2gfexEAqYaGtZIRLZ/RoJU6XkKEIUm8RNCbM2cr\nW7YMQlEULBaV2bNXMXNmWqDDOidVVfl494dsLcrjusyR6Av1nCg7QaohldsG3nn+C4iwEBWhp1IS\nLyHCjiReIuhZLCZv4+ja5MsU4Ih+3vKDy1h/fB3p0elM6jVZqtKLekWZ9BSXVQY6DCFEM5M9XiLo\nmc0VqKoK1M4mpaUF7/+s1h5dzVeHlpNsSuLurHsl6RLnFG3SU1Ht8n5tCyHCg8x4iaA3eXIWs2ev\nwmIxkZZWyf339wh0SPWq2wpoUtYDxBhiAx2SCGJRJj2qClU1bkxG+VEsRLiQ73YR9KKjo/j9768I\ndBg/q24roHt63CetgMR5RUfqgdqSEpJ4CRE+5LtdiCawO+z89fO/8HXJcoyaCJ4f+iJtYtIDHZYI\nAdERJxOvaheSpgsRPmSPlxBN8I8v/sHSii+oMFSSlJTCui1rAx2SCBFRplMzXlK9Xohw4vcZryFD\nhhAdHY1Go0Gn07Fw4UJ/DylEveqrBxYdHdWoa1msBTzw0l/YHPUFxDq5POMykiOTKSiVAqmiY
eou\nNQohwoffEy9FUViwYAFxcXH+HkqIn1VfPbDG7h178KUX2JKUi8utgCeS3buPk963nRRIFQ0WVWep\nUQgRPvyeeKmqisfj8fcwIow1dCbLV/XAnG4nu03/w6OtIMrVFffBKmo8djKzMqVAqmiwUzNeUr1e\niPDSLDNeEydORFEUfvnLX3Lrrbf6e0gRZl5+eSNffBFDdbUWg0FHTc1G/vjHwWedZzZXYLGoKIrS\n6Hpgp1oBKRGV6JxJRLu64I73cHGVnkkjJ/vidkSYkD1eQoQnvydeH3zwASkpKRQXFzN+/Hg6dOhA\nnz59znl+MPXgk1jOFixxwE+xrFljx2YbhaIo1NSorFnzfr1xTp9+JS++uI5jxyJo3bqKxx7rf0F7\nvGx2G5P/+yDbyreRldEOZ247TtTUkGaK4a2//iko/m6CIQbRMLLUKER48nvilZKSAkBiYiLXXHMN\nW7du/dnEq7DQ5u+QGiQlJUZiCdI44PRYXC4jqqp6K4C73RHnjPOBBy71flxZ6aGysuH387uFD7K+\nZiNRxigSE1PoNrAjT975f96xAv13E2z/PuLnRUcaANlcL0S48Ws5icrKShwOBwAVFRWsXbuWzp07\n+3NIEYYGD44gNrYIo7GU2NgiBg82+nyMb4+uYYv9ByJ0EWQmXoJOq6egRp5gFI3301KjJF5ChBO/\nzngVFRXx4IMPoigKbrebUaNGMWDAAH8OKcLQI4/0xmjMrdNSqLdPr59b8AOL9nxMnC6eVgltMGgN\nqKoqTzCKJok06lCQPV5ChBu/Jl5t27bls88+8+cQQvi1pdCekt28t6u2FdDM4X/nm01fU+AoINWQ\nKk8wtmDTpk1j1apVJCUlsXjxYgB27drF9OnTqa6uRqfT8dRTT5GVlQXAs88+y+rVqzGZTLzwwgt0\n7dr1vGNoNAomo072eAkRZqRlkGixmlIw1e6w85/sf/JVyZcYNRE8N3QWnVO70HlkFz9HLYLBuHHj\nuP3223n88ce9x2bNmsWUKVMYMGAAOTk5zJo1iwULFpCTk0N+fj4rVqwgNzeX6dOn8+GHHzZonMgI\nSbyECDfSMki0WKcKplqtV5CbO5jZs7c2+L3zVv6HJRWLva2A1m/5tlEx2GwOZs1az9SpucycuQ67\n3dGo64jm1adPH2JjY087pigKNlvtwws2mw2z2QxAdnY2Y8aMAaBnz57YbDaKiooaNE5khE7qeAkR\nZmTGS7RYjS2Yaqsp5+uSFbh0Ti6K69ikVkC+rJYvAuuJJ57g7rvv5q9//SuqqvLBBx8AUFBQQFpa\nmvc8s9mM1WolOTn5vNeMNOqornHj9njQauT3YCHCgSReosVqTMHUSlcl8/JmA9Amui3mqLQmbaT3\nVbV8EXjvv/8+Tz75JMOGDePLL79k2rRpvPHGG94yJnWd+jc/n4Q4E1CKKSqCuGjfP43rby2hbEio\n30Ooxw8t4x4uhCReosWaPDmL2bNX1XnascfPnu90O3lj22scsx/jpq6/ZPmHu/mhykGyMZo/PH5z\no2LwRbV8ERw+/fRT/vjHPwJw3XXXeT82m81YLBbveRaLhdTUhiXq2pP52eFjpdQkRPo2YD8Lprpx\njRXq9xDq8UPo30NjkkZJvESLdSFPO55qBbSvdC9ZKT2xLs1Ef2wKrU8mTG+9tYr77os6a7N+RISG\nWbPWn3MD/4UmfyJ4nDmTZTab2bhxI5dffjnr1q0jIyMDgKFDh/Luu+8ycuRItmzZQmxsbIOWGaF2\nqRGklpcQ4UQSLxH2VFXlkz0fsbUoj47xnfhN5u384fUdZy0R1rdfKzLS+LN7uPxZ6kL4z9SpU9mw\nYQOlpaUMGjSIKVOm8Mwzz/Dss8/i8XgwGo0888wzAAwcOJCcnByuueYaTCYTzz//fIPHiYqQxEuI\ncCOJlwh7Kw59ybpj39Emug3ju9+NXquvd4mwvv1aEREG2
cPVAr300kv1Hv/kk0/qPf7UU081apxI\n6dcoRNiRx2hEi/dzJR2+O7qWFQe/JCkiiXuy7sOkq02cJk/OolevVZjN6+nVaxX3398Ds7nCu/x0\nKhlr3bryrGNCNFTkyRkvh1SvFyJsyIyXCGpNKYJ6yrlKOuQVbuGTPQuJ1kdzT9b9xBh+qttU3xJh\nffu1UlJiePpp2cMlGufUHq9KWWoUImxI4iWCmi/qYNW3RLinZDfv7nwbo9bAPVn3kRKZct7r1JeM\nyR4u0RSnZrxkqVGI8CGJlwhqvqiDlZBQzLffrqGi0kNN6ie0ST/A1C/K6NS6C3dfdh/pMW19HbYQ\nDXJqj5dUrxcifMgeLxHU6ttXdaEURQt0ozJxA9U9CjjeeicntEXElEfTOUF6L4rA+amchOzxEiJc\nyIyXCGq+qINlLdCj6/AOnthPcKUdp8atpX1cB/ROgx8iFqLhomSpUYiwI4mXCGq+2EN1VPNfjnfY\nSoV+Nx6jE1NNEubINFIrG9cGSAhf0es06LSK1PESIoxI4iVaPFfrXbhMe1DcHnTlCpEVKpmVmdw2\n8M5AhybCnKIoRBp1kngJEUYk8RItlt1h551Vb7KzeiceTRXxsdHEJcXT+mg6k0ZODnR4QgC1G+xl\nj5cQ4UMSLxG06tbwio8vQ1HclJQkYjZXMH36led9/7ur3mK5cxnaFC1apwbtfh3xrRIY1HlwM0Qv\nRMNERugoKqstxHvqCV4hRMsliZcIWnVreH37bTGwnU6drsBiUXnxxXU88MClP/v+jbYNFHistI5v\njb7KgJKoYWyncbLEKIJKZIQOl1vF6fJg0GsDHY4Qws8k8RJBq24Nr+pqHRCBy+Xi0CEbhw+X4XCs\nO2cl+3XHvuV4zTGMxgguSe6BXqMnMyVTlhhF0DlVUsJR5ZLES4gwIHW8RNCqW8PLYHBhNFZx6JCN\n0tJE3O44cnMHM3v21rPel1e4hY93f0SvdpdxQ+SNmG1m2UwvgpY0yhYivMiMlwhadWt4dexYxObN\nFnbtKkCjqaRt25H1VrLfW7KHd3e+jUGr5/5eD9I2pl2AoheiYby1vGSDvRBhQRIvEbTq1vCaNWs9\ncXETSU4upawskcOHv6V9+37eSvZ2h53/ZP+Tr0qWY9QYeG7oLEm6REj4qXq9zHgJEQ5kqVGEhFP7\nvdq3jyE+vhittoxevVZx//09sBZaufbpQbyy62X2WnYTZYhh/ZbvAh2yEA0ijbKFCC8y4yVCgtlc\ngcWiotPp6NAhnv7947xPNd475y7yMw7h0agoCuzdspsefbMCHLEQDePd4yUzXkKEBUm8REg4s2fj\nY4/1p7LSQ5Wriu26bSiKgk6jRa/RU6GtJNUg7YBEaJBG2UKEF0m8REg4s2djdHQUNkcJb25/Da2i\nJcYYi8fpwe1ykVqdIk8wipAhS41ChBdJvMQ51a0c39Bq8c3B7rDzTs5rLD6yFJu7nNsuvYPcjT9Q\noBZiVlKZ9fg/iIo6u7aXEMHoVOLlkKVGIcKCJF7inOpWjm9otXhfOjPxmzw5C2thAdc/fSO29EIw\neOjcthMmm4m3Hnu/2eISwpeiTu7xqpTES4iwIImXOKe6leMVReHYsYhmHf/MxG/27FUs2jeD0h5F\nqAYXeBTyN1o5MfxEs8YlhC+ZjLXV6h2yx0uIsCDlJMQ51a0cr6oqrVtXNev4dRM/t8fBN3u/ID96\nGx6lGlTQqBE49U7ZSC9CmlajwWjQyh4vIcKEJF7inCZPzqJXr1WYzevp1WsVjz3Wu1nHr5v45Xv+\nw8EOn+NKrACNB8p14NQQXR4vG+mFz02bNo3+/fszatSo044vWLCA6667jlGjRvHiiy96j8+dO5fh\nw4czYsQI1q5de8HjRUXopJyEEGFClhrFOdX3JGFlpa3Zxj9VQuLwEZV1rpep1pei12hxHlPQlEJ7\npSvvTZ8jG+mFz40bN
47bb7+dxx9/3Htsw4YNfPPNN3zxxRfodDqKi4sB2LdvH8uWLWPp0qVYLBbG\njx/PihUrvLO1DRFp1FFcXu3z+xBCBB+Z8RJBKzo6ijvuuohNpgexdyzEqXWiidaSEB3HoMz+fDfn\nS9q3bx/oMEUL1KdPH2JjY0879v7773PPPfeg09X+vpqYmAhAdnY2I0eORKfTkZ6eTkZGBnl5eRc0\nXmSEnspqF56TM7xCiJZLEi8RtKyFVgbP6Mfe+L2oOhUUcJU40VXruKbbNYEOT4SZgwcPsmnTJm69\n9VZuv/12tm3bBoDVaqVVq1be88xmM1ar9YKuHWnUoQJVss9LiBZPlhpF0PrdWw9SklECCqAFpVpB\nq9XRO74PD4x5gIoKT6BDFGHE7XZTXl7Ohx9+SF5eHg8//DDZ2dnefYh1NXSZMSUlBoDEeBMAEVER\npCRG+i5oPzsVfygL9XsI9fihZdzDhZDESwSl2lZA21GcCopBQTmuATdc5LqIf894laioKCoqmm+/\nmRBpaWkMHz4cgKysLLRaLSUlJaSlpXH8+HHveRaLhdTUhj1pW1hY+zWsoTZ5O3KsFI3b7ePI/SMl\nJcYbf6gK9XsI9fgh9O+hMUmjLDWKoOPyuE62AtIQFxuPqSgCvVZHckkSCx//XDbTi2Zx5kzWsGHD\nWLduHQAHDhzA6XSSkJDAkCFDWLp0KTU1NRw+fJj8/Hyysi6sSfupfo1SvV6Ilk9mvERQ8age3tu5\ngD0le7ir70Q2rd1IQezJVkAP/oOUFKnZJfxv6tSpbNiwgdLSUgYNGsSUKVO46aabeOKJJxg1ahR6\nvZ6//vWvAHTq1IkRI0Zw/fXXo9PpmD59+gU90QgQbaqtXm+vlCKqQrR0kniJoGAttPL4W4+wS7ML\nl9bJL3v/hvt+8SD6K/SBDk2EoZdeeqne47Nmzar3+L333su9997b6PFSTu7xKiipaPQ1hBChQRIv\n0Szq67sYHf3TkuH/vfUoPyT/QIXLgVbRsnV9LvoBknSJ8GA+uaHeUiyJlxAtnSReolnU13fxvvt6\neJOxDYY8nHF2tBodCcYEitTa/ovnS9iEaAmS4yLQKArWkspAhyKE8DPZXC+axZkNty0WkzcZ22M3\nYteCw+7CWR5JSXE1SZ7a4pSnzrFaryA3dzCzZ28N5G0I4Rc6rYaU+AisMuMlRIsniZfwKZvNwaxZ\n65k6NZeZM9dhtzuAsxtup6VVsv+gg03GEWxufQOuag/kdEazsw2673rTsWo8UH/CJkRDLV26FLvd\nDsDLL7/MxIkTvYVPg405MRJbhRNHlWywF6Ilk8RL+NS5ZqjObLh9//092KQ8S3nWZtT4GjwJGozu\naK6oWMuluoXYy1sD9SdsQjTU7NmziY6OJi8vj7Vr1zJmzBieffbZQIdVr7ST+7ysxfI1LkRLJnu8\nhE+da4bqzIbbJypPUNnxCDqdE50rBrfOidZcA1WnJ1inGmVbLCbS0iq5//4ezX9TImSd6qv47bff\ncssttzBq1Chef/31AEdVP7M38aqgQ+vY85wthAhVkngJnzKbK7BYVBRFOecMlb3Gxqt5/8GoMZIU\noyNKH4Xb7UazX4/ZvP60BOvMhE2IC6EoCkuXLmXp0qX85z//AcDpDM6lvLSE2l9S5MlGIVo2vyde\nq1ev5rnnnkNVVW666SYmTZrk7yFFAJ1vhqrKVcVrW+dSWFnE7wc+wZdfL8GqFtQWSH1GCqQK3/rj\nH//Ia6+9xs0330zbtm05ePAgffv2DXRY9fLOeEktLyFaNL8mXh6Ph2eeeYY333yT1NRUbr75ZoYO\nHUrHjh39OawIoHPNUNkddhaseoOvS1ZQ6ankV71u45c9fsOvet0WgChFuLjsssu8M10A7du3509/\n+lMAIzq3+BgjBp1GZryEaOH8urk+Ly+PjIwM2rRpg16v5/rrryc7O9ufQ4og9e6qt1h
avQSL5jhK\nlIaKw44LbqsixIV64YUXsNlsuFwufvOb39CrVy8+++yzQIdVL42ikJoQibW48qw+kUKIlsOviZfV\naqVVq1bez81mMwUFBf4cUgQhVVX5tnwNxVVFxBhi6ZJ4MUXOokCHJcLAd999R0xMDGvXrsVsNrN8\n+fKg3VwPkJZootrppsxRE+hQhBB+4telxsb81paSEuOHSBpHYjlbY+JYumcpZZQQGxFDz7QstIqW\nDtp2Tb6nUP478ZdgiiWYfP/991xzzTWYzeagnmmt+2RjfLQxwNEIIfzBr4lXWloax44d835utVpJ\nTf35zdOFhTZ/htRgKSkxEssFxlFfe59t5bl8tHshl6b3IaE0EVuBjVRDKqMH/rJJ9xQqfyfNKdhi\nCQZJSUlMnz6dNWvWMGnSJFwuF263O9BhnVNanZ6NF7dLCHA0Qgh/8Gvi1aNHD/Lz8zl69CgpKSks\nWbKEv/3tb/4cUjSjMxMtp9PJjh3Dvf0YZ8ydD72/J0ofxeRLHyY1Up5YFM3rpZde4vPPP2fs2LHE\nxcVx5MgRxo8fH+iwzsksRVSFaPH8mnhptVr+9Kc/MWHCBFRV5eabb5YnGluQMxtfW62LSU5xcER5\nHUf8TrY4NjDSczl397hXki4REImJifz2t7/lwIED7N27l/bt2zNu3LhAh3VOdWe8hBAtk9/reF19\n9dVcffXV/h5GBMCZVepV1c4RXqe88/fYordijLARVx5Lu9iMAEcqwtXWrVt56KGHMBgMqKqKy+Xi\nX//6F926dQt0aPWKNumJitBJLS8hWjCpXC8a7cwq9VcOMLK4/E1K4g6j1cClbX8BNGwjc337w6Kj\no/x7A6LF+8tf/sJzzz1Hv379AFi/fj3PPPMMH3zwQYAjOzdzYiSHLDbcHg9ajbTTFaKlke9q0Whn\nNr6O7ZmHu0MRxlgP8QnRlJaVkmpo2BLjuZprC9EUlZWV3qQL4IorrqCyMrj3T5kTInF7VE6UVQU6\nFCGEH0jiJRrtVJX6Pz/dkciuG3gn/22ceidJjhQSyxKItERx28A7G3StczXXFqIpTCYT69ev936+\nceNGTKbg/tpKSzzVszG4E0QhROPIUqNosgWr3uCL6s+p0VajNWpJNCXStV03MisziYo693Jh3eXF\nvXt/JCKiO3p99DmbawtxoaZNm8bDDz+MwWAAahtk//Of/wxwVD/v9J6NSYENRgjhc5J4iUazFlp5\n/K1HWFu1Fo/JRceUTuhLDVCtkJmSed7ZrrpPRUZE9Ka6+gPS0zvX21xbiMbIyspixYoVHDhwAFVV\nueiiixipvGqIAAAgAElEQVQ+fDirVq0KdGjnlFaniKoQouWRxEs02uNvPcLmpM04HdWoOpXiA8UM\nunoomZWZTBo5+bzvr7u8qNfrSU/vzEsv9fR32CLM6PV6unTp4v082PsgpibULjVK4iVEyyR7vESj\n2B12Nto3UlpRgqJqiKqJxlnlIrPy/DNdp5jNFd7/CcryomguwdwyCCDCoCM+2iB7vIRooWTGSzTK\nc8ufpiqmArQqOoMWpUqhd1LvBs10nTJ5chazZ6/CYjHJ8qLwqb17957zNZfLdd73T5s2jVWrVpGU\nlMTixYtPe23+/PnMmjWL9evXEx8fD8Czzz7L6tWrMZlMvPDCC3Tt2rVJ8aclRvJjfik1TjcGvbZJ\n1xJCBBdJvMQF21a0lfXl39E6tQ2OAw5qlBpiy2OZ9fg/Lug6p56KFMLXJk2adM7XjMbzN58eN24c\nt99+O48//vhpxy0WC9999x2tW7f2HsvJySE/P58VK1aQm5vL9OnT+fDDDxsfPLUb7Hfll1JQWkl6\nSnSTriWECC6SeIWZphYq3V+6l3d2vEm0NopOqZ2JaRuLqqpkVmaSkiJtgURwWLlyZZPe36dPH44e\nPXrW8eeee47HH3+c+++/33ssOzubMWPGANCzZ09
sNhtFRUUkJyc3enxzwk8b7CXxEqJlkT1eYaYp\nhUqPlh/l9W2v4lY9/Hnw8/Rx/4KE0oQL2tclRKhauXIlrVq14uKLLz7teEFBAWlpad7PzWYzVqu1\nSWNJz0YhWi6Z8QozF1qo1O6w8/ryeXy95yuORx0hKTqZPw56mk5xF/P19hpKLCYM5gpUaccpWrCq\nqirmzJnD66+/ftZr9T0l2dAN/CkpMfUe73rykmUVrnOeEwyCObaGCvV7CPX4oWXcw4WQxCvMnNlf\n8XxPEr6X8zZfHPmcQ8kHUfFg8ESwY8dW1m3Xe2twWSwqs2evarb9Wjabg3//+wf279dIX0fRLPLz\n8zl69CijR49GVVWsVivjxo3jo48+wmw2Y7FYvOdaLBZSUxu27F5YaKv3uNbjQaMoHDpeds5zAi0l\nJSZoY2uoUL+HUI8fQv8eGpM0ylJjmDmzv+LPPUlod9hZtnUpuyt2UVlWgUkxYdQYKKgpCGiLnzlz\ntrJp01XS11H4Vd2ZrC5duvDtt9+SnZ3NypUrMZvNLFq0iKSkJIYOHcqnn34KwJYtW4iNjW3S/i4A\nnVZDcnyE1PISogWSGa8Q1piN8hfyJOGCVW9wKOkA7ioPLjyc2F2JGuckLj4OwwXOnPmS9HUU/jZ1\n6lQ2bNhAaWkpgwYNYsqUKdx0003e10993QMMHDiQnJwcrrnmGkwmE88//7xPYjAnRLJ1/wkqqpxE\nRuh9ck0hROBJ4hXC6rbc8fVyn6qq5JSuwmgyYiiLxVOuRVtsIPrAHZSr3Xj4oZ9qcCUklFFd7Wbq\n1NxmWfozmysoLpbCq8J/XnrppZ99PTs7+7TPn3rqKZ/H0CY5iq37T3DQYuOS9ok+v74QIjAk8Qph\n/pj5sTvsvLvqLb4tX8ueoh9JTEtEqcmk0mMmrrIH6foHKD6x/rSZs1mz1jfrfq/Jk7N466017N+v\n8RZebWqZDCGCTdf2CXy5MZ/tB4sl8RKiBZHEK4Rd6Eb5hngv521WerI5ouYTmxJH2uFWlJRHUXrw\nEtoZ7sbtOnuc5l76i46OYsaMq0/bkNncyZ8Q/talbTw6rcL2/cXcMijQ0QghfEUSrxDm65Y71kIr\n/9n4L4qTTqDHwJAOw8iIbM8j9/2e2bO3Ulq6nfj4srPG8UcCeKFk35doaYx6LZ3T49l5qIQyRw1x\nUYZAhySE8AFJvEKYr1vu3P/2RE7EnMCj96DTeti853t+0eVy7zjneuw3GHouBkPyJ4Svde+QyM5D\nJew4WEy/bmnnf4MQIuhJ4iWwO+z88+u/sdm1CbQqkdZINHoNlCjcds/5K9IHQ8/FYEj+hPC1bu0T\n+Yh9bNsviZcQLYUkXoI52f/iy8qlaCI1KEbQV+mJy4jnYsPFREWFxgb1YEj+hPC1tqnRxEYZ2H6w\nGFVVG1wRXwgRvKSAapgrrjpBdulXuD0uerf6BTprDI5jHlidwoybnwt0eEKENUVR6NY+kXJHDYcL\n7IEORwjhA5J4hTF7jY15ebNR0NAutj2lR01E19xA29JpdLCv4qOPLOe/iBDCr7p3qC0lsf1gcYAj\nEUL4giReYaraXc38rfMorChkYp9JDNYMwbM3keh9l5CuTpQnA4UIEqdqeG3bL4mXEC2B7PEKM3aH\nnQWr3iC75CsqPBX8qudtjOt6C8olCqXb1pFbPBhFI08GChEs4qIMtEuNZs+RUqqdbox6baBDEkI0\ngSReQc5mc/Dvf//A/v0an1Rkf3fVWyyrWcIJTRHxpgQqjjiwd6hgzpytHDmix25fQEZGW9q2RZ4M\nFCJIdOuQSH6BnR/zS8nqmBTocIQQTSBLjUFuzpytbNp0FVbrFeTmDmb27K2NvpbNbuO9PQv4sXAX\nFRWVXBTbgSJnkbfnY3HxVURH307btvD7318hLXeECBLdTy43bj8gy41ChDpJvIKcLyuy//nLP1Ko\nLQQtGE1G9h3
bS6ohVaq+CxHkOqXHY9BrZIO9EC2AJF5BzmyuQFVVgCbtu9p4fAP/s28iJTGVDEd7\nIk+YiLREcdvAO302hhDCP/Q6DZntEjhW5KC4vCrQ4QghmkD2eAW5yZOzeOutNezfr2l0RfbtRdv4\naPcHxOpiSUtpQ1SbKFRVJbMyk6ioKKn6LkQI6HZRInn7TrD9QDFX9Wwd6HCEEI0kiVeQi46OYsaM\nq+vtkdgQB8r2s2DHG+g0Wl4Y9jdW/+8bCioKSDWkctvAO71jSNV3IYJb94tOlpWQxEuIkCaJVwti\nszmYM2crFosJs7mCm+5K5c3dr+JWPUzodg+ZSV3JHNk10GEKIRohLTGSpFgjOw4W43J70Gllp4gQ\noUi+c1uQU08nWq1X8P32LB5+73kqXJX8KvM3dE26JNDhCSGaQFEULu2SgqPKxZY9RYEORwjRSJJ4\ntSAWiwm3x8F+ZrGl7Tj227YzvM119Db/ItChCSF8YPClbQBY+b8jAY5ECNFYkni1IAmJReRxO/kX\n/5MK0z4iorQc2nMg0GEJIXykVVIUXTMS2JVfyrEiR6DDEUI0guzxakFiLvmBStM6VH01evQku3UU\n1BQ0+P1n7hFrapV8IYTvDb60DTsPlfDND0e57ZougQ5HCHGBZMarhVBVlfWV6zFFa4iLNtE6MRmn\n4iTVkNrga9TdI9bUKvlCCP/o1TmZ+GgD3207TlWNK9DhCCEukCReLYCqqize9yllrlLaxLWhnSYD\nY42RDGeGt2REQ0gFeyGCn06rYWCvNlRWu1m/wxrocIQQF0iWGkOY3WHnvZy3+b58I0drjnB5u74k\n21MoNZbW1um69U6iohq+VGg2V2CxqCiK0qAK9rI0KVqqadOmsWrVKpKSkli8eDEAM2fO5JtvvsFg\nMNCuXTuef/55oqOjAZg7dy4ff/wxWq2WJ598kgEDBvg1vqt7tmbxtwdZufkoA3u29v7CJIQIfjLj\nFcLey3mbNepqdni2UWmsIMGWwIM3PMIfx8xg0sjJF5R0QW2V/K5dl2GxvI/Vupjq6hrs9nNv4JWl\nSdFSjRs3jvnz5592bMCAASxZsoTPPvuMjIwM5s6dC8DevXtZtmwZS5cu5dVXX+XPf/6ztwWXvyTE\nGLmsSzJHCu3sO1ru17GEEL4lM14hbIdjOwdc+9Bp9GQmdsPmsDfpetHRURiNJszmESiKws6dKv/4\nxwqMRgMWi4kOHdzceefF3lktWZoULVWfPn04evToacf69+/v/bhXr14sX74cgJUrVzJy5Eh0Oh3p\n6elkZGSQl5dHz549/Rrj4MvS2fRjISt/OEKn9Di/jiWE8B2Z8QpRB8r2s69qD6BwcWJXTDrTBW2k\nP5czk6lVq6q9s1qbNl192qyWNNcW4WrhwoUMHDgQAKvVSqtWrbyvmc1mrFb/773KbBdPq6RINu0q\noNxR4/fxhBC+ITNeIcjiOM7rW+fRqVUXEu2JKBWa03ovNsWZ+7zAcc5ZLWmuLcLR7Nmz0ev13HDD\nDQD1Lis2dM9VSkpMk2IZdVVH5n26lf/tO8EtQ5u/tERT4w8GoX4PoR4/tIx7uBCSeIWYkqpi5uXN\npsJVyW3d76BP2uU+vf6ZyVR1dRw7d9a/4V6aa4tws2jRInJycnj77be9x9LS0jh+/Lj3c4vFQmpq\nw2afCwttTYonq30CRr2WxWv2c+Ulqeh12iZd70KkpMQ0Of5AC/V7CPX4IfTvoTFJoyReIcTutDM3\n7z+UVZdxQ4cbfZ50wdnJlN3u8CZiHTp4uPNOmdUS4eHMmazVq1fz2muv8c4772AwGLzHhwwZwmOP\nPcZdd92F1WolPz+frKysZokxMkLH4Mva8OWGfLI3H+W6vu2aZVwhRONJ4hUiqt3VvL51HoUVhQxq\nO4TB7YY2y7h1E7FQ/81EiIaaOnUqGzZsoLS0lEGDBjFlyhTmzp2L0+lkwoQJA
PTs2ZMZM2bQqVMn\nRowYwfXXX49Op2P69OnNWt7h+n4ZrMk9xpJ1B7mqZyuiIvTNNrYQ4sIpqp+ee37llVf48MMPSUpK\nAuDRRx/l6quvPu/7guV/7MGSZNgddj7d9AGfHv6cCk8Ft2b9mjt7TghI3Z5g+TuB4IklWOKA4ItF\nnJ+v/r2+3JDPh9/s5bq+7bh1cCefXPN8gunrrbFC/R5CPX4I/XsIuqXG8ePHM378eH8O0aLZHXbu\nfWU830dtoFqtIdGYSNmBUpReUixRCPGTob3b8PXmw3y96QhDL0snKS4i0CEJIc7Br+Uk/F1EsKWb\nv3weq6u+pRw7NYqTyupK1uzPCXRYQoggo9dpGXtVB1xuD5+u3R/ocIQQP8OvM17vvvsun332Gd27\nd+cPf/gDMTGy/AANb7WzMO8bnAYPqqpF8UTgqK7mfLmstPERIjz165bG8o35fLfVwrW/aEd6anSg\nQxJC1KNJe7zGjx9PUVHRWccfffRRevXqRUJCAoqi8Pe//53CwkKee+65JgXbUsyYsZpNm67ylmjo\n02cNM2acvv9t3eF1jHlxEqXaIjylLtB6MJ4w8NRNj/L4bY836dpCiODg670teftO8I+PcsnqmMQj\nt/i3cn6o782B0L+HUI8fQv8emn2P1xtvvNGg82699Vbuu+++Bp0bLP8A/vxi2L9fg9PpPu3zumPt\nOLGdN7bNp3XkRUT+cAM2wyY8NR66J7fn1ivv+Nm4znftpgimb5BgiSVY4oDgi0U0vx4dEslsF0/e\nvhPsOlRCZkZCoEMSQpzBb3u8CgsLvR9/9dVXdOnS/FWVg9XPtdo5ULaft7e/jlbR8I9fP8ktWUMY\nl/YX7s76A28985fzNr6WNj5ChC9FUbjl5FON7329B5fbE+CIhBBn8tser1mzZrFz5040Gg1t2rTh\n6aef9tdQIae+Vjt2h5052a+wouRL9Bo9Tw9+nktaXcIlv7+wmQxp4yNEeLuoVSxX92zF6tzjLFl3\niNEDLgp0SEKIOvyWeM2cOdNflw559bXaeXnxiyyp/JwafQ0d4jqxOW8jvdP7+OTaQojwcuvgzmzd\nX8wX3x3k0s7JtDPL0q8QwcKv5SREwzicDr4qWUGNu4Z2se1JjTJTUFPg83FsNgezZq1n6tRcZs5c\nh93u8PkYQojAi4zQcdeITNweldeX7pQlRyGCiCReAVbtrmb+1rk43U6qCyPJ3xrBli0W4ojz+Vhz\n5mxly5ZBWK1XkJs7mNmzt/p8DCFEcOjRIYkre6SRb7WzbEN+oMMRQpwkiVcAuT1uFmx/g0Plh0g6\n0R/j+t/i3JtJ9YZhlO/o5fPxLBaTt9WQoihYLCafjyGECB6/GtqZuGgDn689wJFCe6DDEUIgTbID\nwu6w8+6qt8gp+4ZSVynXX3IjnuMD0GivhGpAC8Un1vt8XLO5AotF9db4kicehWjZoiL03HltJv/8\nOI/Xl+zkyTt6o9XI79tCBJJ8BwbAezlv85VrOYc4iMvkQrFCK3O138tATJ6cRa9eqzCb19Or1yp5\n4lGIMNCrczL9upk5aLGxZN2hQIcjRNiTGa8A+L58I8c9xzDpIslMuoTi8mIeaYYyEPLEoxDh6dfD\nurArv5TP1hygQ+tYul+UFOiQhAhbkng1s02WjRytOYLeaCAz6RJ0io5UQ6okRUIIv4k26XlgbA9e\neHcz8z7fwVN39SE5TvZ4ChEIstTYjHac2M5/f3yfrLY9ucF0I2m2NDIrM7lt4J2BDk0I0cJ1aB3L\nb67pgr3Syb8XbcPpcp//TUIIn5MZr2ZysOyAtxXQvZc9wEVxHQIdkhAizAzs2Zr9x8pZm3ecd1bs\nZvzIroEOSYiwIzNezcDqsDB/61zcqofbLxkvSZcQIiAUReH24V3ISIthTd5xcrYcDXRIQoQdSbz8\nrLSqhHl5s6lwVXJLl1/RLbl7oEMSQoQxv
U7LA2O6ExWh492vdrP3aFmgQxIirEji5UcOp4N5ebMp\nrS7l+g6juLxV30CHJIQQJMebuG90dzweePmjXI5KcVUhmo0kXn5S2wpoHtYKKwPTBzG47dBAhySE\nEF7dLkpk/MhMHFUu/vZhLkVlUlBZiOYgiZcf/NQK6CC9zX0Y1XGMt1WPEEIEiyt7tOLWwZ0osVXz\n0n9zKa+oCXRIQrR48lSjj6mqyoc/vs/O4p1kJmbyy4t/06Sky2Zz8O9//8D+/RrM5gomT84iOjrK\nhxELIcLZdX3bYausYdn6fP7+YS6P//pSTEb5X4MQ/iIzXj72xf7P2WT9nnYx7bij2wS0Gm2Trjdn\nzlY2bboKq/UKcnMHM3v2Vh9FKoQ4l2nTptG/f39GjRrlPVZWVsaECRO49tprmThxIjabzfvas88+\ny/Dhwxk9ejQ7d+4MRMhNcvPAjlyV1YpDFhv/+jiPGqfU+BLCXyTx8qFVh1ey6vBKUiJTmJh1L0at\nscnXtFhM3hkzRVGwWKTatBD+Nm7cOObPn3/asXnz5tGvXz+WL19O3759mTt3LgA5OTnk5+ezYsUK\nnn76aaZPnx6IkJtEURTuuO5iLu2czK78Uv7xUS6V1a5AhyVEiySJl49ssmxk8b7PiDPGMSnrfqL1\n0T65rtlc4ffm2UKI0/Xp04fY2NjTjmVnZzN27FgAxo4dS3Z2tvf4mDFjAOjZsyc2m42ioqLmDdgH\ntBoN943uTu8uKezKL2XW+z9gkz1fQvicJF4+sPPEDv774/tE6kxMyrqfxAjfNaCdPDmLPn3WYDav\np1evVX5pni2EOL/i4mKSk5MBSElJobi4GICCggLS0tK855nNZqxWa0BibCq9TsN9Y7pxZY80Dlps\n/PW9HyixVQc6LCFaFNlB2UQHyw7w9o7aVkATekwiLaqVT68fHR3FjBlXU1hoO//JQohmd2pGuq6G\nPlCTkhLj63B84vE7Lmf+4m18vno/f33/B565tx+tk8+exQ/W+C9EqN9DqMcPLeMeLoQkXk1gdViY\nv20eLo+bu7pNlFZAQrRgSUlJFBUVkZycTGFhIYmJiUDtDJfFYvGeZ7FYSE1NbdA1g/kXqtH9MtCo\nKp+uOcDv/7mGB8f1oFObOO/rKSkxQR1/Q4T6PYR6/BD699CYpFGWGhvJ2wrIWSGtgIRogc6cyRoy\nZAiffPIJAIsWLWLo0NqiyEOHDuXTTz8FYMuWLcTGxnqXJEOZoijceOVF3HZNF2wVNfz13f+xOvdY\noMMSIuTJjFcjSCsgIVq2qVOnsmHDBkpLSxk0aBBTpkxh0qRJPPzww3z88ce0bt2al19+GYCBAweS\nk5PDNddcg8lk4vnnnw9w9L41tHc6aUmRzPl0G28u28VBi43fDOsc6LCECFmKWt8GhQAKlinHc01/\n1rhrmJP7bw6VH+Tq9IHc2HGs36vSB8tUbLDEAcETS7DEAcEXizi/YPn3aoiC0kpe+TiPI4UOOqXH\n8dTEK3BVOwMdVpME0/dMY4R6/BD69yBLjX7m9rhZsKO2FdBlqb2bJekSQohgkBpv4snb+/CLzFT2\nHinjkb+vYuv+E4EOS4iQI4lXA6mqyke7P2DHiR21rYAym9YKSAghQo3RoOW+0d24ZXBHyh01/P3D\nXBYs/5HqGql0L0RDSeLVQEv2L+Z7y0ZvKyCdRrbHCSHCj6IojOibwUsPD6RNchTf/HCUGW9sZN/R\nskCHJkRIkMSrAXIOf8M3h7N92gpICCFCWYc2cTx1Vx+uu7wdBSWVPPfOZhau2iezX0KchyRe57HZ\n+j2f7/vU562AhBAi1Ol1Wm4d0onHf3MpSbERLF1/iGmvruf7XQX1FpYVQkji9bN2ntjBB7ve80sr\nICGEaCkubpfAMxP7ckP/DGwVNcz+dBsvfrCFo0WOQIcmRNCRxOscDpT81ApofI97fN4KSAghWhKj\nQcu4q
zvyzN19yeqYxM5DJcx4fSPvrthNqV36PQpxiuwQr4fVYeH1H2fj8ri5s9sEOsR1DHRIQggR\nEswJkTxyS0+27C3ig6/3kP2/I6zOO8bgS9sw8ooMYqMMgQ5RiICSxOsMqqryxvbXcHgc3Nzll3RP\n7hHokIQQIuT06pRM94sS+XbrcRZ/d5AV3x9m1ZajDO2dzrW/aCcJmAhbknidQVEUuidl0bl1Bheb\negY6HCGECFk6rYaBvdrQv3srVuce44t1B1m2Pp+vvj/CFd3MDO/TlvRUeWBJhBdJvOpxQ8cbQ76N\ngRBCBAu9TsPQ3ulcldWKNXnH+WrTYdbmHWdt3nG6ZiQw/Bdt6dEhCY1GilKLlk8SLyGEEM3CoNcy\ntHc6gy9rQ97eE6z4Pp+dh0rYeaiEhBgjV/ZI48oerTAnRAY6VCH8RhIvIYQQzUqjKPTqnEyvzsnk\nW22s/N9RNu608sV3h/jiu0N0To/jyh6tuKxLCtEmfaDDFcKnJPESQggRMO3MMdw1IpNfD+vM/34s\nZO3W4+w8VMKeI2UsWP4jmRkJ9L44hcs6p8iGfNEiSOIlhBAi4Ix6Lf26p9GvexpFpZVs2Gll04+F\nbD9QzPYDxSxY/iNd0uPp3iGRHh2SaJsajaLInjAReiTxEkIIEVSS401c36891/drT2FpJZt/LGTz\n7gJ2Hy7lx8OlfJyzn7goA90uSqRb+0S6tI0nKS4i0GEL0SCSeAkhhAhaKfEmruvbjuv6tqO8ooYd\nB4rZur+Y7QdO8N02C99tswCQFBtBl7bxdGkbR6c2cbRKipKnJEVQksRLCCFESIiNNHBFtzSu6JaG\nR1U5bLWzK7+E3YdL2X24lHXbLazbXpuIGQ1aMswxXNQqhotaxZJhjiEl3iTJmAg4SbyEEEKEHI2i\nkJEWQ0ZaDNde3g6PqnK8yMHuw6XsP1bOAYuNPScTslMMOg1tUqJIT4kmPSWazI7JRGoVEmKNaGS/\nmGgmkngJIYQIeRpFoU1KNG1Sohl8We2xymoX+VYbB47bOFxg50ihnXyrnQPHTxbHzt4D1CZk5sRI\n0hIjSU0wkRJvIjkugpR4E4mxRrQaTYDuSrREkngJIYRokUxGHRe3S+DidgneYy63B2txBYcL7diq\n3Ow7XILlRAWWkgoOF9jPuoZGUYiPMZAYE0FCjJHEWCMJMRHERxuIjzYSF20gPsqI0aBtzlsTIUwS\nLyGEEGFDp9V4Z8bqtobzqCqltmoKSyspLK2iqKyy9uOyKkrKq9h/rByPqp7zuka9lphIPTGRBmJP\n/hkdqSfapCcqQke0qfbjyAg9JqOWSKOeCKNWljjDUJMSry+//JJXXnmFffv2sXDhQrp16+Z9be7c\nuXz88cdotVqefPJJBgwY0ORghRAi0N58800WLlyIoih06dKF559/noKCAn73u99RVlZGt27dmDlz\nJjqd/F4bSjSKQmJsBImxEVzc7uzXPR6VMkcNxeVVlNiqKbVXU+aoqf3TXkN5RQ22CieHC2y43OdO\n0OpSqJ2VMxm1RBh0RBi1mAw6IgxajAYtEXodxpMfG/VaDHoNRl3tnwa9ltTyaioc1ei1GvR6DQad\nFr1Og16rQadTZIk0SDXpJ0OXLl145ZVXeOqpp047vm/fPpYtW8bSpUuxWCyMHz+eFStWSLE7IURI\ns1qtLFiwgGXLlmEwGHjkkUdYsmQJOTk5jB8/nhEjRjB9+nQWLlzIr371q0CHK3xIo1FIiDGSEGP8\n2fNUVaWy2o2togZ7pRN7pRNHlRN7pQt7pZPKahcVVa6TfzqpqHZRWe2m1F5NVbEbt6dhSVuDYlYU\ndDqlNhHTatBpFbTa2sRMq61NzHRaBa1GQafVoNHUfqzVatBplJ8+P/mxpu7HysmPFQVFo6BR8B73\n/qlw8jUFReHksZMfaxQURSHuuA1beVXtuYoCCt6PFeocAzh5TU4er32
p9nUFTv558vip1zh1fp33\neQ+eet+psahz/unnAURF6H3SwqpJiVeHDh2A2i+0urKzsxk5ciQ6nY709HQyMjLIy8ujZ8+eTRlO\nCCECzuPxUFlZiUajoaqqitTUVDZs2MDf/vY3AMaOHcu//vUvSbzClKIoREboiIzQYb7A96qqisvt\nobLaTZXTTU1N7Z/VNW6qalzUuDzUON3UOD3UuNzo9DpKyytxujzUuDw4T/7ncv/0sdPlweXx4HKr\nuFweqp1uHJVO3B4Vl1vF7fbgu1SvZdNqFGbe3/+8yff5+GUu3Gq10qtXL+/nZrMZq9Xqj6GEEKLZ\nmM1mxo8fz6BBgzCZTFx55ZVccsklxMbGojm5rJOWlkZBQUGAIxWhSFEU9Dotep2W2AacX3ePWlN4\nPLUJn9uj/vTfyc89qorn5LEz/1RVtc45tfvk1FOvnbyuR609z+OpTSxVTp6n1r4eHW2k3FaFqta+\n7lFVUH86R+Xk5M7JYwC1k4InX1dBpfb1k4dR67x28szTXj917Kfz8Z6sej88+XmdrDQm0kBsVDPM\neAlO6GAAAA3tSURBVI0fP56ioqKzjj/66KMMGTKk3vecOQMGyDKjECLklZeXk52dzTfffENMTAwP\nP/wwq1evPus8+XknQolGo2DQBOapTF8lj6HkvInXG2+8ccEXTUtL4/jx497PLRYLqampDXpvSkrM\nBY/nLxLL2YIlDgieWIIlDgiuWFqi7777jrZt2xIfHw/AsGHD+OGHHygvL8fj8aDRaEL2511jhHr8\nEPr3EOrxQ8u4hwvhs0ce6s5yDRkyhKVLl1JTU8Phw4fJz88nKyvLV0MJIURAtG7dmtzcXKqrq1FV\nlfXr19O5c2f69u3Ll19+CcCiRYsYOnRogCMVQgQrRa1vXbCBvv76a5555hlKSkqIjY0lMzOT1157\nDagtJ7Fw4UJ0Op2UkxBCtBivvPIKS5YsQafTcckll/Dss89isVj43e9+R3l5OV27dmXWrFno9U3f\nCyKEaHmalHgJIYQQQoiGk+pqQgghhBDNRBIvIYQQQohmIomXEEIIIUQzCcrEa8GCBVx33XWMGjWK\nF198MdDhMH/+fDIzMyktLQ3I+DNnzmTEiBGMHj2aKVOmYLfbmz2G1atXc91113Httdcyb968Zh8f\nasuS3HHHHYwcOZJRo0bx9ttvBySOujweD2PHjuW+++4LaBw2m42HHnqIESNGcP3115ObmxuQON58\n801uuOEGRo0axdSpU6mpqQlIHMEsGL6XLtS0adPo378/o0aN8h4rKytjwoQJXHvttUycOBGbLXhr\nMZ3rZ0co3UNNTQ233HILY8aMYdSoUbzyyisAHDlyhFtvvZVrr72W3/3ud7hcrgBH+vPO/JkZavEP\nGTKEG2+8kTFjxnDzzTcDjfg6UoPM+vXr1fHjx6tOp1NVVVU9ceJEQOM5fvy4OmHCBHXw4MFqSUlJ\nQGL49ttvVbfbraqqqs6aNUt98cUXm3V8t9utDhs2TD1y5IhaU1Oj3njjjerevXubNQZVVdWCggJ1\nx44dqqqqqt1uV4cPHx6QOOp644031KlTp6r33ntvQOP4v//7P3XhwoWqqqqq0+lUbTZbs8dgsVjU\nIUOGqNXV1aqqqurDDz+sLlq0qNnjCGbB8r10ob7//nt1x44d6v+3d/8xVdV/HMef9yI14pJTLxOS\nO0cymhk/osQSho1QF1p4p9DGam1JbFkhxKLLbWjT1A2Lrf5QdGZWI1km5lpuLSFx1gWGYdQcOknG\nj+KScSl+SVzu+f5B3K/gRcGvnHOv3/fjL+7h3HNeZ5zz4X3POfe8165d655WXFys7N+/X1EURdm3\nb5+ye/dureLd1GRjhy9tg6IoysDAgKIoiuJ0OpX09HTl3LlzyubNm5UTJ04oiqIoW7ZsUQ4fPqxl\nxJuaOGb6Wv7k5GSlp6dn3LTp7kd
ed8br8OHDvPTSS8yaNfps17lz52qaZ+fOnRQUFGiaYfny5e52\nJLGxsXR2dqq6/sbGRhYuXMiCBQvw9/dnzZo1VFZWqpoBIDg4mMWLFwMQGBjIokWLNG3N0tnZSXV1\nNenp6ZplAOjr66O+vp7169cDMGvWLAwGgyZZxvoYOp1Odx9D8V/ecixN16OPPsq9945vYlNZWYnZ\nbAZG+1OePHlSi2hT4mnssNvtPrUNAAEBAcDo2S+n04lOp6O2tpbVq1cDo9vw7bffahnxhjyNmTU1\nNT6TH/5ta+RyjZs23f3I6wqvlpYW6uvrycjI4Pnnn+fnn3/WLEtVVRWhoaE88MADmmWY6IsvviAp\nKUnVddrtdkJDQ92v58+fr3kvuvb2dpqamjR9MO9YUa51e5j29nbmzJlDYWEhZrOZoqIirl69qnqO\na/sYJiUlERQUxPLly1XP4c288Vi6Vd3d3RiNRmC0sHE4HBonmpqxsSMmJoY///zTp7bB5XKxbt06\nEhISSEhIwGQy+VSf0IljpsPhYPbs2T6TH0bbgW3cuJH169dz5MgRgGnvRzPSJPtmJuv/mJuby8jI\nCH///Teff/45jY2N5Obmzugnwhtl2bdvHwcPHnRPU2bwkWdT6Ym5d+9e/P39x91noYaZ3O5b0d/f\nT05ODlarlcDAQE0ynDp1CqPRyOLFi6mtrdUkwxin08n58+fZsmULUVFR7Nixg/3795OTk6Nqjol9\nDHNycvjqq69U31+9mbcdS/9vJo4dWn9omi69Xs+XX35JX18fr7zyCs3NzdfN463b5GnMVP5toH0t\nb80/pry8nODgYLq7u3nxxRcJDw+fdmZNCq8b9X8sLy9n1apVAERHR6PX63E4HMyZM0fVLBcvXqSj\no4O0tDQURcFut7sr3Hnz5qmWY8yxY8eorq7W5IbykJAQfvvtN/dru92u2SUkp9NJTk4OaWlppKSk\naJIB4Mcff6Sqqorq6mqGhobo7++noKCA4uJi1bOEhIQQEhJCVFQUAKtXr3Z3kFDTxD6GK1eupKGh\nQQqva3jTsfS/mjdvHleuXMFoNPLHH39oflvIzXgaO3xtG8YYDAaWLl3KTz/9dMt9QtXmaczcuXMn\nvb29PpF/THBwMDB6G1RKSgqNjY3T3o+87lJjSkoKNpsNgMuXL+N0Omes6LqRyMhIvv/+eyorK6mq\nqmL+/PkcO3ZsRoqumzl9+jQHDhxg79693HXXXaqvPyoqitbWVjo6Ovjnn3/4+uuvNetFZ7VaiYiI\n4IUXXtBk/WNef/11Tp06RWVlJSUlJSxbtkyTogvAaDQSGhrK5cuXgdF7JhYtWqR6Dk99DLXI4c28\n6ViarolnJpKTk6moqAB8oz+lp7HDl7ahu7vb/W25q1evYrPZiIiI8Jk+oZ7GzHfffddn8gMMDg7S\n398PwMDAAGfOnCEyMnLa+5HXtQwaHh7GarXS1NSEv78/FouF+Ph4rWPx5JNPcvToUfeneTWtWrWK\n4eFh97pjYmJ4++23Vc1w+vRpduzYgaIobNiwgezsbFXXD3D27Fmee+45IiMj0el06HQ68vLyVL/n\nbaK6ujoOHjxIaWmpZhmampp46623cDqdmEwmdu3aRVBQkOo5PPUxlJ6F43nDsTRd+fn51NbW0tPT\ng9Fo5LXXXiMlJYXNmzfz+++/c9999/H+++9fdwO+t5hs7IiOjiY3N9cntuHChQtYLBZcLhcul4vU\n1FRefvll2trafK5P6LVjpi/lb2tr49VXX0Wn0zEyMsLTTz9NdnY2PT0909qPvK7wEkIIIYS4U3nd\npUYhhBBCiDuVFF5CCCGEECqRwksIIYQQQiVSeAkhhBBCqEQKLyGEEEIIlUjhJYQQQgihEim8hBBC\n+KyMjAzMZjNr1qxhyZIlmM1mzGYzVqt12svKysoa11lgMoWFhZw7d+5W4k7L+fPn+eabb2Z8PUJd\
n8hwvIYQQPq+jo4MNGza4O594MtaaxlccOXIEm81GSUmJ1lHEbaRJr0YhhBBiptlsNoqLi4mMjOTC\nhQvk5+fT3d1NWVkZTqcTYFx3lBUrVnDo0CHCw8PJzMzk4YcfpqGhga6uLtauXUtubi4AmZmZbNq0\nicTERN544w0MBgPNzc3Y7Xbi4uLYtWsXAJ2dnRQUFOBwODCZTIyMjJCcnMyzzz47LueVK1fIz8/H\n4XAAkJiYSFZWFnv27GFgYACz2cyyZcuwWCw0NDRQUlLC4OAgADk5OSQlJdHa2kpmZiapqanYbDb0\nej1bt24lLi7O4/ILCgpm/g8gPJLCSwghxB3r4sWLbN++nYceegiAv/76i7S0NACam5vJysriu+++\n8/heu93OZ599Rm9vLykpKaSnp7NgwYLr5rt06RIfffQRLpeLtLQ06urqiI+PZ9u2bSQlJZGVlUV7\nezvPPPMMycnJ173/+PHjREREUFRUBEBvby9BQUFs2rSJmpoa3nvvPXf2bdu28eGHHzJ37lzsdjsZ\nGRmcOHECGC3gYmNjsVqt2Gw28vPzOXnypMflC+1I4SWEEOKOdf/997uLLoCWlhY++OADurq68PPz\no6uri56eHo99eJ966ikAgoKCCA8Pp7W11WPhtXLlSmbNGv13+uCDD9La2kp8fDy1tbW88847AISF\nhU3adzg2NpaysjICAgJYunQpiYmJHuc7e/Ys7e3tbNy40d203M/Pj7a2Nu655x4CAgJITU0F4PHH\nH8fPz4+WlpYpL1+oQwovIYQQd6zAwMBxr/Py8ti6dSsrVqzA5XIRHR3N0NCQx/fefffd7p/1ej0j\nIyPTmk+n000p4yOPPEJFRQU//PADR48e5cCBA3z66afXzacoCkuWLOHQoUPX/a61tfW6aS6XC51O\nN+XlC3X4zl2GQgghxA1M5btifX19hIWFAVBeXj5pMXU7xMfHU1FRAYze/F9XV+dxvvb2dgwGA6mp\nqVgsFn755RcADAbDuMuCcXFxXLp0ifr6eve0xsZG98+Dg4Puy441NTUALFy4cNLlC23IGS8hhBB3\nhKmcYbJarWRnZzN79myeeOIJgoKCPL5/4rIm+92N5isqKuLNN9/k+PHjhIWFERMTM259Y2w2G598\n8gl+fn4oisL27dsBSEhI4OOPP2bdunU89thjWCwW9uzZw+7du+nt7WV4eBiTyURpaSkARqORxsZG\nSktL0el0lJSU4OfnN+nyhTbkcRJCCCHEDBgaGsLf3x+9Xo/dbic9PZ2ysjJMJtNtX9fYtxrPnDlz\n25ctbi854yWEEELMgF9//ZXCwkIURcHlcpGXlzcjRZfwLXLGSwghhBBCJXJzvRBCCCGESqTwEkII\nIYRQiRReQgghhBAqkcJLCCGEEEIlUngJIYQQQqhECi8hhBBCCJX8B3vfi4PTacKnAAAAAElFTkSu\nQmCC\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0xc443950\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "#@test {\"output\": \"ignore\"}\n",
-        "\n",
-        "# Use the same input data and parameters as the examples above.\n",
-        "# We're going to build up a list of the errors over time as we train to display later.\n",
-        "losses = []\n",
-        "\n",
-        "with tf.Session() as sess:\n",
-        "    # Set up all the tensors.\n",
-        "    # The input is the x values with the bias appended on to each x.\n",
-        "    input = tf.constant(bias_with_x)\n",
-        "    # We're trying to find the best fit for the target y values.\n",
-        "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
-        "    # Let's set up the weights randomly\n",
-        "    weights = tf.Variable(tf.random_normal([2, 1], 0, 0.1))\n",
-        "\n",
-        "    tf.global_variables_initializer().run()\n",
-        "\n",
-        "    # learning_rate is the step size, so how much we jump from the current spot\n",
-        "    learning_rate = 0.002\n",
-        "\n",
-        "    # The operations in the operation graph.\n",
-        "    # Compute the predicted y values given our current weights\n",
-        "    yhat = tf.matmul(input, weights)\n",
-        "    # How much does this differ from the actual y?\n",
-        "    yerror = tf.subtract(yhat, target)\n",
-        "    # Change the weights by subtracting derivative with respect to that weight\n",
-        "    loss = 0.5 * tf.reduce_sum(tf.multiply(yerror, yerror))\n",
-        "    gradient = tf.reduce_sum(tf.transpose(tf.multiply(input, yerror)), 1, keep_dims=True)\n",
-        "    update_weights = tf.assign_sub(weights, learning_rate * gradient)\n",
-        "    \n",
-        "    # Repeatedly run the operation graph over the training data and weights.\n",
-        "    for _ in range(training_steps):\n",
-        "        sess.run(update_weights)\n",
-        "    \n",
-        "        # Here, we're keeping a history of the losses to plot later\n",
-        "        # so we can see the change in loss as training progresses.\n",
-        "        losses.append(loss.eval())\n",
-        "\n",
-        "    # Training is done, compute final values for the graph.\n",
-        "    betas = weights.eval()\n",
-        "    yhat = yhat.eval()\n",
-        "\n",
-        "# Show the results.\n",
-        "fig, (ax1, ax2) = plt.subplots(1, 2)\n",
-        "plt.subplots_adjust(wspace=.3)\n",
-        "fig.set_size_inches(10, 4)\n",
-        "ax1.scatter(x, y, alpha=.7)\n",
-        "ax1.scatter(x, np.transpose(yhat)[0], c=\"g\", alpha=.6)\n",
-        "line_x_range = (-4, 6)\n",
-        "ax1.plot(line_x_range, [betas[0] + a * betas[1] for a in line_x_range], \"g\", alpha=0.6)\n",
-        "ax2.plot(range(0, training_steps), losses)\n",
-        "ax2.set_ylabel(\"Loss\")\n",
-        "ax2.set_xlabel(\"Training steps\")\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "TzIETgHwTexL"
-      },
-      "source": [
-        "This code looks very similar to the code above, but without using `l2_loss` or `GradientDescentOptimizer`. Let's look at exactly what it is doing instead.\n",
-        "\n",
-        "This code is the key difference:\n",
-        "\n",
-        "\u003e`loss = 0.5 * tf.reduce_sum(tf.multiply(yerror, yerror))`\n",
-        "\n",
-        "\u003e`gradient = tf.reduce_sum(tf.transpose(tf.multiply(input, yerror)), 1, keep_dims=True)`\n",
-        "\n",
-        "\u003e`update_weights = tf.assign_sub(weights, learning_rate * gradient)`\n",
-        "\n",
-        "The first line calculates the L2 loss manually. It's the same as `l2_loss(yerror)`, which is half of the sum of the squared error, so $\\frac{1}{2} \\sum (\\hat{y} - y)^2$. With this code, you can see exactly what the `l2_loss` operation does. It's the total of all the squared differences between the target and our estimates. And minimizing the L2 loss will minimize how much our estimates of $y$ differ from the true values of $y$.\n",
-        "\n",
-        "The second line calculates $\\begin{bmatrix}\\sum{(\\hat{y} - y)*1} \\\\ \\sum{(\\hat{y} - y)*x_i}\\end{bmatrix}$. What is that? It's the partial derivatives of the L2 loss with respect to $w_1$ and $w_2$, the same thing as what `gradients(loss, weights)` does in the earlier code. Not sure about that? Let's look at it in more detail. The gradient calculation is going to get the partial derivatives of loss with respect to each of the weights so we can change those weights in the direction that will reduce the loss. L2 loss is $\\frac{1}{2} \\sum (\\hat{y} - y)^2$, where $\\hat{y} = w_2 x + w_1$. So, using the chain rule and substituting in for $\\hat{y}$ in the derivative, $\\frac{\\partial}{\\partial w_2} = \\sum{(\\hat{y} - y)\\, *x_i}$ and $\\frac{\\partial}{\\partial w_1} = \\sum{(\\hat{y} - y)\\, *1}$. `GradientDescentOptimizer` does these calculations automatically for you based on the graph structure.\n",
-        "\n",
-        "The third line is equivalent to `weights -= learning_rate * gradient`, so it subtracts a constant the gradient after scaling by the learning rate (to avoid jumping too far each time, which risks moving in the wrong direction). It's also the same thing that `GradientDescentOptimizer(learning_rate).minimize(loss)` does in the earlier code. Gradient descent updates its first parameter based on the values in the second after scaling by the third, so it's equivalent to the `assign_sub(weights, learning_rate * gradient)`.\n",
-        "\n",
-        "Hopefully, this other code gives you a better understanding of what the operations we used previously are actually doing. In practice, you'll want to use those high level operators most of the time rather than calculating things yourself. For this toy example and simple network, it's not too bad to compute and apply the gradients yourself from scratch, but things get more complicated with larger networks."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "cellView": null,
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          },
-          "output_extras": []
-        },
-        "colab_type": "code",
-        "executionInfo": {
-          "elapsed": 164,
-          "status": "ok",
-          "timestamp": 1474671842705,
-          "user": {
-            "displayName": "",
-            "photoUrl": "",
-            "userId": ""
-          },
-          "user_tz": 420
-        },
-        "id": "ty5-b_nYSYWR",
-        "outputId": "311b7bff-5c8b-43ee-da0f-439a879636d1"
-      },
-      "outputs": [],
-      "source": [
-        ""
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "default_view": {},
-      "name": "Untitled",
-      "provenance": [],
-      "version": "0.3.2",
-      "views": {}
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
deleted file mode 100644
index 824fe14..0000000
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ /dev/null
@@ -1,2053 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "9yupXUk1DKOe"
-   },
-   "source": [
-    "# MNIST from scratch\n",
-    "\n",
-    "This notebook walks through an example of training a TensorFlow model to do digit classification using the [MNIST data set](http://yann.lecun.com/exdb/mnist/). MNIST is a labeled set of images of handwritten digits.\n",
-    "\n",
-    "An example follows."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:20.863031",
-     "start_time": "2016-09-16T14:49:20.818734"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "id": "sbUKaF8_uDI_",
-    "outputId": "67a51332-3aea-4c29-8c3d-4752db08ccb3"
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: DeprecationWarning: decodestring() is a deprecated alias, use decodebytes()\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAMYAAABFCAYAAAARv5krAAAYl0lEQVR4Ae3dV4wc1bYG4D3YYJuc\nc8455yCSSIYrBAi4EjriAZHECyAk3rAID1gCIXGRgIvASIQr8UTmgDA5imByPpicTcYGY+yrbx+t\nOUWpu2e6u7qnZ7qXVFPVVbv2Xutfce+q7hlasmTJktSAXrnn8vR/3/xXmnnadg1aTfxL3/7rwfSP\nmT+kf/7vf098YRtK+FnaZaf/SS++OjNNathufF9caiT2v/xxqbTGki/SXyM1nODXv/r8+7Tb+r+l\nnxZNcEFHEG/e3LnpoINXSh/PWzxCy/F9eWjOnDlLrr/++jR16tQakgylqdOWTZOGFqX5C/5IjXNL\njdt7/NTvv/+eTjnllLT//vunr776Kl100UVpueWWq8n10lOmpSmTU5o/f0Fa3DDH1ry9p0/++eef\naZ999slYYPS0005LK664Yk2eJ02ekqZNnZx+XzA/LfprYgGxePHitOqqq6YZM2akyfPmzUvXXXdd\nHceoic2EOckxDj300CzPggUL0g033NC3OKy00krDer3pppv6FgcBIjvGUkv9u5paZZVVhoHpl4Mv\nv/wyhfxDQ0NZ7H7EQbacPHny39Tejzj88ccfacqUKRmHEecYf0Nr8GGAQJ8gMHCMPlH0QMzmEBg4\nRnN4DVr3CQIDx+gTRQ/EbA6BgWM0h9egdZ8g8PeliD4RutfF/Ouvfz9OtZy8aNGiNH/+/GGWl112\n2XzseYuVNKtqsaI23Ghw0DYCA8doG8JqO+AUG2+8cVq4cGHaY4890vLLL5/WXXfdfI6jvPDCC3lJ\n8amnnkoezP3000/pl19+GThHtWpIPekYomTxFS7HnkqKjMsss0yGgFE4r62tSBFVJ02aNPyconi9\nV4/JwzHwT9ZNNtkkeZ6w5ZZbph133DH99ttv6ccff8zXX3nllcRRnHNfv2cNGMQWGRaOrWbUrjsG\nBRLAA6U4Lhoqw9h2223ztRBq6aWXzsbgvueffz4Lu9NOO2UnYTgrr7xy7tO9nOH111/Pbb744ov0\nww8/jAvngAdFMvQDDjggG/0GG2yQX1GZNm1aziCCwzrrrJPl3muvvXKwePnll9M333wzHDCKWPbL\nMbuAkfISjnvvvXcW/emnn85lqCBqa4a65hiYR/Gk2RNGRlwm3n7ggQfmdrKD9sqJtdZaKxvCnDlz\n8n3Tp09PXmPYeuutc0SVNQjvnmuvvTa3efzxx9N33303PGZ5rF75DBvvqq233nrp22+/TWeddVby\nikpgxCE4vQDhlQUBRfDw2esbs2fPTquvvnqviNN1PuIdJ4GErVx44YUZowsuuCB9+umn6eeff84B\nspmsWqljhPFDxjGGYx/lDkN33udajCoVlAjRzl4U8LjefRwnPjsXG8OJqKBd8NB1LTU5IHyCd7LJ\nGOYXNoGjFqaGIKtrERDIDKtukfGMH/zRZa1A101+YBF44KfMYzO8VOYYjDWiukiGqc022yyXOUqd\nzTffPJ/z1ialeqNVxA9gi0wzlOJ5juJlR8JeddVV+ZrIKTq4ZvJp/8EHH+SU+txzz+W2SqmxVFZR\nplrH5DTRXmGFFdKuu+6azjjjjOzosl5g6D54CQCI4mGjhNQO5occckh2LvLTA6fqJOEnyhU6kNlk\nZmUuvrtNcFx77bUzhsZWXgoSsm6t4Dsa/tp2DErCmA04HAI4FLjaaqtlBhmnSKiNY4rDtHZFB6jF\nMMH0RVDH+nCPYxtDCFJnKkniRbDitWjTK3sykQUuMLPn3DZGX8SFnCG/fVyz5zCCBtIHTLshdzif\n8fERn8cKXxjCNOwCTu3Qf6yqhV4AQokiP489//zzM0DxnQYKwqAtIkko1kQzFFxvaNcJ6u3Pe+65\nJ/cRRvDee+9lA2BInIyRff/997nNO
++8k7t0vl2A6vHWynmyiPJ43WKLLbIijz/++LTddtvlTCdz\nwIWSg9yjxBJ0GN/DDz+c7zv77LOzbEceeWSekwVGgsOsWbNyNo0+qt7DfPvtt8/dmtvIGnPnzk3P\nPPPMsJ6rHrNef/BBeJA90RprrJEDcNhctMkXR/mnbccwuCjNGTbaaKMc8TBZprITxOdgOvbuKxqG\nz6LSJ598kseJ9Gi1CYmSv/76a3YyJZWMZJ6Ceskp8EMusihFEAyUmVaa8G2rxTNHIrd733///eH7\nYeaLNe5xrEzlWNF/HqQDf0Tm+GIbvYdD43MsKAIo/JDgE0G5aFfN8NaWYxiUshikqGYTTUSt0TCk\njXsYNqJQQso+rgGa0vX58ccf56hQTtk+48F92rmvlnE1A0on2uKP0Yrw+Nxzzz0zn+ZhjKwRXq6v\nueaa2TmUiRQfS7SyNeMks9IV9vrvJOl/q622yo4Mfw5Pvm6TMclLdit6shh+YAMnq1E29tEsteUY\nBgMSgxa5MOAzJZcVXQs4bUR8XxhCHIwzMALCBuCcx5q0tF3u133l8XrRMchFiRYNyMxBKM/5IjZl\nWVzjULKwACISytIWFsi56aab5mvOKyEikmdAO/iHY+BDCRUZuoPD1e1akECyLseA7d13352DhdKa\nk8Cmlt3U7TSl9p58FwejYK8ncAwKpDTnGDcARbWiAUjHiNEHsITSPlagpEZChcfrZzwSOfBOiQwX\nLuR3PjAhtwAD08iAMCO/a+5xPTIm3ALjwERf0V+c69QeT7ZujVdLDhgKBrANXAMreMESRkU7rdVP\nrXNtZ4xIpSLH1VdfnR3j4IMPzkbw2Wefpa+//jovo5188slZsZjArAcvFP3YY4+lSy+9NEdTdTTy\n0I5xHHfccfm1CH2LtuORKEqmkwVlVU+sBY+IdJRmE0zeeOONnEXuu+++7AhnnnlmWn/99XMJ5brt\nzTffzHMJx/o555xzkgdb0U8rRtAKrnTYqtG1Ml6teyxInHDCCdlGYByBmG2Z97ChVvFo2zEwbHCR\nTbqP7EDxPjN2pUBEe86AXAcsg+f10TYMSTvnRM1ulQe1wG/nHEXZZEJZUIYQ5cgWMsEgMgqclFdk\ndh+MbFFyuddnWMLNfTYkcuuXHlBkpFYNI3dS+mMMfCHHsZWadfUjmQVn8iLywscG21apMscQwR55\n5JEM3KuvvpoZ5LHOmzgjAvBwzFt2/Oijj3Lm4Ayin/MU/eGHH+b2N998c/5MGSaZ44nw7OEd5Rx7\n7LE5+1EehYXxkpes5li2K6+8Mhv8Lrvsko381ltvzcEBfvHQKh5auk9GPvHEE3NJAx+/eKL/HXbY\nIQcbK3nwN067xAk4s5VHdbvsx0nxrYQeKxJMZAfBA7GlRx99NC9EtCN7JY4RoPBeAHIAyrB3jpHY\nwqu1d02d7HpZcfqINo5dL7eJMXtxTzk2sgWFM/gcsnCakI2cFOk+523O+Qw7WaeYHYpYRp9xn4Bk\nbPdWSfgJXYYM+ne+2xRj2sdx8EDu8rm4Ntp9pY4RSmb0CIPOAVNGoLA47yU4S2xen37ppZdy9CkL\nE/3lm8bJHzJbbiavt2Q9p7AkK7oyXAZOLk7gs9c4PJC0AOE8DDyrgJkaWgYQkSPYuAdpWySfteU8\nHhqKouYq+io6ZfGeZo7xpbT1+jt+jGULfprpq922ePHMBibwjWVq523KVrzBsIzTaMeu1DFi0HI0\nYyyYtAekY5MltbRyihFJiROBKIYTwMCTWJNubwdQFCXFapK9z96mtbjgs3thFKWnUgjBzNZIya5F\nOyUcPG36q4LwRgZ6Ix8HtBk3tirGGU0feAkslHfk5PzBh2cXSkvtWqWOOEaRGcoSHdXDMoYn1tK8\nyaON0ahbCWgFS/vxSnjn5F4ItLeiFAGAzCKc7MDA1OlIjc4pLFKE7FEyxb5ZPNTbtuiv2fvrtddf\nO
FsYXcwj8d8qv/XGq3femLvvvnvOvrIYPPEjG+PDseDbDnXcMXiyiGiyyACOPvrovN95552zV3/+\n+ef5zVveznlEo6CICvG5l/d4JSvHP+qoo7JjKDs4PkVSGPm9HSz9W5rlPEoCQYHjVFXyRGnBOcKA\n28VOP/qTBWX6YnS2IKB8qYL/enyGHPbKziOOOCLj6sGeslGW8L6Y4ANr2MY99fpsdL7jjmFwkSTS\nr6gDVCk+tmDQedcJ5LgdwaLPbu7xjJRRNlErSsiQhVHJlOEQoh182o1wRTnharwYs3itnWP9Rd/R\nD5mLW5yveh/YRhYMjItyBh/wjPat8tEVx6B00RKo5513XpIl7rzzzuwEourMmTOz95uIcyBfTSXY\niy++mCOrSFS1klsFrNZ9eGPoJtmeyRx00EE5cpGbIi21XnbZZbkMee2117KMHIKMIVcotVb/vXoO\nz6I0+URoMlVFcBFE7L1+IjNYIo6v/fo+D3tC+FCR+FHuwNUCgfOtUlccI5hnJMoIBhN1sBICqMoN\nNaLP3pkiFGciIIBC4HaEbRWk0dyHb3Mp/EY0I6+NsytvyKxsKhpQr8ozGpm1IZ8IbV+PyllGuyh1\nYBXXOQEcy6R8M5eAHzuxxX3GRvbaCKJ4aRfXrjkG5jEbk00Prxi8SZTJKmc5/PDDc5v99tsvC+hB\njWtqStmD0F4Ma1foMvDtfqZMUc3/lYjMSFFW3NS7JtyyoKzSiTocHoFJHMc+MlK7Mta7n9NbATJe\nrbEYvQWIWCVitIyaXrV3nsG7H2Y2GVcbxyj6NX+waKEPmOvbfShwtjhQDDz5Ygt/uuoY+OPtnICD\nEMBTWsAQUu0NBBsDEgFEWOADAiDaVRERWsCq5i34IRN+TbTJgn8KwzOFuR4KDUXW7Kyik53Ep8w/\n+RkxWeO5S1EM5wVABguXMGp69dk1x87D0ObdL32GHI5tsDQGHtwbm/Hw4TpnKvNY5Ge0x113DEwT\n3tIsIdSnDIfxcxJAevCHfE9cXcmotHXfAw88kIFUdgFjLMn4HuZRuh9FExmjRCCnZxRqcPxz8ioU\nVk9eRhJkPAYHV8ZVFRkjjFSfAtw222yTy2OZ0iv15fHcQ4dKaMcwsBdEEL26RzaIh5+yK7LSBGPn\no8yOZX+vzRhfXzZ8cRrtyzzkzpr803XHwB8wTJYIRol+VY8zqMMBbP0f+cExE1qTdbU7x3jwwQdz\nVBYdesExKNiEWx2MfwoOAyCbJ9uRHZvUTcPmsENhGNE4HBKOHKNqZzQu3KNfX9H1nRABQZlbNkpt\n4SNo4DWIIesDj9qYnwki2giWqol3330348kZLPm7xvi1Pffcc7MzhA3gy/0oeIuxWtmPiWNgNCIF\nYwcCAa2FA1ikJZz1aeUVsBmge9TyoqGoIqKUFdEKCFXcU0/pHJizVMUnXBiBh6IicdTTzsEOnuZk\nDE/2rcJI4KMf/TF+0TucwDhkZ+DGL4/nGkPGV/AIC+2RvfP6ZPTI4gu5XNM/Um7RPzuIFyn1zW7w\npQ9UHj+fbOHPmDlGCOGBGIeQQfwuq0jnISBQfOHft7JEHN94Q5xF6XLFFVfkyKIEGyuiGAo3r6BI\nx0imcM6k+6GHHspOEQbcDq+UTl4BwRu7PstUiPEJFsa9/PLL83nXg6d2xnUvoxS5L7744uGyh/wy\nRpRF9YwSHsHjE088kWWADQeRFThZkTgBstensZG5h4m56oEdcAp9CwTOVUlj6hgECcGBpA6XDaze\niLKhVABQAhKB3cNxbEAL4KoEppm+gjf3OMafDf+UW7zeTL/ltqIiAxBMOIIxnLOHgbFsMGQ4InhE\n0nJfrXw2hnIRD3SFBKmYWDfqE49woFvOzZno3NxM0HDciMjBDsjEBgLTsJHYN+qjmWtj7hjBLKFF\nQgL7qRz14jHHHJPBcC2M3wRPVDT5ohzZRv0Z16O/sdozAKmdopU
H5kftTrzJpl+lk29CcgpLw3Bg\npMbwwqF/S80pGJ6xO0WM+8Ybbxw2TuOEoTYakwyovB/JKdzDMVQOHvCRzXju890fL11aGhcMqqIx\ndwwCRkYQDZAaE7lWBhyosQEmQM439MgffDHm0Si8EcuBC0ezcQSZVKYktzFEW+3sfQ4natRvu9eM\nTS9F7IvHo+m/2fb6LNuCc0WsW+mzHq9j6hgE9YCHp5tkez2EAVjlMOmyUlU2Lis8ygVR0rykyolt\nPZCaOY9fr32Qp50X6xi7pWCGbsHBvwLgGIcddljGxvcsjOU1GseyiKjJQWydpiqNsBlei85BfhNx\neJunVCl31x0jBOMAjJ9jRC3OEERDS7QMI0qQohIYgLSq7FJuMZbi9WZA7kRbvFAWx5Dyy449mjED\nG/dyDPW4VSiy2iNvBcCSUdxyyy35OYHrqJUx843j8I/qQpA074BVVdR1x+AIHCIiIGewsqIuds41\ntSSlOxeOFHuOQ/E+2zPEuFYVKM32U3RMvGy44YbZMTg2B2+GOIXXJcjpR9lkUy/QyZ7GUU8zAD9R\nCiuR0oQYVv1IMAk7qFL+rjkGg7GZQPLufffdN69QKJtkCAKKjNGu1p7gMgWDYEDRpkpAmu0rnMLe\nhie/RavcI49Sr1ZW0w6V91ac/IsxmdHPB0U5pQ+4+TExDudNUhPufnaKIn7N6m2k9h11jKLRqP+U\nQJb2eHh4uYjK0LW1D0MpCq0NR4g24RTR/0hCdvM6/m14FtljeTL4D/liedFeO7LYcyh7eMGDY8X1\n6IM8Vp9kWjj2GwWG5IZb2FKVOHTMMTCvDKBgD2Z22223bNynnnpqVrZXBFxjQDZUFJiwIqKHN8qH\nO+64IxvN/fffn9vG/VWC0UpfeC5uZMEbg/ctM/8SzYOxZ599Nhs4ebSx0ECpcDFvMCdRggkesoQ+\nzaHU0N4EgAEnue2227JTON+LgaEVDFu5h+w2Wdl33GFkEUIQqYIqdYwwbJGO8q2xOydqUiTFWpJV\nPzsuUwhlzzFETxlGdFSCqaMB4XwvUzgKWU3AyW4uwFns4QMbilUyxbq8p/4cw3UEB8FDGQUDx/ac\nqB8zRS2dw5qthe3VatPKucocg6JiYu3lP2nfawvekKVITzgJQLH24QTBtPZeE2D89957b27jwZ1I\nwIm8R2OMWHmJ+3pxTzaK8l+HyMrgTzrppMxqOIEsGoZvz0nsyWiliRMUl2G9aOk6POyLZVUvYtBp\nniL4wA1m9lVSW46BOQqKpTLK9FnUsxftvW4swssa4dkhCGFCMNfcp08lhM9KKc4h0obgsa8ShHb6\nCv5DJnu8IwHB9TB852DkOlzIRV6kXbSVMfQj48BWdhE0TLr1Fe3zQR/+gRMK5yjuq4KjZccQ2SlY\njexHmCnSkiLjtsesmlnpQ5naFo1A5GMAHoJxBI709ttv54ygntZWmWEcQMS9VQleRT9kNmfAG0P3\nHRPGbHnVudg4gEyJOAYiE0wikHAAcxHyxndO4KI/WHEK/Qzo7wjAXfaFNdurikaNtIERRTqmYIYd\nE2tGEs8hfJ8iFB/3xV67MCjG8NZbb6Unn3wyC+XfDxfnDxFp496qhK6qn5CDA5twK/fIRH5Gb0MM\nOhxCFgkKjOBoHqKEkmWvueaanG04iTHcP3CKQO0/e3ZhgceP2smqcKyKRuUYlEKhPDL+d5z1c4qV\nFTDnmBIZMwZ9DiKAzTmvCetPNFR7W7fXXt/KLddqTcyjr17bRybkEF5XiQhPHnMuDlF07MCB3I49\nl4EDxTrnfsFBJBxQbQSKeGoROqjdurWzIzoGJqRxS2KUf/rpp2flcRDRjRKVCdpFhCwz7rOVKE5z\n++235/7uuuuuXDq5P5yKEY0np8B3TKb9K1/vLTF0/7MiJtyRPYrq4fx+7R2e7vFDDzDyfx1goPwc\nUGMEYG/rFI3oGAYW0UUyimQ
IcRwGzbgpVsZAUTYE065xCtc5GUeSHTyg4kzKs/FKoSBljyhvTz6y\n2gseZAwlwgI+cNBGtpV9ZRj4BobjFY9O8g0bQcXWaRpxBE5hHuFnJ0XB6dOn56ge2QGDlK2dFSSG\n4b8kxVzEdSWGVxgYQLzrxJkIGgbTaUE73b9MZ/KNfIMOJpdcckndYZWmFAwv+wgydW/o8wsCK3xn\nz56dFzx8oxPGtk7QiI5h0FBaeGzRKYIpjDN2ig6lB9OiprmI60qNieIMIXvsQy7yotjH9eI+2hbP\nDY4bI8D+2JdnWTYY+iwDs78qaUTHEM0sI1pClAVMnqX9ImGQszB6DHoNOLzZNZlGRlEq9JNB9JOs\nRXvoxDGnsDTudwFUHTNmzMjDqEaU9xYvGgWiZnka0TEo16CeNyCM1SLtwmt5cNEoCOUa5xjQAIFW\nEGBP5rbKdTRr1qwcfGUMthXVTCt917pnRMdwE6ZiQm0JckADBMYCgWLwtXjTSeq/d5Y7ieag7wmD\nwMAxJowqB4JUicDAMapEc9DXhEFgcjxcM7vvR4on7bHS1q84WNkpUr/iEL+aOLRw4cIlQCmuIhUB\nmsjHlpQ9c7EmzjEsN1vd6DeCg8UVT+qRd7b6EQey8wMT+6El8RSu36xhIO8AgQYI9F94bADG4NIA\ngUDg/wHX+3lgThDIegAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<IPython.core.display.Image object>"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from __future__ import print_function\n",
-    "\n",
-    "from IPython.display import Image\n",
-    "import base64\n",
-    "Image(data=base64.decodestring(\"iVBORw0KGgoAAAANSUhEUgAAAMYAAABFCAYAAAARv5krAAAYl0lEQVR4Ae3dV4wc1bYG4D3YYJucc8455yCSSIYrBAi4EjriAZHECyAk3rAID1gCIXGRgIvASIQr8UTmgDA5imByPpicTcYGY+yrbx+tOUWpu2e6u7qnZ7qXVFPVVbv2Xutfce+q7hlasmTJktSAXrnn8vR/3/xXmnnadg1aTfxL3/7rwfSPmT+kf/7vf098YRtK+FnaZaf/SS++OjNNathufF9caiT2v/xxqbTGki/SXyM1nODXv/r8+7Tb+r+lnxZNcEFHEG/e3LnpoINXSh/PWzxCy/F9eWjOnDlLrr/++jR16tQakgylqdOWTZOGFqX5C/5IjXNLjdt7/NTvv/+eTjnllLT//vunr776Kl100UVpueWWq8n10lOmpSmTU5o/f0Fa3DDH1ry9p0/++eefaZ999slYYPS0005LK664Yk2eJ02ekqZNnZx+XzA/LfprYgGxePHitOqqq6YZM2akyfPmzUvXXXddHceoic2EOckxDj300CzPggUL0g033NC3OKy00krDer3pppv6FgcBIjvGUkv9u5paZZVVhoHpl4Mvv/wyhfxDQ0NZ7H7EQbacPHny39Tejzj88ccfacqUKRmHEecYf0Nr8GGAQJ8gMHCMPlH0QMzmEBg4RnN4DVr3CQIDx+gTRQ/EbA6BgWM0h9egdZ8g8PeliD4RutfF/Ouvfz9OtZy8aNGiNH/+/GGWl1122XzseYuVNKtqsaI23Ghw0DYCA8doG8JqO+AUG2+8cVq4cGHaY4890vLLL5/WXXfdfI6jvPDCC3lJ8amnnkoezP3000/pl19+GThHtWpIPekYomTxFS7HnkqKjMsss0yGgFE4r62tSBFVJ02aNPyconi9V4/JwzHwT9ZNNtkkeZ6w5ZZbph133DH99ttv6ccff8zXX3nllcRRnHNfv2cNGMQWGRaOrWbUrjsGBRLAA6U4Lhoqw9h2223ztRBq6aWXzsbgvueffz4Lu9NOO2UnYTgrr7xy7tO9nOH111/Pbb744ov0ww8/jAvngAdFMvQDDjggG/0GG2yQX1GZNm1aziCCwzrrrJPl3muvvXKwePnll9M333wzHDCKWPbLMbuAkfISjnvvvXcW/emnn85lqCBqa4a65hiYR/Gk2RNGRlwm3n7ggQfmdrKD9sqJtdZaKxvCnDlz8n3Tp09PXmPYeuutc0SVNQjvnmuvvTa3efzxx9N33303PGZ5rF75DBvvqq233nrp22+/TWeddVbyikpgxCE4vQDhlQUBRfDw2esbs2fPTquvvnqviNN1PuIdJ4GErVx44YUZowsuuCB9+umn6eeff84BspmsWqljhPFDxjGGYx/lDkN33udajCoVlAjRzl4U8LjefRwnPjsXG8OJqKBd8NB1LTU5IHyCd7LJGOYXNoGjFqaGIKtrERDIDKtukfGMH/zRZa1A101+YBF44KfMYzO8VOYYjDWiukiGqc022yyXOUqdzTffPJ/z1ialeqNVxA9gi0wzlOJ5juJlR8JeddVV+ZrIKTq4ZvJp/8EHH+SU+txzz+W2SqmxVFZRplrH5DTRXmGFFdKuu+6azjjjjOzosl5g6D54CQCI4mGjhNQO5occckh2LvLTA6fqJOEnyhU6kNlkZmUuvrtNcFx77bUzhsZWXgoSsm6t4Dsa/tp2DErCmA04HAI4FLjaaqtlBhmnSKiNY4rDtHZFB6jFMMH0RVDH+nCPYxtDCFJnKkniRbDitWjTK3sykQUuMLPn3DZGX8SFnCG/fVyz5zCCBtIHTLshdzif8fERn8cKXxjCNOwCTu3Qf6yqhV4AQokiP489//zzM0DxnQYKwqAtIkko1kQzFFxvaNcJ6u3Pe+65J/cRRvDee+9lA2BInIyRff/997nNO++8k7t0vl2A6vHWynmyiPJ43WKLLbIij
z/++LTddtvlTCdzwIWSg9yjxBJ0GN/DDz+c7zv77LOzbEceeWSekwVGgsOsWbNyNo0+qt7DfPvtt8/dmtvIGnPnzk3PPPPMsJ6rHrNef/BBeJA90RprrJEDcNhctMkXR/mnbccwuCjNGTbaaKMc8TBZprITxOdgOvbuKxqGz6LSJ598kseJ9Gi1CYmSv/76a3YyJZWMZJ6Ceskp8EMusihFEAyUmVaa8G2rxTNHIrd733///eH7YeaLNe5xrEzlWNF/HqQDf0Tm+GIbvYdD43MsKAIo/JDgE0G5aFfN8NaWYxiUshikqGYTTUSt0TCkjXsYNqJQQso+rgGa0vX58ccf56hQTtk+48F92rmvlnE1A0on2uKP0Yrw+Nxzzz0zn+ZhjKwRXq6vueaa2TmUiRQfS7SyNeMks9IV9vrvJOl/q622yo4Mfw5Pvm6TMclLdit6shh+YAMnq1E29tEsteUYBgMSgxa5MOAzJZcVXQs4bUR8XxhCHIwzMALCBuCcx5q0tF3u133l8XrRMchFiRYNyMxBKM/5IjZlWVzjULKwACISytIWFsi56aab5mvOKyEikmdAO/iHY+BDCRUZuoPD1e1akECyLseA7d13352DhdKak8Cmlt3U7TSl9p58FwejYK8ncAwKpDTnGDcARbWiAUjHiNEHsITSPlagpEZChcfrZzwSOfBOiQwXLuR3PjAhtwAD08iAMCO/a+5xPTIm3ALjwERf0V+c69QeT7ZujVdLDhgKBrANXAMreMESRkU7rdVPrXNtZ4xIpSLH1VdfnR3j4IMPzkbw2Wefpa+//jovo5188slZsZjArAcvFP3YY4+lSy+9NEdTdTTy0I5xHHfccfm1CH2LtuORKEqmkwVlVU+sBY+IdJRmE0zeeOONnEXuu+++7AhnnnlmWn/99XMJ5brtzTffzHMJx/o555xzkgdb0U8rRtAKrnTYqtG1Ml6teyxInHDCCdlGYByBmG2Z97ChVvFo2zEwbHCRTbqP7EDxPjN2pUBEe86AXAcsg+f10TYMSTvnRM1ulQe1wG/nHEXZZEJZUIYQ5cgWMsEgMgqclFdkdh+MbFFyuddnWMLNfTYkcuuXHlBkpFYNI3dS+mMMfCHHsZWadfUjmQVn8iLywscG21apMscQwR555JEM3KuvvpoZ5LHOmzgjAvBwzFt2/Oijj3Lm4Ayin/MU/eGHH+b2N998c/5MGSaZ44nw7OEd5Rx77LE5+1EehYXxkpes5li2K6+8Mhv8Lrvsko381ltvzcEBfvHQKh5auk9GPvHEE3NJAx+/eKL/HXbYIQcbK3nwN067xAk4s5VHdbvsx0nxrYQeKxJMZAfBA7GlRx99NC9EtCN7JY4RoPBeAHIAyrB3jpHYwqu1d02d7HpZcfqINo5dL7eJMXtxTzk2sgWFM/gcsnCakI2cFOk+523O+Qw7WaeYHYpYRp9xn4BkbPdWSfgJXYYM+ne+2xRj2sdx8EDu8rm4Ntp9pY4RSmb0CIPOAVNGoLA47yU4S2xen37ppZdy9CkLE/3lm8bJHzJbbiavt2Q9p7AkK7oyXAZOLk7gs9c4PJC0AOE8DDyrgJkaWgYQkSPYuAdpWySfteU8HhqKouYq+io6ZfGeZo7xpbT1+jt+jGULfprpq922ePHMBibwjWVq523KVrzBsIzTaMeu1DFi0HI0YyyYtAekY5MltbRyihFJiROBKIYTwMCTWJNubwdQFCXFapK9z96mtbjgs3thFKWnUgjBzNZIya5FOyUcPG36q4LwRgZ6Ix8HtBk3tirGGU0feAkslHfk5PzBh2cXSkvtWqWOOEaRGcoSHdXDMoYn1tK8yaON0ahbCWgFS/vxSnjn5F4ItLeiFAGAzCKc7MDA1OlIjc4pLFKE7FEyxb5ZPNTbtuiv2fvrtddfOFsYXcwj8d8qv/XGq3femLvvvnvOvrIYPPEjG+PDseDbDnXcMXiyiGiyyACOPvrovN95552zV3/++ef5zVvez
nlEo6CICvG5l/d4JSvHP+qoo7JjKDs4PkVSGPm9HSz9W5rlPEoCQYHjVFXyRGnBOcKA28VOP/qTBWX6YnS2IKB8qYL/enyGHPbKziOOOCLj6sGeslGW8L6Y4ANr2MY99fpsdL7jjmFwkSTSr6gDVCk+tmDQedcJ5LgdwaLPbu7xjJRRNlErSsiQhVHJlOEQoh182o1wRTnharwYs3itnWP9Rd/RD5mLW5yveh/YRhYMjItyBh/wjPat8tEVx6B00RKo5513XpIl7rzzzuwEourMmTOz95uIcyBfTSXYiy++mCOrSFS1klsFrNZ9eGPoJtmeyRx00EE5cpGbIi21XnbZZbkMee2117KMHIKMIVcotVb/vXoOz6I0+URoMlVFcBFE7L1+IjNYIo6v/fo+D3tC+FCR+FHuwNUCgfOtUlccI5hnJMoIBhN1sBICqMoNNaLP3pkiFGciIIBC4HaEbRWk0dyHb3Mp/EY0I6+NsytvyKxsKhpQr8ozGpm1IZ8IbV+PyllGuyh1YBXXOQEcy6R8M5eAHzuxxX3GRvbaCKJ4aRfXrjkG5jEbk00Prxi8SZTJKmc5/PDDc5v99tsvC+hBjWtqStmD0F4Ma1foMvDtfqZMUc3/lYjMSFFW3NS7JtyyoKzSiTocHoFJHMc+MlK7Mta7n9NbATJerbEYvQWIWCVitIyaXrV3nsG7H2Y2GVcbxyj6NX+waKEPmOvbfShwtjhQDDz5Ygt/uuoY+OPtnICDEMBTWsAQUu0NBBsDEgFEWOADAiDaVRERWsCq5i34IRN+TbTJgn8KwzOFuR4KDUXW7Kyik53Ep8w/+RkxWeO5S1EM5wVABguXMGp69dk1x87D0ObdL32GHI5tsDQGHtwbm/Hw4TpnKvNY5Ge0x113DEwT3tIsIdSnDIfxcxJAevCHfE9cXcmotHXfAw88kIFUdgFjLMn4HuZRuh9FExmjRCCnZxRqcPxz8ioUVk9eRhJkPAYHV8ZVFRkjjFSfAtw222yTy2OZ0iv15fHcQ4dKaMcwsBdEEL26RzaIh5+yK7LSBGPno8yOZX+vzRhfXzZ8cRrtyzzkzpr803XHwB8wTJYIRol+VY8zqMMBbP0f+cExE1qTdbU7x3jwwQdzVBYdesExKNiEWx2MfwoOAyCbJ9uRHZvUTcPmsENhGNE4HBKOHKNqZzQu3KNfX9H1nRABQZlbNkpt4SNo4DWIIesDj9qYnwki2giWqol3330348kZLPm7xvi1Pffcc7MzhA3gy/0oeIuxWtmPiWNgNCIFYwcCAa2FA1ikJZz1aeUVsBmge9TyoqGoIqKUFdEKCFXcU0/pHJizVMUnXBiBh6IicdTTzsEOnuZkDE/2rcJI4KMf/TF+0TucwDhkZ+DGL4/nGkPGV/AIC+2RvfP6ZPTI4gu5XNM/Um7RPzuIFyn1zW7wpQ9UHj+fbOHPmDlGCOGBGIeQQfwuq0jnISBQfOHft7JEHN94Q5xF6XLFFVfkyKIEGyuiGAo3r6BIx0imcM6k+6GHHspOEQbcDq+UTl4BwRu7PstUiPEJFsa9/PLL83nXg6d2xnUvoxS5L7744uGyh/wyRpRF9YwSHsHjE088kWWADQeRFThZkTgBstensZG5h4m56oEdcAp9CwTOVUlj6hgECcGBpA6XDazeiLKhVABQAhKB3cNxbEAL4KoEppm+gjf3OMafDf+UW7zeTL/ltqIiAxBMOIIxnLOHgbFsMGQ4InhE0nJfrXw2hnIRD3SFBKmYWDfqE49woFvOzZno3NxM0HDciMjBDsjEBgLTsJHYN+qjmWtj7hjBLKFFQgL7qRz14jHHHJPBcC2M3wRPVDT5ohzZRv0Z16O/sdozAKmdopUH5kftTrzJpl+lk29CcgpLw3BgpMbwwqF/S80pGJ6xO0WM+8Ybbxw2TuOEoTYakwyovB/JKdzDMVQOHvCRzXju890fL11aGhcMqqIxdwwCRkYQDZAaE7lWBhyosQEmQM439Mgff
DHm0Si8EcuBC0ezcQSZVKYktzFEW+3sfQ4natRvu9eMTS9F7IvHo+m/2fb6LNuCc0WsW+mzHq9j6hgE9YCHp5tkez2EAVjlMOmyUlU2Lis8ygVR0rykyoltPZCaOY9fr32Qp50X6xi7pWCGbsHBvwLgGIcddljGxvcsjOU1GseyiKjJQWydpiqNsBlei85BfhNxeJunVCl31x0jBOMAjJ9jRC3OEERDS7QMI0qQohIYgLSq7FJuMZbi9WZA7kRbvFAWx5Dyy449mjEDG/dyDPW4VSiy2iNvBcCSUdxyyy35OYHrqJUx843j8I/qQpA074BVVdR1x+AIHCIiIGewsqIuds41tSSlOxeOFHuOQ/E+2zPEuFYVKM32U3RMvGy44YbZMTg2B2+GOIXXJcjpR9lkUy/QyZ7GUU8zAD9RCiuR0oQYVv1IMAk7qFL+rjkGg7GZQPLufffdN69QKJtkCAKKjNGu1p7gMgWDYEDRpkpAmu0rnMLehie/RavcI49Sr1ZW0w6V91ac/IsxmdHPB0U5pQ+4+TExDudNUhPufnaKIn7N6m2k9h11jKLRqP+UQJb2eHh4uYjK0LW1D0MpCq0NR4g24RTR/0hCdvM6/m14FtljeTL4D/liedFeO7LYcyh7eMGDY8X16IM8Vp9kWjj2GwWG5IZb2FKVOHTMMTCvDKBgD2Z22223bNynnnpqVrZXBFxjQDZUFJiwIqKHN8qHO+64IxvN/fffn9vG/VWC0UpfeC5uZMEbg/ctM/8SzYOxZ599Nhs4ebSx0ECpcDFvMCdRggkesoQ+zaHU0N4EgAEnue2227JTON+LgaEVDFu5h+w2Wdl33GFkEUIQqYIqdYwwbJGO8q2xOydqUiTFWpJVPzsuUwhlzzFETxlGdFSCqaMB4XwvUzgKWU3AyW4uwFns4QMbilUyxbq8p/4cw3UEB8FDGQUDx/acqB8zRS2dw5qthe3VatPKucocg6JiYu3lP2nfawvekKVITzgJQLH24QTBtPZeE2D89957b27jwZ1IwIm8R2OMWHmJ+3pxTzaK8l+HyMrgTzrppMxqOIEsGoZvz0nsyWiliRMUl2G9aOk6POyLZVUvYtBpniL4wA1m9lVSW46BOQqKpTLK9FnUsxftvW4swssa4dkhCGFCMNfcp08lhM9KKc4h0obgsa8ShHb6Cv5DJnu8IwHB9TB852DkOlzIRV6kXbSVMfQj48BWdhE0TLr1Fe3zQR/+gRMK5yjuq4KjZccQ2SlYjexHmCnSkiLjtsesmlnpQ5naFo1A5GMAHoJxBI709ttv54ygntZWmWEcQMS9VQleRT9kNmfAG0P3HRPGbHnVudg4gEyJOAYiE0wikHAAcxHyxndO4KI/WHEK/Qzo7wjAXfaFNdurikaNtIERRTqmYIYdE2tGEs8hfJ8iFB/3xV67MCjG8NZbb6Unn3wyC+XfDxfnDxFp496qhK6qn5CDA5twK/fIRH5Gb0MMOhxCFgkKjOBoHqKEkmWvueaanG04iTHcP3CKQO0/e3ZhgceP2smqcKyKRuUYlEKhPDL+d5z1c4qVFTDnmBIZMwZ9DiKAzTmvCetPNFR7W7fXXt/KLddqTcyjr17bRybkEF5XiQhPHnMuDlF07MCB3I49l4EDxTrnfsFBJBxQbQSKeGoROqjdurWzIzoGJqRxS2KUf/rpp2flcRDRjRKVCdpFhCwz7rOVKE5z++235/7uuuuuXDq5P5yKEY0np8B3TKb9K1/vLTF0/7MiJtyRPYrq4fx+7R2e7vFDDzDyfx1goPwcUGMEYG/rFI3oGAYW0UUyimQIcRwGzbgpVsZAUTYE065xCtc5GUeSHTyg4kzKs/FKoSBljyhvTz6y2gseZAwlwgI+cNBGtpV9ZRj4BobjFY9O8g0bQcXWaRpxBE5hHuFnJ0XB6dOn56ge2QGDlK2dFSSG4b8kxVzEdSWGVxgYQLzrxJkIGgbTaUE73b9MZ/KNfIMOJpdcckndYZWmF
Awv+wgydW/o8wsCK3xnz56dFzx8oxPGtk7QiI5h0FBaeGzRKYIpjDN2ig6lB9OiprmI60qNieIMIXvsQy7yotjH9eI+2hbPDY4bI8D+2JdnWTYY+iwDs78qaUTHEM0sI1pClAVMnqX9ImGQszB6DHoNOLzZNZlGRlEq9JNB9JOsRXvoxDGnsDTudwFUHTNmzMjDqEaU9xYvGgWiZnka0TEo16CeNyCM1SLtwmt5cNEoCOUa5xjQAIFWEGBP5rbKdTRr1qwcfGUMthXVTCt917pnRMdwE6ZiQm0JckADBMYCgWLwtXjTSeq/d5Y7ieag7wmDwMAxJowqB4JUicDAMapEc9DXhEFgcjxcM7vvR4on7bHS1q84WNkpUr/iEL+aOLRw4cIlQCmuIhUBmsjHlpQ9c7EmzjEsN1vd6DeCg8UVT+qRd7b6EQey8wMT+6El8RSu36xhIO8AgQYI9F94bADG4NIAgUDg/wHX+3lgThDIegAAAABJRU5ErkJggg==\".encode('utf-8')), embed=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "J0QZYD_HuDJF"
-   },
-   "source": [
-    "We're going to be building a model that recognizes these digits as 5, 0, and 4.\n",
-    "\n",
-    "# Imports and input data\n",
-    "\n",
-    "We'll proceed in steps, beginning with importing and inspecting the MNIST data. This doesn't have anything to do with TensorFlow in particular -- we're just downloading the data archive."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:20.958307",
-     "start_time": "2016-09-16T14:49:20.864840"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 110,
-     "status": "ok",
-     "timestamp": 1446749124399,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "w5vKZqr6CDz9",
-    "outputId": "794eac6d-a918-4888-e8cf-a8628474d7f1"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Already downloaded train-images-idx3-ubyte.gz\n",
-      "Already downloaded train-labels-idx1-ubyte.gz\n",
-      "Already downloaded t10k-images-idx3-ubyte.gz\n",
-      "Already downloaded t10k-labels-idx1-ubyte.gz\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "from six.moves.urllib.request import urlretrieve\n",
-    "\n",
-    "SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'\n",
-    "#SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'\n",
-    "# for those who have no access to google storage, use lecun's repo please\n",
-    "WORK_DIRECTORY = \"/tmp/mnist-data\"\n",
-    "\n",
-    "def maybe_download(filename):\n",
-    "    \"\"\"A helper to download the data files if not present.\"\"\"\n",
-    "    if not os.path.exists(WORK_DIRECTORY):\n",
-    "        os.mkdir(WORK_DIRECTORY)\n",
-    "    filepath = os.path.join(WORK_DIRECTORY, filename)\n",
-    "    if not os.path.exists(filepath):\n",
-    "        filepath, _ = urlretrieve(SOURCE_URL + filename, filepath)\n",
-    "        statinfo = os.stat(filepath)\n",
-    "        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')\n",
-    "    else:\n",
-    "        print('Already downloaded', filename)\n",
-    "    return filepath\n",
-    "\n",
-    "train_data_filename = maybe_download('train-images-idx3-ubyte.gz')\n",
-    "train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')\n",
-    "test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')\n",
-    "test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "gCtMhpIoC84F"
-   },
-   "source": [
-    "## Working with the images\n",
-    "\n",
-    "Now we have the files, but the format requires a bit of pre-processing before we can work with it. The data is gzipped, requiring us to decompress it. And, each of the images are grayscale-encoded with values from [0, 255]; we'll normalize these to [-0.5, 0.5].\n",
-    "\n",
-    "Let's try to unpack the data using the documented format:\n",
-    "\n",
-    "    [offset] [type]          [value]          [description] \n",
-    "    0000     32 bit integer  0x00000803(2051) magic number \n",
-    "    0004     32 bit integer  60000            number of images \n",
-    "    0008     32 bit integer  28               number of rows \n",
-    "    0012     32 bit integer  28               number of columns \n",
-    "    0016     unsigned byte   ??               pixel \n",
-    "    0017     unsigned byte   ??               pixel \n",
-    "    ........ \n",
-    "    xxxx     unsigned byte   ??               pixel\n",
-    "    \n",
-    "Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).\n",
-    "\n",
-    "We'll start by reading the first image from the test data as a sanity check."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:22.112407",
-     "start_time": "2016-09-16T14:49:20.960204"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 57,
-     "status": "ok",
-     "timestamp": 1446749125010,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "P_3Fm5BpFMDF",
-    "outputId": "c8e777e0-d891-4eb1-a178-9809f293cc28"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "magic number 2051\n",
-      "image count 10000\n",
-      "rows 28\n",
-      "columns 28\n",
-      "First 10 pixels: [0 0 0 0 0 0 0 0 0 0]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import gzip, binascii, struct, numpy\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "with gzip.open(test_data_filename) as f:\n",
-    "    # Print the header fields.\n",
-    "    for field in ['magic number', 'image count', 'rows', 'columns']:\n",
-    "        # struct.unpack reads the binary data provided by f.read.\n",
-    "        # The format string '>i' decodes a big-endian integer, which\n",
-    "        # is the encoding of the data.\n",
-    "        print(field, struct.unpack('>i', f.read(4))[0])\n",
-    "    \n",
-    "    # Read the first 28x28 set of pixel values. \n",
-    "    # Each pixel is one byte, [0, 255], a uint8.\n",
-    "    buf = f.read(28 * 28)\n",
-    "    image = numpy.frombuffer(buf, dtype=numpy.uint8)\n",
-    "  \n",
-    "    # Print the first few values of image.\n",
-    "    print('First 10 pixels:', image[:10])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "7NXKCQENNRQT"
-   },
-   "source": [
-    "The first 10 pixels are all 0 values. Not very interesting, but also unsurprising. We'd expect most of the pixel values to be the background color, 0.\n",
-    "\n",
-    "We could print all 28 * 28 values, but what we really need to do to make sure we're reading our data properly is look at an image."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:22.525418",
-     "start_time": "2016-09-16T14:49:22.114324"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 887,
-     "status": "ok",
-     "timestamp": 1446749126640,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "F_5w-cOoNLaG",
-    "outputId": "77dabc81-e3ee-4fcf-ac72-88038494fb6c"
-   },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgkAAAFkCAYAAACq4KjhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3XuYbHV95/v3t29V3bX7tu8g9xhwxyTG3YIwitHgSFRC\nzDhPYgeOUU4yMSE+nD0niWNGB0YmN3zCJgokZmJUQHseBoejHhC8JBoFFGUrQd2So4MBJPvSu7ur\n7/ff+WOt3+JXq1dV36q6qrs/r+dZT1dXrar67dqw16e+v5s55xARERFJa6p3A0RERKQxKSSIiIhI\nJoUEERERyaSQICIiIpkUEkRERCSTQoKIiIhkUkgQERGRTAoJIiIikkkhQURERDIpJIiIiEgmhQSR\nbczMnjKzxYzjg/HjOTO7zcwGzWzMzO4xs72p1zjTzO4zswkzO2ZmN5mZ/m0R2QL0P7LI9vYyYH9w\n/FvAAXfHj98CvBF4M/Aq4HTgk/7JcRi4H2gBLgZ+A3gb8L4Nab2I1JRpgycR8czsFuANzrnzzawL\nOAm8xTl3b/z4BcBR4GLn3KNm9nrg08BpzrnB+JzfBv4M2OOcm6/LH0REqkKVBBEBwMxagauAD8d3\nvYyoQvBFf45z7kngaeCS+K6LgSd8QIg9CHQDL651m0Wktlrq3QARaRi/QnRx/1j8+z5g1jk3mjrv\nOFHXBPHP4xmP+8cez3ojM9sFXA78CJheV6tFtrc8cA7woHPuVLVfXCFBRLxrgM86544tc54RjVtY\nTqVzLgc+vtKGiciyrgI+Ue0XVUgQEczsLOC1wJuCu48BbWbWlaom7OX5asEx4MLUy+2Lf6YrDKEf\nAdx1110cOHBgrc2umkOHDnH48OF6NwNQWypppPY0SluOHj3K1VdfDfH/U9WmkCAiEFURjhPNVPAe\nA+aBywA/cPF84Czg4ficR4A/MrPdwbiE1wFF4HsV3m8a4MCBAxw8eLBaf4Y16+7uboh2gNpSSSO1\np5HaEqtJt51Cgsg2Z2ZGNG3xo865RX+/c27UzD4M3Gxmw8AY8AHgIefcN+LTPkcUBu40s3cBpwE3\nArc65+Y28I8hIjWgkCAirwXOBD6S8dghYAG4B8gBDwDX+gedc4tmdgXwV0TVhQngo8D1tW2yiGwE\nhQSRbc4593mgucxjM8A746Pc858BrqhN60SknrROgohse/39/fVuQkJtKa+R2tNIbaklrbgoIhvO\nzA4Cjz322GONNvhLZFM5cuQIfX19AH3OuSPVfn1VEkRERCSTQoKIiIhkUkgQERGRTAoJIiIikklT\nIEWkbt74xjeRy+UzH2tra+UTn/gYL3vZyza4VSLiKSSISN0cO/Yant/qoVRT0+3cd999CgkidaSQ\nICJ1dB2QPQWyuVmbRIrUm8YkiIiISCaFBBEREcmkkCAiIiKZFBJEREQkk0KCiIiIZFJIEBERkUwK\nCSIiIpJJIUFEREQyKSSIiIhIJoUEERERyaSQICIiIpkUEkRERCSTQoKIiIhkUkgQERGRTAoJIiIi\nkkkhQURERDIpJIiIiEgmhQQRERHJpJAgIiIimRQSREREJJNCgoiIiGRSSBAREZFMCgkiIiKSSSFB\nREREMikkiGxzZna6md1pZoNmNmlmj5vZwdQ57zOz5+LHP29mL0w93mtmHzezopkNm9nfmllhY/8k\nIlJtCgki25iZ9QAPATPA5cAB4P8GhoNz3gX8HvDbwEXABPCgmbUFL/WJ+LmXAW8EXgV8aAP+CCJS\nQy31boCI1NV/Ap52zv1mcN+/pM65DrjROfcZADN7K3AceBNwt5kdIAoYfc65b8XnvBO4z8x+3zl3\nrNZ/CBGpDVUSRLa3XwK+aWZ3m9lxMztiZklgMLNzgf3AF/19zrlR4OvAJfFdFwPDPiDEvgA44OW1\n/gOISO0oJIhsb+cBvwM8CbwO+GvgA
2Z2dfz4fqKL/fHU847Hj/lzToQPOucWgKHgHBHZhNTdILK9\nNQGPOufeG//+uJm9mCg43FXheUYUHipZyTki0sAUEkS2t38FjqbuOwr8u/j2MaKL/T5Kqwl7gW8F\n5+wNX8DMmoFellYgUg4B3an7+uNDREIDAwMMDAyU3FcsFmv6ngoJItvbQ8AFqfsuIB686Jx7ysyO\nEc1a+CcAM+siGmtwW3z+I0CPmb00GJdwGVG4+Hrltz8MHKx8iogA0N/fT39/aYA+cuQIfX19NXtP\nhQSR7e0w8JCZvRu4m+ji/5vAbwXn3AK8x8x+APwIuBF4FvgUgHPu+2b2IPDfzex3gDbgg8CAZjaI\nbG41G7hoZtea2VNmNmVmXzOzC2v1XiKyNs65bwK/QlTffwL4z8B1zrn/EZxzE9FF/0NElYF24PXO\nudngpX4d+D7RrIb/F/hHonUVRGQTq0klwcx+DfgL4D8AjxJ1PD5oZuc75wZT5+4immP9I2C6Fu0R\n2UbywDnAg865Uyt5gnPufuD+Zc65AbihwuMjwNXlHheRzalW3Q2HgA855+4AMLN3EK3Cdg1wU+rc\ny4GP16gdItvVVUSrIIqIrFnVQ4KZtQJ9wJ/4+5xzzsy+wPOLr4R+BHDXXXdx4MABAA4dOsThw4er\n3bR1a8R2NWKbQO1arWq16+jRo1x99dUQ/38lIrIetagk7AaayV58JT2KGuIuhgMHDnDwYDTKubu7\nO7ndSBqxXY3YJlC7VqsG7VLXnYis20auuKiFVURERDaRWlQSBoEFosVXQnupsLDKoUOH6O6OFlV5\n9NFHufLKKzPnhIpIpB4Lq4jI9lL1kOCcmzOzx4gWU/k0gJlZ/PsHyj3v8OHDSbn1yiuv5NOf/nS1\nmyaypdRjYRUR2V5qNbvhZuBjcVjwUyA7gI+u5MmNWj1oxHY1YptA7VqtRm2XiGxvNQkJzrm7zWw3\n8D6ibodvA5c7506u5PmN+g9mI7arEdsEatdqNWq7RGR7q9myzM6524Hba/X6IiIiUlsbObtBRERE\nNhGFBBEREcmkkCAiIiKZFBJEREQkk0KCiIiIZFJIEBERkUwKCSIiIpJJIUFEREQyKSSIiIhIJoUE\nERERyaSQICIiIpkUEkRERCSTQoKIiIhkUkgQERGRTAoJIiIikkkhQURERDIpJIiIiEgmhQQRERHJ\npJAgIiIimRQSREREJJNCgoiIiGRSSBAREZFMCgkiIiKSSSFBREREMikkiIiISCaFBJFtzMyuN7PF\n1PG94PGcmd1mZoNmNmZm95jZ3tRrnGlm95nZhJkdM7ObzEz/tohsAS31boCI1N13gMsAi3+fDx67\nBXg98GZgFLgN+CRwKUAcBu4HngMuBk4H7gRmgfdsQNtFpIaqnvaX+2YiIg1n3jl30jl3Ij6GAMys\nC7gGOOSc+7Jz7lvA24FXmNlF8XMvB14EXOWce8I59yDwXuBaM9OXEJFNrlYlwe8A+4D98fHKGr2P\niKzfT5rZj83sh2Z2l5mdGd/fR1Rt/KI/0Tn3JPA0cEl818XAE865weD1HgS6gRfXvukiUku1Svrz\nzrmTNXptEamerwFvA54ETgNuAP7RzH6aKODPOudGU885Hj9G/PN4xuP+scer32QR2Si1Cgk/aWY/\nBqaBR4B3O+eeqdF7icgaxd0D3nfM7FHgX4BfJfr/N4sBbiUvv/wph4iKDqH++BCR0MDAAAMDAyX3\nFYvFmr5nLUJC2W8mzrmJGryfiFSJc65oZv8MvBD4AtBmZl2pasJenq8WHAMuTL3MvvhnusKQ4TBw\ncD1NFtk2+vv76e8vDdBHjhyhr6+vZu9Z9ZCwzDeTj5R73qFDh+juLv1GkfWBiEikFt8qzGwH8BPA\nx4DHiGY6XAbcGz9+PnAW8HD8lEeAPzKz3cG4hNcBRUADlkU2uZqPPk59Mynr8OHDHDyobxQiK1WN\nb
xVm9n7gM0RB/gXAfyUKBv/DOTdqZh8GbjazYWAM+ADwkHPuG/FLfI4oDNxpZu8iqh7eCNzqnJtb\nz59PROqv5iEh+GZyR63fS0RW7QzgE8Au4CTwVeBi59yp+PFDwAJwD5ADHgCu9U92zi2a2RXAXxFV\nFyaAjwLXb1D7RaSGqh4SKnwzGaj0PBHZeM65iv15zrkZ4J3xUe6cZ4Arqtw0EWkAtagkLPfNRERE\nRDaBWgxc1EhDERGRLUCbsIiIiEgmra2+Cs65sr/72+V+VmJmy54Tnpf+uZbXWuv5IiKyfSgkrNLi\n4iLOuZLD37e4uMjCwkLJT3/bOYeZlb3QV7rgmxlNTU1lj6zXWOl9IiIi5SgkrEIYCsIQ4H+fn59n\nfn6eubm5JbcXFxeTi70PC+EB2Rd0gKamJpqbm2lpaaGlpaXkdktLS+brpG+bWRJUst5DREQkTSFh\nFcKA4I/5+fnk9uzsLDMzM8zOziaH/31hYaHkm3/6Z6UKQ1NTE62trbS2ttLW1lZye3Fxkebm5szg\nUS6MhK8vIiJSjkLCKqWrBmHFYHp6OjmmpqZKfg9DQrqroLm5GciuAAA0NzeTy+WWHH68Q1aVotJt\nT0FBREQqUUhYhXR3w9zcXMkxOTlZ9pifn0+6DXxA8LfTlQAo7SZobm6mvb2dfD5Pe3t70tUBUVeE\n70YIg0d4n389f67/XUREpBKFhFUIA0JYQfDdClNTU0xMTDA+Pl5yjI2NMT8/T3Nzc0kwCI9KXQQt\nLS3MzMzQ0dGRBATnXPI6/nZ4+Ps8f1846yI9RkFERCSkkLAK/sLqQ0IYEMKQMDY2xujoaMkxOzub\nDDr0Aw/Dn+UGNPqQMDc3l8ySgOii39LSQmtr65LZDz44hKEg5Ls3REREKlFIWAXnHPPz88zOzibj\nDvwxOTnJ6OhoZkAYGxtjdna2JBSkw0I6GIShobm5OQkkPpSEYx9aW1szuzHCikVW9cKfsxWVG6SZ\n9Rlnjdmo9HwRke1CIWEVFhcXmZubY2ZmhsnJycxuhfCnvz02Nsbc3FzmRXq57gaAlpYWZmdnkzDS\n0dFBe3s7HR0ddHR0lFQjyoWErOpFS0vLlg4J5Q5fhSn3mZQbG6KAICLbjULCKviQMD09nXQrFIvF\n5JiYmFhyjI+PMzExkQxcTF/Ew4GLwJILkq8kTE1Nkc/nM490SEi/h19PobW1tWR9ha0aEspVCvzP\nlpaWZCppW1tbyW3//OWqCyIi24FCwiosLi4yPz+fVBJGR0cZGRlhaGiI4eHhktkM/lt/enZDeMEK\nD1g6BdL/bGpqIpfLJRc0f9v/THcfpIOCDwd+fQV/+HCx1WR9vuHR2tq6JGjlcjmAJQM/m5qaWFxc\nLJlFIiKyXSgkrEJYSfDdDT4kDA4OlqyPEP6cmppKQkKlNQy8dFBYyYU+q0Lhj/CbcnpBppaWrfmf\nQPozCYNULpdLumr8apg+FPjqih/8CWzJaouIyEpszStEjYRjEiYmJpJKwqlTpzh58mQyy2F6enrJ\n7YWFhbL93OlQ4KW7HNJjC9IVhEohIV2F8MdWrSSkx3+En1k+n2dmZqZkxkjYDRF+Jv7vZyUbdYmI\nbDUKCauQriSMjY2VhIRwOeb0sbCwAFTeeCmUvq9c/3q57gs/zsGv1ugDgj/871uxkuD/3OHYi/D3\nsIIAz08nbWtrY35+viQQhBUfBQUR2W623hWihvxCSn4KpO9y8BWF9AqM/vD7O6RlhYOVXojSF7Ks\n5Z79bR8MfN97ePiQsNq+9o24YK61Tb4qUG6wZqFQKKkgNDc309raSi6XY25uruTPFo5REBHZbhQS\nVsFfTNrb2ykUCnR3dzMxMZF0J5QLCb6svVw3Q3oL6vTqiOG21JUWSwpf098OF4EK3y+cfhk+L11m\nz2pPrVQanxE+FrYnbFdYScjaPdNvihV2RYRHOOMhDB5bsWtGRKQ
ShYRVCEfGFwoFurq6kkGJzrlk\nwaMwHPjf/Qj5cl0FYQAIf6ZvlzvKPT+8mPvZGUASGsqt9hjuAbHagLJWWWMyyh1ZgSo8yo3faG5u\nLgkJWdWGfD6/ZH+McCCjiMh2oZCwCmFI2LFjRzL4zY+MD0NB+qe/MJVb8Mh3ZYQXfv97uDW1Pypt\nWZ0+wkoCPB8QwmmZ5aZnlgsp/r71qDQWIyuwpMNLuVAU7muR9dOHo6wqQ2tra/Jn9H/n/hyFBBHZ\nbhQSViFdSfCD3/wMAj9IMVxC2d8Ov91mHeFFP+uCn96a2h9hN4ffT8If6e4CfxENuz7KhZYwvKQD\nSxgW1qvSjI5ygzLDkFDuqLROwuLi4pKuiHB6abqC4O9TSBCR7UYhYRV8GGhvb0+6GMwsCQ7pGQ1h\nWHDOZY6090dWEEjvNlnpmJmZSd4rvJCG1QN/8UsPeqy0XHQYXtLVi2qEBN+G9G0fDsotEpUVXMIA\nU2k9irCSkF53orW1Nfm79o/7vwuFBBHZbhQSViGsJPgLke/DLhQKFadAOudKFkQKv7n6kJCuBoS3\n06EjPGZmZmhtbWV6ejpZ+CfsUqg0riFrkF84wM+PYwgrHGGlYz2WG6CYFVrSIaFSF0t6QKY/fCUh\nPR7B/3349/CDGH13kUKCiGw3Cgmr4L9Z5vP5kgqC/xYffpsPL+A+JIQrHaZv+90l06EgvR11eDtc\nsMmvFJieyRAOiszqzgjDiw8rPiCUq3CElY71Kjd7IQwG5bpnVhpe0l0a/s+YXsHS/52EAWF2dlaV\nBBHZthQSViGcUw+lJem5uTlyuVzF7oYwGKSDQlbVIB0S0gHB/z41NVWyWFJ6dcVyF3nfZZKucIS3\n/cXXVzbSt9fzWVb6GX6bz1o5MQxV/jlht4OXdWEPP98w8ITdFbWawSEispkoJKxS2FeeVYIOHw8v\nuMCSrobwZ7qs7oPH/Px8ctHP6m7wK0CW2zdienq6JBSEF0Z/kS8XENKVhLALpJqVhKygkK4khGGh\npaUlCUfpA8qv6RCueZAetJmuKoTvF7ZRRGQ7UUhYhfTFJVytMH1h8xed8Bv7cgMX/TfkrDJ/1uyF\ndKUh3DMi7IpYaSUhKyyEYyXS7ajWmISsoJBVSQhvT01NMTY2xvj4OOPj48lCR75tWQtR+d/9+6T/\nvtLjRMJBkwoKIrIdrTokmNmlwB8AfcBpwJucc59OnfM+4DeBHuAh4Heccz9Yf3Prz/fxZ20CFF50\n0v3/QMU+dh8SfEDImgJZbnxAuQGN/lhJJWGjuxvCzy3rdvoCnq4q+B04c7lcstDRwsJCEo6y1nEo\nV0nwr++7gLI20FJAEJHtaC2VhALwbeDvgE+mHzSzdwG/B/wG8BTw34AHzeyAc252HW2tu/DiEt7n\nL2i+T9sP/Ev3kWdN6fO/Zy2iFP5eaWGlcjMi0t0D660kbER3Q1hJKDceobm5mWKxWLJjo99TY3Jy\nMukKgtL1IcJ1I1ZSSfCDQVVJEJHtatUhwTn3APAAgGX/q3kdcKNz7jPxOW8FjgNvAu5ee1MbQxgS\n/HQ6fyEKpw2mpxxC5Z0cK60guNySzOWqDZUqD2utJGRVItbzWa62kuBvFwqFki6GmZkZJicnkymM\nnp/mGQaE8P2yxiSE3UKqJIjIdlbVMQlmdi6wH/iiv885N2pmXwcuYZOHhPQFzS//C8tvzpT1/HQ/\nfKXnVzrKVSH87XJdDeutJFQrJGR9Ln4NinJBIZ/Pl3QxTE5OMjo6mkxhTF/UfSUhfJ+sMSRZYxLS\n7RQR2S6qPXBxP+CIKgeh4/Fjm176wt4IlgsQ660k1GrgIiwNBv6+5aZA+gDjuxhGR0dpb28v6YII\ng1a5GRTpmSh+Wup2rSSY2buBPwZucc79x/i+HHAz8GtADngQ+F3n3IngeWcCfw28GhgD7gD+k3Ou\nOstyikhdbNTsBiMKD2UdOnS
I7u7ukvv6+/vp7++vZbtWZTNeKMIdDNPldt/NkZ5pEY6b8IM00xUR\ni5c3Xo9yXQ3pgYX+MR94/M+sdQ589cQHA7+2hf+5uLhILpcjn88nR3t7e8mRz+eTNSbSFYVGMjAw\nwMDAQMl9xWJxza9nZhcCvwU8nnroFuD1wJuBUeA2ovFIl8bPawLuB54DLgZOB+4EZoH3rLlBIlJ3\n1Q4Jx4gCwT5Kqwl7gW9VeuLhw4c5ePBglZuz/WRdyMJv5un7fEgIvzVn7QgZPjd8fjXaWm6tBP/T\nV0T8GBAzW7LgVDooeL6NYVvDgJAVFsKQ4KsKjRgSskL0kSNH6OvrW/VrmdkO4C6iWUnvDe7vAq4B\n3uKc+3J839uBo2Z2kXPuUeBy4EXAa5xzg8ATZvZe4M/M7Abn3Pr6pUSkbqoaEpxzT5nZMeAy4J8g\n+Ufm5UTfPqSG/AU1fTHzYSA8LwwJWXskpL/RZ1Uh0sFhPe3O+pnVdeJv+5CQtVR0OFgxfTQ1NSUr\nU4YVhXQlId3tsN5AtAncBnzGOff38QXeexnRvxPhOKMnzexponFGjxJVD56IA4L3IPBXwItZWpkQ\nkU1iLeskFIAXElUMAM4zs5cAQ865Z4hKk+8xsx8APwJuBJ4FPlWVFkumMCBk9cH72/7buJ926Qdf\nZk3PDC+s4fPDKZvVaHf6dlg9KDclNF1FSC+tnFUJ8bcrdTW0t7eTy+WWrJnQiJWEajGztwA/RxQI\n0vYBs8650dT94Tij/WSPQ/KPKSSIbFJrqSS8DPgHojEGDviL+P6PAdc4524ysw7gQ0SLKX0FeP1m\nXyNhM6gUFLIu9P5bebmpmZUCQtZiRWtpb7nfnXPJ7AsgCQc+DFTafyG9C2R6d8usMQn+Z0dHx5IB\nnFs5JJjZGUTB/t865+ZW81SWGWcUW+acQ0B36r7++BCRULXHIa3EWtZJ+DJQsfbqnLsBuGFtTZL1\nyOpy8IHAP5buPvDPyzqynp81ELIa7Q6lg4if7lhuaep0d4NvW9Y6COmuhnRFIT2jYiuHBKKVU/cA\nj9nzf8Bm4FVm9nvALwI5M+tKVRP28ny14BhwYep198U/0xWGlMOAxiKJrEQ1xyGtlPZu2IKyvqGH\nF/RyF/dKpf+s59YyJIRrPEBpJaHcktNhJSHcVyMMCelwkNXlkO522cIBAeALwM+k7vsocBT4M+DH\nwBzROKN7AczsfOAs4OH4/EeAPzKz3cG4hNcBReB7tWy8iNSWQsIWsJIL2HoucrW4QC4XMMLBin5v\ninC3y4mJiWTnx5mZmaSisLCwUFJFCMOBH29QKBTo6Oigo6OjZDaDn9GQXkBpKy/L7JybIHUhN7MJ\n4JRz7mj8+4eBm81smGgNhA8ADznnvhE/5XPxa9xp0bLspxGNRbp1lV0YItJgFBKkIaRDgx+cOD09\nzcTERMkxPj7OqVOnGBoaolgsMj4+ztTUFHNzc0n3hF9wKZ/PJ6GgUChQKBTo7e2lp6eHzs5OCoVC\nMpsha4XFrRgMViCd4A4BC8A9RIspPQBcm5zs3KKZXUE0m+FhYIKoGnH9RjRWRGpHIUHqLlwZ0f/0\ngxN91WB0dLTkGBoaYmhoiNHR0SQkzM7OJoMWm5ubaWtrI5/P09HRQWdnJ11dXXR2dtLb20t3d/eS\nkNDS0lJ2XMZ24pz7hdTvM8A746Pcc54Brqhx00RkgykkSF1l7XHhuxl8F4MPCcPDw8kxMjLCyMgI\nxWKRiYkJpqenk5AAUVeDryT4kNDd3U1PTw89PT1JSPBdDr6SkBUOtmNQEBEBhQRpAFkbW/nuhrCS\nMDw8zODgIIODg4yOjjI2Nsb4+HjZ7gZfSSgUCnR2dtLT08POnTtLQkKlSgI05l4dIiIbRSFB6qbS\nxlS+u2F6eprx8XGKxWISEo4fP87ExASTk5PJEXY3QOmYhLCS4EPCjh072LFjx5KQEK4L4Skgi
Mh2\npZAgDSO9BHM4cHFsbIyRkRFOnTrFiRMnmJycZGZmhtnZWWZmZpLbfkxCue4GP2jRz27o6OgoGbio\ncCAi8jyFBKm75SoJ6e6GEydOMD09vWTfBr9DpF9AyU99zOpuCNdLCHd8rBQMFBpEZLtRSJC68WEg\nfSwuLpZ0J/j1EKanp5dUDdIrLIabT/k1EvzPtra2JBCE+zKkV1RUGBARiSgkSN34FRX9Usvhz/Hx\n8SQo+HAQ7vwYhoNw8SRgyZLKfh+G8Ehvja1gICKylEKC1E24o6MfW+CXWx4fHy+pIvjHwoCQriAA\nJbs+VgoL4f3beU0EEZFKFBKkbnx3w9zcHDMzM8myyzMzMxUrCb6bwb+G5y/yvkKQDgfhbpCqJIiI\nLE8hQerGdzeEIcFXD3xICMcipLsbym1IlVVJyOpySG/gpKAgIlJKIUHqJt3dMDU1xeTkZLI/g68k\n+KCQ7m7IWvjIL8m80u4GBQQRkfKa6t0A2b6yuht8SBgbG1vxwEV4PiCkxyRkVRDKDVxUUBARKaVK\ngtRMue2g00svhwFhfHy8ZMllX0nw20H7tRDCGQ1hIGhubl6y/kFbWxutra3JdEgFBBGRlVFIkJpK\nb+Dkf19cXFwSDsbGxigWi4yMjDA8PEyxWGRsbCypJvi1EZxzNDc3JyEgl8uV3O7o6GDv3r3s2rUr\n2RK6o6ODXC6XGRIUFEREsikkSE2ll1oOf4bjEHwFYWRkhKGhIUZGRpKKgt/l0c9sgOc3cAqXV/ZH\nZ2cne/bsYffu3fT29tLV1UWhUCCXy9Ha2qqAICKyQgoJUjNhOMg60pUEX0XwISE9DTK9FbQPCZ2d\nnXR1dSU//R4Nu3btore3t2S3R19JCMOB1kkQEcmmkCA1lQ4K4fLLvpLgZzP4SsLw8DDDw8PJQkrh\nmISwu6GtrY329nY6OzuTjZt27tyZVA+6u7vp7u6mq6sr6W7wlQQ/yFEBQUSkPIUEqamsgOCXYs4a\nsOgrCcPDw0tWYUxvBe3HH3R1ddHT05N0MezatSvZCjq9JbSvJGRVERQWRERKKSRIzaTHI4QBYX5+\nPtnlcXJykrGxsSUhwU91DJ+TriT47obe3l52797N/v372bNnD+3t7cmRz+dpb28vqSRA6QJMYZsV\nFEREIgoJUlPlKgl+6mO4gJKf3eC7G7IGOy4uLgLPj0nw3Q09PT3s2rWLffv2sW/fvmTqo5/+mN75\nUURElqeQIDWTriDMzc0lqyv6KkJ4+G2gfRdD+I3e7/DY0hL9JxtWCtrb2ykUCiVHuEW0/5neElpE\nRCpTSJC+UCxwAAAgAElEQVSaCUNCuNOjP/yAxHChJN+dAJSsoJg+Ojo6SroT/DoJ4aJJ4f4MGm8g\nIrJ6q16W2cwuNbNPm9mPzWzRzK5MPf6R+P7wuL96TZbNwoeE9CZOfkZDuCdDuJpi2KXQ3NycdBf4\nsQW+WuCDQnp1Re30KCJSHWupJBSAbwN/B3yyzDmfBd4G+H+VZ9bwPrIFZFUS/DiErJCQriT4kJA+\n/MJJ5SoJ4XLNqiSIiKzNqkOCc+4B4AEAK/+v7oxz7uR6GiabX7qS4ENCuCV0ursha1+GlpaWpErg\nl2AuFApJQMjn8yWDFNPbQGs9BBGRtanVLpCvNrPjZvZ9M7vdzHbW6H2kgWWNSfDdDX4VRV9JSG8D\nDVElwQ86THc3VKoklNvpUUREVqcWAxc/S9QN8RTwE8CfAveb2SWu3LaAsiX5raDTlYSwuyFrC+h0\nJSEMCT4cZI1J8OsgtLa2llQNVEUQEVmbqocE59zdwa/fNbMngB8Crwb+odrvJ/WzXOZLVxLCgYvh\nmITluhtaW1vJ5XJJSPArKKZDQjizwctaMElERFam5lMgnXNPmdkg8EIqhIRDhw7R3d1dcl9/fz/9\n/f01bqHUiq8khOsj+PEI6Y2b/L4M4cwGPxYhn89TKBSSD
Zz8Msx+46as1RRh6weDgYEBBgYGSu4r\nFot1ao2IbEU1DwlmdgawC/jXSucdPnyYgwcP1ro5sgF8JcBPZ8yaApnubsjqamhpaSGXyyXjEDo7\nO+nu7qanp4eenp5kC+gwJPixB2FA2KphIStEHzlyhL6+vjq1SES2mlWHBDMrEFUF/L+855nZS4Ch\n+LieaEzCsfi8Pwf+GXiwGg2WxpXufgjHJIQzG8qtk1CpkuD3aOju7mbnzp3J7o47duxIQoKf1bBV\nQ4GIyEZbSyXhZUTdBi4+/iK+/2PA7wI/C7wV6AGeIwoH/8U5N7fu1krDCgNCWElI79UQdjeEIaFc\nJcHvz+ArCT09PclW0H5sQlYlAbZuBUFEZKOsZZ2EL1N56uQvrr05stn5i7zf/TGsJIRTIH1ICGc3\nrGRMgq8k7Nixo2QapF9pUZUEEZHq0eRxWbcwGIQ/syoJfjxCesXFrNUW/ayG9JiE3t7esmMStMPj\n6pjZO8zscTMrxsfDZvaLweM5M7vNzAbNbMzM7jGzvanXONPM7jOzCTM7ZmY3mZn+bRHZArTBk1RF\nVlBYbkyCn/64lkqCrx74aY9hJaESVRmWeAZ4F/CD+Pe3AZ8ys59zzh0FbgFeD7wZGAVuIxpzdClA\nHAbuJ+pavBg4HbgTmAXes2F/ChGpCYUEWRe/FoJf3yD86UNAekvoqakppqamSsKBc65krwa/OJJf\ndtmvtui7GPL5fLKyol8bobm5WYsmrZJz7r7UXe8xs98BLjazHwPXAG+Juxkxs7cDR83sIufco8Dl\nwIuA1zjnBoEnzOy9wJ+Z2Q3OufmN+9OISLWpJCjrEnYnhFWC8fHxksOvixB2M4QBoampqaR7IdwK\nermll7UddHWYWZOZvQXoAB4B+oi+SHzRn+OcexJ4Grgkvuti4Ik4IHgPAt3Aizei3SJSO6okyJql\nN3BKH2NjYyUBIZzRMD09XTIjwl/kW1pacM6tem8GBYS1M7OfJgoFeWAM+BXn3PfN7KXArHNuNPWU\n48D++Pb++Pf04/6xx2vTahHZCAoJsi7pxZL8MTs7WxISsioJ/uKetWNjub0ZwqAQVhEUFNbl+8BL\niKYtvxm4w8xeVeF8I5r+vBzt1SKyySkkyJqFezP46Y3huIOxsbEl1QQ/HmF6ejqpCPiLfDi+oFIl\nodJ20LJ68biB/x3/esTMLgKuA+4G2sysK1VN2Mvz1YJjwIWpl9wX/0xXGDIcIuqZCPXHh4iE6rEU\nu0KCrEu6khBOcUxXEcKAMD09TS6Xo6mpacmYhLa2tmRqox+06O/3sxjCNRHSh6xbE5ADHgPmgcuA\newHM7HzgLODh+NxHgD8ys93BuITXAUXge8u/1WFAy7GLrEQ9lmJXSJB18SEh3AbaD1xMdzeEYWF6\nejoZgwDZezWElYSsnR7T20HL6pnZHxNt7/4M0AlcBfw88Drn3KiZfRi42cyGicYrfAB4yDn3jfgl\nPkcUBu40s3cBpwE3ArdqlVWRzU8hQdYsHLgYhgRfRcia3ZDubpifny87uyGru8F3OWjRpKrZB9xB\ndHEvAv9EFBD+Pn78ELAA3ENUXXgAuNY/2Tm3aGZXAH9FVF2YAD5KtIeLiGxyCglSkV8UyR/hffPz\n80komJiYYHR0lGKxSLFYZGRkhJGREYrFIuPj48m6CGEoaG5uLtnAaceOHXR1dSV7NKQ3cCq3y6Os\nnXPuN5d5fAZ4Z3yUO+cZ4IoqN01EGoBCglSUXiApXDzJr43gKwc+JAwNDTE8PEyxWGR0dLQkJCws\nLACUVA7SIcEvvexDQkdHh5ZdFhGpA4UEqShcXtmHA3873b0wNjaWVBGGhoYYHR1Nxif4zZzCSoKf\nzRCGBF9F8FtBZ+3NoCqCiMjGUEiQZYXhwK+wGE57DCsJIyMjDA8Pc+rUKcbGxpLpkH4jp3CFxXCg\nog8J3d3dyXbQnZ2dq
iSIiNSRQoJU5CsJPiTMzc0xPz/P/Px85mwG391w6tQpJiYmki2i5+bmkjEJ\nAM3NzSUbOIXdDT09PfT09CyZ4eDHJKiSICKyMRQSpKIwIPhw4Jdd9ps3laskTE5OJtUHf/hxDeld\nHrNCQtb0R1USREQ2jkKCLCsdFMKQEI5J8AMXfUiYmpoq+5rLhYTe3t5kuqOmPoqI1IdCwjYXbrKU\nJVxR0Y9B8PszpNdCSB8zMzPJksnhPgt+Y6ZwK2g/LqFQKLBjxw4KhYK2ghYRqTOFBFkiDA5+oaRw\nuWV/jI6OcurUKUZGRhgbG2NycrJkBgM8XzFIH62trZmbOKV3ekzv0aCAICKycRQSJBEuluT5WQxT\nU1PJ4ES/3LIfpFguJJhZyQDFdPdBpZDguxa0HbSISP0oJAiwNCD4n2Elwa+q6FdSHB4eThZNCkPC\n3Nwci4uLyTbQvnLgt3v23QzLVRLS3RQKCSIiG0shQZYstxzeXlhYSAYo+urB8PAwQ0NDyYJJflXF\nyclJpqenM7sb2trakjUR/OFDgg8KWSFBXQ0iIvWjkCBAaRdDuFdDWEnwMxiGhoY4efJkshZCuMOj\n725YXFxMugrSIcEPUFxJJSHcAlpBQURkYykkSCK9mZOvJITdDeEUx8HBwWQ1Rb+7Y3pMQrq7IZzu\n6EOCDwjpkBAGBEDdDSIiG0whQUpUCgm+kjA8PMzg4CAnTpwoWU3R356bm6vY3RBOc/SVhHR3Q2tr\na50/CRERUUjY4pZbB8F3KfgjXFlxfn6eU6dOMTw8zMjICKOjo4yNjTExMZFUEMLnhcHAbwOdrh50\ndnbS3d2dbODkw0I+n6etra2km0FEROpLIWGb81s++wWSZmZmmJ2dTW4PDg4yODiYzGLwOzpOT08n\nsxgWFxcBkimP/na4mmKhUKCzs7NkK+ju7u5kA6cwJCggiIg0hqbVnGxm7zazR81s1MyOm9m9ZnZ+\n6pycmd1mZoNmNmZm95jZ3uo2W6rFh4Tp6elkiqMfmHjs2DFOnDiRVBPCWQx+7IEfpBiOQWhubk7W\nQ/CrKfouhnDZ5e7ubjo7O0tCgp/yKCIi9bfaf40vBT4IvBx4LdAKfM7M2oNzbgHeCLwZeBVwOvDJ\n9TdV1mK5b+W+u2FmZiZZRXFkZITBwUGOHz+ezGLw3Q2+q8GPPwg3bSo3UDHcCjqsJPjuBlUSREQa\n06q6G5xzbwh/N7O3ASeAPuCrZtYFXAO8xTn35fictwNHzewi59yjVWm1rJsfP5BVSRgZGUmOYrGY\n/PTdDX49hLm5uWT8gP/2H/5errvBVxJ8gAi3gvb7M4iISP2td0xCD+CAofj3vvg1v+hPcM49aWZP\nA5cACgkNJgwJ4ewFP8XRL8McLsfspzrOzc0layH4YBBu4JTubkiPSfDLM/tZDeHARRERqb81hwSL\nvu7dAnzVOfe9+O79wKxzbjR1+vH4MWkA6YWT0pWE4eFhTp48yYkTJxgfH08WSgoPHxKAkoWOwg2d\nwpkNWWMSsjZ+UiVBRKRxrKeScDvwU8ArV3CuEVUcpI7S4QCe3wo6a+nlwcFBJiYmmJ6eLjl8QFhY\nWEhmM4TrIfjDL7nsA4I/Ojs76ezszNxCWtMfRUQax5pCgpndCrwBuNQ591zw0DGgzcy6UtWEvUTV\nhLIOHTpEd3d3yX39/f309/evpYkSy9q4KbwdrosQLoo0MzOTBAI/SNGvhxAOVPTBINyTwR+9vb2Z\nAxTDikF6VUVZuYGBAQYGBkruKxaLdWqNiGxFqw4JcUD4ZeDnnXNPpx5+DJgHLgPujc8/HzgLeKTS\n6x4+fJiDBw+utjmyAumAkF5R0QcFHxJ8QPBjD8qFBIDm5mZyuVxmxaCnp4edO3cm6yH4VRVbW1tL\nNm1K7/KowLAyWSH6yJEj9PX11alFIrLVrCokmNntQD9wJTBhZvvih4rOuWnn3KiZfRi
42cyGgTHg\nA8BDmtlQXz4U+At8+LsPCGFI8EHBz2JYSSUhHJhYaT2EdEhQOBARaUyrrSS8g2hswZdS978duCO+\nfQhYAO4BcsADwLVrb6JUQzoY+J9hd0O6q8Ef4TlhSAiXX25vb2fHjh3JzAVfQejp6SnpbvCVhKzu\nBgUFEZHGstp1Epadm+acmwHeGR/SALK6G/xyylndDT4o+EWTfDDwP8PuBj+LIQwJO3fuZM+ePUk3\ngx+sGIaEcICiAoKISGPS3g3bRDochCEhq7vBVxFmZ2eXVB98QPB7NYSVhK6uLnbu3Mnu3bvp7u5O\ndnn0Mx3C7oYwJHgKCiIijUMhYZtID1gsFxTSYxL8WghZ0yfTYxLC7obdu3fT1dVFPp8nl8slaybk\ncrlkdoOIiDQ2hYQtIGs76PC+sFIQ3p6bm0uWXQ73ZfAzGvz2z+n1DPyRy+Xo6upKjnBmg98COlw3\nobW1VQsmiYhsIgoJW0h6TQR/26+oGE5r9D9HRkY4efIkQ0NDFItFJiYmkh0efUBobW1dcrS0tNDR\n0cGuXbuSGQw+KPipjulgoIWSREQ2F4WELaLcYknOOWZnZ5mammJiYiJZZtkfIyMjnDhxgqGhoaSa\nMD09zfz8PEAy5sB3F/gug3w+T6FQYPfu3clMhs7OTgqFQjJA0e/F4EOCgoKIyOaikLCFZC2W5Ddw\nCpdcHh0dTX6OjIwwNDRUUknwISGsJISDD30Q6OzsTCoJPT09SUgIKwk+HPiNmxQSREQ2D223t0WU\nCwiLi4vMzs4muzwWi0WGhoY4efIkx44d49ixY5w8eZLh4eGkkuC7GyCqJLS2tpLL5SgUCskiSbt2\n7WLPnj1JSOjq6ioJCX4Wgz/C3SIVEhqHmb3bzB41s1EzO25m98arpIbn5MzsNjMbNLMxM7vHzPam\nzjnTzO4zswkzO2ZmN5mZ/n0R2eRUSdhishZMSlcSfEg4efIkIyMjjI+PMzk5mXRFVKok+JAQHn7R\npKxKQjjQMdwtUhrGpcAHgW8S/Xvwp8DnzOyAc24qPucW4PXAm4FR4Dbgk/FzicPA/cBzwMXA6cCd\nwCzwng37k4hI1SkkbAHpMQhhQAgrCRMTE0lIOHHiBMePH2d4eDjZyCnc5dFXEpqammhra0vGIHR1\nddHb27ukghDOcAj3aNCqio3NOfeG8HczextwAugDvmpmXcA1wFucc1+Oz3k7cNTMLoqXW78ceBHw\nGufcIPCEmb0X+DMzu8E5N79xfyIRqSaFhC2i3BoIfrllX0kYHR1NtoH2IcFPiwx/+umPfjyB727o\n7OxMVlXctWtXMtXRT3v0iyf5mQ2y6fQQLb0+FP/eR/TvxBf9Cc65J83saeAS4FGi6sETcUDwHgT+\nCngx8PgGtFtEakAhYQtIb/nsL/L+ou9nMkxOTjI1NZWspOgDgT8/3JchnEbp10loaWlJxieEiyP5\nQKBxB5ubRX9ptwBfdc59L757PzCb2vodoq3f9wfnpLeCPx48ppAgskkpJGwBYUjwF3//c2ZmJhlz\n4ANCevvn9MZNYUjw4wjCkOCnRKbXQghnMMimdDvwU8ArV3CuEVUclrOSc0SkQSkkbAHhAMVwWWW/\naJKvJGSFhLm5uZKNnnw3hVcpJKiSsHWY2a3AG4BLnXPPBQ8dA9rMrCtVTdjL89WCY8CFqZf028in\nKwwph4Du1H398SEioYGBAQYGBkruKxaLNX1PhYQtIF1J8Csr+sNXEsp1N6Q3fVppJSGfzydLLquS\nsHnFAeGXgZ93zj2devgxYB64DLg3Pv984Czg4ficR4A/MrPdwbiE1wFF4HtUdBg4uP4/hMg20N/f\nT39/aYA+cuQIfX19NXtPhYQtwIcEX0mYnp5OQkG4ymK57oYwGKwlJGgthM3LzG4n+tp+JTBhZr4C\nUHTOTTvnRs3sw8DNZjYMjAEfAB5yzn0jPvdzRGH
gTjN7F3AacCNwq3NubiP/PCJSXQoJW0C4k2NY\nSZiYmGBsbKykkuBDgg8Kfj2ErANWPibBj0tQJWHTeQfRuIEvpe5/O3BHfPsQsADcA+SAB4Br/YnO\nuUUzu4JoNsPDwATwUeD6GrZbRDaAQsIWEFYS/HoHfnGksbGxkjEJ6e4GvxV0+FqhciHB7+MQDloM\nd3hMVxLMLHO3Sqkv59yyic45NwO8Mz7KnfMMcEUVmyYiDUAhYRNI7+qYvp0VEHwVoVgsJkHBVxJ8\nBcGvhbCccKVEHxjCDZvCFRXDgJAVFEREZPNQSNgkynUJ+F0ewy4Gv2hSsVhkZGQkCQrpkKBv9iIi\nUolCwiZRbsllHxL8dEffzeB3eBweHi4ZlzAzM8Pc3NyKqwgiIrJ9KSRsIumpin5dg6zBir6SMDw8\nXLLioioJIiKyUgoJm0RYSfALH/kjrCSE3Q2+kuAHK4aDFn3AEBERKUchYZMIuxh8OPCDD8NKQtjd\n4Mck+OmO6fURVEkQEZFKFBI2gawdHsPNnMpVEnx3gz8vvVeDQoKIiFSikLBJZHU3+At/uMpiOCbB\ndzekxzKEgx5FRETKUUjYJML9GcJdHsNuhLArIb1hU9YW0OGqiv5n+rZfLMkvpuTXRgjXTkivjyAi\nIluDQsImEVYRfFDwyyv7pZb9Coq+SyErIKSDAlBykU8HgHBFxfQCSlkBQWFBRGTrWNUi+2b2bjN7\n1MxGzey4md0b7wgXnvMlM1sMjoV4ExlZo6yA4CsI6b0YfFAIKwmV9mYAlqyk6EOB35MhrCasZoVF\nERHZ3Fa7E8+lwAeBlwOvBVqBz5lZe3COA/6GaD/5/UQ7wv3h+pu6vS0XFNLdDb7LIT0OIS1dRUjv\n0VCukpCuIigoiIhsPavqbnDOvSH83czeBpwA+oCvBg9NOudOrrt1kvAhYSWVhHRA8M/31YOVVBLC\nsLBcUPCvoYAgIrK1rHdP3x6iysFQ6v6rzOykmT1hZn+SqjTIGmTNavAhwQeFSgMXlxuT4LsQ0uEg\n7GpYbjyCiIhsLWseuGjRleEW4KvOue8FD30c+BfgOeBngZuA84F/v452bmvh9MewkuCnPmaNSfAD\nF7NCQSirqyErJKwkKCg0iIhsLeuZ3XA78FPAK8I7nXN/G/z6XTM7BnzBzM51zj1V7sUOHTpEd3d3\nyX39/f309/evo4lbR7kVF8NFkspNe/TCi7e/3dzcnIw/aGtrI5fLldzesWMHhUKB9vZ28vk8bW1t\nSwYxZoUDBYXaGxgYYGBgoOS+YrFYp9aIyFa0ppBgZrcCbwAudc796zKnfx0w4IVA2ZBw+PBhDh48\nuJbmyDLKTW/0Uxzb29vLHjt37mTPnj3s3LmT7u5uduzYQXt7O62trWVnO8jGyArRR44coa+vr04t\nEpGtZtUhIQ4Ivwz8vHPu6RU85aVE4xaWCxNSI2F3QvizqamppFpQKBSS2/5nb28vO3fupLe3l+7u\nbgqFQlJRWG6mg4iIbG6rCgnxegf9wJXAhJntix8qOuemzew84NeB+4FTwEuAm4EvO+e+U71my2qY\nWTKeID17IZ/PUygU6OrqKnt0d3cnt8NKQlhBUCVBRGTrWW0l4R1EVYEvpe5/O3AHMEu0fsJ1QAF4\nBvifwB+vq5WyLmElIRyI6LsafEjo6emht7c3+dnb21tSZfAVhnw+n3Q3ZK3WqKAgIrI1rHadhIpT\nJp1zzwKvXk+DpPoqLZSUz+fZsWMHXV1d9Pb2smvXrpKjo6ODfD6/5AhDgn8PBQQRka1FezdsA1kh\nobW1lVwul1QSOjs76enpYdeuXezZs4e9e/eyZ8+eJBCkj5aWlqRy4N8jfD8REdn8FBK2gayQkMvl\nSkKC727wIWH//v3s378/c+xB+LuIiGxdCgmbRFNTU0lXQS6XS9ZG8Asr+QWUIFr/oK2tjfb29mTt\nAx8Mwt+7urr
YvXt3Mnuhs7OzZAZDS0tL5pgDdS2IiGx9CgmbQFgJaGtrI5/PlyyU5FdVbGpqSsYZ\ndHR0JF0IfnEkf4SLJxUKBXbt2pWsg+ADgq8gKBCIiGxfCgmbRFhJyOVyOOeSqY2+/O8fCwPC+Ph4\nyTiC9LiCjo4Ouru7k8OvrpgVEhQURES2F4WETcBXElpaWmhra0sCgr/PjzXwAWHHjh1MTk4yOTnJ\n1NRUyTnhPgx+nYQdO3aUHL6S4N/DtyH8KSIiW59CwibhL+q+i8H/7rsNfBdDuH20/xkupBSuuujX\nTUgvxZzubvAUFEREtpf1bhVdE+lNaxpFvdqVriTk8/lkVsLnP/95uru76e3tZffu3ezbt4/TTz+d\nM844g7PPPpvzzjuPc889l3POOYezzjqLM888kzPPPJMzzjiD008/ndNOO429e/eW7M1QaUzCSgOC\n/g5Xp1HbJSLbm0LCKtSzXekxCT4kfOYzn6G7uzvZiMmHhDPPPJNzzjmH8847j3POOYezzz6bs88+\nm7POOoszzjiDF7zgBbzgBS9g//79JRs4hQMX17Mng/4OV6dR2yUi25u6GzaB8ALd3Nxc8lhzczOF\nQqEezRIRkS2uISsJIiIiUn8KCSIiIpKpEbob8gBHjx5N7igWixw5cqRuDSqnEdvViG0CtWu1qtWu\n4P+j/LpfTES2vUYICecAXH311SV39vX11aMty2rEdjVim0DtWq0qt+sc4OHlTjKzS4E/APqA04A3\nOec+nTrnfcBvAj3AQ8DvOOd+EDzeC9wKXAEsAp8ErnPOTVTlTyIiddMIIeFB4CrgR8B0fZsisunl\niQLCgys8vwB8G/g7oot7CTN7F/B7wG8ATwH/DXjQzA4452bj0z4B7AMuA9qAjwIfAq5Ov56IbC51\nDwnOuVNE/8iISHUsW0HwnHMPAA8AWPYc1+uAG51zn4nPeStwHHgTcLeZHQAuB/qcc9+Kz3kncJ+Z\n/b5z7ti6/iQiUlcauCgimczsXGA/8EV/n3NuFPg6cEl818XAsA8IsS8ADnj5BjVVRGpEIUFEytlP\ndLE/nrr/ePyYP+dE+KBzbgEYCs4RkU2q7t0NIrLpGFF4WO85wCGgO3Vff3yISGhgYGDJ6qzFYrGm\n76mQICLlHCO62O+jtJqwF/hWcM7e8Elm1gz0srQCkeEwcHD9LRXZBvr7++nvLw3QR44cqemMrYbq\nbjCza83sKTObMrOvmdmFdW7P9Wa2mDq+V4d2XGpmnzazH8dtuDLjnPeZ2XNmNmlmnzezF9a7XWb2\nkYzP7/4at+ndZvaomY2a2XEzu9fMzk+dkzOz28xs0MzGzOweM9tb7jU3sF1fSn1WC2Z2ey3bVYlz\n7imiEHBZ0MYuorEGfnDkI0CPmb00eOplROHi6xvUVBGpkYYJCWb2a8BfANcDLwUeJ5pqtbuuDYPv\nEH2T2h8fr6xDG/w0tWvJKOEG09R+G7gImCD67Nrq2a7YZyn9/GpdR74U+CDRhey1QCvwOTNrD865\nBXgj8GbgVcDpZEz/q0O7HPA3PP95nQb8YS0bZWYFM3uJmf1cfNd58e9nxr/fArzHzH7JzH4GuAN4\nFvgUgHPu+0TTLf+7mV1oZq+I/5wDmtkgsvk1UnfDIeBDzrk7AMzsHUT/kF8D3FTHds07507W8f3X\nPU2tju0CmNnIz88594bwdzN7G9HAuj7gq/E34WuAtzjnvhyf83bgqJld5Jx7tB7tCh6a3OD/3l4G\n/ANRQHFEQR3gY8A1zrmbzKyDaN2DHuArwOuDNRIAfp1oMaUvEC2mdA/Rf5Missk1RCXBzFqJ/rEM\np1o5on90Lin3vA3yk3E5/YdmdlfwDashrHCaWj29Oi6vf9/MbjeznRv8/j1EF7+h+Pc+onAcfl5P\nAk+zsZ9Xul3eVWZ20syeMLM/SVUaqs4592XnXJNzrjl1XBOcc4Nz7nTnXIdz7
vJwtcX48RHn3NXO\nuW7nXK9z7recc5O1bLeIbIxGqSTsBprJnmp1wcY3J/E14G3Ak0Sl3xuAfzSzn26gJWdXMk2tXj5L\nVMZ/CvgJ4E+B+83skjgE1lRc3bgF+Kpzzo8l2Q/MxkEqtGGfV5l2AXwc+BfgOeBniSpo5wP/fiPa\nJSKS1ighoZwVTqOqDedcuLTtd8zsUaJ/xH8V+Eh9WrVidf3sAJxzYVfHd83sCeCHwKuJSty1djvw\nU6xsHMlGfl6+Xa8I73TO/W3w63fN7BjwBTM7Nx5EKCKyoRqiuwEYBBaIBmyF9rKiaVQbwzlXBP4Z\nqPnMgVUIp6mFGuqzg2S0/CAb8PmZ2a3AG4BXO+eeCx46BrTFYxNCG/J5pdr1r8uc/nWiv9tG+u9N\nRLaRhggJzrk54DFKp1pZ/PuK16GvNTPbQVQ2X+4f9w2zwmlqDcHMzgB2UePPL74Q/zLwGufc06mH\nHwPmKf28zgfOIprOV692ZXkpUXWjYf57E5HtpZG6G24GPmZmjwGPEs126CDaUa4uzOz9wGeIuhhe\nAKUyqPEAAAr8SURBVPxXogvMQKXn1aAdBaJvk34GwXlm9hJgyDn3DM9PU/sB0W6aNxJMU6tHu+Lj\neqIxCcfi8/6cqBKz0h0K19Km24mmWV4JTJiZr7AUnXPTzrlRM/swcLOZDQNjwAeAh2o1s2El7TKz\n84hmCdwPnAJeQvT/xJedc9+pVbtERCppmJDgnLs7XhPhfUSl828Dl9d5+uEZRDtU7gJOEk1Vuzje\nuXIjVWOa2ka363eJBt+9NW7Tc0Th4L/ElaNaeUfcli+l7n870Rx/iALoAtFUvRzRNM5ra9imlbRr\nlmj9hOuI1p94BvifwB/XuF0iImU1TEgAcM7dTjSoqyE45xpiAfl4Pn/FriHn3A1Esy82zAra9Ysb\n1RbPObdsF5pzbgZ4Z3xsiOXa5Zx7lmhAp4hIw2iIMQkiIiLSeBQSREREJJNCgoiIiGRSSBAREZFM\nCgkiIiKSSSFBREREMikkiIiISCaFBBEREcmkkCAiIiKZFBJEREQkk0KCiIiIZFJIEBERkUwKCSIi\nIpJJIUFEREQyKSSIiIhIJoUEERERydRS7waIiIg0qqeffprBwcGK5+zevZuzzjprg1q0sRQSRERE\nMjz99NNccMEBpqcnK56Xz3fw5JNHt2RQUEgQERHJMDg4GAeEu4ADZc46yvT01QwODiokiIiIbD8H\ngIP1bkRdaOCiiIiIZFJIEBERkUwKCSIiIpJJIUFEREQyKSSIiIhIJoUEEdn2BgYG6t2EhNpSXmO1\np5HaUjuaAikiVWFm1wK/D+wHHgfe6Zz7Rn1btTIDAwO84hWvaIiV9QYGBujv76/pe6xUvdpSbpXD\nv/7rv+aCCy4AGmGVwwGgMf6eakkhQUTWzcx+DfgL4D8AjwKHgAfN7HznXOUrbwUjIyMcOXKk4jnV\nuFhMTU2taGW9XC7PJz95D6eddlpN27OdLbfKYV9fH7CyVQ63+5LK1aCQICLVcAj4kHPuDgAzewfw\nRuAa4Ka1vKBz89x66+3ccsstFc+rxpK4s7OzK1hZ7yvMzPxHrrjiipq3ZzurvMrhIeAwK1nlcKVL\nKlcKfkePHl3LH2FLUUgQkXUxs1agD/gTf59zzpnZF4BL1v7Ki8zPz7KSJXG/8pWvcOBAuXNW822x\n0sp6R4HFDWvPSr4Fz8zMkMvl1n3ORn6bXu7P9fyFOevvorvkvkoX8aNHj1Yt+G1nCgkisl67gWbg\neOr+48AFZZ6Tj378L+CbmScsLEzHt56q8NbfAoyrr766YgNbW3O8//1/zu7duzMfP37cN/1+ojCQ\n5aENac+zzz7LX/7lX/IHf/CfmJubznh2qIkouKzvnHLtefbZZ/n4xz8evUpTE4uLlV9nuXMGBwdX\n+OeC7L+LZ4GPs9LPOVLp7+tJos/m/wSyu
pCeAD61TFui169X1SF433wtXt+cc7V4XRHZJszsNODH\nwCXOua8H998EvNI5928ynvPrRP/Cikh1XOWc+0S1X1SVBBFZr0FgAdiXun8vS6sL3oPAVcCPgJV8\nrRSRbHngHKL/p6pOlQQRWTcz+xrwdefcdfHvBjwNfMA59/66Nk5E1kyVBBGphpuBj5nZYzw/BbID\n+Gg9GyUi66OQICLr5py728x2A+8j6nb4NnC5c+5kfVsmIuuh7gYRERHJpL0bREREJJNCgoiIiGRS\nSBCRDWVm15rZU2Y2ZWZfM7MLN+A9rzezxdTxveDxnJndZmaDZjZmZveY2d4qvv+lZvZpM/tx/N5X\nZpzzPjN7zswmzezzZvbC1OO9ZvZxMyua2bCZ/a2ZFardFjP7SMZndX+N2vJuM3vUzEbN7LiZ3Wtm\n56fOWfbvxszONLP7zGzCzI6Z2U1mtqrr2wrb8qXU57JgZrfXoC3vMLPH48+3aGYPm9kvbvRnAgoJ\nIrKBgo2grgdeSrRb5IPxoMda+w7RoMr98fHK4LFbiPaaeDPwKuB04JNVfO8C0WDOa4ElA8HM7F3A\n7wG/DVwETBB9Lm3BaZ8gWl/4sritrwI+VO22xD5L6WeV3u6wWm25FPgg8HLgtUAr8Dkzaw/Oqfh3\nE1/47icaiH8x8BvA24gG0Va7LQ74G57/bE4D/rAGbXkGeBfRcud9wN8DnzIzv770Rn0m4JzToUOH\njg05gK8Bfxn8bkTr2/5hjd/3euBImce6gBngV4L7LiBar/eiGrRlEbgydd9zwKFUm6aAX41/PxA/\n76XBOZcD88D+KrflI8D/qvCcF9WiLfHr7I5f+5Ur/bsBXg/MAbuDc34bGAZaqtWW+L5/AG6u8Jya\ntCV+nVPA2zf6M1ElQUQ2RLAR1Bf9fS7612udG0Gt2E/GJfYfmtldZnZmfH8f0TeusF1PEi0GVfN2\nmdm5RN9Kw/cfBb4evP/FwLBz7lvBU79A9M325TVo1qvjkvv3zex2M9sZPHZJDdvSE7/OUPz7Sv5u\nLgaecKVbkj9ItBvUi6vYFu8qMztpZk+Y2Z+kKg1Vb4uZNZnZW4jWHXmEDf5MFBJEZKNU2ghqf43f\n+2tE5dbLgXcA5wL/GPej7wdm4wvzRreL+D0clT+X/cCJ8EHn3ALRBazabfws8FbgF4hK6T8P3G9m\nVsu2xK9/C/BV55wfL7KSv5v9ZH92sMb2lGkLRPuNXA28mmjX0/8DuDN4vGptMbOfNrMxoqrB7USV\ng++zwZ+JFlMSkXozyveNV4VzLlzX/jtm9ijwL8CvUn7viJq3axkref+qt9E5d3fw63fN7Angh0QX\nxn+oYVtuB36K0rEi632vtbbHt+UVJS/m3N8Gv37XzI4BXzSzc51zlbabXEtbvg+8hKii8WbgDjN7\nVYXza/KZqJIgIhtlLRtB1YRzrgj8M/BC4BjQZmZddWrXMaJ/4Ct9Lsfi3xNm1gz0UuM2xhe/QaLP\nqiZtMbNbgTcAr3bOPRc8tJK/m2Ms/ez876tuT6ot/7rM6X7X0/CzqUpbnHPzzrn/7Zw74pz7z0SD\nfK9jgz8ThQQR2RDOuTngMaIR8UBS1r0MeHgj22JmO4CfIBow+BjRoLuwXecDZxH1AddUfBE+lnr/\nLqL+ff+5PAL0mNlLg6deRhQuvk4NmdkZwC7AXzCr2pb4ovzLwGucc0+nHq70dxN+Nj+TmiHzOqAI\nhF0F621LlpcSfTMPP5uqtCVDE5Bjgz+Tqo7a1aFDh45KB1F5f4qoz/tFRNPmTgF7avy+7yeaKnY2\n8G+AzxN9o9oVP3478BRRSb0PeAj4ShXfv0BUOv45olHo/1f8+5nx438Yfw6/BPwM8P8A/x/QFrzG\n/cA3gQuJyuBPAndWsy3xYzcRBZSz4wvRN4GjQGsN2nI70Yj7S4m+6fojnzqn7N8N0cXzcaKxFD9L\nNO7kO
HBjNdsCnAe8BzgYfzZXAj8A/r4Gbfljom6Xs4GfBv6UKBj8wkZ+Js45hQQdOnRs7AH8LvAj\norDwCPCyDXjP/7+9O0TJIIjDOPzDoojBZLII3sObeBarJzAoJtFgNYjJa1i8gIJBUYt8ht0gH1OE\nT9PzwKZddv/MhHl3mGGumrZafjStAr+s9n7cX2/aI/9cvVbX1c4Kv38wD8hfS9f5j2eOmmY23ptW\nou8vvWO7umj6G3ypTqvNVdZSbVS3TTMbn9VjddJSiFthLaM6vqrD3/RNU8C5qd7mwfC4WltlLdVu\ndV89zX30MA/eW39Qy9nc9h9zX9w1B4T/bJPFYuGAJwBgzJoEAGBISAAAhoQEAGBISAAAhoQEAGBI\nSAAAhoQEAGBISAAAhoQEAGBISAAAhoQEAGDoG5E93Be1HK29AAAAAElFTkSuQmCC\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f84680e3fd0>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "%matplotlib inline\n",
-    "\n",
-    "# We'll show the image and its pixel value histogram side-by-side.\n",
-    "_, (ax1, ax2) = plt.subplots(1, 2)\n",
-    "\n",
-    "# To interpret the values as a 28x28 image, we need to reshape\n",
-    "# the numpy array, which is one dimensional.\n",
-    "ax1.imshow(image.reshape(28, 28), cmap=plt.cm.Greys);\n",
-    "\n",
-    "ax2.hist(image, bins=20, range=[0,255]);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "weVoVR-nN0cN"
-   },
-   "source": [
-    "The large number of 0 values correspond to the background of the image, another large mass of value 255 is black, and a mix of grayscale transition values in between.\n",
-    "\n",
-    "Both the image and histogram look sensible. But, it's good practice when training image models to normalize values to be centered around 0.\n",
-    "\n",
-    "We'll do that next. The normalization code is fairly short, and it may be tempting to assume we haven't made mistakes, but we'll double-check by looking at the rendered input and histogram again. Malformed inputs are a surprisingly common source of errors when developing new models."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:22.895369",
-     "start_time": "2016-09-16T14:49:22.527595"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 531,
-     "status": "ok",
-     "timestamp": 1446749126656,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "jc1xCZXHNKVp",
-    "outputId": "bd45b3dd-438b-41db-ea8f-d202d4a09e63"
-   },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgcAAAFkCAYAAAC0KZhSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3XucZGdd7/vPr2/Vl+nb3BNyI2IwoCLTXJKDQTRKAGNE\ncSsteSHJQQUjhzNuBeGAiWTjJWwzMZAoWxFIAu0rO8gWNiEBolwSQoIZiEGGKO5AAmEuPd1dfb9N\nP+ePtZ6VZ61eVd3VXdVV3f19v17rVdVVq6qe6Ulmfev33Mw5h4iIiIjXVO8GiIiISGNROBAREZEU\nhQMRERFJUTgQERGRFIUDERERSVE4EBERkRSFAxEREUlROBAREZEUhQMRERFJUTgQERGRFIUDkW3M\nzB4zs6Wc473x8wUzu8nMhs1swszuMLO9mfc408w+ZWZTZnbUzK4zM/3bIrKJ6X9gke3tecD+4Pg5\nwAG3x8/fAPw88CrgxcDpwMf8i+MQcCfQAlwA/AbwOuBdG9J6EakJ08ZLIuKZ2Q3AK5xz55lZD3AC\neLVz7uPx888EjgAXOOceNLOXA58ATnPODcfn/DbwZ8Ae59xiXf4gIrIuqhyICABm1gq8BvhA/NDz\niCoC9/hznHOPAo8DF8YPXQA84oNB7G6gF3h2rdssIrXRUu8GiEjD+CWii/qH45/3AfPOufHMeceI\nuiCIb4/lPO+fezjvg8xsF3AJ8B1gdl2tFtne2oFzgLudcyer9aYKByLiXQl82jl3dIXzjGhcwkrK\nnXMJ8JHVNkxEVvQa4KPVejOFAxHBzM4CfhZ4ZfDwUaDNzHoy1YO9PFUdOAo8P/N2++LbbEUh9B2A\n2267jfPPP3+tza6agwcPcujQoXo3A1Bbymmk9jRKW44cOcLll18O8f9T1aJwICIQVQ2OEc088B4C\nFoGLAT8g8TzgLODL8Tn3A283s93BuIOXAkXgm2U+bxbg/PPP58CBA9X6M6xZb29vQ7QD1JZyGqk9\njdSWWFW75xQORLY5MzOi6Ycfcs4t+cedc+Nm9gHgejMbBSaAG4H7nHNfjU/7DFEIuNXM3gqcBlwL\nvM85t7CBfwwRqSKFAxH5WeBM4IM5zx0ETgF3AAXgLuAq/6RzbsnMLgX+iqiaMAV8CLi6tk0WkVpS\nOBDZ5pxznwWaSzw3B7wpPkq9/gng0tq0TkTqQesciMi2Nzg4WO8mJNSW0hqpPY3UllrQCokisuHM\n7ADw0EMPPdRog7pENpXDhw8zMDAAMOCcO1yt91XlQERERFIUDkRERCRF4UBERERSFA5EREQkRVMZ\nRWRTGh4e5tJLf4mjR8ut0gyXXXYpN954/Qa1SmRrUDgQkU3p61//Og88cC/wW0SbSeZ5kA9+8IMK\nByIVUjgQkU3ubUQ71uZ5DyV2jRaRMjTmQERERFIUDkRERCRF4UBERERSFA5EREQkReFAREREUhQO\nREREJEXhQERERFIUDkRERCRF4UBERERSFA5EREQkReFAREREUhQOREREJEXhQERERFIUDkRERCRF\n4UBERERSFA5EREQkReFAREREUhQOREREJEXhQERERFIUDkRERCRF4UBERERSFA5EREQkReFARERE\nUhQOREREJEXhQGSbM7PTzexWMxs2s2kze9jMDmTOeZeZPRk//1kze0bm+X4z+4iZFc1s1Mz+1sy6\nNvZPIiLVonAgso2ZWR9wHzAHXAKcD/xXYDQ4563A7wK/DbwAmALuNrO24K0+Gr/2YuDngRcD79+A\nP4KI1EBLvRsgInX1h8DjzrnXB499N3POm4FrnXOfBDCz1wLHgFcCt5vZ+UTBYsA597X4nDcBnzKz\n33fOHa31H0JEqkuVA5Ht7ReAfzGz283smJkdNrMkKJjZ04H9wD3+MefcOPAAcGH80AXAqA8Gsc8B\nDnhhrf8AIlJ9Cgci29u5wBuBR4GXA
n8N3Ghml8fP7ye6yB/LvO5Y/Jw/53j4pHPuFDASnCMim4i6\nFUS2tybgQefcO+OfHzazZxMFhtvKvM6IQkM5qzlHRBqQwoHI9vYD4EjmsSPAL8f3jxJd5PeRrh7s\nBb4WnLM3fAMzawb6WV5xSDl48CC9vb2pxwYHBxkcHFz9n0BkmxgaGmJoaCj1WLFYrMlnKRyIbG/3\nAc/MPPZM4kGJzrnHzOwo0SyEfwUwsx6isQQ3xeffD/SZ2XODcQcXE4WKB8p9+KFDhzhw4EC5U0Qk\nlhecDx8+zMDAQNU/S+FAZHs7BNxnZm8Dbie66L8e+M3gnBuAd5jZt4HvANcC3wP+EcA59y0zuxv4\nGzN7I9AGvBcY0kwFkc2pZgMSzewqM3vMzGbM7Ctm9vxafZaIrI1z7l+AXwIGgUeA/w94s3Pu74Nz\nriO62L+fqBLQAbzcOTcfvNWvA98imqXwv4EvEq2LICKbUE0qB2b2a8BfAL8FPAgcJFo05Tzn3HDm\n3F1Ec6S/A8zWoj0i20g7cA5wt3Pu5Gpe4Jy7E7hzhXOuAa4p8/wYcHmp50Vkc6lVt8JB4P3OuVsA\nzOwNRKumXQlclzn3EuAjNWqHyHb1GqJVC0VEKlb1cGBmrcAA8Cf+MeecM7PP8dSiKaHvANx2222c\nf/75QDSC+dChQ9Vu2ro1YrsasU2gdlWqWu06cuQIl19+OcT/X4mIrEUtKge7gWbyF03JjoqGuCvh\n/PPPT0Yt9/b2NuQI5kZsVyO2CdSuStWgXeqiE5E128gVErUgioiIyCZQi8rBMHCKaNGU0F7KLIgS\nLoby4IMPctlll2kxFJEyNnJBFBHZXqoeDpxzC2b2ENEiKJ8AMDOLf76x1OvCxVAuu+wyPvGJT1S7\naSJbykYuiCIi20utZitcD3w4Dgl+KmMn8KHVvLhRqwWN2K5GbBOoXZVq1HaJyPZUk3DgnLvdzHYD\n7yLqXvg6cIlz7sRqXt+o/1A2YrsasU2gdlWqUdslIttTzZZPds7dDNxcq/cXERGR2tjI2QoiIiKy\nCSgciIiISIrCgYiIiKQoHIiIiEiKwoGIiIikKByIiIhIisKBiIiIpCgciIiISIrCgYiIiKQoHIiI\niEiKwoGIiIikKByIiIhIisKBiIiIpCgciIiISIrCgYiIiKQoHIiIiEiKwoGIiIikKByIiIhIisKB\niIiIpCgciIiISIrCgYiIiKQoHIiIiEiKwoGIiIikKByIiIhIisKBiIiIpCgciGxjZna1mS1ljm8G\nzxfM7CYzGzazCTO7w8z2Zt7jTDP7lJlNmdlRM7vOzPRvi8gm1lLvBohI3X0DuBiw+OfF4LkbgJcD\nrwLGgZuAjwEXAcQh4E7gSeAC4HTgVmAeeMcGtF1EaqDq6X6lbyIi0nAWnXMnnHPH42MEwMx6gCuB\ng865LzjnvgZcAbzIzF4Qv/YS4EeA1zjnHnHO3Q28E7jKzPTlQ2STqlXp7xvAPmB/fPxkjT5HRNbv\nh83s+2b2n2Z2m5mdGT8+QFRdvMef6Jx7FHgcuDB+6ALgEefccPB+dwO9wLNr33QRqYVaJftF59yJ\nGr23iFTPV4DXAY8CpwHXAF80sx8lCvbzzrnxzGuOxc8R3x7Led4/93D1mywitVarcPDDZvZ9YBa4\nH3ibc+6JGn2WiKxR3A3gfcPMHgS+C/wq0f+/eQxwq3n7lU44ePAgvb29qccGBwcZHBxcxduLbC9D\nQ0MMDQ2lHisWizX5rFqEg5LfRJxzUzX4PBGpEudc0cz+HXgG8Dmgzcx6MtWDvTxVHTgKPD/zNvvi\n22xFYZlDhw5x4MCBdbZaZHvIC86HDx9mYGCg6p9V9XCwwjeRD5Z6nb5BiFSmFt8izGwH8EPAh4GH\niGYuXAx8PH7+POAs4MvxS+4H3m5mu4NxBy8FioAGIotsUjUfTZz5JlKSvkGIVKYa3yLM7D3AJ4kC\n/
NOAPyYKBH/vnBs3sw8A15vZKDAB3Ajc55z7avwWnyEKAbea2VuJqoXXAu9zzi2s588nIvVT83AQ\nfBO5pdafJSIVOwP4KLALOAHcC1zgnDsZP38QOAXcARSAu4Cr/Iudc0tmdinwV0TVhCngQ8DVG9R+\nEamBqoeDMt9Ehsq9TkQ2nnOubL+dc24OeFN8lDrnCeDSKjdNROqoFpWDlb6JiIiISAOrxYBEjSAU\nERHZxLQ5ioiIiKRo7fMKOOdK/uzvl7otx8xWPCc8L3u7lvda6/kiIrL1KRxUaGlpCedc6vCPLS0t\ncerUqdStv++cw8xKXuDLXejNjKamppJH3nus9jEREZEshYMKhGEgvPj7nxcXF1lcXGRhYWHZ/aWl\npeQi70NCeED+hRygqamJ5uZmWlpaaGlpSd1vaWnJfZ/sfTNLAkreZ4iIiHgKBxUIg4E/FhcXk/vz\n8/PMzc0xPz+fHP7nU6dOpb7pZ2/LVRSamppobW2ltbWVtra21P2lpSWam5tzA0epEBK+v4iISJbC\nQYWyVYKwQjA7O5scMzMzqZ/DcJDtEmhubgbyv/EDNDc3UygUlh1+PENeVaLcfU8BQURE8igcVCDb\nrbCwsJA6pqenSx6Li4tJ94APBv5+9ps/pLsDmpub6ejooL29nY6OjqRLA6IuB99dEAaO8DH/fv5c\n/7OIiEgehYMKhMEgrBj47oOZmRmmpqaYnJxMHRMTEywuLtLc3JwKBOFRriugpaWFubk5Ojs7k2Dg\nnEvex98PD/+Y5x8LZ1FkxyCIiIiAwkFF/AXVh4MwGIThYGJigvHx8dQxPz+fDCb0AwrD21IDFX04\nWFhYSGY9QHSxb2lpobW1ddlsBh8YwjAQ8t0YIiIieRQOKuCcY3Fxkfn5+WRcgT+mp6cZHx/PDQYT\nExPMz8+nwkA2JGQDQRgWmpubkyDiw0g4tqG1tTW3uyKsUORVK/w5W1GpwZd5v+O8MRnlXi8istUp\nHFRgaWmJhYUF5ubmmJ6ezu0+CG/9/YmJCRYWFnIvzit1KwC0tLQwPz+fhJDOzk46Ojro7Oyks7Mz\nVX0oFQ7yqhUtLS1bOhyUOnzVpdTvpNTYDwUDEdkuFA4q4MPB7Oxs0n1QLBaTY2pqatkxOTnJ1NRU\nMiAxe/EOByQCyy5EvnIwMzNDe3t77pENB9nP8OshtLa2ptZH2KrhoFRlwN+2tLQkU0Lb2tpS9/3r\nV6omiIhsZQoHFVhaWmJxcTGpHIyPjzM2NsbIyAijo6Op2Qn+W352tkJ4oQoPWD6V0d82NTVRKBSS\nC5m/72+z3QTZgOBDgV8fwR8+VGw1eb/f8GhtbV0WsAqFAsCyAZ1NTU0sLS2lZoWIiGx1CgcVCCsH\nvlvBh4Ph4eHU+gbh7czMTBIOyq1B4GUDwmou8HkVCX+E34yzCym1tGzN/wSyv5MwQBUKhaRLxq9e\n6cOAr6b4QZ3AlqyuiIiUszWvDDUSjjmYmppKKgcnT57kxIkTyayF2dnZZfdPnTpVsh87Gwa8bNdC\nduxAtmJQLhxkqw7+2KqVg+z4jvB31t7eztzcXGoGSNjdEP5O/N/PajbQEhHZKhQOKpCtHExMTKTC\nQbhscvY4deoUUH5DpFD2sVL956W6Kfw4Br+6og8G/vA/b8XKgf9zh2Mrwp/DigE8NS20ra2NxcXF\nVBAIKzwKCCKyXWy9K0MN+QWQ/FRG37XgKwjZFRP94fdfyMoLBau9AGUvYHnLMvv7PhD4vvXw8OGg\n0r70jbhQrrVNvgpQahBmV1dXqmLQ3NxMa2srhUKBhYWF1J8tHIMgIrJdKBxUwF9EOjo66Orqore3\nl6mpqaTboFQ48OXrlboTsltBZ1czDLeHLrfIUfie/n64eFP4eeE0yvB12XJ6Xntqpdz4i/C5sD1h\nu8LKQd5uln6zqrDLITzCGQxh4NiKXTAiInkUDioQjnTv6uqip6c
nGWzonEsWKgpDgf/Zj3gv1SUQ\nXvjD2+z9Ukep14cXcT/bAkjCQqnVGcM9GioNJmuVN+ai1JEXpMKj1PiM5ubmVDjIqy60t7cv278i\nHKAoIrLVKRxUIAwHO3bsSAa1+ZHuYRjI3voLUqmFinyXRXjB9z+HW0T7o9zW0dkjrBzAU8EgnF5Z\napplqXDiH1uPcmMt8oJKNrSUCkPhvhN5tz4U5VUVWltbkz+j/zv35ygciMh2oXBQgWzlwA9q8zMC\n/ODDcKljfz/8Npt3hBf7vAt9dotof4TdGX6/B39kuwX8xTPs4igVVsLQkg0qYUhYr3IzNEoNtgzD\nQamj3DoHS0tLy7ocwmmi2YqBf0zhQES2C4WDCvgQ0NHRkXQlmFkSGLIzFMKQ4JzLHTnvj7wAkN39\nsdwxNzeXfFZ4AQ2rBf6ilx3MWG5Z5zC0ZKsV1QgHvg3Z+z4UlFrcKS+whMGl3HoSYeUgu25Ea2tr\n8nftn/d/FwoHIrJdKBxUIKwc+AuQ76Pu6uoqO5XROZdayCj8purDQfbbf3g/GzbCY25ujtbWVmZn\nZ5MFe8Kug3LjFvIG74UD9/w4hbCiEVY21mOlgYd5YSUbDsp1pWQHWvrDVw6y4w3834f/DD840XcL\nKRyIyHahcFAB/02yvb09VTHw39rDb+/hhduHg3Blwux9v9tjNgxkt4UO74cLLfmV/bIzE8LBjnnd\nFmFo8SHFB4NSFY2wsrFepWYjhIGgVDfMakNLtuvC/xmzK076v5MwGMzPz6tyICLbjsJBBcI58ZAu\nPS8sLFAoFMp2K4SBIBsQ8qoE2XCQDQb+55mZmdQiR9nVEEtd3H3XSLaiEd73F11fycjeX8/vstxt\n+O09b6XDMEz514TdC17eBT38/YZBJ+yWqNWMDBGRzUDhoEJhX3heqTl8PrzQAsu6FMLbbPncB47F\nxcXkYp/XreBXbCy1r8Ps7GwqDIQXRH9xLxUMspWDsKujmpWDvICQrRyEIaGlpSUJRdkDSq/JEK5Z\nkB2Mma0ihJ8XtlFEZDtQOKhA9qISri6YvaD5i034DX2lAYn+G3FeOT9vNkK2shDu6RB2Oay2cpAX\nEsKxENl2VGvMQV5AyKschPdnZmaYmJhgcnKSycnJZIEi37a8BaT8z/5zsn9f2XEg4WBIBQQR2U4q\nDgdmdhHwB8AAcBrwSufcJzLnvAt4PdAH3Ae80Tn37fU3t/58H37e5jzhxSbbvw+U7UP34cAHg7yp\njKX6/0sNVPTHaioHG92tEP7e8u5nL9zZKoLfEbNQKCQLFJ06dSoJRXnrMJSqHPj39109eRtbKRiI\nyHaylspBF/B14O+Aj2WfNLO3Ar8L/AbwGPDfgLvN7Hzn3Pw62lp34UUlfMxfyHyftR/Ql+0Dz5ua\n53/OW/wo/LncgkilZjhkuwHWWznYiG6FsHJQarxBc3MzxWIxtYOi3/Nieno66fKB9PoO4boPq6kc\n+EGeqhyIyHZTcThwzt0F3AVg+f9avhm41jn3yfic1wLHgFcCt6+9qY0hDAd+Wpy/AIXT/7JTB6H8\nzorlVvxbaenkUtWFcpWGtVYO8ioP6/ldVlo58Pe7urpSXQlzc3NMT08nUxE9P10zDAbh5+WNOQi7\nf1Q5EJHtqKpjDszs6cB+4B7/mHNu3MweAC5kk4eD7IXML9MLK2+alPf6bD97udeXO0pVHfz9Ul0K\n660cVCsc5P1e/BoSpQJCe3t7qithenqa8fHxZCpi9mLuKwfh5+SNEckbc5Btp4jIVlftAYn7AUdU\nKQgdi5/b9LIX9EawUnBYb+WgVgMSYXkg8I+tNJXRBxfflTA+Pk5HR0eqqyEMWKVmRGRnlvjppdu1\ncmBmbwPeDdzgnPu9+LECcD3wa0ABuBv4Hefc8eB1ZwJ/DbwEmABuAf7QOVedZTRFZENt1GwFIwoN\nJR08eJDe3t7UY4ODgwwODta
yXRXZjBeIcEfBbFndd2dkZ06E4yL84MtsBcTiZYjXo1SXQnbAoH/O\nBx1/m7dOga+W+EDg16bwt0tLSxQKBdrb25Ojo6MjdbS3tydrRGQrCI1kaGiIoaGh1GPFYnHN72dm\nzwd+E3g489QNwMuBVwHjwE1E440uil/XBNwJPAlcAJwO3ArMA+9Yc4NEpG6qHQ6OEgWBfaSrB3uB\nr5V74aFDhzhw4ECVm7P95F3Awm/i2cd8OAi/Jeft0Bi+Nnx9Ndpaaq0Df+srIH6Mh5ktWygqGxA8\n38awrWEwyAsJYTjwVYRGDAd54fnw4cMMDAxU/F5mtgO4jWiW0TuDx3uAK4FXO+e+ED92BXDEzF7g\nnHsQuAT4EeCnnXPDwCNm9k7gz8zsGufc+vqfRGTDVTUcOOceM7OjwMXAv0Lyj8sLib5tSA35C2n2\nIuZDQHheGA7y9jDIfoPPqzpkA8N62p13m9dF4u/7cJC3pHM4CDF7NDU1JStJhhWEbOUg272w3iC0\nCdwEfNI590/xhd17HtG/E+E4okfN7HGicUQPElULHomDgXc38FfAs1leiRCRBreWdQ66gGcQVQgA\nzjWz5wAjzrkniEqQ7zCzbwPfAa4Fvgf8Y1VaLLnCYJDXx+7v+2/ffvqkH1SZN80yvKCGrw+nXlaj\n3dn7YbWg1NTObNUguwRyXuXD3y/XpdDR0UGhUFi25kEjVg6qxcxeDfwEURDI2gfMO+fGM4+H44j2\nkz/OyD+ncCCyyaylcvA84J+JxhA44C/ixz8MXOmcu87MOoH3Ey2C9CXg5Zt9jYPNoFxAyLvA+2/h\npaZYlgsGeYsMraW9pX52ziWzKYAkFPgQUG5/hOyujNndJvPGHPjbzs7OZQMzt3I4MLMziAL9zznn\nFip5KSuMI4pt+rFGIo2i2uOMylnLOgdfAMrWWJ1z1wDXrK1Jsh55XQs+CPjnst0E/nV5R97r8wY4\nVqPdoWwA8dMWSy0hne1W8G3LW8cg26WQrSBkZ0hs5XBAtNLpHuAhe+oP2Ay82Mx+F3gZUDCznkz1\nYC9PVQeOAs/PvO+++DZbUUjRWCOR1avmOKOVaG+FLSjvG3l4IS91US9X4s97bS3DQbhGA6QrB6WW\nhg4rB+G+F2E4yIaCvK6FbPfKFg4GAJ8Dfizz2IeAI8CfAd8HFojGEX0cwMzOA84Cvhyffz/wdjPb\nHYw7eClQBL5Zy8aLSG0oHGwBq7lwrefiVosL40rBIhyE6PeOCHefnJqaSnZinJubSyoIp06dSlUN\nwlDgxxN0dXXR2dlJZ2dnanaCn6GQXfhoKy+f7JybInMBN7Mp4KRz7kj88weA681slGgNgxuB+5xz\nX41f8pn4PW61aPn004jGGr2vwq4KEWkQCgfSELJhwQ86nJ2dZWpqKnVMTk5y8uRJRkZGKBaLTE5O\nMjMzw8LCQtIN4RdKam9vT8JAV1cXXV1d9Pf309fXR3d3N11dXcnshLwVEbdiIFiFbHI7CJwC7iBa\nBOku4KrkZOeWzOxSotkJXwamiKoPV29EY0Wk+hQOpO7ClQz9rR906KsE4+PjqWNkZISRkRHGx8eT\ncDA/P58MRmxubqatrY329nY6Ozvp7u6mp6eH7u5u+vv76e3tXRYOWlpaSo672E6ccz+T+XkOeFN8\nlHrNE8ClNW6aiGwQhQOpq7w9KHx3gu9K8OFgdHQ0OcbGxhgbG6NYLDI1NcXs7GwSDiDqUvCVAx8O\nent76evro6+vLwkHvmvBVw7yQsF2DAgisr0pHEjd5W045bsVwsrB6Ogow8PDDA8PMz4+zsTEBJOT\nkyW7FXzloKuri+7ubvr6+ti5c2cqHJSrHEBj7qUhIlJrCgdSN+U2jPLdCrOzs0xOTlIsFpNwcOzY\nMaamppienk6OsFsB0mMOwsqBDwc7duxgx44dy8JBuK6Dp2AgItuNwoE0jOxSyeGAxImJCcbGx
jh5\n8iTHjx9nenqaubk55ufnmZubS+77MQeluhX8YEQ/W6GzszM1IFGhQERE4UAawEqVg2y3wvHjx5md\nnV22r4LfsdEvfOSnMOZ1K4TrHYQ7MJYLBAoLIrJdKBxI3fgQkD2WlpZS3QZ+PYPZ2dllVYLsiojh\nplB+jQN/29bWlgSBcN+E7AqICgEist0pHEjd+BUQ/ZLI4e3k5GQSEHwoCHdiDENBuOgRsGzpY79P\nQnhkt6hWIBAReYrCgdRNuMOiHzvgl0WenJxMVQ38c2EwyFYMgNQujOVCQvj4dl7TQEQkj8KB1I3v\nVlhYWGBubi5ZHnlubq5s5cB3J/j38PzF3VcEsqEg3J1RlQMRkdIUDqRufLdCGA58tcCHg3CsQbZb\nodRGUXmVg7yuhezGSgoIIiIRhQOpm2y3wszMDNPT08n+Cb5y4ANCtlshb8Eiv3TyarsVFAxERJZr\nqncDZPvK61bw4WBiYmLVAxLhqWCQHXOQVzEoNSBRAUFEJKLKgdRMqW2Zs0skh8FgcnIytTSyrxz4\nbZn9WgbhDIUwCDQ3Ny9bv6CtrY3W1tZkWqOCgYhIeQoHUlPZjZX8z0tLS8tCwcTEBMVikbGxMUZH\nRykWi0xMTCTVA7+2gXOO5ubm5OJfKBRS9zs7O9m7dy+7du1Ktmbu7OykUCjkhgMFBBGRNIUDqans\nksjhbTjOwFcMxsbGGBkZYWxsLKkg+F0X/UwFeGpjpXAZZH90d3ezZ88edu/eTX9/Pz09PXR1dVEo\nFGhtbVUwEBFZgcKB1EwYCvKObOXAVw18OMhOZ8xuyezDQXd3Nz09Pcmt30Nh165d9Pf3p3Zf9JWD\nMBRonQMRkTSFA6mpbEAIl0n2lQM/O8FXDkZHRxkdHU0WQArHHITdCm1tbXR0dNDd3Z1sqLRz586k\nWtDb20tvby89PT1Jt4KvHPjBiwoGIiLLKRxITeUFA79kct5ARF85GB0dXbZqYnZLZj++oKenh76+\nvqQrYdeuXcmWzNmtmX3lIK9qoJAgIhJROJCayY43CIPB4uJisuvi9PQ0ExMTy8KBn7IYviZbOfDd\nCv39/ezevZv9+/ezZ88eOjo6kqO9vZ2Ojo5U5QDSCyeFbVZAEJHtTuFAaqpU5cBPYQwXPvKzFXy3\nQt4gxqWlJeCpMQe+W6Gvr49du3axb98+9u3bl0xh9NMYszsxiohIaQoHUjPZisHCwkKyGqKvGoSH\n347ZdyWE3+D9jostLdF/smFloKOjg66urtQRbtXsb7NbM4uISD6FA6mZMByEOy/6ww80DBc48t0G\nQGrFw+yUuZ/ZAAAgAElEQVTR2dmZ6jbw6xyEix2F+ydoPIGIyOpVvHyymV1kZp8ws++b2ZKZXZZ5\n/oPx4+FxZ/WaLJuFDwfZzZX8DIVwz4Rw9cOw66C5uTnpFvBjB3x1wAeE7GqI2nlRRGR91lI56AK+\nDvwd8LES53waeB3g/zWeW8PnyBaQVznw4wzywkG2cuDDQfbwCx6VqhyEyyqrciAiUpmKw4Fz7i7g\nLgAr/a/tnHPuxHoaJptftnLgw0G4NXO2WyFv34SWlpakKuCXSu7q6kqCQXt7e2rwYXY7Zq1nICJS\nmVrtyvgSMztmZt8ys5vNbGeNPkcaWN6YA9+t4Fc99JWD7HbMEFUO/GDCbLdCucpBqZ0XRURkdWox\nIPHTRN0NjwE/BPwpcKeZXehKbdMnW5LfkjlbOQi7FfK2Ys5WDsJw4ENB3pgDv45Ba2trqkqgqoGI\nSGWqHg6cc7cHP/6bmT0C/CfwEuCfq/15Uj8rZb1s5SAckBiOOVipW6G1tZVCoZCEA7/iYTYchDMV\nvLyFjkREpLyaT2V0zj1mZsPAMygTDg4ePEhvb2/qscHBQQYHB2vcQqkVXzkI1zfw4w2yGyr5fRPC\nmQp+rEF7eztdXV3Jxkp+uWS/oVLe6oew9QPB0NAQQ0NDq
ceKxWKdWiMiW0nNw4GZnQHsAn5Q7rxD\nhw5x4MCBWjdHNoD/5u+nJeZNZcx2K+R1KbS0tFAoFJJxBt3d3fT29tLX10dfX1+yFXMYDvzYgjAY\nbNWQkBeeDx8+zMDAQJ1aJCJbRcXhwMy6iKoA/l/cc83sOcBIfFxNNObgaHzenwP/DtxdjQZL48p2\nM4RjDsKZCqXWOShXOfB7KPT29rJz585kt8UdO3Yk4cDPUtiqYUBEZKOspXLwPKLuARcffxE//mHg\nd4AfB14L9AFPEoWCP3LOLay7tdKwwmAQVg6yeymE3QphOChVOfD7J/jKQV9fX7Ilsx97kFc5gK1b\nMRARqbW1rHPwBcpPgXzZ2psjm52/uPvdGMPKQTiV0YeDcLbCasYc+MrBjh07UtMZ/cqIqhyIiKyf\nJn/LuoWBILzNqxz48QbZFRLzVkf0sxSyYw76+/tLjjnQjouVMbM3mNnDZlaMjy+b2cuC5wtmdpOZ\nDZvZhJndYWZ7M+9xppl9ysymzOyomV1nZvq3RWQT08ZLUhV5AWGlMQd+GuNaKge+WuCnL4aVg3JU\nVVjmCeCtwLfjn18H/KOZ/YRz7ghwA/By4FXAOHAT0ZiiiwDiEHAnURfiBcDpwK3APPCODftTiEhV\nKRzIuvi1DPz6BOGtv/hnt2aemZlhZmYmFQqcc6m9FPyiRn55ZL86ou9KaG9vT1ZC9GsbNDc3a7Gj\nCjnnPpV56B1m9kbgAjP7PnAl8Oq4OxEzuwI4YmYvcM49CFwC/Ajw0865YeARM3sn8Gdmdo1zbnHj\n/jQiUi0q/cm6hN0GYVVgcnIydfh1DcLuhDAYNDU1pboRwi2ZV1oiWdsyV4eZNZnZq4FO4H5ggOgL\nxD3+HOfco8DjwIXxQxcAj8TBwLsb6AWevRHtFpHqU+VA1iy7sVL2mJiYSAWDcIbC7OxsaoaDv7i3\ntLTgnKt47wQFg7Uzsx8lCgPtwATwS865b5nZc4F559x45iXHgP3x/f3xz9nn/XMP16bVIlJLCgey\nLtlFjvwxPz+fCgd5lQN/Uc/bQbHU3glhQAirBgoI6/It4DlE049fBdxiZi8uc74RTWNeifZSEdmk\nFA5kzcK9E/w0xXBcwcTExLLqgR9vMDs7m1QA/MU9HD9QrnJQbltmqVw8LuD/xD8eNrMXAG8Gbgfa\nzKwnUz3Yy1PVgaPA8zNvuS++zVYUltGy6SKrt5FLpiscyLpkKwfhVMVs1SAMBrOzsxQKBZqampaN\nOWhra0umKPrBiP5xPyshXNMge8i6NQEF4CFgEbgY+DiAmZ0HnAV8OT73fuDtZrY7GHfwUqAIfHOl\nD9Ky6SKrt5FLpiscyLr4cBBux+wHJGa7FcKQMDs7m4wxgPy9FMLKQd7Oi9ltmaVyZvZuom3WnwC6\ngdcAPwW81Dk3bmYfAK43s1Gi8Qg3Avc5574av8VniELArWb2VuA04FrgfVoVVWTzUjiQNQsHJIbh\nwFcN8mYrZLsVFhcXS85WyOtW8F0LWuyoavYBtxBd1IvAvxIFg3+Knz8InALuIKom3AVc5V/snFsy\ns0uBvyKqJkwBHyLaY0VENimFAynLL2bkj/CxxcXFJAxMTU0xPj5OsVikWCwyNjbG2NgYxWKRycnJ\nZF2DMAw0NzenNlbasWMHPT09yR4K2Y2VSu26KGvnnHv9Cs/PAW+Kj1LnPAFcWuWmiUgdKRxIWdmF\njcJFj/zaBr5S4MPByMgIo6OjFItFxsfHU+Hg1KlTAKlKQTYc+CWSfTjo7OzU8sgiIhtI4UDKCpdB\n9qHA3892I0xMTCRVg5GREcbHx5PxB36TpbBy4GcnhOHAVw38lsx5eyeoaiAiUlsKB7KiMBT4FRHD\n6Yth5WBsbIzR0VFOnjzJxMREMq3Rb7AUrogYDkD04aC3tzfZlrm7u1uVAxGROlA4kLJ85cCHg4WF\nBRYXF1lcXMydneC7F
U6ePMnU1FSyVfPCwkIy5gCgubk5tbFS2K3Q19dHX1/fshkLfsyBKgciIrWl\ncCBlhcHAhwK/PLLfVKlU5WB6ejqpNvjDj1vI7rqYFw7ypjGqciAiUnsKB7KibEAIw0E45sAPSPTh\nYGZmpuR7rhQO+vv7k2mLmsIoIrKxFA62uXDzozzhCoh+jIHfPyG7lkH2mJubS5Y2DvdB8BsmhVsy\n+3EHXV1d7Nixg66uLm3JLCJSJwoHskwYGPwCR+GyyP4YHx/n5MmTjI2NMTExwfT0dGpGAjxVIcge\nra2tuZsrZXdezO6hoGAgIlJ7CgeSCBc58vyshJmZmWTQoV8W2Q8+LBUOzCw18DDbTVAuHPguBG3L\nLCKy8RQOBFgeDPxtWDnwqyD6lQ9HR0eTxY7CcLCwsMDS0lKyHbOvFPhtl313wkqVg2x3hMKBiMjG\nUDiQZcsih/dPnTqVDDz01YLR0VFGRkaShY78KojT09PMzs7mdiu0tbUlaxr4w4cDHxDywoG6FERE\nNp7CgQDproRwL4WwcuBnJIyMjHDixIlkLYNwx0XfrbC0tJR0CWTDgR94uJrKQbgVswKCiMjGUDiQ\nRHaTJV85CLsVwqmKw8PDyeqHfrfF7JiDbLdCOG3RhwMfDLLhIAwGgLoVREQ2iMKBpJQLB75yMDo6\nyvDwMMePH0+tfujvLywslO1WCKcr+spBtluhtbW1zr8JEZHtS+Fgi1tpHQPfdeCPcCXExcVFTp48\nyejoKGNjY4yPjzMxMcHU1FRSMQhfFwYCvx1ztlrQ3d1Nb29vsrGSDwnt7e20tbWluhNERKQ+FA62\nOb/1sl/YaG5ujvn5+eT+8PAww8PDyawEv8Pi7OxsMithaWkJIJm66O+Hqx92dXXR3d2d2pK5t7c3\n2VgpDAcKBiIi9dVUyclm9jYze9DMxs3smJl93MzOy5xTMLObzGzYzCbM7A4z21vdZku1+HAwOzub\nTFX0Aw6PHj3K8ePHk+pBOCvBjy3wgw/DMQbNzc3JegZ+9UPflRAuj9zb20t3d3cqHPipiyIiUj+V\n/it8EfBe4IXAzwKtwGfMrCM45wbg54FXAS8GTgc+tv6mylqs9C3cdyvMzc0lqx6OjY0xPDzMsWPH\nklkJvlvBdyn48QXhZkqlBiCGWzKHlQPfraDKgYhIY6moW8E594rwZzN7HXAcGADuNbMe4Erg1c65\nL8TnXAEcMbMXOOcerEqrZd38+IC8ysHY2FhyFIvF5NZ3K/j1DBYWFpLxAf7bfvhzqW4FXznwwSHc\nktnvnyAiIvWz3jEHfYADRuKfB+L3vMef4Jx71MweBy4EFA4aTBgOwtkIfqqiXy45XDbZT1lcWFhI\n1jLwgSDcWCnbrZAdc+CXUfazFMIBiSIiUj9rDgcWfb27AbjXOffN+OH9wLxzbjxz+rH4OWkA2QWP\nspWD0dFRTpw4wfHjx5mcnEwWOAoPHw6A1AJF4UZL4UyFvDEHeRsyqXIgIlJ/66kc3Aw8C/jJVZxr\nRBUGqaNsKICntmTOWyJ5eHiYqakpZmdnU4cPBqdOnUpmJ4TrGfjDL43sg4E/uru76e7uzt3KWdMY\nRUTqb03hwMzeB7wCuMg592Tw1FGgzcx6MtWDvUTVg5IOHjxIb29v6rHBwUEGBwfX0kSJ5W2oFN4P\n1zUIFzOam5tLgoAffOjXMwgHIPpAEO6Z4I/+/v7cgYdhhSC7CqKs3tDQEENDQ6nHisVinVojIltJ\nxeEgDga/CPyUc+7xzNMPAYvAxcDH4/PPA84C7i/3vocOHeLAgQOVNkdWIRsMsisg+oDgw4EPBn5s\nQalwANDc3EyhUMitEPT19bFz585kPQO/CmJra2tqM6XsrosKCquTF54PHz7MwMBAnVokIltFReHA\nzG4GBoHLgCkz2xc/VXTOzTrnxs3sA8D1ZjYKTAA3AvdppkJ9+TDgL+zhzz4YhOHABwQ
/K2E1lYNw\nwGG59Qyy4UChQESksVRaOXgD0diBz2cevwK4Jb5/EDgF3AEUgLuAq9beRKmGbCDwt2G3QrZLwR/h\nOWE4CJdJ7ujoYMeOHclMBF8x6OvrS3Ur+MpBXreCAoKISGOodJ2DFeeYOefmgDfFhzSAvG4Fv+xx\nXreCDwh+sSMfCPxt2K3gZyWE4WDnzp3s2bMn6U7wgxDDcBAOPFQwEBFpLNpbYZvIhoIwHOR1K/iq\nwfz8/LJqgw8Gfi+FsHLQ09PDzp072b17N729vcmui37mQtitEIYDTwFBRKT+FA62iexAxFIBITvm\nwK9lkDcNMjvmIOxW2L17Nz09PbS3t1MoFJI1DwqFQjJbQUREGpPCwRaQty1z+FhYGQjvLywsJMsj\nh/sm+BkKfhvm7HoE/igUCvT09CRHOFPBb8UcrnvQ2tqqhY5ERDYBhYMtJLumgb/vV0AMpyf627Gx\nMU6cOMHIyAjFYpGpqalkx0UfDFpbW5cdLS0tdHZ2smvXrmRGgg8IfspiNhBogSMRkc1B4WCLKLXI\nkXOO+fl5ZmZmmJqaSpZD9sfY2BjHjx9nZGQkqR7Mzs6yuLgIkIwp8N0Cvmugvb2drq4udu/encxM\n6O7upqurKxl46PdK8OFAAUFEZHNQONhC8hY58hsrhUsjj4+PJ7djY2OMjIykKgc+HISVg3BQoQ8A\n3d3dSeWgr68vCQdh5cCHAr+hksKBiEjj0/Z3W0SpYLC0tMT8/Hyy62KxWGRkZIQTJ05w9OhRjh49\nyokTJxgdHU0qB75bAaLKQWtrK4VCga6urmRxo127drFnz54kHPT09KTCgZ+V4I9w90aFg8ZhZm8z\nswfNbNzMjpnZx+NVTcNzCmZ2k5kNm9mEmd1hZnsz55xpZp8ysykzO2pm15mZ/n0R2aRUOdhi8hY6\nylYOfDg4ceIEY2NjTE5OMj09nXQ5lKsc+HAQHn6xo7zKQTiAMdy9URrGRcB7gX8h+vfgT4HPmNn5\nzrmZ+JwbgJcDrwLGgZuAj8WvJQ4BdwJPAhcApwO3AvPAOzbsTyIiVaNwsAVkxxiEwSCsHExNTSXh\n4Pjx4xw7dozR0dFkg6Vw10VfOWhqaqKtrS0ZY9DT00N/f/+yikE4YyHcQ0GrIDY259wrwp/N7HXA\ncWAAuNfMeoArgVc7574Qn3MFcMTMXhAvi34J8CPATzvnhoFHzOydwJ+Z2TXOucWN+xOJSDUoHGwR\npdYw8Msi+8rB+Ph4sh2zDwd+emN466cx+vECvluhu7s7WQVx165dyZRFP33RL3rkZyrIptNHtET6\nSPzzANG/E/f4E5xzj5rZ48CFwINE1YJH4mDg3Q38FfBs4OENaLeIVJHCwRaQ3XrZX9z9xd7PTJie\nnmZmZiZZ+dAHAX9+uG9COB3Sr3PQ0tKSjD8IFzXyQUDjCjY3i/7SbgDudc59M354PzCf2YIdoi3Y\n9wfnZLdkPxY8p3AgsskoHGwBYTjwF31/Ozc3l4wp8MEguw1zdkOlMBz4cQJhOPBTG7NrGYQzEmRT\nuhl4FvCTqzjXiCoMK1nNOSLSYBQOtoBw4GG4/LFf7MhXDvLCwcLCQmoDJt8d4ZULB6ocbB1m9j7g\nFcBFzrkng6eOAm1m1pOpHuzlqerAUeD5mbf027lnKwopBw8epLe3N/XY4OAgg4ODFf4JRLa+oaEh\nhoaGUo8Vi8WafJbCwRaQrRz4lRD94SsHpboVspsxrbZy0N7eniyNrMrB5hUHg18Efso593jm6YeA\nReBi4OPx+ecBZwFfjs+5H3i7me0Oxh28FCgC36SMQ4cOceDAgar8OUS2urzgfPjwYQYGBqr+WQoH\nW4APB75yMDs7m4SBcFXEUt0KYSBYSzjQWgabl5ndDAwClwFTZua/8Redc7POuXEz+wBwvZmNAhPA\njcB9zrmvxud+higE3GpmbwVOA64F3uecW9jIP4+
IVIfCwRYQ7qwYVg6mpqaYmJhIVQ58OPABwa9n\nkHfA6scc+HEHqhxsOm8gGhfw+czjVwC3xPcPAqeAO4ACcBdwlT/RObdkZpcSzU74MjAFfAi4uobt\nFpEaUjjYAsLKgV+vwC9qNDExkRpzkO1W8Fsyh+8VKhUO/D4L4WDEcMfFbOXAzHJ3j5T6cs6tmOSc\nc3PAm+Kj1DlPAJdWsWkiUkcKB5tAdpfF7P28YOCrBsViMQkIvnLgKwZ+LYOVhCsb+qAQbqQUroAY\nBoO8gCAiIo1P4WCTKFX697suhl0JfrGjYrHI2NhYEhCy4UDf5EVEJI/CwSZRamlkHw78tEXfneB3\nXBwdHU2NO5ibm2NhYWHVVQMREdl+FA42keyUQ78uQd4gRF85GB0dTa2QqMqBiIisROFgkwgrB37B\nIn+ElYOwW8FXDvwgxHAwog8WIiIiWQoHm0TYleBDgR9UGFYOwm4FP+bAT1vMrm+gyoGIiORRONgE\n8nZcDDdZKlU58N0K/rzsXgoKByIikkfhYJPI61bwF/xwVcRwzIHvVsiOVQgHM4qIiGQpHGwS4f4J\n4a6LYXdB2GWQ3UgpbyvmcBVEf5u97xc58osg+bUNwrUPsusbiIjI5qZwsEmEVQMfEPwyyH5JZL/i\noe86yAsG2YAApC7u2Qt/uAJiduGjvGCgkCAisvlVtAi+mb3NzB40s3EzO2ZmH493aAvP+byZLQXH\nqXhzF1mjvGDgKwbZvRJ8QAgrB+X2TgCWrXzow4DfMyGsHlSyIqKIiGxOle6QcxHwXuCFwM8CrcBn\nzKwjOMcB/4NoP/f9RDu0vWX9Td3eVgoI2W4F37WQHWeQla0aZPdQKFU5yFYNFBBERLaOiroVnHOv\nCH82s9cBx4EB4N7gqWnn3Il1t04SPhyspnKQDQb+9b5asJrKQRgSVgoI/j0UDEREtob17q3bR1Qp\nGMk8/hozO2Fmj5jZn2QqC7IGebMUfDjwAaHcgMSVxhz4roJsKAi7FFYabyAiIlvDmgckWnRFuAG4\n1zn3zeCpjwDfBZ4Efhy4DjgP+JV1tHNbC6cxhpUDP4Uxb8yBH5CYFwZCeV0KeeFgNQFBYUFEZGtY\nz2yFm4FnAS8KH3TO/W3w47+Z2VHgc2b2dOfcY6Xe7ODBg/T29qYeGxwcZHBwcB1N3DpKrZAYLm5U\navqiF160/f3m5uZkfEFbWxuFQiF1f8eOHXR1ddHR0UF7ezttbW3LBifmhQIFhNobGhpiaGgo9Vix\nWKxTa0RkK1lTODCz9wGvAC5yzv1ghdMfAAx4BlAyHBw6dIgDBw6spTmyglLTFP1UxY6OjpLHzp07\n2bNnDzt37qS3t5cdO3bQ0dFBa2trydkLsjHywvPhw4cZGBioU4tEZKuoOBzEweAXgZ9yzj2+ipc8\nl2hcwkohQmok7DYIb5uamlLVga6uruS+v+3v72fnzp309/fT29tLV1dXUkFYaeaCiIhsThWFg3i9\ngkHgMmDKzPbFTxWdc7Nmdi7w68CdwEngOcD1wBecc9+oXrOlEmaWjBfIzkZob2+nq6uLnp6ekkdv\nb29yP6wchBUDVQ5ERLaOSisHbyCqAnw+8/gVwC3APNH6B28GuoAngP8JvHtdrZR1CSsH4QBD36Xg\nw0FfXx/9/f3JbX9/f6qq4CsK7e3tSbdC3uqKCggiIptbpesclJ366Jz7HvCS9TRIqq/cAkft7e3s\n2LGDnp4e+vv72bVrV+ro7Oykvb192RGGA/8ZCgYiIluD9lbYBvLCQWtrK4VCIakcdHd309fXx65d\nu9izZw979+5lz549SRDIHi0tLUmlwH9G+HkiIrJ5KRxsA3nhoFAopMKB71bw4WD//v3s378/d2xB\n+LOIiGw9CgebRFNTU6pLoFAoJGsb+AWR/MJHEK1f0NbWRkdHR7J2gQ8E4c89PT3s3r07mY3Q3d2d\nmpHQ0tKSO6Z
AXQgiIluXwsEmEH7zb2tro729PbXAkV8FsampKRlH0NnZmXQV+EWN/BEuetTV1cWu\nXbuSdQx8MPAVAwUBEZHtR+FgkwgrB4VCAedcMkXRl/n9c2EwmJycTI0TyI4b6OzspLe3Nzn8aoh5\n4UABQURke1A42AR85aClpYW2trYkGPjH/FgCHwx27NjB9PQ009PTzMzMpM4J90nw6xzs2LEjdfjK\ngf8M34bwVkREti6Fg03CX8x9V4L/2XcP+K6EcBtnfxsugBSukujXPcgumZztVvAUEEREtof1btlc\nE9nNZBpFvdqVrRy0t7cnsww++9nP0tvbS39/P7t372bfvn2cfvrpnHHGGZx99tmce+65PP3pT+ec\nc87hrLPO4swzz+TMM8/kjDPO4PTTT+e0005j7969qb0Tyo05WG0w0N9hZRq1XSKyPSkcVKCe7cqO\nOfDh4JOf/CS9vb3JBkk+HJx55pmcc845nHvuuZxzzjmcffbZnH322Zx11lmcccYZPO1pT+NpT3sa\n+/fvT22sFA5IXM+eCfo7rEyjtktEtid1K2wC4YW5ubk59VxzczNdXV31aJaIiGxRDVk5EBERkfpR\nOBAREZGURuhWaAc4cuRI8kCxWOTw4cN1a1ApjdiuRmwTqF2Vqla7gv+P2tf9ZiKybTVCODgH4PLL\nL089ODAwUI+2rKgR29WIbQK1q1JVbtc5wJdXOsnMLgL+ABgATgNe6Zz7ROacdwGvB/qA+4A3Oue+\nHTzfD7wPuBRYAj4GvNk5N1WVP4mIbLhGCAd3A68BvgPM1rcpIpteO1EwuHuV53cBXwf+juiinmJm\nbwV+F/gN4DHgvwF3m9n5zrn5+LSPAvuAi4E24EPA+4HLs+8nIptD3cOBc+4k0T8uIlIdK1YMPOfc\nXcBdAJY/V/XNwLXOuU/G57wWOAa8ErjdzM4HLgEGnHNfi895E/ApM/t959zRdf1JRKQuNCBRRHKZ\n2dOB/cA9/jHn3DjwAHBh/NAFwKgPBrHPAQ544QY1VUSqTOFARErZT3SRP5Z5/Fj8nD/nePikc+4U\nMBKcIyKbTN27FURk0zGi0LDeczh48CC9vb2pxwYHBxkcHFx760S2qKGhoWWrqRaLxZp8lsKBiJRy\nlOgiv4909WAv8LXgnL3hi8ysGehnecVhmUOHDnHgwIGqNFZkq8sLzocPH67JDKyG6lYws6vM7DEz\nmzGzr5jZ8+vcnqvNbClzfLMO7bjIzD5hZt+P23BZzjnvMrMnzWzazD5rZs+od7vM7IM5v787a9ym\nt5nZg2Y2bmbHzOzjZnZe5pyCmd1kZsNmNmFmd5jZ3lLvuYHt+nzmd3XKzG6uZbvKcc49RnTxvzho\nYw/RWAI/6PF+oM/Mnhu89GKiUPHABjVVRKqsYcKBmf0a8BfA1cBzgYeJpkztrmvD4BtE35z2x8dP\n1qENfrrZVeSUaoPpZr8NvACYIvrdtdWzXbFPk/791bpefBHwXqIL2M8CrcBnzKwjOOcG4OeBVwEv\nBk4nZxpfHdrlgP/BU7+v04C31LJRZtZlZs8xs5+IHzo3/vnM+OcbgHeY2S+Y2Y8BtwDfA/4RwDn3\nLaJpk39jZs83sxfFf84hzVQQ2bwaqVvhIPB+59wtAGb2BqJ/wK8Erqtjuxadcyfq+Pnrnm5Wx3YB\nzG3k788594rwZzN7HdGAuQHg3vib75XAq51zX4jPuQI4YmYvcM49WI92BU9Nb/B/b88D/pkomDii\ngA7wYeBK59x1ZtZJtG5BH/Al4OXBGgcAv060CNLniBZBuoPov0kR2aQaonJgZq1E/0iGU6Yc0T82\nF5Z63Qb54bhs/p9mdlvwjaohrHK6WT29JC6jf8vMbjaznRv8+X1EF72R+OcBolAc/r4eBR5nY39f\n2XZ5rzGzE2b2iJn9SaayUHXOuS8455qcc82Z48rgnGucc6c75zqdc5eEqyPGz
4855y53zvU65/qd\nc7/pnJuuZbtFpLYapXKwG2gmf8rUMze+OYmvAK8DHiUq8V4DfNHMfrSBloZdzXSzevk0Ubn+MeCH\ngD8F7jSzC+PwV1NxNeMG4F7nnB8rsh+YjwNUaMN+XyXaBfAR4LvAk8CPE1XMzgN+ZSPaJSLiNUo4\nKGVV06FqxTkXLkH7DTN7kOgf718FPlifVq1aXX93AM65sEvj38zsEeA/gZcQlbJr7WbgWaxunMhG\n/r58u14UPuic+9vgx38zs6PA58zs6fHgQBGRDdEQ3QrAMHCKaCBWaC+rmA61UZxzReDfgZrPBKhA\nON0s1FC/O0hGvw+zAb8/M3sf8ArgJc65J4OnjgJt8diD0Ib8vjLt+sEKpz9A9HfbSP+9icg20BDh\nwDm3ADxEesqUxT+vep34WjOzHUTl8ZX+Ud8wq5xu1hDM7AxgFzX+/cUX4F8Efto593jm6YeARdK/\nr/OAs4im5dWrXXmeS1TNaJj/3kRke2ikboXrgQ+b2UPAg0SzFzqJdnirCzN7D/BJoq6EpwF/THRh\nGVCUctUAAAqBSURBVCr3uhq0o4vo26OfEXCumT0HGHHOPcFT082+TbS75bUE083q0a74uJpozMHR\n+Lw/J6q8rHbHwLW06Wai6ZKXAVNm5isqRefcrHNu3Mw+AFxvZqPABHAjcF+tZiqspl1mdi7RqP87\ngZPAc4j+n/iCc+4btWqXiEiehgkHzrnb4zUN3kVUIv86cEmdpxGeQbRj5C7gBNGUswvinSQ3UjWm\nm210u36HaFDda+M2PUkUCv4orhTVyhvitnw+8/gVRHP0IQqep4im3BWIpmNeVcM2raZd80TrH7yZ\naP2IJ4D/Cby7xu0SEVmmYcIBgHPuZqLBWg3BOdcQC7zH8/HLdgE5564hmk2xYVbRrpdtVFs859yK\nXWXOuTngTfGxIVZql3Pue0QDNUVE6q4hxhyIiIhI41A4EBERkRSFAxEREUlROBAREZEUhQMRERFJ\nUTgQERGRFIUDERERSVE4EBERkRSFAxEREUlROBAREZEUhQMRERFJUTgQERGRFIUDERERSVE4EBER\nkRSFAxEREUlROBAREZGUlno3QEREZCt6/PHHGR4eLnvO7t27OeusszaoRauncCAiIlJljz/+OM98\n5vnMzk6XPa+9vZNHHz3ScAFB4UBERKTKhoeH42BwG3B+ibOOMDt7OcPDwwoHIiIi28f5wIF6N6Ji\nGpAoIiIiKQoHIiIikqJwICIiIikKByIiIpKicCAiIiIpCgcisu0NDQ3VuwkJtaW0RmpPI7WlFjSV\nUUSqwsyuAn4f2A88DLzJOffV+rZqdYaGhhgcHFz3+6y0It5qVsOrVluqoZHaAqtrTzX+DqrVls1M\n4UBE1s3Mfg34C+C3gAeBg8DdZnaec678+rE1trS0xOHDh8ueMzMzs+7PWc2KeIVCOx/72B2cdtpp\nNW3LdrWav4PVrEi4mmWPt/rfk8KBiFTDQeD9zrlbAMzsDcDPA1cC19WvWaNMT08xMDBQ9qympmYe\nf/zxdX2jXHlFvC8xN/d7XHrppTVvy3a18t/ByisSrnbZ46amJj71qU+VDHpHjhyprPENRuFARNbF\nzFqBAeBP/GPOOWdmnwMurFvDAJgCTrHSErZLS5fzpS99ifPPL3VOJeXoUiviHQGWNqwtq/n2Ozc3\nR6FQKPn86OjoilWXjdw4aGZmpmx7nrogl1+VsNyF+8iRI6tY9vhLLC39vysGvc1M4UBE1ms30Awc\nyzx+DHhmide0w/q+Xf3Hf/xHfG8I2FXirEfi28fKvNPXALj88svLfl5ra4H3vOfP2b17d+7zjz3m\nP+NOoiCQdd+GtWV4eJg/+IM/ZGFhtuz7RGPSl8qesVLVZaW2QPQte2mp/OesdM7w8DD33PNPK7Yn\nUurv4GuArfj7jZT7e3o0vv2/gVJdRI8A/
1imLU99xnr+Pwhe277mN8lhzrlqvp+IbDNmdhrwfeBC\n59wDwePXAT/pnPu/cl7z68BHNq6VIlvea5xzH63Wm6lyICLrNUxUu9+XeXwvy6sJ3t3Aa4DvACt9\nvRWR0tqBc4j+n6oaVQ5EZN3M7CvAA865N8c/G/A4cKNz7j11bZyIVEyVAxGphuuBD5vZQzw1lbET\n+FA9GyUia6NwICLr5py73cx2A+8i6l74OnCJc+5EfVsmImuhbgURERFJ0d4KIiIikqJwICIiIikK\nByJSc2bWb2YfMbOimY2a2d+aWdcqXnehmd1jZpPxaz9vZqWX9Ktxe4LXf9rMlszsso1uS3z+jWb2\nLTObMrPvmtlfmlnPGj77KjN7zMxmzOwrZvb8Fc7/L2Z2JD7/YTN7eaWfWa32mNnrzeyLZjYSH59d\nqf21akvmda+O/9v4h3q1xcx6zewmM3syfs23zOxllXymwoGIbISPEq1FezHRngsvBt5f7gVmdiHw\naeAu4Hnx8T5WWtKvRu0J2nWQaF2Hag3YqrQtpxMty/d7wI8CvwG8DPjbSj402CzrauC5RDtp3h0P\nLM07/8K4rX8D/ATwv4D/ZWbPquRzq9Ue4Kfi9rwEuAB4AvhMvCjXRrfFv+5s4D3AF9fbhrW2JV7O\n/HPAWcAvE61S+ptEC5WtnnNOhw4dOmp2AD9CdEF/bvDYJcAisL/M6+4HrmmU9sTnPQf4LtECT0vA\nZfVqS+Z9fgWYAZoqeM1XgL8Mfjbge8BbSpz/98Ancv6Obq7S30tF7cl5fRNQBC6vR1viz/8ScAXw\nQeAf6vF7Ad4A/AfQvJ7PVeVARGrtQmDUOfe14LHPEX3zfmHeC8xsT/zcsJndZ2ZH4y6FF9WjPXGb\nOoi+qV7lnDtehXasuS05+oBx59yqqirBZln3+MdcdGUpt1nWhfHzobvLnL9qa2xPVhfQCozUqS1X\nA8edcx9cz+dXoS2/QBza4v9vHjGzt5lZRdd7hQMRqbX9QOpi6pw7RfSP+P4Srzk3vr2aqMR+CXAY\nuMfMfqgO7QE4BNzrnPvf6/z8arQlEZeX38Equ0Vi5TbLKvW5+ys8vxJraU/WnxOVzrMBpuZtiUPr\nFcDr1/nZ624L0f87/4Xo+v5y4FrgvwJvr+SDFQ5EZE3M7E/jgVeljlNmdl65t6B0v73/t+mvnXO3\nOOceds79HtF2eFdudHvigYc/Q7Ty44pq/LsJP6cb+BTwDeCPV9O2ld5yNZ+7jvMrtdrfwx8Cvwq8\n0jk3v5FtMbMdwK3AbzrnRmv02atqS6yJKDz8lnPua86524F3A2+s5AO0QqKIrNV/J+pbLef/AEeJ\n+ugTZtYM9FN6Y6YfxLfZvWyPEA202uj2/DTRN7KimYWP/4OZfdE59zMb2BZ/3g6isv4Y8MtxxWG1\n1rJZ1tEKz6/EWtoDgJn9PvAW4GLn3L/VoS0/BJwNfNKe+o+jKW7bPPBM51y5/Z+r2RaI/t+Zj7sf\nvCPAfjNrcc4truaDFQ5EZE2ccyeBkyudZ2b3A31m9tygb/1iom8/D+S9xjn3HTN7kmikdeg84M6N\nbg/wp0Sj9EPfAN4MLOtmqHFbfMXgbqJBiJdV+m3ZObdg0T4YFwOfiN/T4p9vLPGy+3Oe/7n48XVZ\nY3swsz8gKpe/NDNuYyPbcgT4scxj7wZ2AP8P0SyKjWoLwH3AYOaxZwI/WG0w8B+uQ4cOHTU9iC7o\n/wI8H3gRUffArcHzpxP9I/u84LE3A6PAq4i+nV0LTAFPr0d7ct5j3bMV1tIWoovOV4j2r3g60bdK\nf1QyW+FXicLFa4lmTbyfKNDsiZ+/BfiT4PwLgXmiKZTPBK4h2m77WVX6b6TS9rwl/vxfyvwOuja6\nLTmvr+ZshUp/L2cQzdr4S+CHiabHHgX+sKLPrUbjdejQoaPcQTSa/rb4H61Rom/hncHzZxOVT1+c\ned1bi
KYOTgD3AhfWsz2Z9zhVpXBQUVuI5vefyhxL8e1ZFX727wDfiS8+95MOZ/8E/F3m/FcB34rP\n/1eizbWq+d/JqtsDPJbzezgF/NFGtyXntVULB2v8e3oh8GVgmmha41uJ91Ja7aGNl0RERCRFsxVE\nREQkReFAREREUhQOREREJEXhQERERFIUDkRERCRF4UBERERSFA5EREQkReFA5P9vt44FAAAAAAb5\nW++eQ1EEwMgBADByAACMHAAAE5OOiZtgWJb5AAAAAElFTkSuQmCC\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f8444471358>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# Let's convert the uint8 image to 32 bit floats and rescale \n",
-    "# the values to be centered around 0, between [-0.5, 0.5]. \n",
-    "# \n",
-    "# We again plot the image and histogram to check that we \n",
-    "# haven't mangled the data.\n",
-    "scaled = image.astype(numpy.float32)\n",
-    "scaled = (scaled - (255 / 2.0)) / 255\n",
-    "_, (ax1, ax2) = plt.subplots(1, 2)\n",
-    "ax1.imshow(scaled.reshape(28, 28), cmap=plt.cm.Greys);\n",
-    "ax2.hist(scaled, bins=20, range=[-0.5, 0.5]);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PlqlwkX-O0Hd"
-   },
-   "source": [
-    "Great -- we've retained the correct image data while properly rescaling to the range [-0.5, 0.5].\n",
-    "\n",
-    "## Reading the labels\n",
-    "\n",
-    "Let's next unpack the test label data. The format here is similar: a magic number followed by a count followed by the labels as `uint8` values. In more detail:\n",
-    "\n",
-    "    [offset] [type]          [value]          [description] \n",
-    "    0000     32 bit integer  0x00000801(2049) magic number (MSB first) \n",
-    "    0004     32 bit integer  10000            number of items \n",
-    "    0008     unsigned byte   ??               label \n",
-    "    0009     unsigned byte   ??               label \n",
-    "    ........ \n",
-    "    xxxx     unsigned byte   ??               label\n",
-    "\n",
-    "As with the image data, let's read  the first test set value to sanity check our input path. We'll expect a 7."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:22.925176",
-     "start_time": "2016-09-16T14:49:22.897739"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 90,
-     "status": "ok",
-     "timestamp": 1446749126903,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "d8zv9yZzQOnV",
-    "outputId": "ad203b2c-f095-4035-e0cd-7869c078da3d"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "magic number 2049\n",
-      "label count 10000\n",
-      "First label: 7\n"
-     ]
-    }
-   ],
-   "source": [
-    "with gzip.open(test_labels_filename) as f:\n",
-    "    # Print the header fields.\n",
-    "    for field in ['magic number', 'label count']:\n",
-    "        print(field, struct.unpack('>i', f.read(4))[0])\n",
-    "\n",
-    "    print('First label:', struct.unpack('B', f.read(1))[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "zAGrQSXCQtIm"
-   },
-   "source": [
-    "Indeed, the first label of the test set is 7.\n",
-    "\n",
-    "## Forming the training, testing, and validation data sets\n",
-    "\n",
-    "Now that we understand how to read a single element, we can read a much larger set that we'll use for training, testing, and validation.\n",
-    "\n",
-    "### Image data\n",
-    "\n",
-    "The code below is a generalization of our prototyping above that reads the entire test and training data set."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:23.525119",
-     "start_time": "2016-09-16T14:49:22.928289"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 734,
-     "status": "ok",
-     "timestamp": 1446749128718,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "ofFZ5oJeRMDA",
-    "outputId": "ff2de90b-aed9-4ce5-db8c-9123496186b1"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracting /tmp/mnist-data/train-images-idx3-ubyte.gz\n",
-      "Extracting /tmp/mnist-data/t10k-images-idx3-ubyte.gz\n"
-     ]
-    }
-   ],
-   "source": [
-    "IMAGE_SIZE = 28\n",
-    "PIXEL_DEPTH = 255\n",
-    "\n",
-    "def extract_data(filename, num_images):\n",
-    "    \"\"\"Extract the images into a 4D tensor [image index, y, x, channels].\n",
-    "  \n",
-    "    For MNIST data, the number of channels is always 1.\n",
-    "\n",
-    "    Values are rescaled from [0, 255] down to [-0.5, 0.5].\n",
-    "    \"\"\"\n",
-    "    print('Extracting', filename)\n",
-    "    with gzip.open(filename) as bytestream:\n",
-    "        # Skip the magic number and dimensions; we know these values.\n",
-    "        bytestream.read(16)\n",
-    "\n",
-    "        buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images)\n",
-    "        data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)\n",
-    "        data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH\n",
-    "        data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)\n",
-    "        return data\n",
-    "\n",
-    "train_data = extract_data(train_data_filename, 60000)\n",
-    "test_data = extract_data(test_data_filename, 10000)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "0x4rwXxUR96O"
-   },
-   "source": [
-    "A crucial difference here is how we `reshape` the array of pixel values. Instead of one image that's 28x28, we now have a set of 60,000 images, each one being 28x28. We also include a number of channels, which for grayscale images as we have here is 1.\n",
-    "\n",
-    "Let's make sure we've got the reshaping parameters right by inspecting the dimensions and the first two images. (Again, mangled input is a very common source of errors.)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:23.829853",
-     "start_time": "2016-09-16T14:49:23.527283"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {},
-      {}
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 400,
-     "status": "ok",
-     "timestamp": 1446749129657,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "0AwSo8mlSja_",
-    "outputId": "11490c39-7c67-4fe5-982c-ca8278294d96"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Training data shape (60000, 28, 28, 1)\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfwAAAD+CAYAAADf7besAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJztvWuMrFtaHvasvlZVV1d19z7nzCHBF2A8ZpAD8dkYZwJj\nxhpLjsfyAWQHsoFgsKLEAVvO/mEQEskQkxgFC87YwERYDgSE2RI2cTJIMIPBXDzAMOLskHAZQ8DD\ndebss3v3rbqr7/XlR/fz7ed7a62vqrqquuvyPtLSd+nqqtXV37ue9d5DlmVwOBwOh8Mx21i46wk4\nHA6Hw+EYP5zwHQ6Hw+GYAzjhOxwOh8MxB3DCdzgcDodjDuCE73A4HA7HHMAJ3+FwOByOOYATvsPh\ncDgccwAnfIfD4XA45gBO+A6Hw+FwzAGc8B0Oh8PhmAOMjfBDCF8XQvh4COE4hPCREMKfG9dnORyO\nu4fLvMMx2QjjqKUfQvgyAN8P4L8G8FEADwH85wDelmXZtnntPQB/GcDvAjgZ+WQcjvlCBcCfBPCh\nLMue3daHDiLz1693uXc4RoP+ZT7LspEPAB8B8I/lOgD4QwBfH3ntlwPIfPjwMdLx5eOQ7VHIvMu9\nDx9jGT1lfgkjRghhGcB9AP+Q97Isy0IIPwngHZFf+V0A+MEf/EG8/e1vBwA8fPgQr7322qinNjQm\ncV6TOCfA5zUoRjWvj33sY/jKr/xK4FqubgM3kHlgSuR+EucE+LwGxSTO6y5kfuSED+AFAIsAnpj7\nTwD86cjrTwDg7W9/O1555RUAQLPZzM8nCZM4r0mcE+DzGhRjmNdtmskHlXlgSuR+EucE+LwGxSTO\n6y5k/jaj9AOuzA4Oh2M+4DLvcEwQxqHhbwO4BPAWc/8ldGsAOR4+fIhmswkA+OhHP4pXX30VDx48\nwIMHD8YwRYdj+vHo0SM8evSocG9/f/8upnIjmQdc7h2OQTCszI+c8LMsOw8hvA7g3QA+AAAhhHB9\n/U9Sv/faa6/l5o1XX30VH/jAB0Y9NYdjphAjxsePH+P+/fu3Oo+byjzgcu9wDIJhZX4cGj4AfAeA\n779eBJiiUwPwv/fzy5O6u5/EeU3inACf16CY1HkNgKFkHpjM72AS5wT4vAbFJM7rLuY0ljx8AAgh\nfC2Ar8eVme9XAPzdLMt+OfK6VwC8/vrrr09cUIXDMW2Q3f79LMse3+Zn9yvz1691uXc4RoBBZH5c\nGj6yLHs/gPeP6/0dDsdkwWXe4ZhseC19h8PhcDjmAE74DofD4XDMAZzwHQ6Hw+GYAzjhOxwOh8Mx\nB3DCdzgcDodjDuCE73A4HA7HHMAJ3+FwOByOOYATvsPhcDgccwAnfIfD4XA45gBO+A6Hw+FwzAGc\n8B0Oh8PhmAM44TscDofDMQdwwnc4HA6HYw7ghO9wOBwOxxzACd/hcDgcjjmAE77D4XA4HHMAJ3yH\nw+FwOOYAS3c9AYfD4XBMD7IsSx45Op1OzxF73cLCQtcIIUTv2wEAIYTS47zDCd/hcDgcA0HJ3RL9\nxcUFLi8vcXFxER1lP1taWuo5FhcXo/dDCF0DcLJXOOE7HA6Ho28oudtjp9PB2dkZzs/PcXZ21nXe\n697KygpWVlawvLycn+vQ+3oOoGAJ4DnhpH8FJ3yHw+FwDISU6f7y8hJnZ2c4PT3FyckJTk5OCuex\na71fqVQKY3V1tfRep9MBgJzoOS+SPcnfcQUnfIfD4XAMBEv4l5eXuTn/7OwMx8fHOD4+Rrvdzo/2\n3N47Pj5GrVZDtVpFrVYrDHvv4uIitywsLCzkpn7GEwBXWr1eO5zwHQ6HwzEAYmTPQcI/OTlBu93G\n4eEhjo6OCkeO2HW9Xi+MtbW1/Pz09BRnZ2d5HABwRepLS0tYWVlBlmVYXFzM50k/vpP+czjhOxwO\nh2MgpEifvnkl/FarhYODg/xozzkOD
w/RaDSwvr6ORqNROD85Ocn9/KrZLy4uYmVlBRcXF/nc6L93\nou+GE/4c4C4ffC4MsfN+fjd2TsSicnVXnxo6l9jceqHsczUyOHbtcEw6UnLA+9Tkz8/Pu8bZ2VmB\nxFutFvb397vInUN/1mq18gA+G9XPYQMFgeeyxUA+jpWVla7gvXmHE76jgFFuDmIBPXpe9rllhG39\ndva4sLBQWCS4QNl7Oie9p7AkTa0i9rk6+Dp7dDimBakNMTV4HcfHx/mRWr2a63nNY7vdxsnJCc7P\nz3N/PIBC4N/JyQmWlpZyws6yLLciMPBPP69araJSqaBarRYG1wqHE74DcbIdBRjEk8rFHbSAh16n\n0naYokNfoqYA6ZEaSuwIFIlez2lCTKUPLS8v53nBPM+yDMvLyyP5Th2O20Asv57nDMqzvvmjo6N8\ntNvt0nNuEM7OzgobbQ38Ozk5iZI9P5/vtba2hv39fdTrdayvr6Ner+Pi4iL36XNNcIyB8EMI7wXw\nXnP732VZ9lmj/izH8IiZzXuZ0vsFhTeWk3t+fh418esiE9PCeWRqju7qqfmHEApaQCwtiAFATCHS\nI4AuczyPi4uLhc+156urqzn5c5Hk780yXO5nC7Fce56TcOmHtyZ7RuinhsohzfeUfyV8S/aUV8YH\nHB0dFTT5ZrOJk5OTfNNOsq/Vanf2PU4axqXh/xqAdwOganRR8lrHHcESe0rjvilUeEmySrhlfvVY\n9K9eVyoVrK2t4fz8PHcPLCwsYHl5GYuLi7i4uMDp6WkhDYgahi46sfMyP/zS0hLW1tbyUavVCtck\nf2osJPs5CSByuZ8hWFnUwjo0pe/t7WF3dxc7OzvY3d3F3t5eQcZj57bYjpr0syzL1wygaOLne2gu\n/urqan5+fHyck/3CwgJWVlZQrVa73IfzjHER/kWWZU/H9N6OEaCM7EdF+BrcQ/LVUeajL3MHXFxc\noFarRcl+dXUVy8vLBdOf9SPSh2gHTY1AOuhuZWUF6+vrefTw+vp6bpq07gqS/dLSUldswIzC5X5G\nEHOpkfRVrvb39/Hs2TM8ffoU29vb2NnZSVbUY2Cfus8oL9akz3Oa8ekiK6vEd3Z2VjDjr62t4ezs\nbF5kry+Mi/D/VAjhjwCcAPhFAN+YZdkfjOmzHCWIkXbKZH/TqPUU7O5c/W7tdrvLN2hNiHaB0PN6\nvZ4L8uLiIpaXl1GpVAoWAX7m0dFRl+mRPkduAnh+dHQEIB2Jv7q6imaziY2NjdxaQD+kfmckey5M\nc6Lhu9zPCKwsqnXNEv7Ozg6ePn2KJ0+e4OnTp6UbdQ2O1U2ENekr2dtAWRsjw3F5eYnFxUWsrq6i\nVquh0Wjksum4wjgI/yMAvhrAbwL4FADfDODnQgh/JsuyozF8nmMApDYAZSlrN4X68Gky12IbdlHR\nowbo6JHnp6enAJBH4K6urqJareaLhVoVSPg0P5alCbVarSTZLywsYHV1FVtbWzg6OspNlKrRA8/z\ngEn2auKfYbjczwh0HYhlssQ0/DfffBOf/OQn8eTJk2TcjRJ7SungRoBxOEB3DI3NfOE5gALZczM+\nB7LXN0ZO+FmWfUgufy2E8FEAvwfgSwF836g/b1rQS9Pu53fLTO+9hn0f/b1U2tywgnJ6eppH59oo\n3qOjo2gEsNXwU9H03Mlr3i13+hcXF9jf38fe3l4+9vf380Fip6WBxM33t+04LemnUvNU6+AipL8/\ny3C5ny6UrQk026fG9vY2nj17lm+e6SI7Pj7G6elpsvVtaj1JdbWLyYy+h65VIYQu14G6DWz2zSCd\n9GZJdseelpdl2X4I4bcAvLXsdQ8fPkSz2Szce/DgAR48eDDO6d06+g2Ms/dTmnBMqJS4lUz5vla4\ny0xww2j55+fnXbWy9djLh5/Kn7+8vMy1epL94uJiLpgnJye5Ns9AohjZUwNg9S7guRZh8+k5NCo4\nl
vNbqVQKfka+xzgWjUePHuHRo0eFe/v7+yP/nJvA5X7ykZK9i4uLrtx6Hc+ePcP29jZ2d3dxcHCQ\nW7u0vn1s7VGUEW9MVlIFrPhZIYSuIF87ZqF97rAyP3bCDyHUAXwGgB8oe91rr72GV155ZdzTuVOk\nTOYp85beK9PCLTFqz2n1LccGTXTWfK6pczcFTfmx7lg2Sj9F+qk8fAbmUJum0GZZhuPj4y4NX0mf\nCxQ1EtuII6a58zpG8HrNqGG1OIyL8GPE+PjxY9y/f3/knzUoXO4nGzGLGs8pt2qd03z73d1dPHv2\nDDs7OwXC17K3MUWDKMuCiR3LzmPuB0v61Pa1ba492vedVAwr8+PIw/9HAH4UV+a8/xDA/4ir9JxH\nZb83LygzufPnsd9Jabq2zGXsPEakuptn2oxNoyEp3xR8b2sS1Hz3lGsi9jO9p5X2KLj820j4JH1r\nzm+324X5aFqQavQkbXUZ2Nx/JX6bi2/nN8twuZ8+6KZfN9aMf2m323lpXMqOlasU4afWN6JXaWq+\nRo927gRl18YaWE1fW+hq+1x9z1mX03Fo+J8K4IcA3APwFMCHAfwnWZY9G8NnTR1S5MufpV6fMlOl\nCtvoiO3m1ZxfViRjGMLXzUjsmCJzImbGU1+6BuvwO2JRntgixcA8Lemp0cPA877a1M5tGlCK7K1J\nX10N80D4cLmfKsSsaCRM1fAPDg5y1xhz7ZnlohkuKcLXzyJiwbBl2r7V6GMKgtXwY+sk1wz+PpUE\nJfpZJ/1xBO258y0B+6Aq8erP7bma7WOavBa2sVo6o1RjZvEsu6qapalythTmMIRvd9t2B172NwMo\nBM7ZAaAgpJoRsLKykozCPzg4yAOLYgFFatLXXF9q7dakHzPxz6OG73I/fbCuQu14p4S/t7eH7e1t\nbG9v4+nTpzg8POzqd0/C1xS4MkUGQIHsbXBragOga5f+DbrhV5emrpNaT1/fXzcAfM9ZlVevpX/L\nSPmp+TN9HaGBdbFUtVQjC44YuXGcnZ11NbrQ3fuwkfp2x58K5LHX4bqqnQbM6TVfY8n+9PQUy8vL\naLVaXW05eWS6j34WYQmfxXw4Ymb8sqA9G2PgcEwCyjRjJfxWq5X77N9880288cYbuUvMlqfW4NcU\nYoQeI/3Y65TkVTtXt0SK9Bmlr+Su/nt9X3s+S3DCHxF6acL6MMY035h5W3+3LE0mFkmr98oI//T0\nNCdHjsPDw5wgh9HwhwHz2JV4OZeFhYWuTdDp6WkuvOfn5/nfTksHd/n8/mOpdzxnqU7rl69UKqjX\n64WSurVarauWvvr8newd48BN5ZK/p5tkazHUPvZ0h5H0t7e384j8soZYKcTM+XakAusYc6RkrsqS\nblh0XaTFUuUytqakXIizBCf8ESMVcKbNH+zuWP3sKf+UzS9Vc762q4w1ikml8PF9maKmuejDavbD\nghq+1swuO+q5WgRojq9Wq6jValhfXy8E8MQWG9Xo7VhbW8sr7bEzV61WQ6VSKUTmqyl/Hkz6jtuF\narwxpNYhnlv3n7oBW60Wnj59ip2dHezv76PVaqHdbucZLVoOt1cMUmruNt2V2Sw2K0aPWZYVXJV6\n5JxY+2N/fx+1Wg3Ly8tYWFjA6elp7mpTeeY1P98W9pm1ttZO+CNELDqVg1onfV42Jz0V3aq/rxqt\nHmObCBWGWMAezzVXXrtXTRLhV6vVXJvmsAKr51rStlKp5Nq4WjzK4gNSbXc5F9bRbzQaqNfreTqe\nNeMr6TvhO0aJFLmmFAYr+4zC17gdjlarlafd7e3tFQifVjJbIvemZK8Er0GysZr5WZYV5kkiprme\nWUEkfLbF7XQ6aLfbpbE3sbbWS0tLMye3TvgjhvXN8zzWyEVHKl+e72F9Uuqnimn9em2FXd+bQkKL\nwKRo+Foyt1aroV6vo16v5w1rYgsCz5mjT/+61WCU8HUnr4SvFfz
0ml36OB9uRKjhU6OwlfYcjlHC\navgxwo2tQ+rKI7mrK49xLixcpYSvpnxrKYzF5ZTNPRYrY9Ne7eh0Omi1WrmMA89JnoV3GHtAOaRl\n9fDwsNDVUsfFxUXuipv1ttZO+COEJXub23pycpK3lNS67ru7uznhx1LnNALVxgCoL7usWp4dnK91\nNaiGf1f+e2rE3PFXq1XU6/XclL6xsVHQpq1vjn74VLpilmVdJjs92l2+vjf7a9NFwCOD9axp0DV8\nxziQInuV7VjKHc9tnn2q/DTH0dFRQcOPEX4/UL+9ypemvWp8jJ53Op0C2Wtdf15Tww8hdG1s2OWS\nQ9e6VFvru1Z8Rg0n/BHDkr6mutiGE6xJvb29nRNzKn2uLPCuV0CgzovnPNrgN93B3wVIjDENv9ls\nYmtrC1tbW4W0N+vzW1hYKN0A8f1jjTi0oE/Ml0gtPxZLwMVIFzUbCexwjAIxH37MnG/XA16fnJzk\nwXnseMfAPFajtKm66sOPuQkGmbtq+GrKp/uOpExL2vr6OjqdToHsSeZW21fNvt1u4+DgAGtra9jY\n2ECz2cxLaus6p2styb6fIMRpgxP+CJEie9tham9vL09zefLkCZ48edJlJovtoMv8/KlNghVGe66L\ngtUGbhu2ypb68En4m5ubePHFF7GystLVyEbN8jHNhtcUaPu7qRr69mfqPrDHfiqIORzDIrZ5t8eY\nQqDBbdR89/b2sLOzk+fZ7+7uRsthx4rr8LP6JX1bOMumv9Jltr6+jmaziUajgWaziWazmRMw45a4\nnjK/nn+XavZ872q1WghOttU1OX+uDWrenyU44Y8YMdLX3FbdUTOv9Y/+6I9yQYoRVRlp22sr+LHX\np+ac+v3bgCXElEl/a2sLL7zwApaXl6OatJJr6u+yWr0dqXQh3WDEoohtCp6TvOM20UvDVysXW0cz\nBXd3dxfb29t48803sbOzk4wJUu3efm6/sHKkcTiq4dOFt7m5ic3Nzfxv4NzVnx9CyEn87Oysa7O9\nurpaiFHinJn+y9cp2buG77gRbEU43QTYinhqduPxrh+6mKZqtdYUuaUsEvybUu+hvnTu/Onfq9fr\nBSG1eb29ECP8mN89tpEocwV4gJ7jNlFmxYoF8+rQFrd7e3u5z56lcmPlafk5w65HKXeDjR2ibKm2\nXZYyawOf9brT6URrlTBXny45W6/jrtfeUcMJf8RQYqA2qUFiMY1Sq8epaclWmLqrvyel5cYKVFgN\nO+YqUJdB7PeBdE17RuymCL8f87klbhu8ZzcQqUIhMauCw3FbYDBwrHgOlYlYmu7Z2Vnur3/27FlX\n6p1t1DVq4lOlh7IEXK0XzKbRv0nnQJmzNTbW1tZyK2lMaeL7W4VLg5Xt5znhO3oipXGmNEMdMb/8\nXYNmrl4lbmMjlkFga+iXEX6qxC0j4u0mw75H2d8UK7hjo+pTFoTYax2O2wa1+FiXS1uEy/rkmXrH\n6PxWq9UVzFbmVrwpLOmqUpNlWWGTojn/Nnqe64Ja/mxDLDXzW6tCqvGYfuaskT3ghD9SWAJS83KZ\nZq8bAD6Uk0Ikqg1repoWpkhpwgxW1H7UdF2krAP2M62PjwJODZ+v75fs9f1TR33PmBvD/k6/n+tw\njBKa7ssCXlrMq6wLpvbL4FE1fBs0PCptV+ObtL69Zg9oATDVuClvsSqaa2trBcuAtTLy+7IuD9fw\nHUPBkj3N+rGc75iGr1Hmk0D6akKz7WKZ854izouLCywtLeWFMYDn5rwy7VwJX0lfNXxL+Hrs528q\nM9vra1Lng7gQHI5xQDV8pthxaFqdrabHaPVY5U8Src34GRXxqX+d17oBINnHTPoqr7Gy2YuLi/la\no5sIJX/9LI2jUsK3VoFZghP+CFFmAk6leOnQXeykkIjV8LWULQk/Nc7PzwuR6xR0NeXxM+x1SsNX\nk77OMXbe6+/qxyVQZoWInTs
ctwVq+Er4WjRHyd9uBmwvD1t4qyzQdhTzBrrN+yTslMZNJcoqAdTw\n1eJm3Qaxz0tlIVg3wizBCX8MsGTPh7WM7PmzSSN93VEr4bPYTMx6oYSvlg4N1OmlNccIX6P1l5eX\nC3O8yd/V67zfe5Pwf3LMH6jhaxEdpthZ4reV85RIbaEuJeTYcRjYmiFW5jVoL0a+VsPX8tkxsqfC\noZ/Zr0nfNXxHT/ChY3Q+CTwW+KYBaTETVyqiPYVhcmNTsOZ87QlfqVRK09TOzs4KhM+/x/rZYuQf\n+45Uy1cN3+GYRZTJMDfQVsMn4e/u7uLg4CA5yrTX1DozqOssNf8UkYYQotq2vl6tjVQAaJUgsZPs\nWXUzpeFr0J6a892H77gxdAOgzVfW19exsbGBk5OTvL1tqnWumsBjfmd9iPWopXV7zTHmyw4h5Hnv\nbDShTWOq1WoytY0avgYR2XONnLXn/Dx2xtNSug7HPCFVQIpkxQh8+ufVf891ZdDccq4HsdicXgGv\n1mKgI5ado+d2k2+bWFUqla6/RefF74YBw7YYVtl3PA9wwh8hyh4srcOuteFJ6DatRvNmaRpPmc7L\nWuT2Q/ipPPuFhYXcP8YudWwNu76+nvvNUuPi4qJQ5MKe282JbljYNIOWBG1M4yZ0x7wg5kcn0ZHw\ntYAMNX0G5lkTeb9+6TL3Y9mglh4bXItSga9qTSyz7tkCPbrpyLLnpXe5ZvB+LxP9OCykkwYn/DHB\n+nhVw6/ValhfX893qlmW5TWebd4shbVMyNiYR6NuAeQmq17zVGGzbgduTmiRYLnLZrOJ9fX1wu7a\nagG6kYnlA9ugGT3SEsJe80wF1FRHh2PWYX3e6vKzGv7x8XGu3R8eHnYRfr+BaDZQN9Y1MnVvYWGh\na2NPKyRbdVuLok23i3XAJOkr2dv4IgA52Z+cnGB5eTlXXBxXcMIfA7ijVNJfXFzMd6hra2uFHS+F\nhAKiRxKj3fnqOD09zfNp+XAzYKUf6G5ehZmETw1fu9Vtbm6i0Wj0TMvjgmQrfdniIPa4srJS6DWv\nGr7DMQ+wWr2N7VHCV1cZNXyuH+qjHkTDt6m4mpJr73EsLCwUsgKU7K0SRJJXhSGl4dtmNrHsIdbR\n18Y5/Zr05wVO+CNGjOyBog+fwgc8TzEpK5RxdnbW9eDr9fHxcS5swPOuUf0Qfsycpp/BxjWNRgMb\nGxt585p79+6h2WwW/P6W9LXwTqwvvVokrH9/aWnJTfqOuYclfPWHpzR8kq2NQLcBcGUgGccyc7Qe\nhr2/tLSE/f39QnQ8MwnUVx9zIWpWTozsV1ZWknX2K5VKblFst9uFuB+7Xowj1XBa4IQ/BmhUKM/V\nh0/B02hTEp0OChH9UanBntAarKJ9onvBavgaCU+TviX8F198EZubm0lfHAlf8115zqPNET46Oso/\nN4SAer3eFbTnGr5jnpAie40wV8LXwD0rc/1q+DYtVntYMENHM3X0WrtGap0Arcqp64Ql+15Be7E0\nYUbqc907PDzMi3NxLmXkPk+k74Q/RujOUjV8vV5dXUWtVisIKoVLCb+sS9Th4SGA5/6rdrudt43s\nZ47qP7NlbFOE/9JLL2FraysZgEOB11QXPV5cXKDVauVpQrVaDQcHB/nunpH6MQ3f4ZgHaES+zSHX\nKnGW7LmBLsux7wWrQWtFO7ra7HmtVssrYCrZHx0ddWnavQg/ZdLXeV1cXORBfKzSd3h4WHADWpN+\nP+Q+yxsAJ/wRoZeZWUmVpindqdpgOTVrnZ+f51quNautrq7mFao0cO/o6Cgny1j6G4edl+bZ05yv\ng+lyHLEcep7T4pAaqTRD1jBgJoASvvvkHLMCSyz2Ws329nh2dpY3vaF1jGl42m7bDv2M2CYdQCG4\nWNNxVe5J/jpYEMvGIum6dH5+Hg364zmDgWnd43qnpbzV+qAbIFUOdE1N5eFr0HCs+I7m4yvs+
jNN\n65ET/i1BNWnuUFX4Yn2gl5aWsLq6ivPz86Q5f3V1FZ1OJ8/tVwGj4MfIlg+1+upUyCngGxsbOfEy\nYl596Urw+rfG/nYSOaEbDI28pYbfaDQKgk8hdjhmCdafzHPNvomltT558gRPnz7Ni+yQ8G0t+Ji/\n2qbf6nF5eTlPwWWGjp5zHVBLJM8XFxcL64i222XKnAb+WR/9iy++iBdffBFbW1toNpuo1+td2rq6\nHLieMSiaGwgle65HGux4enpaMPlzHeJ3aIMdrVITW/emAQMTfgjhnQD+PoD7AD4FwBdnWfYB85p/\nAOC/ArAB4OcB/LdZlv328NOdbqiQqYatgX4x/9TFxUVp0F6n00G9Xi8sCLYJhZ4zVY+lLSksmjLI\nXPtms4lGo9GVIqc757KHXwVU7wHIXRqan8u5ZFmWbzrUj+8m/duHy/z4kMqxz7IsD2y1tfA52M9+\nZ2cHrVYrr5GvhG9LxGrueiwrh0pGo9FIDsqirkU8hhDydcSSPU39tmKmHpkFtLW1hY2NDdTr9dxy\nsLS0VChT3ul0ctLn+2o6n6YK87tQV4iuYUwD5vppNX5rhbRr2rTgJhr+GoBfAfC9AH7E/jCE8A0A\n/g6Avwng4wD+JwAfCiG8PcuysyHmOtWw0fCEPpSxwDlq6CkTGLVhS/ZK+LZPNoBCzX7tLU3tvtFo\nYHNzM6rh07TGvyt2tH93jPRJ+HRvaBAjgNytYDX8adtVzwBc5seIWI49feBsZbu/v9812Mt+b28P\nBwcHUcKPkT0QT7vjulOtVvM0XK27wXNq2+qC5DmAPC7p/Py8YLmjohOL8OeRCgY/jxq+Ej7XLvt9\nxTR8zeoh4bPkN9ejTqfTRfhWw+fnaSZSLBtr0jEw4WdZ9kEAHwSAEP9r/x6Ab8my7EevX/NVAJ4A\n+GIAP3zzqU4/+NDbazWnUQDPz8/zFD7uZPkg6wNtCV/JnQ8r/Wf6gGu5Xq0RQA2/2Wxic3Mz96lZ\nk741l6X+XrVcqHaRZVnu1tBNB01rALpiFdykfzdwmR8frHavfmMWkCHh7+zs5Br97u5uoSNerJ+9\nfX9FLK2Nclar1fIgXY7Nzc1cAaAmHxtZluVWyZjlDkDua9cIf55rnICW8NYiOtZywXPduHCNsoRP\nk75dC2nOtxZSBgSqVVbPpw0j9eGHED4NwMsAfor3siw7CCH8EoB3YI6FP6bhqllKyd5GtvM1GtGq\nfrcsywq7Uq3BzyYS+tDTxK+Eb8v+UuDpR9fgOWr4ZX772H1dFEj4+vn6twOIVttyDX+y4DI/HGKF\ndUj61PAHvKTzAAAgAElEQVSPjo5ywt/e3sabb76J7e3taM0OltMu0+6BokmfRK+BunTrMTNHhxKe\nJV1qy9rDXjOSWK6bQX72yI2AHWrNTMUmlJE9UFR0eM00Zkv46gLVxmb87jQQcJrWo1EH7b0MIMPV\n7l7x5Ppncw31A5GoywSe5zQl2dK1uonQEppaaOPy8rJrN0vNgQ+u5tpqnf/Nzc2CHz2m4ffzN+tR\nFyBddGw0MYBCFS7d5DgmCi7zQ8KuAZTbmIb/5ptv4o033sCTJ0+6ilnxaGvN8zMUsTx7ati64Sfh\n37t3Lx8LCwtdwb9aDIiZQTHL3eLiYiGdT5tkMa0vttHnmmM3MHrUEtzWpK8WFKC4FnJOsUwHWkmz\nLCvUErAByNOC24rSD7haFOYaZSbwWJSujlh0qF7bID0VRKCb7NUkHwvaI+FTs9eIXOvDj2kPNv0n\ndn5T8p6mHfUcw2W+D8TIXk3P6sPf2dnB06dP8clPfhKf+MQnkkpCPx3xrGVNI+up3XMN2NzcxL17\n9/KCWyGE6EaD5XN1M6+ZOIyMpxKhR57b4Dir2JShLGgPeL4GUglS5YkuEZvhpFVRdU6MI5g2jJrw\n38CVoL8FxR3/SwD+77JffPjwIZrNZuHegwcP8ODBgxFP8
W7QL0mp2cj+bozw9TX6QFrN2EaYqsat\n+b4sTcna/FyMgOeBPppSWLawODGPF48ePcKjR48K9/b39297GjeWeWD25T6lZZPouQnXDBpeM89e\nO99Rg6flL9Y1ThELqg3XkfRaSIekqyl4Ng9e03EXFxfR6XQKZvbYGmTLdduS2RqbE1uj7Jo3iv+H\nKiSq+esayMyIVquFarUaTYm2MVm3gWFlfqSEn2XZx0MIbwB4N4D/FwBCCA0Afx7Ad5f97muvvYZX\nXnlllNOZSsR8QypM+hqex7R+W6vakj5fzwdeq3bxYT84OCgIMwVYm1jEMI2mrmlEjBgfP36M+/fv\n39ochpF5YD7k3lrueG473tmhhK+97W0UPhC3nMWIk2uAJXzV7G39C5uKy/dRE7v9PA0KJNmz66eS\nvS2oFSP6cf0/9JzaP9dAKj0HBweFOgOaYmj//tvAsDJ/kzz8NQBvxdWuHgA+PYTwOQB2siz7AwDv\nA/BNIYTfBvC7AL4FwB8C+L8G/ax5hprGeB076uvLiD5VKIcaPjUN7anNMsB8uJmjO0iJTsf0w2V+\neMTcdFartJXpDg4OCr3tNXI8Jn9WvlMxPwsLC4UIecbtqIYfK2vdi/BtrM3S0lJeQ0Sj3W3mjd1M\njJvw9X/CwMKUlZOEr82H1B0ybbiJhv+5AH4aV/65DMC3X9//fgB/K8uybwsh1AB8D66KcPxbAH8l\n83zcnlCSt9GfKRO/3ovt5svM+UDRpG93t7Eoen3wHXMDl/kbIhbJrmZkq1VSm2dkvjXp2za3Ze4+\n69LT85iGHzPp05ytpKwBxxrIxiI4WgHPBvYtLCx0FetRf3vqbxnl/4NQ076ugVbp4YbFkv00roM3\nycP/WQCl0QpZln0zgG++2ZTmG5bsY1p+6jxG9iqkMU1fNXw+7LYWte1KNUhfbcf0w2V+OKTS12Ja\n5dHRUZ5ff3BwUCB81fA1+yYl/1ocx6bzKuFTwyfZU8OPmfRtEJ1+nv5dqYDCEEK0SU6srseoNXzr\nv1eolVOVHlo3LNnT0jnzhO8YP2Jkb38W+51+/ff2PezCo7tumvErlUquYUzjg+5w3CUs6Wsam9Uq\n2UVSm+PEfPgAouVeNWAuVrBraWkpqeHTh8+NQJlJ32r7sWI4dpND0rTmf6uIAOMJ+k2RvsYxsfaB\nTUFWsp/WddAJf4Jxkwe+H5O+vrcN2js+Ps4f8CzLCgV5tHqfa/gOR38oS72LaZWtVisvn5vS8Eme\nGrSnMh/TonUwCE2j9FXD1yI4NmhPc9BjAXC9jjH342347HWe+t3ZOCb+L2x3Pq0noAV5pglO+BOC\nYR52NeHZnvYMjrH1sjW9jg+/FvvgfLTGtJbs1Ra3Ov/bElqHYxLQS8Oz/nqtoHl2dpab8TUNrB/t\nXk3MsR7yZefLy8t5c6xY62s2q9Ioetshc9rlPKbhq6WTcRUMKmQdAS1q1k+9g0mDE/4MwJqbqtVq\nQQvnQqO5vnx4WRyDwTfU9umj02IUqmVooQ1r5puFBcHhGAWURGxlvJOTE+zt7eVDG+PQf69kz74a\nAArlau1QM7wleh7X19cLTXEYpKeR86k2s7OKmMtFN2pcU7Ua6LTBCX8GQMKnRm99fEr4XHSorav5\nj5r++fl5XhSkVqsVekTbbnzqNsiyYglgh2NekcrvZuodj5boec2UPN1oq1zb/hc61tbWCsVyYkfm\n3dOUr4SvmwVbuGtWYd0u/L+pVYbD1kCYJjjhzwC0At7q6mpXZTyt4EWi5uIDFHN2+TDTvK8afqz1\nbqzG/SwvDA5HLygRaGBeu90umPCZekcTPo88Z/e7WP49NXy63NT/3mg0UK1Wk2RPE7XdKGgbanUF\nzEvDqpiGT0WJpK+li6eR9J3wZwBq0rdkv7Ky0qXVq7ahu1q7w82yLM8L1j7RatbngqCgtj8Pi4TD\nQdjFPxYQSz89B8mdJ
ny9ZlaM9sdIEb42u9nc3Cw0ookRvnUB2GI4KQ1/VmU6Flhpzfmq4U+j/x5w\nwp8JKOEDRbJfXV3NCdqaFY+Pj7t2sTRlcZGp1WpdPnzV8O0ud1q7SDkco4I+/2rSZxS+mvAZpBc7\nstWtElCK8Nnohk1u6vV6kuz1PNWdTtP45sGkD6CU7C3hT6N2DzjhzwRI+ECR7LWgjprxlfAvLi5y\nTV/7cGv1rzINX1Nt2EVqGgXB4RgGsWfeVrGkhn9wcIC9vT3s7u4WtH2r/dsNtZ7HTPqbm5t44YUX\n8NJLL6HRaJQSfVkVPlukZx6C9vQ7VuJPafcetOe4M2gRDO1exYeXWrlq6iRykjZ99iR8S/bqw6e1\nQKP0uThM8+7X4RgWNuc8VtDl4OAAu7u72NnZyYmeQ68vLi66itxorr016ZPw3/KWt6DZbCa1d60B\nn5LTVOGuWUbKpG/bjbsP3zER0KpXwPMCExrNW6/XC0FA/DnT8KiR0GKgC1a73Uar1cLe3h4qlUqe\nGUDXgbaOXF1dLe3alRr9Liyzvvg4pge6+NtBwtBqeppzb2NkbPvbMlP8Cy+8gHv37mFzcxMbGxtd\nNfDVF29z8jlve+SaMY9ptraIUNn1tMIJf0Zgq26p4NLEX61Wsba2lu9YsyzLfXUkfC5QJycn+QaC\n19ROSOaXl5fRPGAOpvqVdfCzpkPOaR4WGMfsIFZGlrKj8TOW9En2tqAV5YA9LFgKV89feOEFvPji\ni7h37x6azSbW19e7Uuu0MI/V1HWdsNXnrFWBr3dMN5zwZwwxIdaFo16v5wuKEq6a9U9PT3Otn1oK\nCX9/fz/X3M/Pz/MI39hRC3fEhq0QRo1mHsyHjtlBLIeboxfh27gYraTHznLVarWrIl69XsfW1lY+\nqOFr/ftU8xxu7mNkr+R+F2VvHeOFE/4MwgqvavhK9iRkJXsuSIzUJeGfnp7i6OioUJHv9PS0oHXo\niJkUrcbBUr8s8Qv0F+Xvi49jklBWK78X4dsKfLHUOxJ+s9ksVMfj4D3V8LW7XVmDGqB7veA9J/vZ\ngxP+DMGa9QnV8JXsaXbnwqQdu6hpq4ZPU3un08mL8pDwq9Vq11BCj/khLy4usLq6WpgzNwQOx7Qg\nFuylQV9K+AyE1cA8Er2Wb7UaPqPwt7a2cO/ePWxtbXVVyqP/nhq+DfKzhK8NtGKkbl/rxD/98JV1\nxqBCzGtq01xEWHijVqthaWkpX5DYi5sagvrwT09Pu8j+8PAwJ3cGCunRBvLZoSl8nKen9TmmCTbg\nLRbhbTV8bZZj8+w5AHQR/sbGRp5299JLL3VVy7Ny129XOr22CoP772cLTvgzipiGT7I/Pz/PG+ws\nLS3lmgcrfVnCp4Z/eXmZR+uz656t4c3AwOPj464KXnao6ZKFg6axx7RjvhHz4Svha10La9K3Uf2a\n8mVN+iT8l19+GS+//HJfAbNAmrxjcmbN+o7ZghP+DKBMMDXqneZyatdsb8vqXgwIorZQqVRweXmZ\nF9RhUBGb7iwuLnY11KnVavnipv20Y0cld53fxcVFNDq43wXIFyrHbYIxMLYjJYtdadEqrYXBXhZA\nN9EyXZbdL+v1el4+d2trCy+++GJpnr0tdx3DPMnJTUt9z5ry4YQ/Z6BPj6D2r4FB7I7HRSsWfawd\npZi2xxgC2yFM8/T1WK/XCy13tc90pVKJFv6IFQGZp4XLMVmgBcyWrub5zs4O9vf38xQ82/Uu5mPn\nOTfe1Wq10J+esTE2C8azW8rh/T2c8OcCtrBNp9MpVOajP5+Ez0Upy7JCL2571KAkkr3e48JkjzzX\nYiMaocwNg9bz1nP9m+zf6XDcJmIZLiym0263sbu7m9fMJ+FTdoDn7iz7jC8uLhZ88+oeoxyl6t3P\nuxzYgjmxn/WyipZdTzOc8OcEuhBoHi4Jn4FBbNjB3fDR0VHBDHl8fJznF5PcY2R/enr
aVd3LNvOg\nJqSlK4lOp9NlrrR1++3f5zt4x21DNXzWqtB6+NTwSfi0nMXiV6yMKNmT8HXTHNPu/fnvRqrPQSxY\ncdbhhD9H0AecAXk06dP3zoYdjJrXxh4atc+Ifb5eyV7z7a3mokdrxtfqgFmWFVwA1tdPK4Wb9h13\nCavha3McNsixJn0lfBtbo0NN+jHSjzW+cRm4QkxLt99N6t4swwl/DmBJUaOCVcMneaupnwV06Apg\nvj6A3BLQ6XSwsLCAs7OznuVz9Z6tLAY813gAoFKpFDpTKdlbDd/+nQ7HbYEb3VhznL29vb5M+nRz\naWBrzKSvPvxYUx3X8tMY1AI4i+TvhD8nSPm71Ydvyb5Wq+XpfFpTnyk/2p879t42dsCea/ASX0/z\nPy0I1vTJz4xp9v345xyOUUI1fO03sbe3h2fPnuXafcqkbzV8rVQZM+mrhm/z6p3si+jXlF+Wpjjo\n+086nPBnHLHqe7EcfZKokv36+nquUavJkpXwSMipxiF2HvZcyVwLBNGPabV+TSlUP74TveOuoD58\n1fBJ+CR6Dhulr8+2puGpdm81fNvi1p/7NGJd7vrx36fIfBpJXuGEPwcoWxDUpBjTOtjWU3Pt6XsH\nkAfbxY4kf6B7cwAgL/hzeHjYZarMsiz/HO0ixveIpSTxvNff7AukYxDoM2sr62lhHcqKVtM7Ojoq\nNMfhs6zxKtakz+JVLGTFUrmaihdzaTni6JXNo4qQuhxjfUBsgOS0rSVO+I78QedDrmZ2jd6nZkJt\nvFardTX/0GslaXvkuWpG7PCl7gPN09f0PV0ANZqff0+ZVcPhGAQpCxY3pSR7zb+3XfC0z72NWVHt\nXs351PBtKp4/y/1jkGJdJHtdU7gJ0zXGdh2cJgxM+CGEdwL4+wDuA/gUAF+cZdkH5OffB+Bvml/7\nYJZl7xlmoo7xQR90JcoQQiF635r919bWCoubXfC4yOkAnmtJbNpD36d259NCJrGOYtR6uBhqw5FU\ntLIvlDfDvMt8rGwuz0n2SvpaVY/3bZ0J664iuZDwteKl7W8/jURzF+inTofV7jWrSAk/ZmGZxvXk\nJhr+GoBfAfC9AH4k8ZofB/DVAPiNnN7gcxy3ADUrUmvR4DpG7yvZa0MP9U9qwRG6A1Tz10p81P4Z\n7MRFjPd0EY2Z9tkVzFYto8DaYCb38w+FuZd5BufZYQnfkj6D9LTWhKag2vgUxs+kTPqu4feHQcie\n5zaegm7GGOHbNWZaMDDhZ1n2QQAfBICQ/mtPsyx7OszEHLcHzcnndSwqXsm+Xq/nLT6Zp6+aiJKv\nBtfRvw8gT/M7OTnpymdWzd768FmJL0b2WqBHMW2COUmYd5lXDZ/PLwefUWvW13N9vW0OpSZk1fBj\nJn3X8G+Gfqx9quTESJ/f/zxq+P3gXSGEJwB2AfwbAN+UZdnOmD7LMQT0QQeQ59szOp+vsWTPgLv9\n/X2sra1hf3+/K+hO/VxK9labpxmf5L+8vIx2u10w4WtAoJpVqSlptz0bxa9/q2NsmFmZt1Yp2wWv\nlw/fWgViGr714ZeZ9P05Hg6pDcCgJv1p3HiNg/B/HFdmv48D+AwA3wrgx0II78imPadhRqEBK1qU\nJ1aER030JycnudlRFyTgecod8HzB5EJJgaO2Y037i4uLWF1d7TKFqnakiyY3I1q1jxsXCiXN+r5Y\njgUzLfMatKdkb7X7FOlrsJ/68C3hx6L0mYPvJv3+EUsBTl0rBg3amwuTfi9kWfbDcvnrIYRfBfA7\nAN4F4KdH/XmO4dDroVXfomop7IYXW4g0LU9z7tUMSiKm5kSQHxiIpxHRyh26YOpunBsErdjHjcy0\nCee0YB5kXk36DDa1WSmxDQAtWHYAiD6/vUz606pZ9oNhC9/E1opY7E5sE2BN+WpxsQWPptnSMva0\nvCzLPh5C2AbwVpQI/8OHD9FsNgv3Hjx4gAcPHox
5ho5+YLV/Wx1sbW0tD+6jZaBSqeSBfEdHR6jX\n6/n50dFRwR9qz6n502x6fHycL3hA9wKsflK6BVK9wqdRUGN49OgRHj16VLi3v79/R7N5jn5lHpgu\nuU/l4vcaQLrolTUdx6rsUcOcZqLpBxpcG4ONpbCtulutVr7ecLOlVsFYC2KOer2Oer2O9fV1rK+v\no9ls5mNjYwONRiO3ZvL/cRf/i2FlfuyEH0L4VAD3AHyy7HWvvfYaXnnllXFPxzEgtMCE9YurX58p\neGpm58LVbrextrZWiOinUNpBDQlAgfDb7XYurDZFyvpWT09Pu3bn6uufFQ0pRoyPHz/G/fv372hG\nV+hX5oHpk/t+Sd5apAB0PXcx0zEJXyvsqWY5rcFi/aAfsrdpvjpI+EyHVBef9dHb5l5ra2s54Tca\njfy4sbGBjY2NfENw14Q/rMzfJA9/DVc7d/6lnx5C+BwAO9fjvbjy571x/br/BcBvAfjQoJ/lmAzE\nyB64EkJq+Iy8V82f6UXsE86jPeeuXGv2k8wZyKeavUbtpyKmaRKt1Wq5wLOMsGMwzLvMx8h7ENK3\nm2aSBKO+Nf2LPns158+Lhh+DdeNxTYi5UpgtxCwfuvdiNQ+s5a9Mw282m11BlNMaT3ETDf9zcWWm\ny67Ht1/f/34AXwvgswF8FYANAJ/AldD/D1mWnQ89W8edwBK+Llqrq6sFM75q/SR1RvSz7ChHq9XC\nwcFBrrmQ7BcWFgq+UiV7xg6Q5NWXqoFT6+vr0WJBMxBDdhdwmcfg2r0+aypDWvuil4avJadtmuus\nI7bJUqufrXdwcHBQMOnbMsY2XkI3Wkr4jUYDjUYjN+dvbGzk/xduyPj/mDbcJA//ZwGUPXH/2c2n\n45g0WN+jBsCFEPJe9Uqq1Wq1IJAkeJ7zWKlUusj++Pg4v6Y2DxTJfnl5Od+9pyKmtQwwNftqteqE\nfwO4zD/HTUhfZUj9x9TwGSAW8+HbWJRZNukrrGYPPF8DKOdqJWy32wUNX334tlGRBkjS7be2tpZr\n9zHC1w3CNGdMeC19R1/QoD0AuUavmj1T6UjETLVT4tdrzdnXEruapw8UyZ4uA35OqvBJrFiQmvcc\njkFhSWhQDV8LWtkKe9Q0tVtetVrNzfi2wtu8IGXSJ+EfHR3lBcCo4ZeZ9GN9C1hbpCxozzbRmVb3\nihO+oyeslm8XMm1bq0Pr4cdylJkrr2SvmwCa9WM+UA3w08/hZ1iy16BCh2NQ2Mh8Pe83St+Sfipo\nT4lIi7yU9YmYJcSyIQB0xfWQ8OkatFH6MZN+qm9BL5O+jexX98w0wQnfUYqyspRZluULUmyhoxnS\n5rZysDXuwcFB7hezqXdl81J/qKbsZFmWBz6xwY82MKHlIJabO20C7Lgb9EP6RMqkH4saVxM+N7/6\nrE8jyVj0E40fOzIWiGm9JPr9/X3s7+/j4OCgYNJnDA9QDCZWK4rV7BmcZ9MiNW7JBl9OE5zwHUMj\n9eDTf86Wu3YRZMqRrRNeZlEgNE3n7Oys8LsLCwt5kKBtU0qff8xq4KTvGBdiZGHTXe21reY2C89l\nLwubLUOsVTaPj49zct/f38fe3l7hSC2fZbmp3WswsWr1moLXbDZRr9cLlUNjfvpp/x844TuGghbL\nsMJgzZd8De/HUo5iGkysIAdN/owXUMEk4TNDwLbYpeY07eY5x3QhpSVa0rdkb7XJWXxObWAeN+Y6\n2u02Dg4OCkSvg5o/NXy68BhNT5cJLX8k/GazmRO/pt7ZIMlZ2Hg54TtGApKyCgPJPubHXFxc7Coq\nogJmSd5ea+EdauzU+gEU0gBtm12W3tUAqtj8HY5hkXKJpbR7ex57/SzBBuXZSHwNyD08PCxo97u7\nu/lxd3e3q2kRa4NQ1mN+exbXaTabuUlfqxtyszALZA844TtGACVLJU2tZ69Ev7S0hIuLi0J9agqX\n3VGnTICq4fN
a6/JrKqCa9En69Jvq3zDtwuyYTKTM+CkTfsqcP2vPZyz1jjKtGT6U41arVSB8kv3O\nzg52d3cLLjst0c11h4SvGr6WzrUm/ZSGb8+nCU74jpEgpSGrlkLB63Q6eRpfyodvNxEWqg0wSIra\nfpZlBXO+lu7lghBLmXI4RoWyuJZexG81fH2/aSUaixjZxzR85tofHR3h4OCgEKSn2v3Ozk60vr5N\nG05p+PV6PQ/WK+tOOO3fvxO+Y2SI+d65cNkI5k6nE20MUuZLV/InwatmT82o0+n0DNrT99QuWw7H\nuGC19Zif3mr49ndnDbHUO63hcXx8XMizt9o9yX5nZydXEGLfp6biWR/+xsZGoRUx16VZLHTkhO8Y\nCjENvJ+FKsuyaMvJmEk/FbTHXbzVfhjRq6SvJv2Li4uuxdVmETgco0Qs8KvMfx8L1CuzeM0C1E1H\nDZ/avU3Bi5G+Vi+0hXJs7r3m3G9sbKBWq+Vpw7FqerPy3TvhO0qRKoLBc9WOU+ex0el0sLOzg/39\n/TyVxubOxj6TiJlBebRlMFWANUBQa5PPoo/UcfcIIRRywJkHvry83NXz3jbIiVnM9DiJSJnqebSd\nLu2RZvvUaLVaOD4+zt1yi4uLeWlcWyOf3/XKygrW1tbwwgsv4IUXXsC9e/fyIL1arZZ/92WWxmn4\n7vuBE76jJ8qImzmytlVtzJ9mx7Nnz7C7u5tXyGI5zJS2be/ZAiY23c/u1i3p2ypm0y7MjskDCT9V\nv10LvMQCxaYR6h6LrRdaftsey8j+8PCwsE4AVx0HK5UKOp1OXlXTNrqhz35zczMfGxsbOeHbOCJd\nF6b5/xCDE76jFFYr13Pbn5qCqyO2EaDZ7tmzZwUNn4LMSPsy8xk1IK1URk1KiT6l4Ws97FkUbMfd\ngs+urZlv67fbIDGtNDnNiK0b2hNDe2DosYzwaQXUlLulpSVUq1UsLCzk1fO4kdLj2tpanm/PEetv\nHwseniU44Tt6wgqtDgbYaMc6nlvyt4MaPptenJyc5JuEmM9eof53alA01auGb1P/tGSp3cnPmnA7\n7h6q4ceixJkGZktLT/OzaMk+tl7EumeyxW1qnJycFCyHWZbl31esp709J/HroIav7YdnuUmRE76j\nJ6zwqglfe9GnUuBS49mzZ9jb20v68PnZMdi8/li9/jItXzcMs2q+c9w9+JxpHXdqndTwqWWqWXna\nn8XYekHCt+l2eiwj/LOzsy65JeEvLCx0dbnjYBU9jcJPReTP+prghO8ohQ24UbK3+bJaKOP4+LhQ\nzjY2GGnbrw/fwpK+dh3To/rvuajGoqIdjlEj5sNXDT9G+NNu0o9p+HTlkfDZHZM97HksI/yLi4uu\nDbxupJrNZsFPv7W1hY2NDWxubqLRaESbE9kmRXZdmDU44Tt6IiW89NtTgJkGxx27avux0Wq18nQb\nS/i9UEb2MQ3fBu3xPWJHh2NUiPnwVcO3pVxnwaQPdGv4umbYXvba7a5sdDqdPLOB2Tj04ddqtZzw\nGYmvo9FodFUy1AHMx3rghD8HKNOWYxG1ek4hjY3z8/PcFBczz6mJ35r7T09P82YXfC39/oMQfsqs\nb4dG4GpJXYdj3EgV1knFkEw60dh1Qu9lWdYVwKvnWjFPC+lotztq+2yEw7WBmwitnMcWt+vr63nv\neg7V9huNxl1+ZRMDJ/w5QSqfviwnVrV4rVLHcXZ2VtDq7VGjcXXwvja3Idmzel4v2MVTi2542p1j\nnCirD2FfZ2NdaAnjs6pa/6AycFdI+ed5bSPv9ZxmfFr29Jy97NnaNoSA5eVlVKtVAFfWEo2yt4NE\nT1eJ+uYdV3DCn3HYQhixAjix3TiPKcFV0la/vUbclgXu2UA/ft4gFe/UrK9peWWFdZz0HbcJzT3n\n864R4SR7KwOTDLX8xSyAsQh8ntOqR3M+jzzn+qCEDyCXaQ3Es+ckfltQxwn/O
Zzw5wCpXHo1v5UR\nu9ajt+f2qOcpsx6H/cx+CF/NnrbwjpK+LaDhGr7jLkAZ0w5wWkmvVqsVzNaDbnrvAhq0a61/1OLV\nXafnOmgJ1HNdm2i1YzU9Buaxna0eWQ9fgyErlYpr+AZO+HOCVCGMy8vLQitK25JSd+g6eN/2rNZz\nagCxoju24pZtaWmRaswTS8/z0rmOceAmJKwm/dPT0/x5BK6e4Xq9fmO31l1Bi+iovHOjz6h7Hbyn\nXSytosDNEL8jZjfwXrVazX3z1le/sbGRFzRiXQPX8LvhhD8HsFH2tlKezY3VIDxG09rdOXfltuiO\nXpeV2E1tAvrRbmJkb036MQ2fv+twjAqx2vH25yR8JXuS+vr6ek7402bS17gEjna7nUfdMxBPr9vt\ndmmqrm2ZrQ1t6vV6NCBva2sLm5ubOcHb7Bwn/Odwwp8jKPGTXC3hq0+NfjX1t+l5u90uaOhWY+ei\nFosbKKveZ5FqYmEjnlM+fPffO8aJWNQ6YWtW8N7FxQWyLCt0dJwWk7768LUOB1vZHhwc5J3stKPd\n7pSfvJsAAB3uSURBVO5uvmbEIvm5+QeQ18pYXl7O4xzoqyfhb21tFYYtmT3LFfNuCif8OUGq3CVN\n+kr4NoLWFsfQtJlU4A4XNH52ak6x8xSs4Npc2jINv6z7mH6+Lw6OftAr1VXPqQ0DRRN/p9PJrWjT\nZtKPET6VARL+zs4Onj17VhjHx8fJjCBudOiz1yh9VtFT7X5rawv37t3Lj7a1th4dV3DCnwL0IsyY\nFq2jLHju+Pi4y99mid5q+NRKYn56vR40vS6Wq2zzlXnOamXaHEODdjY2NtBoNPJa5b06kfnC4BgV\n7LOkVjU+3yQ2+rC1YBU31VmWJVtAp8zUdhNbli9vLW2xoF47AOTkbke73cbh4SF2d3e7mmLZGhus\nQEiZ5vurPGttfG14o/0H1PzvpvvecMKfMsS04rJc+k6nk8yHZ1qdTY/RYaNo1W+vfneND9DFoQyx\nile2IQ5z6u05q2txkPxZiIMRvJqiQzOhwzEs+u1VbwmWpA8g6U6jhnxxcZHMPmHxqNjnckMRI3Ge\n2825vS5ra60bFDvUQqh59Z1OJyf5lBYeQsiJPZZ6xyY4sXbCjv4wEOGHEL4RwJcA+EwAxwB+AcA3\nZFn2W/KaVQDfAeDLAKwC+BCAr82y7M1RTXpeYXfpPLdtajkoxDYC30bjx9JneB77HRtgZAm/X9gI\ne13MtAlObDAalyTPc2r+1Aq0BaYT/s3gcl+OfszHlNMQQu6n1lQ9daft7++jVqvh4uIiWTlStX9+\ntp7HYmNs7E6smJZaAGMZNiysE8vc4bDVNum+CCEkm9RwaCEdza3nuW0p7IQ/GAbV8N8J4DsB/PL1\n734rgJ8IIbw9y7Lj69e8D8BfAfDXARwA+G4AP3L9u44bImaS47XmxPaqhpc6t/dstTybo88gm5tq\n90Cxk5itkqekrmTe79D2l67hDw2Xe8QJvZ/nScmeJm3mmasPnCbxVquFarWKy8vLrkZQahK3wah6\nzXXB+sk5bDqdptWqBS82bCpdLM1O0/SU8K21Qo9LS0ullfTW19dzmXbCvxkGIvwsy96j1yGErwbw\nJoD7AD4cQmgA+FsA/ossy372+jVfA+BjIYTPy7LsoyOZ9Zwi5Vez0fa2cE7MD68m+1juPa9j0fca\nC5Dy/fUDXQCouTCVplKp5P46+vJiPcTtkefsPW5bYDrhDw6X++GgMqLXsaC3VquVk9nl5WX+XFv/\nt/Xp26GEby1+l5eXXfU1rPxbpUGvbb0NO2KbBNbAZyCeHZR7S/La5la7C1rCd7nuD8P68DcAZAB2\nrq/vX7/nT/EFWZb9Zgjh9wG8A8BcC/6oYAlfI2ZjRXRs7WqNwC/randycpI062lU7aBET1gNXzUZ\n+uLtDp/nWlzDEnulUkk20PGFYSRwue8TN
L3rpnhhYSG/f35+nuevHx0dFQrGaNlrG+y2vLycWwxi\nrV11XYiVzE41vGLWQKr/Ra9xfn4eXRP496q7zrayXl1dLdXw6/V6QcZdwx8cNyb8cLVyvg/Ah7Ms\n+43r2y8DOMuy7MC8/Mn1zxxDIKXdWw1fd+/tdrtQ/IKdqTiOjo6SgstgG0vses156Rz7hfrwbXtb\nJfxYdS1q8RxcCLiApAIBfXEYDi73g6d8WdLjEUCXhk+yX1xczOWa8qaEeXFxUSB3yhI3FLb8rR0a\nJMhsHFsNz27+1exf5vu3mTV6ra2sdaPODXyZD79WqxWK6jjhD45hNPz3A/gsAF/Qx2sDrjSCucWg\n2q8l0VjgjXanSkXNsvLV3t5edGjDitji0C9iwUP22r6Ggh4zz9fr9a68W62qRYK3WgIXAsfY4HLf\nB/S5t7Ks8qFR+nx+SY42R13fN9bbXa87nU5XFUw91/K3tuaGBuvG/PWp4GBuTmxGDeekRG9jc3i0\nUfkcNOfb8tlO+IPhRoQfQvguAO8B8M4syz4hP3oDwEoIoWF2+y/harefxMOHD9FsNgv3Hjx4gAcP\nHtxkihMPuwFIReBzpHbUahJMtanV/tM2N1bT66yZvh+kcoTVP2/L3/I65X+vVCpYW1sr+O9sxD19\nflpr2242Zh2PHj3Co0ePCvf29/fH9nku9/0jJt+6AVCzu5K++qRtlz0b2Jci+34IP9a1juc2WFe1\nei0MpGl21N6zLMtl0x5j6bR2KMEzQI9WD5vJM489MoaV+YEJ/1rovwjAF2ZZ9vvmx68DuADwbgD/\n6vr1bwPwxwH8Ytn7vvbaa3jllVcGnc7UIWb+TgXjqWYf60in1xpdb0esDSX9dDYFJ1XeNgU111lh\npF+eR+tTV5OeJX4KPwtv6C5fCV93+fMWvBMjxsePH+P+/fsj/yyX+9GCsq+k3m63c22VhK3WO8ow\nybEX4ad6XLD+Ripbh8G6sd+LxROwcA7v0cpGolbLW6xgVurcFthJVdCcJwwr84Pm4b8fwAMArwI4\nCiG85fpH+1mWnWRZdhBC+N8AfEcIYRdAC8A/AfDz8x6pq4hp87auvPrKWREvVegi1n861uVOBVs1\nfBuMN2xqnQ3CU/+6PU+NVGEdLgDU7O0i4Bg9XO5HA2vOp4Z/dnZWME3TRM4Nfbvd7ko7rVQq0fbP\n3HhzwxBrbmXbXseUiVign7r5YhsNbgA0lsae20p6eiTBWzefavg2PmDeNPxhMaiG/7dx5ZP7GXP/\nawD8wPX5QwCXAP4lrgpwfBDA1918irODlLne+ujtoNmeaTta0z7WcjKVHxsLwmGO7E0K6NjCOdZ8\np8Rtj9TUbXR96jqWYqeWBRf+scLlfkRQTdia9DXFllp4SiZWV1eT5afVpJ8i/Ng9m4OvaXxqCVRZ\ni+XSpzb5tNwxzTY2UhsFDdCzmxxH/xg0D7+nCpVl2SmAv3s9HBGUme5jDSUo/EdHR3nZTY2616ja\n2DHm/9c4ALUm3MSHT2HX6FndzatA67VdwGLR9jFfoAY2+QIwfrjcjx7WpM9zavba5jUVnFpG+FmW\nlabPpYrqqMUvVqa70+kUNtgq+7HoeztYBVNddXodK6PNo8YL2OBFR3/wWvp3gJS/PpXvrjXvtRMV\nu1ExyCaVQlOWR1/WLKMfcIGJpdvQD6+pNex61Wg0usrlWrIvqyOeKjjiwu+YJrDMLjX709PTwjMf\ni33RPu83IXytkmkzfmKVM+05cNW+FkA0pTZmlue5Er3W1uA9Gw9UZsJ3mR8cTvi3iJRJ36bcWVMa\ni+nQpL+3t4dnz57h6dOn2N7exuHhYVd1PS2UESuCMWwOPYAus57Wv9fdvKbXaS59Wa18FhZJCbgV\nchd6xyShn+eRm3ySvX3OU7ExGqgaI0Xrw48NK//9rAk8Z5CezjGVbmevmW5Lotdc+/X19Wh6byzV\n13EzO
OEPgF6EaHfHsfPUSJnY2Pxme3s71+rZfpJpdlo8JxaZ2wu6wOh1LOdXj0tLS127eB3r6+t5\n1zoeVcCtydL668rm63DcJlQWVAPVHhBWA19ZWUluti2RxtYWW0AnFqQa0+5Vw7flcWNrQmzzrJt5\nuxbQepcaqdoajNBX955WzFxdXR3Df86hcMIfAlZoUz3nY92nbEML60ezzSp2d3fzsbe3l8yn15a1\ng5rl7Yg1uuCgKS9V057CreY67U9PLd7m17qJzjFpsES/tLSUb9TVlK0xKSS4cF3+9iaBsdYKyNx8\n/kwD9Gwci65H2tXS/l082nPrQtOhzak04p4xO2VBe9TyvcXt3cAJ/wZI5dKTnG1jCWrbltjttSV7\nnp+enhaK52gBHb5/jOwHjbSPdbHSQDwNJNKUu1Q+vc2p5aKg3etiufQOxyRBtV2VExaZKZMHAF0b\neyCt1Vso2WvzHZrtY9Y3avhW0Yh9pg2C43WqLe/S0lJXIK4NyrXlb+33w82QTbdzjB9O+DeE9XfZ\ndBrbZlbNa7Hc1lTeK9+TFbC0KhZLYMYC8/otnqOLmY2OpU9eC+KUpcqlyD82VldXC1YD1/Adkwyb\ngkpYMrOkBqCwgQeek3g/sMFyqvVbl5ue2yBga1Uo+11tcGPT5FZXV/PYnNhYW1tLbhT0PbUBjtfQ\nuD044Q+IlO+NO2pNodPqdtpq1vrTqKGnWtEyaC/WvpZFMmJRt4OY9G0uPYf1z2mPeWrqdrGLRdzH\nAvNiEbn9pNnYAiYOxzhhTfoqVxqhHmsIk2VXZbH5vJK8aZovA59zjZSn9k6tvyxTxUbd2zUhFqOj\nGTd288JzddHZbpb1er2wibfuwFisg3exvD044d8AsfQ1TavRnHkG16n5PdZysixXPtabWn/HkvxN\nTfq6eJGwbd6sHumvSw0V6NhuP1UW1OGYJCgxxsrIKtlbPz7N7gAKWne/BKcKBck/FcFurWO96mtY\nM75uvvl32VK4tVqtEF1v+12sr69Hswb0vWMbAZf724ET/ghAQVINX3Pmd3d3cXh4mMyVZ8U7S+qq\n5acKYahvLpV61wuq4VtfG1vUqoBrTn2sKpZG2pfl1Zal3TkckwRq1nq9sLBQMOnHgvZI7qrZawpe\nL6j53pK1Hu29VLqdblZ4tMTMDbkG39JHzzRbZt7YLJxGo1FqeYht8jVDyDFezB3h9xsoE7u2UbM2\nj17N+a1WC/v7+9jd3cWzZ8/QarWi7SZ5HdPytUjGqGAXCFslj7t67V7FnvQczKfX1DqbXkffXD+a\niMMxyYilrVJzv7y87CJ8lR+a3i3hk1xTa03Zeb9zjl1ba0WsqJXm1JPstUiOrgW2vkaj0Rhono7b\nxdwRPlDeijZVfSpVkUoLZ1Cb1/S5vb09HBwcFIrjxArkxLrWDSrkChV4q1nrcWVlpeCXt+e2QMb6\n+jpqtVpuvlRTfVlFLIdjmhGzRqmWz0JTjUajsElfW1tLtq4+OTkprYI5rPzHzOk2Xkd96+q7t9H3\ndOMp+TPF1lPrpgdzR/jW/G219FiUvJ7bSniaWkd/Pevcc9CHnyqMY/PubWnLQWBJ1kbgx6LwbU6t\nTaWzgq+CrouGmvBj89CjwzFtUMJn/jtli4TPjBkG2NVqtWiXS5K+deHp+bBztUQeK9EbS7u1KbU2\nvVZTbD2Xfrowd4QPoIvktQ1tqkQt/ex2A6BHdq+z3exarRaOj49LI/FjhXmGIXs9V9+8HfTTx9pV\nat68jdTXrnXWJBjThhyOaYX6xPVZVitZtVotaPbMZT86OsrbWGtLa57brpbA8wDgYeds0+s0cybV\nw8K6JWw6re14qYTvcj75mDvCt2Sv5nmNso/1lI9FyOs9zY9XQWe+fKrdJLV66z4YRMNPBb4p4acE\nl1q7avA04fXKvbdVuGxqXSyoyOGYNsR84tTiqeGT7HUTYNeEo6MjVKv
VwtrQbrfzjbIqHsPOV2vc\n2xbVscY2tgd9r9bVmkvvGv50YO4IH0CXr55aNftSax69EjeD61Tr16NuDuywHapisQGxuvuDIJaL\na5tbqElOSV67WOl5qmpWrGOXLe/pZO+YJcSi4Enuluwpa7qOHB4e5prx0dFRl3as1TqHlRdq+Er4\n6ppL1cDnMRaEy3s2j96r5U0P5o7wrd9etW5L+GqSp1k+1Xe+16AWH8uNLWtRe9PAHSV+K/haNMO2\np7SD0fa2UxfvxdJuPL3OMWuIPcu8t7y8DKCb7M/OzvLqmIeHhwVCVQsZ0N0edxTzLZP7mAtPffax\nvvQarxPLqXfCn3zMHeED6DLpk/RJ+NTobQAetXyteqfV78oK5mjTi9ix7LwflOW9qknftqy17Slt\n5Szu3GN5syk/vRO9Y1YRC0Llpnh5ebmrKVa73S5o0a1Wq2AaB4pkPyp/uAbtcQPC1rQbGxuFTb21\n7tVqtWi6nrrtUuuNY7Ixd4RvNXslZRK47spbrVZeQEfTaSzxn5yclDbFiRW8IGJ+936EKJYiFKty\nZXNmm81mfmQBHbsAcLdPbSM1R4dj1tHrOaf/nQ1t1GoXk2kl0zLr3unpafTzYgV4LCqVSjJXfnNz\nsyvdTtPuarVaci3pp/S1Y3Ixl4RPQrZlblkhT4ea9Fke13bCsyl7qaA7qxnruc2Rt+cpqBYfa2O7\ntLTUVfNaNXgG6dHUSB9dqmiOw+HoBn3wqbQ9mtYZl0OZ4u8sLS0VsmaazWYeuDeItY+fv7KykpT7\nRqNRSLWLda6zsTgu/7OBuSN8m2+vle80cpZkr6TPSH07LOHHgu6sFm4JPeUj5znfIwZtemODaVZW\nVrraV+q5kr0GEcU613mancORhpULW7JatX7ep+yvrq7mgXXNZhOtVmuoXHwW1IrJPPvR26h8jbZ3\nwp9NzB3hU8NnKp2m4Wl3u5iGb/30Npc+1q0upuFbn5gGxVjSphAqrPDZtpN22NxZm0drU21Uw3eS\ndzh6IyUf6ttXstcgONXsG41GnqPPXPybaPhLS0vRXHq15Nnce86Hsh/z1TumG3NL+Fpkh9o9e83r\nUYk/lkuvx1THOu1wpbt+HWWEvby8HM0DJhiQl8qrTbWupUavmwueW5N+7HMdDke8MA/vUdatZq+F\nsEj2ttfGMD00FhcXe64p1hKo5bGd6GcTc0f4NmCPGr5WwVKyVw3/9PS0tMZ+rLGOLgRqyrepLtpl\nyx5XVlaSpBtCyLX4VN/6lPVAP99uQspM+g6Ho4gyDT9G9lyDWKEv1jyLLsFBi28Bzyv9pdLrrMvQ\nHvl+HsczW5g7wichxzR8S/aW9M/OzpKknmpDaU36KTM+c2Vjo1Kp5L+v78VjpVLp8tXpdSxnNlYd\nz313DsfNEbPCaVBup9PJfflcP2wRLtv2epi5pAKAbUXMmOzbv8fXgdnAXBK+NenbVDyr4fM4TBBN\nyqRv+8/HCmJUq9Vo/i+P1Wq1q4COnqdyZvsldDVZ6j2Hw1FEjPRnpSBNbB1wTBfmjvA1aKZSqRQC\n7ghbjpZ5qsMQPqte2S5VvE41qanValhdXS0lfLaztG1rY53rbkL4+nkOh+M5YnIxy7Iyy3/bPGDu\nCN+SOU1owPO+8Yx61zSZzc3NoTpY8X1j/jRuPlKNLNSHHzOzqQ8/llrH17qZ3uFwOOYXAxF+COEb\nAXwJgM8EcAzgFwB8Q5ZlvyWv+RkAf0F+LQPwPVmWfe3Qsx0BlPBVs1czu02ToW9/WMK3kfmpKH3b\nrpK1ulM+NW5eNCK/V/EcJ31Hv5gFuXc4HINr+O8E8J0Afvn6d78VwE+EEN6eZdnx9WsyAP8UwH8P\ngKzSHsFcRwIl/E6nUzDx059Ozd62yB20e50iXFfTitWmtuZ+myIXq62t19wwqKsgpuHb33M4+sTU\ny73D4RiQ8LMse49ehxC+GsCbAO4
D+LD8qJ1l2dOhZzcGkPBVs6cJn5Hy2vJWS+8OGzWbqrLXS/tX\n4o69byzNj61rU5H9Dke/mAW5dzgcw/vwN3C1s98x978ihPBfAngDwI8C+BbRBO4UJFvtcGWL6PRq\ngHMTaJS+PY9tBOyxn7/JptrZjYITvWNEmDq5dzgcQxB+uGKP9wH4cJZlvyE/+ucAfg/AJwB8NoBv\nA/A2AH9jiHmODGpa1/z5WOEce29Un18WcX+TSPph0+4cjn4xrXLvcDiG0/DfD+CzAHy+3syy7J/J\n5a+HEN4A8JMhhE/LsuzjQ3zeSDArObEOxx1hKuXe4XDckPBDCN8F4D0A3pll2Sd7vPyXcBXE81YA\nScF/+PAhms1m4d6DBw/w4MGDm0zR4Zh5PHr0CI8ePSrc29/fH9vnudw7HHeLYWU+DOqXvhb6LwLw\nhVmW/fs+Xv/5AH4OwOdkWfZrkZ+/AuD1119/Ha+88spAc3E4HEU8fvwY9+/fB4D7WZY9HtX7utw7\nHJOJQWR+0Dz89wN4AOBVAEchhLdc/2g/y7KTEMKnA/hyAD8G4BmAzwHwHQB+Nib0Dodj8uFy73DM\nBgY16f9tXEXn/oy5/zUAfgDAGYC/BODvAVgD8AcA/gWA/3moWTocjruEy73DMQMYNA+/NOIty7I/\nBPCuYSbkcDgmCy73DsdswEPWHQ6Hw+GYAzjhOxwOh8MxB3DCdzgcDodjDuCE73A4HA7HHMAJ3+Fw\nOByOOYATvsPhcDgccwAnfIfD4XA45gATSfi2VvCkYBLnNYlzAnxeg2JS53WbmMTvYBLnBPi8BsUk\nzusu5uSEPwAmcV6TOCfA5zUoJnVet4lJ/A4mcU6Az2tQTOK8nPAdDofD4XCMBU74DofD4XDMAZzw\nHQ6Hw+GYAwzaLW8cqADAxz72sfzG/v4+Hj8eWSvvkWES5zWJcwJ8XoNiVPMSOaoM/WbjxVTI/STO\nCfB5DYpJnNddyHzIsmzoDxwGIYQvB/DP73QSDsfs4SuyLPuhu55ECi73DsfI0VPmJ4Hw7wH4ywB+\nF8DJnU7G4Zh+VAD8SQAfyrLs2R3PJQmXe4djZOhb5u+c8B0Oh8PhcIwfHrTncDgcDsccwAnf4XA4\nHI45gBO+w+FwOBxzACd8h8PhcDjmABNF+CGErwshfDyEcBxC+EgI4c/d8XzeG0LomPEbdzCPd4YQ\nPhBC+KPrObwaec0/CCF8IoTQDiH86xDCW+96XiGE74t8fz825jl9YwjhoyGEgxDCkxDCvwohvM28\nZjWE8N0hhO0QQiuE8C9DCC9NwLx+xnxXlyGE949zXpMAl/vkPFzu+5+Ty30fmBjCDyF8GYBvB/Be\nAH8WwP8D4EMhhBfudGLArwF4C4CXr8cX3MEc1gD8CoCvA9CVVhFC+AYAfwfAfwPg8wAc4eq7W7nL\neV3jx1H8/h6MeU7vBPCdAP48gL8EYBnAT4QQqvKa9wH4qwD+OoC/AOA/APAjEzCvDMA/xfPv61MA\nfP2Y53WncLkvhct9/3C57wdZlk3EAPARAP9YrgOAPwTw9Xc4p/cCeHzX342ZUwfAq+beJwA8lOsG\ngGMAX3rH8/o+AP/HHX9fL1zP7QvkuzkF8CXymj99/ZrPu6t5Xd/7aQDfcdfP2C3/f1zu+5uTy/1g\n83K5j4yJ0PBDCMsA7gP4Kd7Lrr6JnwTwjrua1zX+1LXp6ndCCD8YQvhjdzyfAkIIn4arXaF+dwcA\nfgl3/90BwLuuTVn/LoTw/hDC1i1//gaudtA719f3cVVSWr+v3wTw+7jd78vOi/iKEMLTEMKvhhD+\nodEEZgou9zeHy31PuNxHMAm19IGrXc8igCfm/hNc7cLuCh8B8NUAfhNXZpZvBvBzIYQ/k2XZ0R3O\nS/Eyrh6g2Hf38u1Pp4Afx5XJ7OMAPgPAtwL4sRDCO64X9rEihBBwZcb7cJZl9MG+DODsenFU3Nr3\
nlZgXcFVq9vdwpbl9NoBvA/A2AH/jNuZ1B3C5vzlc7hNwuU9jUgg/hYC0j2jsyLLsQ3L5ayGEj+Lq\nH/OluDJbTTLu9LsDgCzLflgufz2E8KsAfgfAu3Blxho33g/gs9Cf//U2vy/O6/P1ZpZl/0wufz2E\n8AaAnwwhfFqWZR+/pblNAlzubw6Xe5f7JCbCpA9gG8AlroIWFC+hewd7Z8iybB/AbwEYeyTsAHgD\nVw/tRH93AHD98G7jFr6/EMJ3AXgPgHdlWfYJ+dEbAFZCCA3zK7fyfZl5fbLHy38JV//bSXreRgmX\n+5vD5T4Cl/tyTAThZ1l2DuB1AO/mvWvzx7sB/MJdzcsihFDHlYmq1z/s1nAtTG+g+N01cBUVOjHf\nHQCEED4VwD2M+fu7Fq4vAvAXsyz7ffPj1wFcoPh9vQ3AHwfwi3c4rxj+LK60j4l53kYJl/ubw+U+\n+jku971w29GTJdGLX4qrCNOvAvCZAL4HwDMAL97hnP4RrtI3/gSA/xTAv8bVbvDeLc9jDcDnAPiP\ncRXh+d9dX/+x659//fV39dcA/EcA/k8A/x+Albua1/XPvg1XC9CfwJWg/TKAjwFYHuOc3g9gF1fp\nMG+RUTGv+TiuTIz3Afw8gH875u+qdF4APh3ANwF45fr7ehXAbwP4N3fx7N/is+1yn56Hy33/c3K5\n72c+t/kA9/HlfC2u2mUe42rX9bl3PJ9HuEoROsZVNOcPAfi0O5jHF14L1qUZ3yuv+WZcBX20AXwI\nwFvvcl64atn4QVxpIScA/j2A/3XcC3liPpcAvkpes4qr3NhtAC0A/wLAS3c5LwCfCuBnADy9/h/+\nJq6Cneq3/bzd9nC5T87D5b7/Obnc9zG8Pa7D4XA4HHOAifDhOxwOh8PhGC+c8B0Oh8PhmAM44Tsc\nDofDMQdwwnc4HA6HYw7ghO9wOBwOxxzACd/hcDgcjjmAE77D4XA4HHMAJ3yHw+FwOOYATvgOh8Ph\ncMwBnPAdDofD4ZgDOOE7HA6HwzEHcMJ3OBwOh2MO8P8Dr1I8gqTY9vYAAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f8444461630>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "print('Training data shape', train_data.shape)\n",
-    "_, (ax1, ax2) = plt.subplots(1, 2)\n",
-    "ax1.imshow(train_data[0].reshape(28, 28), cmap=plt.cm.Greys);\n",
-    "ax2.imshow(train_data[1].reshape(28, 28), cmap=plt.cm.Greys);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "cwBhQ3ouTQcW"
-   },
-   "source": [
-    "Looks good. Now we know how to index our full set of training and test images."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PBCB9aYxRvBi"
-   },
-   "source": [
-    "### Label data\n",
-    "\n",
-    "Let's move on to loading the full set of labels. As is typical in classification problems, we'll convert our input labels into a [1-hot](https://en.wikipedia.org/wiki/One-hot) encoding over a length 10 vector corresponding to 10 digits. The vector [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], for example, would correspond to the digit 1."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:23.854577",
-     "start_time": "2016-09-16T14:49:23.831545"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 191,
-     "status": "ok",
-     "timestamp": 1446749131421,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "9pK1j2WlRwY9",
-    "outputId": "1ca31655-e14f-405a-b266-6a6c78827af5"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracting /tmp/mnist-data/train-labels-idx1-ubyte.gz\n",
-      "Extracting /tmp/mnist-data/t10k-labels-idx1-ubyte.gz\n"
-     ]
-    }
-   ],
-   "source": [
-    "NUM_LABELS = 10\n",
-    "\n",
-    "def extract_labels(filename, num_images):\n",
-    "    \"\"\"Extract the labels into a 1-hot matrix [image index, label index].\"\"\"\n",
-    "    print('Extracting', filename)\n",
-    "    with gzip.open(filename) as bytestream:\n",
-    "        # Skip the magic number and count; we know these values.\n",
-    "        bytestream.read(8)\n",
-    "        buf = bytestream.read(1 * num_images)\n",
-    "        labels = numpy.frombuffer(buf, dtype=numpy.uint8)\n",
-    "    # Convert to dense 1-hot representation.\n",
-    "    return (numpy.arange(NUM_LABELS) == labels[:, None]).astype(numpy.float32)\n",
-    "\n",
-    "train_labels = extract_labels(train_labels_filename, 60000)\n",
-    "test_labels = extract_labels(test_labels_filename, 10000)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "hb3Vaq72UUxW"
-   },
-   "source": [
-    "As with our image data, we'll double-check that our 1-hot encoding of the first few values matches our expectations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:23.864350",
-     "start_time": "2016-09-16T14:49:23.857177"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 127,
-     "status": "ok",
-     "timestamp": 1446749132853,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "uEBID71nUVj1",
-    "outputId": "3f318310-18dd-49ed-9943-47b4aae7ee69"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Training labels shape (60000, 10)\n",
-      "First label vector [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]\n",
-      "Second label vector [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('Training labels shape', train_labels.shape)\n",
-    "print('First label vector', train_labels[0])\n",
-    "print('Second label vector', train_labels[1])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "5EwtEhxRUneF"
-   },
-   "source": [
-    "The 1-hot encoding looks reasonable.\n",
-    "\n",
-    "### Segmenting data into training, test, and validation\n",
-    "\n",
-    "The final step in preparing our data is to split it into three sets: training, test, and validation. This isn't the format of the original data set, so we'll take a small slice of the training data and treat that as our validation set."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:23.874014",
-     "start_time": "2016-09-16T14:49:23.866161"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 176,
-     "status": "ok",
-     "timestamp": 1446749134110,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "e7aBYBtIVxHE",
-    "outputId": "bdeae1a8-daff-4743-e594-f1d2229c0f4e"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Validation shape (5000, 28, 28, 1)\n",
-      "Train size 55000\n"
-     ]
-    }
-   ],
-   "source": [
-    "VALIDATION_SIZE = 5000\n",
-    "\n",
-    "validation_data = train_data[:VALIDATION_SIZE, :, :, :]\n",
-    "validation_labels = train_labels[:VALIDATION_SIZE]\n",
-    "train_data = train_data[VALIDATION_SIZE:, :, :, :]\n",
-    "train_labels = train_labels[VALIDATION_SIZE:]\n",
-    "\n",
-    "train_size = train_labels.shape[0]\n",
-    "\n",
-    "print('Validation shape', validation_data.shape)\n",
-    "print('Train size', train_size)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "1JFhEH8EVj4O"
-   },
-   "source": [
-    "# Defining the model\n",
-    "\n",
-    "Now that we've prepared our data, we're ready to define our model.\n",
-    "\n",
-    "The comments describe the architecture, which fairly typical of models that process image data. The raw input passes through several [convolution](https://en.wikipedia.org/wiki/Convolutional_neural_network#Convolutional_layer) and [max pooling](https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer) layers with [rectified linear](https://en.wikipedia.org/wiki/Convolutional_neural_network#ReLU_layer) activations before several fully connected layers and a [softmax](https://en.wikipedia.org/wiki/Convolutional_neural_network#Loss_layer) loss for predicting the output class. During training, we use [dropout](https://en.wikipedia.org/wiki/Convolutional_neural_network#Dropout_method).\n",
-    "\n",
-    "We'll separate our model definition into three steps:\n",
-    "\n",
-    "1. Defining the variables that will hold the trainable weights.\n",
-    "1. Defining the basic model graph structure described above. And,\n",
-    "1. Stamping out several copies of the model graph for training, testing, and validation.\n",
-    "\n",
-    "We'll start with the variables."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:28.803525",
-     "start_time": "2016-09-16T14:49:23.875999"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 2081,
-     "status": "ok",
-     "timestamp": 1446749138298,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "Q1VfiAzjzuK8",
-    "outputId": "f53a39c9-3a52-47ca-d7a3-9f9d84eccf63"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done\n"
-     ]
-    }
-   ],
-   "source": [
-    "import tensorflow as tf\n",
-    "\n",
-    "# We'll bundle groups of examples during training for efficiency.\n",
-    "# This defines the size of the batch.\n",
-    "BATCH_SIZE = 60\n",
-    "# We have only one channel in our grayscale images.\n",
-    "NUM_CHANNELS = 1\n",
-    "# The random seed that defines initialization.\n",
-    "SEED = 42\n",
-    "\n",
-    "# This is where training samples and labels are fed to the graph.\n",
-    "# These placeholder nodes will be fed a batch of training data at each\n",
-    "# training step, which we'll write once we define the graph structure.\n",
-    "train_data_node = tf.placeholder(\n",
-    "  tf.float32,\n",
-    "  shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))\n",
-    "train_labels_node = tf.placeholder(tf.float32,\n",
-    "                                   shape=(BATCH_SIZE, NUM_LABELS))\n",
-    "\n",
-    "# For the validation and test data, we'll just hold the entire dataset in\n",
-    "# one constant node.\n",
-    "validation_data_node = tf.constant(validation_data)\n",
-    "test_data_node = tf.constant(test_data)\n",
-    "\n",
-    "# The variables below hold all the trainable weights. For each, the\n",
-    "# parameter defines how the variables will be initialized.\n",
-    "conv1_weights = tf.Variable(\n",
-    "  tf.truncated_normal([5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.\n",
-    "                      stddev=0.1,\n",
-    "                      seed=SEED))\n",
-    "conv1_biases = tf.Variable(tf.zeros([32]))\n",
-    "conv2_weights = tf.Variable(\n",
-    "  tf.truncated_normal([5, 5, 32, 64],\n",
-    "                      stddev=0.1,\n",
-    "                      seed=SEED))\n",
-    "conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))\n",
-    "fc1_weights = tf.Variable(  # fully connected, depth 512.\n",
-    "  tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],\n",
-    "                      stddev=0.1,\n",
-    "                      seed=SEED))\n",
-    "fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))\n",
-    "fc2_weights = tf.Variable(\n",
-    "  tf.truncated_normal([512, NUM_LABELS],\n",
-    "                      stddev=0.1,\n",
-    "                      seed=SEED))\n",
-    "fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))\n",
-    "\n",
-    "print('Done')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "QHB_u04Z4HO6"
-   },
-   "source": [
-    "Now that we've defined the variables to be trained, we're ready to wire them together into a TensorFlow graph.\n",
-    "\n",
-    "We'll define a helper to do this, `model`, which will return copies of the graph suitable for training and testing. Note the `train` argument, which controls whether or not dropout is used in the hidden layer. (We want to use dropout only during training.)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:28.834326",
-     "start_time": "2016-09-16T14:49:28.805723"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 772,
-     "status": "ok",
-     "timestamp": 1446749138306,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "V85_B9QF3uBp",
-    "outputId": "457d3e49-73ad-4451-c196-421dd4681efc"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done\n"
-     ]
-    }
-   ],
-   "source": [
-    "def model(data, train=False):\n",
-    "    \"\"\"The Model definition.\"\"\"\n",
-    "    # 2D convolution, with 'SAME' padding (i.e. the output feature map has\n",
-    "    # the same size as the input). Note that {strides} is a 4D array whose\n",
-    "    # shape matches the data layout: [image index, y, x, depth].\n",
-    "    conv = tf.nn.conv2d(data,\n",
-    "                        conv1_weights,\n",
-    "                        strides=[1, 1, 1, 1],\n",
-    "                        padding='SAME')\n",
-    "\n",
-    "    # Bias and rectified linear non-linearity.\n",
-    "    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))\n",
-    "\n",
-    "    # Max pooling. The kernel size spec ksize also follows the layout of\n",
-    "    # the data. Here we have a pooling window of 2, and a stride of 2.\n",
-    "    pool = tf.nn.max_pool(relu,\n",
-    "                          ksize=[1, 2, 2, 1],\n",
-    "                          strides=[1, 2, 2, 1],\n",
-    "                          padding='SAME')\n",
-    "    conv = tf.nn.conv2d(pool,\n",
-    "                        conv2_weights,\n",
-    "                        strides=[1, 1, 1, 1],\n",
-    "                        padding='SAME')\n",
-    "    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))\n",
-    "    pool = tf.nn.max_pool(relu,\n",
-    "                          ksize=[1, 2, 2, 1],\n",
-    "                          strides=[1, 2, 2, 1],\n",
-    "                          padding='SAME')\n",
-    "\n",
-    "    # Reshape the feature map cuboid into a 2D matrix to feed it to the\n",
-    "    # fully connected layers.\n",
-    "    pool_shape = pool.get_shape().as_list()\n",
-    "    reshape = tf.reshape(\n",
-    "        pool,\n",
-    "        [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])\n",
-    "  \n",
-    "    # Fully connected layer. Note that the '+' operation automatically\n",
-    "    # broadcasts the biases.\n",
-    "    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)\n",
-    "\n",
-    "    # Add a 50% dropout during training only. Dropout also scales\n",
-    "    # activations such that no rescaling is needed at evaluation time.\n",
-    "    if train:\n",
-    "        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)\n",
-    "    return tf.matmul(hidden, fc2_weights) + fc2_biases\n",
-    "\n",
-    "print('Done')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "7bvEtt8C4fLC"
-   },
-   "source": [
-    "Having defined the basic structure of the graph, we're ready to stamp out multiple copies for training, testing, and validation.\n",
-    "\n",
-    "Here, we'll do some customizations depending on which graph we're constructing. `train_prediction` holds the training graph, for which we use cross-entropy loss and weight regularization. We'll adjust the learning rate during training -- that's handled by the `exponential_decay` operation, which is itself an argument to the `MomentumOptimizer` that performs the actual training.\n",
-    "\n",
-    "The validation and prediction graphs are much simpler to generate -- we need only create copies of the model with the validation and test inputs and a softmax classifier as the output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.058141",
-     "start_time": "2016-09-16T14:49:28.836169"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 269,
-     "status": "ok",
-     "timestamp": 1446749139596,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "9pR1EBNT3sCv",
-    "outputId": "570681b1-f33e-4618-b742-48e12aa58132"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Training computation: logits + cross-entropy loss.\n",
-    "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
-    "  labels=train_labels_node, logits=logits))\n",
-    "\n",
-    "# L2 regularization for the fully connected parameters.\n",
-    "regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +\n",
-    "                tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))\n",
-    "# Add the regularization term to the loss.\n",
-    "loss += 5e-4 * regularizers\n",
-    "\n",
-    "# Optimizer: set up a variable that's incremented once per batch and\n",
-    "# controls the learning rate decay.\n",
-    "batch = tf.Variable(0)\n",
-    "# Decay once per epoch, using an exponential schedule starting at 0.01.\n",
-    "learning_rate = tf.train.exponential_decay(\n",
-    "  0.01,                # Base learning rate.\n",
-    "  batch * BATCH_SIZE,  # Current index into the dataset.\n",
-    "  train_size,          # Decay step.\n",
-    "  0.95,                # Decay rate.\n",
-    "  staircase=True)\n",
-    "# Use simple momentum for the optimization.\n",
-    "optimizer = tf.train.MomentumOptimizer(learning_rate,\n",
-    "                                       0.9).minimize(loss,\n",
-    "                                                     global_step=batch)\n",
-    "\n",
-    "# Predictions for the minibatch, validation set and test set.\n",
-    "train_prediction = tf.nn.softmax(logits)\n",
-    "# We'll compute them only once in a while by calling their {eval()} method.\n",
-    "validation_prediction = tf.nn.softmax(model(validation_data_node))\n",
-    "test_prediction = tf.nn.softmax(model(test_data_node))\n",
-    "\n",
-    "print('Done')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "4T21uZJq5UfH"
-   },
-   "source": [
-    "# Training and visualizing results\n",
-    "\n",
-    "Now that we have the training, test, and validation graphs, we're ready to actually go through the training loop and periodically evaluate loss and error.\n",
-    "\n",
-    "All of these operations take place in the context of a session. In Python, we'd write something like:\n",
-    "\n",
-    "    with tf.Session() as s:\n",
-    "      ...training / test / evaluation loop...\n",
-    "  \n",
-    "But, here, we'll want to keep the session open so we can poke at values as we work out the details of training. The TensorFlow API includes a function for this, `InteractiveSession`.\n",
-    "\n",
-    "We'll start by creating a session and initializing the variables we defined above."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.357483",
-     "start_time": "2016-09-16T14:49:29.059952"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "collapsed": true,
-    "id": "z6Kc5iql6qxV"
-   },
-   "outputs": [],
-   "source": [
-    "# Create a new interactive session that we'll use in\n",
-    "# subsequent code cells.\n",
-    "s = tf.InteractiveSession()\n",
-    "\n",
-    "# Use our newly created session as the default for \n",
-    "# subsequent operations.\n",
-    "s.as_default()\n",
-    "\n",
-    "# Initialize all the variables we defined above.\n",
-    "tf.global_variables_initializer().run()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "hcG8H-Ka6_mw"
-   },
-   "source": [
-    "Now we're ready to perform operations on the graph. Let's start with one round of training. We're going to organize our training steps into batches for efficiency; i.e., training using a small set of examples at each step rather than a single example."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.584699",
-     "start_time": "2016-09-16T14:49:29.359107"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 386,
-     "status": "ok",
-     "timestamp": 1446749389138,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "LYVxeEox71Pg",
-    "outputId": "9184b5df-009a-4b1b-e312-5be94351351f"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done\n"
-     ]
-    }
-   ],
-   "source": [
-    "BATCH_SIZE = 60\n",
-    "\n",
-    "# Grab the first BATCH_SIZE examples and labels.\n",
-    "batch_data = train_data[:BATCH_SIZE, :, :, :]\n",
-    "batch_labels = train_labels[:BATCH_SIZE]\n",
-    "\n",
-    "# This dictionary maps the batch data (as a numpy array) to the\n",
-    "# node in the graph it should be fed to.\n",
-    "feed_dict = {train_data_node: batch_data,\n",
-    "             train_labels_node: batch_labels}\n",
-    "\n",
-    "# Run the graph and fetch some of the nodes.\n",
-    "_, l, lr, predictions = s.run(\n",
-    "  [optimizer, loss, learning_rate, train_prediction],\n",
-    "  feed_dict=feed_dict)\n",
-    "\n",
-    "print('Done')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "7bL4-RNm_K-B"
-   },
-   "source": [
-    "Let's take a look at the predictions. How did we do? Recall that the output will be probabilities over the possible classes, so let's look at those probabilities."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.593985",
-     "start_time": "2016-09-16T14:49:29.586233"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 160,
-     "status": "ok",
-     "timestamp": 1446749519023,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "2eNitV_4_ZUL",
-    "outputId": "f1340dd1-255b-4523-bf62-7e3ebb361333"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[  2.25393116e-04   4.76219611e-05   1.66867452e-03   5.67827519e-05\n",
-      "   6.03432178e-01   4.34969068e-02   2.19316553e-05   1.41286102e-04\n",
-      "   1.54903100e-05   3.50893795e-01]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(predictions[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "X5MgraJb_eQZ"
-   },
-   "source": [
-    "As expected without training, the predictions are all noise. Let's write a scoring function that picks the class with the maximum probability and compares with the example's label. We'll start by converting the probability vectors returned by the softmax into predictions we can match against the labels."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.606284",
-     "start_time": "2016-09-16T14:49:29.597095"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 220,
-     "status": "ok",
-     "timestamp": 1446750411574,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "wMMlUf5rCKgT",
-    "outputId": "2c10e96d-52b6-47b0-b6eb-969ad462d46b"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "First prediction 4\n",
-      "(60, 10)\n",
-      "All predictions [4 4 2 7 7 7 7 7 7 7 7 7 0 8 9 0 7 7 0 7 4 0 5 0 9 9 7 0 7 4 7 7 7 0 7 7 9\n",
-      " 7 9 9 0 7 7 7 2 7 0 7 2 9 9 9 9 9 0 7 9 4 8 7]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# The highest probability in the first entry.\n",
-    "print('First prediction', numpy.argmax(predictions[0]))\n",
-    "\n",
-    "# But, predictions is actually a list of BATCH_SIZE probability vectors.\n",
-    "print(predictions.shape)\n",
-    "\n",
-    "# So, we'll take the highest probability for each vector.\n",
-    "print('All predictions', numpy.argmax(predictions, 1))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "8pMCIZ3_C2ni"
-   },
-   "source": [
-    "Next, we can do the same thing for our labels -- using `argmax` to convert our 1-hot encoding into a digit class."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.615484",
-     "start_time": "2016-09-16T14:49:29.609168"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 232,
-     "status": "ok",
-     "timestamp": 1446750498351,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "kZWp4T0JDDUe",
-    "outputId": "47b588cd-bc82-45c3-a5d0-8d84dc27a3be"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Batch labels [7 3 4 6 1 8 1 0 9 8 0 3 1 2 7 0 2 9 6 0 1 6 7 1 9 7 6 5 5 8 8 3 4 4 8 7 3\n",
-      " 6 4 6 6 3 8 8 9 9 4 4 0 7 8 1 0 0 1 8 5 7 1 7]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('Batch labels', numpy.argmax(batch_labels, 1))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "bi5Z6whtDiht"
-   },
-   "source": [
-    "Now we can compare the predicted and label classes to compute the error rate and confusion matrix for this batch."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.841313",
-     "start_time": "2016-09-16T14:49:29.618274"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      },
-      {
-       "item_id": 2
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 330,
-     "status": "ok",
-     "timestamp": 1446751307304,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "U4hrLW4CDtQB",
-    "outputId": "720494a3-cbf9-4687-9d94-e64a33fdd78f"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.06666666666666667\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVgAAAFdCAYAAABGoXXzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAFfZJREFUeJzt3X2wZHV95/H3hwcdnrEk6CSaVYG4JDOFmcEHVEBFYcoq\nSTAVZYBKsRQJENmyZrfWSKHFGMqsa0rAqGRlfQBEoDAVY6iFQZFAlqeiYCqEkUGXJ2FhHASciQwz\nicP89o/TQ+7cebqnu3+nb/e8X1X9R5/p09/v7T7zub/+nXN/nVIKkqTh223UDUjSpDJgJakSA1aS\nKjFgJakSA1aSKjFgJakSA1aSKtmj5pMneTVwAvA4sKFmLUnqyBzgDcBNpZTndvTAqgFLE67frlxD\nkkbhVODqHT2gdsA+DsBeV8Huh/f3DOuXwF4X97Xrgtvu7K9mzyNLvsohF5/V177LFz4wUG1YBiwa\n8DlGUPvC/l6vl121BE7r7/3mfwxWepBjDQY73gY51mDQ421Mj7WR1X4W+FvYnG87UDtgm2mB3Q+H\nPRb09ww5oO9991vwTH81e/Y4YB/2W3Bon3s/O1Dt5lPI3AGfYwS139jn+7zZ3gf0/xyDHs0DHGsw\n2PE22LEGgx1vY3qsjb72Tqc9PcklSZUYsJJUiQErSZXM/oB9xeKRlT548XtGVhvm7Zq1jxrd++2x\nZu1hM2B3YLQH/fxds/Y7Ddju7aLHWge1Z3/AStKYMmAlqRIDVpIq6Stgk3wsyWNJ1ie5O8lbh92Y\nJI271gGb5KPAF4ALgN8F7gduSnLQkHuTpLHWzwh2CfDVUsqVpZSHgLOBF4EzhtqZJI25VgGbZE9g\nIfDDzdtK873fNwNHDbc1SRpvbUewBwG7A6unbV8NvHYoHUnShBjWaloBynb/df2SZqWiqV6xeKQX\ndkvSzj0ArJi2bebfHdA2YJ8FXgJeM237wWw9qv13e1080DJwkjQa89n6L75WAZfNaO9WUwSllF8B\n9wHHbd6WJL37g61uLUkTpp8pgouAK5LcB9xDc1XB3sDlQ+xLksZe64AtpVzXu+b1z2mmCv4JOKGU\n8vNhNydJ46yvk1yllEuBS4fciyRNFNcikKRKDFhJqsSAlaRKDFhJqsSAlaRKDFhJqsSAlaRKDFhJ\nqiTNcq6VnjxZANwHfwLMrVZHkrrz8mIvC0spy3f0SEewklSJAStJlRiwklSJAStJlRiwklSJAStJ\nlRiwklSJAStJlRiwklSJAStJlRiwklRJ64BNcnSSv0/yVJJNSU6s0Zgkjbt+RrD70HxV98eAeivF\nSNKYa/213aWUZcAygCQZekeSNCGcg5WkSgxYSaqk9RRBf5YBc6ZtmwfM76a8JPXlAWDFtG0bZrx3\nRwG7CL/RQNL4mc/WA8GXv9Fgp5wikKRKWo9gk+wDHApsvoLgTUmOAJ4vpTw5zOYkaZz1M0VwJPAP\nNNfAFuALve1XAGcMqS9JGnv9XAd7G04tSNJOGZSSVIkBK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCS\nVIkBK0mVdLPYy75nwR4LOim1hTVLu6+5qztw6ehq+37vekZxvG1cDi+42IskjZQBK0mVGLCSVIkB\nK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCSVEmrgE1yXpJ7kvxLktVJvpvkt2o1J0nj\nrO0I9mjgS8DbgfcDewLfT7LXsBuTpHHXajWtUsoHp95PcjrwDLAQuH14bUnS+Bt0DvZAoADPD6EX\nSZoofQdskgCXALeXUh4cXkuSNBkGWXD7UuC3gXft9JHrl0AO2HLbKxY3N0marf7tmuY2VVk74937\nCtgkXwY+CBxdSlm10x32ung032ggSYPY1kBw43J4YeGMdm8dsL1w/T3g2FLKE233l6RdRauATXIp\nsBg4EViX5DW9f1pbStkw7OYkaZy1P
cl1NrA/cCvw9JTbR4bbliSNv7bXwfqntZI0QwamJFViwEpS\nJQasJFViwEpSJQasJFViwEpSJQasJFViwEpSJYOspjVzL3wVmNtJKY3YmqWj7kC7kpEcbztf32oz\nR7CSVIkBK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCSVIkBK0mVGLCSVEmrgE1ydpL7\nk6zt3e5MsqhWc5I0ztqOYJ8E/gxY2LvdAnwvyeHDbkySxl3br+3+39M2fSrJOcA7gJVD60qSJkDf\nyxUm2Q34CLA3cNfQOpKkCdE6YJPMownUOcAvgZNKKQ8NuzFJGnf9jGAfAo4ADgT+ALgyyTE7Dtll\nNHk81Txgfh/lJakrDwArpm3bMOO9WwdsKWUj8Gjv7vIkbwM+Dpyz/b0W4TcaSBo/89l6ILgKuGxG\new/jOtjdgFcO4XkkaaK0GsEm+SxwI83lWvsBpwLHAscPvzVJGm9tpwheA1xJ83l/LfDPwPGllFuG\n3Zgkjbu218GeWasRSZo0rkUgSZUYsJJUiQErSZUYsJJUiQErSZUYsJJUiQErSZUYsJJUSd/rwY6F\nA5eOrvaaEdbeVX/uXdlVS0dX+9zRlZ7tx5sjWEmqxICVpEoMWEmqxICVpEoMWEmqxICVpEoMWEmq\nxICVpEoMWEmqxICVpEoMWEmqZKCATXJekk1JLhpWQ5I0KfoO2CRvBf4YuH947UjS5OgrYJPsC1wF\nnAmsGWpHkjQh+h3BfgW4vpRyyzCbkaRJ0no92CQnA28Bjhx+O5I0OVoFbJLXAZcAHyil/Grmey4D\n5kzbNg+Y36a8JHXsAWDFtG0bZrx32xHsQuDXgPuSpLdtd+CYJOcCryyllK13WwTMbVlKkkZtPlsP\nBFcBl81o77YBe/M2ql0OrAQ+t+1wlaRdU6uALaWsAx6cui3JOuC5UsrKYTYmSeNuGH/J5ahVkrZh\n4G+VLaW8bxiNSNKkcS0CSarEgJWkSgxYSarEgJWkSgxYSarEgJWkSgxYSarEgJWkSgxYSapk4L/k\nmtXWLB11B6Oxq/7cu7JzR1jb4227HMFKUiUGrCRVYsBKUiUGrCRVYsBKUiUGrCRVYsBKUiUGrCRV\nYsBKUiUGrCRV0ipgk1yQZNO024M731OSdj39rEWwAjgOSO/+xuG1I0mTo5+A3VhK+fnQO5GkCdPP\nHOxhSZ5K8kiSq5K8fuhdSdIEaBuwdwOnAycAZwNvBP4xyT5D7kuSxl6rKYJSyk1T7q5Icg/wU+Aj\nwDe3v+cyYM60bfOA+W3KS1LHHqA57TTVhhnvPdCC26WUtUl+Ahy640cuAuYOUkqSRmA+Ww8EVwGX\nzWjvga6DTbIvcEivoiRpirbXwf5lkmOS/Ick7wS+S3OZ1jVVupOkMdZ2iuB1wNXAq4GfA7cD7yil\nPDfsxiRp3LU9ybW4ViOSNGlci0CSKjFgJakSA1aSKjFgJakSA1aSKjFgJakSA1aSKjFgJakSA1aS\nKhloNS1Js8SapaPuQNvgCFaSKjFgJakSA1aSKjFgJakSA1aSKjFgJakSA1aSKjFgJakSA1aSKjFg\nJamS1gGb5NeTfCvJs0leTHJ/kgU1mpOkcdZqLYIkBwJ3AD8ETgCeBQ4DfjH81iRpvLVd7OWTwBOl\nlDOnbPvpEPuRpInRdorgQ8C9Sa5LsjrJ8iRn7nQvSdoFtQ3YNwHnAD8Gjgf+J/BXSU4bdmOSNO7a\nThHsBtxTSvl07/79SX6HJnSv2v5uy4A507bNA+a3LC9JXXoAWDFt24YZ7902YFcBK6dtWwl8eMe7\nLQLmtiwlSaM2n60HgquAy2a0d9spgjuAN0/b9mY80SVJW2kbsBcD70hyXpJDkpwCnAl8efitSdJ4\naxWwpZR7gZOAxTSTE+cDHy+lXFuhN0kaa62/9LCUcgNwQ4VeJGmiuBaBJFViwEpSJQasJFViwEpS\nJ
QasJFViwEpSJQasJFViwEpSJQasJFXS+i+5+rHgvvnst+DQLkpt4bZvL+q85svOHV3pY3+xbGS1\nd9XXHHbd1/2C0zKy2reWGzuv+cvlD7N84cwe6whWkioxYCWpEgNWkioxYCWpEgNWkioxYCWpEgNW\nkioxYCWpEgNWkioxYCWpklYBm+SxJJu2cftSrQYlaVy1XYvgSGD3KffnA98HrhtaR5I0IVoFbCnl\nuan3k3wIeKSU8n+G2pUkTYC+52CT7AmcCnx9eO1I0uQY5CTXScABwBVD6kWSJsog68GeAdxYSvnZ\nzh74yJKvsscB+2yx7eDF7+Hgxe8ZoLwk1fXMNbfyzDW3brFt49p1M96/r4BN8pvA+4Hfn8njD7n4\nrJEsuC1Jg9jWQLBZcPs/z2j/fqcIzgBWAzf0ub8kTbzWAZskwOnA5aWUTUPvSJImRD8j2PcDrwe+\nOeReJGmitJ6DLaX8gC3/2ECStA2uRSBJlRiwklSJAStJlRiwklSJAStJlRiwklSJAStJlRiwklSJ\nAStJlaSUUu/JkwXAffAnwNxqdbbrwKXd15wN1iwddQfSBFsFXAawsJSyfEePdAQrSZUYsJJUiQEr\nSZUYsJJUiQErSZUYsJJUiQErSZUYsJJUiQErSZUYsJJUSauATbJbkguTPJrkxSQPJ/lUreYkaZy1\n/VbZTwJnAX8EPAgcCVyeZE0p5cvDbk6SxlnbgD0K+F4pZVnv/hNJTgHeNty2JGn8tZ2DvRM4Lslh\nAEmOAN4F3DDsxiRp3LUdwX4O2B94KMlLNAF9finl2qF3Jkljrm3AfhQ4BTiZZg72LcAXkzxdSvnW\n9ndbBsyZtm0eML9leUnq0gPAimnbNsx477YB+3ngL0op3+nd/1GSNwDnATsI2EWMZMFtSRrIfLYe\nCL684PZOtZ2D3RuY/hUIm/p4HkmaeG1HsNcD5yd5EvgRsABYAnxt2I1J0rhrG7DnAhcCXwEOBp4G\n/rq3TZI0RauALaWsA/5L7yZJ2gHnTiWpEgNWkioxYCWpEgNWkioxYCWpEgNWkioxYCWpEgNWkiox\nYCWpkrZ/KtufC8+CNy7opNQWzu2+5MvWLB1d7QNHWFsjcewvlu38QZXc9qpFI6s9EhuXwwt1VtOS\nJM2QAStJlRiwklSJAStJlRiwklSJAStJlRiwklSJAStJlRiwklSJAStJlbQO2CT7JrkkyeNJXkxy\ne5IjazQnSeOsnxHs14HjgFOBecAPgJuTzB1mY5I07loFbJI5wIeB/1ZKuaOU8mgp5TPAw8A5NRqU\npHHVdgS7B7A78K/Ttq8H3j2UjiRpQrQK2FLKC8BdwKeTzE2yW5LTgKMApwgkaYp+1oM9DfgG8BSw\nEVgOXA1sf8HXq5bA3gdsue2oxfDOxX2Ul6SO/Ns1zW2qsnbGu7cO2FLKY8B7k+wF7F9KWZ3kWuCx\n7e502sWjWXBbkgbxisXNbaqNy+GFhTPave/rYEsp63vh+irgBODv+n0uSZpErUewSY4HAvwYOAz4\nPLASuHyonUnSmOtnDvYA4L8DvwE8D/wN8KlSykvDbEySxl0/c7DfAb5ToRdJmiiuRSBJlRiwklSJ\nAStJlcz+gL3zmp0/ppbpFxh36oHRlR7lz23tzj1zza0jqz3pr/nsD9i7JvsN2L4Voys94Qe9tbdk\nwNYz+wNWksaUAStJlRiwklRJP3/J1cYcAJ5e2f8zvLgWHlve374b+y8LNKvmbOyzNqsGLL6h/+fo\nu+eegX7uAVm7L79c/nDf+25cu26g/Qd6zcbxNX/p5Tybs7OHppTSvsAMJTkF+Ha1ApI0OqeWUq7e\n0QNqB+yraVbaepxmSCZJ424O8AbgplLKczt6YNWAlaRdmSe5JKkSA1aSKjFgJakSA1aSKjFgJamS\nWRuwST6W5LEk65PcneStHdU9OsnfJ3kqyaYkJ3ZRt1f7vCT3JPm
XJKuTfDfJb3VU++wk9ydZ27vd\nmWRRF7Wn9XFe73W/qKN6F/TqTb092FHtX0/yrSTPJnmx9/p38vXLvf9b03/uTUm+VLnubkkuTPJo\n72d+OMmnatacVn/fJJckebxX//YkR9aqNysDNslHgS8AFwC/C9wP3JTkoA7K7wP8E/AxoOtr2I4G\nvgS8HXg/sCfw/d5XpNf2JPBnwMLe7Rbge0kO76A2AL1fon9M8353aQXwGuC1vdu7axdMciBwB/Cv\nNNeKHw78V+AXtWv3HMm//7yvBT5Ac7xfV7nuJ4GzgD8F/iPwCeATSc6tXHezrwPHAacC84AfADcn\nmVulWill1t2Au4EvTrkf4P8Bn+i4j03AiSN8HQ7q9fDuEdV/DvhPHdXal+abit8H/ANwUUd1LwCW\nj+C1/Rxw2yje1+30cwnwkw7qXA/8r2nb/ga4soPac4BfAYumbb8X+PMaNWfdCDbJnjQjqB9u3laa\nV+Fm4KhR9TUiB9KMKp7vsmjvY9zJwN7AXR2V/QpwfSnllo7qTXVYb0rokSRXJXl9BzU/BNyb5Lre\ndNDyJGd2UHcrvf9zp9KM7mq7EzguyWG92kcA7wJu6KD2HsDuNJ8aplpPpU8ttRd76cdBNC/C6mnb\nVwNv7r6d0UgSmlHF7aWUruYE59EE6hzgl8BJpZSHOqh7MvAWmo+tXbsbOJ1m9DwXWAr8Y5J5pZR1\nFeu+CTiHZirsszTTQn+VZEMp5aqKdbflJOAA4IoOan0O2B94KMlLNNOU55dSrq1duJTyQpK7gE8n\neYgmU06hGbj93xo1Z2PAbk/ofk50lC4Ffpvmt3tXHgKOoBk5/wFwZZJjaoZsktfR/CL5QCnlV7Xq\nbE8p5aYpd1ckuQf4KfAR4JsVS+8G3FNK+XTv/v1JfocmdLsO2DOAG0spP+ug1kdpQu1k4EGaX6xf\nTPJ0KeVbHdQ/DfgG8BTNenvLgauBKicXZ2PAPgu8RHPSYaqD2XpUO5GSfBn4IHB0KWXQdQ9nrJSy\nEXi0d3d5krcBH6f5T1/LQuDXgPt6o3ZoPsEc0zvx8creFFEnSilrk/wEOLRyqVXA9HU8VwIfrlx3\nC0l+k+aE6u93VPLzwF+UUr7Tu/+jJG8AzgOqB2wp5THgvb0Tx/uXUlYnuRZ4rEa9WTcH2xvF3Edz\npg94+ePycTTzNxOtF66/B7y3lPLEiNvZDXhl5Ro3A/NpRjJH9G730ozijugyXKG5jAc4hMEX9N2Z\nO9h6yuvNNKPnLp1BM3DpYg4Umnn96e/pJjrOolLK+l64vormKo6/q1FnNo5gAS4CrkhyH3APsITm\njbm8duEk+9CMXjaPpt7Um4h/vpTyZOXalwKLgROBdUk2j+LXllKqLveY5LPAjTSXa+1Hc9LjWOD4\nmnV785xbzDEnWQc8V0oZYKX2mUnylzRntn8K/AbwGZqPjrW/Ee9i4I4k59FcGvV24Eyay9Q60Ru4\nnA5cXkrZ1FHZ64HzkzwJ/Ijmo/kS4GtdFE9yPM3/7R8Dh9GMqFdSK1tqXxoxwCUVf0qzjux6mhMv\nR3ZU91ia36gvTbt9o4Pa26r7EvBHHdT+Gs30wHrgZ8D3gfeN6L2/he4u07qG5hLA9cATNPNxb+yo\n9geBfwZepAmbMzp+nT/QO74O7bDmPjQDqMeAdTQnlz4D7NFR/T8EHu69308BXwT2q1XP9WAlqZJZ\nNwcrSZPCgJWkSgxYSarEgJWkSgxYSarEgJWkSgxYSarEgJWkSgxYSarEgJWkSgxYSark/wMpmofq\n9OY6UgAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f841ece8128>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "correct = numpy.sum(numpy.argmax(predictions, 1) == numpy.argmax(batch_labels, 1))\n",
-    "total = predictions.shape[0]\n",
-    "\n",
-    "print(float(correct) / float(total))\n",
-    "\n",
-    "confusions = numpy.zeros([10, 10], numpy.float32)\n",
-    "bundled = zip(numpy.argmax(predictions, 1), numpy.argmax(batch_labels, 1))\n",
-    "for predicted, actual in bundled:\n",
-    "  confusions[predicted, actual] += 1\n",
-    "\n",
-    "plt.grid(False)\n",
-    "plt.xticks(numpy.arange(NUM_LABELS))\n",
-    "plt.yticks(numpy.arange(NUM_LABELS))\n",
-    "plt.imshow(confusions, cmap=plt.cm.jet, interpolation='nearest');"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "iZmx_9DiDXQ3"
-   },
-   "source": [
-    "Now let's wrap this up into our scoring function."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:49:29.857607",
-     "start_time": "2016-09-16T14:49:29.843904"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 178,
-     "status": "ok",
-     "timestamp": 1446751995007,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "DPJie7bPDaLa",
-    "outputId": "a06c64ed-f95f-416f-a621-44cccdaba0f8"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Done\n"
-     ]
-    }
-   ],
-   "source": [
-    "def error_rate(predictions, labels):\n",
-    "    \"\"\"Return the error rate and confusions.\"\"\"\n",
-    "    correct = numpy.sum(numpy.argmax(predictions, 1) == numpy.argmax(labels, 1))\n",
-    "    total = predictions.shape[0]\n",
-    "\n",
-    "    error = 100.0 - (100 * float(correct) / float(total))\n",
-    "\n",
-    "    confusions = numpy.zeros([10, 10], numpy.float32)\n",
-    "    bundled = zip(numpy.argmax(predictions, 1), numpy.argmax(labels, 1))\n",
-    "    for predicted, actual in bundled:\n",
-    "        confusions[predicted, actual] += 1\n",
-    "    \n",
-    "    return error, confusions\n",
-    "\n",
-    "print('Done')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "sLv22cjeB5Rd"
-   },
-   "source": [
-    "We'll need to train for some time to actually see useful predicted values. Let's define a loop that will go through our data. We'll print the loss and error periodically.\n",
-    "\n",
-    "Here, we want to iterate over the entire data set rather than just the first batch, so we'll need to slice the data to that end.\n",
-    "\n",
-    "(One pass through our training set will take some time on a CPU, so be patient if you are executing this notebook.)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:53:26.998313",
-     "start_time": "2016-09-16T14:49:29.860079"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "id": "4cgKJrS1_vej"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Step 0 of 916\n",
-      "Mini-batch loss: 7.71249 Error: 91.66667 Learning rate: 0.01000\n",
-      "Validation error: 88.9%\n",
-      "Step 100 of 916\n",
-      "Mini-batch loss: 3.28715 Error: 8.33333 Learning rate: 0.01000\n",
-      "Validation error: 5.8%\n",
-      "Step 200 of 916\n",
-      "Mini-batch loss: 3.30949 Error: 8.33333 Learning rate: 0.01000\n",
-      "Validation error: 3.6%\n",
-      "Step 300 of 916\n",
-      "Mini-batch loss: 3.15385 Error: 3.33333 Learning rate: 0.01000\n",
-      "Validation error: 3.1%\n",
-      "Step 400 of 916\n",
-      "Mini-batch loss: 3.08212 Error: 1.66667 Learning rate: 0.01000\n",
-      "Validation error: 2.7%\n",
-      "Step 500 of 916\n",
-      "Mini-batch loss: 3.02827 Error: 1.66667 Learning rate: 0.01000\n",
-      "Validation error: 2.2%\n",
-      "Step 600 of 916\n",
-      "Mini-batch loss: 3.03260 Error: 5.00000 Learning rate: 0.01000\n",
-      "Validation error: 1.9%\n",
-      "Step 700 of 916\n",
-      "Mini-batch loss: 3.16032 Error: 6.66667 Learning rate: 0.01000\n",
-      "Validation error: 2.2%\n",
-      "Step 800 of 916\n",
-      "Mini-batch loss: 3.06246 Error: 3.33333 Learning rate: 0.01000\n",
-      "Validation error: 2.0%\n",
-      "Step 900 of 916\n",
-      "Mini-batch loss: 2.85098 Error: 0.00000 Learning rate: 0.01000\n",
-      "Validation error: 1.9%\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Train over the first 1/4th of our training set.\n",
-    "steps = train_size // BATCH_SIZE\n",
-    "for step in range(steps):\n",
-    "    # Compute the offset of the current minibatch in the data.\n",
-    "    # Note that we could use better randomization across epochs.\n",
-    "    offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)\n",
-    "    batch_data = train_data[offset:(offset + BATCH_SIZE), :, :, :]\n",
-    "    batch_labels = train_labels[offset:(offset + BATCH_SIZE)]\n",
-    "    # This dictionary maps the batch data (as a numpy array) to the\n",
-    "    # node in the graph it should be fed to.\n",
-    "    feed_dict = {train_data_node: batch_data,\n",
-    "                 train_labels_node: batch_labels}\n",
-    "    # Run the graph and fetch some of the nodes.\n",
-    "    _, l, lr, predictions = s.run(\n",
-    "      [optimizer, loss, learning_rate, train_prediction],\n",
-    "      feed_dict=feed_dict)\n",
-    "    \n",
-    "    # Print out the loss periodically.\n",
-    "    if step % 100 == 0:\n",
-    "        error, _ = error_rate(predictions, batch_labels)\n",
-    "        print('Step %d of %d' % (step, steps))\n",
-    "        print('Mini-batch loss: %.5f Error: %.5f Learning rate: %.5f' % (l, error, lr))\n",
-    "        print('Validation error: %.1f%%' % error_rate(\n",
-    "              validation_prediction.eval(), validation_labels)[0])\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "J4LskgGXIDAm"
-   },
-   "source": [
-    "The error seems to have gone down. Let's evaluate the results using the test set.\n",
-    "\n",
-    "To help identify rare mispredictions, we'll include the raw count of each (prediction, label) pair in the confusion matrix."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:55:10.942063",
-     "start_time": "2016-09-16T14:53:26.999971"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      },
-      {
-       "item_id": 2
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 436,
-     "status": "ok",
-     "timestamp": 1446752934104,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "6Yh1jGFuIKc_",
-    "outputId": "4e411de4-0fe2-451b-e4ca-8a4854f0db89"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Test error: 2.0%\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW0AAAFyCAYAAAA+gYtsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3XucTfX++PHXZ1/mwlxQidwODVLjfinRzSRKkk4hdVxy\nyS0Shn4cE75FSuUwHUaF6MitTEqKcknOOXNcGkUMYmhoMIyZMRfGfH5/rGHGEHv27L3X3jPv5+Ox\nH7P3+uy13u+199rvWfuzPnstpbVGCCGEb7CYnYAQQgjHSdEWQggfIkVbCCF8iBRtIYTwIVK0hRDC\nh0jRFkIIHyJFWwghfIgUbSGE8CE2sxMoLqXUTUAH4DCQbW42QgjhEgHAX4BvtNYp13uizxVtjIL9\nidlJCCGEGzwH/Ot6T/DFon0YYHFNaBDg3AJGJsG71Zybt3nCQOdmvGwt0LGEy5DYZSd2ST6ia4DH\nSjB/bgnm9eXX3IzYp4DPIL++XY8vFu1sMAp2s3LOLSDU6vy8UNXZGfMFuGAZErvsxLaXMPZtJZj/\nQglj++prbmrsG3b5yoFIIYTwIVK0hRDCh0jRFkIIH1Imi/azFc2MHi6xJbaHNDQxdll9zd0fW4q2\nx5n5QZLYZSt2YxNjl9XX3P2xy2TRFkIIXyVFWwghfIjXFG2l1FCl1CGlVJZS6j9KqZZm5ySEEN7G\nK4q2Uqo7MAOIApoC8cA3SqmbTU1MCCG8jFcUbWAkMFdr/bHWei8wCMgEXjA3LSGE8C6mF22llB1o\nDnx3aZrWWgPrgdbFXmC12jDrK9iYAl8fgV6jr2zv2g8++xV+TIfVB+H+xwvamrSBhVth8xlj3mGv\nO7VOZrPbrcyd25mDB0eQmjqO3buH0qdPE7PTKvWGDGlJXNwAsrImsHJld4/H79y5Ljt29CM9fQxH\nj77EgAFNPZ6Dp5n5ms+c+SiJiSNJTR3HkSMjmTGjA1ar+0uqN5x75GbACiQXmZ4M1C/WkpSC976A\n7z+D4Y9DjTCYsw6Sj8I3S+GpAfDscBjbDfb/DBVvhoDyBfO+swoWToeF90LVmhCzAZIOwecfuGA1\nPcdms3DsWDrt2i0kMTGVVq2q8fXXz3P0aBrfffeb2emVWklJ6UyZspmHH65D9eohHo3doUMdZs/u\nwHPPxbJly1FCQvy59dbyHs3BDGa+5tHRcYwdu47s7FwqVQpk+fJuREa2YerUH9wa1xuK9p9RgP6z\nxpFJxomfCnuxRX061aoHcyeB1nBkP6z6EJ4aCN8ug0GTYMLzRsEGOHMK4+xaQFAohFSELz82Hh8/\nAv9dD3XNHPPpnKysC0yatPHy47i4JDZsOETbtjWlaLtRbOxeAJo2reLxAjJ58gNMnryFLVuOApCW\nlkNaWo5HczCDma95QkLBaa8tFkVenqZu3UoOzPkz8EuRaY5fGsAbivYp4CJwa5Hplbl67/uyd6td\n40x9FfK/mlgskJeXf98KdRvBX+rDTbfCnS1g4gfG9K1r4Z1RkJkB6akQ+xF07Q/zp0HVWnD3w/D6\nIBetpnn8/W20alWNxYt3mZ2KcIPAQBvNm1ehevVg9u4dRHCwHz/8cJQRI74lOfmc2emVapGRbRg/\n/n6Cgvw4dSqTyMh1DszVkKt/hHMciHEopul92lrrC8B2IOLSNKWUyn+8tVgLS9wHxw7D4Mlgs0Od\nO+GJvhAUAiH5/wFbRcCzzaBHE6P/e9Q7BfOvW27slf87C1YlwObV8B9H3gTv9sEHT7BvXwqrVu01\nOxXhBhUrBqKUokuXekREfEJY2PucP3+RxYu7mJ1aqTd9+o+Ehk6lQYPZzJmzjeTkDLfHNL1o53sH\nGKiU6qWUugOYA5QDFhRrKRcvwsgucEcz+DYJ/m+RsfecmgJZ+S/mh28Ye9VpZ+CjqXB/Z2N6rXrw\nbiy8NQLu9odHbjOK/vCprltLE0RHd
6Ju3Up07fqp2akIN8nIOA/AzJn/IykpnaysXKKiNvPQQ7UI\nCPCGL9OlX0JCCrt2JbNgwZNuj+UVRVtrvQwYBUwGdgKNgA5a65PFXtihvTC0I7SrDD2bg38A7NgE\nh/dBTpF+I6UK7oeFGwcsN6wy+sNPn4DVC6FNSa78Ya7o6E60alWN9u0XXf5gi9InLS2HI0fOXjFN\nKWMzLryJC/fy87MSFuZIn3bJeEXRBtBav6+1/ovWOlBr3Vprvc2pBYWFQ0Ag2GzQrqvRPRIzBc7n\nwJrF0HeccdAxKBR6R8LGVcZ8e7bDLbfBA/l73hVvhk5/g707XLSGnjV79mO0bl2d9u0/Jj299B+Q\n8gYWi8Lf34bdbsViUfj5WbHZPPMRi4nZyfDhLalaNYiAABsTJ97H+vWHyMoqySXDvJ9Zr3m5cnZ6\n925CSIg/AOHhlRk//j7Wrj3g9tjKGBLtO5RSzYDt2+v9ySXDhkyGZ4aA3Q8S4uHd0fDzf422gEAY\nO9so5jnZsCkWZoyC7Eyj/b5OxgiT6nWM9v98C2+PNLpSLsX/Kcrt61hSNWqEcvjwy2Rn55Kbm3d5\nr2vx4l0MHfqV2emVWhMnPkBU1IMU/kxt2pRIRMTCEizVscuNKQVvvtmOPn0aoTVs2JDISy99w8mT\nmSWIXZLLjXmGe17zGwsMtLNqVQ+aNq2Cv7+NEyfOsWLFHl57bSM5Oc78o7x8ILK51vq6e4qlr2i7\nO74PFG1RmpTkGpEl5f1Fu/RwvGh7TfeIEEKIG5OiLYQQPkSKthBC+BAp2kII4UOkaAshhA+Roi2E\nED7EZ3/j2jxhIFDV43GjmOTxmJdMQoYblj0y7M7zzBhm6Xgplj1tIYTwIVK0hRDCh0jRFkIIHyJF\nWwghfIgUbSGE8CFStIUQwodI0RZCCB8iRVsIIXyIFG0hhPAhUrSFEMKHeEXRVkrdp5T6QimVpJTK\nU0o9YXZOQgjhjbyiaAPlgZ+AoYBvXf9MCCE8yCtOGKW1XgusBVBKKZPTEUIIr+Ute9pCCCEcIEW7\nhFoOGcKAuDgmZGXRfeXKK9oemjSJwfHx/P38eTrMmHFFW6WwMLqvXMmoY8cYm5JC382bqdG69RXP\nCe/Rg6G7d/Pq2bP0/89/aN78tmLn5+9vY//+4aSkjC3+yvmgIUNaEhc3gKysCaxc2d3sdDzOzPe7\nLG1raWmjOXu24JaTM46dO/t7JLYU7RJKT0pi85QpbI+JuaotZf9+vh0zhn2xsVe1BVSowP41a3g/\nPJw3b7qJ+IULeW7NGgIrVgSgxr338vg//8nnvXoxNTSUnR9+yJo1zxEU5Fes/CZPfohDh844t3I+\nKCkpnSlTNhMTs93sVExh5vtdlra1kJC3CQ0tuP366ymWLNntkdg+XLTXAkuK3H72eBZ7Y2PZt3o1\nmSkpV7XtWryYg99+S056+lVtx7ZtY8eHH5J1+jQAOz78kLyLF7m1USMA6j/xBHtXreLYdqP4bJ83\nj4yM83Tt2sDh3Jo1q0rHjmG8+eaPzqyaT4qN3cvq1ftISck0OxWPM/P9Lovb2iUtW95GgwY3s3Dh\nLgfniAcWF7mtcTieVxyIdE5HzLhyjbtUDg/HLyiIk3v2AKAsFihyTFYpaNToVoeWZ7EoYmI6M3jw\nl9hsPvy/WTjEzPe7rG9rL7zQmK+/Pkhy8jkH52icfyvsGPC+Q3N7xSuslCqvlGqslGqSP6lO/uMa\npibmIQGhoTy9ZAk/vP46506eBGD/mjU06NqV6vfcg8VqpeWQIdSsGUpIiL9Dyxwzpg3btx9n69aj\n7kxdeAkz3++yvK0FBNjo0eNO5s3b6bGY3rKn3QLYgDFGWwOXjtotBF4wKylP8A8J4bmvvyZx82Y2\nTZlyefrhjRtZ+/LLPPHBB5SvXJmE1atZt+43h77216lTkUGDWtCkyRwAZBRl6Wbm+13Wt7Xu3e/k\n3
LkLrFlzwGMxvaJoa6034SV7/Z7kFxTE82vXcuKXX/hq6NCr2nfOn8/O+fMBsFit9D2Uwnvv/eeG\ny23btiaVK5cnIeEllAK73UpwsB/JyWPo1OkTtm075vJ1EeYx8/0u69tav36NWbBgF9qDPwn0iqLt\ny5TFgtVux2q3G/f9/NB5eeTl5mKxWrHYbFisVpTVarRdvEjexYv4BQXxt2+/JWXfPlYPHHjVci1W\nK7fcdRfJu3YRWKkSEW+8wW+/neGbb278H33p0t2sW/fb5cf33luDefM607jxPzl5snQfoLNYFHa7\nFbvdisWi8POzkpenyc3NMzs1tzHz/S7L21q9epW4997q9O37pUfjStEuofsnTODBqCh0/r/a8ZmZ\nJG7axMKICDrPm0eT3r0vt7UaNoz4hQuJ7dePBl27Uq1VKyqHh9Pgr381FqY1q198kV8+/RSL3c6T\n8+dTKSyM3Jwc9q5aRefOSxzKKScnl+PHC0asnDx5Dq3hjz8yXLvyXmjChPuJinrw8muemTmeTZsS\niYhYaHJm7mPm+12Wt7UXXmjMpk1HOHjQs8Mclfbkfr0LKKWaAdthIGaMHoliksdjXjKJKNNiC1F2\n2E2IeXn0SHOt9Y7rPbPM9SMLIYQvk6IthBA+RIq2EEL4ECnaQgjhQ6RoCyGED5GiLYQQPkTGaReT\nmcPuzvqbN9wwNEeGG5Y9gSbGzjIx9gUTYuY6/EzZ0xZCCB8iRVsIIXyIFG0hhPAhUrSFEMKHSNEW\nQggfIkVbCCF8iBRtIYTwIVK0hRDCh0jRFkIIHyJFWwghfIgUbSGE8CGmF22l1KtKqTilVJpSKlkp\n9blSqp7ZeQkhhDcyvWgD9wGzgLuBhzEu0PatUsrMs9UIIYRXMv0sf1rrxwo/Vkr1AU4AzYEtZuQk\nhBDeyhv2tIuqAGjgtCsWNmRIS+LiBpCVNYGVK7u7YpFeE9v+4hDKb4kjODWLwE9XXtkYFETgwk8I\nTk4l6NAx/MaNv6LZf+Ikyv8vnuD08/i/OeOKNuu9bQg+mUbwibPG7WQaubkTeffdjg7lZeZrPnPm\noyQmjiQ1dRxHjoxkxowOWK3euJn7viFDmhIX14usrFGsXPnkFW2TJrUlPr4v58+PZsaMdm7Pxd/f\nxv79w0lJGev2WJeYtZ171daslFLAe8AWrfUeVywzKSmdKVM2ExOz3RWL86rY+lgSOVOncP7DmKva\nAt6bDaEVSL+9OpkP34/fCwOwP/vc5fa8A/vJfnUMuatjr5r34tYfSb8lhPTKoaRXDiXjztvJzc1j\nyZKfHcrLzNc8OjqO+vVnUaHCNJo0mUOTJlWIjGzj8TzKgqSkDKZM2UpMzE9Xte3ff4YxYzYSG3vA\nI7lMnvwQhw6d8UisS8zazk3vHinifeBOwIFP2VogoMi0cKDhFVNiY/cC0LRpFapXD3FBio5zd+xL\nBdfSpClUq17QEBCA/enunHugNWRkkJdxgPPvz8Lepx8XlnwCwIV/LQZAd+txwzj2v/Vh//7TxMUl\nOZSXma95QkLK5fsWiyIvT1O3biWP5lBWxMbuB6Bp01upXj34irbFi3cD0KNHA7fn0axZVTp2DOOV\nV75h2bJn3B7vEue385+BX4pMy3Z4bq8p2kqp2cBjwH1a6+M3nqMjUNXNWfkmS736YLeTtyv+8rSL\nu37Cb8yrTi3P3qsvH8zZ4ar03C4ysg3jx99PUJAfp05lEhm5zuyUhJtYLIqYmM4MHvwlNptXdRxc\nR0OK7lzCceDqb8zX4hVrmV+wuwAPaa2PmJ2Pr1NBQXDuHGh9eZpOTUUFB19nrmuztmmL5S+1WbQo\n/sZP9hLTp/9IaOhUGjSYzZw520hOzjA7JeEmY8a0Yfv242zdetTsVDzG9KKtlHofeA7oCZxTSt2a\nfyva9yEcpDMyoFw5UOryNBUaik5PL/ay7L1fIPfLLzh92sxr9jk
nISGFXbuSWbDgyRs/WficOnUq\nMmhQi8vfpFSh7b0084bukUEYo0U2FpneF/jY49mUAnkJ++DCBSyNGpMXbxwksjZuSt5uxw4kXhYU\nhP2pZ8js3hWHDjN4IT8/K2Fh0qddGrVtW5PKlcuTkPASSoHdbiU42I/k5DF06vQJ27YdMztFtzC9\naGut3bq3b7Eo7HYrdrsVi0Xh52clL0+Tm5vnzrCeiW2xgN2OstuN+35+kJcH2dlcWLEU/6gpZPXu\niaXyrfgNHkZ2VKFhf1Yr2GzGX6vVmPfiReOWz96jJzrlFBe/W09xirZZr3m5cnaeeeYuPv/8V9LS\ncggPr8z48fexdq1nRjCUNcb7bMFut1z1PlutCpvNgtWqsFqNtosX87h4Ud94wQ5aunQ369b9dvnx\nvffWYN68zjRu/E9Onsx0WZw/Y9Z2rrR23YvoCUqpZsB2GIgjByInTnyAqKgHKbyemzYlEhGx0H1J\nuin2Wf9JVzz2Gz8R//FRV/RdX/xhE5kdI4xx2tFzsT36ODozk/P/nMX5N9+4/LyAmI+wP9/7inkv\nLF5I9ov9Lj8uv/k/XFj7FeffmEJoTpTDeZr1mgcG2lm1qgdNm1bB39/GiRPnWLFiD6+9tpGcnFy3\nxi6drv+j5IkT2xAV1abI+3yUiIhP+eijx+jdO/yKtoULf6Ffv68djF387rj776/F55/34Kab3iz2\nvM5w7XZ++UBkc631dY/6l/qiXZoULdqeVJyiLUoLM88k4XvHUErG8aJt+oFIIYQQjpOiLYQQPkSK\nthBC+BAp2kII4UOkaAshhA+Roi2EED7E9B/XCMeZOexO1zZvuKE6JMMNzWHmsDu7ibEvmBj7xmRP\nWwghfIgUbSGE8CFStIUQwodI0RZCCB8iRVsIIXyIFG0hhPAhUrSFEMKHSNEWQggfIkVbCCF8iBRt\nIYTwIaYXbaXUIKVUvFLqbP5tq1Kqo9l5CSGENzK9aANHgbFA8/zb90CsUqqBqVkJIYQXMv2EUVrr\nr4pMmqCUGgzcA/xqQkpCCOG1TC/ahSmlLEA3oBzwb5PTEUIIr+MN3SMopcKVUulADvA+0FVrvdfk\ntErEbrcyd25nDh4cQWrqOHbvHkqfPk08Fn/mzEdJTBxJauo4jhwZyYwZHbBa3fR216gNH30FO1Jg\nyxEYMLqg7fY7YNE6o+3fSfB/c8DP32irWh12pUH82YLbvvMw53P35OlGZr/fZjFzvdPSRnP2bMEt\nJ2ccO3f290hs8PBnrBBv2dPeCzQGKgB/BT5WSt3vy4XbZrNw7Fg67dotJDExlVatqvH1189z9Gga\n3333m9vjR0fHMXbsOrKzc6lUKZDly7sRGdmGqVN/cG0gpSDmC/jmM+j3OPwlDD5eB8ePwpdLYeYS\n+N8P0LsDhFSAD7+Cl/4OMybA8d+hUUjBsmw22JoEq5e4NkcPMPv9NouZ6x0S8vYVj3/6qT9Llux2\na8zCPPYZK8IrirbWOhe49A7vUEq1AkYAg/98rrVAQJFp4UBDN2RYfFlZF5g0aePlx3FxSWzYcIi2\nbWt65EOckJBy+b7FosjL09StW8n1gerUh9r14B+TQGs4tB+WfQg9BhpFu0ZtmDAI8vIg9TR89wU0\nuefay3qkK1gs8K3v7Wmb/X6bxVvWu2XL22jQ4GYWLtzlsZjOf8Z+Bn4pMi3b4bheUbSvwQL4X/8p\nHYGqnsjFJfz9bbRqVY3Fiz23UUVGtmH8+PsJCvLj1KlMIiPXuT6IxVLwNy/PuG+1wh2NjPsxb8Ff\ne8Oen4w97Ue6wr/mXntZz7wAsZ/ABe++cogjzHi/vYFZ6/3CC435+uuDJCef82hc5z5jDbl65/I4\nEONQTNP7tJVSryul2iqlauX3bU8FHgAWm52bK33wwRPs25fCqlWe6/GZPv1HQkOn0qDBbObM2UZy\ncobrg/y2D5IOw8jJYLdD3Tv
h6b4QlN/tsXkttGgLP6cbfdrHjsCK+Vcv57aa0OZhWDrP9TmawIz3\n2xuYsd4BATZ69LiTefN2eizmJR75jBVhetEGbgU+xujXXo8xVvsRrfX3pmblQtHRnahbtxJdu35q\nSvyEhBR27UpmwYInXb/wixdhYBe4q5nRHz1jESz/CFJTIDgUFq039qzvDIRmlSArE9795OrlPNMX\ndu+ABM/1SbqL2e+3Wcxa7+7d7+TcuQusWXPAo3ELc+tnrAjTu0e01p473GuC6OhOtGpVjXbtFpKR\ncd60PPz8rISFuaFPG+DgXuhT6EeskVPhv5ug1u3gHwiLoo3p6WmwZC58uObqZfy1D0S/7p78PMhb\n3m9PM3O9+/VrzIIFu9Dao2Gv4tbPWCHesKddas2e/RitW1enffuPSU/P8VjccuXs9O7dhJAQ47BA\neHhlxo+/j7Vr3bQnUj8cAgKN0R8duhrdI7OnGMX8XDo8N8jo8y4fZByg3L3jyvnvewQq3ARf+vae\nqVnvt9nMXO969Spx773VmT8/3qNxPf4ZK8T0Pe3SqkaNUAYPbkl2di6JiSNRyhhcsXjxLoYOLfoj\nUNfSGnr2bMhbb7XH39/GiRPnWLFiD6+9ttE9AR/rBs8PAbsf7I03ukv27zHaBnSGcdNh9BuQmwvb\nf4Qxfa6c/5kX4OvlcM79/YHuYub7bSaz1/uFFxqzadMRDh484/ZYhXn8M1aI0mZ/pygmpVQzYDsM\nxJdGj/g6XXuSabHVoSjTYguz2E2MbcbopcujR5prrXdc75nSPSKEED5EirYQQvgQKdpCCOFDpGgL\nIYQPkaIthBA+RIq2EEL4EBmnLRxi5rA7fbeJww3/K8MNzWHmScPMGG7oeCmWPW0hhPAhUrSFEMKH\nSNEWQggf4nBHilLqHUefq7V+xbl0hBBCXE9xDkQ2LfK4OWAF9uU/rgdcBLa7IC8hhBDX4HDR1lo/\ndOm+UuoVIB3orbU+kz+tIjAfcO9VLYUQogxztk97FPDqpYINkH9/Qn6bEEIIN3C2aIcAt1xj+i1A\nsPPpCCGEuB5ni/bnwHyl1FNKqepKqWpKqb8CHwKfuS49IYQQhTn7i8hBwNvAvyj4+VAuRtEe44K8\nhBBCXINTRVtrnQkMUUqNAW4HFHBAa33OlckJIYS4Ukl/XFM1/5agtT6nlFIlTUgp9apSKq8448KF\nEKKscKpoK6VuUkp9ByQAayi4WOOHSqkZziajlGoJDAA8e2llIYTwEc7uab+LcRqumkBmoelLgY7O\nLFApFQQsBvoDqU7mJYQQpZqzRfsRYKzW+vci0/cDtZxcZjSwWmv9vZPzi7LottrwzlfwTQrEHoHn\nRhe0vb8BNmXBd2fhuzTjb6VbC9r/cgfMWmfMuzoJxs4BP3/Pr4MLDBnSkri4AWRlTWDlyu5mp+Mx\nZq33Rx89Tnb2WM6eHU1a2mjOnh1Nq1a3eSS2s6NHynPlHvYllYCc4i5MKdUDaAK0cDIfURYpBW99\nARs/g1GPQ/Uw+Mc6SD4K65eC1jB7DCyffe35J/0Ldm2BER0gqIJR/F/4O8yZ4Nn1cIGkpHSmTNnM\nww/XoXr1ELPT8Rgz1zs6ejujRq33aExwvmj/APQC/p7/WCulLEAksKE4C1JKVQfeA9prrYtx5vO1\nQECRaeFAw+KEF76sVn2oWQ8+nGQU6KP7YfWH8ORAo2iDUdj/zG21YfpgyMuDtNPwwxcQfo9ncnex\n2Ni9ADRtWqVMFW3fXO944Oci07IdntvZoh0JfKeUagH4AdOBuzD2tNsUc1nNMX5Jub3Q6BMrcL9S\nahjgr7XWV8/WkYLjn6JMUpZCf/OM+xYrhDUqeE6fCfDCRPgjET59D9YuLmj719vQqTfs/8nY036g\nK6ya67H0hW/r1ashvXo15PjxDObPj+fdd+McnLNx/q2wY8D7Ds3t7DjtX5RS9YBhGCeOCsL4J
WS0\n1vp4MRe3nqt3jxcAvwLTrl2whQCO7IPjh2HgZJgXBTXqwuN9oXz+Htf74+DQHsjOhBYR8PoyOJdm\n7FED/HstTJgP36cbhX/zKvhyvllrI3zIzJn/Y/To7zh9OotWrW5j2bKnuHhR849//M/tsZ0d8lcT\nSNNav6617qa1fkxrPUFrfTy/zWFa63Na6z2Fb8A5IEVr/asz+Yky4uJFiOwC9ZsZBxJfWwRffgRn\nU4z23XGQmWF0f8StM/aiH84/WBUUCrPWG9MeCIRHKhnFfdIn5q2P8Bnx8cmcPp0FQFzcMaZN20r3\n7nd6JLazo0cOcY0TRimlbspvKynZuxaOObwXXu4Ij1aG3s3BLwB2brr2c/PyCu5Xux38A2FFtFH8\nz6UZBfzexzyTtyhVPNkf4GzRVly7sAZRnB71P6G1bidXvxEOuT3cKL5WGzzY1ege+WiK0UXSuiP4\nBxgHI1u0gydfhA0rjPkS90JmOjw1CCwWKBcEXQbCvh3mro+TLBaFv78Nu92KxaLw87Nis5X+qwma\ntd5PP30HQUF+ADRvXpWxY1uzYoVnOgaK1add6KflGpiilCo87M8K3A385KLchLixiG7w1yFg84MD\n8TCmi9GPHXoT9IuCyUuM5x0/DDNHwsbPjcfZmTC6MwybDoPegIu5sOtHmNLHnPUooQkT7icq6kEu\nHQLKzBzPpk2JREQsNDkz9zJrvYcNa8HcuY9hs1lISkpn9uxtxTgQWTKqOMf5lFKXhvM9APwbOF+o\n+TxwGHhba73fVQleI4dmwHYYiIweKRv03ZNMi63+G2VabGEW+42f4nKXR48011pf9+tesfa0L11y\nTCk1HxihtU5zNkUhhBDF52znz8tco+ArpSoppXxlhLsQQvgcZ4v2p0CPa0zvlt8mhBDCDZwt2ndz\n7Z+rb8xvE0II4QbOFm1/rt0fbgcCnU9HCCHE9ThbtOMwhm8UNQjY7nw6QgghrsfZE0ZNANYrpRoD\n3+VPiwBaYpxrWwghhBs4e8KoH5VSrTGuvN4NyAJ2Af3cOUZbmMm8Xi8zx0rrUeaNEQdQM5y+ep8L\nmDmi14yx0pcU4wzRLpPr8DOd3dNGa/0T8Jyz8wshhCg+h4u2Uirk0o9pbjQWW350I4QQ7lGcPe0z\nSqmqWusTGBfevdbv3y+dSMrqiuSEEEJcqThFux1wOv/+Q27IRQghxA04XLS11puudV8IIYTnFKdP\nu9GNn2XQWu9yLh0hhBDXU5zukZ8w+qv/7AIIhUmfthBCuEFxfhFZG6iT//evGJcVGwI0zb8NAQ7m\ntwkhhHCZors7AAAgAElEQVSD4vRpJ166r5RaDgzXWq8p9JRdSqmjwBRgletSFEIIcYmz5x5pyLUv\n4HsIKNYliZVSUUqpvCK3PU7mJYQQpZqzRftX4FWllN+lCfn3X81vK65fgFuBKvm3tk7mJYQQpZqz\nP2MfBKwGfldK7cI4MNk4/29nJ5aXq7U+6WQuQghRZjh7wqg4pVRt4HngDowRJcuAf2mtzzmxyLpK\nqSQgG+OCwa9qrY86k5sQQpRmznaPoLXO1FrHaK1f0VqP1FrPc7Jg/wfoA3TA2IOvDWxWSpV3NrfC\nhgxpSVzcALKyJrByZXdXLNIhdruVuXM7c/DgCFJTx7F791D69GnisfhmGDKkKXFxvcjKGsXKlU9e\n0bZsWReSkoaQmvoyBw4M5NVX7zEpSxcKqQq9P4NJJ+G1ZHhuCZS76cZtjrQ7aMiQxsTF9SQrazgr\nV179Jbdfv3B+/bUP6enDOHjwBR5/vI6za3udHMz5jAFUrRrEZ589zcmTI0lOfpklS57kpps8c0bK\nmTMfJTFxJKmp4zhyZCQzZnTAanW6pDrM6QhKqb8ppbYopY4ppWrlTxuplOpSnOVorb/RWq/UWv+i\ntV4HPAZUxDjla4klJaUzZcpmYmI8e20Gm83CsWPptGu3k
AoVptG37ypmzOhARITrPzTeIikpgylT\nthIT89NVba+99iO1as2hQoX3eOCBJTz33F08+2wDE7J0oafeBzRMqQFv1AZ7IDw588ZtjrQ7yHjN\n/0NMzM9XtQ0Y0JCXX25Gt25fEhw8m7vvXsLPP59yalWvn4M5nzGA99/viNaaGjVmUbt2NIGBdmbO\n9Mwp/aOj46hffxYVKkyjSZM5NGlShcjINm6P61TRVkoNBt4BvsYosJd+THMG40rtTtNanwUSgLDr\nP3MtsKTI7eoNNzZ2L6tX7yMlJbMkaRVbVtYFJk3aSGJiKgBxcUls2HCItm1rejQPT4qN3c/q1QdI\nScm+qm3PnlPk5uYBoBTk5Wnq1q3k6RRdq1JtiF8GudlwPhPil0KVhjduc6TdQbGxB1m9+jdSUrKu\nmK4UTJrUmhEjNlwu1KdOZZGY6PoTcJr1GQOoXbsCy5b9SnZ2LpmZF1i6dA8NG97ikdgJCSlkZxvn\nwbZYVDG26Z+5unatdTius3vaLwEDtNavc+XZu7dhDAd0mlIqCLgdOH79Z3YEni1yK1Fot/L3t9Gq\nVTXi4/8wOxXTzJ7dnoyMkSQmDqZ8eTsLFlz9T9anbJoBjbuBfzAEhELTZ2HPF0bb5nf+vO1G87pA\n/fqVuPXW8rRoUYXffutHYmJ/5s59mKAgMy8u4HozZvyXbt0aEBzsR2ioP88+exdffOG567BERrbh\n7NlXSU4eQ6NGtzJrVpwDczXk6trV0eGYzhbt2sDOa0zPAYrVF62Uekspdb9SqpZS6l7gc4x/BEuc\nzM0rffDBE+zbl8KqVXvNTsU0w4atIyjoXVq0WMiiRbs5c+bqPXKfcngrBFWGKWdg0ikIqADfTzPa\nDv345203mtcFKlUKACAiogbNmi2mSZPF1K4dyjvvPOiyGN5g69bfqVy5PGfOjOLUqVeoUMGfadO2\neiz+9Ok/Eho6lQYNZjNnzjaSkzPcHtPZon0IuNZRtY4Uf5x2deBfwF7gU+AkcI/WOsXJ3LxOdHQn\n6tatRNeun5qdilfYuTOZ9PTzzJjRzuxUSubFdfDbD/BqORgfBIlbYeC6/Lb1f952o3ldICPjPABv\nvBFHamoOZ85kM3VqHJ07l65jKuvW9eSHH45Qrtx0goKms3Xr76xb19PjeSQkpLBrVzILFjx54yeX\nkLNF+x0gWinVHWO4Xyul1HhgKjC9OAvSWj+rta6utQ7UWtfUWvfUWl/r15Y+KTq6E61aVaN9+0WX\nP0gC7HYLYWEVzE7DeeUqQYVa8OMsuHgecnNgyyyo2Sq/reY12u6GwIrXmTe/3QX27TtDVtaV1x1U\nyiWL9hqVKgVSq1Yos2Zt4/z5i+TkXGTWrG3cfXc1KlYM8Hg+fn5WwsLcf5zGqaKttf4AGAv8H1AO\nY095EDBCa+1Vu5MWi8Lf34bdbsViUfj5WbHZ3D8sB2D27Mdo3bo67dt/THp6jkdimsl4ra3Y7ZYr\nXusaNYLp2rUe5coZ/amtW1dj+PDmrF3rw/+bM0/Dqf1w71Cw+oHNH9oMg9Sjf9529ihknblxezH8\n2Wuek3ORxYt/Zdy4VoSG+hMa6k9kZEtWrTrg8pfCrM/Y6dNZ7N9/mqFDW+DnZ8Xf38qwYS04ejTN\n7V1v5crZ6d27CSEh/gCEh1dm/Pj7WLvW9a9vUUrrG51ltcgMSimgBnBCa52tlCoHBOVfhsztlFLN\ngO0wEKh6w+dPnPgAUVEPUng9N21KJCJiofuSBGrUCOXw4ZfJzs4lNzcPpUBrWLx4F0OHfuXW2O5x\n47GvEye2ISqqTZHX+ii9e3/FJ590Jjz8ZiwWxbFjGXz88S+8+eZ/HYyddeOnuMl1r8Z+S33o8h7U\naAEoSNoJq0fB8V3Xb7vRvIXc6GrsEyfeQ1RU6yKv+e9ERKwgMNDG7Nnt6No1jOzsXGJjDzJq1CYy\nMx298rdjI03c8xlz7
IBp/fo38d577WnRoipKGV1vo0atZ9eukpSjG1+NPTDQzqpVPWjatAr+/jZO\nnDjHihV7eO21jeTkOH5l9QLHgRiA5lrrHdd7pjNF24Lxy8W7tNaeO0xbEL9YRVu4imd+sHBtXlq0\nPeBGRdu9zLw+t5mjXG5ctF3P8aJd7O8wWus8YD9Q/J9vCSGEKBFnO57GAW8ppcJdmYwQQojrc/Ys\nfx9jHICMV0qdp8j3V621j//UTQghvJOzRbtEP1UXQgjhnGIV7fyDkGOAJwA/4DtgktbavCNFQghR\nhhS3T/v/Aa8DGUASMAJ439VJCSGEuLbiFu3ewBCtdQet9ZMYV6npmb8HLoQQws2K26ddE+N0rABo\nrdcrpTRwG/C7KxMT3qZs9oCpGVGmxtfPjDIttlpu5rqbMVbaNxR3D9mG8cOawi5g7kh4IYQoM4q7\np62ABUqpwifSCADmKKUuX2pMa/2UK5ITQghxpeIW7WudTGCxKxIRQghxY8Uq2lrrvu5KRAghxI3J\nqA8hhPAhUrSFEMKHSNEWQggfIkVbCCF8iBRtIYTwIVK0hRDCh3hF0VZK3aaUWqSUOqWUylRKxedf\nVkwIIUQhzp5P22WUUhWAHzFO89oBOAXUBYp3WWohhCgDTC/aGJcuO6K17l9oWqJZyQghhDfzhu6R\nzsA2pdQypVSyUmqHUqr/Dedy0MyZj5KYOJLU1HEcOTKSGTM6YLV6w2oLd/L3t7F//3BSUsaanYp7\nVK4N476Cj1Lg/SPQebQxPfhmeGmRMW3+GZi2DZo/XjCf1Q4Tv4eYP4z2d3ZDhMs+bsIDvKF61QEG\nA/uAR4A5wD+UUs+7YuHR0XHUrz+LChWm0aTJHJo0qUJkZBtXLFp4scmTH+LQoVLaw6YURH4Bv22D\nfjfDlAjoOAzu7Q4BQfDbDvh/raBvRVgWBSOWwG31jXnzcuGjYfBiVaP97aeg+xSof6+56yQc5g3d\nIxYgTmv99/zH8UqpuzAK+XVORrUW4wSDhYUDDa+YkpCQUhDIosjL09StK9cdLs2aNatKx45hvPLK\nNyxb9ozZ6bjebfXhtnqwfBJoDcf3w/cfwsMDYetS+Ordgufu+AqO7YO69xh/tYbf9xS0K2VMqxIG\n+7Z6fl3KpJ+BX4pMK3rG6z/nDUX7OPBrkWm/Ajc4vWtHoKpDASIj2zB+/P0EBflx6lQmkZHrnEhT\n+AKLRRET05nBg7/EZvOGL5JucOlCURYLXMzLv2+Fmo2ufm7ILVCtARzZdeX0sV9Aw4fB5g+J8RD3\nuXtzFoU0pOjOpVEGYxya2xu26h+B+kWm1ceFByOnT/+R0NCpNGgwmzlztpGcnOGqRQsvM2ZMG7Zv\nP87WrUfNTsV9ju2DE4eh22Sjj7r6nfBQXygXcuXzrDaja2Trp3Bo55Vtbz4Bz5eD1x6A/66E82Xz\nykS+yBuK9rvAPUqpV5VStyulegL9gdmuDpSQkMKuXcksWPCkqxctvECdOhUZNKjF5W9SSimTM3KT\nvIvwVheo3QzmJsGwRbDhI0gv6ArEaoNXVkB2Bswd+OfL2rsFKlSBJ8a4P2/hEqZ3j2ittymlugLT\ngL8Dh4ARWutP3RHPz89KWJj0aZdGbdvWpHLl8iQkvIRSYLdbCQ72Izl5DJ06fcK2bcfMTtF1kvbC\nGx0LHvecCns2GfetNnhlufF3ehejyF+P1Q5V6rovV+FSphdtAK31GmCNq5dbrpydZ565i88//5W0\ntBzCwyszfvx9rF17wNWhhBdYunQ369b9dvnxvffWYN68zjRu/E9Onsw0MTM3qBEOyQfh4gVo3hke\n7AuT2xl9268sB79yMO3xqwt2rUZGP/feLZB7AZp0hLY9YY4M+/MVXlG03UVr6NmzIW+91R5/fxsn\nTpxjxYo9vPbaRrNTE26Qk5PL8ePplx+fPHkOreGPP0rhMYx7u8EjQ8DmZxxIfKuLMSq
kwX1GET+f\nbYzhBuOD8PkbEPsmWGzw7BtQtZ4x/eRhWDgS/r3M1NURjlNaa7NzKJb8c5Jsh4E4OnpECF+ln5lk\nWmy1PMq02GXP5dEjzbXWO673TG84ECmEEMJBUrSFEMKHSNEWQggfIkVbCCF8iBRtIYTwIVK0hRDC\nh0jRFkIIH1Kqf1xT+tjNTsAkF8xOwDRmjpXOq2TeGHHLaTPHiJvxOXO8FMuethBC+BAp2kII4UOk\naAshhA+Roi2EED5EirYQQvgQKdpCCOFDpGgLIYQPkaIthBA+RIq2EEL4ENOLtlLqkFIq7xq3WWbn\nJoQQ3sYbfsbeArAWetwQ+BaQi9YJIUQRphdtrXVK4cdKqc7AQa31DyalJIQQXsv07pHClFJ24Dng\nQ7NzEUIIb+RVRRvoCoQCC12xMLvdyty5nTl4cASpqePYvXsoffo0ccWivdpHHz1OdvZYzp4dTVra\naM6eHU2rVrd5NIfOneuyY0c/0tPHcPToSwwY0NQjcYcMaUlc3ACysiawcmV3j8T0hthu128IrI+D\npCxYuPLKtqAgmPsJHEqF3cdg1PjitX+0DHYnGe3bDsDIVx1Oa+bMR0lMHElq6jiOHBnJjBkdsFo9\nV9bM2M5N7x4p4gXga631H65YmM1m4dixdNq1W0hiYiqtWlXj66+f5+jRNL777jdXhPBa0dHbGTVq\nvSmxO3Sow+zZHXjuuVi2bDlKSIg/t95a3iOxk5LSmTJlMw8/XIfq1UM8EtMbYrvd8SR4ewo88DDc\nVv3KtjdnQ2gFaFgdKleBz9bDkcOw/BMH21+DgwmQmwu3VYPl3/Dsnr0sWfLzDdOKjo5j7Nh1ZGfn\nUqlSIMuXdyMysg1Tp7q/d9Ws7dxrirZSqibwMPCkY3OsBQKKTAvHOI5pyMq6wKRJGy8/jotLYsOG\nQ7RtW7PUF20zTZ78AJMnb2HLlqMApKXlkJaW45HYsbF7AWjatIrHC6eZsd1uTazxt2HTK4t2QAA8\n2R06toaMDMg4APNmwXP9jKJ8o3aAfXsKBVKQl0fdupUcSishoeCQmMWiyMvTDs9bUs5v5/FA0X9I\n2Q7H9abukReAZGCNY0/vCDxb5NbwunP4+9to1aoa8fEu2ZH3ar16NeTkyZHs2jWAkSNbeSxuYKCN\n5s2rUL16MHv3DiIpaTifftrVY3vawsPC6oPdDr/EF0z75Se4q5Fxv+4d12+/ZPpsOJIB8YlQrjwL\nFvzkcAqRkW04e/ZVkpPH0KjRrcyaFVeCFXJMybbzxsDzRW6PORzbK4q2UkoBfYAFWus8d8X54IMn\n2LcvhVWr9rorhFeYOfN/1K8/h1tueZf+/b9ixIhWDB/e0iOxK1YMRClFly71iIj4hLCw9zl//iKL\nF3fxSHzhYeWDIPMcaF0w7WwqBAUb98uVv377JZHDoGYQRLSAZYs4c8bxPc/p038kNHQqDRrMZs6c\nbSQnZ5RghRxj5nbuFUUbo1ukBjDfXQGioztRt24lunb91F0hvEZ8fDKnT2cBEBd3jGnTttK9+50e\niZ2RcR4w/nEkJaWTlZVLVNRmHnqoFgEBXtMbJ1zlXAYElgOlCqaFhEJGumPtRe3aCRnpzJjxSLFT\nSUhIYdeuZBYscLCHtQTM3M69omhrrddpra1a6wPuWH50dCdatapG+/aLLr/YZUnhnRx3S0vL4ciR\ns1dMU8rIofDnVpQSB/bBhQsQ3rhgWsOmsOdnx9qvxW4nLMy5fmk/P6vT8xaHmdu5VxRtd5o9+zFa\nt65O+/Yfk57umYNhZnv66TsICvIDoHnzqowd25oVK371WPyYmJ0MH96SqlWDCAiwMXHifaxff4is\nrFy3x7ZYFP7+Nux2KxaLws/Pis3mmc3czNhuZ7GAv7/RP22xgJ8f2GyQnQ2rlsKrUyA4GOqEQf9h\nsGieMd+N2qvVgMe7QrlyxuOWrWHAcNauvfH+W7l
ydnr3bkJIiD8A4eGVGT/+PofmdQWztnOlPbkb\n5gJKqWbAdhgIVL3uc2vUCOXw4ZfJzs4lNzfv8n/CxYt3MXToVx7J17Ucu0r0xo3P07BhZWw2C0lJ\n6XzwwU+8885/3ZxbAaXgzTfb0adPI7SGDRsSeemlbzh5MtPJJTp+NfaJEx8gKupBCm/XmzYlEhHh\nkqH/XhvbHa64GvuYiRAZdeXXtq2b4MkIYxz2jLnQ4XHIzIQPZsE7bxQ873rt1WrAnMXQINz4Z/DH\nMVj6MZaooiPDrhYYaGfVqh40bVoFf38bJ06cY8WKPbz22kZyckpSOB37nLl2Oz8GvA/QXGu947px\nS3PRLn0c25hKH8eLtnCdK4q2h1lOR5kW25zPmeNFu5R8dxNCiLJBirYQQvgQKdpCCOFDpGgLIYQP\nkaIthBA+RIq2EEL4ECnaQgjhQ3z4ZBA2zBlPaeaYYRmvXPaYNzbfzLHSup15Y8TV92ast+M/BpI9\nbSGE8CFStIUQwodI0RZCCB8iRVsIIXyIFG0hhPAhUrSFEMKHSNEWQggfIkVbCCF8iBRtIYTwIVK0\nhRDCh5hetJVSFqXUFKXUb0qpTKXUAaXUBLPzEkIIb+QN5x4ZB7wI9AL2AC2ABUqpVK31bFMzE0II\nL+MNRbs1EKu1Xpv/+IhSqifQysSchBDCK5nePQJsBSKUUnUBlFKNgTbAmpIu+KOPHic7eyxnz44m\nLW00Z8+OplWr20q6WIcNGdKSuLgBZGVNYOXK7h6La3bsmTMfJTFxJKmp4zhyZCQzZnTAavXspubv\nb2P//uGkpIz1aFyzVK0axGefPc3JkyNJTn6ZJUue5KabAj0S26PbWtXa8MZX8HkKLDkC3UZf/ZwK\ntxjtc7Zf3fbsOFj8G3yZDvN/hfotnE7FrM+YNxTtacBSYK9S6jywHXhPa/2pKxYeHb2d0NC3CQl5\nm9DQt4mLO+aKxTokKSmdKVM2ExNzjY2nFMeOjo6jfv1ZVKgwjSZN5tCkSRUiI9t4NIfJkx/i0KEz\nHo1ppvff74jWmho1ZlG7djSBgXZmznzEI7E9tq0pBVO+gIRt8NTNMCYCnhwGDxUpmC/Nhv3XyKXf\n69DqURjdDh4Phsj2cOKI0+mY9Rnzhu6R7kBPoAdGn3YTYKZS6pjWetGfz7YGCCgyrSHQ2D1ZOiE2\ndi8ATZtWoXr1kDITOyEh5fJ9i0WRl6epW7eSx+I3a1aVjh3DeOWVb1i27BmPxTVT7doVmDp1K9nZ\nxnmZly7dw7hxrT0S22PbWo36UL0efDwJtIbf98PXH0KngbBhqfGce5+A4IqwbhH89eWCeYMqwF9H\nQv+G8MdhY9rJ30uUjvPr/TPwS5Fp2Q7P7Q1FezrwhtZ6ef7j3UqpvwCvAtcp2o8BN+7q6NWrIb16\nNeT48Qzmz4/n3XfjSpqvcEBkZBvGj7+foCA/Tp3KJDJynUfiWiyKmJjODB78JTabN3yR9IwZM/5L\nt24NWLPmABaL4tln7+KLL/abnZZrKUuhv3nGfYsV6jQy7pcPgUEzYGwHaNj2ynnvvAfOZ0NET3j8\nRTifA5uWwUcTIO+ix1bB0DD/VthxIMahub1hqy4H6CLT8nBBbjNn/o/69edwyy3v0r//V4wY0Yrh\nw1uWdLHCAdOn/0ho6FQaNJjNnDnbSE7O8EjcMWPasH37cbZuPeqReN5i69bfqVy5PGfOjOLUqVeo\nUMGfadO2mp2Wax3dB8mHoc9ksNmh1p3QoS+Uy9/LHfAmrP0Ijv929bzBlaB8KNwWBn8Lg5H3G10l\nPXzvmIc3FO3VwHil1GNKqVpKqa7ASOCzki44Pj6Z06ezAIiLO8a0aVvp3v3Oki5WFENCQgq7diWz\nYMGTbo9Vp05FBg1qcXmvXinl9pjeYt26nvzwwxHKlZtOUNB0tm79nXXrepqdlmvlXYS/d4G6zWBp\nEry6yCjSaSk
Q3gbuagOfTjeeW/S9z8owulQWTDT2uE8lwWczoXVnz69HCXlD98gwYAoQDVQGjgH/\nzJ/mUrro/rzwCD8/K2Fh7u/Tbtu2JpUrlych4SWUArvdSnCwH8nJY+jU6RO2bfPcQWhPqlQpkFq1\nQpk1axvnzxtf9WfN2saYMa2pWDGAM2cc7y/1ekf2wriOBY/7T4Vdm6BphDGyZPlxY7rdH/wDYUUy\nDGgIB+PNydcNTN/T1lqf01q/orWurbUur7Wuq7WO0lo7fqXLP/H003cQFOQHQPPmVRk7tjUrVvxa\n4pwdZbEo/P1t2O1WLBaFn5/VY/2sZsUuV85O795NCAnxByA8vDLjx9/H2rUH3B576dLdhIX9gyZN\n5tC48Rz69/+CtLQcGjf+Jzt3/uH2+GY5fTqL/ftPM3RoC/z8rPj7Wxk2rAVHj6Z5pGB7dFurHW4U\nY6sN2naFjn1h8f/B8hnQux4MbGzcFkw0CvzAxnDmBCQnwo710CsK/ALgpqrw5Evw4yqnUzHrM+YN\ne9puM2xYC+bOfQybzUJSUjqzZ2/z6IHICRPuJyrqQXT+Ln5m5ng2bUokImJhqY2tNfTs2ZC33mqP\nv7+NEyfOsWLFHl57baNb4wLk5ORy/Hj65ccnT55Da/jjD8/0p5upS5flvPdee5KShqMU7NyZzBNP\nLPNIbI9uaw90gy5DwOZn7D3/vQsc3m20ZZ8reF7GGbh4AU4X+mf9xnMwap6x933uLKxfBMvecjoV\nsz5jSvtYn4FSqhmwHYbgyOgR17tgQkxRdtlNjG3etq7bTTIttvo+yoSol0ePNNda77jeM03vHhFC\nCOE4KdpCCOFDpGgLIYQPkaIthBA+RIq2EEL4ECnaQgjhQ6RoCyGED/HhH9fkImOmywrPnMz/2rJM\njA3mbuPmjRE3Z6y0Qdf1/BjxHdnQ3MFznMmethBC+BAp2kII4UOkaAshhA+Roi2EED5EirYQQvgQ\nKdpCCOFDpGgLIYQPkaIthBA+RIq2EEL4EK8o2kqpIKXUe0qpw0qpTKXUFqVUC7PzEkIIb+MVRRv4\nEIgAngPCgXXAeqVUVVOzEkIIL2N60VZKBQBPAWO01j9qrX/TWk8CDgCDzc1OCCG8i+lFG+OkVVYg\np8j0LKCt59MRQgjvZXrR1lpnAP8G/q6UqqqUsiilngdaA9I9IhwyZEhT4uJ6kZU1ipUrn7yibdKk\ntsTH9+X8+dHMmNHOpAxLt86d67JjRz/S08dw9OhLDBjQ1KPx/f1t7N8/nJSUse4LUqM2zPsK4lJg\n0xHoN7qg7fY7YME6o+2HJJg8B/z8C9rLB8GMT2B7Kmw5BoPHO52G6UU73/OAApKAbGAY8C/goplJ\nCd+RlJTBlClbiYn56aq2/fvPMGbMRmJjD5iQWenXoUMdZs/uwPDh3xIc/BZ33RXDxo2JHs1h8uSH\nOHTojPsCKAX//AJ+3gZ33wy9I+D5YdCpu9H+zhI4+Cvccwt0bgh3NIahfy+Yf+JsCKkA91eH5+6H\nbgPgieecSsUrzqettT4EPKSUCgRCtNbJSqlPgUN/PtdaIKDItHCgobvSFF4sNnY/AE2b3kr16sFX\ntC1evBuAHj0aeDyvsmDy5AeYPHkLW7YYJ4ROS8shLa1ob6f7NGtWlY4dw3jllW9YtuwZ9wSpUx/+\nUg9mTwKt4fB+WPEhdBsIXy2F6rVh4iDIy4PU0/D9F9D4HmNe/wB4rDt0aw3nMljyxwGyomfR4ol+\nTJjzCQBni7F76hVF+xKtdRaQpZSqCHQARv/5szsivSdCmCsw0Ebz5lWoXj2YvXsHERzsxw8/HGXE\niG9JTj7n9vgWiyImpjODB3+JzebGjgNluRTQKMwAVivc0ci4/8Fb0LU3/PqTsUfdvit8Otdoq1Mf\nbHbYGw/As8HAsZ+g4at8cZvxFJ+7CIJS6hGlVAel1F+UUu2B74FfgQXmZiaEu
J6KFQNRStGlSz0i\nIj4hLOx9zp+/yOLFXTwSf8yYNmzffpytWx2seM46tA+SDsOIyWC3Q9id8FRfKB9itP+wFpq3hZ3p\nRp/2sSOwcr7RVi4Iss4Ze+iXpKVC+eCrwjjCK4o2EApEU1CoNwMdtNbSpy2EF8vIOA/AzJn/Iykp\nnaysXKKiNvPQQ7UICHDvF/k6dSoyaFALIiPXAaCUcl+wixdhSBe4s5lRlN9aBCs/gtQUCA6FBeth\n6VxoFAgtK0F2pnHgESAzAwLKGf3ilwSHwrl0p1Lxiu4RrfVyYLnZeQghiictLYcjR85eMU0pY6fS\nnTUUoG3bmlSuXJ6EhJdQCux2K8HBfiQnj6FTp0/Ytu2YawMe3Av9OhY8Hj0V/rcJat4OAYGwONqY\nnpFmdI3MW2M8/m0f5F4wDk7+mn+gvEFTSPjZqTS8ZU9biBKxWBT+/lbsdgsWi8LPz3q5j9NqNdqs\nVoXVarRZrW6uKGVITMxOhg9vSdWqQQQE2Jg48T7Wrz9EVlauW+MuXbqbsLB/0KTJHBo3nkP//l+Q\nlmPDHrMAAAntSURBVJZD48b/ZOfOP1wfsF64UZxtNnikq9E9Ej0Ffttr7DU/O8jo8y4fBN0Hwp6d\nxnw52bBmKbw8xegSqRVmjDxZNs+pNLxiT1uIkpow4V6iotqg8/sNMzNfYdOmo0REfMq8eY/Su3f4\n5bZhw5qxcOEv9Ov3tZkplxrTpm2lYsUA4uP7ozVs2JBIr15fuD1uTk4ux48XdDGcPHkOreGPPzLc\nE/DRbtBzCNj9YF88DO4CB/YYbS92hsjp8Mob/P/27j3YqrIO4/j3AS/kBS2t1IpRExW1SEEdTSA0\nHLNRs6ZEJTW0q1pDTRhjDl6ycXTyiKSNo3lFUCtvTCpomBfUITgzOAioJCjeSLHQ8GRy+PXHu05s\nNqjD4ax3u/Z+PjPrj73O2udZa+1zfvvd73r3eulcBe0z4ayT1zz3gjPh/KvgkReh4224aSJMndyt\n3VDUdo5XgKT9gDnwPTx6pFV8pIHZHQ3MbrRNG5j9bsOSo/952TNrRo8Mioj299vW3SNmZhXiom1m\nViEtWrS7d9XW2VXNXver7fm06jmf28Dsxh33lO6N4tsgLVq05zm7pbIbWUBa9Zw38g2jccftom1m\nZmtx0TYzqxAXbTOzCqnil2v6AEyaNJQBA7p3q80xYx6kre3oHt0pZ3+Ys++hrW1Eg7Jb9ZxPp63t\nKw3K3rjjbqf7z10xZgztbW0b/LwFCxbAqFGw7v2m11HFL9ecANzc6P0wMyvBiRHxvl+VrGLR3o50\nr+0lpFluzMyqrg+wMzAtIpa/34aVK9pmZq3MFyLNzCrERdvMrEJctM3MKsRF28ysQlqqaEs6XdJi\nSR2SnpC0f6bcIZLulvSSpNWSsgyelTRO0ixJb0paJukOSbvnyC7yfyBprqQVxfKYpCM++Jk9vh/j\nivN+aaa88UVe7TI/U/ZOkm6S9Lqkt4vzv1+m7MXrOe7VkiaWnNtL0gWSniuOeZGkX5aZWZe/laTL\nJC0p8h+VNLisvJYp2pKOA34DjAf2Jd1FaJqk7TPEb0m61dzpQM7hOkOAicCBwJdJd7WfLinXrAJL\ngbOAQcUyA7hLUve+FdUNxRvzd8l/16h5wCeBHYrlkLIDJW0LzATeIQ2LHQD8DPhn2dmFwaw53h2A\nEaS/99tKzv0F8H3gR8CewFhgrKQzSs7t8nvgMOBEYB/gfuABSeXM0hIRLbEATwATah4LeBEYm3k/\nVgNHN+gcbF/kH9LA12E58J1MWVsBTwOHAg8Cl2bKHQ+0N+DcXgQ81KjXdj37cxnwTIacqcDVdev+\nCNyYIbsPaZqdI+rWzwbOLyOzJVrakjYltfT+0rUu0pl9ADioUfvVANuSWj5v5A4uPsKOBLYAHs8U\newUwNSJmZMqr1b/oDvu7pEmSPpMh8yhgt
qTbiu6wdkmnZchdR/E/dyKpFVq2x4DDJPUvsgcCXwTu\nyZC9CdCb9OmmVgclfbqq4r1HumN70oldVrd+GbBH/t3JT5JILZ9HIyJL/2qRuw+pSPcB3gKOjYiF\nGXJHAl8gfWTP7QngFFIrf0fgXOBhSftExMoSc3cFfkjqBryQ1C12uaT/RMSkEnPX51hgG+CGDFkX\nAX2BhZI6Sd2+Z0fELWUHR8S/JT0OnCNpIammnEBqDD5bRmarFO33IvL2MTfSlcBepBZITguBgaRW\n/jeAGyUNLbNwS/o06Q1qRERknyE2IqbVPJwnaRbwPPAt4LoSo3sBsyLinOLxXEl7kwp57qI9Grg3\nIl7NkHUcqVCOBOaT3qwnSHo5Im7KkD8KuBZ4CVgFtAOTgVIuALdK0X4d6CRdGKr1CdZtfTcdSb8F\njgSGRMQrObMjYhXwXPGwXdIBwE9IhaQsg4CPA3OKTxiQPmkNLS5ObV50j2URESskPQPsVnLUK8CC\nunULgK+XnLsWSf1IF76/linyYuDXEfGH4vFTknYGxgGlF+2IWAwMLy7w942IZZJuARaXkdcSfdpF\na2sO6Qov8P/ugsNI/WFNqyjYxwDDI+KFRu8P6W9u85IzHgA+R2pxDSyW2aTW5sCcBRvSkDDgs6Si\nWqaZrNvdtweplZ/TaFJjKEefMqTrJPWv6Woy17eI6CgK9kdJo3fuLCOnVVraAJcCN0iaA8wCxpBe\n7OvLDpa0JamV1dXq27W4WPJGRCwtMfdK4HjgaGClpK5PGisiovQ7JEq6ELiXNPRva9KFqWHA4WXm\nFv3Ga/XbS1oJLI+I+pZoj5N0CWlEw/PAp4DzSB+bp5Qc3QbMlDSONMzuQOA00pDHLIrG0CnA9RGx\nOlPsVOBsSUuBp0jdEmOAa3KESzqc9L/9NNCf1PJfQFm1pewhMR+mhTSOcwnpyu7jwOBMucNI7/yd\ndcu1JeeuL7MTOCnTcV9D6hrpAF4FpgOHNui1n0G+IX9TSMNJO4AXSP2bu2TKPhJ4EnibVMBGZz7P\nI4q/sd0yZm5JapQtBlaSLgCeB2ySKf+bwKLi9X4JmABsXVaeb81qZlYhLdGnbWbWLFy0zcwqxEXb\nzKxCXLTNzCrERdvMrEJctM3MKsRF28ysQly0zcwqxEXbzKxCXLTNMpH0YK55Kq15uWhbU5J0kKRV\nku7ewOddJ+n2svbLbGO5aFuzGg1cDgwrbYJVswZw0bamI2kL0iwxvwP+DJxc9/O9JE2VtELSm5Ie\nkrSLpPHFtsdIWi2pU9JQScOKx31rfsfAYl2/4vHHJE2WtFTSSklPFlOemfUoF21rRiOBhRHxLHAz\ncGrXDyTtBDxMuo3ml0j3Xr6WdG/5S0j3ob6PNMvRjqyZJGN9t8OsXdeHNNHCkcDewFWkqdX276mD\nMoPWmgTBWsdo1kwzdR/Qt5iX8mHgDOBfwPER0Vlss6jriZI6gM0i4rWadR8YGBEvk+7p3OUKSUeQ\n7rX8t404FrO1uKVtTUXSHsABwK0ARWG+jVTIIU099khNwe6p3F6Szim6RZZLeos0Q0+/nswxc0vb\nms2ppEl8X65rIb8j6cekbpEN1TVtVu0v3LRum7HAmaRJi+eRZlCZAGzWjTyz9+SibU1DUm/g28BP\ngfvrfnwnqa/7SeAkSb3fo7X9X1LRr/UaqWDvCKwo1u1bt83BwF0RMaXYF5HmC5yPWQ9y94g1k6OA\nbUlzb86vXYDbSa3wicA2wK2SBknaTdIoSf2L37EE+Lyk3SVtJ2kTUp/3UuDcYvuvkt4Yaj0LjCjG\nhw8gXYjcoewDttbjom3NZDRwf0S8tZ6f/QkYTJodfThpMti/kkZ8nAa8W2x3NWlW7dnAP4CDI2IV\nqZW+JzAX+Dlwdt3v/xXQTrrwOQN4BbijbhtPyGobzRP7mplViFvaZmYV4qJtZlYhLtpmZhXiom1m\nViEu2
mZmFeKibWZWIS7aZmYV4qJtZlYhLtpmZhXiom1mViEu2mZmFfI/1HOp484XZnwAAAAASUVO\nRK5CYII=\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f841eb30f98>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "test_error, confusions = error_rate(test_prediction.eval(), test_labels)\n",
-    "print('Test error: %.1f%%' % test_error)\n",
-    "\n",
-    "plt.xlabel('Actual')\n",
-    "plt.ylabel('Predicted')\n",
-    "plt.grid(False)\n",
-    "plt.xticks(numpy.arange(NUM_LABELS))\n",
-    "plt.yticks(numpy.arange(NUM_LABELS))\n",
-    "plt.imshow(confusions, cmap=plt.cm.jet, interpolation='nearest');\n",
-    "\n",
-    "for i, cas in enumerate(confusions):\n",
-    "    for j, count in enumerate(cas):\n",
-    "        if count > 0:\n",
-    "            xoff = .07 * len(str(count))\n",
-    "            plt.text(j-xoff, i+.2, int(count), fontsize=9, color='white')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "yLnS4dGiMwI1"
-   },
-   "source": [
-    "We can see here that we're mostly accurate, with some errors you might expect, e.g., '9' is often confused as '4'.\n",
-    "\n",
-    "Let's do another sanity check to make sure this matches roughly the distribution of our test set, e.g., it seems like we have fewer '5' values."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2016-09-16T14:55:18.083458",
-     "start_time": "2016-09-16T14:55:17.830485"
-    },
-    "cellView": "both",
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     },
-     "output_extras": [
-      {
-       "item_id": 1
-      }
-     ]
-    },
-    "colab_type": "code",
-    "collapsed": false,
-    "executionInfo": {
-     "elapsed": 352,
-     "status": "ok",
-     "timestamp": 1446753006584,
-     "user": {
-      "color": "#1FA15D",
-      "displayName": "Michael Piatek",
-      "isAnonymous": false,
-      "isMe": true,
-      "permissionId": "00327059602783983041",
-      "photoUrl": "//lh6.googleusercontent.com/-wKJwK_OPl34/AAAAAAAAAAI/AAAAAAAAAlk/Rh3u6O2Z7ns/s50-c-k-no/photo.jpg",
-      "sessionId": "716a6ad5e180d821",
-      "userId": "106975671469698476657"
-     },
-     "user_tz": 480
-    },
-    "id": "x5KOv1AJMgzV",
-    "outputId": "2acdf737-bab6-408f-8b3c-05fa66d04fe6"
-   },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhIAAAFkCAYAAAB1rtL+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAHRhJREFUeJzt3X/wXXV95/HnC1AotCRMKQRXWHVVjNaiCQisBcemQv2N\ndWY1mLFKcdSCZdL6o6yyUtm2SkehgrqOP1bll4O4XWSkxmJbrUCh/NBaDey6YgPFhH7llw1GlLz3\nj3MuXi4h5PvJ9+Z+7zfPx8yd5H7O55y8P5Pke1/3cz7nnFQVkiRJLXaZdAGSJGl6GSQkSVIzg4Qk\nSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVKzWQeJJEcl+UKSf02yOcnL\nhrbtluR9Sf4pyb/3fT6d5ICRY+yT5IIk9yS5K8nHk+w10ufXknwtyY+T/EuSt7UPU5IkjUPLjMRe\nwDeAk4DRB3XsCTwL+GPg2cArgIOBS0f6XQgsBVYALwaOBj462Jjkl4A1wC3AMuBtwOlJTmyoV5Ik\njUm256FdSTYDx1XVF7bS51DgGuA/VtVtSZYC3waWV9WNfZ9jgS8Cj6+q9UneDJwBLKmqn/V9/gx4\neVU9vblgSZI0p3bEGonFdDMXd/fvjwDuGoSI3hV9n8OH+nxtECJ6a4CDkywac72SJGkb7TbOgyfZ\nHXgvcGFV/XvfvAS4Y7hfVT2Q5M5+26DP90YOt2Fo2z1b+LN+GTgW+D6waS7qlyRpJ7EH8ARgTVX9\ncDY7ji1IJNkN+BzdTMPvbcsuPHzNxeh2ttLnWOCCbS5QkiSNeg3dOsZtNpYgMRQiDgR+Y2g2AmA9\nsN9I/12Bffptgz77jxx2sM8Gtuz7AOeffz5Lly5trn2+WL16NWedddaky5gzjmf+WkhjAcczny2k\nscDCGs/atWtZtWoV9J+lszHnQWIoRDwJeH5V3TXS5WpgcZJnD62TWEE343DtUJ//nmTXqnqgbzsG\nuLmqHnZao7cJYOnSpSxbtmyORjM5ixYtWhDjGHA889dCGgs4nvlsIY0FFt54erNeGtByH4m9khyS\n5Fl905P69wf2Mwufp7tkcxXwmCT796/HAFTVTXQLJz+W5LAkzwXOAS6qqsGMxIXA/cAnkzw9yauA\n3wfeP9t6JUnS+LTMSBwK/C3dWoXi5x/un6a7f8RL+/Zv9O2DtQ/PB77Wtx0PnEt3tcZm4BLglMEf\nUFX39peEngtcB8wAp1fVJxrqlSRJYzLrIFFVX2XrMxmPOstRVXfTzVhsrc+3gOfNrjpJkrQj+ayN\neWrlypWTLmFOOZ75ayGNBRzPfLaQxgILbzyttuvOlvNJkmXA9ddff/1CXPwiSdLY3HDDDSxfvhy6\nu07fMJt9nZGQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1MwgIUmSmhkkJElSM4OEJElqNpbHiGt2\n1q1bx8zMzKTLaLLvvvty0EEHTboMSdKEGCQmbN26dRx88FI2bbpv0qU02WOPPbn55rWGCUnaSRkk\nJmxmZqYPEecDSyddziytZdOmVczMzBgkJGknZZCYN5YCPiNEkjRdXGwpSZKaGSQkSVIzg4QkSWpm\nkJAkSc0MEpIkqZlBQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlB\nQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZrtN\nugBNv7Vr1066hFnbd999OeiggyZdhiRNPYOEtsMPgF1YtWrVpAuZtT322JObb15rmNCCtm7dOmZm\nZiZdRhPD/vQwSGg73A1sBs4Hlk64ltlYy6ZNq5iZmfEHlRasdevWcfDBS9m06b5Jl9LEsD89DBKa\nA0uBZZMuQtKQmZmZPkRMW9AHw/50m
XWQSHIU8DZgOXAAcFxVfWGkz3uAE4HFwJXAm6vqu0Pb9wHO\nBV5C95X288ApVbVxqM+v9X0OA+4Azq2qP59tvZK0czPoa7xartrYC/gGcBJQoxuTvAM4GXgj8Bxg\nI7AmyWOHul1I9697BfBi4Gjgo0PH+CVgDXAL3f+AtwGnJzmxoV5JkjQms56RqKovAV8CSJItdDkF\nOKOqLuv7vBbYABwHXJxkKXAssLyqbuz7vAX4YpK3VtV6YBXwGOB3q+pnwNokzwb+APj4bGuWtmQa\nrzYBF6FJ89m0LnDdnp+Hc7pGIskTgSXAVwZtVXVvkmuAI4GLgSOAuwYhoncF3ezG4cClfZ+v9SFi\nYA3w9iSLquqeuaxbO5vpvdoEXIQmzVfTvsC11VwvtlxCFwg2jLRv6LcN+twxvLGqHkhy50if723h\nGINtBglth2m92gRchCbNX9O9wPVy4LSmPXfUVRthC+spZtlncBplq8dZvXo1ixYtekjbypUrWbly\n5aPVqJ2Oi9AkjcN8/9lyUf8adlvz0eY6SKyn+8Dfn4fOSuwH3DjUZ7/hnZLsCuzTbxv02X/k2IN9\nRmc7HuKss85i2bL5/BcoSdIkrexfwy6gW544e3P6rI2quoUuBKwYtCXZm27tw1V909XA4n7x5MAK\nugBy7VCfo/uAMXAMcLPrIyRJmj9mHSSS7JXkkCTP6pue1L8/sH9/NvCuJC9N8kzgM3RzJpcCVNVN\ndAsnP5bksCTPBc4BLuqv2IDu8tD7gU8meXqSVwG/D7y/cZySJGkMWk5tHAr8Ld1aheLnH+6fBk6o\nqjOT7El3X4jFwN8DL6yq+4eOcTzdzaauoFv1dgndZaPAg1d6HNv3uQ6YAU6vqk88WnEXX3wxV199\ndcOwJuPWW2+ddAmaQtN46aqXrUoLU8t9JL7Ko8xkVNXpwOlb2X43j3Iypqq+BTxvtvW9733vJ5me\np6NX/XTSJWiqTO+lq162Ki1MC/BZG9dQNU2LLX8X+OSki9DUmNZLV71sVbM3bTNv01bvXFmAQULa\nGcz3y8uk7TG9M287I4OEJGmemdaZt/abOk0zg4QkaZ6atpk3T21IkrZgGh/EtLOer9eOZ5CQpK3Y\nWR/EJG0rg4QkbcX0Pohp5zxfrx3PICFJ28Tz9dKWTM+dmyRJ0rxjkJAkSc0MEpIkqZlBQpIkNTNI\nSJKkZgYJSZLUzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZj79U9IOs3bt\n9D2RchprlnYkg4SkHeAHwC6sWrVq0oVImmMGCUk7wN3AZuB8YOmEa5mty4HTJl2ENG8ZJCTtQEuB\nZZMuYpY8tSFtjYstJUlSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1MwgIUmSmhkk\nJElSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1GzOg0SSXZKckeR7Se5L8t0k79pC\nv/ckub3v89dJnjyyfZ8kFyS5J8ldST6eZK+5rleSJLUbx4zEHwFvBH4PeBrwduDtSU4edEjyDuDk\nvt9zgI3AmiSPHTrOhcBSYAXwYuBo4KNjqFeSJDXabQzHPBK4tKq+1L9fl+R4usAwcApwRlVdBpDk\ntcAG4Djg4iRLgWOB5VV1Y9/nLcAXk7y1qtaPoW5JkjRL45iRuApYkeQpAEkOAZ4LXN6/fyKwBPjK\nYIequhe4hi6EABwB3DUIEb0rgAIOH0PNkiSpwThmJN4L7A3clOQBurDyzqr6bL99CV0g2DCy34Z+\n26DPHcMbq+qBJHcO9ZEkSRM2jiDxKuB44NXAd4BnAX+R5PaqOm8r+4UuYGzNtvSRJEk7yDiCxJnA\nn1bV5/r3307yBOBU4DxgPV0g2J+HzkrsBwxOZazv3z8oya7APjx8JmPEamDRSNvK/iVJ0s7uov41\n7
Lbmo40jSOzJw2cNNtOvx6iqW5Ksp7sa458AkuxNt/bhQ33/q4HFSZ49tE5iBV0AuWbrf/xZwLLt\nHoQkSQvTlr5cXwCsajraOILEZcA7k9wKfJvuU3018PGhPmcD70ryXeD7wBl0cehSgKq6Kcka4GNJ\n3gw8FjgHuMgrNiRJmj/GESROpgsGH6I7PXE78JG+DYCqOjPJnnT3hVgM/D3wwqq6f+g4xwPn0l2t\nsRm4hO6yUUmSNE/MeZCoqo3AH/SvrfU7HTh9K9vvpnWeRZIk7RA+a0OSJDUzSEiSpGYGCUmS1Mwg\nIUmSmhkkJElSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1MwgIUmSmhkkJElSM4OE\nJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1MwgIUmSmhkkJElSM4OEJElqZpCQJEnNDBKS\nJKmZQUKSJDUzSEiSpGYGCUmS1MwgIUmSmhkkJElSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiS\npGYGCUmS1MwgIUmSmhkkJElSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUbS5BI8rgk5yWZSXJfkm8m\nWTbS5z1Jbu+3/3WSJ49s3yfJBUnuSXJXko8n2Wsc9UqSpDZzHiSSLAauBH4CHAssBf4QuGuozzuA\nk4E3As8BNgJrkjx26FAX9vuuAF4MHA18dK7rlSRJ7XYbwzH/CFhXVScOtf3LSJ9TgDOq6jKAJK8F\nNgDHARcnWUoXQpZX1Y19n7cAX0zy1qpaP4a6JUnSLI3j1MZLgeuSXJxkQ5IbkjwYKpI8EVgCfGXQ\nVlX3AtcAR/ZNRwB3DUJE7wqggMPHULMkSWowjiDxJODNwM3AMcD/AD6YZFW/fQldINgwst+Gftug\nzx3DG6vqAeDOoT6SJGnCxnFqYxfg2qo6rX//zSTPoAsX529lv9AFjK3Zhj6rgUUjbSv7lyRJO7uL\n+tew25qPNo4g8QNg7UjbWuC3+9+vpwsE+/PQWYn9gBuH+uw3fIAkuwL78PCZjBFnAcu23kWSpJ3W\nlr5cXwCs2kLfRzeOUxtXAgePtB1Mv+Cyqm6hCworBhuT7E239uGqvulqYHGSZw8dYwVdALlmDDVL\nkqQG45iROAu4MsmpwMV0AeFE4A1Dfc4G3pXku8D3gTPo5lUuBaiqm5KsAT6W5M3AY4FzgIu8YkOS\npPljzoNEVV2X5BXAe4HTgFuAU6rqs0N9zkyyJ919IRYDfw+8sKruHzrU8cC5dFdrbAYuobtsVJIk\nzRPjmJGgqi4HLn+UPqcDp29l+920nrCRJEk7hM/akCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktTM\nICFJkpoZJCRJUjODhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJUjOD\nhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJUjODhCRJamaQkCRJzQwS\nkiSpmUFCkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJUjODhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hI\nkqRmBglJktTMICFJkpoZJCRJUjODhCRJamaQkCRJzcYeJJKcmmRzkg8Mte2e5ENJZpL8KMklSfYb\n2e/AJF9MsjHJ+iRnJjH4SJI0j4z1gznJYcAbgG+ObDobeDHwSuBo4HHA54f22wW4HNgNOAL4HeB1\nwHvGWa8kSZqdsQWJJL8InA+cCNw91L43cAKwuqq+WlU3Aq8HnpvkOX23Y4GnAa+pqm9V1RrgNOCk\nJLuNq2ZJkjQ745yR+BBwWVX9zUj7oXQzDV8ZNFTVzcA64Mi+6QjgW1U1M7TfGmAR8IyxVSxJkmZl\nLN/uk7waeBZdaBi1P3B/Vd070r4BWNL/fkn/fnT7YNvoqRJJkjQ
Bcx4kkjyebg3EC6rqp7PZFaht\n6PcofVbTTVwMW9m/JEna2V3Uv4bd1ny0ccxILAd+Bbg+Sfq2XYGjk5wM/Bawe5K9R2Yl9uPnsw7r\ngcNGjrt//+voTMWIs4BlzcVLkrSwbenL9QXAqqajjWONxBXAM+lObRzSv66jW3g5+P1PgRWDHZI8\nFTgIuKpvuhp4ZpJ9h457DHAP8J0x1CxJkhrM+YxEVW1k5MM+yUbgh1W1tn//CeADSe4CfgR8ELiy\nqv6x3+XL/THOS/IO4ADgDODcWZ4ukSRJY7SjLqUcXdewGngAuATYHfgScNKDnas2J3kJ8BG6WYqN\nwKeAd++IYiVJ0rbZIUGiqn5j5P1PgLf0r0fa51bgJWMuTZIkbQdvOS1JkpoZJCRJUjODhCRJamaQ\nkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJUjODhCRJamaQkCRJzQwSkiSpmUFC\nkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJUjODhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJ\nktTMICFJkpoZJCRJUjODhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktTMICFJkpoZJCRJ\nUjODhCRJamaQkCRJzQwSkiSpmUFCkiQ1M0hIkqRmBglJktRszoNEklOTXJvk3iQbkvxlkqeO9Nk9\nyYeSzCT5UZJLkuw30ufAJF9MsjHJ+iRnJjH4SJI0j4zjg/ko4BzgcOA3gccAX07yC0N9zgZeDLwS\nOBp4HPD5wcY+MFwO7AYcAfwO8DrgPWOoV5IkNdptrg9YVS8afp/kdcAdwHLg60n2Bk4AXl1VX+37\nvB5Ym+Q5VXUtcCzwNOD5VTUDfCvJacB7k5xeVT+b67olSdLs7YhTBYuBAu7s3y+nCzBfGXSoqpuB\ndcCRfdMRwLf6EDGwBlgEPGPcBUuSpG0z1iCRJHSnMb5eVd/pm5cA91fVvSPdN/TbBn02bGE7Q30k\nSdKEzfmpjREfBp4O/Po29A3dzMWj2ZY+kiRpBxhbkEhyLvAi4Kiqun1o03rgsUn2HpmV2I+fzzqs\nBw4bOeT+/a+jMxUjVtOdARm2sn9JkrSzu6h/Dbut+WhjCRJ9iHg58LyqWjey+XrgZ8AK4C/7/k8F\nDgKu6vtcDfzXJPsOrZM4BrgH+A5bdRawbPsHIUnSgrSlL9cXAKuajjbnQSLJh+kqfBmwMclgJuGe\nqtpUVfcm+QTwgSR3AT8CPghcWVX/2Pf9Ml1gOC/JO4ADgDOAc6vqp3NdsyRJajOOGYk30a1j+LuR\n9tcDn+l/vxp4ALgE2B34EnDSoGNVbU7yEuAjdLMUG4FPAe8eQ72SJKnROO4j8ahXglTVT4C39K9H\n6nMr8JI5LE2SJM0xbzktSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZgYJSZLU\nzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVIz\ng4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0M\nEpIkqZlBQpIkNTNISJKkZgYJSZLUzCAhSZKaGSQkSVIzg4QkSWpmkJAkSc0MEpIkqZlBQpIkNTNI\nSJKkZgYJSZLUzCAxb1006QLmmOOZvxbSWMDxzGcLaSyw8MbTZl4HiSQnJbklyY+T/EOSwyZd046z\n0P6BOp75ayGNBRzPfLaQxgILbzxt5m2QSPIq4P3Au4FnA98E1iTZd6KFSZKkB83bIAGsBj5aVZ+p\nqpuANwH3ASdMtixJkjQwL4NEkscAy4GvDNqqqoArgCMnVZckSXqo3SZdwCPYF9gV2DDSvgE4+BH2\n2aP75X8B142rrjG4qf/1cmD
tUPttwAU7vpxZubL/dbT2LZlP45lN3Y9kUuOZi9pH7YixjKPuRzLX\n49mRtW9J63gmXfeWbOtY5mPtWzI6nmmpe0sGtQ8+S7ddui/680uSA4B/BY6sqmuG2s8Efr2q/vMW\n9jme+fNJJUnSNHpNVV04mx3m64zEDPAAsP9I+348fJZiYA3wGuD7wKaxVSZJ0sKzB/AEus/SWZmX\nMxIASf4BuKaqTunfB1gHfLCq/nyixUmSJGD+zkgAfAD4dJLrgWvpruLYE/jUJIuSJEk/N2+DRFVd\n3N8z4j10pzi+ARxbVf822cokSdLAvD21IUmS5r95eR8JSZI0HQwSkiSp2YIIEgvl4V5JjkryhST/\nmmRzkpdNuqbtkeTUJNcmuTfJhiR/meSpk66rRZI3Jflmknv611VJfmvSdc2V/u9qc5IPTLqWFkne\n3dc//PrOpOtqleRxSc5LMpPkvv7f3rJJ19Wi/9k8+nezOck5k66tRZJdkpyR5Hv93813k7xr0nW1\nSvKLSc5O8v1+PF9PcuhsjjH1QWKBPdxrL7pFpScBC2HxylHAOcDhwG8CjwG+nOQXJlpVm1uBd9Dd\nun058DfApUmWTrSqOdAH7zfQ/d+ZZv9MtzB7Sf/69cmW0ybJYrrbDP4EOBZYCvwhcNck69oOh/Lz\nv5MlwAvofr5dPMmitsMfAW8Efg94GvB24O1JTp5oVe0+Aayguw/TrwJ/DVzR3xhym0z9YstHuN/E\nrXT3mzhzosVthySbgeOq6guTrmWu9OHuDuDoqvr6pOvZXkl+CLy1qv7npGtpleQXgeuBNwOnATdW\n1R9MtqrZS/Ju4OVVNZXf2ocleS/dXX2fN+laxiHJ2cCLqmpaZycvA9ZX1RuG2i4B7quq106ustlL\nsgfwI+ClVfWlofbrgMur6r9ty3GmekbCh3tNncV030TunHQh26Of2nw13X1Nrp50PdvpQ8BlVfU3\nky5kDjylPy34/5Kcn+TASRfU6KXAdUku7k8J3pDkxEkXNRf6n9mvofsWPK2uAlYkeQpAkkOA59I9\nYGPa7Eb3XKufjLT/mFnM6M3b+0hso5aHe2kC+pmis4GvV9VUnrtO8qt0wWGQ4l/RP+J+KvVh6Fl0\nU8/T7h+A1wE3AwcApwNfS/KrVbVxgnW1eBLdDNH7gT+hOzX4wSSbqur8iVa2/V4BLAI+PelCtsN7\ngb2Bm5I8QPeF/J1V9dnJljV7VfXvSa4GTktyE91n5/F0X8T/77YeZ9qDxCMJC2ONwULyYeDpdMl9\nWt0EHEI3s/JK4DNJjp7GMJHk8XTB7gVV9dNJ17O9qmr4+QD/nORa4F+A/wJM26mnXYBrq+q0/v03\nkzyDLlxMe5A4Afirqlo/6UK2w6voPmxfDXyHLoz/RZLbq+q8iVbWZhXwSboHZf4MuAG4ENjm04TT\nHiRaHu6lHSzJucCLgKOq6geTrqdVVf0M+F7/9oYkzwFOofsBP22WA78CXN/PFkE3u3d0v2hs95ri\nBVRVdU+S/wM8edK1NPgBD38G9VrgtydQy5xJchDdouvjJl3LdjoT+NOq+lz//ttJngCcCkxdkKiq\nW4Dn94vg966qDUk+C9yyrceY6jUS/Tep6+lWnAIPTqGvoDuPpQnrQ8TLgedX1bpJ1zPHdgF2n3QR\nja4Ankn3beqQ/nUd3TfeQ6Y5RMCDi0j/E92H8rS5koefmj2YboZlmp1A9wVvGtcSDNuTh894b2b6\nP09/3IeIfeiuFvrf27rvtM9IwAJ6uFeSvei+QQ2+IT6pX8hzZ1XdOrnK2iT5MLASeBmwMclg5uie\nqpqqR70n+RPgr+iuCPolugVjzwOOmWRdrfp1Aw9Zq5JkI/DDqhr9NjzvJflz4DK6D9v/APwx3TTt\nRZOsq9FZwJVJTqW7RPJw4ES6S3SnUv8F73XAp6pq84TL2V6XAe9McivwbbpTAKuBj0+0qkZJj
qH7\nzLkZeArdjMtaZvEZOvVBYoE93OtQ4G/p0m7RLbaCbmHSCZMqaju8iW4cfzfS/nrgMzu8mu2zP13N\nBwD3AP8EHLNArnYYmOZZiMfTndf9ZeDfgK8DR1TVDydaVYOqui7JK+gW9Z1GN8V8yjQu5hvym8CB\nTN96lS05GTiD7oqn/YDbgY/0bdNoEfBndAH8TuAS4F1V9cC2HmDq7yMhSZImZ6rP6UiSpMkySEiS\npGYGCUmS1MwgIUmSmhkkJElSM4OEJElqZpCQJEnNDBKSJKmZQUKSJDUzSEiSpGYGCUmS1Oz/A/lA\nG1beKa9dAAAAAElFTkSuQmCC\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x7f841c174f60>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "plt.xticks(numpy.arange(NUM_LABELS))\n",
-    "plt.hist(numpy.argmax(test_labels, 1));"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "E6DzLSK5M1ju"
-   },
-   "source": [
-    "Indeed, we appear to have fewer 5 labels in the test set. So, on the whole, it seems like our model is learning and our early results are sensible.\n",
-    "\n",
-    "But, we've only done one round of training. We can greatly improve accuracy by training for longer. To try this out, just re-execute the training cell above."
-   ]
-  }
- ],
- "metadata": {
-  "anaconda-cloud": {},
-  "colab": {
-   "default_view": {},
-   "name": "Untitled",
-   "provenance": [],
-   "version": "0.3.2",
-   "views": {}
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/tensorflow/tools/docker/notebooks/BUILD b/tensorflow/tools/docker/notebooks/BUILD
deleted file mode 100644
index e9f2689..0000000
--- a/tensorflow/tools/docker/notebooks/BUILD
+++ /dev/null
@@ -1,5 +0,0 @@
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
diff --git a/tensorflow/tools/docker/notebooks/LICENSE b/tensorflow/tools/docker/notebooks/LICENSE
deleted file mode 100644
index dea770e..0000000
--- a/tensorflow/tools/docker/notebooks/LICENSE
+++ /dev/null
@@ -1,13 +0,0 @@
-Copyright 2018 The TensorFlow Authors.  All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
deleted file mode 100755
index 570aa82..0000000
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ /dev/null
@@ -1,548 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Parameterized build and test for TensorFlow Docker images.
-#
-# Usage:
-#   parameterized_docker_build.sh
-#
-# The script obeys the following environment variables:
-#   TF_DOCKER_BUILD_TYPE: (CPU | GPU | MKL | MKL-HOROVOD)
-#     CPU, GPU, MKL or MKL-HOROVOD image
-#
-#   TF_DOCKER_BUILD_IS_DEVEL: (NO | YES)
-#     Is this developer image
-#
-#   TF_DOCKER_BUILD_DEVEL_BRANCH
-#     (Required if TF_DOCKER_BUILD_IS_DEVEL is YES)
-#     Specifies the branch to checkout for devel docker images
-#
-#   TF_DOCKER_BUILD_CENTRAL_PIP
-#     (Optional)
-#     If set to a non-empty string, will use it as the URL from which the
-#     pip wheel file will be downloaded (instead of building the pip locally).
-#
-#   TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL
-#     (Optional)
-#     If set to a non-empty string, we will treat TF_DOCKER_BUILD_CENTRAL_PIP
-#     as a path rather than a url.
-#
-#   TF_DOCKER_BUILD_IMAGE_NAME:
-#     (Optional)
-#     If set to any non-empty value, will use it as the image of the
-#     newly-built image. If not set, the tag prefix tensorflow/tensorflow
-#     will be used.
-#
-#   TF_DOCKER_BUILD_VERSION:
-#     (Optinal)
-#     If set to any non-empty value, will use the version (e.g., 0.8.0) as the
-#     tag prefix of the image. Additional strings, e.g., "-devel-gpu", will be
-#     appended to the tag. If not set, the default tag prefix "latest" will be
-#     used.
-#
-#   TF_DOCKER_BUILD_PORT
-#     (Optional)
-#     If set to any non-empty and valid port number, will use that port number
-#     during basic checks on the newly-built docker image.
-#
-#   TF_DOCKER_BUILD_PUSH_CMD
-#     (Optional)
-#     If set to a valid binary/script path, will call the script with the final
-#     tagged image name with an argument, to push the image to a central repo
-#     such as gcr.io or Docker Hub.
-#
-#   TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS
-#     (Optional)
-#     Do not set this along with TF_DOCKER_BUILD_PUSH_CMD. We will push with the
-#     direct commands as opposed to a script.
-#
-#   TF_DOCKER_USERNAME
-#     (Optional)
-#     Dockerhub username for pushing a package.
-#
-#   TF_DOCKER_EMAIL
-#     (Optional)
-#     Dockerhub email for pushing a package.
-#
-#   TF_DOCKER_PASSWORD
-#     (Optional)
-#     Dockerhub password for pushing a package.
-#
-#   TF_DOCKER_BUILD_PYTHON_VERSION
-#     (Optional)
-#     Specifies the desired Python version. Defaults to PYTHON2.
-#
-#   TF_DOCKER_BUILD_OPTIONS
-#     (Optional)
-#     Specifies the desired build options. Defaults to OPT.
-#
-#   TF_DOCKER_BUILD_ARGS
-#     (Optional)
-#     A list (array) of docker build args. Will be passed to docker build
-#     command as list of --build-arg parameters.
-#
-#   TF_BAZEL_BUILD_OPTIONS
-#     (Optional)
-#     Bazel compiler flags to be passed to the bazelrc file
-
-# Script directory
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-source "${SCRIPT_DIR}/../ci_build/builds/builds_common.sh"
-
-# Help functions
-CHECK_FAILED=0
-mark_check_failed() {
-  # Usage: mark_check_failed <FAILURE_MESSAGE>
-  echo $1
-  CHECK_FAILED=1
-}
-
-TF_DOCKER_BUILD_TYPE=$(to_lower ${TF_DOCKER_BUILD_TYPE})
-TF_DOCKER_BUILD_IS_DEVEL=$(to_lower ${TF_DOCKER_BUILD_IS_DEVEL})
-TF_DOCKER_BUILD_PYTHON_VERSION=$(to_lower ${TF_DOCKER_BUILD_PYTHON_VERSION:-PYTHON2})
-TF_DOCKER_BUILD_OPTIONS=$(to_lower ${TF_DOCKER_BUILD_OPTIONS:-OPT})
-
-echo "Required build parameters:"
-echo "  TF_DOCKER_BUILD_TYPE=${TF_DOCKER_BUILD_TYPE}"
-echo "  TF_DOCKER_BUILD_IS_DEVEL=${TF_DOCKER_BUILD_IS_DEVEL}"
-echo "  TF_DOCKER_BUILD_DEVEL_BRANCH=${TF_DOCKER_BUILD_DEVEL_BRANCH}"
-echo ""
-echo "Optional build parameters:"
-echo "  TF_DOCKER_BUILD_CENTRAL_PIP=${TF_DOCKER_BUILD_CENTRAL_PIP}"
-echo "  TF_DOCKER_BUILD_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME}"
-echo "  TF_DOCKER_BUILD_VERSION=${TF_DOCKER_BUILD_VERSION}"
-echo "  TF_DOCKER_BUILD_PORT=${TF_DOCKER_BUILD_PORT}"
-echo "  TF_DOCKER_BUILD_PUSH_CMD=${TF_DOCKER_BUILD_PUSH_CMD}"
-echo "  TF_DOCKER_BUILD_ARGS=${TF_DOCKER_BUILD_ARGS[@]:-()}"
-echo "  TF_BAZEL_BUILD_OPTIONS=${TF_BAZEL_BUILD_OPTIONS}"
-
-
-CONTAINER_PORT=${TF_DOCKER_BUILD_PORT:-8888}
-
-# Make sure that docker is available on path
-if [[ -z $(which docker) ]]; then
-  die "ERROR: docker is not available on path"
-fi
-
-# Validate the environment-variable options and construct the final image name
-# Final image name with tag
-FINAL_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME:-tensorflow/tensorflow}
-FINAL_TAG=${TF_DOCKER_BUILD_VERSION:-latest}
-
-# Original (unmodified) Dockerfile
-ORIG_DOCKERFILE="Dockerfile"
-
-if [[ ${TF_DOCKER_BUILD_IS_DEVEL} == "yes" ]]; then
-  FINAL_TAG="${FINAL_TAG}-devel"
-  ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.devel"
-
-  if [[ -z "${TF_DOCKER_BUILD_DEVEL_BRANCH}" ]]; then
-    die "ERROR: TF_DOCKER_BUILD_DEVEL_BRANCH is missing for devel docker build"
-  fi
-elif [[ ${TF_DOCKER_BUILD_IS_DEVEL} == "no" ]]; then
-  :
-else
-  die "ERROR: Unrecognized value in TF_DOCKER_BUILD_IS_DEVEL: "\
-"${TF_DOCKER_BUILD_IS_DEVEL}"
-fi
-
-if [[ ${TF_DOCKER_BUILD_TYPE} == "cpu" ]]; then
-  DOCKER_BINARY="docker"
-elif [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
-  DOCKER_BINARY="docker"
-  FINAL_TAG="${FINAL_TAG}-mkl"
-  if [[ ${ORIG_DOCKERFILE} == *"."* ]]; then
-    # There is already a dot in the tag, use "-"
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}-mkl"
-  else
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.mkl"
-  fi
-elif [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-  DOCKER_BINARY="docker"
-  FINAL_TAG="${FINAL_TAG}-mkl-horovod"
-  if [[ ${ORIG_DOCKERFILE} == *"."* ]]; then
-    # There is already a dot in the tag, use "-"
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}-mkl-horovod"
-  else
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.mkl-horovod"
-  fi
-elif   [[ ${TF_DOCKER_BUILD_TYPE} == "gpu" ]]; then
-  DOCKER_BINARY="nvidia-docker"
-
-  FINAL_TAG="${FINAL_TAG}-gpu"
-  if [[ ${ORIG_DOCKERFILE} == *"."* ]]; then
-    # There is already a dot in the tag, use "-"
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}-gpu"
-  else
-    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.gpu"
-  fi
-else
-  die "ERROR: Unrecognized value in TF_DOCKER_BUILD_TYPE: "\
-"${TF_DOCKER_BUILD_TYPE}"
-fi
-
-if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python2" ]]; then
-  :
-elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-  FINAL_TAG="${FINAL_TAG}-py3"
-elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
-  FINAL_TAG="${FINAL_TAG}-py3.6"
-else
-  die "Unrecognized value in TF_DOCKER_BUILD_PYTHON_VERSION: "\
-"${TF_DOCKER_BUILD_PYTHON_VERSION}"
-fi
-
-# Verify that the original Dockerfile exists
-ORIG_DOCKERFILE="${SCRIPT_DIR}/${ORIG_DOCKERFILE}"
-if [[ ! -f "${ORIG_DOCKERFILE}" ]]; then
-  die "ERROR: Cannot find Dockerfile at: ${ORIG_DOCKERFILE}"
-fi
-
-echo ""
-echo "FINAL_IMAGE_NAME: ${FINAL_IMAGE_NAME}"
-echo "FINAL_TAG: ${FINAL_TAG}"
-echo "Original Dockerfile: ${ORIG_DOCKERFILE}"
-echo ""
-
-# Create tmp directory for Docker build
-TMP_DIR=$(mktemp -d)
-echo ""
-echo "Docker build will occur in temporary directory: ${TMP_DIR}"
-
-# Copy all files to tmp directory for Docker build
-cp -r ${SCRIPT_DIR}/* "${TMP_DIR}/"
-
-if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
-  DOCKERFILE="${TMP_DIR}/Dockerfile"
-
-  if [[ -z "${TF_DOCKER_BUILD_CENTRAL_PIP}" ]]; then
-    # Perform local build of the required PIP whl file
-    export TF_BUILD_CONTAINER_TYPE=${TF_DOCKER_BUILD_TYPE}
-    export TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION}
-    export TF_BUILD_OPTIONS=${TF_DOCKER_BUILD_OPTIONS}
-    export TF_BUILD_IS_PIP="PIP"
-
-    if [[ "${TF_DOCKER_BUILD_TYPE}" == "mkl" ]]; then
-      die "FAIL: Non-development MKL builds require a pre-built pip whl."
-    fi
-
-    if [[ "${TF_DOCKER_BUILD_TYPE}" == "mkl-horovod" ]]; then
-      die "FAIL: Non-development MKL-HOROVOD builds require a pre-built pip whl."
-    fi
-
-    if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
-      export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
-  "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0"
-    fi
-
-    pushd "${SCRIPT_DIR}/../../../"
-    rm -rf pip_test/whl &&
-    tensorflow/tools/ci_build/ci_parameterized_build.sh
-    PIP_BUILD_EXIT_CODE=$?
-    popd
-
-    # Was the pip build successful?
-    if [[ ${PIP_BUILD_EXIT_CODE} != "0" ]]; then
-      die "FAIL: Failed to build pip file locally"
-    fi
-
-    PIP_WHL=$(ls pip_test/whl/*.whl | head -1)
-    if [[ -z "${PIP_WHL}" ]]; then
-      die "ERROR: Cannot locate the locally-built pip whl file"
-    fi
-    echo "Locally-built PIP whl file is at: ${PIP_WHL}"
-
-    # Copy the pip file to tmp directory
-    cp "${PIP_WHL}" "${TMP_DIR}/" || \
-        die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
-
-    # Use string replacement to put the correct file name into the Dockerfile
-    PIP_WHL=$(basename "${PIP_WHL}")
-
-    # Modify the non-devel Dockerfile to point to the correct pip whl file
-    # location
-    sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
-"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
-"COPY ${PIP_WHL} /\n"\
-"RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
-    > "${DOCKERFILE}"
-
-  # Build from a local whl file path rather than an URL
-  elif [[ ! -z "${TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL}" ]]; then
-    PIP_WHL="${TF_DOCKER_BUILD_CENTRAL_PIP}"
-    if [[ -z "${PIP_WHL}" ]]; then
-      die "ERROR: Cannot locate the specified pip whl file"
-    fi
-    echo "Specified PIP whl file is at: ${PIP_WHL}"
-
-    # Copy the pip file to tmp directory
-    cp "${PIP_WHL}" "${TMP_DIR}/" || \
-        die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
-
-    # Use string replacement to put the correct file name into the Dockerfile
-    PIP_WHL=$(basename "${PIP_WHL}")
-
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
-        [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-      TF_DOCKER_BUILD_ARGS+=("--build-arg TF_WHL_URL=${PIP_WHL}" )
-      cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
-    else
-      # Modify the non-devel Dockerfile to point to the correct pip whl file
-      # location
-      sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
-"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
-"COPY ${PIP_WHL} /\n"\
-"RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
-      > "${DOCKERFILE}"    
-    fi
-    echo "Using local pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
-    echo
-  else
-    echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
-        [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-      pushd "${TMP_DIR}/"
-      curl -O ${TF_DOCKER_BUILD_CENTRAL_PIP}
-      popd
-      PIP_WHL_PATH=`find ${TMP_DIR} -name "*.whl"`
-      PIP_WHL=$(basename "${PIP_WHL_PATH}")
-      echo "PIP_WHL= ${PIP_WHL}"    
-      echo
-      TF_DOCKER_BUILD_ARGS+=("--build-arg TF_WHL_URL=${PIP_WHL}")
-      cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
-    else
-      # Modify the non-devel Dockerfile to point to the correct pip whl URL.
-      sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
-"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
-"RUN pip --no-cache-dir install ${TF_DOCKER_BUILD_CENTRAL_PIP}" "${ORIG_DOCKERFILE}" \
-      > "${DOCKERFILE}"
-    fi
-  fi
-
-  echo "Modified Dockerfile at: ${DOCKERFILE}"
-  echo
-
-  # Modify python/pip version if necessary.
-  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
-          [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${TF_DOCKER_BUILD_PYTHON_VERSION}")
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON_DEV=python3-dev")
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
-        cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
-    else
-        if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
-            sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
-            sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-            sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
-        then
-          echo "Modified Dockerfile for python version "\
-    "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
-        else
-          die "FAILED to modify ${DOCKERFILE} for python3"
-        fi
-    fi
-  fi
-else # TF_DOCKER_BUILD_IS_DEVEL == 'yes'
-  DOCKERFILE="${TMP_DIR}/Dockerfile"
-
-  # Set up Dockerfile ARGS for mkl and mkl-horovod build
-  if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
-      [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-    if [[ -z "${TF_BAZEL_BUILD_OPTIONS// }" ]]; then
-      TF_BAZEL_BUILD_OPTIONS=("--config=mkl --copt=-mavx --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0")
-    else
-      TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}"
-    fi   
-    TF_DOCKER_BUILD_ARGS+=("--build-arg TF_BUILD_VERSION=${TF_DOCKER_BUILD_DEVEL_BRANCH}")
-    echo "TF_DOCKER_BUILD_ARGS=${TF_DOCKER_BUILD_ARGS[@]}"
-
-    # Pass the build options to bazel using the user-specific .bazelrc file
-    echo "build ${TF_BAZEL_BUILD_OPTIONS}" >> ${TMP_DIR}/.bazelrc
-    cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
-  else
-    # Modify the devel Dockerfile to specify the git branch
-    sed "s/^RUN git clone --branch=.* --depth=1/RUN git clone --branch=${TF_DOCKER_BUILD_DEVEL_BRANCH} --depth=1/" \
-        "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
-  fi
-
-  # Modify python/pip version if necessary.
-  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]] || [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${TF_DOCKER_BUILD_PYTHON_VERSION}")
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON3_DEV=python3-dev")
-        TF_DOCKER_BUILD_ARGS+=("--build-arg WHL_DIR=/tmp/pip3")
-        TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
-        cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
-    else
-      if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]] && [[ "${TF_DOCKER_BUILD_TYPE}" != "mkl" ]]; then
-        die "Python 3.6 build only supported for MKL builds."
-      fi
-      if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
-         sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
-         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
-         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-         sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
-      then
-        echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
-      else
-        die "FAILED to modify ${DOCKERFILE} for python3"
-      fi
-    fi
-  fi
-fi
-
-# Perform docker build
-# Intermediate image name with tag
-IMG="${USER}/tensorflow:${FINAL_TAG}"
-echo "Building docker image with image name and tag: ${IMG}"
-echo "TF_DOCKER_BUILD_ARGS=${TF_DOCKER_BUILD_ARGS[@]}"
-CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${IMG} -f ${DOCKERFILE} ${TMP_DIR}"
-echo "CMD=${CMD}"
-${CMD}
-
-if [[ $? == "0" ]]; then
-  echo "${DOCKER_BINARY} build of ${IMG} succeeded"
-else
-  die "FAIL: ${DOCKER_BINARY} build of ${IMG} with Dockerfile ${DOCKERFILE} "\
-"failed"
-fi
-
-
-# Make sure that there is no other containers of the same image running
-# TODO(cais): Move to an earlier place.
-if "${DOCKER_BINARY}" ps | grep -q "${IMG}"; then
-  die "ERROR: It appears that there are docker containers of the image "\
-"${IMG} running. Please stop them before proceeding"
-fi
-
-# Start a docker container from the newly-built docker image
-DOCKER_RUN_LOG="${TMP_DIR}/docker_run.log"
-echo ""
-echo "Running docker container from image ${IMG}..."
-echo "  Log file is at: ${DOCKER_RUN_LOG}"
-echo ""
-
-if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
-  "${DOCKER_BINARY}" run --rm -p ${CONTAINER_PORT}:${CONTAINER_PORT} \
-      -v ${TMP_DIR}/notebooks:/root/notebooks "${IMG}" \
-      2>&1 > "${DOCKER_RUN_LOG}" &
-
-  # Get the container ID
-  CONTAINER_ID=""
-  while [[ -z ${CONTAINER_ID} ]]; do
-    sleep 1
-    echo "Polling for container ID..."
-    CONTAINER_ID=$("${DOCKER_BINARY}" ps | grep "${IMG}" | awk '{print $1}')
-  done
-
-  echo "ID of the running docker container: ${CONTAINER_ID}"
-  echo ""
-
-  if [[ ${TF_DOCKER_BUILD_IS_DEVEL} == "no" ]]; then
-    # Non-devel docker build: Do some basic sanity checks on jupyter notebook
-    # on the running docker container
-    echo ""
-    echo "Performing basic sanity checks on the running container..."
-    if wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null
-    then
-      echo "  PASS: wget tree"
-    else
-      mark_check_failed "  FAIL: wget tree"
-    fi
-
-    for NB in ${TMP_DIR}/notebooks/*.ipynb; do
-      NB_BASENAME=$(basename "${NB}")
-      NB_URL="http://127.0.0.1:${CONTAINER_PORT}/notebooks/${NB_BASENAME}"
-      if wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null
-      then
-        echo "  PASS: wget ${NB_URL}"
-      else
-        mark_check_failed  "  FAIL: wget ${NB_URL}"
-      fi
-    done
-  fi
-
-  # Stop the running docker container
-  sleep 1
-  "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID}
-fi
-
-
-# Clean up
-echo "Cleaning up temporary directory: ${TMP_DIR} ..."
-rm -rf "${TMP_DIR}" || echo "ERROR: Failed to remove directory ${TMP_DIR}"
-
-
-# Summarize result
-echo ""
-if [[ ${CHECK_FAILED} == "0" ]]; then
-  echo "PASS: basic checks on newly-built image \"${IMG}\" succeeded"
-else
-  die "FAIL: basic checks on newly-built image \"${IMG}\" failed"
-fi
-
-
-# Apply the final image name and tag
-FINAL_IMG="${FINAL_IMAGE_NAME}:${FINAL_TAG}"
-
-DOCKER_VER=$("${DOCKER_BINARY}" version | grep Version | head -1 | awk '{print $NF}')
-if [[ -z "${DOCKER_VER}" ]]; then
-  die "ERROR: Failed to determine ${DOCKER_BINARY} version"
-fi
-DOCKER_MAJOR_VER=$(echo "${DOCKER_VER}" | cut -d. -f 1)
-DOCKER_MINOR_VER=$(echo "${DOCKER_VER}" | cut -d. -f 2)
-
-FORCE_TAG=""
-if [[ "${DOCKER_MAJOR_VER}" -le 1 ]] && \
-   [[ "${DOCKER_MINOR_VER}" -le 9 ]]; then
-  FORCE_TAG="--force"
-fi
-
-"${DOCKER_BINARY}" tag ${FORCE_TAG} "${IMG}" "${FINAL_IMG}" || \
-    die "Failed to tag intermediate docker image ${IMG} as ${FINAL_IMG}"
-
-echo ""
-echo "Successfully tagged docker image: ${FINAL_IMG}"
-
-# Optional: call command specified by TF_DOCKER_BUILD_PUSH_CMD to push image
-if [[ ! -z "${TF_DOCKER_BUILD_PUSH_CMD}" ]]; then
-  ${TF_DOCKER_BUILD_PUSH_CMD} ${FINAL_IMG}
-  if [[ $? == "0" ]]; then
-    echo "Successfully pushed Docker image ${FINAL_IMG}"
-  else
-    die "FAIL: Failed to push Docker image ${FINAL_IMG}"
-  fi
-fi
-
-# Optional: set TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS to push image
-if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
-
-  docker login -u "${TF_DOCKER_USERNAME}" \
-  -p "${TF_DOCKER_PASSWORD}"
-
-  if [[ $? != "0" ]]; then
-    die "FAIL: Unable to login. Invalid credentials."
-  fi
-  docker push "${FINAL_IMG}"
-  if [[ $? == "0" ]]; then
-    docker logout
-    echo "Successfully pushed Docker image ${FINAL_IMG}"
-  else
-    docker logout
-    die "FAIL: Failed to push Docker image ${FINAL_IMG}"
-  fi
-fi
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index 50b0cc5..927246a 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -2,16 +2,13 @@
 
 This directory houses TensorFlow's Dockerfiles and the infrastructure used to
 create and deploy them to
-[Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow).
+[TensorFlow's Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow).
 
 **DO NOT EDIT THE DOCKERFILES/ DIRECTORY MANUALLY!** The files within are
 maintained by `assembler.py`, which builds Dockerfiles from the files in
 `partials/` and the rules in `spec.yml`. See
 [the Contributing section](#contributing) for more information.
 
-These Dockerfiles are planned to replace the Dockerfiles used to generate
-[TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow).
-
 ## Building
 
 The Dockerfiles in the `dockerfiles` directory must have their build context set
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 5e84f36..ca0b5a6 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -38,6 +38,7 @@
 import re
 import shutil
 import sys
+import json
 
 from absl import app
 from absl import flags
@@ -582,17 +583,47 @@
       image, logs = None, []
       if not FLAGS.dry_run:
         try:
-          image, logs = dock.images.build(
+          # Use low level APIClient in order to stream log output
+          resp = dock.api.build(
               timeout=FLAGS.hub_timeout,
               path='.',
               nocache=FLAGS.nocache,
               dockerfile=dockerfile,
               buildargs=tag_def['cli_args'],
               tag=repo_tag)
-
-          # Print logs after finishing
-          log_lines = [l.get('stream', '') for l in logs]
-          eprint(''.join(log_lines))
+          last_event = None
+          image_id = None
+          # Stream the build log as it arrives, remembering the id of the
+          # built image and the last message (used for error reporting).
+          for chunk in resp:
+            # 'replace' keeps a malformed byte from aborting the whole build.
+            output = chunk.decode('utf-8', errors='replace')
+            try:
+              json_output = json.loads(output.strip('\r\n'))
+            except ValueError:
+              eprint('Error parsing from docker image build: {}'.format(output))
+              continue
+            if 'stream' in json_output:
+              eprint(json_output['stream'], end='')
+              match = re.search(r'(^Successfully built |sha256:)([0-9a-f]+)$',
+                                json_output['stream'])
+              if match:
+                image_id = match.group(2)
+              last_event = json_output['stream']
+              # collect all log lines into the logs object
+              logs.append(json_output)
+            elif 'error' in json_output:
+              # A failed build step is reported under the 'error' key rather
+              # than 'stream'; keep it so the real failure reason is surfaced.
+              eprint(json_output['error'])
+              last_event = json_output['error']
+              logs.append(json_output)
+          eprint('Docker image build complete.')
+          # If Image ID is not set, the image failed to build properly. Raise
+          # an error in this case with the last log line and all logs
+          if image_id:
+            image = dock.images.get(image_id)
+          else:
+            raise docker.errors.BuildError(last_event or 'Unknown', logs)
 
           # Run tests if requested, and dump output
           # Could be improved by backgrounding, but would need better
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index b11e1b9..fd3c807 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -34,6 +34,7 @@
         pkg-config \
         rsync \
         software-properties-common \
+        sudo \
         unzip \
         zip \
         zlib1g-dev \
@@ -77,6 +78,7 @@
     wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -90,6 +92,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index 6bb1146..02816e5 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -34,6 +34,7 @@
         pkg-config \
         rsync \
         software-properties-common \
+        sudo \
         unzip \
         zip \
         zlib1g-dev \
@@ -77,6 +78,7 @@
     wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -90,6 +92,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index 6013b59..0867e22 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -110,6 +110,7 @@
     wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -123,6 +124,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index 86c01ad..3e52a11 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -110,6 +110,7 @@
     wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -123,6 +124,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
index c8384f7..ab5ea14 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
@@ -34,6 +34,7 @@
         pkg-config \
         rsync \
         software-properties-common \
+        sudo \
         unzip \
         zip \
         zlib1g-dev \
@@ -76,6 +77,7 @@
     git \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -89,6 +91,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
index 08f880e..54d393b 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
@@ -34,6 +34,7 @@
         pkg-config \
         rsync \
         software-properties-common \
+        sudo \
         unzip \
         zip \
         zlib1g-dev \
@@ -76,6 +77,7 @@
     git \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -89,6 +91,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
index 6935dd1..bb331f6 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
@@ -109,6 +109,7 @@
     git \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -122,6 +123,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
index 42eca62..3c7a455 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
@@ -109,6 +109,7 @@
     git \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -122,6 +123,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
index 8156db6..3e617b6 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -5,6 +5,7 @@
     wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -18,6 +19,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
index 0397ab5..471f7f8 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
@@ -4,6 +4,7 @@
     git \
     openjdk-8-jdk \
     ${PYTHON}-dev \
+    virtualenv \
     swig
 
 RUN ${PIP} --no-cache-dir install \
@@ -17,6 +18,7 @@
     scipy \
     sklearn \
     pandas \
+    portpicker \
     && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
     enum34
 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index a1fd901..b97ab03 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -11,6 +11,7 @@
         pkg-config \
         rsync \
         software-properties-common \
+        sudo \
         unzip \
         zip \
         zlib1g-dev \
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index aaee454..bd01a58 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -25,6 +25,7 @@
     srcs = [
         "doc_generator_visitor_test.py",
     ],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":doc_generator_visitor",
@@ -44,6 +45,7 @@
     name = "doc_controls_test",
     size = "small",
     srcs = ["doc_controls_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":doc_controls",
@@ -68,6 +70,7 @@
     name = "parser_test",
     size = "small",
     srcs = ["parser_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":parser",
@@ -103,6 +106,7 @@
     name = "generate_lib_test",
     size = "small",
     srcs = ["generate_lib_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":generate_lib",
@@ -114,6 +118,7 @@
 py_binary(
     name = "generate",
     srcs = ["generate.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":generate_lib",
@@ -127,6 +132,7 @@
     name = "build_docs_test",
     size = "medium",
     srcs = ["build_docs_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         # No reason to run sanitizers or fastbuild for this test.
@@ -145,6 +151,7 @@
 py_test(
     name = "generate2_test",
     srcs = ["generate2_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     tags = [
         "manual",
@@ -162,6 +169,7 @@
 py_binary(
     name = "generate2",
     srcs = ["generate2.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [":generate2_lib"],
 )
@@ -186,6 +194,7 @@
     name = "py_guide_parser_test",
     size = "small",
     srcs = ["py_guide_parser_test.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":py_guide_parser",
diff --git a/tensorflow/tools/docs/doc_controls.py b/tensorflow/tools/docs/doc_controls.py
index 5e52644..27a1d20 100644
--- a/tensorflow/tools/docs/doc_controls.py
+++ b/tensorflow/tools/docs/doc_controls.py
@@ -240,6 +240,9 @@
   return obj
 
 
+do_not_doc_in_subclasses = for_subclass_implementers
+
+
 def should_skip(obj):
   """Returns true if docs generation should be skipped for this object.
 
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index 8a63b83..7b3796e 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -89,6 +89,28 @@
     return (canonical_score,) + scores
 
 
+def _hide_layer_and_module_methods():
+  """Hide methods and properties that keras layers inherit from base classes."""
+  # A class `__dict__` lists only what that class itself defines.
+  members = list(tf.Module.__dict__.items())
+  members += list(tf.keras.layers.Layer.__dict__.items())
+
+  for attr_name, member in members:
+    if attr_name == "__init__":
+      continue
+
+    # Unwrap descriptors; the doc control is applied to the wrapped function.
+    if isinstance(member, property):
+      member = member.fget
+    if isinstance(member, (staticmethod, classmethod)):
+      member = member.__func__
+
+    try:
+      doc_controls.do_not_doc_in_subclasses(member)
+    except AttributeError:  # Best effort: skip objects that reject it.
+      pass
+
+
 def build_docs(output_dir, code_url_prefix, search_hints=True):
   """Build api docs for tensorflow v2.
 
@@ -97,6 +119,8 @@
     code_url_prefix: prefix for "Defined in" links.
     search_hints: Bool. Include meta-data search hints at the top of each file.
   """
+  _hide_layer_and_module_methods()
+
   try:
     doc_controls.do_not_generate_docs(tf.tools)
   except AttributeError:
@@ -105,16 +129,16 @@
   base_dir = path.dirname(tf.__file__)
   base_dirs = (
       base_dir,
-      path.normpath(path.join(base_dir, "../../tensorflow")),
+      # External packages base directories
       path.dirname(tensorboard.__file__),
       path.dirname(tensorflow_estimator.__file__),
   )
 
   code_url_prefixes = (
       code_url_prefix,
-      # External packages source repositories
-      "https://github.com/tensorflow/tensorboard/tree/master/tensorboard"
-      "https://github.com/tensorflow/estimator/tree/master/tensorflow_estimator"
+      # External packages source repositories
+      "https://github.com/tensorflow/tensorboard/tree/master/tensorboard",
+      "https://github.com/tensorflow/estimator/tree/master/tensorflow_estimator",
   )
 
   doc_generator = generate_lib.DocGenerator(
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index f0149cd..46c4933 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -4,6 +4,7 @@
 package(default_visibility = ["//visibility:private"])
 
 load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow:tensorflow.bzl", "VERSION", "VERSION_MAJOR", "if_cuda", "if_macos", "if_not_windows")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
@@ -196,7 +197,9 @@
             "@grpc//:LICENSE",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
-    ) + tf_additional_license_deps(),
+    ) + if_rocm([
+        "@rocprim_archive//:LICENSE.txt",
+    ]) + tf_additional_license_deps(),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
@@ -261,6 +264,8 @@
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
+    ]) + if_rocm([
+        "@rocprim_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
diff --git a/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
index 0d9f26c..402c709 100644
--- a/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
+++ b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
@@ -21,40 +21,66 @@
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/optimization/optimization_pass_runner.h"
 
-int main(int argc, char** argv) {
+namespace tensorflow {
+namespace {
+Status RealMain(int argc, char** argv) {
+  string input_file_path;
+  string output_file_path;
+  string optimization_pass;
+
+  const std::vector<Flag> flag_list = {
+      Flag("input_file_path", &input_file_path, "Location of the input graph."),
+      Flag("output_file_path", &output_file_path,
+           "Location to write the resulting graph."),
+      // For now only a single optimization pass can be run.
+      Flag("optimization_pass", &optimization_pass,
+           "Which optimization pass to run."),
+  };
+  if (!Flags::Parse(&argc, argv, flag_list)) {
+    return errors::FailedPrecondition("Invalid flags passed");
+  }
+  port::InitMain(argv[0], &argc, &argv);
+
+  if (input_file_path.empty()) {
+    return errors::FailedPrecondition("input_file_path is a required flag.");
+  }
+  if (output_file_path.empty()) {
+    return errors::FailedPrecondition("output_file_path is a required flag.");
+  }
+  if (optimization_pass.empty()) {
+    return errors::FailedPrecondition("optimization_pass is a required flag.");
+  }
+
+  GraphDef graphdef_input;
+  TF_RETURN_IF_ERROR(
+      ReadTextProto(Env::Default(), input_file_path, &graphdef_input));
+
   tensorflow::OptimizationPassRunner runner;
-  // Add fake devices for CPU, GPU, and XLA to ensure we have all devices we
-  // need.
+
   // Most machines in our servers currently use 8 gpus. There is nothing special
   // about this number and it can be decreased or increased to test other
   // configurations.
-  int num_gpus_per_machine = 8;
-  for (int i = 0; i < num_gpus_per_machine; i++) {
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i),
-        tensorflow::DEVICE_CPU));
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i),
-        tensorflow::DEVICE_GPU));
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_CPU:", i),
-        tensorflow::DEVICE_XLA_CPU));
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_GPU:", i),
-        tensorflow::DEVICE_XLA_GPU));
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU_XLA_JIT:", i),
-        tensorflow::DEVICE_CPU_XLA_JIT));
-    TF_CHECK_OK(runner.AddDevice(
-        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU_XLA_JIT:", i),
-        tensorflow::DEVICE_GPU_XLA_JIT));
-  }
+  TF_RETURN_IF_ERROR(runner.AddCpus(8));
+  TF_RETURN_IF_ERROR(runner.AddGpus(8));
+
   // This binary is used to test TF:XLA behavior, so turn on auto_jit.
-  TF_CHECK_OK(runner.SetJitLevel(tensorflow::OptimizerOptions::GlobalJitLevel::
-                                     OptimizerOptions_GlobalJitLevel_ON_2));
-  // Run the actual "main" function.
-  TF_CHECK_OK(runner.RunMain(argc, argv));
+  TF_RETURN_IF_ERROR(
+      runner.SetJitLevel(tensorflow::OptimizerOptions::GlobalJitLevel::
+                             OptimizerOptions_GlobalJitLevel_ON_2));
+  GraphDef graphdef_output;
+  TF_RETURN_IF_ERROR(runner.Run(optimization_pass, std::move(graphdef_input),
+                                &graphdef_output));
+  return WriteTextProto(Env::Default(), output_file_path, graphdef_output);
+}
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  TF_CHECK_OK(tensorflow::RealMain(argc, argv));
+  return 0;
 }
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.cc b/tensorflow/tools/optimization/optimization_pass_runner.cc
index 231ff08..162d39d 100644
--- a/tensorflow/tools/optimization/optimization_pass_runner.cc
+++ b/tensorflow/tools/optimization/optimization_pass_runner.cc
@@ -38,13 +38,10 @@
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session_options.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
-
 namespace {
 // A fake device used to populate a DeviceSet.
 class FakeDevice : public Device {
@@ -68,37 +65,34 @@
   device_attributes.set_device_type(DeviceType(type).type());
   return std::unique_ptr<Device>(new FakeDevice(device_attributes));
 }
+
+Status FindPassWithName(absl::string_view name,
+                        GraphOptimizationPass** result) {
+  *result = nullptr;
+  // Search every registered group and phase; exactly one pass may match `name`.
+  for (const auto& groups_and_passes :
+       OptimizationPassRegistry::Global()->groups()) {
+    for (const auto& phase_and_passes : groups_and_passes.second) {
+      for (const auto& pass : phase_and_passes.second) {
+        if (pass->name() == name) {
+          if (*result) {
+            return errors::Internal("Found more than one pass with name ",
+                                    name);
+          }
+          *result = pass.get();
+        }
+      }
+    }
+  }
+
+  return *result == nullptr
+             ? errors::Internal("Could not find pass with name ", name)
+             : Status::OK();
+}
 }  // namespace
 
-Status OptimizationPassRunner::RunMain(int argc, char** argv) {
-  string input_file_path;
-  string output_file_path;
-  string optimization_pass;
-
-  const std::vector<Flag> flag_list = {
-      Flag("input_file_path", &input_file_path, "Location of the input graph."),
-      Flag("output_file_path", &output_file_path,
-           "Location to write the resulting graph."),
-      // For now only a single optimization pass can be run.
-      Flag("optimization_pass", &optimization_pass,
-           "Which optimization pass to run."),
-  };
-  if (!Flags::Parse(&argc, argv, flag_list)) {
-    return errors::FailedPrecondition("Invalid flags passed");
-  }
-  port::InitMain(argv[0], &argc, &argv);
-
-  if (input_file_path.empty()) {
-    return errors::FailedPrecondition("input_file_path is a required flag.");
-  }
-  if (output_file_path.empty()) {
-    return errors::FailedPrecondition("output_file_path is a required flag.");
-  }
-  if (optimization_pass.empty()) {
-    return errors::FailedPrecondition("optimization_pass is a required flag.");
-  }
-
-  // Turn on XLA Auto-Jit.
+Status OptimizationPassRunner::Run(absl::string_view pass_to_run,
+                                   GraphDef input, GraphDef* result) {
   auto session_options = absl::make_unique<SessionOptions>();
   session_options->config.mutable_graph_options()
       ->mutable_optimizer_options()
@@ -107,19 +101,18 @@
   std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
 
   GraphOptimizationPassOptions options;
-  options.session_options = session_options.release();
+  options.session_options = session_options.get();
   options.graph = &graph;
-  options.flib_def =
-      new FunctionLibraryDefinition((*options.graph)->op_registry(), flib);
+  std::unique_ptr<FunctionLibraryDefinition> flib_def(
+      new FunctionLibraryDefinition((*options.graph)->op_registry(), flib));
+  options.flib_def = flib_def.get();
 
   // Grab the data
-  GraphDef graphdef;
   GraphConstructorOptions graph_opts;
   graph_opts.expect_device_spec = true;
   graph_opts.allow_internal_ops = true;
-  TF_RETURN_IF_ERROR(ReadTextProto(Env::Default(), input_file_path, &graphdef));
   TF_RETURN_IF_ERROR(
-      ConvertGraphDefToGraph(graph_opts, graphdef, options.graph->get()));
+      ConvertGraphDefToGraph(graph_opts, input, options.graph->get()));
 
   // Add all devices that were previously configured with AddDevice.
   DeviceSet device_set;
@@ -128,27 +121,11 @@
   }
   options.device_set = &device_set;
 
-  Status result = errors::NotFound(
-      "An OptimizationPass was not found with the desired name.");
+  GraphOptimizationPass* pass;
+  TF_RETURN_IF_ERROR(FindPassWithName(pass_to_run, &pass));
+  TF_RETURN_IF_ERROR(pass->Run(options));
 
-  // Run the optimization pass specified by the command line flag.
-  for (const auto& groups_and_passes :
-       OptimizationPassRegistry::Global()->groups()) {
-    for (const auto& phase_and_passes : groups_and_passes.second) {
-      for (const auto& pass : phase_and_passes.second) {
-        if (pass->name() == optimization_pass) {
-          result = pass->Run(options);
-        }
-      }
-    }
-  }
-
-  TF_RETURN_IF_ERROR(result);
-
-  // Write out the result.
-  options.graph->get()->ToGraphDef(&graphdef);
-  TF_RETURN_IF_ERROR(
-      WriteTextProto(Env::Default(), output_file_path, graphdef));
+  options.graph->get()->ToGraphDef(result);
   return Status::OK();
 }
 
@@ -158,10 +135,17 @@
   return Status::OK();
 }
 
-Status OptimizationPassRunner::AddDevice(const string& name,
-                                         const string& type) {
-  devices_.push_back(FakeDevice::Make(name, type));
+Status OptimizationPassRunner::AddDevices(absl::string_view type, int count) {
+  for (int i = 0; i < count; i++) {
+    devices_.push_back(FakeDevice::Make(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:", type, ":", i),
+        absl::StrCat(type)));
+    devices_.push_back(FakeDevice::Make(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_", type, ":",
+                     i),
+        absl::StrCat(type)));
+  }
+
   return Status::OK();
 }
-
 }  // namespace tensorflow
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.h b/tensorflow/tools/optimization/optimization_pass_runner.h
index 3b26f64..e0ce859 100644
--- a/tensorflow/tools/optimization/optimization_pass_runner.h
+++ b/tensorflow/tools/optimization/optimization_pass_runner.h
@@ -34,24 +34,23 @@
       : jit_level_(OptimizerOptions::GlobalJitLevel::
                        OptimizerOptions_GlobalJitLevel_DEFAULT) {}
 
-  // Add a fake device to the (initially empty) DeviceSet used for optimization.
-  // Names are of the form: "/job:localhost/replica:0/task:0/device:CPU:0"
-  Status AddDevice(const string& name, const string& type);
-
   // Increasing the Jit level will cause XLA to compile parts of the tensorflow
   // graph that it is able to.
   Status SetJitLevel(OptimizerOptions::GlobalJitLevel jit_level);
 
-  // This can be called after adding devices and setting the jit level to parse
-  // command line flags and run the specified job. All 3 flags are required:
-  // input_file_path, output_file_path, optimization_pass.
-  //
-  // If this library becomes heavily used, the caller should be responsible for
-  // parsing any command line flags desired rather than this Method handling the
-  // work of a main() function.
-  Status RunMain(int argc, char** argv);
+  Status Run(absl::string_view pass_to_run, GraphDef input, GraphDef* result);
+
+  Status AddCpus(int count) {
+    return AddDevices(tensorflow::DEVICE_CPU, count);
+  }
+
+  Status AddGpus(int count) {
+    return AddDevices(tensorflow::DEVICE_GPU, count);
+  }
 
  private:
+  Status AddDevices(absl::string_view type, int count);
+
   OptimizerOptions::GlobalJitLevel jit_level_;
   std::vector<std::unique_ptr<Device>> devices_;
 };
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c8ac9e4..87f7676 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -11,6 +11,7 @@
 )
 load("//third_party/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
 load(
@@ -59,6 +60,7 @@
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
+    "//tensorflow/examples/saved_model/integration_tests:mnist_util",
     "//tensorflow/lite/python/testdata:interpreter_test_data",
     "//tensorflow/lite/python:tflite_convert",
     "//tensorflow/lite/toco/python:toco_from_protos",
@@ -83,6 +85,7 @@
     "//tensorflow/python/distribute:combinations",
     "//tensorflow/python/eager:eager_pip",
     "//tensorflow/python/keras/distribute:distribute_strategy_test_lib",
+    "//tensorflow/python/keras/distribute:mnist_multi_worker_lib",
     "//tensorflow/python/keras/mixed_precision/experimental:test_util",
     "//tensorflow/python/kernel_tests/random:util",
     "//tensorflow/python/kernel_tests/signal:test_util",
@@ -93,6 +96,12 @@
     "//tensorflow/python/tools/api/generator:create_python_api",
     "//tensorflow/python:test_ops",
     "//tensorflow/python:while_v2",
+    "//tensorflow/tools/common:public_api",
+    "//tensorflow/tools/common:test_module1",
+    "//tensorflow/tools/docs:doc_generator_visitor",
+    "//tensorflow/tools/docs:generate_lib",
+    "//tensorflow/tools/docs:parser",
+    "//tensorflow/tools/docs:py_guide_parser",
 ]
 
 COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
@@ -242,6 +251,8 @@
         "@ngraph_tf//:LICENSE",
         "@nlohmann_json_lib//:LICENSE.MIT",
         "@tbb//:LICENSE",
+    ]) + if_rocm([
+        "@rocprim_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
 
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index dad55bd..305033c 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -125,7 +125,7 @@
   mkdir -p ${TMPDIR}/google
   mkdir -p ${TMPDIR}/third_party
   pushd ${RUNFILES%org_tensorflow} > /dev/null
-  for header in $(find protobuf_archive -regex ".*\.\(h\|inc\)"); do
+  for header in $(find protobuf_archive -name "*.h" -o -name "*.inc"); do
     mkdir -p "${TMPDIR}/google/$(dirname ${header})"
     cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 5f21c5a..571fe6d 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -36,7 +36,6 @@
     "tensorflow/lite/delegates/gpu",
     "tensorflow/lite/delegates/gpu/metal",
     "tensorflow/lite/delegates/gpu/metal/kernels",
-    "tensorflow/lite/examples/android",
     "tensorflow/lite/experimental/objc",
     "tensorflow/lite/experimental/swift",
 ]
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index cedf149..0e7240f 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -53,7 +53,7 @@
     'absl-py >= 0.7.0',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'google_pasta >= 0.1.2',
+    'google_pasta >= 0.1.6',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
     'numpy >= 1.14.5, < 2.0',
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index ef12226..0d26c01 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -32,6 +32,7 @@
 py_binary(
     name = "system_info",
     srcs = ["system_info.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     deps = [
         ":system_info_lib",
@@ -55,6 +56,7 @@
 py_binary(
     name = "run_and_gather_logs",
     srcs = ["run_and_gather_logs.py"],
+    python_version = "PY2",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [":run_and_gather_logs_main_lib"],
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5567bc9..d4e4de2 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -55,6 +55,8 @@
 # path_prefix is no longer used.
 # tf_repo_name is thought to be under consideration.
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
+    """All external dependencies for TF builds."""
+
     # Note that we check the minimum bazel version in WORKSPACE.
     clang6_configure(name = "local_config_clang6")
     cc_download_clang_toolchain(name = "local_config_download_clang")
@@ -135,11 +137,11 @@
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "00d5ad2c5702be911239df287504f9da6985e8bc563ec6a8436552da1ac2938c",
-        strip_prefix = "abseil-cpp-d902eb869bcfacc1bad14933ed9af4bed006d481",
+        sha256 = "29495db3b8db37e1247a161cadf3d4bbbedb030a04a82e7f12b5c0d521d052e0",
+        strip_prefix = "abseil-cpp-27c2f6e2f3b5929fbd322b0f0ca392eb02efd9f8",
         urls = [
-            "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/d902eb869bcfacc1bad14933ed9af4bed006d481.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/d902eb869bcfacc1bad14933ed9af4bed006d481.tar.gz",
+            "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/27c2f6e2f3b5929fbd322b0f0ca392eb02efd9f8.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/27c2f6e2f3b5929fbd322b0f0ca392eb02efd9f8.tar.gz",
         ],
     )
 
@@ -147,11 +149,11 @@
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
         patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
-        sha256 = "8b3e1c0494af6b616ef3f2a107e093be1ea57c6a34f277edb2bdb1dbf3e3870a",
-        strip_prefix = "eigen-eigen-4fe5a1014743",
+        sha256 = "74845ea27e19a1bcf63f3f271de62e06798f23e0467bb9d45b83a94918941b23",
+        strip_prefix = "eigen-eigen-20cbc6576426",
         urls = [
-            "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/4fe5a1014743.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/4fe5a1014743.tar.gz",
+            "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/20cbc6576426.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/20cbc6576426.tar.gz",
         ],
     )
 
@@ -169,11 +171,11 @@
     tf_http_archive(
         name = "libxsmm_archive",
         build_file = clean_dep("//third_party:libxsmm.BUILD"),
-        sha256 = "cd8532021352b4a0290d209f7f9bfd7c2411e08286a893af3577a43457287bfa",
-        strip_prefix = "libxsmm-1.9",
+        sha256 = "5fc1972471cd8e2b8b64ea017590193739fc88d9818e3d086621e5c08e86ea35",
+        strip_prefix = "libxsmm-1.11",
         urls = [
-            "http://mirror.tensorflow.org/github.com/hfp/libxsmm/archive/1.9.tar.gz",
-            "https://github.com/hfp/libxsmm/archive/1.9.tar.gz",
+            "http://mirror.tensorflow.org/github.com/hfp/libxsmm/archive/1.11.tar.gz",
+            "https://github.com/hfp/libxsmm/archive/1.11.tar.gz",
         ],
     )
 
@@ -190,15 +192,15 @@
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "f5600fdf3efd28e3142a60c20574e349511104fc6f658faf7974f6ae2def245a",
-        strip_prefix = "google-cloud-cpp-0.8.1",
+        sha256 = "a072103546cfa041ad8bfc599fe5a20c58e005a1a0ee18e94b2554dc3d485604",
+        strip_prefix = "google-cloud-cpp-0.9.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "http://mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.8.1.tar.gz",
-            "https://github.com/googleapis/google-cloud-cpp/archive/v0.8.1.tar.gz",
+            "http://mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.9.0.tar.gz",
+            "https://github.com/googleapis/google-cloud-cpp/archive/v0.9.0.tar.gz",
         ],
     )
 
@@ -216,11 +218,11 @@
 
     tf_http_archive(
         name = "gemmlowp",
-        sha256 = "dcf6e2aed522d74ac76b54038c19f0138565f4778a8821ab6679738755ebf6c2",
-        strip_prefix = "gemmlowp-dec2b7dd5f6f0043070af4587d2a9dc156f4ebab",
+        sha256 = "6678b484d929f2d0d3229d8ac4e3b815a950c86bb9f17851471d143f6d4f7834",
+        strip_prefix = "gemmlowp-12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3",
         urls = [
-            "http://mirror.tensorflow.org/github.com/google/gemmlowp/archive/dec2b7dd5f6f0043070af4587d2a9dc156f4ebab.zip",
-            "https://github.com/google/gemmlowp/archive/dec2b7dd5f6f0043070af4587d2a9dc156f4ebab.zip",
+            "http://mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip",
+            "https://github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip",
         ],
     )
 
@@ -712,6 +714,17 @@
     )
 
     tf_http_archive(
+        name = "rocprim_archive",
+        build_file = clean_dep("//third_party:rocprim.BUILD"),
+        sha256 = "12adf5bf3641d73c92915f102b17951f978704551fdcb9ed7f6311ed299b1d80",
+        strip_prefix = "rocPRIM-eff7d0687baf57db2507a31663a3dea72eed9093",
+        urls = [
+            "https://mirror.bazel.build/github.com/ROCmSoftwarePlatform/rocPRIM/archive/eff7d0687baf57db2507a31663a3dea72eed9093.tar.gz",
+            "https://github.com/ROCmSoftwarePlatform/rocPRIM/archive/eff7d0687baf57db2507a31663a3dea72eed9093.tar.gz",
+        ],
+    )
+
+    tf_http_archive(
         name = "cython",
         build_file = clean_dep("//third_party:cython.BUILD"),
         delete = ["BUILD.bazel"],
@@ -821,11 +834,11 @@
     tf_http_archive(
         name = "tflite_ovic_testdata",
         build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"),
-        sha256 = "21288dccc517acee47fa9648d4d3da28bf0fef5381911ed7b4d2ee36366ffa20",
+        sha256 = "033c941b7829b05ca55a124a26a6a0581b1ececc154a2153cafcfdb54f80dca2",
         strip_prefix = "ovic",
         urls = [
-            "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip",
-            "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip",
+            "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip",
+            "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip",
         ],
     )
 
diff --git a/third_party/FP16/BUILD.bazel b/third_party/FP16/BUILD.bazel
index b2bb250..e1018be 100644
--- a/third_party/FP16/BUILD.bazel
+++ b/third_party/FP16/BUILD.bazel
@@ -9,7 +9,7 @@
 
 cc_library(
     name = "FP16",
-    hdrs = ["include/fp16.h"],
+    hdrs = glob(["include/**/*.h"]),
     includes = ["include"],
     strip_include_prefix = "include",
 )
diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel
index e59e80f..26ff228 100644
--- a/third_party/aws/BUILD.bazel
+++ b/third_party/aws/BUILD.bazel
@@ -12,6 +12,9 @@
 cc_library(
     name = "aws",
     srcs = select({
+        "@org_tensorflow//tensorflow:linux_aarch64": glob([
+            "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
+        ]),
         "@org_tensorflow//tensorflow:linux_x86_64": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
diff --git a/third_party/eigen3/gpu_packet_math.patch b/third_party/eigen3/gpu_packet_math.patch
index 3d58caf..b8afb1b 100644
--- a/third_party/eigen3/gpu_packet_math.patch
+++ b/third_party/eigen3/gpu_packet_math.patch
@@ -4,7 +4,7 @@
    return make_double2(from, from);
  }
  
-+#if defined(EIGEN_CUDA_ARCH)
++#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  namespace {
  
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
@@ -12,7 +12,7 @@
  pcmp_eq<double2>(const double2& a, const double2& b) {
    return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
  }
-+#endif  // EIGEN_CUDA_ARCH
++#endif  // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  
  template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
    return make_float4(a, a+1, a+2, a+3);
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 8df6782..2b16715 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -2,8 +2,8 @@
 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
 #ifdef _MSC_VER
 
-#include <immintrin.h>
 #include <emmintrin.h>
+#include <immintrin.h>
 #include <smmintrin.h>
 
 #endif
@@ -178,37 +178,73 @@
 struct unpacket_traits<Packet32q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 32, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 32,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet16q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 16, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet16q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 16, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet8q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 8, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
-  enum { size = 32, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 32,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet8q32i> {
   typedef QInt32 type;
   typedef Packet4q32i half;
-  enum { size = 8, alignment = Aligned32, vectorizable = true };
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 // Unaligned load
@@ -232,7 +268,7 @@
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
       reinterpret_cast<const __m128i*>(from));
@@ -283,8 +319,8 @@
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
-      reinterpret_cast<__m128i*>(to), from.val);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
+                                               from.val);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
@@ -298,8 +334,8 @@
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
-      reinterpret_cast<__m128i*>(to), from.val);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
+                                               from.val);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 84750c1..6c77aa7 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -127,25 +127,45 @@
 struct unpacket_traits<Packet64q8i> {
   typedef QInt8 type;
   typedef Packet32q8i half;
-  enum { size = 64, alignment = Aligned64 };
+  enum {
+    size = 64,
+    alignment = Aligned64,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet32q16i> {
   typedef QInt16 type;
   typedef Packet16q16i half;
-  enum { size = 32, alignment = Aligned64 };
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet64q8u> {
   typedef QUInt8 type;
   typedef Packet32q8u half;
-  enum { size = 64, alignment = Aligned64 };
+  enum {
+    size = 64,
+    alignment = Aligned64,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 template <>
 struct unpacket_traits<Packet16q32i> {
   typedef QInt32 type;
   typedef Packet8q32i half;
-  enum { size = 16, alignment = Aligned64 };
+  enum {
+    size = 16,
+    alignment = Aligned64,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 // Unaligned load
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index 3e404d5..44b9aaf 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -28,13 +28,11 @@
 config_setting(
     name = "darwin",
     values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
@@ -44,20 +42,14 @@
         %{cuda_headers}
     ],
     includes = [
-        ".",
+        ".",  # required to include cuda/cuda/cuda_config.h as cuda/cuda_config.h
         "cuda/include",
-        "cuda/include/crt",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart_static",
     srcs = ["cuda/lib/%{cudart_static_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = select({
         ":freebsd": [],
         "//conditions:default": ["-ldl"],
@@ -65,104 +57,63 @@
         "-lpthread",
         %{cudart_static_linkopt}
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda_driver",
     srcs = ["cuda/lib/%{cuda_driver_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart",
     srcs = ["cuda/lib/%{cudart_lib}"],
     data = ["cuda/lib/%{cudart_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cublas",
     srcs = ["cuda/lib/%{cublas_lib}"],
     data = ["cuda/lib/%{cublas_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cusolver",
     srcs = ["cuda/lib/%{cusolver_lib}"],
     data = ["cuda/lib/%{cusolver_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = ["-lgomp"],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn",
     srcs = ["cuda/lib/%{cudnn_lib}"],
     data = ["cuda/lib/%{cudnn_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn_header",
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/include"],
 )
 
 cc_library(
     name = "cufft",
     srcs = ["cuda/lib/%{cufft_lib}"],
     data = ["cuda/lib/%{cufft_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "curand",
     srcs = ["cuda/lib/%{curand_lib}"],
     data = ["cuda/lib/%{curand_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda",
-    visibility = ["//visibility:public"],
     deps = [
         ":cublas",
         ":cuda_headers",
@@ -179,40 +130,25 @@
         "cuda/cuda_config.h",
         ":cuda-extras",
     ],
-    includes = [
-        ".",
-        "cuda/extras/CUPTI/include/",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/extras/CUPTI/include/"],
 )
 
 cc_library(
     name = "cupti_dsos",
     data = ["cuda/lib/%{cupti_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cusparse",
     srcs = ["cuda/lib/%{cusparse_lib}"],
     data = ["cuda/lib/%{cusparse_lib}"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = ["-lgomp"],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "libdevice_root",
     data = [":cuda-nvvm"],
-    visibility = ["//visibility:public"],
 )
 
 %{copy_rules}
diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl
index a8dc722..fefd378 100644
--- a/third_party/gpus/cuda/BUILD.windows.tpl
+++ b/third_party/gpus/cuda/BUILD.windows.tpl
@@ -28,13 +28,11 @@
 config_setting(
     name = "darwin",
     values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
@@ -44,11 +42,9 @@
         %{cuda_headers}
     ],
     includes = [
-        ".",
+        ".",  # required to include cuda/cuda/cuda_config.h as cuda/cuda_config.h
         "cuda/include",
-        "cuda/include/crt",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
@@ -60,70 +56,57 @@
     # TODO(pcloudy): Remove this rule after b/111278841 is resolved.
     interface_library = "cuda/lib/%{cudart_static_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cuda_driver",
     interface_library = "cuda/lib/%{cuda_driver_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cudart",
     interface_library = "cuda/lib/%{cudart_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cublas",
     interface_library = "cuda/lib/%{cublas_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cusolver",
     interface_library = "cuda/lib/%{cusolver_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cudnn",
     interface_library = "cuda/lib/%{cudnn_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn_header",
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/include"],
 )
 
 cc_import(
     name = "cufft",
     interface_library = "cuda/lib/%{cufft_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "curand",
     interface_library = "cuda/lib/%{curand_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda",
-    visibility = ["//visibility:public"],
     deps = [
         ":cublas",
         ":cuda_headers",
@@ -140,32 +123,24 @@
         "cuda/cuda_config.h",
         ":cuda-extras",
     ],
-    includes = [
-        ".",
-        "cuda/",
-        "cuda/extras/CUPTI/include/",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/extras/CUPTI/include/"],
 )
 
 cc_import(
     name = "cupti_dsos",
     interface_library = "cuda/lib/%{cupti_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_import(
     name = "cusparse",
     interface_library = "cuda/lib/%{cusparse_lib}",
     system_provided = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "libdevice_root",
     data = [":cuda-nvvm"],
-    visibility = ["//visibility:public"],
 )
 
 %{copy_rules}
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index c66eac0..b2e3f66 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -335,10 +335,8 @@
     return "\n".join(inc_entries)
 
 def enable_cuda(repository_ctx):
-    if "TF_NEED_CUDA" in repository_ctx.os.environ:
-        enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
-        return enable_cuda == "1"
-    return False
+    """Returns True iff TF_NEED_CUDA is "1" (whitespace ignored; unset/other is False)."""
+    return repository_ctx.os.environ.get("TF_NEED_CUDA", "0").strip() == "1"
 
 def matches_version(environ_version, detected_version):
     """Checks whether the user-specified version matches the detected version.
@@ -691,13 +689,16 @@
 
     is_windows = _is_windows(repository_ctx)
     cuda_version = config["cuda_version"].split(".")
-    cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_version[0], cuda_version[1])
+    cuda_major = cuda_version[0]
+    cuda_minor = cuda_version[1]
+
+    cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor)
     cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"]
 
     # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc.
     # It changed from 'x.y' to just 'x' in CUDA 10.1.
-    if (int(cuda_version[0]), int(cuda_version[1])) >= (10, 1):
-        cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_version[0]
+    if (int(cuda_major), int(cuda_minor)) >= (10, 1):
+        cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major
     else:
         cuda_lib_version = cuda_version
 
@@ -982,8 +983,9 @@
             out_dir = "cuda/extras/CUPTI/include",
         ),
     ]
+    included_files = _read_dir(repository_ctx, cuda_include_path)
 
-    if cublas_include_path != cuda_include_path:
+    if not any([file.endswith("cublas.h") for file in included_files]):
         copy_rules.append(make_copy_files_rule(
             repository_ctx,
             name = "cublas-include",
@@ -1022,7 +1024,6 @@
     ))
 
     # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
-    included_files = _read_dir(repository_ctx, cuda_include_path)
     if not any([file.endswith("cudnn.h") for file in included_files]):
         copy_rules.append(make_copy_files_rule(
             repository_ctx,
diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py
index faabd82..90f11f1 100644
--- a/third_party/gpus/find_cuda_config.py
+++ b/third_party/gpus/find_cuda_config.py
@@ -53,6 +53,7 @@
 tf_<library>_library_dir: ...
 """
 
+import io
 import os
 import glob
 import platform
@@ -104,6 +105,10 @@
   """
   if actual_version is None:
     return False
+
+  # Strip spaces from the versions.
+  actual_version = actual_version.strip()
+  required_version = required_version.strip()
   return actual_version.startswith(required_version)
 
 
@@ -115,8 +120,8 @@
 
 def _get_header_version(path, name):
-  """Returns preprocessor defines in C header file."""
-  for line in open(path, "r").readlines():
-    match = re.match("#define %s (\d+)" % name, line)
+  """Returns the digits of the first `#define <name> <int>` in path, or ""."""
+  for line in io.open(path, "r", encoding="utf-8").readlines():
+    match = re.match(r"#define %s\s+(\d+)" % name, line)
     if match:
       return match.group(1)
   return ""
@@ -402,6 +407,20 @@
   return default
 
 
+def _get_legacy_path(env_name, default=[]):
+  """Returns a path specified by a legacy environment variable.
+
+  CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to
+  '/usr/lib/x86_64-linux-gnu' would previously find both library and header
+  paths. Detect those and return '/usr', otherwise forward to _list_from_env().
+  """
+  if env_name in os.environ:
+    match = re.match(r"^((?:/[^/ ]*)+)/lib/\w+-linux-gnu/?$", os.environ[env_name])
+    if match:
+      return [match.group(1)]  # Whole prefix; a repeated group keeps only its last part.
+  return _list_from_env(env_name, list(default))  # Copy: never hand out the caller's list.
+
+
 def _normalize_path(path):
   """Returns normalized path, with forward slashes on Windows."""
   path = os.path.normpath(path)
@@ -423,23 +442,26 @@
     cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths)
     result.update(_find_cuda_config(cuda_paths, cuda_version))
 
+    # Search cuda_paths too (cuBLAS may live there); copy so base_paths is kept.
+    cublas_paths = list(_list_from_env("CUBLAS_INSTALL_PATH", base_paths))
+    cublas_paths += [p for p in cuda_paths if p not in cublas_paths]
     cuda_version = result["cuda_version"]
     cublas_version = os.environ.get("TF_CUBLAS_VERSION", "")
-    result.update(_find_cublas_config(cuda_paths, cublas_version, cuda_version))
+    result.update(
+        _find_cublas_config(cublas_paths, cublas_version, cuda_version))
 
   if "cudnn" in libraries:
-    cudnn_paths = _list_from_env("CUDNN_INSTALL_PATH", base_paths)
+    cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths)
     cudnn_version = os.environ.get("TF_CUDNN_VERSION", "")
     result.update(_find_cudnn_config(cudnn_paths, cudnn_version))
 
   if "nccl" in libraries:
-    nccl_paths = _list_from_env("NCCL_INSTALL_PATH",
-                                base_paths) + _list_from_env("NCCL_HDR_PATH")
+    nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths)
     nccl_version = os.environ.get("TF_NCCL_VERSION", "")
     result.update(_find_nccl_config(nccl_paths, nccl_version))
 
   if "tensorrt" in libraries:
-    tensorrt_paths = _list_from_env("TENSORRT_INSTALL_PATH", base_paths)
+    tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths)
     tensorrt_version = os.environ.get("TF_TENSORRT_VERSION", "")
     result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version))
 
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 63b8cd4..758cce8 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -194,6 +194,10 @@
     inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/8.0.0/include/")
     inc_dirs.append("/opt/rocm/hcc/lib/clang/8.0.0/include")
 
+    # Support hcc based off clang 9.0.0, included in ROCm2.2
+    inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/9.0.0/include/")
+    inc_dirs.append("/opt/rocm/hcc/lib/clang/9.0.0/include")
+
     inc_entries = []
     for inc_dir in inc_dirs:
         inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index cfb2599..c0c03fe 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -91,7 +91,12 @@
     else:
         # Create target for locally installed NCCL.
         config = find_cuda_config(repository_ctx, ["nccl"])
-        repository_ctx.template("BUILD", _label("system.BUILD.tpl"), config)
+        config_wrap = {
+            "%{nccl_version}": config["nccl_version"],
+            "%{nccl_header_dir}": config["nccl_include_dir"],
+            "%{nccl_library_dir}": config["nccl_library_dir"],
+        }
+        repository_ctx.template("BUILD", _label("system.BUILD.tpl"), config_wrap)
 
 nccl_configure = repository_rule(
     implementation = _nccl_configure_impl,
diff --git a/third_party/rocprim.BUILD b/third_party/rocprim.BUILD
new file mode 100644
index 0000000..bd9d497
--- /dev/null
+++ b/third_party/rocprim.BUILD
@@ -0,0 +1,58 @@
+# Description: rocPRIM library which is a set of primitives for GPU programming on AMD ROCm stack.
+
+licenses(["notice"])  # BSD
+
+exports_files(["LICENSE.txt"])
+
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm", "rocm_default_copts")
+
+filegroup(
+    name = "rocprim_headers",
+    srcs = glob([
+        "hipcub/include/**",
+        "rocprim/include/**",
+    ]),
+)
+
+cc_library(
+    name = "rocprim",
+    srcs = [
+        "hipcub_version.hpp",
+        "rocprim_version.hpp",
+    ],
+    hdrs = if_rocm([":rocprim_headers"]),
+    includes = [
+        ".",
+        "hipcub/include",
+        "rocprim/include",
+        "rocprim/include/rocprim",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_rocm//rocm:rocm_headers",
+    ],
+)
+
+genrule(
+    name = "rocprim_version_hpp",
+    srcs = ["rocprim/include/rocprim/rocprim_version.hpp.in"],
+    outs = ["rocprim_version.hpp"],
+    cmd = ("sed " +
+           "-e 's/@rocprim_VERSION_MAJOR@/1/g' " +
+           "-e 's/@rocprim_VERSION_MINOR@/0/g' " +
+           "-e 's/@rocprim_VERSION_PATCH@/0/g' " +
+           "$< >$@"),
+    message = "Creating rocPRIM version header...",
+)
+
+genrule(
+    name = "hipcub_version_hpp",
+    srcs = ["hipcub/include/hipcub/hipcub_version.hpp.in"],
+    outs = ["hipcub_version.hpp"],
+    cmd = ("sed " +
+           "-e 's/@rocprim_VERSION_MAJOR@/0/g' " +
+           "-e 's/@rocprim_VERSION_MINOR@/3/g' " +
+           "-e 's/@rocprim_VERSION_PATCH@/0/g' " +
+           "$< >$@"),
+    message = "Creating hipcub version header...",
+)
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 004cc0e..3c5550a 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -18,6 +18,7 @@
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
 _TF_TENSORRT_CONFIG_REPO = "TF_TENSORRT_CONFIG_REPO"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
+_TF_NEED_TENSORRT = "TF_NEED_TENSORRT"
 
 _TF_TENSORRT_LIBS = ["nvinfer", "nvinfer_plugin"]
 _TF_TENSORRT_HEADERS = ["NvInfer.h", "NvUtils.h", "NvInferPlugin.h"]
@@ -36,13 +37,16 @@
 def _create_dummy_repository(repository_ctx):
     """Create a dummy TensorRT repository."""
     _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_false"})
-
     _tpl(repository_ctx, "BUILD", {
-        "%{tensorrt_genrules}": "",
-        "%{tensorrt_headers}": "[]",
-        "%{tensorrt_libs}": "[]",
+        "%{copy_rules}": "",
+        "\":tensorrt_include\"": "",
+        "\":tensorrt_lib\"": "",
     })
 
+def enable_tensorrt(repository_ctx):
+    """Returns True iff TF_NEED_TENSORRT is "1" (whitespace ignored; unset/other is False)."""
+    return repository_ctx.os.environ.get(_TF_NEED_TENSORRT, "0").strip() == "1"
+
 def _tensorrt_configure_impl(repository_ctx):
     """Implementation of the tensorrt_configure repository rule."""
     if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
@@ -56,7 +60,7 @@
         )
         return
 
-    if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+    if not enable_tensorrt(repository_ctx):
         _create_dummy_repository(repository_ctx)
         return
 
@@ -80,7 +84,7 @@
             repository_ctx,
             name = "tensorrt_include",
             srcs = [include_dir + header for header in headers],
-            outs = ["tensorrt/incude/" + header for header in headers],
+            outs = ["tensorrt/include/" + header for header in headers],
         ),
     ]
 
@@ -90,7 +94,6 @@
     # Set up BUILD file.
     _tpl(repository_ctx, "BUILD", {
         "%{copy_rules}": "\n".join(copy_rules),
-        "%{tensorrt_libs}": str(libraries),
     })
 
 tensorrt_configure = repository_rule(
@@ -99,6 +102,7 @@
         _TENSORRT_INSTALL_PATH,
         _TF_TENSORRT_VERSION,
         _TF_TENSORRT_CONFIG_REPO,
+        _TF_NEED_TENSORRT,
         "TF_CUDA_PATHS",
     ],
 )
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 98561b9..f33b9a5 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -39,6 +39,25 @@
         """ % container_digests["ubuntu16.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu-centos6.
+platform(
+    name = "rbe_centos6",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/tensorflow-testing/nosla-centos6@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["centos6"],
+)
+
 # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
diff --git a/third_party/toolchains/preconfig/centos6/gcc7/BUILD b/third_party/toolchains/preconfig/centos6/gcc7/BUILD
new file mode 100755
index 0000000..5d97f20
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/gcc7/BUILD
@@ -0,0 +1,121 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This becomes the BUILD file for @local_config_cc// under non-FreeBSD unixes.
+
+package(default_visibility = ["//visibility:public"])
+
+load(":cc_toolchain_config.bzl", "cc_toolchain_config")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "malloc",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "cc_wrapper",
+    srcs = ["cc_wrapper.sh"],
+)
+
+filegroup(
+    name = "compiler_deps",
+    srcs = glob(["extra_tools/**"]) + [":empty"],
+)
+
+# This is the entry point for --crosstool_top.  Toolchains are found
+# by lopping off the name of --crosstool_top and searching for
+# the "${CPU}" entry in the toolchains attribute.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "k8|gcc": ":cc-compiler-k8",
+        "k8": ":cc-compiler-k8",
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-k8",
+    all_files = ":compiler_deps",
+    ar_files = ":empty",
+    as_files = ":empty",
+    compiler_files = ":compiler_deps",
+    dwp_files = ":empty",
+    linker_files = ":compiler_deps",
+    objcopy_files = ":empty",
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_config = ":linux_gnu_x86",
+    toolchain_identifier = "linux_gnu_x86",
+)
+
+cc_toolchain_config(
+    name = "linux_gnu_x86",
+    compiler = "gcc",
+    cpu = "k8",
+)
+
+toolchain(
+    name = "cc-toolchain-k8",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    toolchain = ":cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    ar_files = ":empty",
+    as_files = ":empty",
+    compiler_files = ":empty",
+    dwp_files = ":empty",
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_config = ":stub_armeabi-v7a",
+    toolchain_identifier = "stub_armeabi-v7a",
+)
+
+cc_toolchain_config(
+    name = "stub_armeabi-v7a",
+    compiler = "compiler",
+    cpu = "armeabi-v7a",
+)
+
+toolchain(
+    name = "cc-toolchain-armeabi-v7a",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:arm",
+        "@bazel_tools//platforms:android",
+    ],
+    toolchain = ":cc-compiler-armeabi-v7a",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE b/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE
new file mode 100644
index 0000000..bc05b4c
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cc_autoconf rule
+workspace(name = "local_config_cc")
diff --git a/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl b/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl
new file mode 100755
index 0000000..1829574
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl
@@ -0,0 +1,1734 @@
+# Copyright 2019 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A Starlark cc_toolchain configuration rule"""
+
+load(
+    "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
+    "action_config",
+    "artifact_name_pattern",
+    "env_entry",
+    "env_set",
+    "feature",
+    "feature_set",
+    "flag_group",
+    "flag_set",
+    "make_variable",  # @unused
+    "tool",
+    "tool_path",
+    "variable_with_value",
+    "with_feature_set",
+)
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
+
+all_compile_actions = [
+    ACTION_NAMES.c_compile,
+    ACTION_NAMES.cpp_compile,
+    ACTION_NAMES.linkstamp_compile,
+    ACTION_NAMES.assemble,
+    ACTION_NAMES.preprocess_assemble,
+    ACTION_NAMES.cpp_header_parsing,
+    ACTION_NAMES.cpp_module_compile,
+    ACTION_NAMES.cpp_module_codegen,
+    ACTION_NAMES.clif_match,
+    ACTION_NAMES.lto_backend,
+]
+
+all_cpp_compile_actions = [
+    ACTION_NAMES.cpp_compile,
+    ACTION_NAMES.linkstamp_compile,
+    ACTION_NAMES.cpp_header_parsing,
+    ACTION_NAMES.cpp_module_compile,
+    ACTION_NAMES.cpp_module_codegen,
+    ACTION_NAMES.clif_match,
+]
+
+preprocessor_compile_actions = [
+    ACTION_NAMES.c_compile,
+    ACTION_NAMES.cpp_compile,
+    ACTION_NAMES.linkstamp_compile,
+    ACTION_NAMES.preprocess_assemble,
+    ACTION_NAMES.cpp_header_parsing,
+    ACTION_NAMES.cpp_module_compile,
+    ACTION_NAMES.clif_match,
+]
+
+codegen_compile_actions = [
+    ACTION_NAMES.c_compile,
+    ACTION_NAMES.cpp_compile,
+    ACTION_NAMES.linkstamp_compile,
+    ACTION_NAMES.assemble,
+    ACTION_NAMES.preprocess_assemble,
+    ACTION_NAMES.cpp_module_codegen,
+    ACTION_NAMES.lto_backend,
+]
+
+all_link_actions = [
+    ACTION_NAMES.cpp_link_executable,
+    ACTION_NAMES.cpp_link_dynamic_library,
+    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+]
+
+def _windows_msvc_impl(ctx):
+    toolchain_identifier = "msvc_x64"
+    host_system_name = "local"
+    target_system_name = "local"
+    target_cpu = "x64_windows"
+    target_libc = "msvcrt"
+    compiler = "msvc-cl"
+    abi_version = "local"
+    abi_libc_version = "local"
+    cc_target_os = None
+    builtin_sysroot = None
+
+    cxx_builtin_include_directories = [
+        "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include",
+        "/usr/local/include",
+        "/opt/rh/devtoolset-7/root/usr/include",
+        "/usr/include",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward",
+    ]
+
+    cpp_link_nodeps_dynamic_library_action = action_config(
+        action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+        implies = [
+            "nologo",
+            "shared_flag",
+            "linkstamps",
+            "output_execpath_flags",
+            "input_param_flags",
+            "user_link_flags",
+            "default_link_flags",
+            "linker_subsystem_flag",
+            "linker_param_file",
+            "msvc_env",
+            "no_stripping",
+            "has_configured_linker_path",
+            "def_file",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    cpp_link_static_library_action = action_config(
+        action_name = ACTION_NAMES.cpp_link_static_library,
+        implies = [
+            "nologo",
+            "archiver_flags",
+            "input_param_flags",
+            "linker_param_file",
+            "msvc_env",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    assemble_action = action_config(
+        action_name = ACTION_NAMES.assemble,
+        implies = [
+            "compiler_input_flags",
+            "compiler_output_flags",
+            "nologo",
+            "msvc_env",
+            "sysroot",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    preprocess_assemble_action = action_config(
+        action_name = ACTION_NAMES.preprocess_assemble,
+        implies = [
+            "compiler_input_flags",
+            "compiler_output_flags",
+            "nologo",
+            "msvc_env",
+            "sysroot",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    c_compile_action = action_config(
+        action_name = ACTION_NAMES.c_compile,
+        implies = [
+            "compiler_input_flags",
+            "compiler_output_flags",
+            "default_compile_flags",
+            "nologo",
+            "msvc_env",
+            "parse_showincludes",
+            "user_compile_flags",
+            "sysroot",
+            "unfiltered_compile_flags",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    cpp_compile_action = action_config(
+        action_name = ACTION_NAMES.cpp_compile,
+        implies = [
+            "compiler_input_flags",
+            "compiler_output_flags",
+            "default_compile_flags",
+            "nologo",
+            "msvc_env",
+            "parse_showincludes",
+            "user_compile_flags",
+            "sysroot",
+            "unfiltered_compile_flags",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    cpp_link_executable_action = action_config(
+        action_name = ACTION_NAMES.cpp_link_executable,
+        implies = [
+            "nologo",
+            "linkstamps",
+            "output_execpath_flags",
+            "input_param_flags",
+            "user_link_flags",
+            "default_link_flags",
+            "linker_subsystem_flag",
+            "linker_param_file",
+            "msvc_env",
+            "no_stripping",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    cpp_link_dynamic_library_action = action_config(
+        action_name = ACTION_NAMES.cpp_link_dynamic_library,
+        implies = [
+            "nologo",
+            "shared_flag",
+            "linkstamps",
+            "output_execpath_flags",
+            "input_param_flags",
+            "user_link_flags",
+            "default_link_flags",
+            "linker_subsystem_flag",
+            "linker_param_file",
+            "msvc_env",
+            "no_stripping",
+            "has_configured_linker_path",
+            "def_file",
+        ],
+        tools = [tool(path = "")],
+    )
+
+    action_configs = [
+        assemble_action,
+        preprocess_assemble_action,
+        c_compile_action,
+        cpp_compile_action,
+        cpp_link_executable_action,
+        cpp_link_dynamic_library_action,
+        cpp_link_nodeps_dynamic_library_action,
+        cpp_link_static_library_action,
+    ]
+
+    msvc_link_env_feature = feature(
+        name = "msvc_link_env",
+        env_sets = [
+            env_set(
+                actions = all_link_actions +
+                          [ACTION_NAMES.cpp_link_static_library],
+                env_entries = [env_entry(key = "LIB", value = "")],
+            ),
+        ],
+    )
+
+    shared_flag_feature = feature(
+        name = "shared_flag",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                ],
+                flag_groups = [flag_group(flags = ["/DLL"])],
+            ),
+        ],
+    )
+
+    determinism_feature = feature(
+        name = "determinism",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [
+                    flag_group(
+                        flags = [
+                            "/wd4117",
+                            "-D__DATE__=\"redacted\"",
+                            "-D__TIMESTAMP__=\"redacted\"",
+                            "-D__TIME__=\"redacted\"",
+                        ],
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    sysroot_feature = feature(
+        name = "sysroot",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.cpp_link_executable,
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["--sysroot=%{sysroot}"],
+                        iterate_over = "sysroot",
+                        expand_if_available = "sysroot",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    unfiltered_compile_flags_feature = feature(
+        name = "unfiltered_compile_flags",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{unfiltered_compile_flags}"],
+                        iterate_over = "unfiltered_compile_flags",
+                        expand_if_available = "unfiltered_compile_flags",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary")
+
+    input_param_flags_feature = feature(
+        name = "input_param_flags",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["/IMPLIB:%{interface_library_output_path}"],
+                        expand_if_available = "interface_library_output_path",
+                    ),
+                ],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{libopts}"],
+                        iterate_over = "libopts",
+                        expand_if_available = "libopts",
+                    ),
+                ],
+            ),
+            flag_set(
+                actions = all_link_actions +
+                          [ACTION_NAMES.cpp_link_static_library],
+                flag_groups = [
+                    flag_group(
+                        iterate_over = "libraries_to_link",
+                        flag_groups = [
+                            flag_group(
+                                iterate_over = "libraries_to_link.object_files",
+                                flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])],
+                                expand_if_equal = variable_with_value(
+                                    name = "libraries_to_link.type",
+                                    value = "object_file_group",
+                                ),
+                            ),
+                            flag_group(
+                                flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])],
+                                expand_if_equal = variable_with_value(
+                                    name = "libraries_to_link.type",
+                                    value = "object_file",
+                                ),
+                            ),
+                            flag_group(
+                                flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])],
+                                expand_if_equal = variable_with_value(
+                                    name = "libraries_to_link.type",
+                                    value = "interface_library",
+                                ),
+                            ),
+                            flag_group(
+                                flag_groups = [
+                                    flag_group(
+                                        flags = ["%{libraries_to_link.name}"],
+                                        expand_if_false = "libraries_to_link.is_whole_archive",
+                                    ),
+                                    flag_group(
+                                        flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"],
+                                        expand_if_true = "libraries_to_link.is_whole_archive",
+                                    ),
+                                ],
+                                expand_if_equal = variable_with_value(
+                                    name = "libraries_to_link.type",
+                                    value = "static_library",
+                                ),
+                            ),
+                        ],
+                        expand_if_available = "libraries_to_link",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    fastbuild_feature = feature(
+        name = "fastbuild",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/Od", "/Z7"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["", "/INCREMENTAL:NO"],
+                    ),
+                ],
+            ),
+        ],
+        implies = ["generate_pdb_file"],
+    )
+
+    user_compile_flags_feature = feature(
+        name = "user_compile_flags",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{user_compile_flags}"],
+                        iterate_over = "user_compile_flags",
+                        expand_if_available = "user_compile_flags",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    archiver_flags_feature = feature(
+        name = "archiver_flags",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.cpp_link_static_library],
+                flag_groups = [
+                    flag_group(
+                        flags = ["/OUT:%{output_execpath}"],
+                        expand_if_available = "output_execpath",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    default_link_flags_feature = feature(
+        name = "default_link_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/MACHINE:X64"])],
+            ),
+        ],
+    )
+
+    static_link_msvcrt_feature = feature(name = "static_link_msvcrt")
+
+    dynamic_link_msvcrt_debug_feature = feature(
+        name = "dynamic_link_msvcrt_debug",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/MDd"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])],
+            ),
+        ],
+        requires = [feature_set(features = ["dbg"])],
+    )
+
+    dbg_feature = feature(
+        name = "dbg",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/Od", "/Z7"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["", "/INCREMENTAL:NO"],
+                    ),
+                ],
+            ),
+        ],
+        implies = ["generate_pdb_file"],
+    )
+
+    opt_feature = feature(
+        name = "opt",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/O2"])],
+            ),
+        ],
+        implies = ["frame_pointer"],
+    )
+
+    supports_interface_shared_libraries_feature = feature(
+        name = "supports_interface_shared_libraries",
+        enabled = True,
+    )
+
+    user_link_flags_feature = feature(
+        name = "user_link_flags",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{user_link_flags}"],
+                        iterate_over = "user_link_flags",
+                        expand_if_available = "user_link_flags",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    default_compile_flags_feature = feature(
+        name = "default_compile_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = [
+                            "/DCOMPILER_MSVC",
+                            "/DNOMINMAX",
+                            "/D_WIN32_WINNT=0x0601",
+                            "/D_CRT_SECURE_NO_DEPRECATE",
+                            "/D_CRT_SECURE_NO_WARNINGS",
+                            "/bigobj",
+                            "/Zm500",
+                            "/EHsc",
+                            "/wd4351",
+                            "/wd4291",
+                            "/wd4250",
+                            "/wd4996",
+                        ],
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    msvc_compile_env_feature = feature(
+        name = "msvc_compile_env",
+        env_sets = [
+            env_set(
+                actions = [
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                ],
+                env_entries = [env_entry(key = "INCLUDE", value = "")],
+            ),
+        ],
+    )
+
+    preprocessor_defines_feature = feature(
+        name = "preprocessor_defines",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["/D%{preprocessor_defines}"],
+                        iterate_over = "preprocessor_defines",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    generate_pdb_file_feature = feature(
+        name = "generate_pdb_file",
+        requires = [
+            feature_set(features = ["dbg"]),
+            feature_set(features = ["fastbuild"]),
+        ],
+    )
+
+    output_execpath_flags_feature = feature(
+        name = "output_execpath_flags",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["/OUT:%{output_execpath}"],
+                        expand_if_available = "output_execpath",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    dynamic_link_msvcrt_no_debug_feature = feature(
+        name = "dynamic_link_msvcrt_no_debug",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/MD"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])],
+            ),
+        ],
+        requires = [
+            feature_set(features = ["fastbuild"]),
+            feature_set(features = ["opt"]),
+        ],
+    )
+
+    disable_assertions_feature = feature(
+        name = "disable_assertions",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/DNDEBUG"])],
+                with_features = [with_feature_set(features = ["opt"])],
+            ),
+        ],
+    )
+
+    has_configured_linker_path_feature = feature(name = "has_configured_linker_path")
+
+    supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True)
+
+    no_stripping_feature = feature(name = "no_stripping")
+
+    linker_param_file_feature = feature(
+        name = "linker_param_file",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions +
+                          [ACTION_NAMES.cpp_link_static_library],
+                flag_groups = [
+                    flag_group(
+                        flags = ["@%{linker_param_file}"],
+                        expand_if_available = "linker_param_file",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    ignore_noisy_warnings_feature = feature(
+        name = "ignore_noisy_warnings",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.cpp_link_static_library],
+                flag_groups = [flag_group(flags = ["/ignore:4221"])],
+            ),
+        ],
+    )
+
+    no_legacy_features_feature = feature(name = "no_legacy_features")
+
+    parse_showincludes_feature = feature(
+        name = "parse_showincludes",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                ],
+                flag_groups = [flag_group(flags = ["/showIncludes"])],
+            ),
+        ],
+    )
+
+    static_link_msvcrt_no_debug_feature = feature(
+        name = "static_link_msvcrt_no_debug",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/MT"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])],
+            ),
+        ],
+        requires = [
+            feature_set(features = ["fastbuild"]),
+            feature_set(features = ["opt"]),
+        ],
+    )
+
+    treat_warnings_as_errors_feature = feature(
+        name = "treat_warnings_as_errors",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/WX"])],
+            ),
+        ],
+    )
+
+    windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols")
+
+    no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols")
+
+    include_paths_feature = feature(
+        name = "include_paths",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["/I%{quote_include_paths}"],
+                        iterate_over = "quote_include_paths",
+                    ),
+                    flag_group(
+                        flags = ["/I%{include_paths}"],
+                        iterate_over = "include_paths",
+                    ),
+                    flag_group(
+                        flags = ["/I%{system_include_paths}"],
+                        iterate_over = "system_include_paths",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    linkstamps_feature = feature(
+        name = "linkstamps",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{linkstamp_paths}"],
+                        iterate_over = "linkstamp_paths",
+                        expand_if_available = "linkstamp_paths",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    targets_windows_feature = feature(
+        name = "targets_windows",
+        enabled = True,
+        implies = ["copy_dynamic_libraries_to_binary"],
+    )
+
+    linker_subsystem_flag_feature = feature(
+        name = "linker_subsystem_flag",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])],
+            ),
+        ],
+    )
+
+    static_link_msvcrt_debug_feature = feature(
+        name = "static_link_msvcrt_debug",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/MTd"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])],
+            ),
+        ],
+        requires = [feature_set(features = ["dbg"])],
+    )
+
+    frame_pointer_feature = feature(
+        name = "frame_pointer",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/Oy-"])],
+            ),
+        ],
+    )
+
+    compiler_output_flags_feature = feature(
+        name = "compiler_output_flags",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.assemble],
+                flag_groups = [
+                    flag_group(
+                        flag_groups = [
+                            flag_group(
+                                flags = ["/Fo%{output_file}", "/Zi"],
+                                expand_if_available = "output_file",
+                                expand_if_not_available = "output_assembly_file",
+                            ),
+                        ],
+                        expand_if_not_available = "output_preprocess_file",
+                    ),
+                ],
+            ),
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flag_groups = [
+                            flag_group(
+                                flags = ["/Fo%{output_file}"],
+                                expand_if_not_available = "output_preprocess_file",
+                            ),
+                        ],
+                        expand_if_available = "output_file",
+                        expand_if_not_available = "output_assembly_file",
+                    ),
+                    flag_group(
+                        flag_groups = [
+                            flag_group(
+                                flags = ["/Fa%{output_file}"],
+                                expand_if_available = "output_assembly_file",
+                            ),
+                        ],
+                        expand_if_available = "output_file",
+                    ),
+                    flag_group(
+                        flag_groups = [
+                            flag_group(
+                                flags = ["/P", "/Fi%{output_file}"],
+                                expand_if_available = "output_preprocess_file",
+                            ),
+                        ],
+                        expand_if_available = "output_file",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    nologo_feature = feature(
+        name = "nologo",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.cpp_link_executable,
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                    ACTION_NAMES.cpp_link_static_library,
+                ],
+                flag_groups = [flag_group(flags = ["/nologo"])],
+            ),
+        ],
+    )
+
+    smaller_binary_feature = feature(
+        name = "smaller_binary",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [flag_group(flags = ["/Gy", "/Gw"])],
+                with_features = [with_feature_set(features = ["opt"])],
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [flag_group(flags = ["/OPT:ICF", "/OPT:REF"])],
+                with_features = [with_feature_set(features = ["opt"])],
+            ),
+        ],
+    )
+
+    compiler_input_flags_feature = feature(
+        name = "compiler_input_flags",
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["/c", "%{source_file}"],
+                        expand_if_available = "source_file",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    def_file_feature = feature(
+        name = "def_file",
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = [
+                    flag_group(
+                        flags = ["/DEF:%{def_file_path}", "/ignore:4070"],
+                        expand_if_available = "def_file_path",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    msvc_env_feature = feature(
+        name = "msvc_env",
+        env_sets = [
+            env_set(
+                actions = [
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.cpp_link_executable,
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                    ACTION_NAMES.cpp_link_static_library,
+                ],
+                env_entries = [
+                    env_entry(key = "PATH", value = ""),
+                    env_entry(key = "TMP", value = ""),
+                    env_entry(key = "TEMP", value = ""),
+                ],
+            ),
+        ],
+        implies = ["msvc_compile_env", "msvc_link_env"],
+    )
+
+    features = [
+        no_legacy_features_feature,
+        nologo_feature,
+        has_configured_linker_path_feature,
+        no_stripping_feature,
+        targets_windows_feature,
+        copy_dynamic_libraries_to_binary_feature,
+        default_compile_flags_feature,
+        msvc_env_feature,
+        msvc_compile_env_feature,
+        msvc_link_env_feature,
+        include_paths_feature,
+        preprocessor_defines_feature,
+        parse_showincludes_feature,
+        generate_pdb_file_feature,
+        shared_flag_feature,
+        linkstamps_feature,
+        output_execpath_flags_feature,
+        archiver_flags_feature,
+        input_param_flags_feature,
+        linker_subsystem_flag_feature,
+        user_link_flags_feature,
+        default_link_flags_feature,
+        linker_param_file_feature,
+        static_link_msvcrt_feature,
+        static_link_msvcrt_no_debug_feature,
+        dynamic_link_msvcrt_no_debug_feature,
+        static_link_msvcrt_debug_feature,
+        dynamic_link_msvcrt_debug_feature,
+        dbg_feature,
+        fastbuild_feature,
+        opt_feature,
+        frame_pointer_feature,
+        disable_assertions_feature,
+        determinism_feature,
+        treat_warnings_as_errors_feature,
+        smaller_binary_feature,
+        ignore_noisy_warnings_feature,
+        user_compile_flags_feature,
+        sysroot_feature,
+        unfiltered_compile_flags_feature,
+        compiler_output_flags_feature,
+        compiler_input_flags_feature,
+        def_file_feature,
+        windows_export_all_symbols_feature,
+        no_windows_export_all_symbols_feature,
+        supports_dynamic_linker_feature,
+        supports_interface_shared_libraries_feature,
+    ]
+
+    artifact_name_patterns = [
+        artifact_name_pattern(
+            category_name = "object_file",
+            prefix = "",
+            extension = ".obj",
+        ),
+        artifact_name_pattern(
+            category_name = "static_library",
+            prefix = "",
+            extension = ".lib",
+        ),
+        artifact_name_pattern(
+            category_name = "alwayslink_static_library",
+            prefix = "",
+            extension = ".lo.lib",
+        ),
+        artifact_name_pattern(
+            category_name = "executable",
+            prefix = "",
+            extension = ".exe",
+        ),
+        artifact_name_pattern(
+            category_name = "dynamic_library",
+            prefix = "",
+            extension = ".dll",
+        ),
+        artifact_name_pattern(
+            category_name = "interface_library",
+            prefix = "",
+            extension = ".if.lib",
+        ),
+    ]
+
+    make_variables = []
+
+    tool_paths = [
+        tool_path(name = "ar", path = ""),
+        tool_path(name = "ml", path = ""),
+        tool_path(name = "cpp", path = ""),
+        tool_path(name = "gcc", path = ""),
+        tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"),
+        tool_path(name = "ld", path = ""),
+        tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"),
+        tool_path(
+            name = "objcopy",
+            path = "wrapper/bin/msvc_nop.bat",
+        ),
+        tool_path(
+            name = "objdump",
+            path = "wrapper/bin/msvc_nop.bat",
+        ),
+        tool_path(
+            name = "strip",
+            path = "wrapper/bin/msvc_nop.bat",
+        ),
+    ]
+
+    return cc_common.create_cc_toolchain_config_info(
+        ctx = ctx,
+        features = features,
+        action_configs = action_configs,
+        artifact_name_patterns = artifact_name_patterns,
+        cxx_builtin_include_directories = cxx_builtin_include_directories,
+        toolchain_identifier = toolchain_identifier,
+        host_system_name = host_system_name,
+        target_system_name = target_system_name,
+        target_cpu = target_cpu,
+        target_libc = target_libc,
+        compiler = compiler,
+        abi_version = abi_version,
+        abi_libc_version = abi_libc_version,
+        tool_paths = tool_paths,
+        make_variables = make_variables,
+        builtin_sysroot = builtin_sysroot,
+        cc_target_os = None,
+    )
+
+def _windows_msys_mingw_impl(ctx):
+    """Returns the toolchain config for x64 Windows with the MSYS MinGW gcc.
+
+    The default compile and link flag lists are empty here, so the
+    "default_compile_flags" and "default_link_flags" features are declared
+    without contributing any flags. No tool paths and no builtin include
+    directories are registered.
+
+    Args:
+      ctx: The rule context, forwarded to cc_common.
+
+    Returns:
+      The result of cc_common.create_cc_toolchain_config_info().
+    """
+
+    # Default flag lists; both are empty in this configuration.
+    msys_mingw_flags = []
+    msys_mingw_link_flags = []
+
+    # Actions that receive the environment defined by "gcc_env".
+    env_actions = [
+        ACTION_NAMES.c_compile,
+        ACTION_NAMES.cpp_compile,
+        ACTION_NAMES.cpp_module_compile,
+        ACTION_NAMES.cpp_module_codegen,
+        ACTION_NAMES.cpp_header_parsing,
+        ACTION_NAMES.assemble,
+        ACTION_NAMES.preprocess_assemble,
+        ACTION_NAMES.cpp_link_executable,
+        ACTION_NAMES.cpp_link_dynamic_library,
+        ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+        ACTION_NAMES.cpp_link_static_library,
+    ]
+
+    # Actions of the first "default_compile_flags" flag set.
+    compile_actions = [
+        ACTION_NAMES.assemble,
+        ACTION_NAMES.preprocess_assemble,
+        ACTION_NAMES.linkstamp_compile,
+        ACTION_NAMES.c_compile,
+        ACTION_NAMES.cpp_compile,
+        ACTION_NAMES.cpp_header_parsing,
+        ACTION_NAMES.cpp_module_compile,
+        ACTION_NAMES.cpp_module_codegen,
+        ACTION_NAMES.lto_backend,
+        ACTION_NAMES.clif_match,
+    ]
+
+    # Actions of the second flag set, which carries msys_mingw_flags.
+    cxx_compile_actions = [
+        ACTION_NAMES.linkstamp_compile,
+        ACTION_NAMES.cpp_compile,
+        ACTION_NAMES.cpp_header_parsing,
+        ACTION_NAMES.cpp_module_compile,
+        ACTION_NAMES.cpp_module_codegen,
+        ACTION_NAMES.lto_backend,
+        ACTION_NAMES.clif_match,
+    ]
+
+    # Features, in the order they are handed to cc_common.
+    features = [
+        feature(
+            name = "targets_windows",
+            enabled = True,
+            implies = ["copy_dynamic_libraries_to_binary"],
+        ),
+        feature(name = "copy_dynamic_libraries_to_binary"),
+        feature(
+            name = "gcc_env",
+            enabled = True,
+            env_sets = [
+                env_set(
+                    actions = env_actions,
+                    env_entries = [
+                        env_entry(key = "PATH", value = "NOT_USED"),
+                    ],
+                ),
+            ],
+        ),
+        feature(
+            name = "default_compile_flags",
+            enabled = True,
+            flag_sets = [
+                # This set is declared without any flag groups.
+                flag_set(actions = compile_actions),
+                flag_set(
+                    actions = cxx_compile_actions,
+                    # A flag group is emitted only for a non-empty flag list.
+                    flag_groups = [
+                        flag_group(flags = flags)
+                        for flags in [msys_mingw_flags]
+                        if flags
+                    ],
+                ),
+            ],
+        ),
+        feature(
+            name = "default_link_flags",
+            enabled = True,
+            flag_sets = [
+                flag_set(
+                    actions = all_link_actions,
+                    # A flag group is emitted only for a non-empty flag list.
+                    flag_groups = [
+                        flag_group(flags = flags)
+                        for flags in [msys_mingw_link_flags]
+                        if flags
+                    ],
+                ),
+            ],
+        ),
+        feature(name = "supports_dynamic_linker", enabled = True),
+    ]
+
+    return cc_common.create_cc_toolchain_config_info(
+        ctx = ctx,
+        features = features,
+        action_configs = [],
+        # Only the "executable" category gets a name pattern.
+        artifact_name_patterns = [
+            artifact_name_pattern(
+                category_name = "executable",
+                prefix = "",
+                extension = ".exe",
+            ),
+        ],
+        cxx_builtin_include_directories = [],
+        toolchain_identifier = "msys_x64_mingw",
+        host_system_name = "local",
+        target_system_name = "local",
+        target_cpu = "x64_windows",
+        target_libc = "mingw",
+        compiler = "mingw-gcc",
+        abi_version = "local",
+        abi_libc_version = "local",
+        tool_paths = [],
+        make_variables = [],
+        builtin_sysroot = None,
+        cc_target_os = None,
+    )
+
+def _armeabi_impl(ctx):
+    """Returns the placeholder toolchain config for the "armeabi-v7a" CPU.
+
+    The config is identified as "stub_armeabi-v7a": it declares no action
+    configs, include directories, artifact patterns or make variables, and
+    every tool path points at /bin/false.
+
+    Args:
+      ctx: The rule context, forwarded to cc_common.
+
+    Returns:
+      The result of cc_common.create_cc_toolchain_config_info().
+    """
+
+    # One string serves as host/target system, CPU, libc and both ABI fields.
+    abi = "armeabi-v7a"
+
+    # Tool names in the order they are registered.
+    stub_tool_names = [
+        "ar",
+        "compat-ld",
+        "cpp",
+        "dwp",
+        "gcc",
+        "gcov",
+        "ld",
+        "nm",
+        "objcopy",
+        "objdump",
+        "strip",
+    ]
+
+    return cc_common.create_cc_toolchain_config_info(
+        ctx = ctx,
+        features = [
+            feature(name = "supports_dynamic_linker", enabled = True),
+            feature(name = "supports_pic", enabled = True),
+        ],
+        action_configs = [],
+        artifact_name_patterns = [],
+        cxx_builtin_include_directories = [],
+        toolchain_identifier = "stub_armeabi-v7a",
+        host_system_name = abi,
+        target_system_name = abi,
+        target_cpu = abi,
+        target_libc = abi,
+        compiler = "compiler",
+        abi_version = abi,
+        abi_libc_version = abi,
+        tool_paths = [tool_path(name = tool, path = "/bin/false") for tool in stub_tool_names],
+        make_variables = [],
+        builtin_sysroot = None,
+        cc_target_os = None,
+    )
+
+def _impl(ctx):  # Rule implementation: delegates to a platform-specific impl for known cpu/compiler pairs, otherwise builds the Linux gcc (devtoolset-7) config below.
+    if ctx.attr.cpu == "armeabi-v7a":
+        return _armeabi_impl(ctx)
+    elif ctx.attr.cpu == "x64_windows" and ctx.attr.compiler == "msvc-cl":
+        return _windows_msvc_impl(ctx)
+    elif ctx.attr.cpu == "x64_windows" and ctx.attr.compiler == "mingw-gcc":
+        return _windows_msys_mingw_impl(ctx)
+
+    tool_paths = [  # Every tool is taken from the devtoolset-7 tree under /opt/rh.
+        tool_path(name = "ar", path = "/opt/rh/devtoolset-7/root/usr/bin/ar"),
+        tool_path(name = "ld", path = "/opt/rh/devtoolset-7/root/usr/bin/ld"),
+        tool_path(name = "cpp", path = "/opt/rh/devtoolset-7/root/usr/bin/cpp"),
+        tool_path(name = "gcc", path = "/opt/rh/devtoolset-7/root/usr/bin/gcc"),
+        tool_path(name = "dwp", path = "/opt/rh/devtoolset-7/root/usr/bin/dwp"),
+        tool_path(name = "gcov", path = "/opt/rh/devtoolset-7/root/usr/bin/gcov"),
+        tool_path(name = "nm", path = "/opt/rh/devtoolset-7/root/usr/bin/nm"),
+        tool_path(name = "objcopy", path = "/opt/rh/devtoolset-7/root/usr/bin/objcopy"),
+        tool_path(name = "objdump", path = "/opt/rh/devtoolset-7/root/usr/bin/objdump"),
+        tool_path(name = "strip", path = "/opt/rh/devtoolset-7/root/usr/bin/strip"),
+    ]
+
+    cxx_builtin_include_directories = [  # Absolute header dirs passed through as the toolchain's built-in include directories.
+        "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include",
+        "/usr/local/include",
+        "/opt/rh/devtoolset-7/root/usr/include",
+        "/usr/include",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux",
+        "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward",
+    ]
+
+    action_configs = []  # No action_configs are defined in this branch.
+
+    compile_flags = [  # Unconditional flags for the compile actions listed in default_compile_flags.
+        "-U_FORTIFY_SOURCE",
+        "-fstack-protector",
+        "-Wall",
+        "-Wunused-but-set-parameter",
+        "-Wno-free-nonheap-object",
+        "-fno-omit-frame-pointer",
+    ]
+
+    dbg_compile_flags = [  # Added only when the "dbg" feature is active.
+        "-g",
+    ]
+
+    opt_compile_flags = [  # Added only when the "opt" feature is active.
+        "-g0",
+        "-O2",
+        "-D_FORTIFY_SOURCE=1",
+        "-DNDEBUG",
+        "-ffunction-sections",
+        "-fdata-sections",
+    ]
+
+    cxx_flags = [  # Attached to a flag_set that omits c_compile, assemble and preprocess_assemble.
+        "-std=c++0x",
+    ]
+
+    link_flags = [  # Applied to every action in all_link_actions.
+        "-fuse-ld=gold",
+        "-Wl,-no-as-needed",
+        "-Wl,-z,relro,-z,now",
+        "-B/opt/rh/devtoolset-7/root/usr/bin",
+        "-pass-exit-codes",
+        "-lstdc++",
+        "-lm",
+    ]
+
+    opt_link_flags = [  # Extra link flags added only when the "opt" feature is active.
+        "-Wl,--gc-sections",
+    ]
+
+    unfiltered_compile_flags = [  # Redefines __DATE__, __TIMESTAMP__ and __TIME__ to the fixed string "redacted".
+        "-fno-canonical-system-headers",
+        "-Wno-builtin-macro-redefined",
+        "-D__DATE__=\"redacted\"",
+        "-D__TIMESTAMP__=\"redacted\"",
+        "-D__TIME__=\"redacted\"",
+    ]
+
+    targets_windows_feature = feature(  # NOTE(review): only referenced from windows_features, which is never added to `features` below.
+        name = "targets_windows",
+        implies = ["copy_dynamic_libraries_to_binary"],
+        enabled = True,
+    )
+
+    copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary")
+
+    gcc_env_feature = feature(
+        name = "gcc_env",
+        enabled = True,
+        env_sets = [
+            env_set(
+                actions = [
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.cpp_link_executable,
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                    ACTION_NAMES.cpp_link_static_library,
+                ],
+                env_entries = [
+                    env_entry(key = "PATH", value = "NOT_USED"),
+                ],
+            ),
+        ],
+    )
+
+    windows_features = [  # NOTE(review): assigned but not used anywhere else in this function.
+        targets_windows_feature,
+        copy_dynamic_libraries_to_binary_feature,
+        gcc_env_feature,
+    ]
+
+    coverage_feature = feature(  # Adds --coverage to the listed compile and link actions when enabled.
+        name = "coverage",
+        provides = ["profile"],
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                ],
+                flag_groups = [
+                    flag_group(flags = ["--coverage"]),
+                ],
+            ),
+            flag_set(
+                actions = [
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                    ACTION_NAMES.cpp_link_executable,
+                ],
+                flag_groups = [
+                    flag_group(flags = ["--coverage"]),
+                ],
+            ),
+        ],
+    )
+
+    supports_pic_feature = feature(
+        name = "supports_pic",
+        enabled = True,
+    )
+    supports_start_end_lib_feature = feature(
+        name = "supports_start_end_lib",
+        enabled = True,
+    )
+
+    default_compile_flags_feature = feature(  # Wires compile_flags, dbg_compile_flags, opt_compile_flags and cxx_flags into flag_sets.
+        name = "default_compile_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = ([flag_group(flags = compile_flags)] if compile_flags else []),
+            ),
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = ([flag_group(flags = dbg_compile_flags)] if dbg_compile_flags else []),
+                with_features = [with_feature_set(features = ["dbg"])],
+            ),
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = ([flag_group(flags = opt_compile_flags)] if opt_compile_flags else []),
+                with_features = [with_feature_set(features = ["opt"])],
+            ),
+            flag_set(
+                actions = [
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = ([flag_group(flags = cxx_flags)] if cxx_flags else []),
+            ),
+        ],
+    )
+
+    default_link_flags_feature = feature(  # Wires link_flags (always) and opt_link_flags (with "opt") into all_link_actions.
+        name = "default_link_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = ([flag_group(flags = link_flags)] if link_flags else []),
+            ),
+            flag_set(
+                actions = all_link_actions,
+                flag_groups = ([flag_group(flags = opt_link_flags)] if opt_link_flags else []),
+                with_features = [with_feature_set(features = ["opt"])],
+            ),
+        ],
+    )
+
+    dbg_feature = feature(name = "dbg")  # Flag-less feature; referenced by the with_feature_set(["dbg"]) entries above.
+
+    opt_feature = feature(name = "opt")  # Flag-less feature; referenced by the with_feature_set(["opt"]) entries above.
+
+    sysroot_feature = feature(  # Passes --sysroot only when the "sysroot" build variable is available.
+        name = "sysroot",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                    ACTION_NAMES.cpp_link_executable,
+                    ACTION_NAMES.cpp_link_dynamic_library,
+                    ACTION_NAMES.cpp_link_nodeps_dynamic_library,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["--sysroot=%{sysroot}"],
+                        expand_if_available = "sysroot",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    fdo_optimize_feature = feature(  # Adds -fprofile-use flags to C/C++ compiles, only when fdo_profile_path is available.
+        name = "fdo_optimize",
+        flag_sets = [
+            flag_set(
+                actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile],
+                flag_groups = [
+                    flag_group(
+                        flags = [
+                            "-fprofile-use=%{fdo_profile_path}",
+                            "-fprofile-correction",
+                        ],
+                        expand_if_available = "fdo_profile_path",
+                    ),
+                ],
+            ),
+        ],
+        provides = ["profile"],
+    )
+
+    supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True)
+
+    user_compile_flags_feature = feature(  # Emits each entry of the user_compile_flags build variable as its own flag.
+        name = "user_compile_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = ["%{user_compile_flags}"],
+                        iterate_over = "user_compile_flags",
+                        expand_if_available = "user_compile_flags",
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    unfiltered_compile_flags_feature = feature(  # Wires unfiltered_compile_flags into the compile actions listed here.
+        name = "unfiltered_compile_flags",
+        enabled = True,
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.assemble,
+                    ACTION_NAMES.preprocess_assemble,
+                    ACTION_NAMES.linkstamp_compile,
+                    ACTION_NAMES.c_compile,
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                    ACTION_NAMES.cpp_module_codegen,
+                    ACTION_NAMES.lto_backend,
+                    ACTION_NAMES.clif_match,
+                ],
+                flag_groups = ([flag_group(flags = unfiltered_compile_flags)] if unfiltered_compile_flags else []),
+            ),
+        ],
+    )
+
+    features = [  # Features actually handed to the toolchain config; windows_features is not among them.
+        supports_pic_feature,
+        supports_start_end_lib_feature,
+        coverage_feature,
+        default_compile_flags_feature,
+        default_link_flags_feature,
+        fdo_optimize_feature,
+        supports_dynamic_linker_feature,
+        dbg_feature,
+        opt_feature,
+        user_compile_flags_feature,
+        sysroot_feature,
+        unfiltered_compile_flags_feature,
+    ]
+
+    artifact_name_patterns = [
+    ]
+
+    make_variables = []
+
+    return cc_common.create_cc_toolchain_config_info(
+        ctx = ctx,
+        features = features,
+        action_configs = action_configs,
+        artifact_name_patterns = artifact_name_patterns,
+        cxx_builtin_include_directories = cxx_builtin_include_directories,
+        toolchain_identifier = "linux_gnu_x86",
+        host_system_name = "i686-unknown-linux-gnu",  # NOTE(review): i686 host vs. the x86_64 target_system_name below — confirm this is intended.
+        target_system_name = "x86_64-unknown-linux-gnu",
+        target_cpu = "k8",
+        target_libc = "glibc_2.19",
+        compiler = "gcc",
+        abi_version = "gcc",
+        abi_libc_version = "glibc_2.19",
+        tool_paths = tool_paths,
+        make_variables = make_variables,
+        builtin_sysroot = "",
+        cc_target_os = None,
+    )
+
+cc_toolchain_config = rule(  # Exposes _impl as a rule that provides CcToolchainConfigInfo.
+    attrs = dict(
+        cpu = attr.string(mandatory = True),  # Required; _impl dispatches on this value.
+        compiler = attr.string(),  # Optional compiler name, e.g. "msvc-cl" or "mingw-gcc".
+    ),
+    provides = [CcToolchainConfigInfo],
+    implementation = _impl,
+)
diff --git a/tensorflow/tools/docker/run_jupyter.sh b/third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh
similarity index 64%
copy from tensorflow/tools/docker/run_jupyter.sh
copy to third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh
index 2771aea..5a5465c 100755
--- a/tensorflow/tools/docker/run_jupyter.sh
+++ b/third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh
@@ -1,18 +1,25 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#!/bin/bash
+#
+# Copyright 2015 The Bazel Authors. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+#
+# Ship the environment to the C++ action
+#
+set -eu
+
+# Set up the environment
 
 
-jupyter notebook "$@"
+# Call the C++ compiler
+/opt/rh/devtoolset-7/root/usr/bin/gcc "$@"
diff --git a/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl b/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl
new file mode 100755
index 0000000..45c0285
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl
@@ -0,0 +1,23 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Skylark rule that stubs a toolchain."""
+
+def _dummy_toolchain_impl(ctx):
+    """Returns a single empty ToolchainInfo provider; `ctx` is not consulted."""
+    _ = ctx  # unused argument
+    return [platform_common.ToolchainInfo()]
+
+dummy_toolchain = rule(attrs = dict(), implementation = _dummy_toolchain_impl)  # Attribute-less stub rule.
diff --git a/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc b/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc
new file mode 100755
index 0000000..237c8ce
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc
@@ -0,0 +1 @@
+int main() {}
diff --git a/third_party/toolchains/preconfig/centos6/py/BUILD b/third_party/toolchains/preconfig/centos6/py/BUILD
new file mode 100755
index 0000000..b8de94c
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/py/BUILD
@@ -0,0 +1,174 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+# To build Python C/C++ extensions on Windows, we need to link to the Python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",  # NOTE(review): no :python_import_lib target is defined in this file — confirm this branch is never selected for the CentOS config.
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,  # The runtime shared library is expected to come from the system rather than from this target.
+)
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],  # Outputs of the python_include genrule defined later in this file.
+    includes = ["python_include"],
+    deps = select({  # Only the :windows configuration links against :python_lib.
+        ":windows": [":python_lib"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],  # Outputs of the numpy_include genrule defined later in this file.
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},  # Matches builds whose cpu flag is x64_windows; used by the select() calls above.
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/boolobject.h",
+        "python_include/bufferobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cStringIO.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/cobject.h",
+        "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
+        "python_include/intobject.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
+        "python_include/moduleobject.h",
+        "python_include/node.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig-64.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
+        "python_include/pygetopt.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymactoolbox.h",
+        "python_include/pymath.h",
+        "python_include/pymem.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrtod.h",
+        "python_include/pythonrun.h",
+        "python_include/pythread.h",
+        "python_include/rangeobject.h",
+        "python_include/setobject.h",
+        "python_include/sliceobject.h",
+        "python_include/stringobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/timefuncs.h",
+        "python_include/token.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
+    ],
+    cmd = """
+cp -f "/opt/rh/python27/root/usr/include/python2.7/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/Python.h" "$(@D)/python_include/Python.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ast.h" "$(@D)/python_include/ast.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bufferobject.h" "$(@D)/python_include/bufferobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cStringIO.h" "$(@D)/python_include/cStringIO.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cobject.h" "$(@D)/python_include/cobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/code.h" "$(@D)/python_include/code.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/compile.h" "$(@D)/python_include/compile.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/complexobject.h" 
"$(@D)/python_include/complexobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/eval.h" "$(@D)/python_include/eval.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/import.h" "$(@D)/python_include/import.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/intobject.h" "$(@D)/python_include/intobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/listobject.h" "$(@D)/python_include/listobject.h" && cp -f 
"/opt/rh/python27/root/usr/include/python2.7/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/node.h" "$(@D)/python_include/node.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/object.h" "$(@D)/python_include/object.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pycapsule.h" 
"$(@D)/python_include/pycapsule.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyconfig-64.h" "$(@D)/python_include/pyconfig-64.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymactoolbox.h" "$(@D)/python_include/pymactoolbox.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f 
"/opt/rh/python27/root/usr/include/python2.7/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/stringobject.h" "$(@D)/python_include/stringobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/timefuncs.h" "$(@D)/python_include/timefuncs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/token.h" "$(@D)/python_include/token.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
+    ],
+    cmd = """
+cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && 
cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/centos6/py/WORKSPACE b/third_party/toolchains/preconfig/centos6/py/WORKSPACE
new file mode 100644
index 0000000..1d298fe
--- /dev/null
+++ b/third_party/toolchains/preconfig/centos6/py/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule
+workspace(name = "local_config_python")
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
index 385cbd8..2d8e733 100644
--- a/third_party/toolchains/preconfig/generate/BUILD
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -10,6 +10,22 @@
 )
 
 tensorflow_rbe_config(
+    name = "centos6-py-gcc7",
+    compiler = "gcc",
+    compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin",
+    os = "centos6",
+    python_version = "2",
+)
+
+tensorflow_rbe_config(
+    name = "centos6-py3-gcc7",
+    compiler = "gcc",
+    compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin",
+    os = "centos6",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
     name = "centos6-py3-gcc7-cuda10.0-cudnn7-tensorrt5",
     compiler = "gcc",
     compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin",
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index fb62f5e..9531e95 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,5 +1,7 @@
+"""SHA 256 values for each image."""
 container_digests = {
-    "ubuntu16.04": "sha256:d0d98c53111c3ec071aa81632a2b0d6f210e5c2411c5172e31f99002125ec4de",
+    "ubuntu16.04": "sha256:b90dcf2f35f3354909f4491bdf019c110b4b4d95ef0395ebf178bc5d523a4208",
+    "centos6": "sha256:8402dc2bc0e9baa31a32caf182bf6a4f5f91852d1d5e3079175dfb4d2237cde8",
     "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756",
     "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88",
     "cuda10.0-cudnn7-centos6": "sha256:a874e7a102abb7da5861dd468b68a25f360bb976a39a26c1e123b770e7900322",
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 6d891ce..ec4b7c1 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -53,7 +53,7 @@
     docker_toolchain_autoconfig(
         name = name,
         base = base,
-        bazel_version = "0.23.2",
+        bazel_version = "0.24.1",
         config_repos = config_repos,
         env = env,
         mount_project = "$(mount_project)",
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
index 7adfcb7..85503db 100644
--- a/third_party/toolchains/preconfig/generate/workspace.bzl
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -12,6 +12,13 @@
     container_repositories()
 
     container_pull(
+        name = "centos6",
+        registry = "gcr.io",
+        repository = "tensorflow-testing/nosla-centos6",
+        digest = container_digests["centos6"],
+    )
+
+    container_pull(
         name = "ubuntu16.04",
         registry = "gcr.io",
         repository = "tensorflow-testing/nosla-ubuntu16.04",
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
index 960a38f..ae8cf1e 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -28,37 +28,30 @@
 config_setting(
     name = "darwin",
     values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda_headers",
     hdrs = [
         "cuda/cuda_config.h",
+        ":cublas-include",
         ":cuda-include",
         ":cudnn-include",
     ],
     includes = [
-        ".",
+        ".",  # required to include cuda/cuda/cuda_config.h as cuda/config.h
         "cuda/include",
-        "cuda/include/crt",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart_static",
     srcs = ["cuda/lib/libcudart_static.a"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = select({
         ":freebsd": [],
         "//conditions:default": ["-ldl"],
@@ -66,104 +59,63 @@
         "-lpthread",
         "-lrt",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda_driver",
     srcs = ["cuda/lib/libcuda.so"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart",
     srcs = ["cuda/lib/libcudart.so.10.0"],
     data = ["cuda/lib/libcudart.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cublas",
     srcs = ["cuda/lib/libcublas.so.10.0"],
     data = ["cuda/lib/libcublas.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cusolver",
     srcs = ["cuda/lib/libcusolver.so.10.0"],
     data = ["cuda/lib/libcusolver.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = ["-lgomp"],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn",
     srcs = ["cuda/lib/libcudnn.so.7"],
     data = ["cuda/lib/libcudnn.so.7"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn_header",
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/include"],
 )
 
 cc_library(
     name = "cufft",
     srcs = ["cuda/lib/libcufft.so.10.0"],
     data = ["cuda/lib/libcufft.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "curand",
     srcs = ["cuda/lib/libcurand.so.10.0"],
     data = ["cuda/lib/libcurand.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda",
-    visibility = ["//visibility:public"],
     deps = [
         ":cublas",
         ":cuda_headers",
@@ -180,40 +132,25 @@
         "cuda/cuda_config.h",
         ":cuda-extras",
     ],
-    includes = [
-        ".",
-        "cuda/extras/CUPTI/include/",
-    ],
-    visibility = ["//visibility:public"],
+    includes = ["cuda/extras/CUPTI/include/"],
 )
 
 cc_library(
     name = "cupti_dsos",
     data = ["cuda/lib/libcupti.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cusparse",
     srcs = ["cuda/lib/libcusparse.so.10.0"],
     data = ["cuda/lib/libcusparse.so.10.0"],
-    includes = [
-        ".",
-        "cuda/include",
-    ],
     linkopts = ["-lgomp"],
     linkstatic = 1,
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "libdevice_root",
     data = [":cuda-nvvm"],
-    visibility = ["//visibility:public"],
 )
 
 genrule(
@@ -1253,6 +1190,8 @@
     cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
+filegroup(name = "cublas-include")
+
 genrule(
     name = "cuda-lib",
     outs = [
@@ -1267,7 +1206,7 @@
         "cuda/lib/libcupti.so.10.0",
         "cuda/lib/libcusparse.so.10.0",
     ],
-    cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" $(location cuda/lib/libcusparse.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) """,
+    cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" $(location cuda/lib/libcusparse.so.10.0) """,
 )
 
 genrule(
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
index 518a3b0..a8a0e57 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
@@ -3,12 +3,12 @@
 
 licenses(["notice"])
 
-exports_files(["LICENSE"])
-
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 
 package(default_visibility = ["//visibility:public"])
 
+exports_files(["LICENSE"])
+
 cc_library(
     name = "tensorrt_headers",
     hdrs = [":tensorrt_include"],
@@ -18,15 +18,9 @@
 
 cc_library(
     name = "tensorrt",
-    srcs = [
-        "tensorrt/lib/libnvinfer.so.5",
-        "tensorrt/lib/libnvinfer_plugin.so.5",
-    ],
+    srcs = [":tensorrt_lib"],
     copts = cuda_default_copts(),
-    data = [
-        "tensorrt/lib/libnvinfer.so.5",
-        "tensorrt/lib/libnvinfer_plugin.so.5",
-    ],
+    data = [":tensorrt_lib"],
     include_prefix = "",
     linkstatic = 1,
     visibility = ["//visibility:public"],
